FreeRDP
Loading...
Searching...
No Matches
unicode_builtin.c
1/*
2 * Copyright 2001-2004 Unicode, Inc.
3 *
4 * Disclaimer
5 *
6 * This source code is provided as is by Unicode, Inc. No claims are
7 * made as to fitness for any particular purpose. No warranties of any
8 * kind are expressed or implied. The recipient agrees to determine
9 * applicability of information provided. If this file has been
10 * purchased on magnetic or optical media from Unicode, Inc., the
11 * sole remedy for any claim will be exchange of defective media
12 * within 90 days of receipt.
13 *
14 * Limitations on Rights to Redistribute This Code
15 *
16 * Unicode, Inc. hereby grants the right to freely use the information
17 * supplied in this file in the creation of products supporting the
18 * Unicode Standard, and to make copies of this file in any form
19 * for internal or external distribution as long as this notice
20 * remains attached.
21 */
22
23/* ---------------------------------------------------------------------
24
25Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26Author: Mark E. Davis, 1994.
27Rev History: Rick McGowan, fixes & updates May 2001.
28Sept 2001: fixed const & error conditions per
29mods suggested by S. Parent & A. Lillich.
30June 2002: Tim Dodd added detection and handling of incomplete
31source sequences, enhanced error detection, added casts
32to eliminate compiler warnings.
33July 2003: slight mods to back out aggressive FFFE detection.
34Jan 2004: updated switches in from-UTF8 conversions.
35Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36
37See the header file "utf.h" for complete documentation.
38
39------------------------------------------------------------------------ */
40
41#include <winpr/wtypes.h>
42#include <winpr/string.h>
43#include <winpr/assert.h>
44#include <winpr/cast.h>
45
46#include "unicode.h"
47
48#include "../log.h"
49#define TAG WINPR_TAG("unicode")
50
51/*
52 * Character Types:
53 *
54 * UTF8: uint8_t 8 bits
55 * UTF16: uint16_t 16 bits
56 * UTF32: uint32_t 32 bits
57 */
58
59/* Some fundamental constants */
/*
 * Each expansion is wrapped in parentheses so the constant behaves as a single
 * primary expression (e.g. `sizeof UNI_MAX_BMP` or use next to postfix
 * operators parses as intended).
 */
#define UNI_REPLACEMENT_CHAR ((uint32_t)0x0000FFFD) /* substituted for unconvertible input */
#define UNI_MAX_BMP ((uint32_t)0x0000FFFF)          /* largest value that fits one UTF-16 unit */
#define UNI_MAX_UTF16 ((uint32_t)0x0010FFFF)        /* largest value a surrogate pair can encode */
#define UNI_MAX_UTF32 ((uint32_t)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((uint32_t)0x0010FFFF)
65
/* Outcome reported by the internal conversion routines. */
typedef enum
{
    conversionOK = 0,    /* the whole source range was converted */
    sourceExhausted = 1, /* source ends in the middle of a character */
    targetExhausted = 2, /* target buffer is too small for the next character */
    sourceIllegal = 3    /* source contains an illegal/malformed sequence */
} ConversionResult;
73
/*
 * Selects how the converters treat surrogate code points and out-of-range
 * values: strict mode stops with sourceIllegal, lenient mode tolerates them.
 */
typedef enum
{
    strictConversion = 0,
    lenientConversion = 1
} ConversionFlags;
79
/* Constants for splitting/joining UTF-16 surrogate pairs. */
static const int halfShift = 10; /* each surrogate carries 10 payload bits */

static const uint32_t halfBase = 0x10000U; /* first code point that needs a pair */
static const uint32_t halfMask = 0x3FFU;   /* mask for the low 10 payload bits */
84
/*
 * UTF-16 surrogate ranges. The expansions are parenthesised so each macro is a
 * single primary expression wherever it is used.
 */
#define UNI_SUR_HIGH_START ((uint32_t)0xD800)
#define UNI_SUR_HIGH_END ((uint32_t)0xDBFF)
#define UNI_SUR_LOW_START ((uint32_t)0xDC00)
#define UNI_SUR_LOW_END ((uint32_t)0xDFFF)
89
90/* --------------------------------------------------------------------- */
91
/*
 * Lookup table: index with the first byte of a UTF-8 sequence to obtain the
 * number of trailing bytes that are supposed to follow it.
 * Sequences with 4 or 5 trailing bytes are not *legal* UTF-8; those entries
 * are kept only because earlier algorithms allowed such sequences.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0xBF: no trailing bytes */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 */
    /* 0xC0 - 0xDF: one trailing byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0 */
    /* 0xE0 - 0xEF: two trailing bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xE0 */
    /* 0xF0 - 0xF7: three, 0xF8 - 0xFB: four, 0xFC - 0xFF: five */
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 /* 0xF0 */
};
109
/*
 * Values subtracted from the accumulated raw byte sum during UTF-8 decoding,
 * indexed by the number of trailing bytes. Each entry cancels the marker bits
 * of the lead and continuation bytes, e.g. entry 1 is (0xC0 << 6) + 0x80.
 */
static const uint32_t offsetsFromUTF8[6] = {
    0x00000000UL, /* 0 trailing bytes */
    0x00003080UL, /* 1 trailing byte  */
    0x000E2080UL, /* 2 trailing bytes */
    0x03C82080UL, /* 3 trailing bytes */
    0xFA082080UL, /* 4 trailing bytes (not legal UTF-8) */
    0x82082080UL  /* 5 trailing bytes (not legal UTF-8) */
};
117
/*
 * Marker bits OR-ed into the first byte of an encoded UTF-8 sequence, indexed
 * by the total number of bytes in the sequence (index 0 is unused padding).
 * Legal UTF-8 only ever uses entries 1 through 4.
 */
static const uint8_t firstByteMark[7] = {
    0x00, /* unused */
    0x00, /* 1 byte:  0xxxxxxx */
    0xC0, /* 2 bytes: 110xxxxx */
    0xE0, /* 3 bytes: 1110xxxx */
    0xF0, /* 4 bytes: 11110xxx */
    0xF8, /* 5 bytes (not legal UTF-8) */
    0xFC  /* 6 bytes (not legal UTF-8) */
};
126
127/* We always need UTF-16LE, even on big endian systems! */
128static WCHAR setWcharFrom(WCHAR w)
129{
130#if defined(__BIG_ENDIAN__)
131 union
132 {
133 WCHAR w;
134 char c[2];
135 } cnv;
136
137 cnv.w = w;
138 const char c = cnv.c[0];
139 cnv.c[0] = cnv.c[1];
140 cnv.c[1] = c;
141 return cnv.w;
142#else
143 return w;
144#endif
145}
146
147/* --------------------------------------------------------------------- */
148
149/* The interface converts a whole buffer to avoid function-call overhead.
150 * Constants have been gathered. Loops & conditionals have been removed as
151 * much as possible for efficiency, in favor of drop-through switches.
152 * (See "Note A" at the bottom of the file for equivalent code.)
153 * If your compiler supports it, the "isLegalUTF8" call can be turned
154 * into an inline function.
155 */
156
157/* --------------------------------------------------------------------- */
158
159static ConversionResult winpr_ConvertUTF16toUTF8_Internal(const uint16_t** sourceStart,
160 const uint16_t* sourceEnd,
161 uint8_t** targetStart,
162 const uint8_t* targetEnd,
163 ConversionFlags flags)
164{
165 bool computeLength = (!targetEnd) ? true : false;
166 const uint16_t* source = *sourceStart;
167 uint8_t* target = *targetStart;
168 ConversionResult result = conversionOK;
169
170 while (source < sourceEnd)
171 {
172 uint32_t ch = 0;
173 unsigned short bytesToWrite = 0;
174 const uint32_t byteMask = 0xBF;
175 const uint32_t byteMark = 0x80;
176 const uint16_t* oldSource =
177 source; /* In case we have to back up because of target overflow. */
178
179 ch = setWcharFrom(*source++);
180
181 /* If we have a surrogate pair, convert to UTF32 first. */
182 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
183 {
184 /* If the 16 bits following the high surrogate are in the source buffer... */
185 if (source < sourceEnd)
186 {
187 uint32_t ch2 = setWcharFrom(*source);
188
189 /* If it's a low surrogate, convert to UTF32. */
190 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
191 {
192 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) +
193 halfBase;
194 ++source;
195 }
196 else if (flags == strictConversion)
197 {
198 /* it's an unpaired high surrogate */
199 --source; /* return to the illegal value itself */
200 result = sourceIllegal;
201 break;
202 }
203 }
204 else
205 {
206 /* We don't have the 16 bits following the high surrogate. */
207 --source; /* return to the high surrogate */
208 result = sourceExhausted;
209 break;
210 }
211 }
212 else if (flags == strictConversion)
213 {
214 /* UTF-16 surrogate values are illegal in UTF-32 */
215 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
216 {
217 --source; /* return to the illegal value itself */
218 result = sourceIllegal;
219 break;
220 }
221 }
222
223 /* Figure out how many bytes the result will require */
224 if (ch < (uint32_t)0x80)
225 {
226 bytesToWrite = 1;
227 }
228 else if (ch < (uint32_t)0x800)
229 {
230 bytesToWrite = 2;
231 }
232 else if (ch < (uint32_t)0x10000)
233 {
234 bytesToWrite = 3;
235 }
236 else if (ch < (uint32_t)0x110000)
237 {
238 bytesToWrite = 4;
239 }
240 else
241 {
242 bytesToWrite = 3;
243 ch = UNI_REPLACEMENT_CHAR;
244 }
245
246 target += bytesToWrite;
247
248 if ((target > targetEnd) && (!computeLength))
249 {
250 source = oldSource; /* Back up source pointer! */
251 target -= bytesToWrite;
252 result = targetExhausted;
253 break;
254 }
255
256 if (!computeLength)
257 {
258 switch (bytesToWrite)
259 {
260 /* note: everything falls through. */
261 case 4:
262 *--target = (uint8_t)((ch | byteMark) & byteMask);
263 ch >>= 6;
264 /* fallthrough */
265 WINPR_FALLTHROUGH
266 case 3:
267 *--target = (uint8_t)((ch | byteMark) & byteMask);
268 ch >>= 6;
269 /* fallthrough */
270 WINPR_FALLTHROUGH
271
272 case 2:
273 *--target = (uint8_t)((ch | byteMark) & byteMask);
274 ch >>= 6;
275 /* fallthrough */
276 WINPR_FALLTHROUGH
277
278 case 1:
279 *--target = (uint8_t)(ch | firstByteMark[bytesToWrite]);
280 break;
281 default:
282 return sourceIllegal;
283 }
284 }
285 else
286 {
287 switch (bytesToWrite)
288 {
289 /* note: everything falls through. */
290 case 4:
291 --target;
292 /* fallthrough */
293 WINPR_FALLTHROUGH
294
295 case 3:
296 --target;
297 /* fallthrough */
298 WINPR_FALLTHROUGH
299
300 case 2:
301 --target;
302 /* fallthrough */
303 WINPR_FALLTHROUGH
304
305 case 1:
306 --target;
307 break;
308 default:
309 return sourceIllegal;
310 }
311 }
312
313 target += bytesToWrite;
314 }
315
316 *sourceStart = source;
317 *targetStart = target;
318 return result;
319}
320
321/* --------------------------------------------------------------------- */
322
/*
 * Tell whether the `length` bytes starting at `source` form a legal UTF-8
 * sequence. The caller must pre-determine the length from the first byte:
 *     length = trailingBytesForUTF8[*source] + 1;
 * and must make sure that many bytes are actually available.
 *
 * Returns false for any length outside 1..4, since the Unicode definition of
 * UTF-8 stops at 4-byte sequences.
 *
 * The second byte is range-checked against the lead byte:
 *     E0 -> A0..BF (rejects overlong 3-byte forms)
 *     ED -> 80..9F (rejects encoded surrogates)
 *     F0 -> 90..BF (rejects overlong 4-byte forms)
 *     F4 -> 80..8F (rejects values above U+10FFFF)
 *     otherwise 80..BF
 * The lower bound 0x80 is enforced for every lead byte, including ED and F4,
 * so that a lead byte followed by a non-continuation byte is never accepted.
 */

static bool isLegalUTF8(const uint8_t* source, int length)
{
    if ((length < 1) || (length > 4))
        return false;

    const uint8_t lead = source[0];

    /* Third and fourth bytes (when present) are plain continuation bytes. */
    for (int i = length - 1; i >= 2; i--)
    {
        const uint8_t b = source[i];

        if ((b < 0x80) || (b > 0xBF))
            return false;
    }

    if (length >= 2)
    {
        uint8_t lowest = 0x80;
        uint8_t highest = 0xBF;

        switch (lead)
        {
            case 0xE0:
                lowest = 0xA0;
                break;

            case 0xED:
                highest = 0x9F;
                break;

            case 0xF0:
                lowest = 0x90;
                break;

            case 0xF4:
                highest = 0x8F;
                break;

            default:
                break;
        }

        if ((source[1] < lowest) || (source[1] > highest))
            return false;
    }

    /* 80..BF are continuation bytes, C0/C1 would only start overlong forms. */
    if ((lead >= 0x80) && (lead < 0xC2))
        return false;

    /* Lead bytes above F4 would encode values beyond U+10FFFF. */
    if (lead > 0xF4)
        return false;

    return true;
}
406
407/* --------------------------------------------------------------------- */
408
409static ConversionResult winpr_ConvertUTF8toUTF16_Internal(const uint8_t** sourceStart,
410 const uint8_t* sourceEnd,
411 uint16_t** targetStart,
412 const uint16_t* targetEnd,
413 ConversionFlags flags)
414{
415 bool computeLength = (!targetEnd) ? true : false;
416 ConversionResult result = conversionOK;
417 const uint8_t* source = *sourceStart;
418 uint16_t* target = *targetStart;
419
420 while (source < sourceEnd)
421 {
422 uint32_t ch = 0;
423 unsigned short extraBytesToRead =
424 WINPR_ASSERTING_INT_CAST(unsigned short, trailingBytesForUTF8[*source]);
425
426 if ((source + extraBytesToRead) >= sourceEnd)
427 {
428 result = sourceExhausted;
429 break;
430 }
431
432 /* Do this check whether lenient or strict */
433 if (!isLegalUTF8(source, extraBytesToRead + 1))
434 {
435 result = sourceIllegal;
436 break;
437 }
438
439 /*
440 * The cases all fall through. See "Note A" below.
441 */
442 switch (extraBytesToRead)
443 {
444 case 5:
445 ch += *source++;
446 ch <<= 6; /* remember, illegal UTF-8 */
447 /* fallthrough */
448 WINPR_FALLTHROUGH
449
450 case 4:
451 ch += *source++;
452 ch <<= 6; /* remember, illegal UTF-8 */
453 /* fallthrough */
454 WINPR_FALLTHROUGH
455
456 case 3:
457 ch += *source++;
458 ch <<= 6;
459 /* fallthrough */
460 WINPR_FALLTHROUGH
461
462 case 2:
463 ch += *source++;
464 ch <<= 6;
465 /* fallthrough */
466 WINPR_FALLTHROUGH
467
468 case 1:
469 ch += *source++;
470 ch <<= 6;
471 /* fallthrough */
472 WINPR_FALLTHROUGH
473
474 case 0:
475 ch += *source++;
476 break;
477 default:
478 return sourceIllegal;
479 }
480
481 ch -= offsetsFromUTF8[extraBytesToRead];
482
483 if ((target >= targetEnd) && (!computeLength))
484 {
485 source -= (extraBytesToRead + 1); /* Back up source pointer! */
486 result = targetExhausted;
487 break;
488 }
489
490 if (ch <= UNI_MAX_BMP)
491 {
492 /* Target is a character <= 0xFFFF */
493 /* UTF-16 surrogate values are illegal in UTF-32 */
494 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
495 {
496 if (flags == strictConversion)
497 {
498 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
499 result = sourceIllegal;
500 break;
501 }
502 else
503 {
504 if (!computeLength)
505 *target++ = setWcharFrom(UNI_REPLACEMENT_CHAR);
506 else
507 target++;
508 }
509 }
510 else
511 {
512 if (!computeLength)
513 *target++ = setWcharFrom((WCHAR)ch); /* normal case */
514 else
515 target++;
516 }
517 }
518 else if (ch > UNI_MAX_UTF16)
519 {
520 if (flags == strictConversion)
521 {
522 result = sourceIllegal;
523 source -= (extraBytesToRead + 1); /* return to the start */
524 break; /* Bail out; shouldn't continue */
525 }
526 else
527 {
528 if (!computeLength)
529 *target++ = setWcharFrom(UNI_REPLACEMENT_CHAR);
530 else
531 target++;
532 }
533 }
534 else
535 {
536 /* target is a character in range 0xFFFF - 0x10FFFF. */
537 if ((target + 1 >= targetEnd) && (!computeLength))
538 {
539 source -= (extraBytesToRead + 1); /* Back up source pointer! */
540 result = targetExhausted;
541 break;
542 }
543
544 ch -= halfBase;
545
546 if (!computeLength)
547 {
548 *target++ = setWcharFrom((WCHAR)((ch >> halfShift) + UNI_SUR_HIGH_START));
549 *target++ = setWcharFrom((WCHAR)((ch & halfMask) + UNI_SUR_LOW_START));
550 }
551 else
552 {
553 target++;
554 target++;
555 }
556 }
557 }
558
559 *sourceStart = source;
560 *targetStart = target;
561 return result;
562}
563
568static int winpr_ConvertUTF8toUTF16(const uint8_t* src, int cchSrc, uint16_t* dst, int cchDst)
569{
570 size_t length = 0;
571 uint16_t* dstBeg = NULL;
572 uint16_t* dstEnd = NULL;
573 const uint8_t* srcBeg = NULL;
574 const uint8_t* srcEnd = NULL;
575 ConversionResult result = sourceIllegal;
576
577 if (cchSrc == -1)
578 cchSrc = (int)strnlen((const char*)src, INT32_MAX - 1) + 1;
579
580 srcBeg = src;
581 srcEnd = &src[cchSrc];
582
583 if (cchDst == 0)
584 {
585 result =
586 winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
587
588 length = dstBeg - (uint16_t*)NULL;
589 }
590 else
591 {
592 dstBeg = dst;
593 dstEnd = &dst[cchDst];
594
595 result =
596 winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
597
598 length = dstBeg - dst;
599 }
600
601 if (result == targetExhausted)
602 {
603 SetLastError(ERROR_INSUFFICIENT_BUFFER);
604 return 0;
605 }
606
607 return (result == conversionOK) ? WINPR_ASSERTING_INT_CAST(int, length) : 0;
608}
609
610static int winpr_ConvertUTF16toUTF8(const uint16_t* src, int cchSrc, uint8_t* dst, int cchDst)
611{
612 size_t length = 0;
613 uint8_t* dstBeg = NULL;
614 uint8_t* dstEnd = NULL;
615 const uint16_t* srcBeg = NULL;
616 const uint16_t* srcEnd = NULL;
617 ConversionResult result = sourceIllegal;
618
619 if (cchSrc == -1)
620 cchSrc = (int)_wcsnlen((const WCHAR*)src, INT32_MAX - 1) + 1;
621
622 srcBeg = src;
623 srcEnd = &src[cchSrc];
624
625 if (cchDst == 0)
626 {
627 result =
628 winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
629
630 length = dstBeg - ((uint8_t*)NULL);
631 }
632 else
633 {
634 dstBeg = dst;
635 dstEnd = &dst[cchDst];
636
637 result =
638 winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
639
640 length = dstBeg - dst;
641 }
642
643 if (result == targetExhausted)
644 {
645 SetLastError(ERROR_INSUFFICIENT_BUFFER);
646 return 0;
647 }
648
649 return (result == conversionOK) ? WINPR_ASSERTING_INT_CAST(int, length) : 0;
650}
651
652/* --------------------------------------------------------------------- */
653
654int int_MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte,
655 LPWSTR lpWideCharStr, int cchWideChar)
656{
657 size_t cbCharLen = (size_t)cbMultiByte;
658
659 WINPR_UNUSED(dwFlags);
660
661 /* If cbMultiByte is 0, the function fails */
662 if ((cbMultiByte == 0) || (cbMultiByte < -1))
663 return 0;
664
665 if (cchWideChar < 0)
666 return -1;
667
668 if (cbMultiByte < 0)
669 {
670 const size_t len = strlen(lpMultiByteStr);
671 if (len >= INT32_MAX)
672 return 0;
673 cbCharLen = (int)len + 1;
674 }
675 else
676 cbCharLen = cbMultiByte;
677
678 WINPR_ASSERT(lpMultiByteStr);
679 switch (CodePage)
680 {
681 case CP_ACP:
682 case CP_UTF8:
683 break;
684
685 default:
686 WLog_ERR(TAG, "Unsupported encoding %u", CodePage);
687 return 0;
688 }
689
690 return winpr_ConvertUTF8toUTF16((const uint8_t*)lpMultiByteStr,
691 WINPR_ASSERTING_INT_CAST(int, cbCharLen),
692 (uint16_t*)lpWideCharStr, cchWideChar);
693}
694
695int int_WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
696 LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
697 LPBOOL lpUsedDefaultChar)
698{
699 size_t cbCharLen = (size_t)cchWideChar;
700
701 WINPR_UNUSED(dwFlags);
702 /* If cchWideChar is 0, the function fails */
703 if ((cchWideChar == 0) || (cchWideChar < -1))
704 return 0;
705
706 if (cbMultiByte < 0)
707 return -1;
708
709 WINPR_ASSERT(lpWideCharStr);
710 /* If cchWideChar is -1, the string is null-terminated */
711 if (cchWideChar == -1)
712 {
713 const size_t len = _wcslen(lpWideCharStr);
714 if (len >= INT32_MAX)
715 return 0;
716 cbCharLen = (int)len + 1;
717 }
718 else
719 cbCharLen = cchWideChar;
720
721 /*
722 * if cbMultiByte is 0, the function returns the required buffer size
723 * in bytes for lpMultiByteStr and makes no use of the output parameter itself.
724 */
725
726 return winpr_ConvertUTF16toUTF8((const uint16_t*)lpWideCharStr,
727 WINPR_ASSERTING_INT_CAST(int, cbCharLen),
728 (uint8_t*)lpMultiByteStr, cbMultiByte);
729}