41 #include <winpr/wtypes.h>
42 #include <winpr/string.h>
43 #include <winpr/assert.h>
44 #include <winpr/cast.h>
/* Logging tag passed to WLog_ERR in this file. */
49 #define TAG WINPR_TAG("unicode")
/* Value stored in the output in place of a code point that cannot be converted
 * (used by the converters below when the conversion is not strict). */
60 #define UNI_REPLACEMENT_CHAR (uint32_t)0x0000FFFD
/* Values up to this bound are emitted by the UTF-8 -> UTF-16 converter as one 16-bit unit. */
61 #define UNI_MAX_BMP (uint32_t)0x0000FFFF
/* Values above this bound are rejected (or replaced) by the UTF-8 -> UTF-16 converter. */
62 #define UNI_MAX_UTF16 (uint32_t)0x0010FFFF
/* The next two limits are not referenced by any line visible in this view. */
63 #define UNI_MAX_UTF32 (uint32_t)0x7FFFFFFF
64 #define UNI_MAX_LEGAL_UTF32 (uint32_t)0x0010FFFF
/* Shift applied to the high half when a surrogate pair is combined or split. */
80 static const int halfShift = 10;
/* 0x10000: first code point that needs a surrogate pair in UTF-16
 * (NOTE(review): its use falls on lines not shown in this view - confirm). */
82 static const uint32_t halfBase = 0x0010000UL;
/* Mask selecting the low 10 bits, used to build the low surrogate. */
83 static const uint32_t halfMask = 0x3FFUL;
/* Inclusive bounds of the high (leading) surrogate range. */
85 #define UNI_SUR_HIGH_START (uint32_t)0xD800
86 #define UNI_SUR_HIGH_END (uint32_t)0xDBFF
/* Inclusive bounds of the low (trailing) surrogate range. */
87 #define UNI_SUR_LOW_START (uint32_t)0xDC00
88 #define UNI_SUR_LOW_END (uint32_t)0xDFFF
/*
 * Number of bytes that follow a given first byte of a UTF-8 sequence.
 * The table is indexed with the first byte of the sequence (see the
 * UTF-8 -> UTF-16 converter, which reads trailingBytesForUTF8[*source]).
 * Bytes 0x00-0xBF map to 0, 0xC0-0xDF to 1, 0xE0-0xEF to 2, 0xF0-0xF7 to 3,
 * 0xF8-0xFB to 4 and 0xFC-0xFF to 5.
 */
99 static const char trailingBytesForUTF8[256] = {
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xC0-0xDF: one trailing byte */
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0xE0-0xEF: two, 0xF0-0xF7: three, 0xF8-0xFB: four, 0xFC-0xFF: five trailing bytes */
107 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
/*
 * Values subtracted from the accumulated code point after decoding, indexed by
 * the number of trailing bytes of the sequence (0..5).
 */
115 static const uint32_t offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
116 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
/*
 * Bits OR-ed into the first byte of an encoded UTF-8 sequence, indexed by the
 * total number of bytes written for that sequence (0..6).
 */
125 static const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
/*
 * Convert the UTF-16 code units in [*sourceStart, sourceEnd) to UTF-8 bytes
 * stored at *targetStart.
 *
 * sourceStart  in/out: start of the input; updated to the last source position.
 * sourceEnd    one past the last input code unit.
 * targetStart  in/out: start of the output; updated to the last target position.
 * targetEnd    one past the end of the output buffer. When it is NULL the
 *              function runs in "compute length" mode and the buffer-space
 *              check below never reports targetExhausted.
 * flags        compared against strictConversion; in that mode unpaired
 *              surrogates are reported as sourceIllegal.
 *
 * The visible code sets result to conversionOK, sourceIllegal, sourceExhausted
 * or targetExhausted. NOTE(review): the return statement is not part of this
 * view; presumably result is returned - confirm.
 */
139 static ConversionResult winpr_ConvertUTF16toUTF8_Internal(
const uint16_t** sourceStart,
140 const uint16_t* sourceEnd,
141 uint8_t** targetStart, uint8_t* targetEnd,
142 ConversionFlags flags)
/* A NULL targetEnd selects length computation instead of real output bounds. */
144 bool computeLength = (!targetEnd) ?
true :
false;
145 const uint16_t* source = *sourceStart;
146 uint8_t* target = *targetStart;
147 ConversionResult result = conversionOK;
149 while (source < sourceEnd)
152 unsigned short bytesToWrite = 0;
/* Continuation bytes are formed as (ch | byteMark) & byteMask, i.e. 10xxxxxx. */
153 const uint32_t byteMask = 0xBF;
154 const uint32_t byteMark = 0x80;
155 const uint16_t* oldSource =
/* A high surrogate must be followed by a low surrogate to form one code point. */
161 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
164 if (source < sourceEnd)
166 uint32_t ch2 = *source;
169 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
/* Combine both halves: high part shifted by halfShift plus the low part
 * (the remainder of this expression is not shown in this view). */
171 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) +
/* High surrogate not followed by a low surrogate: illegal in strict mode. */
175 else if (flags == strictConversion)
179 result = sourceIllegal;
/* NOTE(review): presumably reached when the input ends right after a high
 * surrogate (the enclosing else is not visible) - confirm. */
187 result = sourceExhausted;
/* In strict mode a lone low surrogate is illegal as well. */
191 else if (flags == strictConversion)
194 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
197 result = sourceIllegal;
/* Select the encoded length from the magnitude of the code point
 * (the assignments to bytesToWrite are on lines not shown here). */
203 if (ch < (uint32_t)0x80)
207 else if (ch < (uint32_t)0x800)
211 else if (ch < (uint32_t)0x10000)
215 else if (ch < (uint32_t)0x110000)
/* NOTE(review): presumably the fallback for values of 0x110000 and above - confirm. */
222 ch = UNI_REPLACEMENT_CHAR;
/* Advance first, then check; on overflow the advance is undone below. */
225 target += bytesToWrite;
227 if ((target > targetEnd) && (!computeLength))
230 target -= bytesToWrite;
231 result = targetExhausted;
/* Bytes are stored back to front through *--target: trailing bytes first,
 * the lead byte (tagged with firstByteMark) last. */
237 switch (bytesToWrite)
241 *--target = (uint8_t)((ch | byteMark) & byteMask);
246 *--target = (uint8_t)((ch | byteMark) & byteMask);
252 *--target = (uint8_t)((ch | byteMark) & byteMask);
258 *--target = (uint8_t)(ch | firstByteMark[bytesToWrite]);
/* Second dispatch on bytesToWrite; its cases are not shown in this view. */
263 switch (bytesToWrite)
/* Move target forward again past the sequence that was just handled. */
286 target += bytesToWrite;
/* Report the final positions back to the caller. */
289 *sourceStart = source;
290 *targetStart = target;
/*
 * Check whether the `length` bytes starting at `source` form a legal UTF-8
 * sequence. The visible checks walk backwards from the end of the sequence.
 * NOTE(review): the dispatch on length and the return statements are not part
 * of this view; presumably each failing check returns false - confirm.
 */
307 static bool isLegalUTF8(
const uint8_t* source,
int length)
/* Start one past the last byte of the sequence and step backwards. */
310 const uint8_t* srcptr = source + length;
/* Trailing bytes are tested against the continuation range 0x80..0xBF. */
319 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
325 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
/* Here only the upper bound is tested (lower bound handling is not visible). */
331 if ((a = (*--srcptr)) > 0xBF)
/* First byte in 0x80..0xC1 is tested here: a continuation byte or 0xC0/0xC1. */
370 if (*source >= 0x80 && *source < 0xC2)
/*
 * Convert the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16 code units
 * stored at *targetStart.
 *
 * sourceStart  in/out: start of the input; updated to the last source position.
 * sourceEnd    one past the last input byte.
 * targetStart  in/out: start of the output; updated to the last target position.
 * flags        compared against strictConversion; in that mode surrogate code
 *              points and values above UNI_MAX_UTF16 are reported as
 *              sourceIllegal, otherwise UNI_REPLACEMENT_CHAR is stored.
 *
 * targetEnd is used below as the end of the output buffer (NULL selects
 * "compute length" mode); its parameter declaration is on a line not shown in
 * this view. NOTE(review): the return statement is not visible either;
 * presumably result is returned - confirm.
 */
382 static ConversionResult winpr_ConvertUTF8toUTF16_Internal(
const uint8_t** sourceStart,
383 const uint8_t* sourceEnd,
384 uint16_t** targetStart,
386 ConversionFlags flags)
/* A NULL targetEnd disables the buffer-space checks below. */
388 bool computeLength = (!targetEnd) ?
true :
false;
389 ConversionResult result = conversionOK;
390 const uint8_t* source = *sourceStart;
391 uint16_t* target = *targetStart;
393 while (source < sourceEnd)
/* Number of bytes following the first byte, looked up from that first byte. */
396 unsigned short extraBytesToRead =
397 WINPR_ASSERTING_INT_CAST(
unsigned short, trailingBytesForUTF8[*source]);
/* The whole sequence must lie before sourceEnd. */
399 if ((source + extraBytesToRead) >= sourceEnd)
401 result = sourceExhausted;
/* Validate the complete sequence (first byte plus trailing bytes). */
406 if (!isLegalUTF8(source, extraBytesToRead + 1))
408 result = sourceIllegal;
/* Dispatch on the sequence length; the cases are not shown in this view. */
415 switch (extraBytesToRead)
/* Remove the per-length offset from the accumulated value. */
451 ch -= offsetsFromUTF8[extraBytesToRead];
/* No room for one more unit: move source back by the sequence length and stop. */
453 if ((target >= targetEnd) && (!computeLength))
455 source -= (extraBytesToRead + 1);
456 result = targetExhausted;
/* Values up to UNI_MAX_BMP fit into a single 16-bit unit. */
460 if (ch <= UNI_MAX_BMP)
/* Code points inside the surrogate range are not valid characters. */
464 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
466 if (flags == strictConversion)
468 source -= (extraBytesToRead + 1);
469 result = sourceIllegal;
475 *target++ = UNI_REPLACEMENT_CHAR;
483 *target++ = (uint16_t)ch;
/* Values above UNI_MAX_UTF16 cannot be represented in UTF-16. */
488 else if (ch > UNI_MAX_UTF16)
490 if (flags == strictConversion)
492 result = sourceIllegal;
493 source -= (extraBytesToRead + 1);
499 *target++ = UNI_REPLACEMENT_CHAR;
/* Remaining values need two units (a surrogate pair): check for room first. */
507 if ((target + 1 >= targetEnd) && (!computeLength))
509 source -= (extraBytesToRead + 1);
510 result = targetExhausted;
/* High surrogate from the upper bits, low surrogate from the low 10 bits. */
518 *target++ = (uint16_t)((ch >> halfShift) + UNI_SUR_HIGH_START);
519 *target++ = (uint16_t)((ch & halfMask) + UNI_SUR_LOW_START);
/* Report the final positions back to the caller. */
529 *sourceStart = source;
530 *targetStart = target;
/*
 * Convert a UTF-8 buffer to UTF-16 using strict conversion.
 *
 * src     UTF-8 input.
 * cchSrc  number of input bytes; on one path (condition not shown in this
 *         view) it is recomputed as the string length plus one.
 * dst     UTF-16 output buffer.
 * cchDst  capacity of dst in 16-bit units.
 *
 * Returns the number of UTF-16 units produced when the conversion result is
 * conversionOK, otherwise 0. When the result is targetExhausted the last error
 * is set to ERROR_INSUFFICIENT_BUFFER.
 */
538 static int winpr_ConvertUTF8toUTF16(
const uint8_t* src,
int cchSrc, uint16_t* dst,
int cchDst)
541 uint16_t* dstBeg = NULL;
542 uint16_t* dstEnd = NULL;
543 const uint8_t* srcBeg = NULL;
544 const uint8_t* srcEnd = NULL;
545 ConversionResult result = sourceIllegal;
/* Length including the terminating NUL, capped by the strnlen limit. */
548 cchSrc = (int)strnlen((
const char*)src, INT32_MAX - 1) + 1;
551 srcEnd = &src[cchSrc];
/* NOTE(review): presumably the length-only pass, with dstBeg and dstEnd still
 * NULL (the guarding condition is not visible) - confirm. */
556 winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
/* The unit count is read from how far dstBeg moved relative to NULL.
 * NOTE(review): subtracting pointers that do not point into the same array is
 * not defined by ISO C - confirm this is acceptable on all targets. */
558 length = dstBeg - (uint16_t*)NULL;
/* Real conversion pass, bounded by the caller-supplied buffer. */
563 dstEnd = &dst[cchDst];
566 winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
568 length = dstBeg - dst;
571 if (result == targetExhausted)
573 SetLastError(ERROR_INSUFFICIENT_BUFFER);
/* Any result other than conversionOK is reported as 0 units. */
577 return (result == conversionOK) ? WINPR_ASSERTING_INT_CAST(
int, length) : 0;
/*
 * Convert a UTF-16 buffer to UTF-8 using strict conversion.
 *
 * src     UTF-16 input.
 * cchSrc  number of input code units; on one path (condition not shown in
 *         this view) it is recomputed as the string length plus one.
 * dst     UTF-8 output buffer.
 * cchDst  capacity of dst in bytes.
 *
 * Returns the number of bytes produced when the conversion result is
 * conversionOK, otherwise 0. When the result is targetExhausted the last error
 * is set to ERROR_INSUFFICIENT_BUFFER.
 */
580 static int winpr_ConvertUTF16toUTF8(
const uint16_t* src,
int cchSrc, uint8_t* dst,
int cchDst)
583 uint8_t* dstBeg = NULL;
584 uint8_t* dstEnd = NULL;
585 const uint16_t* srcBeg = NULL;
586 const uint16_t* srcEnd = NULL;
587 ConversionResult result = sourceIllegal;
/* Length including the terminating NUL, capped by the _wcsnlen limit. */
590 cchSrc = (int)_wcsnlen((
const WCHAR*)src, INT32_MAX - 1) + 1;
593 srcEnd = &src[cchSrc];
/* NOTE(review): presumably the length-only pass, with dstBeg and dstEnd still
 * NULL (the guarding condition is not visible) - confirm. */
598 winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
/* The byte count is read from how far dstBeg moved relative to NULL.
 * NOTE(review): subtracting pointers that do not point into the same array is
 * not defined by ISO C - confirm this is acceptable on all targets. */
600 length = dstBeg - ((uint8_t*)NULL);
/* Real conversion pass, bounded by the caller-supplied buffer. */
605 dstEnd = &dst[cchDst];
608 winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
610 length = dstBeg - dst;
613 if (result == targetExhausted)
615 SetLastError(ERROR_INSUFFICIENT_BUFFER);
/* Any result other than conversionOK is reported as 0 bytes. */
619 return (result == conversionOK) ? WINPR_ASSERTING_INT_CAST(
int, length) : 0;
/*
 * Convert a multi-byte string to UTF-16 by forwarding to
 * winpr_ConvertUTF8toUTF16.
 *
 * CodePage        code page of the input; an unsupported value is logged with
 *                 "Unsupported encoding" (the accepted values are on lines not
 *                 shown in this view).
 * dwFlags         unused.
 * lpMultiByteStr  input string.
 * cbMultiByte     input size in bytes; 0 and values below -1 are tested
 *                 explicitly below. NOTE(review): the action taken for them is
 *                 not visible - presumably the call fails; confirm.
 * lpWideCharStr   output buffer for the UTF-16 result.
 * cchWideChar     capacity of lpWideCharStr, passed through as cchDst.
 *
 * Returns the value returned by winpr_ConvertUTF8toUTF16 on the visible path.
 */
624 int int_MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr,
int cbMultiByte,
625 LPWSTR lpWideCharStr,
int cchWideChar)
627 size_t cbCharLen = (size_t)cbMultiByte;
629 WINPR_UNUSED(dwFlags);
632 if ((cbMultiByte == 0) || (cbMultiByte < -1))
/* Length is measured from the string itself and then counted with its
 * terminating NUL; lengths of INT32_MAX or more are tested separately. */
640 const size_t len = strlen(lpMultiByteStr);
641 if (len >= INT32_MAX)
643 cbCharLen = (int)len + 1;
646 cbCharLen = cbMultiByte;
/* NOTE(review): this assert appears after strlen(lpMultiByteStr) above, so a
 * NULL pointer would be dereferenced before it is checked; the wide-char
 * counterpart asserts before measuring - consider moving it up. */
648 WINPR_ASSERT(lpMultiByteStr);
656 WLog_ERR(TAG,
"Unsupported encoding %u", CodePage);
660 return winpr_ConvertUTF8toUTF16((
const uint8_t*)lpMultiByteStr,
661 WINPR_ASSERTING_INT_CAST(
int, cbCharLen),
662 (uint16_t*)lpWideCharStr, cchWideChar);
/*
 * Convert a UTF-16 string to a multi-byte string by forwarding to
 * winpr_ConvertUTF16toUTF8.
 *
 * CodePage           code page of the output (not referenced on the lines
 *                    visible in this view).
 * dwFlags            unused.
 * lpWideCharStr      input string; asserted to be non-NULL before it is read.
 * cchWideChar        input size in code units; -1 means the length is measured
 *                    from the string. 0 and values below -1 are tested
 *                    explicitly below. NOTE(review): the action taken for them
 *                    is not visible - presumably the call fails; confirm.
 * lpMultiByteStr     output buffer.
 * cbMultiByte        capacity of lpMultiByteStr, passed through as cchDst.
 * lpDefaultChar      not referenced on the lines visible in this view.
 * lpUsedDefaultChar  not referenced on the lines visible in this view.
 *
 * Returns the value returned by winpr_ConvertUTF16toUTF8 on the visible path.
 */
665 int int_WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr,
int cchWideChar,
666 LPSTR lpMultiByteStr,
int cbMultiByte, LPCSTR lpDefaultChar,
667 LPBOOL lpUsedDefaultChar)
669 size_t cbCharLen = (size_t)cchWideChar;
671 WINPR_UNUSED(dwFlags);
673 if ((cchWideChar == 0) || (cchWideChar < -1))
679 WINPR_ASSERT(lpWideCharStr);
/* -1 requests that the length be taken from the string, counted with its
 * terminating NUL; lengths of INT32_MAX or more are tested separately. */
681 if (cchWideChar == -1)
683 const size_t len = _wcslen(lpWideCharStr);
684 if (len >= INT32_MAX)
686 cbCharLen = (int)len + 1;
689 cbCharLen = cchWideChar;
696 return winpr_ConvertUTF16toUTF8((
const uint16_t*)lpWideCharStr,
697 WINPR_ASSERTING_INT_CAST(
int, cbCharLen),
698 (uint8_t*)lpMultiByteStr, cbMultiByte);