41 #include <winpr/wtypes.h>
42 #include <winpr/string.h>
43 #include <winpr/assert.h>
/* Log tag handed to WLog_ERR() when int_MultiByteToWideChar reports an unsupported encoding. */
48 #define TAG WINPR_TAG("unicode")
/*
 * Code point stored by the converters in place of a value they will not
 * encode (out-of-range value, or a surrogate decoded from UTF-8).
 * Presumably only reached when the caller did not ask for strictConversion.
 */
59 #define UNI_REPLACEMENT_CHAR (uint32_t)0x0000FFFD
/* Largest code point the UTF-8 -> UTF-16 converter emits as a single 16-bit unit. */
60 #define UNI_MAX_BMP (uint32_t)0x0000FFFF
/* Largest code point the UTF-8 -> UTF-16 converter emits as a surrogate pair; larger values are rejected. */
61 #define UNI_MAX_UTF16 (uint32_t)0x0010FFFF
/* NOTE(review): the two UTF-32 limits below are not referenced by any line visible in this excerpt. */
62 #define UNI_MAX_UTF32 (uint32_t)0x7FFFFFFF
63 #define UNI_MAX_LEGAL_UTF32 (uint32_t)0x0010FFFF
/*
 * Surrogate arithmetic helpers: a surrogate pair carries 10 payload bits per
 * unit (halfShift / halfMask). halfBase is 0x10000; presumably it is the
 * offset added or removed when combining or splitting a pair - confirm
 * against the continuation lines of the converters.
 */
79 static const int halfShift = 10;
81 static const uint32_t halfBase = 0x0010000UL;
82 static const uint32_t halfMask = 0x3FFUL;
/* Inclusive bounds of the UTF-16 high (lead) and low (trail) surrogate ranges. */
84 #define UNI_SUR_HIGH_START (uint32_t)0xD800
85 #define UNI_SUR_HIGH_END (uint32_t)0xDBFF
86 #define UNI_SUR_LOW_START (uint32_t)0xDC00
87 #define UNI_SUR_LOW_END (uint32_t)0xDFFF
/*
 * Indexed by the first byte of a UTF-8 sequence; yields the number of bytes
 * that follow it (used as extraBytesToRead by the UTF-8 -> UTF-16 converter).
 *   0x00-0xBF -> 0    0xC0-0xDF -> 1    0xE0-0xEF -> 2
 *   0xF0-0xF7 -> 3    0xF8-0xFB -> 4    0xFC-0xFF -> 5
 * Bytes 0x80-0xBF map to 0 here; they are not filtered by this table.
 */
98 static const char trailingBytesForUTF8[256] = {
99 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
/*
 * Indexed by extraBytesToRead; the selected value is subtracted from the
 * accumulated code point in the UTF-8 -> UTF-16 converter. Presumably it
 * cancels the lead/continuation marker bits summed in during accumulation -
 * the accumulation step itself is not visible in this excerpt.
 */
114 static const uint32_t offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
115 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
/*
 * Indexed by bytesToWrite; the selected value is OR-ed into the first byte of
 * an encoded UTF-8 sequence by the UTF-16 -> UTF-8 converter.
 */
124 static const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
/*
 * Convert the UTF-16 units in [*sourceStart, sourceEnd) to UTF-8.
 *
 * sourceStart  in/out: read as the starting position, overwritten with the
 *              position reached when the loop ends.
 * sourceEnd    one past the last UTF-16 unit to read.
 * targetStart  in/out: read as the first output position, overwritten with
 *              the position reached.
 * targetEnd    one past the writable output; passing NULL selects
 *              "compute length" mode, in which the output bound check is
 *              disabled.
 * flags        compared against strictConversion to decide whether unpaired
 *              surrogates are reported as sourceIllegal.
 *
 * The result variable starts as conversionOK and is set to sourceIllegal,
 * sourceExhausted or targetExhausted on the paths below; presumably it is the
 * return value (the return statement is not visible in this excerpt).
 */
138 static ConversionResult winpr_ConvertUTF16toUTF8_Internal(
const uint16_t** sourceStart,
139 const uint16_t* sourceEnd,
140 uint8_t** targetStart, uint8_t* targetEnd,
141 ConversionFlags flags)
/* A NULL targetEnd means the caller only wants the output position advanced. */
143 bool computeLength = (!targetEnd) ?
true :
false;
144 const uint16_t* source = *sourceStart;
145 uint8_t* target = *targetStart;
146 ConversionResult result = conversionOK;
148 while (source < sourceEnd)
151 unsigned short bytesToWrite = 0;
/* Mask and marker applied to every trailing byte of an encoded sequence. */
152 const uint32_t byteMask = 0xBF;
153 const uint32_t byteMark = 0x80;
/* Presumably remembers where this code point started so error paths can rewind - initializer not visible here. */
154 const uint16_t* oldSource =
/* High surrogate: a following low surrogate is needed to form one code point. */
160 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
163 if (source < sourceEnd)
165 uint32_t ch2 = *source;
/* Valid pair: merge the 10 payload bits of each half (expression continues on a line not shown). */
168 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
170 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) +
/* High surrogate not followed by a low surrogate is illegal in strict mode. */
174 else if (flags == strictConversion)
178 result = sourceIllegal;
/* Presumably the branch taken when the input ends right after a high surrogate. */
186 result = sourceExhausted;
/* Strict mode also rejects a low surrogate that was not preceded by a high one. */
190 else if (flags == strictConversion)
193 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
196 result = sourceIllegal;
/*
 * Range ladder over the code point; presumably each branch selects
 * bytesToWrite (the assignments are not visible here). Values at or above
 * 0x110000 are replaced by UNI_REPLACEMENT_CHAR.
 */
202 if (ch < (uint32_t)0x80)
206 else if (ch < (uint32_t)0x800)
210 else if (ch < (uint32_t)0x10000)
214 else if (ch < (uint32_t)0x110000)
221 ch = UNI_REPLACEMENT_CHAR;
/* Advance past the sequence first; the bytes are then filled in back to front. */
224 target += bytesToWrite;
/* Out of room (only checked when a real buffer was supplied): undo the advance and report it. */
226 if ((target > targetEnd) && (!computeLength))
229 target -= bytesToWrite;
230 result = targetExhausted;
236 switch (bytesToWrite)
/* Trailing bytes, last one first; each is forced into the 0x80-0xBF range. */
240 *--target = (uint8_t)((ch | byteMark) & byteMask);
245 *--target = (uint8_t)((ch | byteMark) & byteMask);
251 *--target = (uint8_t)((ch | byteMark) & byteMask);
/* First byte carries the length marker taken from firstByteMark. */
257 *--target = (uint8_t)(ch | firstByteMark[bytesToWrite]);
/* NOTE(review): the body of this second switch is not visible in this excerpt. */
262 switch (bytesToWrite)
/* target was walked back to the start of the sequence; step over it again. */
285 target += bytesToWrite;
/* Publish how far input and output were advanced. */
288 *sourceStart = source;
289 *targetStart = target;
/*
 * Check the UTF-8 sequence of `length` bytes starting at `source`.
 *
 * Only part of the body is visible in this excerpt: trailing bytes are
 * examined from the end of the sequence towards the front, followed by a
 * test on the lead byte. Presumably each failing test makes the function
 * return false - the return statements are not shown.
 */
306 static bool isLegalUTF8(
const uint8_t* source,
int length)
/* Start one past the last byte and walk backwards. */
309 const uint8_t* srcptr = source + length;
/* A trailing byte outside 0x80-0xBF trips this test. */
318 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
324 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
/* Here only the upper bound is tested; presumably the lower bound depends on the lead byte - TODO confirm. */
330 if ((a = (*--srcptr)) > 0xBF)
/* Lead bytes 0x80-0xC1 trip this test. */
369 if (*source >= 0x80 && *source < 0xC2)
/*
 * Convert the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16.
 *
 * sourceStart  in/out: read as the starting position, overwritten with the
 *              position reached when the loop ends.
 * sourceEnd    one past the last byte to read.
 * targetStart  in/out: read as the first output position, overwritten with
 *              the position reached.
 * targetEnd    used by the body as the output bound; NULL selects "compute
 *              length" mode, in which the output bound checks are disabled.
 *              NOTE(review): its parameter declaration is not visible in
 *              this excerpt.
 * flags        compared against strictConversion to choose between
 *              reporting sourceIllegal and storing UNI_REPLACEMENT_CHAR.
 *
 * The result variable starts as conversionOK and is set to sourceExhausted,
 * sourceIllegal or targetExhausted on the paths below; presumably it is the
 * return value (the return statement is not visible in this excerpt).
 */
381 static ConversionResult winpr_ConvertUTF8toUTF16_Internal(
const uint8_t** sourceStart,
382 const uint8_t* sourceEnd,
383 uint16_t** targetStart,
385 ConversionFlags flags)
/* A NULL targetEnd means the caller only wants the output position advanced. */
387 bool computeLength = (!targetEnd) ?
true :
false;
388 ConversionResult result = conversionOK;
389 const uint8_t* source = *sourceStart;
390 uint16_t* target = *targetStart;
392 while (source < sourceEnd)
/* Number of bytes that follow the lead byte, looked up by the lead byte itself. */
395 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
/* The sequence needs extraBytesToRead + 1 bytes; fewer remaining means truncated input. */
397 if ((source + extraBytesToRead) >= sourceEnd)
399 result = sourceExhausted;
/* Validate the whole sequence before decoding it. */
404 if (!isLegalUTF8(source, extraBytesToRead + 1))
406 result = sourceIllegal;
/* NOTE(review): the switch body (presumably the byte accumulation into ch) is not visible in this excerpt. */
413 switch (extraBytesToRead)
/* Remove the per-length offset from the accumulated value. */
449 ch -= offsetsFromUTF8[extraBytesToRead];
/* No room for even one unit: rewind to the start of this sequence and report it. */
451 if ((target >= targetEnd) && (!computeLength))
453 source -= (extraBytesToRead + 1);
454 result = targetExhausted;
/* Code point fits in a single 16-bit unit. */
458 if (ch <= UNI_MAX_BMP)
/* Surrogate values must not appear as decoded UTF-8 code points. */
462 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
464 if (flags == strictConversion)
466 source -= (extraBytesToRead + 1);
467 result = sourceIllegal;
/* Presumably the non-strict branch: substitute the replacement character. */
473 *target++ = UNI_REPLACEMENT_CHAR;
481 *target++ = (uint16_t)ch;
/* Beyond the range UTF-16 can represent. */
486 else if (ch > UNI_MAX_UTF16)
488 if (flags == strictConversion)
490 result = sourceIllegal;
491 source -= (extraBytesToRead + 1);
497 *target++ = UNI_REPLACEMENT_CHAR;
/* Remaining case needs two units (a surrogate pair); make sure both fit. */
505 if ((target + 1 >= targetEnd) && (!computeLength))
507 source -= (extraBytesToRead + 1);
508 result = targetExhausted;
/* Upper bits go to the high surrogate, lower 10 bits to the low surrogate. */
516 *target++ = (uint16_t)((ch >> halfShift) + UNI_SUR_HIGH_START);
517 *target++ = (uint16_t)((ch & halfMask) + UNI_SUR_LOW_START);
/* Publish how far input and output were advanced. */
527 *sourceStart = source;
528 *targetStart = target;
/*
 * Buffer-level wrapper around winpr_ConvertUTF8toUTF16_Internal, always
 * called with strictConversion.
 *
 * src     UTF-8 input.
 * cchSrc  number of input bytes; on one path (condition not visible in this
 *         excerpt) it is recomputed as strnlen(src) + 1.
 * dst     UTF-16 output buffer.
 * cchDst  capacity of dst in 16-bit units.
 *
 * Returns the computed length when the conversion result is conversionOK,
 * otherwise 0. When the result is targetExhausted the last error is set to
 * ERROR_INSUFFICIENT_BUFFER.
 */
536 static int winpr_ConvertUTF8toUTF16(
const uint8_t* src,
int cchSrc, uint16_t* dst,
int cchDst)
539 uint16_t* dstBeg = NULL;
540 uint16_t* dstEnd = NULL;
541 const uint8_t* srcBeg = NULL;
542 const uint8_t* srcEnd = NULL;
543 ConversionResult result = sourceIllegal;
/* Length including the terminator, capped so the + 1 cannot overflow an int. */
546 cchSrc = (int)strnlen((
const char*)src, INT32_MAX - 1) + 1;
549 srcEnd = &src[cchSrc];
/* Presumably the length-only path: dstBeg and dstEnd are still NULL at this call - TODO confirm the guarding condition. */
554 winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
/* NOTE(review): the count is recovered by subtracting a NULL pointer; arithmetic on a null pointer is undefined behavior in ISO C. */
556 length = dstBeg - (uint16_t*)NULL;
/* Real conversion path: bound the output by the caller's capacity. */
561 dstEnd = &dst[cchDst];
564 winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
566 length = dstBeg - dst;
569 if (result == targetExhausted)
571 SetLastError(ERROR_INSUFFICIENT_BUFFER);
575 return (result == conversionOK) ? length : 0;
/*
 * Buffer-level wrapper around winpr_ConvertUTF16toUTF8_Internal, always
 * called with strictConversion.
 *
 * src     UTF-16 input.
 * cchSrc  number of input units; on one path (condition not visible in this
 *         excerpt) it is recomputed as _wcsnlen(src) + 1.
 * dst     UTF-8 output buffer.
 * cchDst  capacity of dst in bytes.
 *
 * Returns the computed length when the conversion result is conversionOK,
 * otherwise 0. When the result is targetExhausted the last error is set to
 * ERROR_INSUFFICIENT_BUFFER.
 */
578 static int winpr_ConvertUTF16toUTF8(
const uint16_t* src,
int cchSrc, uint8_t* dst,
int cchDst)
581 uint8_t* dstBeg = NULL;
582 uint8_t* dstEnd = NULL;
583 const uint16_t* srcBeg = NULL;
584 const uint16_t* srcEnd = NULL;
585 ConversionResult result = sourceIllegal;
/* Length including the terminator, capped so the + 1 cannot overflow an int. */
588 cchSrc = (int)_wcsnlen((
const WCHAR*)src, INT32_MAX - 1) + 1;
591 srcEnd = &src[cchSrc];
/* Presumably the length-only path: dstBeg and dstEnd are still NULL at this call - TODO confirm the guarding condition. */
596 winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
/* NOTE(review): the count is recovered by subtracting a NULL pointer; arithmetic on a null pointer is undefined behavior in ISO C. */
598 length = dstBeg - ((uint8_t*)NULL);
/* Real conversion path: bound the output by the caller's capacity. */
603 dstEnd = &dst[cchDst];
606 winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
608 length = dstBeg - dst;
611 if (result == targetExhausted)
613 SetLastError(ERROR_INSUFFICIENT_BUFFER);
617 return (result == conversionOK) ? length : 0;
/*
 * Convert a multi-byte string to UTF-16 by forwarding to
 * winpr_ConvertUTF8toUTF16.
 *
 * CodePage        only inspected to report "Unsupported encoding"; the
 *                 accepted values are not visible in this excerpt.
 * dwFlags         ignored (marked WINPR_UNUSED).
 * lpMultiByteStr  input bytes, passed on as UTF-8.
 * cbMultiByte     input length in bytes; 0 and values below -1 are rejected.
 *                 Presumably -1 selects the strlen() path below - TODO
 *                 confirm the guarding condition.
 * lpWideCharStr   output buffer, passed on as uint16_t.
 * cchWideChar     output capacity, passed on unchanged.
 *
 * Returns the value produced by winpr_ConvertUTF8toUTF16 on the visible
 * success path.
 */
622 int int_MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr,
int cbMultiByte,
623 LPWSTR lpWideCharStr,
int cchWideChar)
625 size_t cbCharLen = (size_t)cbMultiByte;
627 WINPR_UNUSED(dwFlags);
630 if ((cbMultiByte == 0) || (cbMultiByte < -1))
/* NOTE(review): lpMultiByteStr is dereferenced by strlen() here, before the WINPR_ASSERT below checks it. */
638 const size_t len = strlen(lpMultiByteStr);
/* Guard so that len + 1 still fits in an int. */
639 if (len >= INT32_MAX)
/* Count the terminating NUL as part of the input. */
641 cbCharLen = (int)len + 1;
644 cbCharLen = cbMultiByte;
/* NOTE(review): int_WideCharToMultiByte asserts its input before measuring it; this assert arrives too late to protect the strlen() above. */
646 WINPR_ASSERT(lpMultiByteStr);
654 WLog_ERR(TAG,
"Unsupported encoding %u", CodePage);
/* cbCharLen is a size_t but the callee takes an int; the range checks above keep it within INT32_MAX. */
658 return winpr_ConvertUTF8toUTF16((
const uint8_t*)lpMultiByteStr, cbCharLen,
659 (uint16_t*)lpWideCharStr, cchWideChar);
/*
 * Convert a UTF-16 string to multi-byte by forwarding to
 * winpr_ConvertUTF16toUTF8.
 *
 * CodePage           not referenced by any line visible in this excerpt.
 * dwFlags            ignored (marked WINPR_UNUSED).
 * lpWideCharStr      input units; asserted non-NULL before it is measured.
 * cchWideChar        input length in units; 0 and values below -1 are
 *                    rejected, -1 means "measure with _wcslen and include
 *                    the terminator".
 * lpMultiByteStr     output buffer, passed on as uint8_t.
 * cbMultiByte        output capacity, passed on unchanged.
 * lpDefaultChar,
 * lpUsedDefaultChar  not referenced by any line visible in this excerpt.
 *
 * Returns the value produced by winpr_ConvertUTF16toUTF8 on the visible
 * success path.
 */
662 int int_WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr,
int cchWideChar,
663 LPSTR lpMultiByteStr,
int cbMultiByte, LPCSTR lpDefaultChar,
664 LPBOOL lpUsedDefaultChar)
666 size_t cbCharLen = (size_t)cchWideChar;
668 WINPR_UNUSED(dwFlags);
670 if ((cchWideChar == 0) || (cchWideChar < -1))
/* Check the pointer before _wcslen() dereferences it. */
676 WINPR_ASSERT(lpWideCharStr);
678 if (cchWideChar == -1)
680 const size_t len = _wcslen(lpWideCharStr);
/* Guard so that len + 1 still fits in an int. */
681 if (len >= INT32_MAX)
/* Count the terminating NUL as part of the input. */
683 cbCharLen = (int)len + 1;
686 cbCharLen = cchWideChar;
/* cbCharLen is a size_t but the callee takes an int; the range checks above keep it within INT32_MAX. */
693 return winpr_ConvertUTF16toUTF8((
const uint16_t*)lpWideCharStr, cbCharLen,
694 (uint8_t*)lpMultiByteStr, cbMultiByte);