41#include <winpr/wtypes.h>
42#include <winpr/string.h>
43#include <winpr/assert.h>
44#include <winpr/cast.h>
49#define TAG WINPR_TAG("unicode")
/* Code point limits and surrogate constants shared by the converters in this file. */
/* U+FFFD; the UTF-8 -> UTF-16 converter stores it where it does not take the strict-mode branch. */
60#define UNI_REPLACEMENT_CHAR (uint32_t)0x0000FFFD
/* Largest value the UTF-8 -> UTF-16 converter emits as a single 16-bit unit. */
61#define UNI_MAX_BMP (uint32_t)0x0000FFFF
/* Values above this are rejected or replaced by the UTF-8 -> UTF-16 converter. */
62#define UNI_MAX_UTF16 (uint32_t)0x0010FFFF
/* The two UTF-32 limits are not referenced in the visible part of this file. */
63#define UNI_MAX_UTF32 (uint32_t)0x7FFFFFFF
64#define UNI_MAX_LEGAL_UTF32 (uint32_t)0x0010FFFF
/* Shift applied to the high-surrogate part when combining or splitting a pair. */
80static const int halfShift = 10;
/* NOTE(review): no use of halfBase is visible in this excerpt; presumably the
 * 0x10000 bias of surrogate-pair encoding -- confirm against the full file. */
82static const uint32_t halfBase = 0x0010000UL;
/* Mask for the low 10 bits, used to derive the low surrogate. */
83static const uint32_t halfMask = 0x3FFUL;
/* Inclusive ranges of high (leading) and low (trailing) surrogate code units. */
85#define UNI_SUR_HIGH_START (uint32_t)0xD800
86#define UNI_SUR_HIGH_END (uint32_t)0xDBFF
87#define UNI_SUR_LOW_START (uint32_t)0xDC00
88#define UNI_SUR_LOW_END (uint32_t)0xDFFF
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence; the UTF-8 ->
 * UTF-16 converter reads it to learn how many additional bytes to consume.
 * As laid out below: 0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2,
 * 0xF0-0xF7 -> 3, 0xF8-0xFB -> 4, 0xFC-0xFF -> 5.
 * NOTE(review): the closing of this initializer is not visible in this excerpt.
 */
99static const char trailingBytesForUTF8[256] = {
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
107 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
/*
 * Indexed by the trailing-byte count of a UTF-8 sequence; the UTF-8 -> UTF-16
 * converter subtracts the entry from the accumulated value after reading the
 * sequence. NOTE(review): the accumulation step is not visible in this excerpt;
 * presumably these values cancel the lead/continuation marker bits that were
 * summed in unmasked -- confirm.
 */
115static const uint32_t offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
116 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
/*
 * Indexed by the number of bytes being written for one code point; the
 * UTF-16 -> UTF-8 converter ORs the entry into the first output byte.
 * Entries 0 and 1 add no bits.
 */
125static const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
/*
 * Maps one WCHAR to another WCHAR. The converters in this file pass every
 * UTF-16 unit they read or write through this helper.
 * NOTE(review): only fragments of the body are visible here. It is conditional
 * on __BIG_ENDIAN__ and reads the first byte of a `cnv` object, so it
 * presumably swaps the two bytes on big-endian builds -- confirm against the
 * full file.
 */
128static WCHAR setWcharFrom(WCHAR w)
130#if defined(__BIG_ENDIAN__)
138 const char c = cnv.c[0];
/*
 * Convert the UTF-16 units in [*sourceStart, sourceEnd) to UTF-8.
 *
 * NOTE(review): lines are missing from this excerpt (braces, else branches,
 * some assignments); the comments describe only what the visible statements
 * show and hedge the rest.
 *
 * sourceStart  in/out; read on entry, overwritten with the final read position.
 * sourceEnd    one past the last input unit.
 * targetStart  in/out; read on entry, overwritten with the final write position.
 * targetEnd    end of the output buffer; NULL selects the measuring mode.
 * flags        compared against strictConversion when a bad surrogate is seen.
 *
 * Result values assigned or returned in the visible code: conversionOK,
 * sourceIllegal, sourceExhausted, targetExhausted.
 */
159static ConversionResult winpr_ConvertUTF16toUTF8_Internal(
const uint16_t** sourceStart,
160 const uint16_t* sourceEnd,
161 uint8_t** targetStart,
162 const uint8_t* targetEnd,
163 ConversionFlags flags)
/* A NULL targetEnd means the output bound check further down is skipped. */
165 bool computeLength = (!targetEnd) ?
true : false;
166 const uint16_t* source = *sourceStart;
167 uint8_t* target = *targetStart;
168 ConversionResult result = conversionOK;
170 while (source < sourceEnd)
173 unsigned short bytesToWrite = 0;
/* (ch | byteMark) & byteMask sets bit 7 and clears bit 6, keeping the low six bits. */
174 const uint32_t byteMask = 0xBF;
175 const uint32_t byteMark = 0x80;
176 const uint16_t* oldSource =
/* Fetch the next unit through setWcharFrom and advance the read position. */
179 ch = setWcharFrom(*source++);
/* A high surrogate has to be paired with the following low surrogate. */
182 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
185 if (source < sourceEnd)
/* Peek at the next unit without consuming it yet. */
187 uint32_t ch2 = setWcharFrom(*source);
/* Combine the pair: high part shifted by halfShift plus the low part (the rest of the expression is not visible). */
190 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
192 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) +
/* Strict mode: a high surrogate not followed by a low surrogate is illegal input. */
196 else if (flags == strictConversion)
200 result = sourceIllegal;
/* Presumably the branch taken when the input ends right after a high surrogate -- confirm. */
208 result = sourceExhausted;
/* Strict mode: a low surrogate with no preceding high surrogate is illegal input. */
212 else if (flags == strictConversion)
215 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
218 result = sourceIllegal;
/* Range ladder on the code point; the assignments to bytesToWrite are not visible here. */
224 if (ch < (uint32_t)0x80)
228 else if (ch < (uint32_t)0x800)
232 else if (ch < (uint32_t)0x10000)
236 else if (ch < (uint32_t)0x110000)
/* Presumably the final else of the ladder: out-of-range values become U+FFFD. */
243 ch = UNI_REPLACEMENT_CHAR;
/* Move to the end of the sequence first; the bytes are stored back to front below. */
246 target += bytesToWrite;
/* Outside measuring mode, undo the advance and report a full output buffer. */
248 if ((target > targetEnd) && (!computeLength))
251 target -= bytesToWrite;
252 result = targetExhausted;
258 switch (bytesToWrite)
/* Trailing bytes are written with pre-decrement, last byte first. */
262 *--target = (uint8_t)((ch | byteMark) & byteMask);
267 *--target = (uint8_t)((ch | byteMark) & byteMask);
273 *--target = (uint8_t)((ch | byteMark) & byteMask);
/* The first byte carries the length-dependent marker bits. */
279 *--target = (uint8_t)(ch | firstByteMark[bytesToWrite]);
282 return sourceIllegal;
/* NOTE(review): the cases of this second switch are not visible; presumably the measuring-mode counterpart -- confirm. */
287 switch (bytesToWrite)
309 return sourceIllegal;
/* Step past the sequence again after the backwards stores. */
313 target += bytesToWrite;
/* Report the final read and write positions to the caller. */
316 *sourceStart = source;
317 *targetStart = target;
/*
 * Check a single UTF-8 sequence of `length` bytes starting at `source`.
 * The visible checks walk backwards from source + length over the trailing
 * bytes and then test the first byte.
 * NOTE(review): the switch structure and the return statements are not
 * visible in this excerpt; the comments below hedge accordingly.
 */
334static bool isLegalUTF8(
const uint8_t* source,
int length)
/* Start one past the last byte of the sequence. */
337 const uint8_t* srcptr = source + length;
/* These two tests fire when a trailing byte is outside 0x80..0xBF. */
346 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
352 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
/* Only the upper bound is tested here; a lower bound, if any, is not visible. */
358 if ((a = (*--srcptr)) > 0xBF)
/* Fires for first bytes 0x80..0xC1; presumably leads to a false return -- confirm. */
397 if (*source >= 0x80 && *source < 0xC2)
/*
 * Convert the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16 units.
 *
 * NOTE(review): lines are missing from this excerpt (braces, else branches,
 * the byte-accumulating switch body); the comments describe only what the
 * visible statements show and hedge the rest.
 *
 * sourceStart  in/out; read on entry, overwritten with the final read position.
 * sourceEnd    one past the last input byte.
 * targetStart  in/out; read on entry, overwritten with the final write position.
 * targetEnd    end of the output buffer; NULL selects the measuring mode.
 * flags        compared against strictConversion when an illegal value is decoded.
 *
 * Result values assigned or returned in the visible code: conversionOK,
 * sourceExhausted, sourceIllegal, targetExhausted.
 */
409static ConversionResult winpr_ConvertUTF8toUTF16_Internal(
const uint8_t** sourceStart,
410 const uint8_t* sourceEnd,
411 uint16_t** targetStart,
412 const uint16_t* targetEnd,
413 ConversionFlags flags)
/* A NULL targetEnd means the output bound checks further down are skipped. */
415 bool computeLength = (!targetEnd) ?
true : false;
416 ConversionResult result = conversionOK;
417 const uint8_t* source = *sourceStart;
418 uint16_t* target = *targetStart;
420 while (source < sourceEnd)
/* The first byte determines how many further bytes belong to this sequence. */
423 unsigned short extraBytesToRead =
424 WINPR_ASSERTING_INT_CAST(
unsigned short, trailingBytesForUTF8[*source]);
/* The sequence would run past the end of the input. */
426 if ((source + extraBytesToRead) >= sourceEnd)
428 result = sourceExhausted;
/* Validate the whole sequence (first byte plus trailing bytes) before decoding it. */
433 if (!isLegalUTF8(source, extraBytesToRead + 1))
435 result = sourceIllegal;
/* NOTE(review): the cases that accumulate the bytes into ch are not visible here. */
442 switch (extraBytesToRead)
478 return sourceIllegal;
/* Remove the per-length offset from the accumulated value. */
481 ch -= offsetsFromUTF8[extraBytesToRead];
/* No room for even one unit: rewind to the start of this sequence and stop. */
483 if ((target >= targetEnd) && (!computeLength))
485 source -= (extraBytesToRead + 1);
486 result = targetExhausted;
/* Values up to 0xFFFF are emitted as a single unit. */
490 if (ch <= UNI_MAX_BMP)
/* Surrogate values are not valid decoded code points. */
494 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
/* Strict mode rewinds to the start of the sequence and reports illegal input. */
496 if (flags == strictConversion)
498 source -= (extraBytesToRead + 1);
499 result = sourceIllegal;
/* Presumably the non-strict branch: substitute U+FFFD. */
505 *target++ = setWcharFrom(UNI_REPLACEMENT_CHAR);
513 *target++ = setWcharFrom((WCHAR)ch);
/* Values above 0x10FFFF are handled like the surrogate case above. */
518 else if (ch > UNI_MAX_UTF16)
520 if (flags == strictConversion)
522 result = sourceIllegal;
523 source -= (extraBytesToRead + 1);
529 *target++ = setWcharFrom(UNI_REPLACEMENT_CHAR);
/* The remaining values are written as two units, so room for both is checked. */
537 if ((target + 1 >= targetEnd) && (!computeLength))
539 source -= (extraBytesToRead + 1);
540 result = targetExhausted;
/* High unit from the bits above halfShift, low unit from the low 10 bits.
 * NOTE(review): a preceding adjustment of ch (e.g. by halfBase) is not visible -- confirm. */
548 *target++ = setWcharFrom((WCHAR)((ch >> halfShift) + UNI_SUR_HIGH_START));
549 *target++ = setWcharFrom((WCHAR)((ch & halfMask) + UNI_SUR_LOW_START));
/* Report the final read and write positions to the caller. */
559 *sourceStart = source;
560 *targetStart = target;
/*
 * Length-based wrapper around winpr_ConvertUTF8toUTF16_Internal; both visible
 * calls use strictConversion.
 *
 * src, cchSrc  input bytes and their count.
 * dst, cchDst  output buffer and its capacity in 16-bit units.
 *
 * Returns the number of units between the start and the final write position
 * when the conversion result is conversionOK, otherwise 0. When the result is
 * targetExhausted, the last error is set to ERROR_INSUFFICIENT_BUFFER.
 *
 * NOTE(review): the conditions that select between the branches below are not
 * visible in this excerpt.
 */
568static int winpr_ConvertUTF8toUTF16(
const uint8_t* src,
int cchSrc, uint16_t* dst,
int cchDst)
571 uint16_t* dstBeg = NULL;
572 uint16_t* dstEnd = NULL;
573 const uint8_t* srcBeg = NULL;
574 const uint8_t* srcEnd = NULL;
575 ConversionResult result = sourceIllegal;
/* Measure the input as a NUL-terminated string and include the terminator;
 * presumably only when the caller passed a sentinel length -- confirm. */
578 cchSrc = (int)strnlen((
const char*)src, INT32_MAX - 1) + 1;
581 srcEnd = &src[cchSrc];
/* First call site: paired with the subtraction from a null pointer below,
 * presumably the measuring path with dstEnd left NULL -- confirm. */
586 winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
/* NOTE(review): the length is derived by subtracting a null pointer, which
 * relies on pointer arithmetic that ISO C does not define; worth revisiting. */
588 length = dstBeg - (uint16_t*)NULL;
/* Second call site: converts into dst, bounded by cchDst units. */
593 dstEnd = &dst[cchDst];
596 winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
598 length = dstBeg - dst;
601 if (result == targetExhausted)
603 SetLastError(ERROR_INSUFFICIENT_BUFFER);
/* Any result other than conversionOK is reported as 0. */
607 return (result == conversionOK) ? WINPR_ASSERTING_INT_CAST(
int, length) : 0;
/*
 * Length-based wrapper around winpr_ConvertUTF16toUTF8_Internal; both visible
 * calls use strictConversion.
 *
 * src, cchSrc  input 16-bit units and their count.
 * dst, cchDst  output buffer and its capacity in bytes.
 *
 * Returns the number of bytes between the start and the final write position
 * when the conversion result is conversionOK, otherwise 0. When the result is
 * targetExhausted, the last error is set to ERROR_INSUFFICIENT_BUFFER.
 *
 * NOTE(review): the conditions that select between the branches below are not
 * visible in this excerpt.
 */
610static int winpr_ConvertUTF16toUTF8(
const uint16_t* src,
int cchSrc, uint8_t* dst,
int cchDst)
613 uint8_t* dstBeg = NULL;
614 uint8_t* dstEnd = NULL;
615 const uint16_t* srcBeg = NULL;
616 const uint16_t* srcEnd = NULL;
617 ConversionResult result = sourceIllegal;
/* Measure the input as a terminated wide string and include the terminator;
 * presumably only when the caller passed a sentinel length -- confirm. */
620 cchSrc = (int)_wcsnlen((
const WCHAR*)src, INT32_MAX - 1) + 1;
623 srcEnd = &src[cchSrc];
/* First call site: paired with the subtraction from a null pointer below,
 * presumably the measuring path with dstEnd left NULL -- confirm. */
628 winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
/* NOTE(review): the length is derived by subtracting a null pointer, which
 * relies on pointer arithmetic that ISO C does not define; worth revisiting. */
630 length = dstBeg - ((uint8_t*)NULL);
/* Second call site: converts into dst, bounded by cchDst bytes. */
635 dstEnd = &dst[cchDst];
638 winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
640 length = dstBeg - dst;
643 if (result == targetExhausted)
645 SetLastError(ERROR_INSUFFICIENT_BUFFER);
/* Any result other than conversionOK is reported as 0. */
649 return (result == conversionOK) ? WINPR_ASSERTING_INT_CAST(
int, length) : 0;
/*
 * Multi-byte to wide-character conversion entry point; the visible code
 * forwards to winpr_ConvertUTF8toUTF16.
 *
 * CodePage        reported in the "Unsupported encoding" error log; the check
 *                 that selects the accepted code pages is not visible here.
 * dwFlags         marked unused.
 * lpMultiByteStr  input string; asserted non-NULL.
 * cbMultiByte     input length in bytes; 0 and values below -1 are singled out
 *                 by the first check.
 * lpWideCharStr, cchWideChar  output buffer and its capacity, passed through.
 *
 * Returns whatever winpr_ConvertUTF8toUTF16 returns on the visible path.
 * NOTE(review): the early-return statements are not visible in this excerpt.
 */
654int int_MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr,
int cbMultiByte,
655 LPWSTR lpWideCharStr,
int cchWideChar)
657 size_t cbCharLen = (size_t)cbMultiByte;
659 WINPR_UNUSED(dwFlags);
/* Reject a zero length and any negative length other than -1. */
662 if ((cbMultiByte == 0) || (cbMultiByte < -1))
/* Presumably the branch for a negative cbMultiByte: measure the string and
 * count its terminator, guarding the later conversion to int. */
670 const size_t len = strlen(lpMultiByteStr);
671 if (len >= INT32_MAX)
673 cbCharLen = (int)len + 1;
676 cbCharLen = cbMultiByte;
/* NOTE(review): this assertion comes after strlen(lpMultiByteStr) above, so a
 * NULL input is dereferenced before it can be caught. int_WideCharToMultiByte
 * asserts before measuring; this function should presumably do the same. */
678 WINPR_ASSERT(lpMultiByteStr);
686 WLog_ERR(TAG,
"Unsupported encoding %u", CodePage);
/* Hand over to the UTF-8 -> UTF-16 wrapper with the resolved input length. */
690 return winpr_ConvertUTF8toUTF16((
const uint8_t*)lpMultiByteStr,
691 WINPR_ASSERTING_INT_CAST(
int, cbCharLen),
692 (uint16_t*)lpWideCharStr, cchWideChar);
/*
 * Wide-character to multi-byte conversion entry point; the visible code
 * forwards to winpr_ConvertUTF16toUTF8.
 *
 * CodePage        not referenced in the visible lines.
 * dwFlags         marked unused.
 * lpWideCharStr   input string; asserted non-NULL before it is measured.
 * cchWideChar     input length in units; -1 means "measure the terminated
 *                 string"; 0 and values below -1 are singled out by the first
 *                 check.
 * lpMultiByteStr, cbMultiByte  output buffer and its capacity, passed through.
 * lpDefaultChar, lpUsedDefaultChar  not referenced in the visible lines.
 *
 * Returns whatever winpr_ConvertUTF16toUTF8 returns on the visible path.
 * NOTE(review): the early-return statements are not visible in this excerpt.
 */
695int int_WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr,
int cchWideChar,
696 LPSTR lpMultiByteStr,
int cbMultiByte, LPCSTR lpDefaultChar,
697 LPBOOL lpUsedDefaultChar)
699 size_t cbCharLen = (size_t)cchWideChar;
701 WINPR_UNUSED(dwFlags);
/* Reject a zero length and any negative length other than -1. */
703 if ((cchWideChar == 0) || (cchWideChar < -1))
709 WINPR_ASSERT(lpWideCharStr);
/* -1: measure the string and count its terminator, guarding the later conversion to int. */
711 if (cchWideChar == -1)
713 const size_t len = _wcslen(lpWideCharStr);
714 if (len >= INT32_MAX)
716 cbCharLen = (int)len + 1;
719 cbCharLen = cchWideChar;
/* Hand over to the UTF-16 -> UTF-8 wrapper with the resolved input length. */
726 return winpr_ConvertUTF16toUTF8((
const uint16_t*)lpWideCharStr,
727 WINPR_ASSERTING_INT_CAST(
int, cbCharLen),
728 (uint8_t*)lpMultiByteStr, cbMultiByte);