FreeRDP
unicode_builtin.c
1 /*
2  * Copyright 2001-2004 Unicode, Inc.
3  *
4  * Disclaimer
5  *
6  * This source code is provided as is by Unicode, Inc. No claims are
7  * made as to fitness for any particular purpose. No warranties of any
8  * kind are expressed or implied. The recipient agrees to determine
9  * applicability of information provided. If this file has been
10  * purchased on magnetic or optical media from Unicode, Inc., the
11  * sole remedy for any claim will be exchange of defective media
12  * within 90 days of receipt.
13  *
14  * Limitations on Rights to Redistribute This Code
15  *
16  * Unicode, Inc. hereby grants the right to freely use the information
17  * supplied in this file in the creation of products supporting the
18  * Unicode Standard, and to make copies of this file in any form
19  * for internal or external distribution as long as this notice
20  * remains attached.
21  */
22 
23 /* ---------------------------------------------------------------------
24 
25 Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26 Author: Mark E. Davis, 1994.
27 Rev History: Rick McGowan, fixes & updates May 2001.
28 Sept 2001: fixed const & error conditions per
29 mods suggested by S. Parent & A. Lillich.
30 June 2002: Tim Dodd added detection and handling of incomplete
31 source sequences, enhanced error detection, added casts
32 to eliminate compiler warnings.
33 July 2003: slight mods to back out aggressive FFFE detection.
34 Jan 2004: updated switches in from-UTF8 conversions.
35 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36 
37 See the header file "utf.h" for complete documentation.
38 
39 ------------------------------------------------------------------------ */
40 
41 #include <winpr/wtypes.h>
42 #include <winpr/string.h>
43 #include <winpr/assert.h>
44 
45 #include "unicode.h"
46 
47 #include "../log.h"
48 #define TAG WINPR_TAG("unicode")
49 
50 /*
51  * Character Types:
52  *
53  * UTF8: uint8_t 8 bits
54  * UTF16: uint16_t 16 bits
55  * UTF32: uint32_t 32 bits
56  */
57 
/*
 * Fundamental code point limits.
 *
 * Every expansion is fully parenthesized so the macros behave as a single
 * primary expression in any context (e.g. as the operand of sizeof).
 */
#define UNI_REPLACEMENT_CHAR ((uint32_t)0x0000FFFD) /* U+FFFD REPLACEMENT CHARACTER */
#define UNI_MAX_BMP ((uint32_t)0x0000FFFF)          /* last code point of the BMP */
#define UNI_MAX_UTF16 ((uint32_t)0x0010FFFF)        /* largest value encodable in UTF-16 */
#define UNI_MAX_UTF32 ((uint32_t)0x7FFFFFFF)        /* largest 31-bit value */
#define UNI_MAX_LEGAL_UTF32 ((uint32_t)0x0010FFFF)  /* largest legal Unicode code point */
64 
/* Outcome of a buffer conversion performed by the *_Internal converters. */
typedef enum
{
	conversionOK = 0,    /* the whole source was converted */
	sourceExhausted = 1, /* source ended in the middle of a character */
	targetExhausted = 2, /* target buffer too small for the next character */
	sourceIllegal = 3    /* source contains an illegal/malformed sequence */
} ConversionResult;
72 
/* Selects how the converters react to illegal code points in the source. */
typedef enum
{
	strictConversion = 0, /* stop and report sourceIllegal */
	lenientConversion = 1 /* do not stop on illegal values */
} ConversionFlags;
78 
/* Surrogate pair arithmetic: each surrogate carries 10 payload bits. */
static const int halfShift = 10;

/* First code point encoded with a surrogate pair (U+10000). */
static const uint32_t halfBase = 0x10000U;
/* Mask selecting the low 10 payload bits. */
static const uint32_t halfMask = 0x3FFU;
83 
/*
 * UTF-16 surrogate ranges. High (leading) surrogates are 0xD800..0xDBFF,
 * low (trailing) surrogates are 0xDC00..0xDFFF.
 *
 * Expansions are fully parenthesized so each macro is a single primary
 * expression regardless of the surrounding operators.
 */
#define UNI_SUR_HIGH_START ((uint32_t)0xD800)
#define UNI_SUR_HIGH_END ((uint32_t)0xDBFF)
#define UNI_SUR_LOW_START ((uint32_t)0xDC00)
#define UNI_SUR_LOW_END ((uint32_t)0xDFFF)
88 
89 /* --------------------------------------------------------------------- */
90 
/*
 * Indexed by the first byte of a UTF-8 sequence; yields the number of
 * trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 never has 4 or 5 trailing bytes. Those entries
 * (lead bytes 0xF8..0xFF) are kept for anyone who wants to handle the
 * longer forms that earlier algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
	/* 0x00 - 0xBF: no trailing bytes */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 */
	/* 0xC0 - 0xDF: one trailing byte */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0 */
	/* 0xE0 - 0xEF: two trailing bytes */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xE0 */
	/* 0xF0 - 0xF7: three, 0xF8 - 0xFB: four, 0xFC - 0xFF: five */
	3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 /* 0xF0 */
};
108 
/*
 * Magic values subtracted from the accumulated value during UTF-8 decoding,
 * indexed by the number of trailing bytes (0..5).
 * The decoder sums the raw bytes (shifting by 6 between them), so the marker
 * bits of the lead byte and of every continuation byte end up in the sum;
 * e.g. for one trailing byte the excess is (0xC0 << 6) + 0x80 = 0x3080.
 */
static const uint32_t offsetsFromUTF8[6] = {
	0x00000000UL, /* 0 trailing bytes */
	0x00003080UL, /* 1 trailing byte  */
	0x000E2080UL, /* 2 trailing bytes */
	0x03C82080UL, /* 3 trailing bytes */
	0xFA082080UL, /* 4 trailing bytes (not legal UTF-8) */
	0x82082080UL  /* 5 trailing bytes (not legal UTF-8) */
};
116 
/*
 * Once the bits are split out into bytes of UTF-8, this is the mask OR-ed
 * into the first byte. The table is indexed by the total number of bytes in
 * the sequence (index 0 is a placeholder), so there is one entry per UTF-8
 * sequence type (one byte sequence, two byte... etc.). Remember that
 * sequences for *legal* UTF-8 will be 4 or fewer bytes total.
 */
static const uint8_t firstByteMark[7] = {
	0x00, /* unused */
	0x00, /* 1 byte  */
	0xC0, /* 2 bytes */
	0xE0, /* 3 bytes */
	0xF0, /* 4 bytes */
	0xF8, /* 5 bytes */
	0xFC  /* 6 bytes */
};
125 
126 /* --------------------------------------------------------------------- */
127 
128 /* The interface converts a whole buffer to avoid function-call overhead.
129  * Constants have been gathered. Loops & conditionals have been removed as
130  * much as possible for efficiency, in favor of drop-through switches.
131  * (See "Note A" at the bottom of the file for equivalent code.)
132  * If your compiler supports it, the "isLegalUTF8" call can be turned
133  * into an inline function.
134  */
135 
136 /* --------------------------------------------------------------------- */
137 
138 static ConversionResult winpr_ConvertUTF16toUTF8_Internal(const uint16_t** sourceStart,
139  const uint16_t* sourceEnd,
140  uint8_t** targetStart, uint8_t* targetEnd,
141  ConversionFlags flags)
142 {
143  bool computeLength = (!targetEnd) ? true : false;
144  const uint16_t* source = *sourceStart;
145  uint8_t* target = *targetStart;
146  ConversionResult result = conversionOK;
147 
148  while (source < sourceEnd)
149  {
150  uint32_t ch = 0;
151  unsigned short bytesToWrite = 0;
152  const uint32_t byteMask = 0xBF;
153  const uint32_t byteMark = 0x80;
154  const uint16_t* oldSource =
155  source; /* In case we have to back up because of target overflow. */
156 
157  ch = *source++;
158 
159  /* If we have a surrogate pair, convert to UTF32 first. */
160  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
161  {
162  /* If the 16 bits following the high surrogate are in the source buffer... */
163  if (source < sourceEnd)
164  {
165  uint32_t ch2 = *source;
166 
167  /* If it's a low surrogate, convert to UTF32. */
168  if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
169  {
170  ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) +
171  halfBase;
172  ++source;
173  }
174  else if (flags == strictConversion)
175  {
176  /* it's an unpaired high surrogate */
177  --source; /* return to the illegal value itself */
178  result = sourceIllegal;
179  break;
180  }
181  }
182  else
183  {
184  /* We don't have the 16 bits following the high surrogate. */
185  --source; /* return to the high surrogate */
186  result = sourceExhausted;
187  break;
188  }
189  }
190  else if (flags == strictConversion)
191  {
192  /* UTF-16 surrogate values are illegal in UTF-32 */
193  if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
194  {
195  --source; /* return to the illegal value itself */
196  result = sourceIllegal;
197  break;
198  }
199  }
200 
201  /* Figure out how many bytes the result will require */
202  if (ch < (uint32_t)0x80)
203  {
204  bytesToWrite = 1;
205  }
206  else if (ch < (uint32_t)0x800)
207  {
208  bytesToWrite = 2;
209  }
210  else if (ch < (uint32_t)0x10000)
211  {
212  bytesToWrite = 3;
213  }
214  else if (ch < (uint32_t)0x110000)
215  {
216  bytesToWrite = 4;
217  }
218  else
219  {
220  bytesToWrite = 3;
221  ch = UNI_REPLACEMENT_CHAR;
222  }
223 
224  target += bytesToWrite;
225 
226  if ((target > targetEnd) && (!computeLength))
227  {
228  source = oldSource; /* Back up source pointer! */
229  target -= bytesToWrite;
230  result = targetExhausted;
231  break;
232  }
233 
234  if (!computeLength)
235  {
236  switch (bytesToWrite)
237  {
238  /* note: everything falls through. */
239  case 4:
240  *--target = (uint8_t)((ch | byteMark) & byteMask);
241  ch >>= 6;
242  /* fallthrough */
243  WINPR_FALLTHROUGH
244  case 3:
245  *--target = (uint8_t)((ch | byteMark) & byteMask);
246  ch >>= 6;
247  /* fallthrough */
248  WINPR_FALLTHROUGH
249 
250  case 2:
251  *--target = (uint8_t)((ch | byteMark) & byteMask);
252  ch >>= 6;
253  /* fallthrough */
254  WINPR_FALLTHROUGH
255 
256  case 1:
257  *--target = (uint8_t)(ch | firstByteMark[bytesToWrite]);
258  }
259  }
260  else
261  {
262  switch (bytesToWrite)
263  {
264  /* note: everything falls through. */
265  case 4:
266  --target;
267  /* fallthrough */
268  WINPR_FALLTHROUGH
269 
270  case 3:
271  --target;
272  /* fallthrough */
273  WINPR_FALLTHROUGH
274 
275  case 2:
276  --target;
277  /* fallthrough */
278  WINPR_FALLTHROUGH
279 
280  case 1:
281  --target;
282  }
283  }
284 
285  target += bytesToWrite;
286  }
287 
288  *sourceStart = source;
289  *targetStart = target;
290  return result;
291 }
292 
293 /* --------------------------------------------------------------------- */
294 
/*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 * This must be called with the length pre-determined by the first byte.
 * If not calling this from ConvertUTF8to*, then the length can be set by:
 *  length = trailingBytesForUTF8[*source]+1;
 * and the sequence is illegal right away if there aren't that many bytes
 * available.
 *
 * source  first byte of the sequence; `length` bytes must be readable.
 * length  total number of bytes in the sequence.
 *
 * Returns false for any length outside 1..4 (the Unicode definition of
 * UTF-8 goes up to 4-byte sequences), for bad continuation bytes, overlong
 * forms, encoded surrogates (0xED 0xA0..0xBF) and values above U+10FFFF.
 */
static bool isLegalUTF8(const uint8_t* source, int length)
{
	if ((length < 1) || (length > 4))
		return false;

	const uint8_t lead = source[0];

	/* Third and fourth byte are plain continuation bytes (0x80..0xBF). */
	for (int idx = length - 1; idx >= 2; idx--)
	{
		const uint8_t trail = source[idx];

		if ((trail < 0x80) || (trail > 0xBF))
			return false;
	}

	if (length >= 2)
	{
		/*
		 * The second byte is a continuation byte as well, but some lead
		 * bytes narrow its range. Both bounds are always enforced so that
		 * e.g. 0xED 0x41 or 0xF4 0x00 can not slip through.
		 */
		const uint8_t second = source[1];
		uint8_t lowest = 0x80;
		uint8_t highest = 0xBF;

		switch (lead)
		{
			case 0xE0: /* exclude overlong 3-byte forms */
				lowest = 0xA0;
				break;

			case 0xED: /* exclude UTF-16 surrogates U+D800..U+DFFF */
				highest = 0x9F;
				break;

			case 0xF0: /* exclude overlong 4-byte forms */
				lowest = 0x90;
				break;

			case 0xF4: /* exclude values above U+10FFFF */
				highest = 0x8F;
				break;

			default:
				break;
		}

		if ((second < lowest) || (second > highest))
			return false;
	}

	/* 0x80..0xBF are continuation bytes, 0xC0/0xC1 only start overlong forms. */
	if ((lead >= 0x80) && (lead < 0xC2))
		return false;

	/* Lead bytes above 0xF4 would encode values beyond U+10FFFF. */
	return lead <= 0xF4;
}
378 
379 /* --------------------------------------------------------------------- */
380 
381 static ConversionResult winpr_ConvertUTF8toUTF16_Internal(const uint8_t** sourceStart,
382  const uint8_t* sourceEnd,
383  uint16_t** targetStart,
384  uint16_t* targetEnd,
385  ConversionFlags flags)
386 {
387  bool computeLength = (!targetEnd) ? true : false;
388  ConversionResult result = conversionOK;
389  const uint8_t* source = *sourceStart;
390  uint16_t* target = *targetStart;
391 
392  while (source < sourceEnd)
393  {
394  uint32_t ch = 0;
395  unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
396 
397  if ((source + extraBytesToRead) >= sourceEnd)
398  {
399  result = sourceExhausted;
400  break;
401  }
402 
403  /* Do this check whether lenient or strict */
404  if (!isLegalUTF8(source, extraBytesToRead + 1))
405  {
406  result = sourceIllegal;
407  break;
408  }
409 
410  /*
411  * The cases all fall through. See "Note A" below.
412  */
413  switch (extraBytesToRead)
414  {
415  case 5:
416  ch += *source++;
417  ch <<= 6; /* remember, illegal UTF-8 */
418  /* fallthrough */
419  WINPR_FALLTHROUGH
420 
421  case 4:
422  ch += *source++;
423  ch <<= 6; /* remember, illegal UTF-8 */
424  /* fallthrough */
425  WINPR_FALLTHROUGH
426 
427  case 3:
428  ch += *source++;
429  ch <<= 6;
430  /* fallthrough */
431  WINPR_FALLTHROUGH
432 
433  case 2:
434  ch += *source++;
435  ch <<= 6;
436  /* fallthrough */
437  WINPR_FALLTHROUGH
438 
439  case 1:
440  ch += *source++;
441  ch <<= 6;
442  /* fallthrough */
443  WINPR_FALLTHROUGH
444 
445  case 0:
446  ch += *source++;
447  }
448 
449  ch -= offsetsFromUTF8[extraBytesToRead];
450 
451  if ((target >= targetEnd) && (!computeLength))
452  {
453  source -= (extraBytesToRead + 1); /* Back up source pointer! */
454  result = targetExhausted;
455  break;
456  }
457 
458  if (ch <= UNI_MAX_BMP)
459  {
460  /* Target is a character <= 0xFFFF */
461  /* UTF-16 surrogate values are illegal in UTF-32 */
462  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
463  {
464  if (flags == strictConversion)
465  {
466  source -= (extraBytesToRead + 1); /* return to the illegal value itself */
467  result = sourceIllegal;
468  break;
469  }
470  else
471  {
472  if (!computeLength)
473  *target++ = UNI_REPLACEMENT_CHAR;
474  else
475  target++;
476  }
477  }
478  else
479  {
480  if (!computeLength)
481  *target++ = (uint16_t)ch; /* normal case */
482  else
483  target++;
484  }
485  }
486  else if (ch > UNI_MAX_UTF16)
487  {
488  if (flags == strictConversion)
489  {
490  result = sourceIllegal;
491  source -= (extraBytesToRead + 1); /* return to the start */
492  break; /* Bail out; shouldn't continue */
493  }
494  else
495  {
496  if (!computeLength)
497  *target++ = UNI_REPLACEMENT_CHAR;
498  else
499  target++;
500  }
501  }
502  else
503  {
504  /* target is a character in range 0xFFFF - 0x10FFFF. */
505  if ((target + 1 >= targetEnd) && (!computeLength))
506  {
507  source -= (extraBytesToRead + 1); /* Back up source pointer! */
508  result = targetExhausted;
509  break;
510  }
511 
512  ch -= halfBase;
513 
514  if (!computeLength)
515  {
516  *target++ = (uint16_t)((ch >> halfShift) + UNI_SUR_HIGH_START);
517  *target++ = (uint16_t)((ch & halfMask) + UNI_SUR_LOW_START);
518  }
519  else
520  {
521  target++;
522  target++;
523  }
524  }
525  }
526 
527  *sourceStart = source;
528  *targetStart = target;
529  return result;
530 }
531 
536 static int winpr_ConvertUTF8toUTF16(const uint8_t* src, int cchSrc, uint16_t* dst, int cchDst)
537 {
538  size_t length = 0;
539  uint16_t* dstBeg = NULL;
540  uint16_t* dstEnd = NULL;
541  const uint8_t* srcBeg = NULL;
542  const uint8_t* srcEnd = NULL;
543  ConversionResult result = sourceIllegal;
544 
545  if (cchSrc == -1)
546  cchSrc = (int)strnlen((const char*)src, INT32_MAX - 1) + 1;
547 
548  srcBeg = src;
549  srcEnd = &src[cchSrc];
550 
551  if (cchDst == 0)
552  {
553  result =
554  winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
555 
556  length = dstBeg - (uint16_t*)NULL;
557  }
558  else
559  {
560  dstBeg = dst;
561  dstEnd = &dst[cchDst];
562 
563  result =
564  winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
565 
566  length = dstBeg - dst;
567  }
568 
569  if (result == targetExhausted)
570  {
571  SetLastError(ERROR_INSUFFICIENT_BUFFER);
572  return 0;
573  }
574 
575  return (result == conversionOK) ? length : 0;
576 }
577 
578 static int winpr_ConvertUTF16toUTF8(const uint16_t* src, int cchSrc, uint8_t* dst, int cchDst)
579 {
580  size_t length = 0;
581  uint8_t* dstBeg = NULL;
582  uint8_t* dstEnd = NULL;
583  const uint16_t* srcBeg = NULL;
584  const uint16_t* srcEnd = NULL;
585  ConversionResult result = sourceIllegal;
586 
587  if (cchSrc == -1)
588  cchSrc = (int)_wcsnlen((const WCHAR*)src, INT32_MAX - 1) + 1;
589 
590  srcBeg = src;
591  srcEnd = &src[cchSrc];
592 
593  if (cchDst == 0)
594  {
595  result =
596  winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
597 
598  length = dstBeg - ((uint8_t*)NULL);
599  }
600  else
601  {
602  dstBeg = dst;
603  dstEnd = &dst[cchDst];
604 
605  result =
606  winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
607 
608  length = dstBeg - dst;
609  }
610 
611  if (result == targetExhausted)
612  {
613  SetLastError(ERROR_INSUFFICIENT_BUFFER);
614  return 0;
615  }
616 
617  return (result == conversionOK) ? length : 0;
618 }
619 
620 /* --------------------------------------------------------------------- */
621 
622 int int_MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte,
623  LPWSTR lpWideCharStr, int cchWideChar)
624 {
625  size_t cbCharLen = (size_t)cbMultiByte;
626 
627  WINPR_UNUSED(dwFlags);
628 
629  /* If cbMultiByte is 0, the function fails */
630  if ((cbMultiByte == 0) || (cbMultiByte < -1))
631  return 0;
632 
633  if (cchWideChar < 0)
634  return -1;
635 
636  if (cbMultiByte < 0)
637  {
638  const size_t len = strlen(lpMultiByteStr);
639  if (len >= INT32_MAX)
640  return 0;
641  cbCharLen = (int)len + 1;
642  }
643  else
644  cbCharLen = cbMultiByte;
645 
646  WINPR_ASSERT(lpMultiByteStr);
647  switch (CodePage)
648  {
649  case CP_ACP:
650  case CP_UTF8:
651  break;
652 
653  default:
654  WLog_ERR(TAG, "Unsupported encoding %u", CodePage);
655  return 0;
656  }
657 
658  return winpr_ConvertUTF8toUTF16((const uint8_t*)lpMultiByteStr, cbCharLen,
659  (uint16_t*)lpWideCharStr, cchWideChar);
660 }
661 
662 int int_WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
663  LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
664  LPBOOL lpUsedDefaultChar)
665 {
666  size_t cbCharLen = (size_t)cchWideChar;
667 
668  WINPR_UNUSED(dwFlags);
669  /* If cchWideChar is 0, the function fails */
670  if ((cchWideChar == 0) || (cchWideChar < -1))
671  return 0;
672 
673  if (cbMultiByte < 0)
674  return -1;
675 
676  WINPR_ASSERT(lpWideCharStr);
677  /* If cchWideChar is -1, the string is null-terminated */
678  if (cchWideChar == -1)
679  {
680  const size_t len = _wcslen(lpWideCharStr);
681  if (len >= INT32_MAX)
682  return 0;
683  cbCharLen = (int)len + 1;
684  }
685  else
686  cbCharLen = cchWideChar;
687 
688  /*
689  * if cbMultiByte is 0, the function returns the required buffer size
690  * in bytes for lpMultiByteStr and makes no use of the output parameter itself.
691  */
692 
693  return winpr_ConvertUTF16toUTF8((const uint16_t*)lpWideCharStr, cbCharLen,
694  (uint8_t*)lpMultiByteStr, cbMultiByte);
695 }