FreeRDP
unicode_builtin.c
1 /*
2  * Copyright 2001-2004 Unicode, Inc.
3  *
4  * Disclaimer
5  *
6  * This source code is provided as is by Unicode, Inc. No claims are
7  * made as to fitness for any particular purpose. No warranties of any
8  * kind are expressed or implied. The recipient agrees to determine
9  * applicability of information provided. If this file has been
10  * purchased on magnetic or optical media from Unicode, Inc., the
11  * sole remedy for any claim will be exchange of defective media
12  * within 90 days of receipt.
13  *
14  * Limitations on Rights to Redistribute This Code
15  *
16  * Unicode, Inc. hereby grants the right to freely use the information
17  * supplied in this file in the creation of products supporting the
18  * Unicode Standard, and to make copies of this file in any form
19  * for internal or external distribution as long as this notice
20  * remains attached.
21  */
22 
23 /* ---------------------------------------------------------------------
24 
25 Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26 Author: Mark E. Davis, 1994.
27 Rev History: Rick McGowan, fixes & updates May 2001.
28 Sept 2001: fixed const & error conditions per
29 mods suggested by S. Parent & A. Lillich.
30 June 2002: Tim Dodd added detection and handling of incomplete
31 source sequences, enhanced error detection, added casts
32 to eliminate compiler warnings.
33 July 2003: slight mods to back out aggressive FFFE detection.
34 Jan 2004: updated switches in from-UTF8 conversions.
35 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36 
37 See the header file "utf.h" for complete documentation.
38 
39 ------------------------------------------------------------------------ */
40 
41 #include <winpr/wtypes.h>
42 #include <winpr/string.h>
43 #include <winpr/assert.h>
44 #include <winpr/cast.h>
45 
46 #include "unicode.h"
47 
48 #include "../log.h"
49 #define TAG WINPR_TAG("unicode")
50 
51 /*
52  * Character Types:
53  *
54  * UTF8: uint8_t 8 bits
55  * UTF16: uint16_t 16 bits
56  * UTF32: uint32_t 32 bits
57  */
58 
59 /* Some fundamental constants */
/*
 * Code point limits used by the converters below.
 *
 * Every replacement list is fully parenthesized so that the cast and its
 * operand always expand as a single primary expression (for example as the
 * operand of sizeof or next to a postfix operator).
 */
#define UNI_REPLACEMENT_CHAR ((uint32_t)0x0000FFFD) /* U+FFFD, written in place of bad input */
#define UNI_MAX_BMP ((uint32_t)0x0000FFFF)          /* largest value that fits one UTF-16 unit */
#define UNI_MAX_UTF16 ((uint32_t)0x0010FFFF)        /* largest value a surrogate pair can encode */
#define UNI_MAX_UTF32 ((uint32_t)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((uint32_t)0x0010FFFF)
65 
/*
 * Outcome reported by the internal buffer converters in this file.
 * The public wrappers map everything except conversionOK to a return value
 * of 0; targetExhausted additionally sets ERROR_INSUFFICIENT_BUFFER.
 */
typedef enum
{
	conversionOK,    /* the whole source range was converted */
	sourceExhausted, /* source ends in the middle of a multi-unit character */
	targetExhausted, /* not enough room left in the target buffer */
	sourceIllegal    /* source sequence is illegal/malformed */
} ConversionResult;
73 
/*
 * Strictness selector for the internal converters.
 * With strictConversion, unpaired surrogates make the converters stop with
 * sourceIllegal; with lenientConversion those branches are skipped and the
 * value is passed through or replaced by UNI_REPLACEMENT_CHAR instead.
 */
typedef enum
{
	strictConversion = 0,
	lenientConversion
} ConversionFlags;
79 
/*
 * Surrogate pair arithmetic: a supplementary code point cp is split as
 *   high = ((cp - halfBase) >> halfShift) + UNI_SUR_HIGH_START
 *   low  = ((cp - halfBase) &  halfMask)  + UNI_SUR_LOW_START
 */
static const int halfShift = 10; /* each surrogate carries 10 payload bits */

static const uint32_t halfBase = 0x00010000UL; /* first code point above the BMP */
static const uint32_t halfMask = 0x3FFUL;      /* low 10 bits */

/* Surrogate ranges; replacement lists are parenthesized so each expands as one expression. */
#define UNI_SUR_HIGH_START ((uint32_t)0xD800)
#define UNI_SUR_HIGH_END ((uint32_t)0xDBFF)
#define UNI_SUR_LOW_START ((uint32_t)0xDC00)
#define UNI_SUR_LOW_END ((uint32_t)0xDFFF)
89 
90 /* --------------------------------------------------------------------- */
91 
92 /*
93  * Index into the table below with the first byte of a UTF-8 sequence to
94  * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The
 * table is left as-is for anyone who may want to do such conversion, which
 * was allowed in earlier algorithms.
98  */
/* Number of bytes expected to follow a given first byte; 32 entries per row. */
static const char trailingBytesForUTF8[256] = {
	/* 0x00 - 0x1F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x3F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x40 - 0x5F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 - 0x7F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 - 0x9F: continuation bytes, listed as 0 here and rejected by isLegalUTF8 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xA0 - 0xBF: continuation bytes, same treatment */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0 - 0xDF: one trailing byte */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 - 0xEF: two, 0xF0 - 0xF7: three, 0xF8 - 0xFB: four, 0xFC - 0xFF: five */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
109 
110 /*
111  * Magic values subtracted from a buffer value during UTF8 conversion.
112  * This table contains as many values as there might be trailing bytes
113  * in a UTF-8 sequence.
114  */
/*
 * Indexed by the number of trailing bytes. The UTF-8 decoder sums the raw
 * bytes (shifting left by 6 between them) and then subtracts the entry to
 * strip the marker bits, e.g.
 *   [1] 0x00003080 = (0xC0 << 6) + 0x80
 *   [2] 0x000E2080 = (0xE0 << 12) + (0x80 << 6) + 0x80
 *   [3] 0x03C82080 = (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80
 * Entries [4] and [5] belong to the 5- and 6-byte forms.
 */
static const uint32_t offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
	                                         0x03C82080UL, 0xFA082080UL, 0x82082080UL };
117 
118 /*
119  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
120  * into the first byte, depending on how many bytes follow. There are
121  * as many entries in this table as there are UTF-8 sequence types.
122  * (I.e., one byte sequence, two byte... etc.). Remember that sequence
123  * for *legal* UTF-8 will be 4 or fewer bytes total.
124  */
/*
 * Indexed by the total number of bytes in the encoded sequence (the UTF-8
 * encoder uses its bytesToWrite value); entry [0] is a placeholder and a
 * 1-byte sequence gets no marker bits.
 */
static const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
126 
127 /* --------------------------------------------------------------------- */
128 
129 /* The interface converts a whole buffer to avoid function-call overhead.
130  * Constants have been gathered. Loops & conditionals have been removed as
131  * much as possible for efficiency, in favor of drop-through switches.
132  * (See "Note A" at the bottom of the file for equivalent code.)
133  * If your compiler supports it, the "isLegalUTF8" call can be turned
134  * into an inline function.
135  */
136 
137 /* --------------------------------------------------------------------- */
138 
/*
 * Convert the UTF-16 code units in [*sourceStart, sourceEnd) to UTF-8.
 *
 * sourceStart  in: first unit to convert; out: first unit NOT consumed.
 * sourceEnd    one past the last source unit.
 * targetStart  in: first output byte; out: one past the last byte produced.
 * targetEnd    one past the end of the output buffer. When NULL the function
 *              only counts: nothing is stored and *targetStart is merely
 *              advanced by the number of bytes that would have been written.
 * flags        strictConversion makes unpaired surrogates an error; otherwise
 *              they are encoded as-is (as a 3-byte sequence).
 *
 * Returns conversionOK when the whole range was consumed, sourceExhausted
 * when the range ends on a high surrogate, sourceIllegal on an unpaired
 * surrogate in strict mode, and targetExhausted when the next character
 * does not fit. On every error the source pointer is left on the offending
 * unit so the caller can see where conversion stopped.
 */
static ConversionResult winpr_ConvertUTF16toUTF8_Internal(const uint16_t** sourceStart,
                                                          const uint16_t* sourceEnd,
                                                          uint8_t** targetStart, uint8_t* targetEnd,
                                                          ConversionFlags flags)
{
	/* A NULL end pointer selects the counting-only mode. */
	bool computeLength = (!targetEnd) ? true : false;
	const uint16_t* source = *sourceStart;
	uint8_t* target = *targetStart;
	ConversionResult result = conversionOK;

	while (source < sourceEnd)
	{
		uint32_t ch = 0;
		unsigned short bytesToWrite = 0;
		/* (x | byteMark) & byteMask yields a continuation byte 10xxxxxx. */
		const uint32_t byteMask = 0xBF;
		const uint32_t byteMark = 0x80;
		const uint16_t* oldSource =
		    source; /* In case we have to back up because of target overflow. */

		ch = *source++;

		/* If we have a surrogate pair, convert to UTF32 first. */
		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
		{
			/* If the 16 bits following the high surrogate are in the source buffer... */
			if (source < sourceEnd)
			{
				uint32_t ch2 = *source;

				/* If it's a low surrogate, convert to UTF32. */
				if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
				{
					ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) +
					     halfBase;
					++source;
				}
				else if (flags == strictConversion)
				{
					/* it's an unpaired high surrogate */
					--source; /* return to the illegal value itself */
					result = sourceIllegal;
					break;
				}
				/* Lenient mode: fall out with ch still holding the lone high surrogate. */
			}
			else
			{
				/* We don't have the 16 bits following the high surrogate. */
				--source; /* return to the high surrogate */
				result = sourceExhausted;
				break;
			}
		}
		else if (flags == strictConversion)
		{
			/* A low surrogate that was not preceded by a high surrogate is illegal. */
			if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
			{
				--source; /* return to the illegal value itself */
				result = sourceIllegal;
				break;
			}
		}

		/* Figure out how many bytes the result will require */
		if (ch < (uint32_t)0x80)
		{
			bytesToWrite = 1;
		}
		else if (ch < (uint32_t)0x800)
		{
			bytesToWrite = 2;
		}
		else if (ch < (uint32_t)0x10000)
		{
			bytesToWrite = 3;
		}
		else if (ch < (uint32_t)0x110000)
		{
			bytesToWrite = 4;
		}
		else
		{
			/*
			 * Out-of-range value: substitute U+FFFD. A combined surrogate pair
			 * is at most 0x10FFFF, so this is a defensive branch.
			 */
			bytesToWrite = 3;
			ch = UNI_REPLACEMENT_CHAR;
		}

		/* Move past the sequence first; the bytes are filled in back to front below. */
		target += bytesToWrite;

		if ((target > targetEnd) && (!computeLength))
		{
			source = oldSource; /* Back up source pointer! */
			target -= bytesToWrite;
			result = targetExhausted;
			break;
		}

		if (!computeLength)
		{
			/*
			 * Emit the low six bits per continuation byte, last byte first,
			 * then the lead byte with its length marker.
			 */
			switch (bytesToWrite)
			{
					/* note: everything falls through. */
				case 4:
					*--target = (uint8_t)((ch | byteMark) & byteMask);
					ch >>= 6;
					/* fallthrough */
					WINPR_FALLTHROUGH
				case 3:
					*--target = (uint8_t)((ch | byteMark) & byteMask);
					ch >>= 6;
					/* fallthrough */
					WINPR_FALLTHROUGH

				case 2:
					*--target = (uint8_t)((ch | byteMark) & byteMask);
					ch >>= 6;
					/* fallthrough */
					WINPR_FALLTHROUGH

				case 1:
					*--target = (uint8_t)(ch | firstByteMark[bytesToWrite]);
			}
		}
		else
		{
			/* Counting mode: mirror the pointer movement above without storing anything. */
			switch (bytesToWrite)
			{
					/* note: everything falls through. */
				case 4:
					--target;
					/* fallthrough */
					WINPR_FALLTHROUGH

				case 3:
					--target;
					/* fallthrough */
					WINPR_FALLTHROUGH

				case 2:
					--target;
					/* fallthrough */
					WINPR_FALLTHROUGH

				case 1:
					--target;
			}
		}

		/* Both switches leave target at the start of the sequence; skip over it again. */
		target += bytesToWrite;
	}

	/* Report how far we got on both sides, whether or not an error occurred. */
	*sourceStart = source;
	*targetStart = target;
	return result;
}
293 
294 /* --------------------------------------------------------------------- */
295 
296 /*
297  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
298  * This must be called with the length pre-determined by the first byte.
299  * If not calling this from ConvertUTF8to*, then the length can be set by:
300  * length = trailingBytesForUTF8[*source]+1;
301  * and the sequence is illegal right away if there aren't that many bytes
302  * available.
303  * If presented with a length > 4, this returns false. The Unicode
304  * definition of UTF-8 goes up to 4-byte sequences.
305  */
306 
/*
 * Check whether the `length` bytes starting at `source` form a well-formed
 * UTF-8 sequence.
 *
 * source  first byte of the sequence; `length` bytes must be readable.
 * length  total sequence length as derived from the first byte
 *         (trailingBytesForUTF8[*source] + 1). Values outside 1..4 are
 *         rejected.
 *
 * Returns true when the lead byte is ASCII or in 0xC2..0xF4 and every
 * following byte is a continuation byte in the range permitted for that
 * lead byte.
 *
 * The second byte is always required to be at least 0x80. Previously the
 * 0xED and 0xF4 lead bytes only had the upper bound of their second byte
 * checked, so sequences such as ED 41 80 were accepted as legal.
 */
static bool isLegalUTF8(const uint8_t* source, int length)
{
	if ((length < 1) || (length > 4))
		return false;

	const uint8_t lead = source[0];

	/*
	 * 0x80..0xBF are continuation bytes, 0xC0/0xC1 can only start overlong
	 * encodings, and anything above 0xF4 would encode beyond U+10FFFF.
	 */
	if (((lead >= 0x80) && (lead < 0xC2)) || (lead > 0xF4))
		return false;

	if (length == 1)
		return true;

	/* Allowed range of the second byte; narrowed for four special lead bytes. */
	uint8_t lowest = 0x80;
	uint8_t highest = 0xBF;

	switch (lead)
	{
		case 0xE0: /* exclude overlong 3-byte forms */
			lowest = 0xA0;
			break;

		case 0xED: /* exclude the surrogate range U+D800..U+DFFF */
			highest = 0x9F;
			break;

		case 0xF0: /* exclude overlong 4-byte forms */
			lowest = 0x90;
			break;

		case 0xF4: /* exclude values above U+10FFFF */
			highest = 0x8F;
			break;

		default:
			break;
	}

	if ((source[1] < lowest) || (source[1] > highest))
		return false;

	/* Any remaining bytes are plain continuation bytes. */
	for (int i = 2; i < length; i++)
	{
		if ((source[i] < 0x80) || (source[i] > 0xBF))
			return false;
	}

	return true;
}
379 
380 /* --------------------------------------------------------------------- */
381 
/*
 * Convert the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16.
 *
 * sourceStart  in: first byte to convert; out: first byte NOT consumed.
 * sourceEnd    one past the last source byte.
 * targetStart  in: first output unit; out: one past the last unit produced.
 * targetEnd    one past the end of the output buffer. When NULL the function
 *              only counts: nothing is stored and *targetStart is merely
 *              advanced by the number of units that would have been written.
 * flags        strictConversion turns surrogate / out-of-range values into
 *              sourceIllegal; otherwise UNI_REPLACEMENT_CHAR is emitted.
 *
 * Returns conversionOK when the whole range was consumed, sourceExhausted
 * when the range ends inside a multi-byte sequence, sourceIllegal when
 * isLegalUTF8 rejects a sequence (checked in both modes), and
 * targetExhausted when the next character does not fit. On every error the
 * source pointer is left on the first byte of the offending sequence.
 */
static ConversionResult winpr_ConvertUTF8toUTF16_Internal(const uint8_t** sourceStart,
                                                          const uint8_t* sourceEnd,
                                                          uint16_t** targetStart,
                                                          uint16_t* targetEnd,
                                                          ConversionFlags flags)
{
	/* A NULL end pointer selects the counting-only mode. */
	bool computeLength = (!targetEnd) ? true : false;
	ConversionResult result = conversionOK;
	const uint8_t* source = *sourceStart;
	uint16_t* target = *targetStart;

	while (source < sourceEnd)
	{
		uint32_t ch = 0;
		unsigned short extraBytesToRead =
		    WINPR_ASSERTING_INT_CAST(unsigned short, trailingBytesForUTF8[*source]);

		/* The last byte of the sequence must still lie inside the source range. */
		if ((source + extraBytesToRead) >= sourceEnd)
		{
			result = sourceExhausted;
			break;
		}

		/* Do this check whether lenient or strict */
		if (!isLegalUTF8(source, extraBytesToRead + 1))
		{
			result = sourceIllegal;
			break;
		}

		/*
		 * The cases all fall through: each one adds a raw byte and shifts the
		 * accumulator by six bits; the marker bits are removed afterwards in
		 * one subtraction. isLegalUTF8 rejects lengths above 4, so cases 5
		 * and 4 are not entered after the check above.
		 */
		switch (extraBytesToRead)
		{
			case 5:
				ch += *source++;
				ch <<= 6; /* remember, illegal UTF-8 */
				/* fallthrough */
				WINPR_FALLTHROUGH

			case 4:
				ch += *source++;
				ch <<= 6; /* remember, illegal UTF-8 */
				/* fallthrough */
				WINPR_FALLTHROUGH

			case 3:
				ch += *source++;
				ch <<= 6;
				/* fallthrough */
				WINPR_FALLTHROUGH

			case 2:
				ch += *source++;
				ch <<= 6;
				/* fallthrough */
				WINPR_FALLTHROUGH

			case 1:
				ch += *source++;
				ch <<= 6;
				/* fallthrough */
				WINPR_FALLTHROUGH

			case 0:
				ch += *source++;
		}

		/* Strip the lead/continuation marker bits that were summed in above. */
		ch -= offsetsFromUTF8[extraBytesToRead];

		/* Need room for at least one unit. */
		if ((target >= targetEnd) && (!computeLength))
		{
			source -= (extraBytesToRead + 1); /* Back up source pointer! */
			result = targetExhausted;
			break;
		}

		if (ch <= UNI_MAX_BMP)
		{
			/* Target is a character <= 0xFFFF */
			/* A decoded value inside the surrogate range is not a valid character. */
			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
			{
				if (flags == strictConversion)
				{
					source -= (extraBytesToRead + 1); /* return to the illegal value itself */
					result = sourceIllegal;
					break;
				}
				else
				{
					if (!computeLength)
						*target++ = UNI_REPLACEMENT_CHAR;
					else
						target++;
				}
			}
			else
			{
				if (!computeLength)
					*target++ = (uint16_t)ch; /* normal case */
				else
					target++;
			}
		}
		else if (ch > UNI_MAX_UTF16)
		{
			/* Beyond what a surrogate pair can express. */
			if (flags == strictConversion)
			{
				result = sourceIllegal;
				source -= (extraBytesToRead + 1); /* return to the start */
				break;                            /* Bail out; shouldn't continue */
			}
			else
			{
				if (!computeLength)
					*target++ = UNI_REPLACEMENT_CHAR;
				else
					target++;
			}
		}
		else
		{
			/* Supplementary character (0x10000 - 0x10FFFF): needs a surrogate pair. */
			if ((target + 1 >= targetEnd) && (!computeLength))
			{
				source -= (extraBytesToRead + 1); /* Back up source pointer! */
				result = targetExhausted;
				break;
			}

			ch -= halfBase;

			if (!computeLength)
			{
				*target++ = (uint16_t)((ch >> halfShift) + UNI_SUR_HIGH_START);
				*target++ = (uint16_t)((ch & halfMask) + UNI_SUR_LOW_START);
			}
			else
			{
				target++;
				target++;
			}
		}
	}

	/* Report how far we got on both sides, whether or not an error occurred. */
	*sourceStart = source;
	*targetStart = target;
	return result;
}
533 
538 static int winpr_ConvertUTF8toUTF16(const uint8_t* src, int cchSrc, uint16_t* dst, int cchDst)
539 {
540  size_t length = 0;
541  uint16_t* dstBeg = NULL;
542  uint16_t* dstEnd = NULL;
543  const uint8_t* srcBeg = NULL;
544  const uint8_t* srcEnd = NULL;
545  ConversionResult result = sourceIllegal;
546 
547  if (cchSrc == -1)
548  cchSrc = (int)strnlen((const char*)src, INT32_MAX - 1) + 1;
549 
550  srcBeg = src;
551  srcEnd = &src[cchSrc];
552 
553  if (cchDst == 0)
554  {
555  result =
556  winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
557 
558  length = dstBeg - (uint16_t*)NULL;
559  }
560  else
561  {
562  dstBeg = dst;
563  dstEnd = &dst[cchDst];
564 
565  result =
566  winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
567 
568  length = dstBeg - dst;
569  }
570 
571  if (result == targetExhausted)
572  {
573  SetLastError(ERROR_INSUFFICIENT_BUFFER);
574  return 0;
575  }
576 
577  return (result == conversionOK) ? WINPR_ASSERTING_INT_CAST(int, length) : 0;
578 }
579 
580 static int winpr_ConvertUTF16toUTF8(const uint16_t* src, int cchSrc, uint8_t* dst, int cchDst)
581 {
582  size_t length = 0;
583  uint8_t* dstBeg = NULL;
584  uint8_t* dstEnd = NULL;
585  const uint16_t* srcBeg = NULL;
586  const uint16_t* srcEnd = NULL;
587  ConversionResult result = sourceIllegal;
588 
589  if (cchSrc == -1)
590  cchSrc = (int)_wcsnlen((const WCHAR*)src, INT32_MAX - 1) + 1;
591 
592  srcBeg = src;
593  srcEnd = &src[cchSrc];
594 
595  if (cchDst == 0)
596  {
597  result =
598  winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
599 
600  length = dstBeg - ((uint8_t*)NULL);
601  }
602  else
603  {
604  dstBeg = dst;
605  dstEnd = &dst[cchDst];
606 
607  result =
608  winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
609 
610  length = dstBeg - dst;
611  }
612 
613  if (result == targetExhausted)
614  {
615  SetLastError(ERROR_INSUFFICIENT_BUFFER);
616  return 0;
617  }
618 
619  return (result == conversionOK) ? WINPR_ASSERTING_INT_CAST(int, length) : 0;
620 }
621 
622 /* --------------------------------------------------------------------- */
623 
624 int int_MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte,
625  LPWSTR lpWideCharStr, int cchWideChar)
626 {
627  size_t cbCharLen = (size_t)cbMultiByte;
628 
629  WINPR_UNUSED(dwFlags);
630 
631  /* If cbMultiByte is 0, the function fails */
632  if ((cbMultiByte == 0) || (cbMultiByte < -1))
633  return 0;
634 
635  if (cchWideChar < 0)
636  return -1;
637 
638  if (cbMultiByte < 0)
639  {
640  const size_t len = strlen(lpMultiByteStr);
641  if (len >= INT32_MAX)
642  return 0;
643  cbCharLen = (int)len + 1;
644  }
645  else
646  cbCharLen = cbMultiByte;
647 
648  WINPR_ASSERT(lpMultiByteStr);
649  switch (CodePage)
650  {
651  case CP_ACP:
652  case CP_UTF8:
653  break;
654 
655  default:
656  WLog_ERR(TAG, "Unsupported encoding %u", CodePage);
657  return 0;
658  }
659 
660  return winpr_ConvertUTF8toUTF16((const uint8_t*)lpMultiByteStr,
661  WINPR_ASSERTING_INT_CAST(int, cbCharLen),
662  (uint16_t*)lpWideCharStr, cchWideChar);
663 }
664 
665 int int_WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
666  LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
667  LPBOOL lpUsedDefaultChar)
668 {
669  size_t cbCharLen = (size_t)cchWideChar;
670 
671  WINPR_UNUSED(dwFlags);
672  /* If cchWideChar is 0, the function fails */
673  if ((cchWideChar == 0) || (cchWideChar < -1))
674  return 0;
675 
676  if (cbMultiByte < 0)
677  return -1;
678 
679  WINPR_ASSERT(lpWideCharStr);
680  /* If cchWideChar is -1, the string is null-terminated */
681  if (cchWideChar == -1)
682  {
683  const size_t len = _wcslen(lpWideCharStr);
684  if (len >= INT32_MAX)
685  return 0;
686  cbCharLen = (int)len + 1;
687  }
688  else
689  cbCharLen = cchWideChar;
690 
691  /*
692  * if cbMultiByte is 0, the function returns the required buffer size
693  * in bytes for lpMultiByteStr and makes no use of the output parameter itself.
694  */
695 
696  return winpr_ConvertUTF16toUTF8((const uint16_t*)lpWideCharStr,
697  WINPR_ASSERTING_INT_CAST(int, cbCharLen),
698  (uint8_t*)lpMultiByteStr, cbMultiByte);
699 }