-
Notifications
You must be signed in to change notification settings - Fork 5k
/
Copy pathutf8.c
2151 lines (1802 loc) · 68.4 KB
/
utf8.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
#include <minipal/utf8.h>
#include <errno.h>
#include <limits.h>
#include <string.h>
#include <assert.h>
#include "minipalconfig.h"
// UTF-16 surrogate code-unit ranges used by the surrogate tests and by the decoder:
// high (lead) surrogates span 0xD800-0xDBFF, low (trail) surrogates span 0xDC00-0xDFFF.
#define HIGH_SURROGATE_START 0xd800
#define HIGH_SURROGATE_END 0xdbff
#define LOW_SURROGATE_START 0xdc00
#define LOW_SURROGATE_END 0xdfff
// Reports whether c is a UTF-16 high (lead) surrogate.
// Clearing the low ten bits of a high surrogate leaves exactly HIGH_SURROGATE_START.
static bool IsHighSurrogate(const CHAR16_T c)
{
    const int topSixBits = c & 0xFC00;
    return topSixBits == HIGH_SURROGATE_START;
}
// Reports whether c is a UTF-16 low (trail) surrogate.
// Clearing the low ten bits of a low surrogate leaves exactly LOW_SURROGATE_START.
static bool IsLowSurrogate(const CHAR16_T c)
{
    const int topSixBits = c & 0xFC00;
    return topSixBits == LOW_SURROGATE_START;
}
// Reports whether c is either half of a surrogate pair.
// High and low surrogates share their top five bits, so clearing the low eleven bits
// of any surrogate leaves HIGH_SURROGATE_START.
static bool IsSurrogate(const CHAR16_T c)
{
    const int topFiveBits = c & 0xF800;
    return topFiveBits == HIGH_SURROGATE_START;
}
// State of the decoder's replacement fallback (bytes -> UTF-16 direction).
typedef struct
{
// Start of the input byte buffer; the decoding routines store it on the first fallback,
// Reset clears it to NULL, and the copy routine asserts it is non-NULL.
unsigned char* byteStart;
// Output limit checked by the copy routine before each write (GetChars stores the end of
// the output buffer here; GetCharCount stores NULL because it writes nothing).
CHAR16_T* charEnd;
// Replacement code units handed out one at a time by GetNextChar.
const CHAR16_T strDefault[2];
// Number of valid code units in strDefault; 0 makes Fallback report "nothing to emit".
int strDefaultLength;
// Replacement units still to be returned; negative once the replacement is exhausted.
int fallbackCount;
// Index into strDefault of the unit most recently returned; -1 before the first one.
int fallbackIndex;
} DecoderBuffer;
// Returns the next replacement code unit from self->strDefault, or 0 when no
// replacement is pending.
//
// Each call decrements fallbackCount and advances fallbackIndex. The count is
// deliberately allowed to go negative: 0 means "the unit just returned was the last
// fallback unit", negative means "nothing left".
static CHAR16_T DecoderReplacementFallbackBuffer_GetNextChar(DecoderBuffer* self)
{
    // Signed overflow is undefined in C, so the wrap must be caught before the
    // decrement rather than by looking for INT_MAX afterwards. A counter that would
    // wrap is pinned back to -1 ("nothing left"), which is what the wrap check intended.
    if (self->fallbackCount == INT_MIN)
    {
        self->fallbackCount = -1;
    }
    else
    {
        self->fallbackCount--;
    }

    // The index moves in lockstep with the counter; saturate it for the same reason.
    if (self->fallbackIndex < INT_MAX)
    {
        self->fallbackIndex++;
    }

    // Do we have anything left? 0 is now last fallback char, negative is nothing left
    if (self->fallbackCount < 0)
    {
        return '\0';
    }

    // Now make sure it's in the expected range
    assert(self->fallbackIndex < self->strDefaultLength && self->fallbackIndex >= 0);
    return self->strDefault[self->fallbackIndex];
}
// Arms the decoder fallback so that subsequent GetNextChar calls hand out the
// replacement string. Returns false (leaving the state untouched) when the
// replacement string is empty, true otherwise.
static bool DecoderReplacementFallbackBuffer_Fallback(DecoderBuffer* self)
{
    // A previous fallback must have been fully drained before a new one starts.
    assert(self->fallbackCount < 1);

    const bool hasReplacement = (self->strDefaultLength != 0);
    if (hasReplacement)
    {
        // Index starts at -1 because GetNextChar pre-increments it.
        self->fallbackIndex = -1;
        self->fallbackCount = self->strDefaultLength;
    }
    return hasReplacement;
}
// Writes the decoder's replacement string to the output position *chars.
//
// Returns true when the replacement was written, or when there is no replacement
// string (nothing is written in that case). Returns false when output space ran out:
//   - the write cursor reached self->charEnd: errno is left as it was;
//   - the write cursor moved past pAllocatedBufferEnd: errno is set to
//     MINIPAL_ERROR_INSUFFICIENT_BUFFER.
// *chars is only advanced on success; on failure some units may already have been
// stored beyond *chars, but the caller's cursor is not moved.
static bool DecoderReplacementFallbackBuffer_InternalFallback_Copy(DecoderBuffer* self, CHAR16_T** chars, CHAR16_T* pAllocatedBufferEnd)
{
    assert(self->byteStart != NULL);

    // Nothing to emit when the replacement string is empty.
    if (!DecoderReplacementFallbackBuffer_Fallback(self))
    {
        return true;
    }

    CHAR16_T* pOut = *chars;
    bool pendingHigh = false;
    (void)pendingHigh; // only read by asserts, unused in release build

    for (;;)
    {
        const CHAR16_T unit = DecoderReplacementFallbackBuffer_GetNextChar(self);
        if (unit == 0)
        {
            break;
        }

        // Sanity-check that surrogates in the replacement string are properly paired.
        if (IsSurrogate(unit))
        {
            if (IsHighSurrogate(unit))
            {
                assert(!pendingHigh);
                pendingHigh = true;
            }
            else
            {
                assert(pendingHigh);
                pendingHigh = false;
            }
        }

        // Out of room before writing.
        if (pOut >= self->charEnd)
        {
            return false;
        }

        *pOut = unit;
        pOut++;

        if (pOut > pAllocatedBufferEnd)
        {
            errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
            return false;
        }
    }

    // The replacement string must not end on an unpaired high surrogate.
    assert(!pendingHigh);

    // Success is certain now, so the caller's cursor may be published.
    *chars = pOut;
    return true;
}
// Returns the decoder fallback to its idle state: no replacement pending and no
// input buffer attached.
static void DecoderReplacementFallbackBuffer_Reset(DecoderBuffer* self)
{
    self->byteStart = NULL;
    self->fallbackIndex = -1;
    self->fallbackCount = -1;
}
// State of the encoder's replacement fallback.
typedef struct
{
// Replacement code units handed out one at a time by InternalGetNextChar.
const CHAR16_T strDefault[3];
// Number of valid units in strDefault. Fallback arms half of this length,
// Fallback_Unknown arms the full length.
int strDefaultLength;
// Bounds of the input being processed, stored by InternalInitialize. InternalFallback
// compares the read cursor against charEnd before peeking at the next unit.
CHAR16_T* charStart;
CHAR16_T* charEnd;
// Stored by InternalInitialize from its setEncoder argument.
bool setEncoder;
// Cleared by InternalInitialize.
bool bUsedEncoder;
// Result of the most recent fallback request (true when replacement units were armed).
bool bFallingBack;
// Incremented by InternalFallback's recursion check while bFallingBack is set.
int iRecursionCount;
// Replacement units still to be returned; negative once the replacement is exhausted.
int fallbackCount;
// Index into strDefault of the unit most recently returned; -1 before the first one.
int fallbackIndex;
} EncoderBuffer;
// Upper bound that the encoder fallback's recursion assert allows for iRecursionCount.
#define MAX_RECURSION 250
// Prepares the encoder fallback for a new run over the input [charStart, charEnd):
// records the input bounds and the setEncoder flag, and clears the per-run
// bookkeeping (bUsedEncoder, bFallingBack, iRecursionCount).
// fallbackCount and fallbackIndex are not touched here.
static void EncoderReplacementFallbackBuffer_InternalInitialize(EncoderBuffer* self, CHAR16_T* charStart, CHAR16_T* charEnd, bool setEncoder)
{
    // Per-run bookkeeping starts clean.
    self->iRecursionCount = 0;
    self->bFallingBack = false;
    self->bUsedEncoder = false;

    // Caller-supplied configuration.
    self->setEncoder = setEncoder;
    self->charStart = charStart;
    self->charEnd = charEnd;
}
// Returns the next replacement code unit from self->strDefault, or 0 when no
// replacement is pending.
//
// Each call decrements fallbackCount and advances fallbackIndex. The count is
// deliberately allowed to go negative: 0 means "the unit just returned was the last
// fallback unit", negative means "nothing left".
//
// bFallingBack is updated on every call so that it is true only while replacement
// units are still being handed out; when 0 is returned the recursion counter is
// cleared as well. This includes the "exhausted" case: previously the early returns
// skipped that bookkeeping, leaving bFallingBack set after a completed fallback so
// that InternalFallback's recursion assert counted unrelated fallbacks.
static CHAR16_T EncoderReplacementFallbackBuffer_InternalGetNextChar(EncoderBuffer* self)
{
    // Signed overflow is undefined in C, so the wrap must be caught before the
    // decrement rather than by looking for INT_MAX afterwards. A counter that would
    // wrap is pinned back to -1 ("nothing left"), which is what the wrap check intended.
    if (self->fallbackCount == INT_MIN)
    {
        self->fallbackCount = -1;
    }
    else
    {
        self->fallbackCount--;
    }

    // The index moves in lockstep with the counter; saturate it for the same reason.
    if (self->fallbackIndex < INT_MAX)
    {
        self->fallbackIndex++;
    }

    CHAR16_T ch = '\0';

    // Do we have anything left? 0 is now last fallback char, negative is nothing left
    if (self->fallbackCount >= 0)
    {
        // Now make sure it's in the expected range
        assert(self->fallbackIndex < self->strDefaultLength && self->fallbackIndex >= 0);
        ch = self->strDefault[self->fallbackIndex];
    }

    self->bFallingBack = (ch != 0);
    if (ch == 0)
    {
        self->iRecursionCount = 0;
    }
    return ch;
}
// Arms the encoder fallback for a single (non-pair) code unit: half of the
// replacement string is scheduled. Returns true when at least one replacement unit
// was armed.
static bool EncoderReplacementFallbackBuffer_Fallback(EncoderBuffer* self)
{
    // A previous fallback must have been fully drained before a new one starts.
    assert(self->fallbackCount < 1);

    // Only half the string is used because the input was not a surrogate pair.
    const int unitsToEmit = self->strDefaultLength / 2;

    // Index starts at -1 because InternalGetNextChar pre-increments it.
    self->fallbackIndex = -1;
    self->fallbackCount = unitsToEmit;
    return unitsToEmit != 0;
}
// Arms the encoder fallback with the whole replacement string (used by
// InternalFallback when a surrogate pair is being replaced). Returns true when at
// least one replacement unit was armed.
static bool EncoderReplacementFallbackBuffer_Fallback_Unknown(EncoderBuffer* self)
{
    // A previous fallback must have been fully drained before a new one starts.
    assert(self->fallbackCount < 1);

    const int unitsToEmit = self->strDefaultLength;

    // Index starts at -1 because InternalGetNextChar pre-increments it.
    self->fallbackIndex = -1;
    self->fallbackCount = unitsToEmit;
    return unitsToEmit != 0;
}
// Recursion bookkeeping for InternalFallback: when a fallback is requested while one
// is already in progress, count it and (in debug builds) assert that the depth stays
// within MAX_RECURSION.
//
// The increment is kept outside assert() so that debug and release builds maintain
// the same state; it saturates just above MAX_RECURSION so the counter cannot overflow.
static void EncoderReplacementFallbackBuffer_NoteNestedFallback(EncoderBuffer* self)
{
    if (!self->bFallingBack)
    {
        return;
    }

    assert(self->iRecursionCount <= MAX_RECURSION);
    if (self->iRecursionCount <= MAX_RECURSION)
    {
        self->iRecursionCount++;
    }
}

// Starts a replacement fallback for the code unit ch that could not be encoded.
//
// *chars is the read cursor positioned just after ch. If ch is a high surrogate and
// the unit at *chars (when still before self->charEnd) is a low surrogate, the pair is
// replaced as a whole: the cursor is advanced past the low surrogate and the full
// replacement string is armed. Otherwise only ch is replaced and the cursor is left
// alone.
//
// Returns true when replacement units were armed (also stored in self->bFallingBack),
// false when the replacement string yields nothing to emit.
static bool EncoderReplacementFallbackBuffer_InternalFallback(EncoderBuffer* self, CHAR16_T ch, CHAR16_T** chars)
{
    // Shouldn't have null charStart
    assert(self->charStart != NULL);

    // A high surrogate followed by a low surrogate is consumed as one pair. The input
    // is only peeked at when the cursor has not reached the end of the buffer.
    if (IsHighSurrogate(ch) && *chars < self->charEnd && IsLowSurrogate(**chars))
    {
        EncoderReplacementFallbackBuffer_NoteNestedFallback(self);

        // Consume the low surrogate as part of the pair.
        (*chars)++;
        self->bFallingBack = EncoderReplacementFallbackBuffer_Fallback_Unknown(self);
        return self->bFallingBack;
    }

    // Lone high surrogate, or any other unit: fall back just this one unit.
    EncoderReplacementFallbackBuffer_NoteNestedFallback(self);
    self->bFallingBack = EncoderReplacementFallbackBuffer_Fallback(self);
    return self->bFallingBack;
}
// Steps the encoder fallback back by one replacement unit so that the unit most
// recently returned will be returned again. Only possible when at least one unit has
// been handed out (fallbackIndex >= 0) and the buffer has not been read more than one
// step past its end (fallbackCount >= -1). Returns whether the step was taken.
static bool EncoderReplacementFallbackBuffer_MovePrevious(EncoderBuffer* self)
{
    const bool canStepBack = (self->fallbackCount >= -1) && (self->fallbackIndex >= 0);
    if (!canStepBack)
    {
        return false;
    }

    self->fallbackCount++;
    self->fallbackIndex--;
    return true;
}
// Per-conversion state shared by the UTF-8 routines in this file.
typedef struct
{
// Replacement-fallback state. The decoding routines (GetCharCount/GetChars) use the
// decoder member; the encoder member holds the encoder fallback's state. Only one of the
// two is meaningful at a time since they share storage.
union
{
DecoderBuffer decoder;
EncoderBuffer encoder;
} buffer;
// When false, an invalid byte sequence makes the decoding routines set errno to
// MINIPAL_ERROR_NO_UNICODE_TRANSLATION and return 0; when true, the sequence is replaced
// using the fallback buffer.
bool useFallback;
#if BIGENDIAN
// Big-endian builds only: selects how the decoder's multi-byte fast-path reads pick out
// a single byte (low byte when true, high byte when false).
// NOTE(review): presumably requests little-endian treatment of the data - confirm with callers.
bool treatAsLE;
#endif
} UTF8Encoding;
// These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
// while the actual character is being built in the lower bits. They are shifted together
// with the actual bits of the character (6 bits to the left for every trail byte folded in).
// bits 30 & 31 are used for pending bits fixup
// FinalByte: the lead byte plants a copy of this bit shifted right by 6 per expected trail
// byte, so it arrives at bit 29 exactly when the last trail byte has been folded in.
#define FinalByte (1 << 29)
// SupplimentarySeq: marks a 4-byte (supplementary, i.e. surrogate pair) sequence in progress.
#define SupplimentarySeq (1 << 28)
// ThreeByteSeq: marks a 3-byte sequence in progress.
#define ThreeByteSeq (1 << 27)
// Reports whether c lies in the closed interval [begin, end].
static bool InRange(int c, int begin, int end)
{
    if (c < begin)
    {
        return false;
    }
    return c <= end;
}
// Called by GetChars when an invalid byte sequence was found: writes the decoder's
// replacement string at *pTarget via the decoder fallback buffer.
//
// Returns true when the replacement was written (and *pTarget advanced), false when
// the output ran out of room. On failure *pSrc is re-assigned the value it had on
// entry; the fallback call is not given pSrc, so this writes back the same pointer.
static bool FallbackInvalidByteSequence_Copy(UTF8Encoding* self, unsigned char** pSrc, CHAR16_T** pTarget, CHAR16_T* pAllocatedBufferEnd)
{
    assert(self->useFallback);

    // Remember where the caller's read cursor was before attempting the fallback.
    unsigned char* const entrySrc = *pSrc;

    if (DecoderReplacementFallbackBuffer_InternalFallback_Copy(&self->buffer.decoder, pTarget, pAllocatedBufferEnd))
    {
        // It worked
        return true;
    }

    // No room for the replacement: hand the original cursor back to the caller.
    *pSrc = entrySrc;
    return false;
}
// Counts the UTF-16 code units needed to decode `count` bytes of UTF-8 starting at `bytes`.
//
// The running total starts at `count` (one unit per byte) and is decremented as bytes are
// recognized as parts of multi-byte sequences. When an invalid sequence is found:
//   - if self->useFallback is false, errno is set to MINIPAL_ERROR_NO_UNICODE_TRANSLATION
//     and 0 is returned;
//   - otherwise self->buffer.decoder.strDefaultLength units are added for the replacement
//     (the first time this happens, the decoder buffer's byteStart/charEnd are set).
// A sequence that is still incomplete at the end of the input is counted like an invalid one.
//
// `ch` doubles as the decoder state: 0 means no sequence is pending; otherwise its low bits
// hold the partially assembled code point and its high bits hold the FinalByte /
// SupplimentarySeq / ThreeByteSeq markers plus a count fix-up value in bits 30-31.
//
// The loop has two parts: a byte-at-a-time slow path (top of the loop through ProcessChar)
// that handles every case, and a fast path (after EncodeChar) that is only entered when
// more than 13 bytes remain and that scans ASCII runs using 16/32-bit reads.
static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t count)
{
assert(bytes != NULL);
// (count is unsigned, so this assertion always holds)
assert(count >= 0);
// Initialize stuff
unsigned char *pSrc = bytes;
unsigned char *pEnd = pSrc + count;
size_t availableBytes;
int chc;
// Start by assuming we have as many as count, charCount always includes the adjustment
// for the character being decoded
size_t charCount = count;
int ch = 0;
bool fallbackUsed = false;
while (true)
{
// SLOWLOOP: does all range checks, handles all special cases, but it is slow
if (pSrc >= pEnd) break;
// read next byte. The JIT optimization seems to be getting confused when
// compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
int cha = *pSrc;
// no pending bits
if (ch == 0) goto ReadChar;
pSrc++;
// we are expecting to see trailing bytes like 10vvvvvv
if ((cha & 0xC0) != 0x80)
{
// This can be a valid starting byte for another UTF8 byte sequence, so let's put
// the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
pSrc--;
// undo the count adjustment that was made when the lead byte was seen
charCount += (ch >> 30);
goto InvalidByteSequence;
}
// fold in the new byte
ch = (ch << 6) | (cha & 0x3F);
if ((ch & FinalByte) == 0)
{
assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0);
if ((ch & SupplimentarySeq) != 0)
{
if ((ch & (FinalByte >> 6)) != 0)
{
// this is 3rd byte (of 4 byte supplimentary) - nothing to do
continue;
}
// 2nd byte, check for non-shortest form of supplimentary char and the valid
// supplimentary characters in range 0x010000 - 0x10FFFF at the same time
if (!InRange(ch & 0x1F0, 0x10, 0x100))
{
goto InvalidByteSequence;
}
}
else
{
// Must be 2nd byte of a 3-byte sequence
// check for non-shortest form of 3 byte seq
if ((ch & (0x1F << 5)) == 0 || // non-shortest form
(ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
{
goto InvalidByteSequence;
}
}
continue;
}
// ready to punch
// adjust for surrogates in non-shortest form
if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) charCount--;
goto EncodeChar;
// Invalid sequence: either fail (no fallback configured) or account for the replacement
// string and restart decoding with no pending state.
InvalidByteSequence:
if (!self->useFallback)
{
errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION;
return 0;
}
if (!fallbackUsed)
{
fallbackUsed = true;
self->buffer.decoder.byteStart = bytes;
self->buffer.decoder.charEnd = NULL;
}
charCount += self->buffer.decoder.strDefaultLength;
ch = 0;
continue;
ReadChar:
ch = *pSrc;
pSrc++;
ProcessChar:
if (ch > 0x7F)
{
// If its > 0x7F, its start of a new multi-byte sequence
// Long sequence, so unreserve our char.
charCount--;
// bit 6 has to be non-zero for start of multibyte chars.
if ((ch & 0x40) == 0) goto InvalidByteSequence;
// start a new long code
if ((ch & 0x20) != 0)
{
if ((ch & 0x10) != 0)
{
// 4 byte encoding - supplimentary character (2 surrogates)
ch &= 0x0F;
// check that bit 4 is zero and the valid supplimentary character
// range 0x000000 - 0x10FFFF at the same time
if (ch > 0x04)
{
ch |= 0xf0;
goto InvalidByteSequence;
}
// Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
// Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now
(1 << 30) | // If it dies on next byte we'll need an extra char
(3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char
(SupplimentarySeq) | (SupplimentarySeq >> 6) |
(SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
// Our character count will be 2 characters for these 4 bytes, so subtract another char
charCount--;
}
else
{
// 3 byte encoding
// Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
(ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
// We'll expect 1 character for these 3 bytes, so subtract another char.
charCount--;
}
}
else
{
// 2 byte encoding
ch &= 0x1F;
// check for non-shortest form
if (ch <= 1)
{
ch |= 0xc0;
goto InvalidByteSequence;
}
// Add bit flags so we'll be flagged correctly
ch |= (FinalByte >> 6);
}
continue;
}
// A complete character has been accounted for; from here on no sequence is pending.
EncodeChar:
availableBytes = (size_t)(pEnd - pSrc);
// don't fall into the fast decoding loop if we don't have enough bytes
if (availableBytes <= 13)
{
// try to get over the remainder of the ascii characters fast though
unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
while (pSrc < pLocalEnd)
{
ch = *pSrc;
pSrc++;
if (ch > 0x7F)
goto ProcessChar;
}
// we are done
ch = 0;
break;
}
// To compute the upper bound, assume that all characters are ASCII characters at this point,
// the boundary will be decreased for every non-ASCII character we encounter
// Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
unsigned char *pStop = pSrc + availableBytes - 7;
while (pSrc < pStop)
{
ch = *pSrc;
pSrc++;
if (ch > 0x7F)
{
goto LongCode;
}
// get pSrc 2-byte aligned
if (((size_t)pSrc & 0x1) != 0)
{
ch = *pSrc;
pSrc++;
if (ch > 0x7F)
{
goto LongCode;
}
}
// get pSrc 4-byte aligned
if (((size_t)pSrc & 0x2) != 0)
{
ch = *(unsigned short*)pSrc;
if ((ch & 0x8080) != 0)
{
goto LongCodeWithMask16;
}
pSrc += 2;
}
// Run 8 + 8 characters at a time!
while (pSrc < pStop)
{
ch = *(int*)pSrc;
int chb = *(int*)(pSrc + 4);
if (((ch | chb) & (int)0x80808080) != 0)
{
goto LongCodeWithMask32;
}
pSrc += 8;
// This is a really small loop - unroll it
if (pSrc >= pStop)
break;
ch = *(int*)pSrc;
chb = *(int*)(pSrc + 4);
if (((ch | chb) & (int)0x80808080) != 0)
{
goto LongCodeWithMask32;
}
pSrc += 8;
}
break;
// A multi-byte read contained a non-ASCII byte: reduce ch to a single byte of the word,
// consume just that byte, and either keep scanning (ASCII) or decode it at LongCode.
LongCodeWithMask32 :
#if BIGENDIAN
// be careful about the sign extension
if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16);
else
#endif
ch &= 0xFF;
LongCodeWithMask16:
#if BIGENDIAN
if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 8);
else
#endif
ch &= 0xFF;
pSrc++;
if (ch <= 0x7F)
{
continue;
}
// Fast-path decode of one multi-byte sequence: ch holds the lead byte (already consumed),
// chc receives the first trail byte. Any problem is deferred to BadLongCode.
LongCode:
chc = *pSrc;
pSrc++;
if (
// bit 6 has to be zero
(ch & 0x40) == 0 ||
// we are expecting to see trailing bytes like 10vvvvvv
(chc & 0xC0) != 0x80)
{
goto BadLongCode;
}
chc &= 0x3F;
// start a new long code
if ((ch & 0x20) != 0)
{
// fold the first two bytes together
chc |= (ch & 0x0F) << 6;
if ((ch & 0x10) != 0)
{
// 4 byte encoding - surrogate
ch = *pSrc;
if (
// check that bit 4 is zero, the non-shortest form of surrogate
// and the valid surrogate range 0x000000 - 0x10FFFF at the same time
!InRange(chc >> 4, 0x01, 0x10) ||
// we are expecting to see trailing bytes like 10vvvvvv
(ch & 0xC0) != 0x80)
{
goto BadLongCode;
}
chc = (chc << 6) | (ch & 0x3F);
ch = *(pSrc + 1);
// we are expecting to see trailing bytes like 10vvvvvv
if ((ch & 0xC0) != 0x80)
{
goto BadLongCode;
}
pSrc += 2;
// extra byte
charCount--;
}
else
{
// 3 byte encoding
ch = *pSrc;
if (
// check for non-shortest form of 3 byte seq
(chc & (0x1F << 5)) == 0 ||
// Can't have surrogates here.
(chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
// we are expecting to see trailing bytes like 10vvvvvv
(ch & 0xC0) != 0x80)
{
goto BadLongCode;
}
pSrc++;
// extra byte
charCount--;
}
}
else
{
// 2 byte encoding
// check for non-shortest form
if ((ch & 0x1E) == 0) goto BadLongCode;
}
// extra byte
charCount--;
}
// no pending bits at this point
ch = 0;
continue;
// The fast path hit a sequence it could not validate: step back over the lead byte and the
// first trail byte it consumed and clear ch, so the slow loop re-reads the sequence from
// its lead byte and applies the full checks and fallback accounting.
BadLongCode:
pSrc -= 2;
ch = 0;
continue;
}
// May have a problem if we have to flush
if (ch != 0)
{
// We were already adjusting for these, so need to unadjust
charCount += (ch >> 30);
charCount += self->buffer.decoder.strDefaultLength;
}
// Shouldn't have anything in fallback buffer for GetCharCount
// (don't have to check m_throwOnOverflow for count)
assert(!fallbackUsed || !self->useFallback || self->buffer.decoder.fallbackCount < 0);
return charCount;
}
// Advances pTarget by one element; if it has then moved past pAllocatedBufferEnd, sets errno
// to MINIPAL_ERROR_INSUFFICIENT_BUFFER and returns 0 from the enclosing function.
// pTarget == pAllocatedBufferEnd (one past the last element) is accepted.
// Expands to bare statements and is written at its use sites without a trailing semicolon;
// it relies on locals named pTarget and pAllocatedBufferEnd being in scope.
#define ENSURE_BUFFER_INC \
pTarget++; \
if (pTarget > pAllocatedBufferEnd) \
{ \
errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; \
return 0; \
}
static size_t GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, CHAR16_T* chars, size_t charCount)
{
assert(chars != NULL);
assert(byteCount >= 0);
assert(charCount >= 0);
assert(bytes != NULL);
unsigned char *pSrc = bytes;
CHAR16_T *pTarget = chars;
unsigned char *pEnd = pSrc + byteCount;
CHAR16_T *pAllocatedBufferEnd = pTarget + charCount;
int ch = 0;
int chc;
bool fallbackUsed = false;
while (true)
{
// SLOWLOOP: does all range checks, handles all special cases, but it is slow
if (pSrc >= pEnd) break;
// read next byte. The JIT optimization seems to be getting confused when
// compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
int cha = *pSrc;
if (ch == 0)
{
// no pending bits
goto ReadChar;
}
pSrc++;
// we are expecting to see trailing bytes like 10vvvvvv
if ((cha & 0xC0) != 0x80)
{
// This can be a valid starting byte for another UTF8 byte sequence, so let's put
// the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
pSrc--;
goto InvalidByteSequence;
}
// fold in the new byte
ch = (ch << 6) | (cha & 0x3F);
if ((ch & FinalByte) == 0)
{
// Not at last byte yet
assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0);
if ((ch & SupplimentarySeq) != 0)
{
// Its a 4-byte supplimentary sequence
if ((ch & (FinalByte >> 6)) != 0)
{
// this is 3rd byte of 4 byte sequence - nothing to do
continue;
}
// 2nd byte of 4 bytes
// check for non-shortest form of surrogate and the valid surrogate
// range 0x000000 - 0x10FFFF at the same time
if (!InRange(ch & 0x1F0, 0x10, 0x100))
{
goto InvalidByteSequence;
}
}
else
{
// Must be 2nd byte of a 3-byte sequence
// check for non-shortest form of 3 byte seq
if ((ch & (0x1F << 5)) == 0 || // non-shortest form
(ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
{
goto InvalidByteSequence;
}
}
continue;
}
// ready to punch
// surrogate in shortest form?
// Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte?
if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq)
{
// let the range check for the second char throw the exception
if (pTarget < pAllocatedBufferEnd)
{
*pTarget = (CHAR16_T)(((ch >> 10) & 0x7FF) +
(HIGH_SURROGATE_START - (0x10000 >> 10)));
ENSURE_BUFFER_INC
ch = (ch & 0x3FF) +
(int)(LOW_SURROGATE_START);
}
}
goto EncodeChar;
InvalidByteSequence:
if (!self->useFallback)
{
errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION;
return 0;
}
// this code fragment should be close to the gotos referencing it
// Have to do fallback for invalid bytes
if (!fallbackUsed)
{
fallbackUsed = true;
self->buffer.decoder.byteStart = bytes;
self->buffer.decoder.charEnd = pAllocatedBufferEnd;
}
// That'll back us up the appropriate # of bytes if we didn't get anywhere
if (!FallbackInvalidByteSequence_Copy(self, &pSrc, &pTarget, pAllocatedBufferEnd))
{
if (errno == MINIPAL_ERROR_INSUFFICIENT_BUFFER) return 0;
// Check if we ran out of buffer space
assert(pSrc >= bytes);
DecoderReplacementFallbackBuffer_Reset(&self->buffer.decoder);
ch = 0;
break;
}
assert(pSrc >= bytes);
ch = 0;
continue;
ReadChar:
ch = *pSrc;
pSrc++;
ProcessChar:
if (ch > 0x7F)
{
// If its > 0x7F, its start of a new multi-byte sequence
// bit 6 has to be non-zero
if ((ch & 0x40) == 0) goto InvalidByteSequence;
// start a new long code
if ((ch & 0x20) != 0)
{
if ((ch & 0x10) != 0)
{
// 4 byte encoding - supplimentary character (2 surrogates)
ch &= 0x0F;
// check that bit 4 is zero and the valid supplimentary character
// range 0x000000 - 0x10FFFF at the same time
if (ch > 0x04)
{
ch |= 0xf0;
goto InvalidByteSequence;
}
ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
(SupplimentarySeq) | (SupplimentarySeq >> 6) |
(SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
}
else
{
// 3 byte encoding
ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
(ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
}
}
else
{
// 2 byte encoding
ch &= 0x1F;
// check for non-shortest form
if (ch <= 1)
{
ch |= 0xc0;
goto InvalidByteSequence;
}
ch |= (FinalByte >> 6);
}
continue;
}
EncodeChar:
// write the pending character
if (pTarget >= pAllocatedBufferEnd)
{
// Fix chars so we make sure to throw if we didn't output anything
ch &= 0x1fffff;
if (ch > 0x7f)
{
if (ch > 0x7ff)
{
if (ch >= LOW_SURROGATE_START &&
ch <= LOW_SURROGATE_END)
{
pSrc--; // It was 4 bytes
pTarget--; // 1 was stored already, but we can't remember 1/2, so back up
}
else if (ch > 0xffff)
{
pSrc--; // It was 4 bytes, nothing was stored
}
pSrc--; // It was at least 3 bytes
}
pSrc--; // It was at least 2 bytes
}
pSrc--;
assert(pSrc >= bytes);
// Don't store ch in decoder, we already backed up to its start
ch = 0;
// Didn't throw, just use this buffer size.
break;
}
*pTarget = (CHAR16_T)ch;
ENSURE_BUFFER_INC
size_t availableChars = (size_t)(pAllocatedBufferEnd - pTarget);
size_t availableBytes = (size_t)(pEnd - pSrc);
// don't fall into the fast decoding loop if we don't have enough bytes
// Test for availableChars is done because pStop would be <= pTarget.
if (availableBytes <= 13)
{
// we may need as many as 1 character per byte
if (availableChars < availableBytes)
{
// not enough output room. no pending bits at this point
ch = 0;
continue;
}