1
+ // -------------------------------------------------------------------------------------------------------
2
+ // Copyright (C) Microsoft. All rights reserved.
3
+ // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
4
+ // -------------------------------------------------------------------------------------------------------
5
+ #include " stdafx.h"
6
+ #include " catch.hpp"
7
+ #include < process.h>
8
+ #include " Codex\Utf8Codex.h"
9
+
10
+ #pragma warning(disable:4100) // unreferenced formal parameter
11
+ #pragma warning(disable:6387) // suppressing preFAST which raises warning for passing null to the JsRT APIs
12
+ #pragma warning(disable:6262) // CATCH is using stack variables to report errors, suppressing the preFAST warning.
13
+
14
+ namespace CodexTest
15
+ {
16
///
/// The following tests verify that invalid characters are replaced
/// with the Unicode replacement character (U+FFFD)
///
20
+
21
+ // Verify single utf8-encoded codepoint
22
+ void CheckIsUnicodeReplacementChar (const utf8char_t * encodedBuffer)
23
+ {
24
+ CHECK (encodedBuffer[0 ] == 0xEF );
25
+ CHECK (encodedBuffer[1 ] == 0xBF );
26
+ CHECK (encodedBuffer[2 ] == 0xBD );
27
+ }
28
+
29
+ //
30
+ // Following test cases are based on the Utf-8 decoder tests
31
+ // suggested by Markus Kuhn at https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
32
+ //
33
+ TEST_CASE (" CodexTest_EncodeTrueUtf8_SingleSurrogates" , " [CodexTest]" )
34
+ {
35
+ const charcount_t charCount = 1 ;
36
+ utf8char_t encodedBuffer[(charCount + 1 ) * 3 ]; // +1 since the buffer will be null-terminated
37
+
38
+ char16 testValues[] = { 0xD800 , 0xDB7F , 0xDB80 , 0xDBFF , 0xDC00 , 0xDF80 , 0xDFFF };
39
+ const int numTestCases = _countof (testValues);
40
+
41
+ for (int i = 0 ; i < numTestCases; i++)
42
+ {
43
+ size_t numEncodedBytes = utf8::EncodeTrueUtf8IntoAndNullTerminate (encodedBuffer, &testValues[i], charCount);
44
+ CHECK (numEncodedBytes == 3 );
45
+ CheckIsUnicodeReplacementChar (encodedBuffer);
46
+ }
47
+ }
48
+
49
+ //
50
+ // Test encoding of given utf16-encoded strings into another encoding
51
+ //
52
+ // In the expected encoded string, extra bytes are represented as 0
53
+ //
54
+
55
+ template <typename TTestCase, typename TEncodingFunc>
56
+ void RunUtf8EncodingTestCase (const TTestCase &testCases, const TEncodingFunc func)
57
+ {
58
+ const int numTestCases = _countof (testCases);
59
+ const charcount_t charCount = _countof (testCases[0 ].surrogatePair );
60
+ const charcount_t maxEncodedByteCount = _countof (testCases[0 ].utf8Encoding );
61
+ utf8char_t encodedBuffer[maxEncodedByteCount + 1 ]; // +1 in case a null-terminating func is passed in
62
+
63
+ for (int i = 0 ; i < numTestCases; i++)
64
+ {
65
+ size_t numEncodedBytes = func (encodedBuffer, testCases[i].surrogatePair , charCount);
66
+ CHECK (numEncodedBytes <= maxEncodedByteCount);
67
+ for (size_t j = 0 ; j < numEncodedBytes; j++)
68
+ {
69
+ CHECK (encodedBuffer[j] == testCases[i].utf8Encoding [j]);
70
+ }
71
+
72
+ // Check and make sure there were no other bytes expected in the encoded string
73
+ if (numEncodedBytes < maxEncodedByteCount)
74
+ {
75
+ for (size_t j = numEncodedBytes; j < maxEncodedByteCount; j++)
76
+ {
77
+ CHECK (testCases[i].utf8Encoding [j] == 0 );
78
+ }
79
+ }
80
+ }
81
+ }
82
+
83
+ TEST_CASE (" CodexTest_EncodeCesu_PairedSurrogates" , " [CodexTest]" )
84
+ {
85
+ // Each of these test cases verifies the encoding
86
+ // of a single surrogate pair into a 6 byte CESU string
87
+ // Each surrogate-pair unit is encoded seperately into utf8
88
+ struct TestCase
89
+ {
90
+ char16 surrogatePair[2 ];
91
+ utf8char_t utf8Encoding[6 ];
92
+ };
93
+
94
+ TestCase testCases[] = {
95
+ { { 0xD800 , 0xDC00 }, { 0xED , 0xA0 , 0x80 , 0xED , 0xB0 , 0x80 } }, // U+010000 LINEAR B SYLLABLE B008 A character
96
+ { { 0xD800 , 0xDFFF }, { 0xED , 0xA0 , 0x80 , 0xED , 0xBF , 0xBF } }, // U+0103FF
97
+ { { 0xDB7F , 0xDC00 }, { 0xED , 0xAD , 0xBF , 0xED , 0xB0 , 0x80 } }, // U+0EFC00
98
+ { { 0xDB7F , 0xDFFF }, { 0xED , 0xAD , 0xBF , 0xED , 0xBF , 0xBF } }, // U+0EFFFF
99
+ { { 0xDB80 , 0xDC00 }, { 0xED , 0xAE , 0x80 , 0xED , 0xB0 , 0x80 } }, // U+0F0000 Plane 15 Private Use First
100
+ { { 0xDB80 , 0xDFFF }, { 0xED , 0xAE , 0x80 , 0xED , 0xBF , 0xBF } }, // U+0F03FF
101
+ { { 0xDBFF , 0xDC00 }, { 0xED , 0xAF , 0xBF , 0xED , 0xB0 , 0x80 } }, // U+10FC00
102
+ { { 0xDBFF , 0xDFFF }, { 0xED , 0xAF , 0xBF , 0xED , 0xBF , 0xBF } } // U+10FFFF
103
+ };
104
+
105
+ RunUtf8EncodingTestCase (testCases, static_cast <size_t (*)(utf8char_t *, const char16*, charcount_t )>(utf8::EncodeInto));
106
+ }
107
+
108
+ TEST_CASE (" CodexTest_EncodeUtf8_PairedSurrogates" , " [CodexTest]" )
109
+ {
110
+ // Each of these test cases verifies the encoding
111
+ // of a single surrogate pair into a 4 byte utf8 string
112
+ // Each surrogate-pair unit is decoded to its original codepoint
113
+ // and then encoded into utf8
114
+ struct TestCase
115
+ {
116
+ char16 surrogatePair[2 ];
117
+ utf8char_t utf8Encoding[4 ];
118
+ };
119
+
120
+ TestCase testCases[] = {
121
+ { { 0xD800 , 0xDC00 }, { 0xF0 , 0x90 , 0x80 , 0x80 } }, // U+010000 LINEAR B SYLLABLE B008 A character
122
+ { { 0xD800 , 0xDFFF }, { 0xF0 , 0x90 , 0x8F , 0xBF } }, // U+0103FF
123
+ { { 0xDB7F , 0xDC00 }, { 0xF3 , 0xAF , 0xB0 , 0x80 } }, // U+0EFC00
124
+ { { 0xDB7F , 0xDFFF }, { 0xF3 , 0xAF , 0xBF , 0xBF } }, // U+0EFFFF
125
+ { { 0xDB80 , 0xDC00 }, { 0xF3 , 0xB0 , 0x80 , 0x80 } }, // U+0F0000 Plane 15 Private Use First
126
+ { { 0xDB80 , 0xDFFF }, { 0xF3 , 0xB0 , 0x8F , 0xBF } }, // U+0F03FF
127
+ { { 0xDBFF , 0xDC00 }, { 0xF4 , 0x8F , 0xB0 , 0x80 } }, // U+10FC00
128
+ { { 0xDBFF , 0xDFFF }, { 0xF4 , 0x8F , 0xBF , 0xBF } } // U+10FFFF
129
+ };
130
+
131
+ RunUtf8EncodingTestCase (testCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
132
+ }
133
+
134
+ TEST_CASE (" CodexTest_EncodeUtf8_NonCharacters" , " [CodexTest]" )
135
+ {
136
+ // Each of these test cases verifies the encoding
137
+ // of certain problematic codepoints that do not represent
138
+ // characters
139
+ struct TestCase
140
+ {
141
+ char16 surrogatePair[1 ];
142
+ utf8char_t utf8Encoding[3 ];
143
+ };
144
+
145
+ TestCase testCases[] = {
146
+ { { 0xFFFE }, { 0xEF , 0xBF , 0xBE } }, // U+FFFE
147
+ { { 0xFFFF }, { 0xEF , 0xBF , 0xBF } } // U+FFFF
148
+ };
149
+
150
+ RunUtf8EncodingTestCase (testCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
151
+ }
152
+
153
+ TEST_CASE (" CodexTest_EncodeUtf8_BoundaryChars" , " [CodexTest]" )
154
+ {
155
+ // Each of these test cases verifies the encoding
156
+ // of boundary conditions
157
+ struct SingleChar16TestCase
158
+ {
159
+ char16 surrogatePair[1 ];
160
+ utf8char_t utf8Encoding[3 ];
161
+ };
162
+
163
+ SingleChar16TestCase testCases[] = {
164
+ { { 0xD7FF }, { 0xED , 0x9F , 0xBF } }, // U+D7FF
165
+ { { 0xE000 }, { 0xEE , 0x80 , 0x80 } }, // U+E000
166
+ { { 0xFFFD }, { 0xEF , 0xBF , 0xBD } } // U+FFFD
167
+ };
168
+
169
+ struct TwoChar16TestCase
170
+ {
171
+ char16 surrogatePair[2 ];
172
+ utf8char_t utf8Encoding[4 ];
173
+ };
174
+
175
+ TwoChar16TestCase testCases2[] = {
176
+ { { 0xDBFF , 0xDFFF }, { 0xF4 , 0x8F , 0xBF , 0xBF } } // U+10FFFF
177
+ };
178
+
179
+ RunUtf8EncodingTestCase (testCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
180
+ RunUtf8EncodingTestCase (testCases2, utf8::EncodeTrueUtf8IntoAndNullTerminate);
181
+ }
182
+
183
+ TEST_CASE (" CodexTest_EncodeUtf8_SimpleCharacters" , " [CodexTest]" )
184
+ {
185
+ // Each of these test cases verifies the encoding
186
+ // of certain problematic codepoints that do not represent
187
+ // characters
188
+ struct TestCase
189
+ {
190
+ char16 surrogatePair[1 ];
191
+ utf8char_t utf8Encoding[3 ];
192
+ };
193
+
194
+ TestCase testCases[] = {
195
+ { { 0x0024 }, { 0x24 } }, // U+0024 - Dollar Symbol
196
+ { { 0x00A2 }, { 0xC2 , 0xA2 } }, // U+00A2 - Cent symbol
197
+ { { 0x20AC }, { 0xE2 , 0x82 , 0xAC } } // U+20AC - Euro symbol
198
+ };
199
+
200
+ RunUtf8EncodingTestCase (testCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
201
+ }
202
+
203
+ TEST_CASE (" CodexTest_EncodeTrueUtf8_SimpleString" , " [CodexTest]" )
204
+ {
205
+ const charcount_t charCount = 3 ;
206
+ utf8char_t encodedBuffer[(charCount + 1 ) * 3 ]; // +1 since the buffer will be null terminated
207
+ char16* sourceBuffer = L" abc" ;
208
+ size_t numEncodedBytes = utf8::EncodeTrueUtf8IntoAndNullTerminate (encodedBuffer, sourceBuffer, charCount);
209
+ CHECK (numEncodedBytes == charCount);
210
+ for (int i = 0 ; i < charCount; i++)
211
+ {
212
+ CHECK (sourceBuffer[i] == (char16)encodedBuffer[i]);
213
+ }
214
+ }
215
+ };
0 commit comments