1+ // -------------------------------------------------------------------------------------------------------
2+ // Copyright (C) Microsoft. All rights reserved.
3+ // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
4+ // -------------------------------------------------------------------------------------------------------
5+ #include " stdafx.h"
6+ #include " catch.hpp"
7+ #include < process.h>
8+ #include " Codex\Utf8Codex.h"
9+
10+ #pragma warning(disable:4100) // unreferenced formal parameter
11+ #pragma warning(disable:6387) // suppressing preFAST which raises warning for passing null to the JsRT APIs
12+ #pragma warning(disable:6262) // CATCH is using stack variables to report errors, suppressing the preFAST warning.
13+
14+ namespace CodexTest
15+ {
///
/// The following tests verify that invalid characters are replaced
/// with the Unicode replacement character (U+FFFD, encoded in UTF-8 as EF BF BD)
///
20+
21+ // Verify single utf8-encoded codepoint
22+ void CheckIsUnicodeReplacementChar (const utf8char_t * encodedBuffer)
23+ {
24+ CHECK (encodedBuffer[0 ] == 0xEF );
25+ CHECK (encodedBuffer[1 ] == 0xBF );
26+ CHECK (encodedBuffer[2 ] == 0xBD );
27+ }
28+
29+ //
30+ // Following test cases are based on the Utf-8 decoder tests
31+ // suggested by Markus Kuhn at https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
32+ //
33+ TEST_CASE (" CodexTest_EncodeTrueUtf8_SingleSurrogates" , " [CodexTest]" )
34+ {
35+ const charcount_t charCount = 1 ;
36+ utf8char_t encodedBuffer[(charCount + 1 ) * 3 ]; // +1 since the buffer will be null-terminated
37+
38+ char16 testValues[] = { 0xD800 , 0xDB7F , 0xDB80 , 0xDBFF , 0xDC00 , 0xDF80 , 0xDFFF };
39+ const int numTestCases = _countof (testValues);
40+
41+ for (int i = 0 ; i < numTestCases; i++)
42+ {
43+ size_t numEncodedBytes = utf8::EncodeTrueUtf8IntoAndNullTerminate (encodedBuffer, &testValues[i], charCount);
44+ CHECK (numEncodedBytes == 3 );
45+ CheckIsUnicodeReplacementChar (encodedBuffer);
46+ }
47+ }
48+
49+ //
50+ // Test encoding of given utf16-encoded strings into another encoding
51+ //
52+ // In the expected encoded string, extra bytes are represented as 0
53+ //
54+
55+ template <typename TTestCase, typename TEncodingFunc>
56+ void RunUtf8EncodingTestCase (const TTestCase &testCases, const TEncodingFunc func)
57+ {
58+ const int numTestCases = _countof (testCases);
59+ const charcount_t charCount = _countof (testCases[0 ].surrogatePair );
60+ const charcount_t maxEncodedByteCount = _countof (testCases[0 ].utf8Encoding );
61+ utf8char_t encodedBuffer[maxEncodedByteCount + 1 ]; // +1 in case a null-terminating func is passed in
62+
63+ for (int i = 0 ; i < numTestCases; i++)
64+ {
65+ size_t numEncodedBytes = func (encodedBuffer, testCases[i].surrogatePair , charCount);
66+ CHECK (numEncodedBytes <= maxEncodedByteCount);
67+ for (size_t j = 0 ; j < numEncodedBytes; j++)
68+ {
69+ CHECK (encodedBuffer[j] == testCases[i].utf8Encoding [j]);
70+ }
71+
72+ // Check and make sure there were no other bytes expected in the encoded string
73+ if (numEncodedBytes < maxEncodedByteCount)
74+ {
75+ for (size_t j = numEncodedBytes; j < maxEncodedByteCount; j++)
76+ {
77+ CHECK (testCases[i].utf8Encoding [j] == 0 );
78+ }
79+ }
80+ }
81+ }
82+
83+ TEST_CASE (" CodexTest_EncodeCesu_PairedSurrogates" , " [CodexTest]" )
84+ {
85+ // Each of these test cases verifies the encoding
86+ // of a single surrogate pair into a 6 byte CESU string
87+ // Each surrogate-pair unit is encoded seperately into utf8
88+ struct TestCase
89+ {
90+ char16 surrogatePair[2 ];
91+ utf8char_t utf8Encoding[6 ];
92+ };
93+
94+ TestCase testCases[] = {
95+ { { 0xD800 , 0xDC00 }, { 0xED , 0xA0 , 0x80 , 0xED , 0xB0 , 0x80 } }, // U+010000 LINEAR B SYLLABLE B008 A character
96+ { { 0xD800 , 0xDFFF }, { 0xED , 0xA0 , 0x80 , 0xED , 0xBF , 0xBF } }, // U+0103FF
97+ { { 0xDB7F , 0xDC00 }, { 0xED , 0xAD , 0xBF , 0xED , 0xB0 , 0x80 } }, // U+0EFC00
98+ { { 0xDB7F , 0xDFFF }, { 0xED , 0xAD , 0xBF , 0xED , 0xBF , 0xBF } }, // U+0EFFFF
99+ { { 0xDB80 , 0xDC00 }, { 0xED , 0xAE , 0x80 , 0xED , 0xB0 , 0x80 } }, // U+0F0000 Plane 15 Private Use First
100+ { { 0xDB80 , 0xDFFF }, { 0xED , 0xAE , 0x80 , 0xED , 0xBF , 0xBF } }, // U+0F03FF
101+ { { 0xDBFF , 0xDC00 }, { 0xED , 0xAF , 0xBF , 0xED , 0xB0 , 0x80 } }, // U+10FC00
102+ { { 0xDBFF , 0xDFFF }, { 0xED , 0xAF , 0xBF , 0xED , 0xBF , 0xBF } } // U+10FFFF
103+ };
104+
105+ RunUtf8EncodingTestCase (testCases, static_cast <size_t (*)(utf8char_t *, const char16*, charcount_t )>(utf8::EncodeInto));
106+ }
107+
108+ TEST_CASE (" CodexTest_EncodeUtf8_PairedSurrogates" , " [CodexTest]" )
109+ {
110+ // Each of these test cases verifies the encoding
111+ // of a single surrogate pair into a 4 byte utf8 string
112+ // Each surrogate-pair unit is decoded to its original codepoint
113+ // and then encoded into utf8
114+ struct TestCase
115+ {
116+ char16 surrogatePair[2 ];
117+ utf8char_t utf8Encoding[4 ];
118+ };
119+
120+ TestCase testCases[] = {
121+ { { 0xD800 , 0xDC00 }, { 0xF0 , 0x90 , 0x80 , 0x80 } }, // U+010000 LINEAR B SYLLABLE B008 A character
122+ { { 0xD800 , 0xDFFF }, { 0xF0 , 0x90 , 0x8F , 0xBF } }, // U+0103FF
123+ { { 0xDB7F , 0xDC00 }, { 0xF3 , 0xAF , 0xB0 , 0x80 } }, // U+0EFC00
124+ { { 0xDB7F , 0xDFFF }, { 0xF3 , 0xAF , 0xBF , 0xBF } }, // U+0EFFFF
125+ { { 0xDB80 , 0xDC00 }, { 0xF3 , 0xB0 , 0x80 , 0x80 } }, // U+0F0000 Plane 15 Private Use First
126+ { { 0xDB80 , 0xDFFF }, { 0xF3 , 0xB0 , 0x8F , 0xBF } }, // U+0F03FF
127+ { { 0xDBFF , 0xDC00 }, { 0xF4 , 0x8F , 0xB0 , 0x80 } }, // U+10FC00
128+ { { 0xDBFF , 0xDFFF }, { 0xF4 , 0x8F , 0xBF , 0xBF } } // U+10FFFF
129+ };
130+
131+ RunUtf8EncodingTestCase (testCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
132+ }
133+
134+ TEST_CASE (" CodexTest_EncodeUtf8_NonCharacters" , " [CodexTest]" )
135+ {
136+ // Each of these test cases verifies the encoding
137+ // of certain problematic codepoints that do not represent
138+ // characters
139+ struct TestCase
140+ {
141+ char16 surrogatePair[1 ];
142+ utf8char_t utf8Encoding[3 ];
143+ };
144+
145+ TestCase testCases[] = {
146+ { { 0xFFFE }, { 0xEF , 0xBF , 0xBE } }, // U+FFFE
147+ { { 0xFFFF }, { 0xEF , 0xBF , 0xBF } } // U+FFFF
148+ };
149+
150+ RunUtf8EncodingTestCase (testCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
151+ }
152+
153+ TEST_CASE (" CodexTest_EncodeUtf8_BoundaryChars" , " [CodexTest]" )
154+ {
155+ // Each of these test cases verifies the encoding
156+ // of boundary conditions
157+ struct SingleChar16TestCase
158+ {
159+ char16 surrogatePair[1 ];
160+ utf8char_t utf8Encoding[3 ];
161+ };
162+
163+ SingleChar16TestCase testCases[] = {
164+ { { 0xD7FF }, { 0xED , 0x9F , 0xBF } }, // U+D7FF
165+ { { 0xE000 }, { 0xEE , 0x80 , 0x80 } }, // U+E000
166+ { { 0xFFFD }, { 0xEF , 0xBF , 0xBD } } // U+FFFD
167+ };
168+
169+ struct TwoChar16TestCase
170+ {
171+ char16 surrogatePair[2 ];
172+ utf8char_t utf8Encoding[4 ];
173+ };
174+
175+ TwoChar16TestCase testCases2[] = {
176+ { { 0xDBFF , 0xDFFF }, { 0xF4 , 0x8F , 0xBF , 0xBF } } // U+10FFFF
177+ };
178+
179+ RunUtf8EncodingTestCase (testCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
180+ RunUtf8EncodingTestCase (testCases2, utf8::EncodeTrueUtf8IntoAndNullTerminate);
181+ }
182+
183+ TEST_CASE (" CodexTest_EncodeUtf8_SimpleCharacters" , " [CodexTest]" )
184+ {
185+ // Each of these test cases verifies the encoding
186+ // of certain problematic codepoints that do not represent
187+ // characters
188+ struct TestCase
189+ {
190+ char16 surrogatePair[1 ];
191+ utf8char_t utf8Encoding[3 ];
192+ };
193+
194+ TestCase testCases[] = {
195+ { { 0x0024 }, { 0x24 } }, // U+0024 - Dollar Symbol
196+ { { 0x00A2 }, { 0xC2 , 0xA2 } }, // U+00A2 - Cent symbol
197+ { { 0x20AC }, { 0xE2 , 0x82 , 0xAC } } // U+20AC - Euro symbol
198+ };
199+
200+ RunUtf8EncodingTestCase (testCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
201+ }
202+
203+ TEST_CASE (" CodexTest_EncodeTrueUtf8_SimpleString" , " [CodexTest]" )
204+ {
205+ const charcount_t charCount = 3 ;
206+ utf8char_t encodedBuffer[(charCount + 1 ) * 3 ]; // +1 since the buffer will be null terminated
207+ char16* sourceBuffer = L" abc" ;
208+ size_t numEncodedBytes = utf8::EncodeTrueUtf8IntoAndNullTerminate (encodedBuffer, sourceBuffer, charCount);
209+ CHECK (numEncodedBytes == charCount);
210+ for (int i = 0 ; i < charCount; i++)
211+ {
212+ CHECK (sourceBuffer[i] == (char16)encodedBuffer[i]);
213+ }
214+ }
215+ };
0 commit comments