@@ -15,268 +15,4 @@ class Convert
private function __construct()
{
    // Intentionally empty: being private, it blocks instantiation from outside the class.
}
18-
/**
 * Convert a string from one character encoding to another.
 *
 * Both encoding names are first passed through Encode::normalize(). The
 * conversion strategies are then tried from most to least predictable:
 * the built-in Windows-1252 table, mbstring, and finally UConverter.
 *
 * @param string $string       The string to convert.
 * @param string $fromEncoding The current character encoding of the string.
 * @param string $toEncoding   The desired character encoding for the string.
 *
 * @return string|null The converted string, or null when no strategy produced a result.
 */
public static function convertEncoding(string $string, string $fromEncoding, string $toEncoding): ?string
{
    $input = Encode::normalize($fromEncoding);
    $output = Encode::normalize($toEncoding);

    // The converters below do not reject non US-ASCII bytes, so enforce it
    // here by keeping only the leading run of 7-bit bytes.
    if ('US-ASCII' === $input) {
        // Cached across calls; the set of bytes 0x80..0xFF never changes.
        static $nonAsciiOctets = null;

        if (null === $nonAsciiOctets) {
            $nonAsciiOctets = \implode('', \array_map('chr', \range(0x80, 0xFF)));
        }

        // strcspn() returns a byte offset, so the cut has to be byte-based
        // (substr), not character-based (mb_substr). The cast covers PHP
        // versions where substr() returns false for an empty result.
        $string = (string) \substr($string, 0, \strcspn($string, $nonAsciiOctets));
    }

    // This is first, as behaviour of this is completely predictable.
    if ('windows-1252' === $input && 'UTF-8' === $output) {
        return self::windows1252ToUtf8($string);
    }

    // mbstring is second, as its behaviour varies only with PHP version.
    // UConverter is last, as its behaviour varies with OS userland and PHP
    // version. If neither yields a result, the expression evaluates to null.
    return self::withMbstring($string, $input, $output)
        ?? self::withUconverter($string, $input, $output);
}
56-
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * Values that cannot be encoded as valid UTF-8 (negative numbers, the
 * surrogate range U+D800..U+DFFF, and anything above U+10FFFF) yield
 * U+FFFD REPLACEMENT CHARACTER, so the declared string return type is
 * always honoured.
 *
 * @param int $codepoint Unicode code point.
 *
 * @return string The UTF-8 encoding of the code point (one to four bytes).
 */
public static function codepointToUtf8(int $codepoint): string
{
    $isSurrogate = $codepoint >= 0xd800 && $codepoint <= 0xdfff;

    if ($codepoint < 0 || $codepoint > 0x10ffff || $isSurrogate) {
        // U+FFFD REPLACEMENT CHARACTER
        return "\xEF\xBF\xBD";
    }

    // One byte: 0xxxxxxx
    if ($codepoint <= 0x7f) {
        return \chr($codepoint);
    }

    // Two bytes: 110xxxxx 10xxxxxx
    if ($codepoint <= 0x7ff) {
        return \chr(0xc0 | ($codepoint >> 6))
            . \chr(0x80 | ($codepoint & 0x3f));
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if ($codepoint <= 0xffff) {
        return \chr(0xe0 | ($codepoint >> 12))
            . \chr(0x80 | (($codepoint >> 6) & 0x3f))
            . \chr(0x80 | ($codepoint & 0x3f));
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return \chr(0xf0 | ($codepoint >> 18))
        . \chr(0x80 | (($codepoint >> 12) & 0x3f))
        . \chr(0x80 | (($codepoint >> 6) & 0x3f))
        . \chr(0x80 | ($codepoint & 0x3f));
}
93-
/**
 * Convert a string using mbstring.
 *
 * @param string $string       The string to convert.
 * @param string $fromEncoding The current character encoding of the string.
 * @param string $toEncoding   The desired character encoding for the string.
 *
 * @return string|null The converted string, or null when mbstring is
 *                     unavailable, does not support the encoding, or fails.
 */
public static function withMbstring(string $string, string $fromEncoding, string $toEncoding): ?string
{
    if (!\function_exists('mb_convert_encoding')) {
        return null;
    }

    // Aliases substituted before talking to mbstring. The remapped names
    // must actually be used for the conversion below.
    // NOTE(review): presumably mbstring lacks these two names — confirm.
    $aliases = [
        'windows-949' => 'EUC-KR',
        'Windows-31J' => 'SJIS',
    ];
    $fromEncoding = $aliases[$fromEncoding] ?? $fromEncoding;
    $toEncoding = $aliases[$toEncoding] ?? $toEncoding;

    // Check that the encoding is supported. This must come before any
    // conversion attempt, which would otherwise choke on an unknown name.
    if (!\in_array($fromEncoding, \mb_list_encodings(), true)) {
        return null;
    }

    try {
        // A source encoding that maps byte 0x80 straight to U+0080 is
        // treated as unsupported.
        if ("\x00\x80" === \mb_convert_encoding("\x80", 'UTF-16BE', $fromEncoding)) {
            return null;
        }

        // Let's do some conversion
        $converted = \mb_convert_encoding($string, $toEncoding, $fromEncoding);
    } catch (\ValueError $e) {
        // PHP 8 throws on an unknown encoding name (e.g. a bad target).
        return null;
    }

    // Older PHP versions signal failure with false rather than throwing.
    return \is_string($converted) ? $converted : null;
}
131-
/**
 * Convert a string using the UConverter class from the Intl extension.
 *
 * @param string $string       The string to convert.
 * @param string $fromEncoding The current character encoding of the string.
 * @param string $toEncoding   The desired character encoding for the string.
 *
 * @return string|null The converted string, or null when the Intl
 *                     extension is not loaded or the conversion fails.
 */
public static function withUconverter(string $string, string $fromEncoding, string $toEncoding): ?string
{
    // Intl is optional; without it there is nothing this strategy can do.
    if (!\class_exists('UConverter')) {
        return null;
    }

    // Note the argument order: target encoding first, source encoding second.
    $converted = \UConverter::transcode($string, $toEncoding, $fromEncoding);

    // transcode() reports failure with false; callers expect null instead.
    return \is_string($converted) ? $converted : null;
}
143-
/**
 * Transcode a Windows-1252/ISO-8859-1 byte string into UTF-8.
 *
 * Bytes below 0x80 pass through unchanged. Five byte values (0x81, 0x8D,
 * 0x8F, 0x90 and 0x9D) are mapped to U+FFFD REPLACEMENT CHARACTER.
 *
 * @param string $string Windows-1252/ISO-8859-1 encoded string.
 *
 * @return string The UTF-8 encoded string.
 */
public static function windows1252ToUtf8(string $string): string
{
    static $byteToUtf8 = null;

    if (null === $byteToUtf8) {
        // 0x80..0x9F: irregular mappings, spelled out one by one.
        $byteToUtf8 = [
            "\x80" => "\xE2\x82\xAC", "\x81" => "\xEF\xBF\xBD",
            "\x82" => "\xE2\x80\x9A", "\x83" => "\xC6\x92",
            "\x84" => "\xE2\x80\x9E", "\x85" => "\xE2\x80\xA6",
            "\x86" => "\xE2\x80\xA0", "\x87" => "\xE2\x80\xA1",
            "\x88" => "\xCB\x86",     "\x89" => "\xE2\x80\xB0",
            "\x8A" => "\xC5\xA0",     "\x8B" => "\xE2\x80\xB9",
            "\x8C" => "\xC5\x92",     "\x8D" => "\xEF\xBF\xBD",
            "\x8E" => "\xC5\xBD",     "\x8F" => "\xEF\xBF\xBD",
            "\x90" => "\xEF\xBF\xBD", "\x91" => "\xE2\x80\x98",
            "\x92" => "\xE2\x80\x99", "\x93" => "\xE2\x80\x9C",
            "\x94" => "\xE2\x80\x9D", "\x95" => "\xE2\x80\xA2",
            "\x96" => "\xE2\x80\x93", "\x97" => "\xE2\x80\x94",
            "\x98" => "\xCB\x9C",     "\x99" => "\xE2\x84\xA2",
            "\x9A" => "\xC5\xA1",     "\x9B" => "\xE2\x80\xBA",
            "\x9C" => "\xC5\x93",     "\x9D" => "\xEF\xBF\xBD",
            "\x9E" => "\xC5\xBE",     "\x9F" => "\xC5\xB8",
        ];

        // 0xA0..0xBF: lead byte 0xC2 followed by the byte itself.
        for ($byte = 0xA0; $byte <= 0xBF; ++$byte) {
            $byteToUtf8[\chr($byte)] = "\xC2" . \chr($byte);
        }

        // 0xC0..0xFF: lead byte 0xC3 followed by the byte minus 0x40.
        for ($byte = 0xC0; $byte <= 0xFF; ++$byte) {
            $byteToUtf8[\chr($byte)] = "\xC3" . \chr($byte - 0x40);
        }
    }

    return \strtr($string, $byteToUtf8);
}
28218}
0 commit comments