Skip to content

Commit 8718bdc

Browse files
committed
More testing.
1 parent a447e6d commit 8718bdc

10 files changed

Lines changed: 828 additions & 665 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ Once we have high confidence in the _input_ encoding, converting to UTF-8 become
5757

5858
* [UTS #22: Unicode Character Mapping Markup Language, §1.4: Charset Alias Matching](https://www.unicode.org/reports/tr22/tr22-8.html)
5959
* [Extensible Markup Language (XML) 1.0 (Fifth Edition), Appendix F.1: “Detection Without External Encoding Information”](https://www.w3.org/TR/xml/#sec-guessing-no-ext-info)
60+
* [RFC 3023: XML Media Types](https://tools.ietf.org/html/rfc3023)
6061
* [HTML Living Standard: Determining the Character Encoding](https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding)
6162
* [Encoding Living Standard](https://encoding.spec.whatwg.org)
6263
* [BCP-47: Tags for Identifying Languages](https://tools.ietf.org/html/bcp47)

src/TextEncoder.php

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
namespace TextEncoder;
1212

13-
use TextEncoder\Enum\Encoding;
1413
use TextEncoder\Util\Convert;
1514

1615
class TextEncoder implements TextEncoderInterface
@@ -22,16 +21,10 @@ private function __construct()
2221
/**
2322
* Detect the current character encoding of a string.
2423
*
25-
* @param string $detect The string with which to detect the character encoding.
26-
* @param array|null $encodingList The list of encodings to compare against. If set to `null`, a default function
27-
* will be used to produce a comparison list. The default value is `null`.
24+
* @param ?array $encodingList
2825
*/
2926
public static function detectEncoding(string $detect, ?array $encodingList = null): string
3027
{
31-
$encodingList = $encodingList ?: \array_values(
32-
Encoding::introspect()
33-
);
34-
3528
return \mb_detect_encoding($detect, $encodingList, true);
3629
}
3730

src/Util/Convert.php

Lines changed: 0 additions & 264 deletions
Original file line numberDiff line numberDiff line change
@@ -15,268 +15,4 @@ class Convert
1515
private function __construct()
1616
{
1717
}
18-
19-
/**
20-
* Convert the string to a different encoding.
21-
*
22-
* @param string $string       The string to convert from `$fromEncoding` to `$toEncoding`.
23-
* @param string $fromEncoding The current character encoding of the string.
24-
* @param string $toEncoding The desired character encoding for the string.
25-
*/
26-
public static function convertEncoding(string $string, string $fromEncoding, string $toEncoding): ?string
27-
{
28-
$input = Encode::normalize($fromEncoding);
29-
$output = Encode::normalize($toEncoding);
30-
31-
// Conversion does not fail on non-US-ASCII bytes, so truncate the string at the first such byte
32-
if ('US-ASCII' === $input) {
33-
$non_ascii_octets = '';
34-
35-
if (!$non_ascii_octets) {
36-
for ($i = 0x80; $i <= 0xFF; $i++) {
37-
$non_ascii_octets .= \chr($i);
38-
}
39-
}
40-
41-
$string = \mb_substr($string, 0, \strcspn($string, $non_ascii_octets));
42-
}
43-
44-
// This is first, as behaviour of this is completely predictable
45-
if ('windows-1252' === $input && 'UTF-8' === $output) {
46-
return self::windows1252ToUtf8($string);
47-
}
48-
49-
// This is second, as behaviour of this varies only with PHP version.
50-
return self::withMbstring($string, $input, $output)
51-
// This is last, as behaviour of this varies with OS userland and PHP version
52-
?? self::withUconverter($string, $input, $output)
53-
// If we can't do anything, just fail
54-
?? null;
55-
}
56-
57-
/**
58-
* Converts a unicode codepoint to a UTF-8 character.
59-
*
60-
* @param int $codepoint Unicode codepoint.
61-
*/
62-
public static function codepointToUtf8(int $codepoint): string
63-
{
64-
if ($codepoint < 0) {
65-
return false;
66-
}
67-
68-
if ($codepoint <= 0x7f) {
69-
return \chr($codepoint);
70-
}
71-
72-
if ($codepoint <= 0x7ff) {
73-
return \chr(0xc0 | ($codepoint >> 6))
74-
. \chr(0x80 | ($codepoint & 0x3f));
75-
}
76-
77-
if ($codepoint <= 0xffff) {
78-
return \chr(0xe0 | ($codepoint >> 12))
79-
. \chr(0x80 | (($codepoint >> 6) & 0x3f))
80-
. \chr(0x80 | ($codepoint & 0x3f));
81-
}
82-
83-
if ($codepoint <= 0x10ffff) {
84-
return \chr(0xf0 | ($codepoint >> 18))
85-
. \chr(0x80 | (($codepoint >> 12) & 0x3f))
86-
. \chr(0x80 | (($codepoint >> 6) & 0x3f))
87-
. \chr(0x80 | ($codepoint & 0x3f));
88-
}
89-
90-
// U+FFFD REPLACEMENT CHARACTER
91-
return "\xEF\xBF\xBD";
92-
}
93-
94-
/**
95-
* Convert a string using mbstring.
96-
*
97-
* @param string $string       The string to convert from `$fromEncoding` to `$toEncoding`.
98-
* @param string $fromEncoding The current character encoding of the string.
99-
* @param string $toEncoding The desired character encoding for the string.
100-
*/
101-
public static function withMbstring(string $string, string $fromEncoding, string $toEncoding): ?string
102-
{
103-
if ('windows-949' === $fromEncoding) {
104-
$input = 'EUC-KR';
105-
}
106-
107-
if ('windows-949' === $toEncoding) {
108-
$output = 'EUC-KR';
109-
}
110-
111-
if ('Windows-31J' === $fromEncoding) {
112-
$input = 'SJIS';
113-
}
114-
115-
if ('Windows-31J' === $toEncoding) {
116-
$output = 'SJIS';
117-
}
118-
119-
// Check that the encoding is supported
120-
if ("\x00\x80" === \mb_convert_encoding("\x80", 'UTF-16BE', $fromEncoding)) {
121-
return false;
122-
}
123-
124-
if (!\in_array($fromEncoding, \mb_list_encodings(), true)) {
125-
return false;
126-
}
127-
128-
// Let's do some conversion
129-
return \mb_convert_encoding($string, $toEncoding, $fromEncoding) ?? null;
130-
}
131-
132-
/**
133-
* Convert a string using the UConverter class from the Intl extension.
134-
*
135-
* @param string $string       The string to convert from `$fromEncoding` to `$toEncoding`.
136-
* @param string $fromEncoding The current character encoding of the string.
137-
* @param string $toEncoding The desired character encoding for the string.
138-
*/
139-
public static function withUconverter(string $string, string $fromEncoding, string $toEncoding): string
140-
{
141-
return Uself::transcode($string, $toEncoding, $fromEncoding);
142-
}
143-
144-
/**
145-
* Converts a Windows-1252/ISO-8859-1 encoded string to a UTF-8 encoded string.
146-
*
147-
* @param string $string Windows-1252/ISO-8859-1 encoded string.
148-
*/
149-
public static function windows1252ToUtf8(string $string): string
150-
{
151-
return \strtr($string, [
152-
"\x80" => "\xE2\x82\xAC",
153-
"\x81" => "\xEF\xBF\xBD",
154-
"\x82" => "\xE2\x80\x9A",
155-
"\x83" => "\xC6\x92",
156-
"\x84" => "\xE2\x80\x9E",
157-
"\x85" => "\xE2\x80\xA6",
158-
"\x86" => "\xE2\x80\xA0",
159-
"\x87" => "\xE2\x80\xA1",
160-
"\x88" => "\xCB\x86",
161-
"\x89" => "\xE2\x80\xB0",
162-
"\x8A" => "\xC5\xA0",
163-
"\x8B" => "\xE2\x80\xB9",
164-
"\x8C" => "\xC5\x92",
165-
"\x8D" => "\xEF\xBF\xBD",
166-
"\x8E" => "\xC5\xBD",
167-
"\x8F" => "\xEF\xBF\xBD",
168-
"\x90" => "\xEF\xBF\xBD",
169-
"\x91" => "\xE2\x80\x98",
170-
"\x92" => "\xE2\x80\x99",
171-
"\x93" => "\xE2\x80\x9C",
172-
"\x94" => "\xE2\x80\x9D",
173-
"\x95" => "\xE2\x80\xA2",
174-
"\x96" => "\xE2\x80\x93",
175-
"\x97" => "\xE2\x80\x94",
176-
"\x98" => "\xCB\x9C",
177-
"\x99" => "\xE2\x84\xA2",
178-
"\x9A" => "\xC5\xA1",
179-
"\x9B" => "\xE2\x80\xBA",
180-
"\x9C" => "\xC5\x93",
181-
"\x9D" => "\xEF\xBF\xBD",
182-
"\x9E" => "\xC5\xBE",
183-
"\x9F" => "\xC5\xB8",
184-
"\xA0" => "\xC2\xA0",
185-
"\xA1" => "\xC2\xA1",
186-
"\xA2" => "\xC2\xA2",
187-
"\xA3" => "\xC2\xA3",
188-
"\xA4" => "\xC2\xA4",
189-
"\xA5" => "\xC2\xA5",
190-
"\xA6" => "\xC2\xA6",
191-
"\xA7" => "\xC2\xA7",
192-
"\xA8" => "\xC2\xA8",
193-
"\xA9" => "\xC2\xA9",
194-
"\xAA" => "\xC2\xAA",
195-
"\xAB" => "\xC2\xAB",
196-
"\xAC" => "\xC2\xAC",
197-
"\xAD" => "\xC2\xAD",
198-
"\xAE" => "\xC2\xAE",
199-
"\xAF" => "\xC2\xAF",
200-
"\xB0" => "\xC2\xB0",
201-
"\xB1" => "\xC2\xB1",
202-
"\xB2" => "\xC2\xB2",
203-
"\xB3" => "\xC2\xB3",
204-
"\xB4" => "\xC2\xB4",
205-
"\xB5" => "\xC2\xB5",
206-
"\xB6" => "\xC2\xB6",
207-
"\xB7" => "\xC2\xB7",
208-
"\xB8" => "\xC2\xB8",
209-
"\xB9" => "\xC2\xB9",
210-
"\xBA" => "\xC2\xBA",
211-
"\xBB" => "\xC2\xBB",
212-
"\xBC" => "\xC2\xBC",
213-
"\xBD" => "\xC2\xBD",
214-
"\xBE" => "\xC2\xBE",
215-
"\xBF" => "\xC2\xBF",
216-
"\xC0" => "\xC3\x80",
217-
"\xC1" => "\xC3\x81",
218-
"\xC2" => "\xC3\x82",
219-
"\xC3" => "\xC3\x83",
220-
"\xC4" => "\xC3\x84",
221-
"\xC5" => "\xC3\x85",
222-
"\xC6" => "\xC3\x86",
223-
"\xC7" => "\xC3\x87",
224-
"\xC8" => "\xC3\x88",
225-
"\xC9" => "\xC3\x89",
226-
"\xCA" => "\xC3\x8A",
227-
"\xCB" => "\xC3\x8B",
228-
"\xCC" => "\xC3\x8C",
229-
"\xCD" => "\xC3\x8D",
230-
"\xCE" => "\xC3\x8E",
231-
"\xCF" => "\xC3\x8F",
232-
"\xD0" => "\xC3\x90",
233-
"\xD1" => "\xC3\x91",
234-
"\xD2" => "\xC3\x92",
235-
"\xD3" => "\xC3\x93",
236-
"\xD4" => "\xC3\x94",
237-
"\xD5" => "\xC3\x95",
238-
"\xD6" => "\xC3\x96",
239-
"\xD7" => "\xC3\x97",
240-
"\xD8" => "\xC3\x98",
241-
"\xD9" => "\xC3\x99",
242-
"\xDA" => "\xC3\x9A",
243-
"\xDB" => "\xC3\x9B",
244-
"\xDC" => "\xC3\x9C",
245-
"\xDD" => "\xC3\x9D",
246-
"\xDE" => "\xC3\x9E",
247-
"\xDF" => "\xC3\x9F",
248-
"\xE0" => "\xC3\xA0",
249-
"\xE1" => "\xC3\xA1",
250-
"\xE2" => "\xC3\xA2",
251-
"\xE3" => "\xC3\xA3",
252-
"\xE4" => "\xC3\xA4",
253-
"\xE5" => "\xC3\xA5",
254-
"\xE6" => "\xC3\xA6",
255-
"\xE7" => "\xC3\xA7",
256-
"\xE8" => "\xC3\xA8",
257-
"\xE9" => "\xC3\xA9",
258-
"\xEA" => "\xC3\xAA",
259-
"\xEB" => "\xC3\xAB",
260-
"\xEC" => "\xC3\xAC",
261-
"\xED" => "\xC3\xAD",
262-
"\xEE" => "\xC3\xAE",
263-
"\xEF" => "\xC3\xAF",
264-
"\xF0" => "\xC3\xB0",
265-
"\xF1" => "\xC3\xB1",
266-
"\xF2" => "\xC3\xB2",
267-
"\xF3" => "\xC3\xB3",
268-
"\xF4" => "\xC3\xB4",
269-
"\xF5" => "\xC3\xB5",
270-
"\xF6" => "\xC3\xB6",
271-
"\xF7" => "\xC3\xB7",
272-
"\xF8" => "\xC3\xB8",
273-
"\xF9" => "\xC3\xB9",
274-
"\xFA" => "\xC3\xBA",
275-
"\xFB" => "\xC3\xBB",
276-
"\xFC" => "\xC3\xBC",
277-
"\xFD" => "\xC3\xBD",
278-
"\xFE" => "\xC3\xBE",
279-
"\xFF" => "\xC3\xBF",
280-
]);
281-
}
28218
}

0 commit comments

Comments
 (0)