|
9 | 9 |
|
10 | 10 | class Csv extends BaseReader |
11 | 11 | { |
// Byte-order marks (BOM) for the Unicode encodings this reader can recognise,
// each paired with its length in bytes. The *_LF constants hold the line-feed
// character (U+000A) as encoded in the corresponding fixed-width encoding;
// they are used to guess the encoding of files that carry no BOM.

// UTF-8: BOM is the three bytes EF BB BF.
| 12 | + const UTF8_BOM = "\xEF\xBB\xBF"; |
| 13 | + const UTF8_BOM_LEN = 3; |
// UTF-16, big-endian: two-byte BOM, line feed is 00 0A.
| 14 | + const UTF16BE_BOM = "\xfe\xff"; |
| 15 | + const UTF16BE_BOM_LEN = 2; |
| 16 | + const UTF16BE_LF = "\x00\x0a"; |
// UTF-16, little-endian: two-byte BOM, line feed is 0A 00.
| 17 | + const UTF16LE_BOM = "\xff\xfe"; |
| 18 | + const UTF16LE_BOM_LEN = 2; |
| 19 | + const UTF16LE_LF = "\x0a\x00"; |
// UTF-32, big-endian: four-byte BOM, line feed is 00 00 00 0A.
| 20 | + const UTF32BE_BOM = "\x00\x00\xfe\xff"; |
| 21 | + const UTF32BE_BOM_LEN = 4; |
| 22 | + const UTF32BE_LF = "\x00\x00\x00\x0a"; |
// UTF-32, little-endian: four-byte BOM (its first two bytes equal the
// UTF-16LE BOM), line feed is 0A 00 00 00.
| 23 | + const UTF32LE_BOM = "\xff\xfe\x00\x00"; |
| 24 | + const UTF32LE_BOM_LEN = 4; |
| 25 | + const UTF32LE_LF = "\x0a\x00\x00\x00"; |
| 26 | + |
12 | 27 | /** |
13 | 28 | * Input encoding. |
14 | 29 | * |
@@ -90,12 +105,8 @@ protected function skipBOM(): void |
90 | 105 | { |
91 | 106 | rewind($this->fileHandle); |
92 | 107 |
|
93 | | - switch ($this->inputEncoding) { |
94 | | - case 'UTF-8': |
95 | | - fgets($this->fileHandle, 4) == "\xEF\xBB\xBF" ? |
96 | | - fseek($this->fileHandle, 3) : fseek($this->fileHandle, 0); |
97 | | - |
98 | | - break; |
| 108 | + if (fgets($this->fileHandle, self::UTF8_BOM_LEN + 1) !== self::UTF8_BOM) { |
| 109 | + rewind($this->fileHandle); |
99 | 110 | } |
100 | 111 | } |
101 | 112 |
|
@@ -213,7 +224,9 @@ function ($sum, $value) use ($median) { |
213 | 224 | private function getNextLine() |
214 | 225 | { |
215 | 226 | $line = ''; |
216 | | - $enclosure = '(?<!' . preg_quote($this->escapeCharacter, '/') . ')' . preg_quote($this->enclosure, '/'); |
| 227 | + $enclosure = ($this->escapeCharacter === '' ? '' |
| 228 | + : ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')')) |
| 229 | + . preg_quote($this->enclosure, '/'); |
217 | 230 |
|
218 | 231 | do { |
219 | 232 | // Get the next line in the file |
@@ -307,7 +320,7 @@ private function openFileOrMemory($pFilename): void |
307 | 320 | $this->fileHandle = fopen('php://memory', 'r+b'); |
308 | 321 | $data = StringHelper::convertEncoding($entireFile, 'UTF-8', $this->inputEncoding); |
309 | 322 | fwrite($this->fileHandle, $data); |
310 | | - rewind($this->fileHandle); |
| 323 | + $this->skipBOM(); |
311 | 324 | } |
312 | 325 | } |
313 | 326 |
|
@@ -531,4 +544,63 @@ public function canRead($pFilename) |
531 | 544 |
|
532 | 545 | return in_array($type, $supportedTypes, true); |
533 | 546 | } |
| 547 | + |
/**
 * Heuristic test for one fixed-width encoding on content that has no BOM.
 *
 * Searches $contents for the byte sequence $compare (an encoded line feed)
 * at an offset that is a multiple of the sequence's own length, i.e. on a
 * code-unit boundary. Every occurrence is examined, so a misaligned hit
 * early in the content cannot hide a correctly aligned one further on.
 *
 * @param string $encoding    Result holder; only written while it is still '' (nothing chosen yet)
 * @param string $contents    Raw bytes to scan; taken by reference but never modified here
 * @param string $compare     Byte sequence to look for, e.g. self::UTF16LE_LF
 * @param string $setEncoding Encoding name stored into $encoding when an aligned match is found
 */
private static function guessEncodingTestNoBom(string &$encoding, string &$contents, string $compare, string $setEncoding): void
{
    $width = strlen($compare);
    // Nothing to do if an earlier test already decided, and an empty needle
    // can neither be searched for nor used as a modulus.
    if ($encoding !== '' || $width === 0) {
        return;
    }

    $offset = 0;
    while (($pos = strpos($contents, $compare, $offset)) !== false) {
        if ($pos % $width === 0) {
            $encoding = $setEncoding;

            return;
        }

        // Misaligned hit: resume one byte later so overlapping candidates are
        // still considered. $pos + 1 never exceeds strlen($contents) because
        // the match itself occupies at least one byte.
        $offset = $pos + 1;
    }
}
| 557 | + |
/**
 * Guess the encoding of a file that does not start with a byte-order mark.
 *
 * The whole file is read and probed for an aligned, encoded line feed in
 * each fixed-width Unicode encoding. If none matches and the bytes form
 * valid UTF-8 (which includes plain ASCII and an empty file), 'UTF-8' is
 * reported.
 *
 * @param string $filename Path of the file to inspect
 *
 * @return string Encoding name, or '' when the file cannot be read or no guess can be made
 */
private static function guessEncodingNoBom(string $filename): string
{
    $encoding = '';
    $contents = file_get_contents($filename);
    if ($contents === false) {
        // Unreadable file: report "no guess" so the caller can apply its
        // default, rather than probing a non-string value.
        return $encoding;
    }

    // The UTF-32 probes must run before the UTF-16 ones: a UTF-32 line feed
    // also contains the UTF-16 line-feed byte pattern on a 2-byte boundary.
    self::guessEncodingTestNoBom($encoding, $contents, self::UTF32BE_LF, 'UTF-32BE');
    self::guessEncodingTestNoBom($encoding, $contents, self::UTF32LE_LF, 'UTF-32LE');
    self::guessEncodingTestNoBom($encoding, $contents, self::UTF16BE_LF, 'UTF-16BE');
    self::guessEncodingTestNoBom($encoding, $contents, self::UTF16LE_LF, 'UTF-16LE');

    // An empty pattern with the /u modifier matches only when the subject is
    // well-formed UTF-8, which makes it a cheap validity check.
    if ($encoding === '' && preg_match('//u', $contents) === 1) {
        $encoding = 'UTF-8';
    }

    return $encoding;
}
| 572 | + |
/**
 * Check whether the leading bytes of a file begin with a given byte-order mark.
 *
 * @param string $encoding    Result holder; left untouched unless it is still ''
 * @param string $first4      The first bytes of the file (at most four are supplied by the caller)
 * @param string $compare     Byte-order mark to test for
 * @param string $setEncoding Encoding name stored into $encoding when the mark matches
 */
private static function guessEncodingTestBom(string &$encoding, string $first4, string $compare, string $setEncoding): void
{
    // A previous probe already picked an encoding; keep it.
    if ($encoding !== '') {
        return;
    }

    $prefix = substr($first4, 0, strlen($compare));
    if ($prefix === $compare) {
        $encoding = $setEncoding;
    }
}
| 581 | + |
/**
 * Determine a file's encoding from its byte-order mark, if it has one.
 *
 * @param string $filename Path of the file to inspect
 *
 * @return string Encoding name, or '' when the file cannot be read or carries no recognised BOM
 */
private static function guessEncodingBom(string $filename): string
{
    $detected = '';
    $head = file_get_contents($filename, false, null, 0, 4);
    if ($head === false) {
        return $detected;
    }

    // Probe order matters: the UTF-32LE mark starts with the same two bytes
    // as the UTF-16LE mark, so the longer one has to be tried first.
    $candidates = [
        [self::UTF8_BOM, 'UTF-8'],
        [self::UTF16BE_BOM, 'UTF-16BE'],
        [self::UTF32BE_BOM, 'UTF-32BE'],
        [self::UTF32LE_BOM, 'UTF-32LE'],
        [self::UTF16LE_BOM, 'UTF-16LE'],
    ];
    foreach ($candidates as [$bom, $name]) {
        self::guessEncodingTestBom($detected, $head, $bom, $name);
    }

    return $detected;
}
| 596 | + |
/**
 * Guess the character encoding of a file.
 *
 * A byte-order mark takes precedence; without one, the file content is
 * inspected. When neither approach yields an answer, $dflt is returned.
 *
 * @param string $filename Path of the file to inspect
 * @param string $dflt     Encoding to report when no guess can be made
 *
 * @return string The guessed encoding name, or $dflt
 */
public static function guessEncoding(string $filename, string $dflt = 'CP1252'): string
{
    $guess = self::guessEncodingBom($filename);
    if ($guess !== '') {
        return $guess;
    }

    $guess = self::guessEncodingNoBom($filename);
    if ($guess !== '') {
        return $guess;
    }

    return $dflt;
}
534 | 606 | } |
0 commit comments