-
Notifications
You must be signed in to change notification settings - Fork 3.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Php/iconv Should Not Treat FFFE/FFFF as Valid (#2910)
Fix #2897. We have been relying on iconv/mb_convert_encoding to detect invalid UTF-8, but all techniques designed to validate UTF-8 seem to accept FFFE and FFFF. This PR explicitly converts those characters to FFFD (Unicode substitution character) before validating the rest of the string. It also substitutes one or more FFFD when it detects invalid UTF-8 character sequences. A comment in the code being changed stated that it doesn't handle surrogates. It is right not to do so. The only case where we should see surrogates is reading UTF-16. Additional tests are added to an existing test reading a UTF-16 Csv to demonstrate that surrogates are handled correctly, and that FFFE/FFFF are handled reasonably.
- Loading branch information
Showing
6 changed files
with
121 additions
and
45 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
44 changes: 44 additions & 0 deletions
44
tests/PhpSpreadsheetTests/Shared/StringHelperInvalidCharTest.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
<?php | ||
|
||
namespace PhpOffice\PhpSpreadsheetTests\Shared; | ||
|
||
use PhpOffice\PhpSpreadsheet\Shared\StringHelper; | ||
use PhpOffice\PhpSpreadsheet\Spreadsheet; | ||
use PHPUnit\Framework\TestCase; | ||
|
||
class StringHelperInvalidCharTest extends TestCase
{
    /**
     * Writes a table of valid and invalid values into a worksheet and checks,
     * for every row, both the verdict of StringHelper::isUTF8() on the input
     * and the value that ends up stored in the cell.
     *
     * Each case is a row of [label, input, expected stored value]. The label
     * lands in column A, the input in column B; column C is written too but
     * is not read back by this test.
     */
    public function testInvalidChar(): void
    {
        $spreadsheet = new Spreadsheet();
        $sheet = $spreadsheet->getActiveSheet();
        // Character the test expects to find in place of each rejected byte/sequence.
        $substitution = '�';
        // NOTE(review): the byte sequence F0 90 90 80 is U+10400, which appears to be
        // in the Deseret block rather than Osmanya; labels are kept as written.
        $cases = [
            ['Normal string', 'Hello', 'Hello'],
            ['integer', 2, 2],
            ['float', 2.1, 2.1],
            ['boolean true', true, true],
            ['illegal FFFE/FFFF', "H\xef\xbf\xbe\xef\xbf\xbfello", "H{$substitution}{$substitution}ello"],
            ['illegal character', "H\xef\x00\x00ello", "H{$substitution}\x00\x00ello"],
            ['overlong character', "H\xc0\xa0ello", "H{$substitution}{$substitution}ello"],
            ['Osmanya as single character', "H\xf0\x90\x90\x80ello", 'H𐐀ello'],
            ['Osmanya as surrogate pair (x)', "\xed\xa0\x81\xed\xb0\x80", "{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}"],
            ['Osmanya as surrogate pair (u)', "\u{d801}\u{dc00}", "{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}"],
            ['Half surrogate pair (u)', "\u{d801}", "{$substitution}{$substitution}{$substitution}"],
            ['Control character', "\u{7}", "\u{7}"],
        ];

        $sheet->fromArray($cases);
        foreach ($cases as $index => [$label, $input, $expected]) {
            // Worksheet rows are 1-based while the case list is 0-based.
            $row = $index + 1;

            // An input counts as valid UTF-8 exactly when it is expected to be
            // stored unchanged.
            self::assertSame(
                $input === $expected,
                StringHelper::isUTF8((string) $input),
                "isUTF8 verdict for: {$label}"
            );
            self::assertSame(
                $expected,
                $sheet->getCell("B{$row}")->getValue(),
                "stored value for: {$label}"
            );
        }

        // Release the workbook; reached only when every assertion above passed.
        $spreadsheet->disconnectWorksheets();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.