Skip to content

Commit

Permalink
Php/iconv Should Not Treat FFFE/FFFF as Valid (#2910)
Browse files Browse the repository at this point in the history
Fix #2897. We have been relying on iconv/mb_convert_encoding to detect invalid UTF-8, but all techniques designed to validate UTF-8 seem to accept FFFE and FFFF. This PR explicitly converts those characters to FFFD (Unicode substitution character) before validating the rest of the string. It also substitutes one or more FFFD when it detects invalid UTF-8 character sequences.

A comment in the code being change stated that it doesn't handle surrogates. It is right not to do so. The only case where we should see surrogates is reading UTF-16. Additional tests are added to an existing test reading a UTF-16 Csv to demonstrate that surrogates are handled correctly, and that FFFE/FFFF are handled reasonably.
  • Loading branch information
oleibman authored Jul 2, 2022
1 parent f90adcf commit c3f5385
Show file tree
Hide file tree
Showing 6 changed files with 121 additions and 45 deletions.
30 changes: 0 additions & 30 deletions phpstan-baseline.neon
Original file line number Diff line number Diff line change
Expand Up @@ -2785,36 +2785,6 @@ parameters:
count: 1
path: src/PhpSpreadsheet/Shared/OLERead.php

-
message: "#^Method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:formatNumber\\(\\) should return string but returns array\\|string\\.$#"
count: 1
path: src/PhpSpreadsheet/Shared/StringHelper.php

-
message: "#^Method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:sanitizeUTF8\\(\\) should return string but returns string\\|false\\.$#"
count: 1
path: src/PhpSpreadsheet/Shared/StringHelper.php

-
message: "#^Parameter \\#1 \\$string of function strlen expects string, float given\\.$#"
count: 1
path: src/PhpSpreadsheet/Shared/StringHelper.php

-
message: "#^Parameter \\#3 \\$subject of function str_replace expects array\\|string, float given\\.$#"
count: 1
path: src/PhpSpreadsheet/Shared/StringHelper.php

-
message: "#^Static property PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:\\$decimalSeparator \\(string\\) in isset\\(\\) is not nullable\\.$#"
count: 1
path: src/PhpSpreadsheet/Shared/StringHelper.php

-
message: "#^Static property PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:\\$thousandsSeparator \\(string\\) in isset\\(\\) is not nullable\\.$#"
count: 1
path: src/PhpSpreadsheet/Shared/StringHelper.php

-
message: "#^Static method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\TimeZone\\:\\:validateTimeZone\\(\\) is unused\\.$#"
count: 1
Expand Down
44 changes: 29 additions & 15 deletions src/PhpSpreadsheet/Shared/StringHelper.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
namespace PhpOffice\PhpSpreadsheet\Shared;

use PhpOffice\PhpSpreadsheet\Calculation\Calculation;
use UConverter;

class StringHelper
{
/** Constants */
/** Regular Expressions */
// Fraction
const STRING_REGEXP_FRACTION = '(-?)(\d+)\s+(\d+\/\d+)';
const STRING_REGEXP_FRACTION = '~^\s*(-?)((\d*)\s+)?(\d+\/\d+)\s*$~';

/**
* Control characters array.
Expand All @@ -28,14 +29,14 @@ class StringHelper
/**
* Decimal separator.
*
* @var string
* @var ?string
*/
private static $decimalSeparator;

/**
* Thousands separator.
*
* @var string
* @var ?string
*/
private static $thousandsSeparator;

Expand Down Expand Up @@ -328,39 +329,51 @@ public static function controlCharacterPHP2OOXML($textValue)
}

/**
* Try to sanitize UTF8, stripping invalid byte sequences. Not perfect. Does not surrogate characters.
* Try to sanitize UTF8, replacing invalid sequences with Unicode substitution characters.
*/
public static function sanitizeUTF8(string $textValue): string
{
$textValue = str_replace(["\xef\xbf\xbe", "\xef\xbf\xbf"], "\xef\xbf\xbd", $textValue);
if (class_exists(UConverter::class)) {
$returnValue = UConverter::transcode($textValue, 'UTF-8', 'UTF-8');
if ($returnValue !== false) {
return $returnValue;
}
}
// @codeCoverageIgnoreStart
// I don't think any of the code below should ever be executed.
if (self::getIsIconvEnabled()) {
$textValue = @iconv('UTF-8', 'UTF-8', $textValue);

return $textValue;
$returnValue = @iconv('UTF-8', 'UTF-8', $textValue);
if ($returnValue !== false) {
return $returnValue;
}
}

$textValue = mb_convert_encoding($textValue, 'UTF-8', 'UTF-8');
// Phpstan does not think this can return false.
$returnValue = mb_convert_encoding($textValue, 'UTF-8', 'UTF-8');

return $textValue;
return $returnValue;
// @codeCoverageIgnoreEnd
}

/**
* Check if a string contains UTF8 data.
*/
public static function isUTF8(string $textValue): bool
{
return $textValue === '' || preg_match('/^./su', $textValue) === 1;
return $textValue === self::sanitizeUTF8($textValue);
}

/**
* Formats a numeric value as a string for output in various output writers forcing
* point as decimal separator in case locale is other than English.
*
* @param mixed $numericValue
* @param float|int|string $numericValue
*/
public static function formatNumber($numericValue): string
{
if (is_float($numericValue)) {
return str_replace(',', '.', $numericValue);
return str_replace(',', '.', (string) $numericValue);
}

return (string) $numericValue;
Expand Down Expand Up @@ -537,9 +550,10 @@ public static function strCaseReverse(string $textValue): string
*/
public static function convertToNumberIfFraction(string &$operand): bool
{
if (preg_match('/^' . self::STRING_REGEXP_FRACTION . '$/i', $operand, $match)) {
if (preg_match(self::STRING_REGEXP_FRACTION, $operand, $match)) {
$sign = ($match[1] == '-') ? '-' : '+';
$fractionFormula = '=' . $sign . $match[2] . $sign . $match[3];
$wholePart = ($match[3] === '') ? '' : ($sign . $match[3]);
$fractionFormula = '=' . $wholePart . $sign . $match[4];
$operand = Calculation::getInstance()->_calculateFormulaValue($fractionFormula);

return true;
Expand Down Expand Up @@ -686,6 +700,6 @@ public static function testStringAsNumeric($textValue)
}
$v = (float) $textValue;

return (is_numeric(substr($textValue, 0, strlen($v)))) ? $v : $textValue;
return (is_numeric(substr($textValue, 0, strlen((string) $v)))) ? $v : $textValue;
}
}
19 changes: 19 additions & 0 deletions tests/PhpSpreadsheetTests/Reader/Csv/CsvEncodingTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,25 @@ public function testGuessEncoding(string $filename): void
self::assertEquals('sixième', $sheet->getCell('C2')->getValue());
}

public function testSurrogate(): void
{
// Surrogates should occur only in UTF-16, and should
// be properly converted to UTF8 when read.
// FFFE/FFFF are illegal, and should be converted to
// substitution character when read.
// Excel does not handle any of the cells in row 3 well.
// LibreOffice handles A3 fine, and discards B3/C3,
// which is a reasonable action.
$filename = 'tests/data/Reader/CSV/premiere.utf16le.csv';
$reader = new Csv();
$reader->setInputEncoding(Csv::guessEncoding($filename));
$spreadsheet = $reader->load($filename);
$sheet = $spreadsheet->getActiveSheet();
self::assertEquals('𐐀', $sheet->getCell('A3')->getValue());
self::assertEquals('', $sheet->getCell('B3')->getValue());
self::assertEquals('', $sheet->getCell('C3')->getValue());
}

/**
* @dataProvider providerGuessEncoding
*/
Expand Down
44 changes: 44 additions & 0 deletions tests/PhpSpreadsheetTests/Shared/StringHelperInvalidCharTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
<?php

namespace PhpOffice\PhpSpreadsheetTests\Shared;

use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
use PhpOffice\PhpSpreadsheet\Spreadsheet;
use PHPUnit\Framework\TestCase;

class StringHelperInvalidCharTest extends TestCase
{
public function testInvalidChar(): void
{
$spreadsheet = new Spreadsheet();
$sheet = $spreadsheet->getActiveSheet();
$substitution = '';
$array = [
['Normal string', 'Hello', 'Hello'],
['integer', 2, 2],
['float', 2.1, 2.1],
['boolean true', true, true],
['illegal FFFE/FFFF', "H\xef\xbf\xbe\xef\xbf\xbfello", "H{$substitution}{$substitution}ello"],
['illegal character', "H\xef\x00\x00ello", "H{$substitution}\x00\x00ello"],
['overlong character', "H\xc0\xa0ello", "H{$substitution}{$substitution}ello"],
['Osmanya as single character', "H\xf0\x90\x90\x80ello", 'H𐐀ello'],
['Osmanya as surrogate pair (x)', "\xed\xa0\x81\xed\xb0\x80", "{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}"],
['Osmanya as surrogate pair (u)', "\u{d801}\u{dc00}", "{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}"],
['Half surrogate pair (u)', "\u{d801}", "{$substitution}{$substitution}{$substitution}"],
['Control character', "\u{7}", "\u{7}"],
];

$sheet->fromArray($array);
$row = 0;
foreach ($array as $value) {
self::assertSame($value[1] === $value[2], StringHelper::isUTF8((string) $value[1]));
++$row;
$expected = $value[2];
self::assertSame(
$expected,
$sheet->getCell("B$row")->getValue(),
$sheet->getCell("A$row")->getValue()
);
}
}
}
29 changes: 29 additions & 0 deletions tests/PhpSpreadsheetTests/Shared/StringHelperTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,33 @@ public function testSYLKtoUTF8(): void

self::assertEquals($expectedResult, $result);
}

/**
* @dataProvider providerFractions
*/
public function testFraction(string $expected, string $value): void
{
$originalValue = $value;
$result = StringHelper::convertToNumberIfFraction($value);
if ($result === false) {
self::assertSame($expected, $originalValue);
self::assertSame($expected, $value);
} else {
self::assertSame($expected, (string) $value);
self::assertNotEquals($value, $originalValue);
}
}

public function providerFractions(): array
{
return [
'non-fraction' => ['1', '1'],
'common fraction' => ['1.5', '1 1/2'],
'fraction between -1 and 0' => ['-0.5', '-1/2'],
'fraction between -1 and 0 with space' => ['-0.5', ' - 1/2'],
'fraction between 0 and 1' => ['0.75', '3/4 '],
'fraction between 0 and 1 with space' => ['0.75', ' 3/4'],
'improper fraction' => ['1.75', '7/4'],
];
}
}
Binary file modified tests/data/Reader/CSV/premiere.utf16le.csv
Binary file not shown.

0 comments on commit c3f5385

Please sign in to comment.