Skip to content

Commit

Permalink
Throw an exception for unsupported charsets
Browse files Browse the repository at this point in the history
  • Loading branch information
zbateson committed Mar 18, 2024
1 parent 09a8b77 commit 437876d
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 14 deletions.
12 changes: 10 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,21 @@ To include it for use in your project, please install via composer:
composer require zbateson/mb-wrapper
```

## PHP 7 Support Dropped

As of mb-wrapper 2.0, support for PHP 7 has been dropped.

## Requirements

mb-wrapper requires PHP 7.1 or newer. Tested on PHP 7.1, 7.2, 7.3, 7.4, 8.0, 8.1, and 8.2 on GitHub Actions.
mb-wrapper requires PHP 8.0 or newer. Tested on PHP 8.0, 8.1, 8.2, and 8.3 on GitHub Actions.

## New in 2.0

If converting or performing an operation on a string fails in iconv, an UnsupportedCharsetException is now thrown.

## Description

MbWrapper is intended for use wherever mb_* or iconv_* is used. It scans supported charsets returned by mb_list_encodings(), and prefers mb_* functions, but will fallback to iconv if a charset isn't supported.
MbWrapper is intended for use wherever mb_* or iconv_* is used. It scans supported charsets returned by mb_list_encodings(), and prefers mb_* functions, but will fall back to iconv if a charset isn't supported by the mb_* functions.

A list of aliased charsets is maintained for both mb_* and iconv, where a supported charset exists for an alias. This is useful for mail and http parsing as other systems may report encodings not recognized by mb_* or iconv.

Expand Down
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
}
],
"require": {
"php": ">=7.1",
"php": ">=8.0",
"symfony/polyfill-mbstring": "^1.9",
"symfony/polyfill-iconv": "^1.9"
},
Expand Down
58 changes: 47 additions & 11 deletions src/MbWrapper.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
*
* @license http://opensource.org/licenses/bsd-license.php BSD
*/

namespace ZBateson\MbWrapper;

/**
Expand Down Expand Up @@ -325,6 +324,38 @@ private function getNormalizedCharset($charset)
return \preg_replace('/[^A-Z0-9]+/', '', $upper);
}

/**
 * Runs iconv with '//TRANSLIT//IGNORE' appended to the target charset and
 * turns a failed conversion into an exception.
 *
 * Warnings raised by iconv itself are suppressed; failure is detected solely
 * from the false return value.
 *
 * @param string $fromCharset charset name passed to iconv as the source
 * @param string $toCharset charset name passed to iconv as the target,
 *        without the option suffix (it is added here)
 * @param string $str the string to convert
 * @return string the converted string
 * @throws UnsupportedCharsetException if iconv returns false
 */
private function iconv(string $fromCharset, string $toCharset, string $str) : string
{
    $converted = @\iconv($fromCharset, $toCharset . '//TRANSLIT//IGNORE', $str);
    if ($converted !== false) {
        return $converted;
    }
    throw new UnsupportedCharsetException("Unable to convert from charsets: $fromCharset to $toCharset");
}

/**
 * Runs iconv_strlen with '//TRANSLIT//IGNORE' appended to the charset and
 * turns a failed call into an exception.
 *
 * Warnings raised by iconv_strlen are suppressed; failure is detected solely
 * from the false return value.
 *
 * @param string $str the string to measure
 * @param string $charset charset name without the option suffix (it is
 *        added here)
 * @return int the value returned by iconv_strlen
 * @throws UnsupportedCharsetException if iconv_strlen returns false
 */
private function iconvStrlen(string $str, string $charset) : int
{
    $length = @\iconv_strlen($str, $charset . '//TRANSLIT//IGNORE');
    if ($length !== false) {
        return $length;
    }
    throw new UnsupportedCharsetException("Charset $charset is not supported");
}

/**
 * Runs iconv_substr with '//TRANSLIT//IGNORE' appended to the charset and
 * turns a failed call into either an empty string or an exception.
 *
 * When iconv_substr returns false, the length of $str is measured with
 * iconvStrlen: if $start lies beyond that length an empty string is returned
 * (to keep in line with mb_substr), otherwise the failure is reported as an
 * unsupported charset.
 *
 * @param string $str the string to take a portion of
 * @param string $charset charset name without the option suffix (it is
 *        added here)
 * @param int $start offset passed to iconv_substr
 * @param int|null $length length passed unchanged to iconv_substr; may be
 *        null
 * @return string the substring, or '' if $start is past the end of $str
 * @throws UnsupportedCharsetException if iconv_substr fails for a reason
 *         other than $start being past the end, or if iconvStrlen fails
 */
private function iconvSubstr(string $str, string $charset, int $start, ?int $length = null) : string
{
    $ret = @\iconv_substr($str, $start, $length, $charset . '//TRANSLIT//IGNORE');
    if ($ret !== false) {
        return $ret;
    }
    // iconvStrlen throws on its own if the charset can't be used at all, so
    // reaching the comparison means the charset itself is workable.
    if ($start > $this->iconvStrlen($str, $charset)) {
        // returns empty to keep in line with mb_substr functionality
        return '';
    }
    throw new UnsupportedCharsetException("Charset $charset is not supported");
}

/**
* Converts the passed string's charset from the passed $fromCharset to the
* passed $toCharset
Expand All @@ -333,6 +364,7 @@ private function getNormalizedCharset($charset)
* back to iconv if not. If the source or destination character sets aren't
* supported by iconv either, an UnsupportedCharsetException is thrown.
*
* @throws UnsupportedCharsetException if iconv fails
*/
public function convert(string $str, string $fromCharset, string $toCharset) : string
{
Expand All @@ -347,16 +379,16 @@ public function convert(string $str, string $fromCharset, string $toCharset) : s
if ($str !== '') {
if ($from !== false && $to === false) {
$str = \mb_convert_encoding($str, 'UTF-8', $from);
return \iconv('UTF-8', $this->getIconvAlias($toCharset) . '//TRANSLIT//IGNORE', $str);
return $this->iconv('UTF-8', $this->getIconvAlias($toCharset), $str);
} elseif ($from === false && $to !== false) {
$str = \iconv($this->getIconvAlias($fromCharset), 'UTF-8//TRANSLIT//IGNORE', $str);
$str = $this->iconv($this->getIconvAlias($fromCharset), 'UTF-8', $str);
return \mb_convert_encoding($str, $to, 'UTF-8');
} elseif ($from !== false && $to !== false) {
return \mb_convert_encoding($str, $to, $from);
}
return \iconv(
return $this->iconv(
$this->getIconvAlias($fromCharset),
$this->getIconvAlias($toCharset) . '//TRANSLIT//IGNORE',
$this->getIconvAlias($toCharset),
$str
);
}
Expand All @@ -376,25 +408,32 @@ public function checkEncoding(string $str, string $charset) : bool
return \mb_check_encoding($str, $mb);
}
$ic = $this->getIconvAlias($charset);
return (@\iconv($ic, $ic, $str) !== false);
return (@\iconv($ic, $ic . '//TRANSLIT//IGNORE', $str) !== false);
}

/**
* Uses either mb_strlen or iconv_strlen to return the number of characters
* in the passed $str for the given $charset
*
* @throws UnsupportedCharsetException if iconv fails
*/
public function getLength(string $str, string $charset) : int
{
$mb = $this->getMbCharset($charset);
if ($mb !== false) {
return \mb_strlen($str, $mb);
}
return \iconv_strlen($str, $this->getIconvAlias($charset) . '//TRANSLIT//IGNORE');
return $this->iconvStrlen($str, $this->getIconvAlias($charset));
}

/**
* Uses either mb_substr or iconv_substr to create and return a substring of
* the passed $str.
*
* If the offset provided in $start is greater than the length of the
* string, an empty string is returned.
*
* @throws UnsupportedCharsetException if iconv fails
*/
public function getSubstr(string $str, string $charset, int $start, ?int $length = null) : string
{
Expand All @@ -409,10 +448,7 @@ public function getSubstr(string $str, string $charset, int $start, ?int $length
$str = $this->convert($str, $ic, 'UTF-8');
return $this->convert($this->getSubstr($str, 'UTF-8', $start, $length), 'UTF-8', $ic);
}
if ($length === null) {
$length = \iconv_strlen($str, $ic . '//TRANSLIT//IGNORE');
}
return \iconv_substr($str, $start, $length, $ic . '//TRANSLIT//IGNORE');
return $this->iconvSubstr($str, $ic, $start, $length);
}

/**
Expand Down
19 changes: 19 additions & 0 deletions src/UnsupportedCharsetException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<?php
/**
* This file is part of the ZBateson\MailMimeParser project.
*
* @license http://opensource.org/licenses/bsd-license.php BSD
*/

namespace ZBateson\MbWrapper;

use InvalidArgumentException;

/**
 * Exception thrown if MbWrapper can't convert from or to a specified charset.
 *
 * Extends InvalidArgumentException, so code catching that type also catches
 * this exception.
 *
 * @author Zaahid Bateson
 */
class UnsupportedCharsetException extends InvalidArgumentException
{
}
31 changes: 31 additions & 0 deletions tests/MbWrapper/MbWrapperTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -204,4 +204,35 @@ public function testIconvSubstr() : void

}
}

/**
 * Converting to and from a made-up charset name is expected to raise
 * UnsupportedCharsetException.
 */
public function testConvertInvalidCharset() : void
{
    $wrapper = new MbWrapper();
    $input = 'This is my string';

    $this->expectException(UnsupportedCharsetException::class);
    $encoded = $wrapper->convert($input, 'UTF-8', 'ASDF-ABC-123');
    $wrapper->convert($encoded, 'ASDF-ABC-123', 'UTF-8');
}

/**
 * Asking for the length of a string in a made-up charset is expected to
 * raise UnsupportedCharsetException.
 */
public function testLengthInvalidCharset() : void
{
    $wrapper = new MbWrapper();
    $input = 'This is my string';

    $this->expectException(UnsupportedCharsetException::class);
    $wrapper->getLength($input, 'ASDF-ABC-123');
}

/**
 * Taking a substring of a string in a made-up charset is expected to raise
 * UnsupportedCharsetException.
 */
public function testSubstrInvalidCharset() : void
{
    $wrapper = new MbWrapper();
    $input = 'This is my string';

    $this->expectException(UnsupportedCharsetException::class);
    $wrapper->getSubstr($input, 'ASDF-ABC-123', 0);
}

/**
 * A start offset past the end of the string is expected to yield an empty
 * string rather than an exception.
 */
public function testSubstrInvalidOffset() : void
{
    $test = 'Test';
    $converter = new MbWrapper();
    // assertSame so that only an actual empty string passes, not any
    // value that merely compares loosely equal to ''.
    $this->assertSame('', $converter->getSubstr($test, 'CP1250', 10));
}
}

0 comments on commit 437876d

Please sign in to comment.