Skip to content

bpo-36778: cp65001 encoding becomes an alias to utf_8 #13230

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 10, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions Doc/library/codecs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1106,8 +1106,7 @@ particular, the following variants typically exist:
+-----------------+--------------------------------+--------------------------------+
| cp1258 | windows-1258 | Vietnamese |
+-----------------+--------------------------------+--------------------------------+
| cp65001 | | Windows only: Windows UTF-8 |
| | | (``CP_UTF8``) |
| cp65001 | | Alias to ``utf_8`` encoding |
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add the versionchanged directive.

I think it is better to remove this row add add cp65001 to the list of utf-8 aliases.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I created PR #13240 for your doc change proposal.

| | | |
| | | .. versionadded:: 3.3 |
+-----------------+--------------------------------+--------------------------------+
Expand Down
1 change: 1 addition & 0 deletions Lib/encodings/aliases.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,7 @@
'utf8' : 'utf_8',
'utf8_ucs2' : 'utf_8',
'utf8_ucs4' : 'utf_8',
'cp65001' : 'utf_8',

# uu_codec codec
'uu' : 'uu_codec',
Expand Down
43 changes: 0 additions & 43 deletions Lib/encodings/cp65001.py

This file was deleted.

89 changes: 0 additions & 89 deletions Lib/test/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -875,95 +875,6 @@ def test_surrogatepass_handler(self):
b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")


@unittest.skipUnless(sys.platform == 'win32',
'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does utf-8 pass these tests?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I confirmed utf-8 passes this test.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does utf-8 pass these tests?

Sorry, I forgot to specify before I merged my PR that yes: I tested on my Windows 10 and the test still passed. But CP65001Test is now redundant with UTF8Ttest.

I confirmed utf-8 passes this test.

Thanks for checking :-)

encoding = "cp65001"

def test_encode(self):
tests = [
('abc', 'strict', b'abc'),
('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
('\udc80', 'strict', None),
('\udc80', 'ignore', b''),
('\udc80', 'replace', b'?'),
('\udc80', 'backslashreplace', b'\\udc80'),
('\udc80', 'namereplace', b'\\udc80'),
('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
]
for text, errors, expected in tests:
if expected is not None:
try:
encoded = text.encode('cp65001', errors)
except UnicodeEncodeError as err:
self.fail('Unable to encode %a to cp65001 with '
'errors=%r: %s' % (text, errors, err))
self.assertEqual(encoded, expected,
'%a.encode("cp65001", %r)=%a != %a'
% (text, errors, encoded, expected))
else:
self.assertRaises(UnicodeEncodeError,
text.encode, "cp65001", errors)

def test_decode(self):
tests = [
(b'abc', 'strict', 'abc'),
(b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
(b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
(b'\xef\xbf\xbd', 'strict', '\ufffd'),
(b'[\xc3\xa9]', 'strict', '[\xe9]'),
# invalid bytes
(b'[\xff]', 'strict', None),
(b'[\xff]', 'ignore', '[]'),
(b'[\xff]', 'replace', '[\ufffd]'),
(b'[\xff]', 'surrogateescape', '[\udcff]'),
(b'[\xed\xb2\x80]', 'strict', None),
(b'[\xed\xb2\x80]', 'ignore', '[]'),
(b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
]
for raw, errors, expected in tests:
if expected is not None:
try:
decoded = raw.decode('cp65001', errors)
except UnicodeDecodeError as err:
self.fail('Unable to decode %a from cp65001 with '
'errors=%r: %s' % (raw, errors, err))
self.assertEqual(decoded, expected,
'%a.decode("cp65001", %r)=%a != %a'
% (raw, errors, decoded, expected))
else:
self.assertRaises(UnicodeDecodeError,
raw.decode, 'cp65001', errors)

def test_lone_surrogates(self):
self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
b'[\\udc80]')
self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
b'[\\udc80]')
self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
b'[�]')
self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
b'[\x80]')
self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
b'[]')
self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
b'[?]')

def test_surrogatepass_handler(self):
self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
b"abc\xed\xa0\x80def")
self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
"abc\ud800def")
self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
b"\xf0\x90\xbf\xbf\xed\xa0\x80")
self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
"\U00010fff\uD800")
self.assertTrue(codecs.lookup_error("surrogatepass"))


class UTF7Test(ReadTest, unittest.TestCase):
encoding = "utf-7"

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
``cp65001`` encoding (Windows code page 65001) becomes an alias to ``utf_8``
encoding.