Skip to content

Commit

Permalink
Issue python#19619: Blacklist non-text codecs in method API
Browse files Browse the repository at this point in the history
str.encode, bytes.decode and bytearray.decode now use an
internal API to throw LookupError for known non-text encodings,
rather than attempting the encoding or decoding operation and
then throwing a TypeError for an unexpected output type.

The latter mechanism remains in place for third party non-text
encodings.

Backported changeset d68df99d7a57.
  • Loading branch information
serhiy-storchaka committed Feb 24, 2014
1 parent 20f8728 commit 94ee389
Show file tree
Hide file tree
Showing 13 changed files with 219 additions and 19 deletions.
27 changes: 27 additions & 0 deletions Include/codecs.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,33 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode(
const char *errors
);

#ifndef PY_LIMITED_API
/* Text codec specific encoding and decoding API.
Checks the encoding against a list of codecs which do not
implement a str<->bytes encoding before attempting the
operation.
Please note that these APIs are internal and should not
be used in Python C extensions.
*/

PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
PyObject *object,
const char *encoding,
const char *errors
);

PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
PyObject *object,
const char *encoding,
const char *errors
);
#endif



/* --- Codec Lookup APIs --------------------------------------------------
All APIs return a codec object with incremented refcount and are
Expand Down
14 changes: 13 additions & 1 deletion Lib/codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,19 @@
### Codec base classes (defining the API)

class CodecInfo(tuple):
"""Codec details when looking up the codec registry"""

# Private API to allow Python 3.4 to blacklist the known non-Unicode
# codecs in the standard library. A more general mechanism to
# reliably distinguish text encodings from other codecs will hopefully
# be defined for Python 3.5
#
# See http://bugs.python.org/issue19619
_is_text_encoding = True # Assume codecs are text encodings by default

def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
incrementalencoder=None, incrementaldecoder=None, name=None):
incrementalencoder=None, incrementaldecoder=None, name=None,
*, _is_text_encoding=None):
self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
self.name = name
self.encode = encode
Expand All @@ -84,6 +94,8 @@ def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
self.incrementaldecoder = incrementaldecoder
self.streamwriter = streamwriter
self.streamreader = streamreader
if _is_text_encoding is not None:
self._is_text_encoding = _is_text_encoding
return self

def __repr__(self):
Expand Down
1 change: 1 addition & 0 deletions Lib/encodings/base64_codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
_is_text_encoding=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/bz2_codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
_is_text_encoding=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/hex_codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
_is_text_encoding=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/quopri_codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
_is_text_encoding=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/rot_13.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
_is_text_encoding=False,
)

### Map
Expand Down
1 change: 1 addition & 0 deletions Lib/encodings/uu_codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_text_encoding=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/zlib_codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_text_encoding=False,
)
42 changes: 42 additions & 0 deletions Lib/test/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
import unittest
import warnings
import encodings

from test import support

Expand Down Expand Up @@ -2408,6 +2409,47 @@ def test_readline(self):
sout = reader.readline()
self.assertEqual(sout, b"\x80")

def test_text_to_binary_blacklists_binary_transforms(self):
    # str.encode() must refuse every binary -> binary codec up front with
    # a LookupError that points the caller at codecs.encode().
    text = "bad input type"
    template = (r"{!r} is not a text encoding; "
                r"use codecs.encode\(\) to handle arbitrary codecs")
    for codec_name in bytes_transform_encodings:
        expected = template.format(codec_name)
        with self.assertRaisesRegex(LookupError, expected) as caught:
            text.encode(codec_name)
        # The error is raised directly rather than chained from another one.
        self.assertIsNone(caught.exception.__cause__)

def test_text_to_binary_blacklists_text_transforms(self):
# Check str.encode gives a good error message for str -> str codecs
msg = (r"^'rot_13' is not a text encoding; "
r"use codecs.encode\(\) to handle arbitrary codecs")
with self.assertRaisesRegex(LookupError, msg):
"just an example message".encode("rot_13")

def test_binary_to_text_blacklists_binary_transforms(self):
    # bytes.decode() and bytearray.decode() must refuse every
    # binary -> binary codec with a LookupError that points the caller
    # at codecs.decode().
    payload = b"encode first to ensure we meet any format restrictions"
    template = (r"{!r} is not a text encoding; "
                r"use codecs.decode\(\) to handle arbitrary codecs")
    for codec_name in bytes_transform_encodings:
        # Feed validly encoded data so that the text-encoding check is the
        # only thing that can reject the call.
        encoded = codecs.encode(payload, codec_name)
        expected = template.format(codec_name)
        for candidate in (encoded, bytearray(encoded)):
            with self.assertRaisesRegex(LookupError, expected):
                candidate.decode(codec_name)

def test_binary_to_text_blacklists_text_transforms(self):
# Check str -> str codec gives a good error for binary input
for bad_input in (b"immutable", bytearray(b"mutable")):
msg = (r"^'rot_13' is not a text encoding; "
r"use codecs.decode\(\) to handle arbitrary codecs")
with self.assertRaisesRegex(LookupError, msg) as failure:
bad_input.decode("rot_13")
self.assertIsNone(failure.exception.__cause__)


@unittest.skipUnless(sys.platform == 'win32',
'code pages are specific to Windows')
Expand Down
6 changes: 6 additions & 0 deletions Misc/NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ What's New in Python 3.3.5 release candidate 1?
Core and Builtins
-----------------

- Issue #19619: str.encode, bytes.decode and bytearray.decode now use an
internal API to throw LookupError for known non-text encodings, rather
than attempting the encoding or decoding operation and then throwing a
TypeError for an unexpected output type. (The latter mechanism remains
in place for third party non-text encodings)

- Issue #20588: Make Python-ast.c C89 compliant.

- Issue #20437: Fixed 21 potential bugs when deleting objects references.
Expand Down
4 changes: 2 additions & 2 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -3129,7 +3129,7 @@ PyUnicode_Decode(const char *s,
buffer = PyMemoryView_FromBuffer(&info);
if (buffer == NULL)
goto onError;
unicode = PyCodec_Decode(buffer, encoding, errors);
unicode = _PyCodec_DecodeText(buffer, encoding, errors);
if (unicode == NULL)
goto onError;
if (!PyUnicode_Check(unicode)) {
Expand Down Expand Up @@ -3489,7 +3489,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
}

/* Encode via the codec registry */
v = PyCodec_Encode(unicode, encoding, errors);
v = _PyCodec_EncodeText(unicode, encoding, errors);
if (v == NULL)
return NULL;

Expand Down
138 changes: 122 additions & 16 deletions Python/codecs.c
Original file line number Diff line number Diff line change
Expand Up @@ -337,18 +337,15 @@ PyObject *PyCodec_StreamWriter(const char *encoding,
errors is passed to the encoder factory as argument if non-NULL. */

PyObject *PyCodec_Encode(PyObject *object,
const char *encoding,
const char *errors)
static PyObject *
_PyCodec_EncodeInternal(PyObject *object,
PyObject *encoder,
const char *encoding,
const char *errors)
{
PyObject *encoder = NULL;
PyObject *args = NULL, *result = NULL;
PyObject *v = NULL;

encoder = PyCodec_Encoder(encoding);
if (encoder == NULL)
goto onError;

args = args_tuple(object, errors);
if (args == NULL)
goto onError;
Expand Down Expand Up @@ -384,18 +381,15 @@ PyObject *PyCodec_Encode(PyObject *object,
errors is passed to the decoder factory as argument if non-NULL. */

PyObject *PyCodec_Decode(PyObject *object,
const char *encoding,
const char *errors)
static PyObject *
_PyCodec_DecodeInternal(PyObject *object,
PyObject *decoder,
const char *encoding,
const char *errors)
{
PyObject *decoder = NULL;
PyObject *args = NULL, *result = NULL;
PyObject *v;

decoder = PyCodec_Decoder(encoding);
if (decoder == NULL)
goto onError;

args = args_tuple(object, errors);
if (args == NULL)
goto onError;
Expand Down Expand Up @@ -425,6 +419,118 @@ PyObject *PyCodec_Decode(PyObject *object,
return NULL;
}

/* Generic encoding/decoding API */
/* Encode `object` with the encoder registered for `encoding`, without any
   text-encoding restriction.  Returns NULL if the encoder lookup fails;
   otherwise the encoder reference is handed on to
   _PyCodec_EncodeInternal() together with the remaining arguments. */
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *codec_encoder = PyCodec_Encoder(encoding);

    if (codec_encoder != NULL)
        return _PyCodec_EncodeInternal(object, codec_encoder,
                                       encoding, errors);
    return NULL;
}

/* Decode `object` with the decoder registered for `encoding`, without any
   text-encoding restriction.  Returns NULL if the decoder lookup fails;
   otherwise the decoder reference is handed on to
   _PyCodec_DecodeInternal() together with the remaining arguments. */
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *codec_decoder = PyCodec_Decoder(encoding);

    if (codec_decoder != NULL)
        return _PyCodec_DecodeInternal(object, codec_decoder,
                                       encoding, errors);
    return NULL;
}

/* Text encoding/decoding API */
/* Look up the codec registered for `encoding` and return item `index` of
   the codec tuple as a new reference.

   Unless the codec is an exact tuple, its private `_is_text_encoding`
   attribute is consulted first.  If that attribute exists and is false,
   a LookupError is raised whose message names `operation_name` (the
   codecs module function the caller should use instead).

   Returns NULL with an exception set when the lookup fails, when reading
   or truth-testing the attribute fails, or when the codec is flagged as
   not being a text encoding. */
static
PyObject *codec_getitem_checked(const char *encoding,
                                const char *operation_name,
                                int index)
{
    _Py_IDENTIFIER(_is_text_encoding);
    PyObject *codec;
    PyObject *attr;
    PyObject *v;
    int is_text_codec;

    codec = _PyCodec_Lookup(encoding);
    if (codec == NULL)
        return NULL;

    /* Backwards compatibility: assume any raw tuple describes a text
     * encoding, and the same for anything lacking the private
     * attribute.
     */
    if (!PyTuple_CheckExact(codec)) {
        attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
        if (attr == NULL) {
            /* Only a missing attribute is tolerated; any other failure
             * while fetching it is propagated to the caller.
             */
            if (!PyErr_ExceptionMatches(PyExc_AttributeError)) {
                Py_DECREF(codec);
                return NULL;
            }
            PyErr_Clear();
        }
        else {
            is_text_codec = PyObject_IsTrue(attr);
            Py_DECREF(attr);
            /* PyObject_IsTrue() returns -1 on failure with an exception
             * already set: propagate that error instead of treating the
             * codec as a text encoding.
             */
            if (is_text_codec <= 0) {
                Py_DECREF(codec);
                if (is_text_codec == 0)
                    PyErr_Format(PyExc_LookupError,
                                 "'%.400s' is not a text encoding; "
                                 "use codecs.%s() to handle arbitrary codecs",
                                 encoding, operation_name);
                return NULL;
            }
        }
    }

    /* Take our own reference to the borrowed item before releasing the
     * tuple that keeps it alive.
     */
    v = PyTuple_GET_ITEM(codec, index);
    Py_INCREF(v);
    Py_DECREF(codec);
    return v;
}

/* Return item 0 (the encode function) of the codec tuple registered for
   `encoding`, via codec_getitem_checked().  "encode" is the operation name
   used in the error message when the codec is rejected as a non-text
   encoding.  Returns NULL on failure. */
static PyObject * _PyCodec_TextEncoder(const char *encoding)
{
return codec_getitem_checked(encoding, "encode", 0);
}

/* Return item 1 (the decode function) of the codec tuple registered for
   `encoding`, via codec_getitem_checked().  "decode" is the operation name
   used in the error message when the codec is rejected as a non-text
   encoding.  Returns NULL on failure. */
static PyObject * _PyCodec_TextDecoder(const char *encoding)
{
return codec_getitem_checked(encoding, "decode", 1);
}

/* Text-only counterpart of PyCodec_Encode(): the encoder is obtained via
   _PyCodec_TextEncoder(), so a codec flagged as a non-text encoding is
   rejected before any encoding is attempted.  Returns NULL if that lookup
   fails; otherwise the encoder reference is handed on to
   _PyCodec_EncodeInternal(). */
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *text_encoder = _PyCodec_TextEncoder(encoding);

    if (text_encoder != NULL)
        return _PyCodec_EncodeInternal(object, text_encoder,
                                       encoding, errors);
    return NULL;
}

/* Text-only counterpart of PyCodec_Decode(): the decoder is obtained via
   _PyCodec_TextDecoder(), so a codec flagged as a non-text encoding is
   rejected before any decoding is attempted.  Returns NULL if that lookup
   fails; otherwise the decoder reference is handed on to
   _PyCodec_DecodeInternal(). */
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *text_decoder = _PyCodec_TextDecoder(encoding);

    if (text_decoder != NULL)
        return _PyCodec_DecodeInternal(object, text_decoder,
                                       encoding, errors);
    return NULL;
}

/* Register the error handling callback function error under the name
name. This function will be called by the codec when it encounters
an unencodable characters/undecodable bytes and doesn't know the
Expand Down

0 comments on commit 94ee389

Please sign in to comment.