Skip to content

Commit

Permalink
Add object key interning support (PY3 only)
Browse files Browse the repository at this point in the history
- Includes other minor updates
  • Loading branch information
vtermanis committed Sep 13, 2017
1 parent eaff4fe commit 62c217d
Show file tree
Hide file tree
Showing 8 changed files with 91 additions and 50 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
UNRELEASED
- Support No-Op type (decoder only)
- Allow for object keys to be interned, saving memory if repeated (PY3 only)
- Use PyUnicode_FromStringAndSize instead of PyUnicode_DecodeUTF8 (decoder)
- Open file for writing, not appending (to/from json utility)
- Use more compact json encoding (to/from json utility)

Expand Down
4 changes: 2 additions & 2 deletions pylint.rc
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ confidence=
# W0511 - ignore fixme/todo for report
# I0011 - locally-disabled pylint options
# C0111 - missing-docstring
disable=R0903,W0511,I0011,C0111
disable=R0903,W0511,I0011,C0111,no-else-return

[REPORTS]

Expand Down Expand Up @@ -321,7 +321,7 @@ max-locals=15
max-returns=6

# Maximum number of branch for function / method body
max-branches=12
max-branches=13

# Maximum number of statements in function / method body
max-statements=50
Expand Down
15 changes: 8 additions & 7 deletions src/_ubjson.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
static _ubjson_encoder_prefs_t _ubjson_encoder_prefs_defaults = { 0, 0, 1 };

// object_pairs_hook, no_bytes, intern_object_keys
static _ubjson_decoder_prefs_t _ubjson_decoder_prefs_defaults = { NULL, 0 };
static _ubjson_decoder_prefs_t _ubjson_decoder_prefs_defaults = { NULL, 0, 0 };

/******************************************************************************/

Expand Down Expand Up @@ -98,8 +98,8 @@ PyDoc_STRVAR(_ubjson_load__doc__, "See pure Python version (encoder.load) for do
#define FUNC_DEF_LOAD {"load", (PyCFunction)_ubjson_load, METH_VARARGS | METH_KEYWORDS, _ubjson_load__doc__}
static PyObject*
_ubjson_load(PyObject *self, PyObject *args, PyObject *kwargs) {
static const char *format = "O|iO:load";
static char *keywords[] = {"fp", "no_bytes", "object_pairs_hook", NULL};
static const char *format = "O|iOi:load";
static char *keywords[] = {"fp", "no_bytes", "object_pairs_hook", "intern_object_keys", NULL};

_ubjson_decoder_buffer_t *buffer = NULL;
_ubjson_decoder_prefs_t prefs = _ubjson_decoder_prefs_defaults;
Expand All @@ -108,7 +108,8 @@ _ubjson_load(PyObject *self, PyObject *args, PyObject *kwargs) {
PyObject *obj = NULL;
UNUSED(self);

if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, keywords, &fp, &prefs.no_bytes, &prefs.object_pairs_hook)) {
if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, keywords, &fp, &prefs.no_bytes, &prefs.object_pairs_hook,
&prefs.intern_object_keys)) {
goto bail;
}
BAIL_ON_NULL(fp_read = PyObject_GetAttrString(fp, "read"));
Expand All @@ -135,8 +136,8 @@ PyDoc_STRVAR(_ubjson_loadb__doc__, "See pure Python version (encoder.loadb) for
#define FUNC_DEF_LOADB {"loadb", (PyCFunction)_ubjson_loadb, METH_VARARGS | METH_KEYWORDS, _ubjson_loadb__doc__}
static PyObject*
_ubjson_loadb(PyObject *self, PyObject *args, PyObject *kwargs) {
static const char *format = "O|iO:loadb";
static char *keywords[] = {"chars", "no_bytes", "object_pairs_hook", NULL};
static const char *format = "O|iOi:loadb";
static char *keywords[] = {"chars", "no_bytes", "object_pairs_hook", "intern_object_keys", NULL};

_ubjson_decoder_buffer_t *buffer = NULL;
_ubjson_decoder_prefs_t prefs = _ubjson_decoder_prefs_defaults;
Expand All @@ -145,7 +146,7 @@ _ubjson_loadb(PyObject *self, PyObject *args, PyObject *kwargs) {
UNUSED(self);

if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, keywords, &chars, &prefs.no_bytes,
&prefs.object_pairs_hook)) {
&prefs.object_pairs_hook, &prefs.intern_object_keys)) {
goto bail;
}
if (PyUnicode_Check(chars)) {
Expand Down
32 changes: 22 additions & 10 deletions src/decoder.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
}

#define DECODE_UNICODE_OR_BAIL(dst_obj, raw, length, item_str) {\
if (NULL == ((dst_obj) = PyUnicode_DecodeUTF8(raw, length, NULL))) {\
if (NULL == ((dst_obj) = PyUnicode_FromStringAndSize(raw, length))) {\
RAISE_DECODER_EXCEPTION(("Failed to decode utf8: " item_str));\
}\
}\
Expand Down Expand Up @@ -665,22 +665,32 @@ static PyObject* _decode_array(_ubjson_decoder_buffer_t *buffer) {
}

// same as string, except there is no 'S' marker
static PyObject* _decode_object_key(_ubjson_decoder_buffer_t *buffer, char marker) {
static PyObject* _decode_object_key(_ubjson_decoder_buffer_t *buffer, char marker, int intern) {
long long length;
const char *raw;
PyObject *key;

DECODE_LENGTH_OR_BAIL_MARKER(length, marker);
READ_OR_BAIL((Py_ssize_t)length, raw, "string");

return PyUnicode_DecodeUTF8(raw, (Py_ssize_t)length, NULL);
BAIL_ON_NULL(key = PyUnicode_FromStringAndSize(raw, (Py_ssize_t)length));
// unicode string interning not supported in v2
#if PY_MAJOR_VERSION < 3
UNUSED(intern);
#else
if (intern) {
PyUnicode_InternInPlace(&key);
}
#endif
return key;

bail:
return NULL;
}

// used by _decode_object* functions
#define DECODE_OBJECT_KEY_OR_RAISE_ENCODER_EXCEPTION(context_str) {\
key = _decode_object_key(buffer, marker);\
#define DECODE_OBJECT_KEY_OR_RAISE_ENCODER_EXCEPTION(context_str, intern) {\
key = _decode_object_key(buffer, marker, intern);\
if (NULL == key) {\
RAISE_DECODER_EXCEPTION("Failed to decode object key (" context_str ")");\
}\
Expand All @@ -695,6 +705,7 @@ static PyObject* _decode_object_with_hook(_ubjson_decoder_buffer_t *buffer) {
PyObject *item = NULL;
char *fixed_type;
char marker;
int intern = buffer->prefs.intern_object_keys;

if (params.invalid) {
goto bail;
Expand All @@ -713,7 +724,7 @@ static PyObject* _decode_object_with_hook(_ubjson_decoder_buffer_t *buffer) {
Py_INCREF(value);

while (params.count > 0) {
DECODE_OBJECT_KEY_OR_RAISE_ENCODER_EXCEPTION("sized, no data");
DECODE_OBJECT_KEY_OR_RAISE_ENCODER_EXCEPTION("sized, no data", intern);
BAIL_ON_NULL(item = PyTuple_Pack(2, key, value));
Py_CLEAR(key);
PyList_SET_ITEM(list, list_pos++, item);
Expand All @@ -733,7 +744,7 @@ static PyObject* _decode_object_with_hook(_ubjson_decoder_buffer_t *buffer) {
READ_CHAR_OR_BAIL(marker, "object key length (sized, after no-op)");
continue;
}
DECODE_OBJECT_KEY_OR_RAISE_ENCODER_EXCEPTION("sized");
DECODE_OBJECT_KEY_OR_RAISE_ENCODER_EXCEPTION("sized", intern);
BAIL_ON_NULL(value = _ubjson_decode_value(buffer, fixed_type));
BAIL_ON_NULL(item = PyTuple_Pack(2, key, value));
Py_CLEAR(key);
Expand All @@ -757,7 +768,7 @@ static PyObject* _decode_object_with_hook(_ubjson_decoder_buffer_t *buffer) {
READ_CHAR_OR_BAIL(marker, "object key length (after no-op)");
continue;
}
DECODE_OBJECT_KEY_OR_RAISE_ENCODER_EXCEPTION("unsized");
DECODE_OBJECT_KEY_OR_RAISE_ENCODER_EXCEPTION("unsized", intern);
BAIL_ON_NULL(value = _ubjson_decode_value(buffer, fixed_type));
BAIL_ON_NULL(item = PyTuple_Pack(2, key, value));
Py_CLEAR(key);
Expand Down Expand Up @@ -789,6 +800,7 @@ static PyObject* _decode_object(_ubjson_decoder_buffer_t *buffer) {
PyObject *value = NULL;
char *fixed_type;
char marker;
int intern = buffer->prefs.intern_object_keys;

if (params.invalid) {
goto bail;
Expand All @@ -802,7 +814,7 @@ static PyObject* _decode_object(_ubjson_decoder_buffer_t *buffer) {
value = _no_data_type(params.type);

while (params.count > 0) {
DECODE_OBJECT_KEY_OR_RAISE_ENCODER_EXCEPTION("sized, no data");
DECODE_OBJECT_KEY_OR_RAISE_ENCODER_EXCEPTION("sized, no data", intern);
BAIL_ON_NONZERO(PyDict_SetItem(object, key, value));
// reference stolen in above call, but only for value!
Py_CLEAR(key);
Expand All @@ -821,7 +833,7 @@ static PyObject* _decode_object(_ubjson_decoder_buffer_t *buffer) {
READ_CHAR_OR_BAIL(marker, "object key length");
continue;
}
DECODE_OBJECT_KEY_OR_RAISE_ENCODER_EXCEPTION("sized/unsized");
DECODE_OBJECT_KEY_OR_RAISE_ENCODER_EXCEPTION("sized/unsized", intern);
BAIL_ON_NULL(value = _ubjson_decode_value(buffer, fixed_type));
BAIL_ON_NONZERO(PyDict_SetItem(object, key, value));
Py_CLEAR(key);
Expand Down
2 changes: 2 additions & 0 deletions src/decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ extern "C" {

typedef struct {
PyObject *object_pairs_hook;
// don't convert UINT8 arrays to bytes instances (and keep as an array of individual integers)
int no_bytes;
int intern_object_keys;
} _ubjson_decoder_prefs_t;

typedef struct {
Expand Down
40 changes: 27 additions & 13 deletions test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,10 @@ def test_no_data(self):
with self.assertRaises(DecoderException):
self.ubjloadb(b'')

def test_invalid_data(self):
with self.assertRaises(TypeError):
self.ubjloadb(123)

def test_trailing_input(self):
self.assertEqual(self.ubjloadb(TYPE_BOOL_TRUE * 10), True)

Expand Down Expand Up @@ -379,15 +383,26 @@ def test_object_noop(self):
# only supported without type
for hook in (None, OrderedDict):
loadb = partial(self.ubjloadb, object_pairs_hook=hook)
self.assertEqual(ubjloadb(OBJECT_START +
TYPE_NOOP +
TYPE_UINT8 + b'\x01' + 'a'.encode('utf-8') + TYPE_NULL +
TYPE_NOOP +
TYPE_UINT8 + b'\x01' + 'b'.encode('utf-8') + TYPE_BOOL_TRUE +
OBJECT_END), {'a': None, 'b': True})
self.assertEqual(ubjloadb(OBJECT_START + CONTAINER_COUNT + TYPE_UINT8 + b'\x01' +
TYPE_NOOP +
TYPE_UINT8 + b'\x01' + 'a'.encode('utf-8') + TYPE_NULL), {'a': None})
self.assertEqual(loadb(OBJECT_START +
TYPE_NOOP +
TYPE_UINT8 + b'\x01' + 'a'.encode('utf-8') + TYPE_NULL +
TYPE_NOOP +
TYPE_UINT8 + b'\x01' + 'b'.encode('utf-8') + TYPE_BOOL_TRUE +
OBJECT_END), {'a': None, 'b': True})
self.assertEqual(loadb(OBJECT_START + CONTAINER_COUNT + TYPE_UINT8 + b'\x01' +
TYPE_NOOP +
TYPE_UINT8 + b'\x01' + 'a'.encode('utf-8') + TYPE_NULL), {'a': None})

def test_intern_object_keys(self):
encoded = self.ubjdumpb({'asdasd': 1, 'qwdwqd': 2})
mapping2 = self.ubjloadb(encoded, intern_object_keys=True)
mapping3 = self.ubjloadb(encoded, intern_object_keys=True)
for key1, key2 in zip(sorted(mapping2.keys()), sorted(mapping3.keys())):
if PY2: # pragma: no cover
# interning of unicode not supported
self.assertEqual(key1, key2)
else:
self.assertIs(key1, key2)

def test_circular(self):
sequence = [1, 2, 3]
Expand Down Expand Up @@ -444,11 +459,10 @@ class TestEncodeDecodeFp(TestEncodeDecodePlain):
def ubjloadb(raw, *args, **kwargs):
try:
raw = BytesIO(raw)
except: # pylint: disable=bare-except
except TypeError: # pylint: disable=bare-except
# Invalid raw input testing
pass
else:
return ubjload(raw, *args, **kwargs)
raise
return ubjload(raw, *args, **kwargs)

@staticmethod
def ubjdumpb(obj, *args, **kwargs):
Expand Down
9 changes: 7 additions & 2 deletions ubjson/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

"""Python v2.7 (NOT 2.6) compatibility"""

# pylint: disable=unused-import,invalid-name,redefined-variable-type,wrong-import-position,no-name-in-module
# pylint: disable=unused-import,invalid-name,wrong-import-position,no-name-in-module
# pylint: disable=import-error
# pragma: no cover

Expand All @@ -45,7 +45,7 @@
PY2 = (version_info[0] == 2)

if PY2:
# pylint:disable=undefined-variable
# pylint: disable=undefined-variable
INTEGER_TYPES = (int, long) # noqa
UNICODE_TYPE = unicode # noqa
TEXT_TYPES = (str, unicode) # noqa
Expand All @@ -55,6 +55,10 @@
STDOUT_RAW = stdout
STDERR_RAW = stderr

# Interning applies to str, not unicode
def intern_unicode(obj):
return obj

else:
INTEGER_TYPES = (int,)
UNICODE_TYPE = str
Expand All @@ -64,6 +68,7 @@
STDIN_RAW = getattr(stdin, 'buffer', stdin)
STDOUT_RAW = getattr(stdout, 'buffer', stdout)
STDERR_RAW = getattr(stderr, 'buffer', stderr)
from sys import intern as intern_unicode # noqa

try:
# introduced in v3.3
Expand Down
Loading

0 comments on commit 62c217d

Please sign in to comment.