Skip to content

Commit 7b7b93b

Browse files
guywithface authored and
tcalmant committed
added support for Java's modified UTF-8
1 parent 5748151 commit 7b7b93b

File tree

2 files changed

+175
-1
lines changed

2 files changed

+175
-1
lines changed

javaobj.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
import struct
4141
import sys
4242

43+
from modifiedutf8 import decode_modified_utf8
44+
4345
try:
4446
# Python 2
4547
from StringIO import StringIO as BytesIO
@@ -111,7 +113,10 @@ def to_str(data, encoding="UTF-8"):
111113
if type(data) is str:
112114
# Nothing to do
113115
return data
114-
return str(data, encoding)
116+
try:
117+
return str(data, encoding)
118+
except UnicodeDecodeError:
119+
return decode_modified_utf8(data)[0]
115120

116121
def read_to_str(data):
117122
"""

modifiedutf8.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
# Migrated from
2+
# https://github.com/swstephe/py2jdbc/blob/master/py2jdbc/mutf8.py
3+
4+
# Codec name reported in every UnicodeDecodeError raised by this module.
NAME = "mutf8"


class DecodeMap(object):
    """
    A utility class which manages masking, comparing and mapping in bits.

    Each instance describes one byte of a multi-byte sequence: which bits
    identify the byte (``mask`` / ``value``) and how many low-order bits of
    it carry payload (``bits``).
    If the mask and compare fails, this will raise UnicodeDecodeError so
    callers can handle bad characters the usual way.
    """

    def __init__(self, count, mask, value, bits):
        """
        Initialize a DecodeMap, entry from a static dictionary for the module.
        It automatically calculates the mask for the payload bits (always
        assumed to be at the bottom of the byte).

        :param count: The number of bytes in this entire sequence.
        :param mask: The mask to apply to the byte at this position.
        :param value: The value of masked bits, (without shifting).
        :param bits: The number of payload bits carried by the byte.
        """
        self.count = count
        self.mask = mask
        self.value = value
        self.bits = bits
        # Mask selecting the ``bits`` low-order payload bits of the byte
        self.mask2 = (1 << bits) - 1

    def apply(self, byte, value, data, i, count):
        """
        Apply mask, compare to expected value, shift and return
        result. Eventually, this could become a `reduce` function.

        :param byte: The byte to compare
        :param value: The currently accumulated value.
        :param data: The data buffer, (array of bytes); only used to build
                     the exception.
        :param i: The position within the data buffer where the sequence
                  starts.
        :param count: The position of this comparison within the sequence.
        :return: A new value with the bits merged in.
        :raises: UnicodeDecodeError if masked bits don't match.
        """
        if byte & self.mask != self.value:
            raise UnicodeDecodeError(
                NAME, data, i, i + count,
                "invalid {}-byte sequence".format(self.count)
            )

        # Make room for the payload bits, then merge them in
        return (value << self.bits) | (byte & self.mask2)

    def __repr__(self):
        """
        Returns a debug representation listing every attribute in hexadecimal
        """
        return "DecodeMap({})".format(
            ', '.join(
                '{}=0x{:02x}'.format(n, getattr(self, n))
                for n in ('count', 'mask', 'value', 'bits', 'mask2')
            )
        )
55+
56+
57+
# Rules for the bytes that follow the leading byte of a sequence, keyed by
# the total length of that sequence. Each rule is a (mask, value, bits)
# tuple: the byte is accepted when ``byte & mask == value`` and contributes
# its ``bits`` low-order bits to the decoded value.
DECODER_MAP = {
    # 110xxxxx 10xxxxxx
    2: (
        (0xc0, 0x80, 6),
    ),
    # 1110xxxx 10xxxxxx 10xxxxxx
    3: (
        (0xc0, 0x80, 6),
        (0xc0, 0x80, 6),
    ),
    # Five bytes read after a leading 0xED; the (0xff, 0xed, 0) rule only
    # checks for a second 0xED and contributes no bits.
    6: (
        (0xf0, 0xa0, 4),
        (0xc0, 0x80, 6),
        (0xff, 0xed, 0),
        (0xf0, 0xb0, 4),
        (0xc0, 0x80, 6),
    ),
}

# Same table with every rule turned into a ready-to-use DecodeMap object
DECODE_MAP = {
    length: tuple(DecodeMap(length, *rule) for rule in rules)
    for length, rules in DECODER_MAP.items()
}
79+
80+
81+
def decoder(data):
    """
    This generator processes a sequence of bytes in Modified UTF-8 encoding
    and produces a sequence of unicode string characters. It takes bits from
    the bytes until it matches one of the known encoding sequences.
    It uses `DecodeMap` to mask, compare and generate values.

    Supported sequences:

    * ``0xxxxxxx``: 1 byte (a raw zero byte is rejected)
    * ``110xxxxx 10xxxxxx``: 2 bytes
    * ``1110xxxx 10xxxxxx 10xxxxxx``: 3 bytes
    * ``0xED 0xAx 10xxxxxx 0xED 0xBx 10xxxxxx``: surrogate pair, 6 bytes,
      decoded to a single character above U+FFFF

    :param data: a string of bytes in Modified UTF-8 encoding.
    :return: a generator producing a string of unicode characters
    :raises: `UnicodeDecodeError` if unrecognized byte in sequence is
             encountered.
    """
    # Kept local so that the error paths of this function are self-contained
    codec_name = "mutf8"

    def next_byte(_it, start, count):
        # Reads the next byte of the sequence which started at ``start``;
        # ``count`` is the position of that byte inside the sequence
        try:
            return next(_it)[1]
        except StopIteration:
            raise UnicodeDecodeError(
                codec_name, data, start, start + count,
                "incomplete byte sequence"
            )

    # Iterating a bytearray yields integers on both Python 2 and Python 3
    it = iter(enumerate(bytearray(data)))
    for i, d in it:
        if d == 0x00:  # 00000000
            raise UnicodeDecodeError(
                codec_name, data, i, i + 1,
                "embedded zero-byte not allowed"
            )
        elif not d & 0x80:  # 0xxxxxxx
            value = d
        elif not d & 0x40:  # 10xxxxxx
            raise UnicodeDecodeError(
                codec_name, data, i, i + 1,
                "misplaced continuation character"
            )
        elif not d & 0x20:  # 110xxxxx
            value = d & 0x1f
            for i1, dm in enumerate(DECODE_MAP[2], 1):
                d1 = next_byte(it, i, i1)
                value = dm.apply(d1, value, data, i, i1)
        elif d & 0x10:  # 1111xxxx
            raise UnicodeDecodeError(
                codec_name, data, i, i + 1,
                "invalid encoding character"
            )
        else:  # 1110xxxx
            d1 = next_byte(it, i, 1)
            if d == 0xed and d1 & 0xe0 == 0xa0:
                # 0xED followed by 0xA0..0xBF is in the surrogate range:
                # decode the pair as one 6-byte sequence. The 20 decoded
                # bits are an offset from the first supplementary code point.
                maps = DECODE_MAP[6]
                value = 0
                base = 0x10000
            else:
                # Any other 3-byte sequence, including those starting with
                # 0xED that encode U+D000..U+D7FF
                maps = DECODE_MAP[3]
                value = d & 0x0f
                base = 0

            # The second byte has already been read to select the mapping
            value = maps[0].apply(d1, value, data, i, 1)
            for i1, dm in enumerate(maps[1:], 2):
                d1 = next_byte(it, i, i1)
                value = dm.apply(d1, value, data, i, i1)
            value += base

        # noinspection PyCompatibility
        yield mutf8_unichr(value)
139+
140+
141+
def decode_modified_utf8(data, errors='strict'):
    """
    Decodes a sequence of bytes to a unicode text and length using
    Modified UTF-8.
    This function is designed to be used with Python `codecs` module.

    With ``errors='ignore'`` or ``errors='replace'``, decoding resumes right
    after the invalid bytes instead of stopping at the first error.

    :param data: a string of bytes in Modified UTF-8
    :param errors: handle decoding errors: 'strict' re-raises the error,
                   'replace' inserts U+FFFD, any other value (e.g. 'ignore')
                   skips the invalid bytes
    :return: unicode text and length (number of characters in the text)
    :raises: `UnicodeDecodeError` if sequence is invalid.
    """
    chars = []
    offset = 0
    while True:
        try:
            # A generator is finished once it has raised, so a new one is
            # started after each error on the remaining bytes
            for char in decoder(data[offset:]):
                chars.append(char)
            break
        except UnicodeDecodeError as ex:
            if errors == 'strict':
                # Bare raise keeps the original traceback
                raise
            elif errors == 'replace':
                chars.append(u'\uFFFD')

            # ex.end is relative to the slice which was being decoded;
            # always move forward by at least one byte
            offset += max(ex.end, 1)

    return u''.join(chars), len(chars)
167+
168+
try:
    # Python 2: chr() only handles 0..255 and returns a byte string
    _unichr = unichr
except NameError:
    # Python 3: chr() already returns a unicode character
    _unichr = chr


def mutf8_unichr(value):
    """
    Converts a code point to the matching unicode character

    :param value: A unicode code point (integer)
    :return: The unicode character with that code point
    """
    return _unichr(value)

0 commit comments

Comments
 (0)