Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix non-ASCII support for prettytoml #3176

Merged
merged 7 commits into from
Nov 7, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions news/2737.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Handle non-ASCII characters correctly in TOML.
29 changes: 25 additions & 4 deletions pipenv/patched/prettytoml/tokens/py2toml.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"""
A converter of python values to TOML Token instances.
"""
from __future__ import unicode_literals
import codecs
import datetime
import six
Expand Down Expand Up @@ -81,10 +82,30 @@ def create_string_token(text, bare_string_allowed=False, multiline_strings_allow


def _escape_single_line_quoted_string(text):
if six.PY2:
return text.encode('unicode-escape').encode('string-escape').replace('"', '\\"').replace("\\'", "'")
else:
return codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
start = 0
i = 0
res = []
_escapes = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
'\b': '\\b', '\f': '\\f', '"': '\\"'}

def flush():
if start < i:
res.append(text[start:i])
return i + 1

while i < len(text):
c = text[i]
if c in _escapes:
start = flush()
res.append(_escapes[c])
elif ord(c) < 0x20:
start = flush()
res.append('\\u%04x' % ord(c))
i += 1

flush()
return ''.join(res)


def _create_multiline_string_token(text):
Expand Down
71 changes: 35 additions & 36 deletions pipenv/patched/prettytoml/tokens/toml2py.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from __future__ import unicode_literals
import re
import string
import iso8601
Expand All @@ -19,7 +20,7 @@ def deserialize(token):

Raises DeserializationError when appropriate.
"""

if token.type == TYPE_BOOLEAN:
return _to_boolean(token)
elif token.type == TYPE_INTEGER:
Expand All @@ -39,42 +40,40 @@ def _unescape_str(text):
"""
Unescapes a string according the TOML spec. Raises BadEscapeCharacter when appropriate.
"""

# Detect bad escape jobs
bad_escape_regexp = re.compile(r'([^\\]|^)\\[^btnfr"\\uU]')
if bad_escape_regexp.findall(text):
raise BadEscapeCharacter

# Do the unescaping
if six.PY2:
return _unicode_escaped_string(text).decode('string-escape').decode('unicode-escape')
else:
return codecs.decode(_unicode_escaped_string(text), 'unicode-escape')


def _unicode_escaped_string(text):
"""
Escapes all unicode characters in the given string
"""

if six.PY2:
text = unicode(text)

def is_unicode(c):
return c.lower() not in string.ascii_letters + string.whitespace + string.punctuation + string.digits

def escape_unicode_char(x):
if six.PY2:
return x.encode('unicode-escape')
text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
tokens = []
i = 0
basicstr_re = re.compile(r'[^"\\\000-\037]*')
unicode_re = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
escapes = {
'b': '\b',
't': '\t',
'n': '\n',
'f': '\f',
'r': '\r',
'\\': '\\',
'"': '"',
'/': '/',
"'": "'"
}
while True:
m = basicstr_re.match(text, i)
i = m.end()
tokens.append(m.group())
if i == len(text) or text[i] != '\\':
break
else:
return codecs.encode(x, 'unicode-escape')

if any(is_unicode(c) for c in text):
homogeneous_chars = tuple(escape_unicode_char(c) if is_unicode(c) else c.encode() for c in text)
homogeneous_bytes = functools.reduce(operator.add, homogeneous_chars)
return homogeneous_bytes.decode()
else:
return text
i += 1
if unicode_re.match(text, i):
m = unicode_re.match(text, i)
i = m.end()
tokens.append(six.unichr(int(m.group(1), 16)))
else:
if text[i] not in escapes:
raise BadEscapeCharacter
tokens.append(escapes[text[i]])
i += 1
return ''.join(tokens)


def _to_string(token):
Expand Down
132 changes: 132 additions & 0 deletions tasks/vendoring/patches/patched/prettytoml-unicode.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
diff --git a/pipenv/patched/prettytoml/tokens/py2toml.py b/pipenv/patched/prettytoml/tokens/py2toml.py
index 8299195..2decd02 100644
--- a/pipenv/patched/prettytoml/tokens/py2toml.py
+++ b/pipenv/patched/prettytoml/tokens/py2toml.py
@@ -2,6 +2,7 @@
"""
A converter of python values to TOML Token instances.
"""
+from __future__ import unicode_literals
import codecs
import datetime
import six
@@ -81,10 +82,30 @@ def create_string_token(text, bare_string_allowed=False, multiline_strings_allow


def _escape_single_line_quoted_string(text):
- if six.PY2:
- return text.encode('unicode-escape').encode('string-escape').replace('"', '\\"').replace("\\'", "'")
- else:
- return codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
+ text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+ start = 0
+ i = 0
+ res = []
+ _escapes = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
+ '\b': '\\b', '\f': '\\f', '"': '\\"'}
+
+ def flush():
+ if start < i:
+ res.append(text[start:i])
+ return i + 1
+
+ while i < len(text):
+ c = text[i]
+ if c in _escapes:
+ start = flush()
+ res.append(_escapes[c])
+ elif ord(c) < 0x20:
+ start = flush()
+ res.append('\\u%04x' % ord(c))
+ i += 1
+
+ flush()
+ return ''.join(res)


def _create_multiline_string_token(text):
diff --git a/pipenv/patched/prettytoml/tokens/toml2py.py b/pipenv/patched/prettytoml/tokens/toml2py.py
index 2bf9c1c..5680443 100644
--- a/pipenv/patched/prettytoml/tokens/toml2py.py
+++ b/pipenv/patched/prettytoml/tokens/toml2py.py
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
import re
import string
import iso8601
@@ -39,42 +40,40 @@ def _unescape_str(text):
"""
Unescapes a string according the TOML spec. Raises BadEscapeCharacter when appropriate.
"""
-
- # Detect bad escape jobs
- bad_escape_regexp = re.compile(r'([^\\]|^)\\[^btnfr"\\uU]')
- if bad_escape_regexp.findall(text):
- raise BadEscapeCharacter
-
- # Do the unescaping
- if six.PY2:
- return _unicode_escaped_string(text).decode('string-escape').decode('unicode-escape')
- else:
- return codecs.decode(_unicode_escaped_string(text), 'unicode-escape')
-
-
-def _unicode_escaped_string(text):
- """
- Escapes all unicode characters in the given string
- """
-
- if six.PY2:
- text = unicode(text)
-
- def is_unicode(c):
- return c.lower() not in string.ascii_letters + string.whitespace + string.punctuation + string.digits
-
- def escape_unicode_char(x):
- if six.PY2:
- return x.encode('unicode-escape')
+ text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+ tokens = []
+ i = 0
+ basicstr_re = re.compile(r'[^"\\\000-\037]*')
+ unicode_re = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
+ escapes = {
+ 'b': '\b',
+ 't': '\t',
+ 'n': '\n',
+ 'f': '\f',
+ 'r': '\r',
+ '\\': '\\',
+ '"': '"',
+ '/': '/',
+ "'": "'"
+ }
+ while True:
+ m = basicstr_re.match(text, i)
+ i = m.end()
+ tokens.append(m.group())
+ if i == len(text) or text[i] != '\\':
+ break
else:
- return codecs.encode(x, 'unicode-escape')
-
- if any(is_unicode(c) for c in text):
- homogeneous_chars = tuple(escape_unicode_char(c) if is_unicode(c) else c.encode() for c in text)
- homogeneous_bytes = functools.reduce(operator.add, homogeneous_chars)
- return homogeneous_bytes.decode()
- else:
- return text
+ i += 1
+ if unicode_re.match(text, i):
+ m = unicode_re.match(text, i)
+ i = m.end()
+ tokens.append(six.unichr(int(m.group(1), 16)))
+ else:
+ if text[i] not in escapes:
+ raise BadEscapeCharacter
+ tokens.append(escapes[text[i]])
+ i += 1
+ return ''.join(tokens)


def _to_string(token):
8 changes: 8 additions & 0 deletions tests/unit/test_vendor.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
# We need to import the patched packages directly from sys.path, so the
# identity checks can pass.
import pipenv # noqa
Expand All @@ -8,6 +9,7 @@
import pytest
import pytz

import contoml
from pipfile.api import PipfileParser
from prettytoml import lexer, tokens
from prettytoml.elements.atomic import AtomicElement
Expand Down Expand Up @@ -104,3 +106,9 @@ def test_inject_environment_variables(self):
def test_token_date(dt, content):
    # Serializing *dt* must produce exactly the expected DATE token.
    expected = tokens.Token(tokens.TYPE_DATE, content)
    assert create_primitive_token(dt) == expected


def test_dump_nonascii_string():
    # Round-tripping TOML that contains non-ASCII text must be lossless.
    source = 'name = "Stažené"\n'
    assert contoml.dumps(contoml.loads(source)) == source