
Commit f476ec3

trotterdylan authored and whitequark committed
Detect source encoding to properly interpret string literals
Buffers now always hold unicode source, whereas before they could hold bytes if that's what was passed to the constructor. This is possible because we determine the encoding and then use it to decode() the bytes. The side effect is that Buffer.__init__ can now raise UnicodeDecodeError if the input is badly encoded.

The Buffer's encoding is then used by the lexer to produce a strdata token of the correct type for string literals. For unicode literals, escaping happens much as before, via _replace_escape(). For bytes literals, a different code path calls encode() with the Buffer's encoding, followed by a special escaping function that ensures the value is not accidentally promoted to unicode.

The parser's behavior for multi-part string literals (e.g. "foo" "bar") also had to change: when any of the literals is unicode, the result is unicode; when all of the literals are bytes, the resulting value is also bytes.
1 parent cd2f7ac commit f476ec3
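To make the new contract concrete, here is a minimal sketch of Buffer's behavior after this commit, using the koi8-r fixture from the new tests (import path assumed to be the published pythonparser package):

    from pythonparser import source

    # Bytes input: the coding comment is detected and the source decoded.
    buf = source.Buffer(b"# coding: koi8-r\n'\xc3\xa7'")
    assert buf.encoding == "koi8-r"
    assert isinstance(buf.source, type(u""))  # buffers now always hold unicode

    # Badly encoded input now fails up front, in Buffer.__init__.
    try:
        source.Buffer(b"# coding: ascii\n'\xc3\xa7'")
    except UnicodeDecodeError:
        pass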

File tree

7 files changed: +250 -63 lines


pythonparser/lexer.py

Lines changed: 60 additions & 18 deletions
@@ -10,6 +10,9 @@
 
 if sys.version_info[0] == 3:
     unichr = chr
+    byte = lambda x: bytes([x])
+else:
+    byte = chr
 
 class Token:
     """
@@ -105,6 +108,7 @@ def __init__(self, source_buffer, version, diagnostic_engine, interactive=False)
         self.diagnostic_engine = diagnostic_engine
         self.interactive = interactive
         self.print_function = False
+        self.unicode_literals = self.version >= (3, 0)
 
         self.offset = 0
         self.new_line = True
@@ -184,24 +188,23 @@ def __init__(self, source_buffer, version, diagnostic_engine, interactive=False)
                 id_xid=id_xid), re.VERBOSE|re.UNICODE)
 
     # These are identical for all lexer instances.
-    _lex_escape_re = re.compile(r"""
+    _lex_escape_pattern = r"""
     \\(?:
         ([\n\\'"abfnrtv])    # 1 single-char
     |   ([0-7]{1,3})         # 2 oct
     |   x([0-9A-Fa-f]{2})    # 3 hex
     )
-    """, re.VERBOSE)
+    """
+    _lex_escape_re = re.compile(_lex_escape_pattern.encode(), re.VERBOSE)
 
-    _lex_escape_unicode_re = re.compile(_lex_escape_re.pattern + r"""
+    _lex_escape_unicode_re = re.compile(_lex_escape_pattern + r"""
     | \\(?:
         u([0-9A-Fa-f]{4})    # 4 unicode-16
     |   U([0-9A-Fa-f]{8})    # 5 unicode-32
     |   N\{(.+?)\}           # 6 unicode-name
     )
     """, re.VERBOSE)
 
-    _lex_check_byte_re = re.compile("[^\x00-\x7f]")
-
     def next(self, eof_token=False):
         """
         Returns token at ``offset`` as a :class:`Token` and advances ``offset``
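Compiling the shared pattern twice, once encode()d to bytes, matters because a str-compiled pattern cannot search bytes on Python 3. A quick illustration with the stdlib re module (the lexer itself uses the third-party regex package, but the behavior is the same):

    import re

    pattern = r"\\x([0-9A-Fa-f]{2})"
    str_re = re.compile(pattern)
    bytes_re = re.compile(pattern.encode())

    assert str_re.search("a\\x41") is not None
    assert bytes_re.search(b"a\\x41") is not None
    # Mixing them fails on Python 3:
    # str_re.search(b"a\\x41")  -> TypeError: cannot use a string pattern
    #                              on a bytes-like object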
@@ -419,27 +422,24 @@ def _string_literal(self, options, begin_span, data, data_span, end_span):
 
     def _replace_escape(self, range, mode, value):
         is_raw = ("r" in mode)
-        is_byte = ("b" in mode)
-        is_unicode = ("u" in mode)
+        is_unicode = "u" in mode or ("b" not in mode and self.unicode_literals)
+
+        if not is_unicode:
+            value = value.encode(self.source_buffer.encoding)
+            if is_raw:
+                return value
+            return self._replace_escape_bytes(value)
 
         if is_raw:
             return value
 
-        if is_byte and self._lex_check_byte_re.match(value):
-            error = diagnostic.Diagnostic(
-                "error", "non-7-bit character in a byte literal", {},
-                tok_range)
-            self.diagnostic_engine.process(error)
-
-        if is_unicode or self.version >= (3, 0):
-            re = self._lex_escape_unicode_re
-        else:
-            re = self._lex_escape_re
+        return self._replace_escape_unicode(range, value)
 
+    def _replace_escape_unicode(self, range, value):
         chunks = []
         offset = 0
         while offset < len(value):
-            match = re.search(value, offset)
+            match = self._lex_escape_unicode_re.search(value, offset)
             if match is None:
                 # Append the remaining of the string
                 chunks.append(value[offset:])
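The dispatch now hinges on only two bits of state: the literal's prefix and the unicode_literals flag. Distilled into a hypothetical standalone function (not the module's API, same decision table):

    def literal_kind(mode, unicode_literals):
        # Mirrors _replace_escape: which path handles this literal?
        is_unicode = "u" in mode or ("b" not in mode and unicode_literals)
        if not is_unicode:
            # encode() to the buffer's encoding, then byte-wise escaping
            return "bytes path"
        return "unicode path"

    assert literal_kind("b", unicode_literals=True) == "bytes path"
    assert literal_kind("", unicode_literals=False) == "bytes path"   # 2.x plain literal
    assert literal_kind("", unicode_literals=True) == "unicode path"  # 3.x plain literal
    assert literal_kind("u", unicode_literals=False) == "unicode path"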
@@ -499,6 +499,48 @@ def _replace_escape(self, range, mode, value):
 
         return "".join(chunks)
 
+    def _replace_escape_bytes(self, value):
+        chunks = []
+        offset = 0
+        while offset < len(value):
+            match = self._lex_escape_re.search(value, offset)
+            if match is None:
+                # Append the remaining of the string
+                chunks.append(value[offset:])
+                break
+
+            # Append the part of string before match
+            chunks.append(value[offset:match.start()])
+            offset = match.end()
+
+            # Process the escape
+            if match.group(1) is not None: # single-char
+                chr = match.group(1)
+                if chr == b"\n":
+                    pass
+                elif chr == b"\\" or chr == b"'" or chr == b"\"":
+                    chunks.append(chr)
+                elif chr == b"a":
+                    chunks.append(b"\a")
+                elif chr == b"b":
+                    chunks.append(b"\b")
+                elif chr == b"f":
+                    chunks.append(b"\f")
+                elif chr == b"n":
+                    chunks.append(b"\n")
+                elif chr == b"r":
+                    chunks.append(b"\r")
+                elif chr == b"t":
+                    chunks.append(b"\t")
+                elif chr == b"v":
+                    chunks.append(b"\v")
+            elif match.group(2) is not None: # oct
+                chunks.append(byte(int(match.group(2), 8)))
+            elif match.group(3) is not None: # hex
+                chunks.append(byte(int(match.group(3), 16)))
+
+        return b"".join(chunks)
+
     def _check_long_literal(self, range, literal):
         if literal[-1] in "lL" and self.version >= (3, 0):
             error = diagnostic.Diagnostic(
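End to end, the bytes path collapses escape sequences to single bytes without ever constructing a unicode object. Its invariants, checked standalone (byte is the module-level shim from the top of this file; note that int() accepts the bytes that match.group() yields here):

    import sys
    byte = (lambda x: bytes([x])) if sys.version_info[0] == 3 else chr

    assert byte(int(b"41", 16)) == b"A"       # the hex branch, from b"\x41"
    assert byte(int(b"101", 8)) == b"A"       # the octal branch, from b"\101"
    assert b"".join([b"a", b"\n"]) == b"a\n"  # chunks are joined as bytes, never str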

pythonparser/parser.py

Lines changed: 6 additions & 1 deletion
@@ -520,6 +520,8 @@ def _assignable(self, node, is_delete=False):
     def add_flags(self, flags):
         if "print_function" in flags:
             self.lexer.print_function = True
+        if "unicode_literals" in flags:
+            self.lexer.unicode_literals = True
 
     # Grammar
     @action(Expect(Alt(Newline(),
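For 2.x sources, a from __future__ import unicode_literals now changes how plain string literals lex. Distilled into a tiny hypothetical helper (the real flag lives on the Lexer, defaulted from the version tuple and overridden here by add_flags):

    def plain_literal_is_unicode(version, future_flags):
        # Mirrors Lexer.__init__ (version default) plus Parser.add_flags (override).
        unicode_literals = version >= (3, 0)
        if "unicode_literals" in future_flags:
            unicode_literals = True
        return unicode_literals

    assert not plain_literal_is_unicode((2, 7), set())             # 'x' is bytes
    assert plain_literal_is_unicode((2, 7), {"unicode_literals"})  # 'x' is unicode
    assert plain_literal_is_unicode((3, 0), set())                 # always unicode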
@@ -1522,7 +1524,10 @@ def atom_4(self, begin_tok, data_tok, end_tok):
 
     @action(Plus(atom_4))
     def atom_5(self, strings):
-        return ast.Str(s="".join([x.s for x in strings]),
+        joint = ""
+        if all(isinstance(x.s, bytes) for x in strings):
+            joint = b""
+        return ast.Str(s=joint.join([x.s for x in strings]),
                        begin_loc=strings[0].begin_loc, end_loc=strings[-1].end_loc,
                        loc=strings[0].loc.join(strings[-1].loc))

pythonparser/source.py

Lines changed: 29 additions & 1 deletion
@@ -7,6 +7,7 @@
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 import bisect
+import regex as re
 
 class Buffer:
     """
@@ -18,7 +19,11 @@ class Buffer:
     :ivar line: (integer) first line of the input
     """
     def __init__(self, source, name="<input>", first_line=1):
-        self.source = source
+        self.encoding = self._extract_encoding(source)
+        if isinstance(source, bytes):
+            self.source = source.decode(self.encoding)
+        else:
+            self.source = source
         self.name = name
         self.first_line = first_line
         self._line_begins = None
@@ -65,6 +70,29 @@ def _extract_line_begins(self):
                 return self._line_begins
             self._line_begins.append(index)
 
+    _encoding_re = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
+    _encoding_bytes_re = re.compile(_encoding_re.pattern.encode())
+
+    def _extract_encoding(self, source):
+        if isinstance(source, bytes):
+            re = self._encoding_bytes_re
+            nl = b"\n"
+        else:
+            re = self._encoding_re
+            nl = "\n"
+        match = re.match(source)
+        if not match:
+            index = source.find(nl)
+            if index != -1:
+                match = re.match(source[index + 1:])
+        if match:
+            encoding = match.group(1)
+            if isinstance(encoding, bytes):
+                return encoding.decode("ascii")
+            return encoding
+        return "ascii"
+
+
 class Range:
     """
     Location of an exclusive range of characters [*begin_pos*, *end_pos*)
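The detection implements the PEP 263 rule: a coding comment may appear on the first or second line, otherwise the source is treated as ASCII. A sketch of the same logic using the stdlib re module (the module itself imports regex as re, but this pattern needs nothing beyond the stdlib):

    import re

    _encoding_re = re.compile(r"^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")

    def extract_encoding(text):
        # PEP 263: only the first two lines may carry a coding comment.
        for line in text.split("\n")[:2]:
            match = _encoding_re.match(line)
            if match:
                return match.group(1)
        return "ascii"

    assert extract_encoding("# coding: koi8-r\nx = 1") == "koi8-r"
    assert extract_encoding("#!/usr/bin/env python\n# -*- coding: utf-8 -*-") == "utf-8"
    assert extract_encoding("x = 1") == "ascii"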

pythonparser/test/test_lexer.py

Lines changed: 71 additions & 41 deletions
@@ -1,9 +1,13 @@
 # coding:utf-8
 
 from __future__ import absolute_import, division, print_function, unicode_literals
+from . import test_utils
 from .. import source, lexer, diagnostic
 import unittest
 
+BytesOnly = test_utils.BytesOnly
+UnicodeOnly = test_utils.UnicodeOnly
+
 class LexerTestCase(unittest.TestCase):
 
     def assertLexesVersions(self, input, versions, *expected_tokens, **kwargs):
@@ -152,42 +156,67 @@ def test_integer_py3(self):
                                  "int", 123)
 
     def test_string_literal(self):
-        self.assertLexes("''",
-                         "strbegin", "",
-                         "strdata", "",
-                         "strend", None)
-        self.assertLexes("''''''",
-                         "strbegin", "",
-                         "strdata", "",
-                         "strend", None)
-        self.assertLexes("\"\"",
-                         "strbegin", "",
-                         "strdata", "",
-                         "strend", None)
-        self.assertLexes("\"\"\"\"\"\"",
-                         "strbegin", "",
-                         "strdata", "",
-                         "strend", None)
-
-        self.assertLexes("'x'",
-                         "strbegin", "",
-                         "strdata", "x",
-                         "strend", None)
-
-        self.assertLexes("'''\n'''",
-                         "strbegin", "",
-                         "strdata", "\n",
-                         "strend", None)
-
-        self.assertLexes("'''\n'''",
-                         "strbegin", "",
-                         "strdata", "\n",
-                         "strend", None)
-
-        self.assertLexes(r"'\0 \10 \010'",
-                         "strbegin", "",
-                         "strdata", "\x00 \x08 \x08",
-                         "strend", None)
+        for version in self.VERSIONS:
+            if version < (3,):
+                str_type = BytesOnly
+            else:
+                str_type = UnicodeOnly
+            self.assertLexesVersions("''", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type(""),
+                                     "strend", None)
+            self.assertLexesVersions("''''''", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type(""),
+                                     "strend", None)
+            self.assertLexesVersions("\"\"", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type(""),
+                                     "strend", None)
+            self.assertLexesVersions("\"\"\"\"\"\"", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type(""),
+                                     "strend", None)
+
+            self.assertLexesVersions("'x'", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type("x"),
+                                     "strend", None)
+
+            self.assertLexesVersions("'''\n'''", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type("\n"),
+                                     "strend", None)
+
+            self.assertLexesVersions("'''\n'''", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type("\n"),
+                                     "strend", None)
+
+            self.assertLexesVersions(r"'\0 \10 \010'", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type("\x00 \x08 \x08"),
+                                     "strend", None)
+
+        self.assertLexesVersions(r"b'\xc3\xa7'", [(2,7), (3,0), (3,1)],
+                                 "strbegin", "b",
+                                 "strdata", BytesOnly(b"\xc3\xa7"),
+                                 "strend", None)
+
+        self.assertLexesVersions(b"# coding: koi8-r\nb'\xc3\xa7'", [(2,7), (3,0), (3,1)],
+                                 "strbegin", "b",
+                                 "strdata", BytesOnly(b"\xc3\xa7"),
+                                 "strend", None)
+
+        self.assertLexesVersions(b"# coding: koi8-r\n'\xc3\xa7'", [(3,0), (3,1)],
+                                 "strbegin", "",
+                                 "strdata", UnicodeOnly("\u0446\u2556"),
+                                 "strend", None)
+
+        self.assertLexesVersions(b"# coding: koi8-r\nu'\xc3\xa7'", [(2,7)],
+                                 "strbegin", "u",
+                                 "strdata", UnicodeOnly("\u0446\u2556"),
+                                 "strend", None)
 
         self.assertDiagnoses(
             "'",
@@ -211,12 +240,13 @@ def test_escape_clike(self):
                 (r"\a", "\a"), (r"\b", "\b"), (r"\f", "\f"), (r"\n", "\n"),
                 (r"\r", "\r"), (r"\t", "\t"), (r"\v", "\v"),
                 (r"\x53", "S"), (r"\123", "S")]:
-            for mode in [ "", "u", "b" ]:
-                self.assertLexesEscape(mode, chr, val)
-            for mode in [ "r", "br" ]:
-                self.assertLexesEscape(mode, chr, chr)
+            self.assertLexesEscape("b", chr, BytesOnly(val))
+            self.assertLexesEscape("u", chr, UnicodeOnly(val))
+            self.assertLexesEscape("", chr, UnicodeOnly(val))
+            self.assertLexesEscape("r", chr, UnicodeOnly(chr))
+            self.assertLexesEscape("br", chr, BytesOnly(chr))
 
-        self.assertLexesEscape("r", "\\\"", "\\\"")
+        self.assertLexesEscape("r", "\\\"", UnicodeOnly("\\\""))
 
     def test_escape_unicode(self):
         self.assertLexesEscape("u", "\\u044b", "ы")
