pandas-dev · nateGeorge · Jul 5, 2016 · Jul 6, 2016 · Jul 6, 2016 · Jul 6, 2016
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -81,3 +81,4 @@ Performance Improvements
 
 Bug Fixes
 ~~~~~~~~~
+- Bug in ``read_csv()``, where aliases for utf-xx (e.g. UTF-xx, UTF_xx, utf_xx) raised UnicodeDecodeError (:issue:`13549`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -339,6 +339,10 @@ def _validate_nrows(nrows):
 def _read(filepath_or_buffer, kwds):
     "Generic reader of line files."
     encoding = kwds.get('encoding', None)
+    if encoding is not None:
+        encoding = re.sub('_', '-', encoding).lower()
+        kwds['encoding'] = encoding
+
     skipfooter = kwds.pop('skipfooter', None)
     if skipfooter is not None:
         kwds['skip_footer'] = skipfooter

diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -1469,3 +1469,23 @@ def test_memory_map(self):
 
         out = self.read_csv(mmap_file, memory_map=True)
         tm.assert_frame_equal(out, expected)
+
+    def test_read_csv_utf_aliases(self):
+        # see gh-13549: aliases such as UTF_8 / utf_16 must be accepted
+        # by read_csv() just like the canonical 'utf-8' / 'utf-16' names.
+        expected = DataFrame({'A': [0, 1], 'B': [2, 3],
+                              'multibyte_test': ['testing123', 'bananabis'],
+                              'mb_nums': [154.868, 457.8798]})
+
+        # ensure_clean() deletes the temporary file even when an assertion
+        # fails, so a failing run cannot leave 'test.csv' behind in the cwd.
+        with tm.ensure_clean('test.csv') as path:
+            for byte in [8, 16]:
+                expected.to_csv(path, encoding='utf-' + str(byte),
+                                index=False)
+                for fmt in ['utf-{0}', 'utf_{0}', 'UTF-{0}', 'UTF_{0}']:
+                    encoding = fmt.format(byte)
+                    for engine in ['c', 'python', None]:
+                        out = self.read_csv(path, engine=engine,
+                                            encoding=encoding)
+                        tm.assert_frame_equal(out, expected)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -81,3 +81,4 @@ Performance Improvements

		Bug Fixes
		~~~~~~~~~
		- Bug in ``read_csv()``, where aliases for utf-xx (e.g. UTF-xx, UTF_xx, utf_xx) raised UnicodeDecodeError (:issue:`13549`)
Review comments on the changelog line above:
- jreback (Contributor), Jul 6, 2016: put in 0.18.2
- jreback (Contributor), Jul 6, 2016: double backticks around `UnicodeDecodeError`
- jreback (Contributor), Jul 6, 2016: pd.read_csv()
- nateGeorge (Contributor, Author), Jul 12, 2016: Looks like 0.18.2 was moved to 0.19.0 this weekend