From 3971ca4a8cddb3e12f7afadaade7e3e61861f38a Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 4 Dec 2013 11:16:47 -0500 Subject: [PATCH] BUG: fix stata unicode issues on win32 --- pandas/io/stata.py | 38 ++++++++++++++++++++++++++++++-- pandas/io/tests/test_stata.py | 41 +++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 1d0d1d17ec631..064b95293e721 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -19,9 +19,9 @@ from pandas.core.series import Series from pandas.core.categorical import Categorical import datetime -from pandas import compat +from pandas import compat, isnull from pandas.compat import long, lrange, lmap, lzip -from pandas import isnull +from pandas.core import common as com from pandas.io.common import get_filepath_or_buffer @@ -957,6 +957,40 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None, self._write(typ) # varlist, length 33*nvar, char array, null terminated +<<<<<<< Updated upstream +======= + converted_names = [] + duplicate_var_id = 0 + for j, name in enumerate(self.varlist): + orig_name = name + # Replaces all characters disallowed in .dta format by their integral representation. + for c in name: + if (c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and (c < '0' or c > '9') and c != '_': + name = name.replace(c, '_') + + # Variable name may not start with a number + if name[0] > '0' and name[0] < '9': + name = '_' + name + + name = name[:min(len(name), 32)] + + if not name == orig_name: + # check for duplicates + while self.varlist.count(name) > 0: + # prepend ascending number to avoid duplicates + name = '_' + str(duplicate_var_id) + name + name = name[:min(len(name), 32)] + duplicate_var_id += 1 + + # need to possibly encode the orig name if its unicode + try: + orig_name = orig_name.encode('utf-8') + except: + pass + converted_names.append("{0} -> {1}".format(orig_name,name)) + self.varlist[j] = name + +>>>>>>> Stashed changes for name in self.varlist: name = self._null_terminate(name, True) name = _pad_bytes(name[:32], 33) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 76dae396c04ed..e4fc7c11d8607 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -13,6 +13,7 @@ import pandas.util.testing as tm from pandas.util.misc import is_little_endian from pandas import compat +from pandas.compat import u class TestStata(tm.TestCase): @@ -231,6 +232,46 @@ def test_encoding(self): self.assert_(result == expected) self.assert_(isinstance(result, unicode)) +<<<<<<< Updated upstream +======= + def test_read_write_dta11(self): + import pdb; pdb.set_trace() + original = DataFrame([(1, 2, 3, 4)], + columns=['good', u("\u03c3"), '8number', 'astringwithmorethan32characters______']) + if compat.PY3: + formatted = DataFrame([(1, 2, 3, 4)], + columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_']) + else: + formatted = DataFrame([(1, 2, 3, 4)], + columns=['good', 'b__d', '_8number', 'astringwithmorethan32characters_']) + formatted.index.name = 'index' + + with tm.ensure_clean() as path: + with warnings.catch_warnings(record=True) as w: + original.to_stata(path, None, False) + np.testing.assert_equal( + len(w), 1) # should get a warning for that format. + + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + + def test_read_write_dta12(self): + original = DataFrame([(1, 2, 3, 4)], + columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-']) + formatted = DataFrame([(1, 2, 3, 4)], + columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_']) + formatted.index.name = 'index' + + with tm.ensure_clean() as path: + with warnings.catch_warnings(record=True) as w: + original.to_stata(path, None, False) + np.testing.assert_equal( + len(w), 1) # should get a warning for that format. + + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + +>>>>>>> Stashed changes if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)