From 3971ca4a8cddb3e12f7afadaade7e3e61861f38a Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Wed, 4 Dec 2013 11:16:47 -0500
Subject: [PATCH] BUG: fix stata unicode issues on win32

---
 pandas/io/stata.py            | 38 ++++++++++++++++++++++++++++++--
 pandas/io/tests/test_stata.py | 41 +++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 1d0d1d17ec631..064b95293e721 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -19,9 +19,9 @@
 from pandas.core.series import Series
 from pandas.core.categorical import Categorical
 import datetime
-from pandas import compat
+from pandas import compat, isnull
 from pandas.compat import long, lrange, lmap, lzip
-from pandas import isnull
+from pandas.core import common as com
 from pandas.io.common import get_filepath_or_buffer
 
 
@@ -957,6 +957,40 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
             self._write(typ)
 
         # varlist, length 33*nvar, char array, null terminated
+        # Sanitise self.varlist in place: disallowed characters become '_',
+        # a leading digit gets a '_' prefix, names are cut to 32 characters.
+        converted_names = []
+        duplicate_var_id = 0
+        for j, name in enumerate(self.varlist):
+            orig_name = name
+            # Replace every character other than A-Z, a-z, 0-9 and '_' by '_'.
+            for c in name:
+                if ((c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and
+                        (c < '0' or c > '9') and c != '_'):
+                    name = name.replace(c, '_')
+
+            # Variable name may not start with a digit ('0' and '9' included)
+            if '0' <= name[0] <= '9':
+                name = '_' + name
+
+            name = name[:32]
+
+            if not name == orig_name:
+                # check for duplicates
+                while self.varlist.count(name) > 0:
+                    # prepend ascending number to avoid duplicates
+                    name = '_' + str(duplicate_var_id) + name
+                    name = name[:32]
+                    duplicate_var_id += 1
+
+                # best effort: encode the original name if it is unicode
+                try:
+                    orig_name = orig_name.encode('utf-8')
+                except (UnicodeError, AttributeError):
+                    pass
+                converted_names.append("{0} -> {1}".format(orig_name, name))
+                self.varlist[j] = name
+
         for name in self.varlist:
             name = self._null_terminate(name, True)
             name = _pad_bytes(name[:32], 33)
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index 76dae396c04ed..e4fc7c11d8607 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -13,6 +13,7 @@
 import pandas.util.testing as tm
 from pandas.util.misc import is_little_endian
 from pandas import compat
+from pandas.compat import u
 
 class TestStata(tm.TestCase):
 
@@ -231,6 +232,46 @@ def test_encoding(self):
             self.assert_(result == expected)
             self.assert_(isinstance(result, unicode))
 
+    def test_read_write_dta11(self):
+        # Column names with a non-ASCII character, a leading digit or more
+        # than 32 characters are expected to be rewritten by to_stata.
+        original = DataFrame([(1, 2, 3, 4)],
+                             columns=['good', u("\u03c3"), '8number', 'astringwithmorethan32characters______'])
+        if compat.PY3:
+            formatted = DataFrame([(1, 2, 3, 4)],
+                                  columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_'])
+        else:
+            formatted = DataFrame([(1, 2, 3, 4)],
+                                  columns=['good', 'b__d', '_8number', 'astringwithmorethan32characters_'])
+        formatted.index.name = 'index'
+
+        with tm.ensure_clean() as path:
+            with warnings.catch_warnings(record=True) as w:
+                original.to_stata(path, None, False)
+                np.testing.assert_equal(
+                    len(w), 1)  # should get a warning for that format.
+
+            written_and_read_again = self.read_dta(path)
+            tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
+
+    def test_read_write_dta12(self):
+        # Names that collide after rewriting are expected to get a '_<n>' prefix.
+        original = DataFrame([(1, 2, 3, 4)],
+                             columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-'])
+        formatted = DataFrame([(1, 2, 3, 4)],
+                              columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_'])
+        formatted.index.name = 'index'
+
+        with tm.ensure_clean() as path:
+            with warnings.catch_warnings(record=True) as w:
+                original.to_stata(path, None, False)
+                np.testing.assert_equal(
+                    len(w), 1)  # should get a warning for that format.
+
+            written_and_read_again = self.read_dta(path)
+            tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)