pandas-dev
diff --git a/‎doc/source/release.rst
Lines changed: 4 additions & 0 deletions b/‎doc/source/release.rst
Lines changed: 4 additions & 0 deletions
diff --git a/‎doc/source/v0.14.0.txt
Lines changed: 3 additions & 0 deletions b/‎doc/source/v0.14.0.txt
Lines changed: 3 additions & 0 deletions
diff --git a/‎pandas/io/stata.py
Lines changed: 122 additions & 29 deletions b/‎pandas/io/stata.py
Lines changed: 122 additions & 29 deletions
diff --git a/‎pandas/io/tests/data/stata1.dta renamed to ‎pandas/io/tests/data/stata1_114.dta b/‎pandas/io/tests/data/stata1.dta renamed to ‎pandas/io/tests/data/stata1_114.dta
diff --git a/‎pandas/io/tests/data/stata1_v13.dta renamed to ‎pandas/io/tests/data/stata1_117.dta b/‎pandas/io/tests/data/stata1_v13.dta renamed to ‎pandas/io/tests/data/stata1_117.dta
diff --git a/‎pandas/io/tests/data/stata2_113.dta
1.46 KB b/‎pandas/io/tests/data/stata2_113.dta
1.46 KB
diff --git a/‎pandas/io/tests/data/stata2.dta renamed to ‎pandas/io/tests/data/stata2_114.dta b/‎pandas/io/tests/data/stata2.dta renamed to ‎pandas/io/tests/data/stata2_114.dta
diff --git a/‎pandas/io/tests/data/stata2_115.dta
1.74 KB b/‎pandas/io/tests/data/stata2_115.dta
1.74 KB
diff --git a/‎pandas/io/tests/data/stata2_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats
1.74 KB b/‎pandas/io/tests/data/stata2_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats
1.74 KB
diff --git a/‎pandas/io/tests/data/stata2_v13.dta renamed to ‎pandas/io/tests/data/stata2_117.dta b/‎pandas/io/tests/data/stata2_v13.dta renamed to ‎pandas/io/tests/data/stata2_117.dta
@@ -201,6 +201,10 @@ Bug Fixes
 - ``.names`` attribute of MultiIndexes passed to ``set_index`` are now preserved (:issue:`6459`).
 - Bug in setitem with a duplicate index and an alignable rhs (:issue:`6541`)
 - Bug in setitem with loc on mixed integer Indexes (:issue:`6546`)
+- Bug in ``pd.read_stata`` which would use the wrong data types and missing values (:issue:`6327`)
+- Bug in ``DataFrame.to_stata`` that lead to data loss in certain cases, and could exported using the
+  wrong data types and missing values (:issue:`6335`)
+
 
 pandas 0.13.1
 -------------
 
@@ -291,6 +291,9 @@ Enhancements
   using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)
 - Added a ``to_julian_date`` function to ``TimeStamp`` and ``DatetimeIndex``
   to convert to the Julian Date used primarily in astronomy. (:issue:`4041`)
+- ``DataFrame.to_stata`` will now check data for compatibility with Stata data types
+  and will upcast when needed.  When it isn't possibly to losslessly upcast, a warning
+  is raised (:issue:`6327`)
 
 Performance
 ~~~~~~~~~~~
 
@@ -23,7 +23,7 @@
 from pandas.compat import long, lrange, lmap, lzip
 from pandas import isnull
 from pandas.io.common import get_filepath_or_buffer
-
+from pandas.tslib import NaT
 
 def read_stata(filepath_or_buffer, convert_dates=True,
                convert_categoricals=True, encoding=None, index=None):
@@ -48,7 +48,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
 
     return reader.data(convert_dates, convert_categoricals, index)
 
-_date_formats = ["%tc", "%tC", "%td", "%tw", "%tm", "%tq", "%th", "%ty"]
+_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
 
 
 def _stata_elapsed_date_to_datetime(date, fmt):
@@ -97,6 +97,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
     # numpy types and numpy datetime isn't mature enough / we can't rely on
     # pandas version > 0.7.1
     #TODO: IIRC relative delta doesn't play well with np.datetime?
+    #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly
     if np.isnan(date):
         return np.datetime64('nat')
 
@@ -109,7 +110,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
         from warnings import warn
         warn("Encountered %tC format. Leaving in Stata Internal Format.")
         return date
-    elif fmt in ["%td", "td"]:
+    elif fmt in ["%td", "td", "%d", "d"]:
         return stata_epoch + datetime.timedelta(int(date))
     elif fmt in ["%tw", "tw"]:  # does not count leap days - 7 days is a week
         year = datetime.datetime(stata_epoch.year + date // 52, 1, 1)
@@ -150,6 +151,11 @@ def _datetime_to_stata_elapsed(date, fmt):
     if not isinstance(date, datetime.datetime):
         raise ValueError("date should be datetime.datetime format")
     stata_epoch = datetime.datetime(1960, 1, 1)
+    # Handle NaTs
+    if date is NaT:
+        # Missing value for dates ('.'), assumed always double
+        # TODO: Should be moved so a const somewhere, and consolidated
+        return struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0]
     if fmt in ["%tc", "tc"]:
         delta = date - stata_epoch
         return (delta.days * 86400000 + delta.seconds*1000 +
@@ -175,6 +181,62 @@ def _datetime_to_stata_elapsed(date, fmt):
         raise ValueError("fmt %s not understood" % fmt)
 
 
+class PossiblePrecisionLoss(Warning):
+    pass
+
+
+precision_loss_doc = """
+Column converted from %s to %s, and some data are outside of the lossless
+conversion range. This may result in a loss of precision in the saved data.
+"""
+
+
+def _cast_to_stata_types(data):
+    """Checks the dtypes of the columns of a pandas DataFrame for
+    compatibility with the data types and ranges supported by Stata, and
+    converts if necessary.
+
+    Parameters
+    ----------
+    data : DataFrame
+        The DataFrame to check and convert
+
+    Notes
+    -----
+    Numeric columns must be one of int8, int16, int32, float32 or float64, with
+    some additional value restrictions on the integer data types.  int8 and
+    int16 columns are checked for violations of the value restrictions and
+    upcast if needed.  int64 data is not usable in Stata, and so it is
+    downcast to int32 whenever the value are in the int32 range, and
+    sidecast to float64 when larger than this range.  If the int64 values
+    are outside of the range of those perfectly representable as float64 values,
+    a warning is raised.
+    """
+    ws = ''
+    for col in data:
+        dtype = data[col].dtype
+        if dtype == np.int8:
+            if data[col].max() > 100 or data[col].min() < -127:
+                data[col] = data[col].astype(np.int16)
+        elif dtype == np.int16:
+            if data[col].max() > 32740 or data[col].min() < -32767:
+                data[col] = data[col].astype(np.int32)
+        elif dtype == np.int64:
+            if data[col].max() <= 2147483620 and data[col].min() >= -2147483647:
+                data[col] = data[col].astype(np.int32)
+            else:
+                data[col] = data[col].astype(np.float64)
+                if data[col].max() <= 2 * 53 or data[col].min() >= -2 ** 53:
+                    ws = precision_loss_doc % ('int64', 'float64')
+
+    if ws:
+        import warnings
+
+        warnings.warn(ws, PossiblePrecisionLoss)
+
+    return data
+
+
 class StataMissingValue(StringMixin):
     """
     An observation's missing value.
@@ -193,14 +255,23 @@ class StataMissingValue(StringMixin):
     -----
     More information: <http://www.stata.com/help.cgi?missing>
     """
-
+    # TODO: Needs test
     def __init__(self, offset, value):
         self._value = value
-        if type(value) is int or type(value) is long:
-            self._str = value - offset is 1 and \
-                '.' or ('.' + chr(value - offset + 96))
+        value_type = type(value)
+        if value_type in int:
+            loc = value - offset
+        elif value_type in (float, np.float32, np.float64):
+            if value <= np.finfo(np.float32).max:  # float32
+                conv_str, byte_loc, scale = '<f', 1, 8
+            else:
+                conv_str, byte_loc, scale = '<d', 5, 1
+            value_bytes = struct.pack(conv_str, value)
+            loc = (struct.unpack('<b', value_bytes[byte_loc])[0] / scale) + 0
         else:
-            self._str = '.'
+            # Should never be hit
+            loc = 0
+        self._str = loc is 0 and '.' or ('.' + chr(loc + 96))
     string = property(lambda self: self._str,
                       doc="The Stata representation of the missing value: "
                           "'.', '.a'..'.z'")
@@ -240,9 +311,9 @@ def __init__(self, encoding):
             dict(
                 lzip(range(1, 245), ['a' + str(i) for i in range(1, 245)]) +
                 [
-                    (251, np.int16),
-                    (252, np.int32),
-                    (253, np.int64),
+                    (251, np.int8),
+                    (252, np.int16),
+                    (253, np.int32),
                     (254, np.float32),
                     (255, np.float64)
                 ]
@@ -253,9 +324,9 @@ def __init__(self, encoding):
                     (32768, np.string_),
                     (65526, np.float64),
                     (65527, np.float32),
-                    (65528, np.int64),
-                    (65529, np.int32),
-                    (65530, np.int16)
+                    (65528, np.int32),
+                    (65529, np.int16),
+                    (65530, np.int8)
                 ]
             )
         self.TYPE_MAP = lrange(251) + list('bhlfd')
@@ -272,13 +343,19 @@ def __init__(self, encoding):
         #NOTE: technically, some of these are wrong. there are more numbers
         # that can be represented. it's the 27 ABOVE and BELOW the max listed
         # numeric data type in [U] 12.2.2 of the 11.2 manual
-        self.MISSING_VALUES = \
+        float32_min = b'\xff\xff\xff\xfe'
+        float32_max = b'\xff\xff\xff\x7e'
+        float64_min = b'\xff\xff\xff\xff\xff\xff\xef\xff'
+        float64_max = b'\xff\xff\xff\xff\xff\xff\xdf\x7f'
+        self.VALID_RANGE = \
             {
                 'b': (-127, 100),
                 'h': (-32767, 32740),
                 'l': (-2147483647, 2147483620),
-                'f': (-1.701e+38, +1.701e+38),
-                'd': (-1.798e+308, +8.988e+307)
+                'f': (np.float32(struct.unpack('<f', float32_min)[0]),
+                      np.float32(struct.unpack('<f', float32_max)[0])),
+                'd': (np.float64(struct.unpack('<d', float64_min)[0]),
+                      np.float64(struct.unpack('<d', float64_max)[0]))
             }
 
         self.OLD_TYPE_MAPPING = \
@@ -287,6 +364,16 @@ def __init__(self, encoding):
                 'f': 254,
                 'b': 251
             }
+        # These missing values are the generic '.' in Stata, and are used
+        # to replace nans
+        self.MISSING_VALUES = \
+            {
+                'b': 101,
+                'h': 32741,
+                'l': 2147483621,
+                'f': np.float32(struct.unpack('<f', b'\x00\x00\x00\x7f')[0]),
+                'd': np.float64(struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0])
+            }
 
     def _decode_bytes(self, str, errors=None):
         if compat.PY3 or self._encoding is not None:
@@ -556,8 +643,8 @@ def _col_size(self, k=None):
 
     def _unpack(self, fmt, byt):
         d = struct.unpack(self.byteorder + fmt, byt)[0]
-        if fmt[-1] in self.MISSING_VALUES:
-            nmin, nmax = self.MISSING_VALUES[fmt[-1]]
+        if fmt[-1] in self.VALID_RANGE:
+            nmin, nmax = self.VALID_RANGE[fmt[-1]]
             if d < nmin or d > nmax:
                 if self._missing_values:
                     return StataMissingValue(nmax, d)
@@ -855,11 +942,12 @@ def _dtype_to_stata_type(dtype):
     See TYPE_MAP and comments for an explanation. This is also explained in
     the dta spec.
     1 - 244 are strings of this length
-    251 - chr(251) - for int8 and int16, byte
-    252 - chr(252) - for int32, int
-    253 - chr(253) - for int64, long
-    254 - chr(254) - for float32, float
-    255 - chr(255) - double, double
+                         Pandas    Stata
+    251 - chr(251) - for int8      byte
+    252 - chr(252) - for int16     int
+    253 - chr(253) - for int32     long
+    254 - chr(254) - for float32   float
+    255 - chr(255) - for double    double
 
     If there are dates to convert, then dtype will already have the correct
     type inserted.
@@ -878,8 +966,10 @@ def _dtype_to_stata_type(dtype):
     elif dtype == np.int64:
         return chr(253)
     elif dtype == np.int32:
+        return chr(253)
+    elif dtype == np.int16:
         return chr(252)
-    elif dtype == np.int8 or dtype == np.int16:
+    elif dtype == np.int8:
         return chr(251)
     else:  # pragma : no cover
         raise ValueError("Data type %s not currently understood. "
@@ -970,7 +1060,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
         self._file = _open_file_binary_write(
             fname, self._encoding or self._default_encoding
         )
-        self.type_converters = {253: np.long, 252: int}
+        self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8}
 
     def _write(self, to_write):
         """
@@ -990,11 +1080,14 @@ def __init__(self, data):
                 self.data = data
 
             def __iter__(self):
-                for i, row in data.iterrows():
-                    yield row
+                for row in data.itertuples():
+                    # First element is index, so remove
+                    yield row[1:]
 
         if self._write_index:
             data = data.reset_index()
+        # Check columns for compatbaility with stata
+        data = _cast_to_stata_types(data)
         self.datarows = DataFrameRowIter(data)
         self.nobs, self.nvar = data.shape
         self.data = data
@@ -1181,7 +1274,7 @@ def _write_data_dates(self):
                     self._write(var)
                 else:
                     if isnull(var):  # this only matters for floats
-                        var = MISSING_VALUES[typ]
+                        var = MISSING_VALUES[TYPE_MAP[typ]]
                     self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var))
 
     def _null_terminate(self, s, as_string=False):