23
23
from pandas .compat import long , lrange , lmap , lzip
24
24
from pandas import isnull
25
25
from pandas .io .common import get_filepath_or_buffer
26
-
26
+ from pandas . tslib import NaT
27
27
28
28
def read_stata (filepath_or_buffer , convert_dates = True ,
29
29
convert_categoricals = True , encoding = None , index = None ):
@@ -48,7 +48,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
48
48
49
49
return reader .data (convert_dates , convert_categoricals , index )
50
50
51
- _date_formats = ["%tc" , "%tC" , "%td" , "%tw" , "%tm" , "%tq" , "%th" , "%ty" ]
51
+ _date_formats = ["%tc" , "%tC" , "%td" , "%d" , "% tw" , "%tm" , "%tq" , "%th" , "%ty" ]
52
52
53
53
54
54
def _stata_elapsed_date_to_datetime (date , fmt ):
@@ -97,6 +97,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
97
97
# numpy types and numpy datetime isn't mature enough / we can't rely on
98
98
# pandas version > 0.7.1
99
99
#TODO: IIRC relative delta doesn't play well with np.datetime?
100
+ #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly
100
101
if np .isnan (date ):
101
102
return np .datetime64 ('nat' )
102
103
@@ -109,7 +110,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
109
110
from warnings import warn
110
111
warn ("Encountered %tC format. Leaving in Stata Internal Format." )
111
112
return date
112
- elif fmt in ["%td" , "td" ]:
113
+ elif fmt in ["%td" , "td" , "%d" , "d" ]:
113
114
return stata_epoch + datetime .timedelta (int (date ))
114
115
elif fmt in ["%tw" , "tw" ]: # does not count leap days - 7 days is a week
115
116
year = datetime .datetime (stata_epoch .year + date // 52 , 1 , 1 )
@@ -150,6 +151,11 @@ def _datetime_to_stata_elapsed(date, fmt):
150
151
if not isinstance (date , datetime .datetime ):
151
152
raise ValueError ("date should be datetime.datetime format" )
152
153
stata_epoch = datetime .datetime (1960 , 1 , 1 )
154
+ # Handle NaTs
155
+ if date is NaT :
156
+ # Missing value for dates ('.'), assumed always double
157
+ # TODO: Should be moved to a constant somewhere, and consolidated
158
+ return struct .unpack ('<d' , b'\x00 \x00 \x00 \x00 \x00 \x00 \xe0 \x7f ' )[0 ]
153
159
if fmt in ["%tc" , "tc" ]:
154
160
delta = date - stata_epoch
155
161
return (delta .days * 86400000 + delta .seconds * 1000 +
@@ -175,6 +181,62 @@ def _datetime_to_stata_elapsed(date, fmt):
175
181
raise ValueError ("fmt %s not understood" % fmt )
176
182
177
183
184
class PossiblePrecisionLoss(Warning):
    """Warning issued when an int64 column may lose precision as float64."""
    pass


precision_loss_doc = """
Column converted from %s to %s, and some data are outside of the lossless
conversion range. This may result in a loss of precision in the saved data.
"""


def _cast_to_stata_types(data):
    """Convert the columns of a DataFrame to dtypes that Stata can store.

    Parameters
    ----------
    data : DataFrame
        The DataFrame to check and convert. Columns that need a different
        dtype are replaced in place on this object.

    Returns
    -------
    DataFrame
        The same object that was passed in, after any column conversions.

    Notes
    -----
    Numeric columns must be one of int8, int16, int32, float32 or float64,
    with some additional value restrictions on the integer data types.

    * int8 columns with values outside [-127, 100] are upcast to int16.
    * int16 columns with values outside [-32767, 32740] are upcast to int32.
    * int64 columns are downcast to int32 when every value lies in
      [-2147483647, 2147483620], and converted to float64 otherwise.

    When an int64 column converted to float64 holds a value whose magnitude
    exceeds 2 ** 53, not every value is exactly representable as float64 and
    a PossiblePrecisionLoss warning is issued.
    """
    ws = ''
    for col in data:
        dtype = data[col].dtype
        if dtype == np.int8:
            if data[col].max() > 100 or data[col].min() < -127:
                data[col] = data[col].astype(np.int16)
        elif dtype == np.int16:
            if data[col].max() > 32740 or data[col].min() < -32767:
                data[col] = data[col].astype(np.int32)
        elif dtype == np.int64:
            col_max = data[col].max()
            col_min = data[col].min()
            if col_max <= 2147483620 and col_min >= -2147483647:
                data[col] = data[col].astype(np.int32)
            else:
                # Check the integer values before casting: every integer
                # with magnitude <= 2 ** 53 converts to float64 exactly,
                # anything larger may be rounded.
                if col_max > 2 ** 53 or col_min < -2 ** 53:
                    ws = precision_loss_doc % ('int64', 'float64')
                data[col] = data[col].astype(np.float64)

    if ws:
        import warnings

        warnings.warn(ws, PossiblePrecisionLoss)

    return data
238
+
239
+
178
240
class StataMissingValue (StringMixin ):
179
241
"""
180
242
An observation's missing value.
@@ -193,14 +255,23 @@ class StataMissingValue(StringMixin):
193
255
-----
194
256
More information: <http://www.stata.com/help.cgi?missing>
195
257
"""
196
-
258
+ # TODO: Needs test
197
259
def __init__(self, offset, value):
    """Record a raw missing-value code and its Stata label.

    Parameters
    ----------
    offset : number
        Largest valid (non-missing) value of the column's storage type.
        Only used for integer codes, where the generic missing value '.'
        is stored as ``offset + 1`` and '.a'..'.z' follow consecutively.
    value : int or float
        The raw value read from the file.

    Notes
    -----
    Floating point missing values are decoded from their little-endian
    byte pattern. The letter index is held in byte 1 of a float32 in
    steps of 8, and in byte 5 of a float64 in steps of 1. Any value that
    does not decode to one of the 27 codes is labelled as the generic '.'.
    """
    import numbers

    self._value = value
    if isinstance(value, numbers.Integral):
        # '.' is the first value above the valid range, '.a' the next, ...
        loc = int(value) - offset - 1
    elif isinstance(value, (float, np.floating)):
        if value <= np.finfo(np.float32).max:  # float32
            conv_str, byte_loc, scale = '<f', 1, 8
        else:
            conv_str, byte_loc, scale = '<d', 5, 1
        value_bytes = struct.pack(conv_str, value)
        # Slice rather than index so a bytes object is handed to unpack on
        # both Python 2 and 3; read unsigned because float32 codes for
        # '.p'..'.z' have the high bit of this byte set.
        code = struct.unpack('<B', value_bytes[byte_loc:byte_loc + 1])[0]
        loc = code // scale
    else:
        # Should never be hit
        loc = 0
    if not 0 <= loc <= 26:
        # Not one of '.', '.a'..'.z'; fall back to the generic label
        loc = 0
    self._str = '.' if loc == 0 else '.' + chr(loc + 96)
204
275
string = property (lambda self : self ._str ,
205
276
doc = "The Stata representation of the missing value: "
206
277
"'.', '.a'..'.z'" )
@@ -240,9 +311,9 @@ def __init__(self, encoding):
240
311
dict (
241
312
lzip (range (1 , 245 ), ['a' + str (i ) for i in range (1 , 245 )]) +
242
313
[
243
- (251 , np .int16 ),
244
- (252 , np .int32 ),
245
- (253 , np .int64 ),
314
+ (251 , np .int8 ),
315
+ (252 , np .int16 ),
316
+ (253 , np .int32 ),
246
317
(254 , np .float32 ),
247
318
(255 , np .float64 )
248
319
]
@@ -253,9 +324,9 @@ def __init__(self, encoding):
253
324
(32768 , np .string_ ),
254
325
(65526 , np .float64 ),
255
326
(65527 , np .float32 ),
256
- (65528 , np .int64 ),
257
- (65529 , np .int32 ),
258
- (65530 , np .int16 )
327
+ (65528 , np .int32 ),
328
+ (65529 , np .int16 ),
329
+ (65530 , np .int8 )
259
330
]
260
331
)
261
332
self .TYPE_MAP = lrange (251 ) + list ('bhlfd' )
@@ -272,13 +343,19 @@ def __init__(self, encoding):
272
343
#NOTE: technically, some of these are wrong. there are more numbers
273
344
# that can be represented. it's the 27 ABOVE and BELOW the max listed
274
345
# numeric data type in [U] 12.2.2 of the 11.2 manual
275
- self .MISSING_VALUES = \
346
+ float32_min = b'\xff \xff \xff \xfe '
347
+ float32_max = b'\xff \xff \xff \x7e '
348
+ float64_min = b'\xff \xff \xff \xff \xff \xff \xef \xff '
349
+ float64_max = b'\xff \xff \xff \xff \xff \xff \xdf \x7f '
350
+ self .VALID_RANGE = \
276
351
{
277
352
'b' : (- 127 , 100 ),
278
353
'h' : (- 32767 , 32740 ),
279
354
'l' : (- 2147483647 , 2147483620 ),
280
- 'f' : (- 1.701e+38 , + 1.701e+38 ),
281
- 'd' : (- 1.798e+308 , + 8.988e+307 )
355
+ 'f' : (np .float32 (struct .unpack ('<f' , float32_min )[0 ]),
356
+ np .float32 (struct .unpack ('<f' , float32_max )[0 ])),
357
+ 'd' : (np .float64 (struct .unpack ('<d' , float64_min )[0 ]),
358
+ np .float64 (struct .unpack ('<d' , float64_max )[0 ]))
282
359
}
283
360
284
361
self .OLD_TYPE_MAPPING = \
@@ -287,6 +364,16 @@ def __init__(self, encoding):
287
364
'f' : 254 ,
288
365
'b' : 251
289
366
}
367
+ # These missing values are the generic '.' in Stata, and are used
368
+ # to replace nans
369
+ self .MISSING_VALUES = \
370
+ {
371
+ 'b' : 101 ,
372
+ 'h' : 32741 ,
373
+ 'l' : 2147483621 ,
374
+ 'f' : np .float32 (struct .unpack ('<f' , b'\x00 \x00 \x00 \x7f ' )[0 ]),
375
+ 'd' : np .float64 (struct .unpack ('<d' , b'\x00 \x00 \x00 \x00 \x00 \x00 \xe0 \x7f ' )[0 ])
376
+ }
290
377
291
378
def _decode_bytes (self , str , errors = None ):
292
379
if compat .PY3 or self ._encoding is not None :
@@ -556,8 +643,8 @@ def _col_size(self, k=None):
556
643
557
644
def _unpack (self , fmt , byt ):
558
645
d = struct .unpack (self .byteorder + fmt , byt )[0 ]
559
- if fmt [- 1 ] in self .MISSING_VALUES :
560
- nmin , nmax = self .MISSING_VALUES [fmt [- 1 ]]
646
+ if fmt [- 1 ] in self .VALID_RANGE :
647
+ nmin , nmax = self .VALID_RANGE [fmt [- 1 ]]
561
648
if d < nmin or d > nmax :
562
649
if self ._missing_values :
563
650
return StataMissingValue (nmax , d )
@@ -855,11 +942,12 @@ def _dtype_to_stata_type(dtype):
855
942
See TYPE_MAP and comments for an explanation. This is also explained in
856
943
the dta spec.
857
944
1 - 244 are strings of this length
858
- 251 - chr(251) - for int8 and int16, byte
859
- 252 - chr(252) - for int32, int
860
- 253 - chr(253) - for int64, long
861
- 254 - chr(254) - for float32, float
862
- 255 - chr(255) - double, double
945
+ Pandas Stata
946
+ 251 - chr(251) - for int8 byte
947
+ 252 - chr(252) - for int16 int
948
+ 253 - chr(253) - for int32 long
949
+ 254 - chr(254) - for float32 float
950
+ 255 - chr(255) - for double double
863
951
864
952
If there are dates to convert, then dtype will already have the correct
865
953
type inserted.
@@ -878,8 +966,10 @@ def _dtype_to_stata_type(dtype):
878
966
elif dtype == np .int64 :
879
967
return chr (253 )
880
968
elif dtype == np .int32 :
969
+ return chr (253 )
970
+ elif dtype == np .int16 :
881
971
return chr (252 )
882
- elif dtype == np .int8 or dtype == np . int16 :
972
+ elif dtype == np .int8 :
883
973
return chr (251 )
884
974
else : # pragma : no cover
885
975
raise ValueError ("Data type %s not currently understood. "
@@ -970,7 +1060,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
970
1060
self ._file = _open_file_binary_write (
971
1061
fname , self ._encoding or self ._default_encoding
972
1062
)
973
- self .type_converters = {253 : np .long , 252 : int }
1063
+ self .type_converters = {253 : np .int32 , 252 : np . int16 , 251 : np . int8 }
974
1064
975
1065
def _write (self , to_write ):
976
1066
"""
@@ -990,11 +1080,14 @@ def __init__(self, data):
990
1080
self .data = data
991
1081
992
1082
def __iter__(self):
    """Yield each row of the wrapped DataFrame without its index value."""
    # Iterate the frame stored on the instance rather than relying on a
    # same-named variable being visible from the enclosing scope.
    for row in self.data.itertuples():
        # First element is index, so remove
        yield row[1:]
995
1086
996
1087
if self ._write_index :
997
1088
data = data .reset_index ()
1089
+ # Check columns for compatibility with Stata
1090
+ data = _cast_to_stata_types (data )
998
1091
self .datarows = DataFrameRowIter (data )
999
1092
self .nobs , self .nvar = data .shape
1000
1093
self .data = data
@@ -1181,7 +1274,7 @@ def _write_data_dates(self):
1181
1274
self ._write (var )
1182
1275
else :
1183
1276
if isnull (var ): # this only matters for floats
1184
- var = MISSING_VALUES [typ ]
1277
+ var = MISSING_VALUES [TYPE_MAP [ typ ] ]
1185
1278
self ._file .write (struct .pack (byteorder + TYPE_MAP [typ ], var ))
1186
1279
1187
1280
def _null_terminate (self , s , as_string = False ):
0 commit comments