Added option to not decode header text

pandas-dev · kshedden · Mar 17, 2016 · Mar 19, 2016 · Mar 19, 2016 · Mar 19, 2016
commit 873a877ef946ddf006eb0edd408f3f5bff9a4682
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
@@ -54,20 +54,24 @@ class SAS7BDATReader(BaseIterator):
         with given number of lines.
     encoding : string, defaults to None
         String encoding.
-    convert_text : bool, deafaults to True
+    convert_text : bool, defaults to True
         If False, text variables are left as raw bytes.
+    convert_header_text : bool, defaults to True
+        If False, header text, including column names, is left as raw
+        bytes.
     """
 
     def __init__(self, path_or_buf, index=None, convert_dates=True,
                  blank_missing=True, chunksize=None, encoding=None,
-                 convert_text=True):
+                 convert_text=True, convert_header_text=True):
 
         self.index = index
         self.convert_dates = convert_dates
         self.blank_missing = blank_missing
         self.chunksize = chunksize
         self.encoding = encoding
         self.convert_text = convert_text
+        self.convert_header_text = convert_header_text
 
         self.compression = ""
         self.column_names_strings = []
@@ -143,10 +147,14 @@ def _get_properties(self):
             self.platform = "unknown"
 
         buf = self._read_bytes(const.dataset_offset, const.dataset_length)
-        self.name = buf.rstrip(b'\x00 ').decode()
+        self.name = buf.rstrip(b'\x00 ')
+        if self.convert_header_text:
+            self.name = self.name.decode(self.encoding)
 
         buf = self._read_bytes(const.file_type_offset, const.file_type_length)
-        self.file_type = buf.rstrip(b'\x00 ').decode()
+        self.file_type = buf.rstrip(b'\x00 ')
+        if self.convert_header_text:
+            self.file_type = self.file_type.decode(self.encoding)
 
         # Timestamp is epoch 01/01/1960
         epoch = pd.datetime(1960, 1, 1)
@@ -173,25 +181,33 @@ def _get_properties(self):
 
         buf = self._read_bytes(const.sas_release_offset + total_align,
                                const.sas_release_length)
-        self.sas_release = buf.rstrip(b'\x00 ').decode()
+        self.sas_release = buf.rstrip(b'\x00 ')
+        if self.convert_header_text:
+            self.sas_release = self.sas_release.decode(self.encoding)
 
         buf = self._read_bytes(const.sas_server_type_offset + total_align,
                                const.sas_server_type_length)
-        self.server_type = buf.rstrip(b'\x00 ').decode()
+        self.server_type = buf.rstrip(b'\x00 ')
+        if self.convert_header_text:
+            self.server_type = self.server_type.decode(self.encoding)
 
         buf = self._read_bytes(const.os_version_number_offset + total_align,
                                const.os_version_number_length)
-        self.os_version = buf.rstrip(b'\x00 ').decode()
+        self.os_version = buf.rstrip(b'\x00 ')
+        if self.convert_header_text:
+            self.os_version = self.os_version.decode(self.encoding)
 
         buf = self._read_bytes(const.os_name_offset + total_align,
                                const.os_name_length)
         buf = buf.rstrip(b'\x00 ')
         if len(buf) > 0:
-            self.os_name = buf.decode()
+            self.os_name = buf.decode(self.encoding)
         else:
             buf = self._read_bytes(const.os_maker_offset + total_align,
                                    const.os_maker_length)
-            self.os_name = buf.rstrip(b'\x00 ').decode()
+            self.os_name = buf.rstrip(b'\x00 ')
+            if self.convert_header_text:
+                self.os_name = self.os_name.decode(self.encoding)
 
     # Read a single float of the given width (4 or 8).
     def _read_float(self, offset, width):
@@ -383,8 +399,10 @@ def _process_columntext_subheader(self, offset, length):
         text_block_size = self._read_int(offset, const.text_block_size_length)
 
         buf = self._read_bytes(offset, text_block_size)
-        self.column_names_strings.append(
-            buf[0:text_block_size].rstrip(b"\x00 ").decode(self.encoding))
+        cname = buf[0:text_block_size].rstrip(b"\x00 ")
+        if self.convert_header_text:
+            cname = cname.decode(self.encoding)
+        self.column_names_strings.append(cname)
 
         if len(self.column_names_strings) == 1:
             column_name = self.column_names_strings[0]

diff --git a/pandas/io/sas/saslib.pyx b/pandas/io/sas/saslib.pyx
@@ -11,7 +11,6 @@ cdef rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
 
     cdef uint8_t control_byte
     cdef uint8_t [:] result = np.zeros(result_length, np.uint8)
-
     cdef int rpos = 0
     cdef int ipos = 0
     cdef int i
@@ -106,7 +105,7 @@ cdef rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
     if len(result) != result_length:
         print("RLE: %v != %v\n", (len(result), result_length))
 
-    return np.asarray(result).tostring()
+    return np.asarray(result)
 
 
 # rdc_decompress decompresses data using the Ross Data Compression algorithm:
@@ -122,7 +121,6 @@ cdef rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
     cdef int ipos = 0
     cdef int rpos = 0
     cdef int k
-
     cdef uint8_t [:] outbuff = np.zeros(result_length, dtype=np.uint8)
 
     ii = -1
@@ -190,7 +188,7 @@ cdef rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
     if len(outbuff) != result_length:
         raise ValueError("RDC: %v != %v\n", len(outbuff), result_length)
 
-    return np.asarray(outbuff).tostring()
+    return np.asarray(outbuff)
 
 cdef decompress(object parser, int row_length, page):
     page = np.frombuffer(page, dtype=np.uint8)
@@ -292,10 +290,13 @@ cdef process_byte_array_with_data(object parser, int offset, int length):
         char[:] column_types = parser.column_types
         uint8_t[:, :] byte_chunk = parser._byte_chunk
         object[:, :] string_chunk = parser._string_chunk
+        np.ndarray[uint8_t, ndim=1] source
+        np.ndarray[uint8_t, ndim=1] raw_source = np.frombuffer(parser._cached_page[offset:offset+length], dtype=np.uint8)
 
-    source = parser._cached_page[offset:offset+length]
     if (parser.compression != "") and (length < parser.row_length):
-        source = decompress(parser, parser.row_length, source)
+        source = decompress(parser, parser.row_length, raw_source)
+    else:
+        source = raw_source
 
     s = 8 * parser._current_row_in_chunk_index
     js = 0
@@ -314,7 +315,7 @@ cdef process_byte_array_with_data(object parser, int offset, int length):
                 byte_chunk[jb, m + k] = source[start + k]
             jb += 1
         elif column_types[j] == b's':
-            string_chunk[js, parser._current_row_in_chunk_index] = source[start:(start+lngt)].rstrip()
+            string_chunk[js, parser._current_row_in_chunk_index] = source[start:(start+lngt)].tostring().rstrip()
             js += 1
         else:
           raise ValueError("unknown column type: %s" % parser.columns[j].ctype)

diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py
@@ -47,7 +47,7 @@ def test_from_buffer(self):
                 byts = open(fname, 'rb').read()
                 buf = io.BytesIO(byts)
                 df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8')
-                tm.assert_frame_equal(df, df0)
+                tm.assert_frame_equal(df, df0, check_exact=False)
 
     def test_from_iterator(self):
         for j in 0, 1: