Modest performance, address #12647 #12656

Closed
wants to merge 19 commits
Changes from 1 commit
Further cythonization
kshedden committed Apr 21, 2016
commit 11c2f310808f7fd65f2756c6a1ebcc170a4c9119
7 changes: 2 additions & 5 deletions pandas/io/sas/sas7bdat.py
@@ -550,10 +550,7 @@ def read(self, nrows=None):
         nd = (self.column_types == b'd').sum()
         ns = (self.column_types == b's').sum()

-        self._string_chunk = []
-        for j,ct in enumerate(self.column_types):
-            if ct == b's':
-                self._string_chunk.append([None] * nrows)
+        self._string_chunk = np.empty((ns, nrows), dtype=np.object)
         self._byte_chunk = np.empty((nd, 8 * nrows), dtype=np.uint8)

         self._current_row_in_chunk_index = 0
@@ -607,7 +604,7 @@ def _chunk_to_dataframe(self):
                 rslt[name] = epoch + pd.to_timedelta(rslt[name], unit='d')
                 jb += 1
             elif self.column_types[j] == b's':
-                rslt[name] = pd.Series(self._string_chunk[js], dtype=np.object)
+                rslt[name] = self._string_chunk[js, :]
                 if self.convert_text and (self.encoding is not None):
                     rslt[name] = rslt[name].str.decode(self.encoding)
                 if self.blank_missing:
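
The two hunks above swap a per-column list-of-lists for one preallocated 2-D object array, so each string cell is written with a single indexed assignment and _chunk_to_dataframe can take a whole column as a slice. A minimal illustration of the two layouts (names simplified, not the PR's code):

    import numpy as np

    nrows, ns = 4, 2  # illustrative chunk shape: 4 rows, 2 string columns

    # Old layout: one Python list per string column, grown up front.
    string_chunk_lists = [[None] * nrows for _ in range(ns)]

    # New layout: a single preallocated 2-D object array.  A cell is
    # filled with one indexed write, and a whole column comes back as
    # a cheap row slice rather than a list that must be re-wrapped in
    # a pd.Series.
    string_chunk = np.empty((ns, nrows), dtype=object)
    string_chunk[0, 0] = b"AK"
    col0 = string_chunk[0, :]  # a view; no per-element copying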
56 changes: 29 additions & 27 deletions pandas/io/sas/saslib.pyx
@@ -3,20 +3,21 @@ cimport numpy as np
 from numpy cimport uint8_t, uint16_t, int8_t
 import sas_constants as const


 # rle_decompress decompresses data using a Run Length Encoding
 # algorithm. It is partially documented here:
 #
 # https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf
-cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
+cdef rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):

-    cdef:
-        uint8_t control_byte, x
-        np.ndarray[uint8_t, ndim=1] result = np.zeros(result_length, np.uint8)
-        int rpos = 0
-        int ipos = 0
-        int i, nbytes
-        length = len(inbuff)
+    cdef uint8_t control_byte
+    cdef uint8_t [:] result = np.zeros(result_length, np.uint8)
+
+    cdef int rpos = 0
+    cdef int ipos = 0
+    cdef int i
+    cdef int nbytes
+    cdef uint8_t x
+    cdef length = len(inbuff)

     while ipos < length:
         control_byte = inbuff[ipos] & 0xF0
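
The cdef rewrite above trades ndarray buffer declarations for typed memoryviews (uint8_t [:]), which give fast element access and can be re-wrapped by np.asarray without copying. A rough pure-Python analogy of that shared-buffer relationship (not the Cython semantics themselves, just the idea):

    import numpy as np

    buf = np.zeros(8, np.uint8)    # backing array, like the np.zeros above
    view = memoryview(buf)         # loosely analogous to a uint8_t[:] view
    view[0] = 0xF0                 # element writes land directly in buf
    assert buf[0] == 0xF0
    assert np.asarray(view)[0] == 0xF0  # re-wrapping copies nothing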
@@ -105,22 +106,24 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
     if len(result) != result_length:
         print("RLE: %v != %v\n", (len(result), result_length))

-    return np.asarray(result, dtype=np.uint8)
+    return np.asarray(result).tostring()


 # rdc_decompress decompresses data using the Ross Data Compression algorithm:
 #
 # http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
-cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
+cdef rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):

-    cdef:
-        uint8_t cmd, ofs, cnt
-        uint16_t ctrl_bits
-        uint16_t ctrl_mask = 0
-        int ipos = 0
-        int rpos = 0
-        int k
-        np.ndarray[uint8_t, ndim=1] outbuff = np.zeros(result_length, dtype=np.uint8)
+    cdef uint8_t cmd
+    cdef uint16_t ctrl_bits
+    cdef uint16_t ctrl_mask = 0
+    cdef uint16_t ofs
+    cdef uint16_t cnt
+    cdef int ipos = 0
+    cdef int rpos = 0
+    cdef int k
+
+    cdef uint8_t [:] outbuff = np.zeros(result_length, dtype=np.uint8)

     ii = -1
@@ -187,10 +190,9 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
     if len(outbuff) != result_length:
         raise ValueError("RDC: %v != %v\n", len(outbuff), result_length)

-    return np.asarray(outbuff, dtype=np.uint8)
-
+    return np.asarray(outbuff).tostring()

-cdef np.ndarray[uint8_t, ndim=1] decompress(object parser, int row_length, uint8_t[:] page):
+cdef decompress(object parser, int row_length, page):
+    page = np.frombuffer(page, dtype=np.uint8)
     if parser.compression == const.rle_compression:
         return rle_decompress(row_length, page)
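
decompress is a thin dispatcher on the parser's compression marker. A hypothetical pure-Python mirror; the stub decoder and the constant values (quoted from memory from pandas' sas_constants) are assumptions, not the PR's code:

    def rle_decompress_stub(row_length, page):
        # Stand-in for the real Cython rle_decompress above.
        return bytes(page[:row_length])

    RLE_COMPRESSION = b"SASYZCRL"  # assumed value of const.rle_compression
    RDC_COMPRESSION = b"SASYZCR2"  # assumed value of const.rdc_compression

    def decompress(compression, row_length, page):
        # Pick the decoder that matches the file's compression marker.
        if compression == RLE_COMPRESSION:
            return rle_decompress_stub(row_length, page)
        elif compression == RDC_COMPRESSION:
            raise NotImplementedError("RDC decoding not sketched here")
        else:
            raise ValueError("unknown SAS compression method")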
@@ -210,7 +212,7 @@ def do_read(object parser, int nrows):
             break


-cdef bint readline(object parser):
+cdef readline(object parser):

     cdef:
         int offset, bit_offset, align_correction, subheader_pointer_length
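
The body of do_read is mostly elided by the diff view; from the visible `break`, it is presumably a row loop that stops once readline signals the file is exhausted. A sketch under that assumption (readline here stands for the module's function below):

    def do_read(parser, nrows):
        # Read up to nrows rows; readline() is assumed to return a
        # truthy value once there is nothing left to read, which is
        # why the loop body ends in a bare `break`.
        for i in range(nrows):
            done = readline(parser)
            if done:
                break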
@@ -281,17 +283,17 @@ cdef bint readline(object parser):
                                parser._current_page_type)


-cdef void process_byte_array_with_data(object parser, int offset, int length):
+cdef process_byte_array_with_data(object parser, int offset, int length):

     cdef:
         int s, j, k, m, start, jb, js, lngt
         long[:] lengths = parser._column_data_lengths
         long[:] offsets = parser._column_data_offsets
         char[:] column_types = parser.column_types
         uint8_t[:, :] byte_chunk = parser._byte_chunk
-        #object[:, :] string_chunk = parser._string_chunk
+        object[:, :] string_chunk = parser._string_chunk

-    source = np.frombuffer(parser._cached_page[offset:offset+length], dtype=np.uint8)
+    source = parser._cached_page[offset:offset+length]
     if (parser.compression != "") and (length < parser.row_length):
         source = decompress(parser, parser.row_length, source)

@@ -312,7 +314,7 @@ cdef void process_byte_array_with_data(object parser, int offset, int length):
                 byte_chunk[jb, m + k] = source[start + k]
             jb += 1
         elif column_types[j] == b's':
-            parser._string_chunk[js][parser._current_row_in_chunk_index] = source[start:(start+lngt)].tostring().rstrip()
+            string_chunk[js, parser._current_row_in_chunk_index] = source[start:(start+lngt)].rstrip()
             js += 1
         else:
             raise ValueError("unknown column type: %s" % parser.columns[j].ctype)
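
The string branch above now writes each cell straight into the 2-D object chunk allocated in sas7bdat.py, and since both the cached-page and decompressed paths yield plain bytes, a single .rstrip() on the slice suffices. A simplified stand-alone rendering of that write (names shortened, not the PR's code):

    import numpy as np

    ns, nrows = 1, 2
    string_chunk = np.empty((ns, nrows), dtype=object)  # as allocated in read()

    source = b"AK      "   # one row of raw bytes after any decompression
    start, lngt = 0, 8     # this column's offset and width within the row
    row = 0                # stands in for parser._current_row_in_chunk_index

    string_chunk[0, row] = source[start:start + lngt].rstrip()
    assert string_chunk[0, 0] == b"AK"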