4242)
4343
4444from pandas .io .common import get_handle
45- from pandas .io .sas ._sas import Parser
45+ from pandas .io .sas ._sas import (
46+ Parser ,
47+ get_subheader_index ,
48+ )
4649import pandas .io .sas .sas_constants as const
4750from pandas .io .sas .sasreader import ReaderBase
4851
@@ -87,19 +90,6 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
8790 return s_series
8891
8992
90- class _SubheaderPointer :
91- offset : int
92- length : int
93- compression : int
94- ptype : int
95-
96- def __init__ (self , offset : int , length : int , compression : int , ptype : int ) -> None :
97- self .offset = offset
98- self .length = length
99- self .compression = compression
100- self .ptype = ptype
101-
102-
10393class _Column :
10494 col_id : int
10595 name : str | bytes
@@ -189,7 +179,7 @@ def __init__(
189179 self .column_formats : list [str | bytes ] = []
190180 self .columns : list [_Column ] = []
191181
192- self ._current_page_data_subheader_pointers : list [_SubheaderPointer ] = []
182+ self ._current_page_data_subheader_pointers : list [tuple [ int , int ] ] = []
193183 self ._cached_page = None
194184 self ._column_data_lengths : list [int ] = []
195185 self ._column_data_offsets : list [int ] = []
@@ -205,6 +195,19 @@ def __init__(
205195
206196 self ._path_or_buf = self .handles .handle
207197
198+ # Same order as const.SASIndex
199+ self ._subheader_processors = [
200+ self ._process_rowsize_subheader ,
201+ self ._process_columnsize_subheader ,
202+ self ._process_subheader_counts ,
203+ self ._process_columntext_subheader ,
204+ self ._process_columnname_subheader ,
205+ self ._process_columnattributes_subheader ,
206+ self ._process_format_subheader ,
207+ self ._process_columnlist_subheader ,
208+ None , # Data
209+ ]
210+
208211 try :
209212 self ._get_properties ()
210213 self ._parse_metadata ()
@@ -426,89 +429,47 @@ def _process_page_metadata(self) -> None:
426429 bit_offset = self ._page_bit_offset
427430
428431 for i in range (self ._current_page_subheaders_count ):
429- pointer = self ._process_subheader_pointers (
430- const .subheader_pointers_offset + bit_offset , i
431- )
432- if pointer .length == 0 :
433- continue
434- if pointer .compression == const .truncated_subheader_id :
435- continue
436- subheader_signature = self ._read_subheader_signature (pointer .offset )
437- subheader_index = self ._get_subheader_index (
438- subheader_signature , pointer .compression , pointer .ptype
439- )
440- self ._process_subheader (subheader_index , pointer )
441-
442- def _get_subheader_index (self , signature : bytes , compression , ptype ) -> int :
443- # TODO: return here could be made an enum
444- index = const .subheader_signature_to_index .get (signature )
445- if index is None :
446- f1 = (compression == const .compressed_subheader_id ) or (compression == 0 )
447- f2 = ptype == const .compressed_subheader_type
448- if (self .compression != b"" ) and f1 and f2 :
449- index = const .SASIndex .data_subheader_index
450- else :
451- self .close ()
452- raise ValueError ("Unknown subheader signature" )
453- return index
454-
455- def _process_subheader_pointers (
456- self , offset : int , subheader_pointer_index : int
457- ) -> _SubheaderPointer :
458-
459- subheader_pointer_length = self ._subheader_pointer_length
460- total_offset = offset + subheader_pointer_length * subheader_pointer_index
432+ offset = const .subheader_pointers_offset + bit_offset
433+ total_offset = offset + self ._subheader_pointer_length * i
461434
462- subheader_offset = self ._read_int (total_offset , self ._int_length )
463- total_offset += self ._int_length
435+ subheader_offset = self ._read_int (total_offset , self ._int_length )
436+ total_offset += self ._int_length
464437
465- subheader_length = self ._read_int (total_offset , self ._int_length )
466- total_offset += self ._int_length
438+ subheader_length = self ._read_int (total_offset , self ._int_length )
439+ total_offset += self ._int_length
467440
468- subheader_compression = self ._read_int (total_offset , 1 )
469- total_offset += 1
470-
471- subheader_type = self ._read_int (total_offset , 1 )
472-
473- x = _SubheaderPointer (
474- subheader_offset , subheader_length , subheader_compression , subheader_type
475- )
441+ subheader_compression = self ._read_int (total_offset , 1 )
442+ total_offset += 1
476443
477- return x
444+ subheader_type = self . _read_int ( total_offset , 1 )
478445
479- def _read_subheader_signature (self , offset : int ) -> bytes :
480- subheader_signature = self ._read_bytes (offset , self ._int_length )
481- return subheader_signature
482-
483- def _process_subheader (
484- self , subheader_index : int , pointer : _SubheaderPointer
485- ) -> None :
486- offset = pointer .offset
487- length = pointer .length
488-
489- if subheader_index == const .SASIndex .row_size_index :
490- processor = self ._process_rowsize_subheader
491- elif subheader_index == const .SASIndex .column_size_index :
492- processor = self ._process_columnsize_subheader
493- elif subheader_index == const .SASIndex .column_text_index :
494- processor = self ._process_columntext_subheader
495- elif subheader_index == const .SASIndex .column_name_index :
496- processor = self ._process_columnname_subheader
497- elif subheader_index == const .SASIndex .column_attributes_index :
498- processor = self ._process_columnattributes_subheader
499- elif subheader_index == const .SASIndex .format_and_label_index :
500- processor = self ._process_format_subheader
501- elif subheader_index == const .SASIndex .column_list_index :
502- processor = self ._process_columnlist_subheader
503- elif subheader_index == const .SASIndex .subheader_counts_index :
504- processor = self ._process_subheader_counts
505- elif subheader_index == const .SASIndex .data_subheader_index :
506- self ._current_page_data_subheader_pointers .append (pointer )
507- return
508- else :
509- raise ValueError ("unknown subheader index" )
446+ if (
447+ subheader_length == 0
448+ or subheader_compression == const .truncated_subheader_id
449+ ):
450+ continue
510451
511- processor (offset , length )
452+ subheader_signature = self ._read_bytes (subheader_offset , self ._int_length )
453+ subheader_index = get_subheader_index (subheader_signature )
454+ subheader_processor = self ._subheader_processors [subheader_index ]
455+
456+ if subheader_processor is None :
457+ f1 = (
458+ subheader_compression == const .compressed_subheader_id
459+ or subheader_compression == 0
460+ )
461+ f2 = subheader_type == const .compressed_subheader_type
462+ if self .compression and f1 and f2 :
463+ self ._current_page_data_subheader_pointers .append (
464+ (subheader_offset , subheader_length )
465+ )
466+ else :
467+ self .close ()
468+ raise ValueError (
469+ f"Unknown subheader signature { subheader_signature } "
470+ )
471+ else :
472+ subheader_processor (subheader_offset , subheader_length )
512473
513474 def _process_rowsize_subheader (self , offset : int , length : int ) -> None :
514475
@@ -523,10 +484,12 @@ def _process_rowsize_subheader(self, offset: int, length: int) -> None:
523484 lcp_offset += 378
524485
525486 self .row_length = self ._read_int (
526- offset + const .row_length_offset_multiplier * int_len , int_len
487+ offset + const .row_length_offset_multiplier * int_len ,
488+ int_len ,
527489 )
528490 self .row_count = self ._read_int (
529- offset + const .row_count_offset_multiplier * int_len , int_len
491+ offset + const .row_count_offset_multiplier * int_len ,
492+ int_len ,
530493 )
531494 self .col_count_p1 = self ._read_int (
532495 offset + const .col_count_p1_multiplier * int_len , int_len
0 commit comments