Skip to content

Commit

Permalink
[airbyte-cdk] Increase the maximum parseable field size for CSV files
Browse files Browse the repository at this point in the history
  • Loading branch information
blarghmatey authored May 7, 2024
1 parent 7f70ac4 commit 18c9ebc
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,11 @@ def _skip_rows(fp: IOBase, rows_to_skip: int) -> None:
class CsvParser(FileTypeParser):
_MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000

def __init__(self, csv_reader: Optional[_CsvReader] = None):
def __init__(self, csv_reader: Optional[_CsvReader] = None, csv_field_max_bytes: int = 2**31):
    """
    Create the parser and raise the csv module's maximum field size.

    :param csv_reader: reader used for the CSV data; a new _CsvReader is created when none (or a falsy value) is given
    :param csv_field_max_bytes: requested value for csv.field_size_limit. If the platform cannot represent it, the
        largest value reached by repeated halving that the platform accepts is used instead.
    """
    # Increase the maximum length of data that can be parsed in a single CSV field. The default is 128k, which is typically sufficient
    # but given the use of Airbyte in loading a large variety of data it is best to allow for a larger maximum field size to avoid
    # skipping data on load. https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072
    # NOTE: csv.field_size_limit is a setting of the csv module itself, so it applies to every csv reader in the process.
    field_limit = csv_field_max_bytes
    while True:
        try:
            csv.field_size_limit(field_limit)
            break
        except OverflowError:
            # The limit is stored in a C long, which is only 32 bits wide on some platforms (e.g. Windows); there the
            # default of 2**31 does not fit. Halve the request until it is accepted instead of failing construction.
            field_limit //= 2
    self._csv_reader = csv_reader if csv_reader else _CsvReader()

def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,25 @@ def test_given_too_few_values_for_columns_when_read_data_then_raise_exception_an
next(data_generator)
assert new_dialect not in csv.list_dialects()

def test_parse_field_size_larger_than_default_python_maximum(self) -> None:
    """A quoted field larger than the csv module's stock 128 KiB limit must be read back unchanged."""
    # Initializing the CsvParser class raises the csv module's field size limit as a side effect; check that it is in place.
    assert csv.field_size_limit() == 2**31

    default_limit_bytes = 128 * 1024
    long_string = "a" * (130 * 1024)
    assert len(long_string.encode("utf-8")) > default_limit_bytes

    # Two-line CSV: a header row and a single record whose second column holds the oversized value.
    csv_rows = [
        "header1,header2",
        f'1,"{long_string}"',
    ]
    csv_file = CsvFileBuilder().with_data(csv_rows).build()
    self._stream_reader.open_file.return_value = csv_file

    expected_records = [{"header1": "1", "header2": long_string}]
    assert list(self._read_data()) == expected_records

def _read_data(self) -> Generator[Dict[str, str], None, None]:
data_generator = self._csv_reader.read_data(
self._config,
Expand Down

0 comments on commit 18c9ebc

Please sign in to comment.