feat(rust,python): csv with variable number of columns
Infer the number of columns of a header-less CSV from the same group
of rows that is used to infer the column types.
The old logic, which counts only the columns in the first row, is used
if `infer_schema_length` is set to 0.

Closes pola-rs#1505
cannero committed Mar 20, 2023
1 parent 8364186 commit 878a5b0
Showing 3 changed files with 90 additions and 14 deletions.
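Before the diffs, a minimal sketch of the behavior change (the data and
expected shapes are lifted from the test added below; a polars build that
includes this commit is assumed):

import polars as pl

# Ragged, header-less CSV where the widest row is not the first one.
data = b"a,b,c\nd,e,f,g,1,i\nj,k,l,m,2\n"

# New default: the column count is inferred from the same rows that are
# scanned for dtype inference, so all six columns are found.
assert pl.read_csv(data, has_header=False).shape == (3, 6)

# The old behavior remains reachable: with infer_schema_length=0 only
# the first row is counted, leaving three columns.
assert pl.read_csv(data, has_header=False, infer_schema_length=0).shape == (3, 3)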
46 changes: 32 additions & 14 deletions polars/polars-io/src/csv/utils.rs
@@ -267,16 +267,38 @@ pub fn infer_file_schema(
             }
             final_headers
         } else {
-            let mut column_names: Vec<String> = byterecord
-                .enumerate()
-                .map(|(i, _s)| format!("column_{}", i + 1))
-                .collect();
-            // needed because SplitLines does not return the \n char, so SplitFields does not catch
-            // the latest value if ending with a delimiter.
-            if header_line.ends_with(&[delimiter]) {
-                column_names.push(format!("column_{}", column_names.len() + 1))
-            }
-            column_names
+            // re-init lines so that the header line is included in column and type inference.
+            lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows);
+            let max_column_count = (&mut lines)
+                .take(max_read_lines.unwrap_or(usize::MAX))
+                .skip(skip_rows_after_header)
+                .map(|line| {
+                    let s = SplitFields::new(line, delimiter, quote_char, eol_char);
+                    // needed because SplitLines does not return the \n char, so SplitFields does not catch
+                    // the latest value if ending with a delimiter.
+                    if line.ends_with(&[delimiter]) {
+                        s.count() + 1
+                    } else {
+                        s.count()
+                    }
+                })
+                .max()
+                // Iterator can be empty if max_read_lines (infer_schema_length) is set to 0 or
+                // if there is only one line without an eol.
+                // A test for at least one existing line was already done when searching for the header.
+                .unwrap_or_else(|| {
+                    if header_line.ends_with(&[delimiter]) {
+                        byterecord.count() + 1
+                    } else {
+                        byterecord.count()
+                    }
+                });
+            // reset iterator
+            lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows);
+
+            (0..max_column_count)
+                .map(|i| format!("column_{}", i + 1))
+                .collect()
         }
     } else if has_header && !bytes.is_empty() {
         // there was no new line char. So we copy the whole buf and add one
@@ -302,10 +302,6 @@ pub fn infer_file_schema(
     } else {
         polars_bail!(NoData: "empty CSV");
     };
-    if !has_header {
-        // re-init lines so that the header is included in type inference.
-        lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows);
-    }
 
     let header_length = headers.len();
     // keep track of inferred field types
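For readers skimming the Rust, here is the same column-count logic restated
as a hypothetical Python sketch — infer_column_count is an illustrative name,
not polars API, and the quoting/escaping that SplitFields handles is ignored:

def infer_column_count(rows: list[bytes], infer_schema_length: int | None) -> int:
    # Count columns over the same window of rows that dtype inference scans.
    window = rows if infer_schema_length is None else rows[:infer_schema_length]
    # Python's split() already yields a trailing empty field for b"a,b,",
    # so the "+ 1" correction needed with SplitFields does not apply here.
    counts = [len(line.split(b",")) for line in window]
    # Empty window (infer_schema_length == 0): fall back to the first row,
    # mirroring the unwrap_or_else branch above.
    return max(counts) if counts else len(rows[0].split(b","))

rows = [b"a,b,c", b"d,e,f,g,1,i", b"j,k,l,m,2"]
assert infer_column_count(rows, None) == 6
assert infer_column_count(rows, 0) == 3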
6 changes: 6 additions & 0 deletions py-polars/polars/io/csv/functions.py
@@ -133,6 +133,8 @@ def read_csv(
         inferred dtype for those columns with ``dtypes``.
         If set to 0, all columns will be read as ``pl.Utf8``.
         If set to ``None``, a full table scan will be done (slow).
+        If ``has_header == False``, this value is also used to determine the
+        number of columns. If set to 0, only the first row will be used.
     batch_size
         Number of lines to read into the buffer at once.
         Modify this to change performance.
@@ -483,6 +485,8 @@ def read_csv_batched(
         Maximum number of lines to read to infer schema.
         If set to 0, all columns will be read as ``pl.Utf8``.
         If set to ``None``, a full table scan will be done (slow).
+        If ``has_header == False``, this value is also used to determine the
+        number of columns. If set to 0, only the first row will be used.
     batch_size
         Number of lines to read into the buffer at once.
         Modify this to change performance.
@@ -768,6 +772,8 @@ def scan_csv(
         Maximum number of lines to read to infer schema.
         If set to 0, all columns will be read as ``pl.Utf8``.
         If set to ``None``, a full table scan will be done (slow).
+        If ``has_header == False``, this value is also used to determine the
+        number of columns. If set to 0, only the first row will be used.
     n_rows
         Stop reading from CSV file after reading ``n_rows``.
     encoding : {'utf8', 'utf8-lossy'}
52 changes: 52 additions & 0 deletions py-polars/tests/unit/io/test_csv.py
@@ -1246,3 +1246,55 @@ def some_multiline_str(n: int) -> str:
 
     f.seek(0)
     assert pl.read_csv(f, has_header=False).shape == (4, 3)
+
+
+def test_csv_no_header_variable_number_of_columns() -> None:
+    # 1505
+    csv_middle_column_longest = textwrap.dedent(
+        """\
+        a,b,c
+        d,e,f,g,1,i
+        j,k,l,m,2
+        """
+    ).encode()
+
+    expected_df = {
+        "column_1": ["a", "d", "j"],
+        "column_2": ["b", "e", "k"],
+        "column_3": ["c", "f", "l"],
+        "column_4": [None, "g", "m"],
+        "column_5": [None, 1, 2],
+        "column_6": [None, "i", None],
+    }
+
+    df = pl.read_csv(csv_middle_column_longest, has_header=False)
+    assert df.to_dict(False) == expected_df
+
+    df = pl.read_csv(csv_middle_column_longest, has_header=False, infer_schema_length=2)
+    assert df.to_dict(False) == expected_df
+
+    # with infer_schema_length == 0, only the first row is used
+    df = pl.read_csv(csv_middle_column_longest, has_header=False, infer_schema_length=0)
+    assert df.to_dict(False) == {
+        "column_1": ["a", "d", "j"],
+        "column_2": ["b", "e", "k"],
+        "column_3": ["c", "f", "l"],
+    }
+
+    csv_first_column_longest = textwrap.dedent(
+        """\
+        a,b,c,d,1,f
+        g,h,i
+        j,k,l,m,2
+        """
+    ).encode()
+
+    df = pl.read_csv(csv_first_column_longest, has_header=False)
+    assert df.to_dict(False) == {
+        "column_1": ["a", "g", "j"],
+        "column_2": ["b", "h", "k"],
+        "column_3": ["c", "i", "l"],
+        "column_4": ["d", None, "m"],
+        "column_5": [1, None, 2],
+        "column_6": ["f", None, None],
+    }
