diff --git a/polars/polars-io/src/csv/utils.rs b/polars/polars-io/src/csv/utils.rs
index e61380c153858..061d046970fd6 100644
--- a/polars/polars-io/src/csv/utils.rs
+++ b/polars/polars-io/src/csv/utils.rs
@@ -267,16 +267,38 @@ pub fn infer_file_schema(
             }
             final_headers
         } else {
-            let mut column_names: Vec<String> = byterecord
-                .enumerate()
-                .map(|(i, _s)| format!("column_{}", i + 1))
-                .collect();
-            // needed because SplitLines does not return the \n char, so SplitFields does not catch
-            // the latest value if ending with a delimiter.
-            if header_line.ends_with(&[delimiter]) {
-                column_names.push(format!("column_{}", column_names.len() + 1))
-            }
-            column_names
+            // re-init lines so that the header line is included in column and type inference.
+            lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows);
+            let max_column_count = (&mut lines)
+                .take(max_read_lines.unwrap_or(usize::MAX))
+                .skip(skip_rows_after_header)
+                .map(|line| {
+                    let s = SplitFields::new(line, delimiter, quote_char, eol_char);
+                    // needed because SplitLines does not return the \n char, so SplitFields does not catch
+                    // the latest value if ending with a delimiter.
+                    if line.ends_with(&[delimiter]) {
+                        s.count() + 1
+                    } else {
+                        s.count()
+                    }
+                })
+                .max()
+                // Iterator can be empty if max_read_lines (infer_schema_length) is set to 0 or
+                // if there is only one line without an eol.
+                // A test for at least one existing line was already done when searching for the header.
+                .unwrap_or_else(|| {
+                    if header_line.ends_with(&[delimiter]) {
+                        byterecord.count() + 1
+                    } else {
+                        byterecord.count()
+                    }
+                });
+            // reset iterator
+            lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows);
+
+            (0..max_column_count)
+                .map(|i| format!("column_{}", i + 1))
+                .collect()
         }
     } else if has_header && !bytes.is_empty() {
         // there was no new line char. So we copy the whole buf and add one
@@ -302,10 +324,6 @@ pub fn infer_file_schema(
     } else {
         polars_bail!(NoData: "empty CSV");
     };
-    if !has_header {
-        // re-init lines so that the header is included in type inference.
-        lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows);
-    }
 
     let header_length = headers.len();
     // keep track of inferred field types
diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py
index 1da61d168bdd7..47454f3c30407 100644
--- a/py-polars/polars/io/csv/functions.py
+++ b/py-polars/polars/io/csv/functions.py
@@ -133,6 +133,8 @@ def read_csv(
         inferred dtype for those columns with ``dtypes``.
         If set to 0, all columns will be read as ``pl.Utf8``.
         If set to ``None``, a full table scan will be done (slow).
+        If ``has_header == False``, this value is also used to determine the
+        number of columns. If set to 0, only the first row will be used.
     batch_size
         Number of lines to read into the buffer at once.
         Modify this to change performance.
@@ -483,6 +485,8 @@ def read_csv_batched(
         Maximum number of lines to read to infer schema.
         If set to 0, all columns will be read as ``pl.Utf8``.
         If set to ``None``, a full table scan will be done (slow).
+        If ``has_header == False``, this value is also used to determine the
+        number of columns. If set to 0, only the first row will be used.
     batch_size
         Number of lines to read into the buffer at once.
@@ -768,6 +772,8 @@ def scan_csv(
         Maximum number of lines to read to infer schema.
         If set to 0, all columns will be read as ``pl.Utf8``.
         If set to ``None``, a full table scan will be done (slow).
+        If ``has_header == False``, this value is also used to determine the
+        number of columns. If set to 0, only the first row will be used.
     n_rows
         Stop reading from CSV file after reading ``n_rows``.
     encoding : {'utf8', 'utf8-lossy'}
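To make the documented interaction concrete, here is a hedged usage sketch. The input mirrors `csv_middle_column_longest` from the test diff below, and the asserted shapes assume the new inference behaves exactly as those tests state:

```python
import polars as pl

# Three rows with 3, 6, and 5 fields; no header row.
data = b"a,b,c\nd,e,f,g,1,i\nj,k,l,m,2\n"

# The widest row inside the inference window sets the column count,
# so the default scan finds 6 columns; short rows are padded with nulls.
assert pl.read_csv(data, has_header=False).width == 6

# With infer_schema_length=0 only the first row is inspected: 3 columns,
# all read as pl.Utf8, and the extra fields of longer rows are dropped.
assert pl.read_csv(data, has_header=False, infer_schema_length=0).width == 3
```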
diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py
index 5fbc27a4cd78d..d1366a17233e3 100644
--- a/py-polars/tests/unit/io/test_csv.py
+++ b/py-polars/tests/unit/io/test_csv.py
@@ -1246,3 +1246,55 @@ def some_multiline_str(n: int) -> str:
     f.seek(0)
 
     assert pl.read_csv(f, has_header=False).shape == (4, 3)
+
+
+def test_csv_no_header_variable_number_of_columns() -> None:
+    # 1505
+    csv_middle_column_longest = textwrap.dedent(
+        """\
+        a,b,c
+        d,e,f,g,1,i
+        j,k,l,m,2
+        """
+    ).encode()
+
+    expected_df = {
+        "column_1": ["a", "d", "j"],
+        "column_2": ["b", "e", "k"],
+        "column_3": ["c", "f", "l"],
+        "column_4": [None, "g", "m"],
+        "column_5": [None, 1, 2],
+        "column_6": [None, "i", None],
+    }
+
+    df = pl.read_csv(csv_middle_column_longest, has_header=False)
+    assert df.to_dict(False) == expected_df
+
+    df = pl.read_csv(csv_middle_column_longest, has_header=False, infer_schema_length=2)
+    assert df.to_dict(False) == expected_df
+
+    # with infer_schema_length == 0, only the first row is used
+    df = pl.read_csv(csv_middle_column_longest, has_header=False, infer_schema_length=0)
+    assert df.to_dict(False) == {
+        "column_1": ["a", "d", "j"],
+        "column_2": ["b", "e", "k"],
+        "column_3": ["c", "f", "l"],
+    }
+
+    csv_first_column_longest = textwrap.dedent(
+        """\
+        a,b,c,d,1,f
+        g,h,i
+        j,k,l,m,2
+        """
+    ).encode()
+
+    df = pl.read_csv(csv_first_column_longest, has_header=False)
+    assert df.to_dict(False) == {
+        "column_1": ["a", "g", "j"],
+        "column_2": ["b", "h", "k"],
+        "column_3": ["c", "i", "l"],
+        "column_4": ["d", None, "m"],
+        "column_5": [1, None, 2],
+        "column_6": ["f", None, None],
+    }
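For readers who want the Rust hunk's core logic in one place: below is a minimal pure-Python sketch of the width inference. The function name `infer_column_count` is hypothetical, and the sketch deliberately ignores quote handling, `skip_rows`/`skip_rows_after_header`, and carriage returns, all of which `SplitLines`/`SplitFields` handle in the real code:

```python
def infer_column_count(
    lines: list[bytes], delimiter: bytes = b",", max_read_lines: int | None = None
) -> int:
    """Maximum field count over the inference window."""
    window = lines if max_read_lines is None else lines[:max_read_lines]
    # With quoting ignored, `delimiter count + 1` equals SplitFields' field
    # count, including the trailing-delimiter edge case the Rust code
    # special-cases (SplitLines strips the eol, so b"a,b," has 3 fields).
    counts = [line.count(delimiter) + 1 for line in window]
    # An empty window (infer_schema_length == 0) falls back to the first
    # line, mirroring the `unwrap_or_else` branch over `byterecord`.
    return max(counts) if counts else lines[0].count(delimiter) + 1


assert infer_column_count([b"a,b,c", b"d,e,f,g,1,i", b"j,k,l,m,2"]) == 6
assert infer_column_count([b"a,b,c", b"d,e,f,g,1,i"], max_read_lines=0) == 3
```

The second `SplitLines::new(...)` in the patch then rewinds the line iterator so type inference re-reads the file from the first data row, which replaces the `if !has_header` re-init removed in the second Rust hunk.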