feat(rust,python): csv with variable number of columns
Infer the number of columns of a header-less CSV from the same group
of rows that is used to infer the column types.
The old logic, which counts only the columns in the first row, is used
if `infer_schema_length` is set to 0.

Closes pola-rs#1505
cannero committed Mar 20, 2023
1 parent 8364186 commit 878a5b0
Showing 3 changed files with 90 additions and 14 deletions.
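Before the diffs, a minimal sketch of the behavior change (the data and
expected shapes are lifted from the test added below; a polars build that
includes this commit is assumed):

import polars as pl

# Ragged, header-less CSV where the widest row is not the first one.
data = b"a,b,c\nd,e,f,g,1,i\nj,k,l,m,2\n"

# New default: the column count is inferred from the same rows that are
# scanned for dtype inference, so all six columns are found.
assert pl.read_csv(data, has_header=False).shape == (3, 6)

# The old behavior remains reachable: with infer_schema_length=0 only
# the first row is counted, leaving three columns.
assert pl.read_csv(data, has_header=False, infer_schema_length=0).shape == (3, 3)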
46 changes: 32 additions & 14 deletions polars/polars-io/src/csv/utils.rs
@@ -267,16 +267,38 @@ pub fn infer_file_schema(
             }
             final_headers
         } else {
-            let mut column_names: Vec<String> = byterecord
-                .enumerate()
-                .map(|(i, _s)| format!("column_{}", i + 1))
-                .collect();
-            // needed because SplitLines does not return the \n char, so SplitFields does not catch
-            // the latest value if ending with a delimiter.
-            if header_line.ends_with(&[delimiter]) {
-                column_names.push(format!("column_{}", column_names.len() + 1))
-            }
-            column_names
+            // re-init lines so that the header line is included in column and type inference.
+            lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows);
+            let max_column_count = (&mut lines)
+                .take(max_read_lines.unwrap_or(usize::MAX))
+                .skip(skip_rows_after_header)
+                .map(|line| {
+                    let s = SplitFields::new(line, delimiter, quote_char, eol_char);
+                    // needed because SplitLines does not return the \n char, so SplitFields does not catch
+                    // the latest value if ending with a delimiter.
+                    if line.ends_with(&[delimiter]) {
+                        s.count() + 1
+                    } else {
+                        s.count()
+                    }
+                })
+                .max()
+                // Iterator can be empty if max_read_lines (infer_schema_length) is set to 0 or
+                // if there is only one line without an eol.
+                // A test for at least one existing line was already done when searching for the header.
+                .unwrap_or_else(|| {
+                    if header_line.ends_with(&[delimiter]) {
+                        byterecord.count() + 1
+                    } else {
+                        byterecord.count()
+                    }
+                });
+            // reset iterator
+            lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows);
+
+            (0..max_column_count)
+                .map(|i| format!("column_{}", i + 1))
+                .collect()
         }
     } else if has_header && !bytes.is_empty() {
         // there was no new line char. So we copy the whole buf and add one
@@ -302,10 +302,6 @@ pub fn infer_file_schema(
     } else {
         polars_bail!(NoData: "empty CSV");
     };
-    if !has_header {
-        // re-init lines so that the header is included in type inference.
-        lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows);
-    }
 
     let header_length = headers.len();
     // keep track of inferred field types
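For readers skimming the Rust, here is the same column-count logic restated
as a hypothetical Python sketch — infer_column_count is an illustrative name,
not polars API, and the quoting/escaping that SplitFields handles is ignored:

def infer_column_count(rows: list[bytes], infer_schema_length: int | None) -> int:
    # Count columns over the same window of rows that dtype inference scans.
    window = rows if infer_schema_length is None else rows[:infer_schema_length]
    # Python's split() already yields a trailing empty field for b"a,b,",
    # so the "+ 1" correction needed with SplitFields does not apply here.
    counts = [len(line.split(b",")) for line in window]
    # Empty window (infer_schema_length == 0): fall back to the first row,
    # mirroring the unwrap_or_else branch above.
    return max(counts) if counts else len(rows[0].split(b","))

rows = [b"a,b,c", b"d,e,f,g,1,i", b"j,k,l,m,2"]
assert infer_column_count(rows, None) == 6
assert infer_column_count(rows, 0) == 3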
6 changes: 6 additions & 0 deletions py-polars/polars/io/csv/functions.py
@@ -133,6 +133,8 @@ def read_csv(
         inferred dtype for those columns with ``dtypes``.
         If set to 0, all columns will be read as ``pl.Utf8``.
         If set to ``None``, a full table scan will be done (slow).
+        If ``has_header == False``, this value is also used to determine the
+        number of columns. If set to 0, only the first row will be used.
     batch_size
         Number of lines to read into the buffer at once.
         Modify this to change performance.
@@ -483,6 +485,8 @@ def read_csv_batched(
         Maximum number of lines to read to infer schema.
         If set to 0, all columns will be read as ``pl.Utf8``.
         If set to ``None``, a full table scan will be done (slow).
+        If ``has_header == False``, this value is also used to determine the
+        number of columns. If set to 0, only the first row will be used.
     batch_size
         Number of lines to read into the buffer at once.
         Modify this to change performance.
@@ -768,6 +772,8 @@ def scan_csv(
         Maximum number of lines to read to infer schema.
         If set to 0, all columns will be read as ``pl.Utf8``.
         If set to ``None``, a full table scan will be done (slow).
+        If ``has_header == False``, this value is also used to determine the
+        number of columns. If set to 0, only the first row will be used.
     n_rows
         Stop reading from CSV file after reading ``n_rows``.
     encoding : {'utf8', 'utf8-lossy'}
52 changes: 52 additions & 0 deletions py-polars/tests/unit/io/test_csv.py
@@ -1246,3 +1246,55 @@ def some_multiline_str(n: int) -> str:
 
     f.seek(0)
     assert pl.read_csv(f, has_header=False).shape == (4, 3)
+
+
+def test_csv_no_header_variable_number_of_columns() -> None:
+    # 1505
+    csv_middle_column_longest = textwrap.dedent(
+        """\
+        a,b,c
+        d,e,f,g,1,i
+        j,k,l,m,2
+        """
+    ).encode()
+
+    expected_df = {
+        "column_1": ["a", "d", "j"],
+        "column_2": ["b", "e", "k"],
+        "column_3": ["c", "f", "l"],
+        "column_4": [None, "g", "m"],
+        "column_5": [None, 1, 2],
+        "column_6": [None, "i", None],
+    }
+
+    df = pl.read_csv(csv_middle_column_longest, has_header=False)
+    assert df.to_dict(False) == expected_df
+
+    df = pl.read_csv(csv_middle_column_longest, has_header=False, infer_schema_length=2)
+    assert df.to_dict(False) == expected_df
+
+    # with infer_schema_length == 0, only the first row is used
+    df = pl.read_csv(csv_middle_column_longest, has_header=False, infer_schema_length=0)
+    assert df.to_dict(False) == {
+        "column_1": ["a", "d", "j"],
+        "column_2": ["b", "e", "k"],
+        "column_3": ["c", "f", "l"],
+    }
+
+    csv_first_column_longest = textwrap.dedent(
+        """\
+        a,b,c,d,1,f
+        g,h,i
+        j,k,l,m,2
+        """
+    ).encode()
+
+    df = pl.read_csv(csv_first_column_longest, has_header=False)
+    assert df.to_dict(False) == {
+        "column_1": ["a", "g", "j"],
+        "column_2": ["b", "h", "k"],
+        "column_3": ["c", "i", "l"],
+        "column_4": ["d", None, "m"],
+        "column_5": [1, None, 2],
+        "column_6": ["f", None, None],
+    }
