diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 811b32676fc6a..9cbfbcda0e4af 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -298,6 +298,7 @@ Bug Fixes - Bug ``GroupBy.size`` doesn't attach index name properly if grouped by ``TimeGrouper`` (:issue:`9925`) - Bug causing an exception in slice assignments because ``length_of_indexer`` returns wrong results (:issue:`9995`) - Bug in csv parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`) +- Bug in C csv parser causing spurious NaNs when data started with newline followed by whitespace. (:issue:`10022`) - Bug causing elements with a null group to spill into the final group when grouping by a ``Categorical`` (:issue:`9603`) - Bug where .iloc and .loc behavior is not consistent on empty dataframes (:issue:`9964`) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 59fb3f14de8d2..7d52c6ad4cb3b 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2287,6 +2287,12 @@ def test_single_char_leading_whitespace(self): result = self.read_csv(StringIO(data), skipinitialspace=True) tm.assert_frame_equal(result, expected) + def test_chunk_begins_with_newline_whitespace(self): + # GH 10022 + data = '\n hello\nworld\n' + result = self.read_csv(StringIO(data), header=None) + self.assertEqual(len(result), 2) + class TestPythonParser(ParserTests, tm.TestCase): def test_negative_skipfooter_raises(self): diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index e7b5db9c5e361..3be17f17d6afa 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -854,7 +854,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit) --i; } while (i + 1 > self->datapos && *buf != '\n'); - if (i + 1 > self->datapos) // reached a newline rather than the beginning + if (*buf == '\n') // reached a newline rather than the beginning { ++buf; // move pointer to first char after newline ++i; @@ -1172,7 +1172,7 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) --i; } while (i + 1 > self->datapos && *buf != self->lineterminator); - if (i + 1 > self->datapos) // reached a newline rather than the beginning + if (*buf == self->lineterminator) // reached a newline rather than the beginning { ++buf; // move pointer to first char after newline ++i;