Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix spaces around CSV quoted strings #15727

Merged
Merged
28 changes: 28 additions & 0 deletions cpp/include/cudf/io/csv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ class csv_reader_options {
char _quotechar = '"';
// Whether a quote inside a value is double-quoted
bool _doublequote = true;
// Detect quotes even if surrounded by spaces e.g. ` "data" `
bool _updated_quotes_detection = false;
shrshi marked this conversation as resolved.
Show resolved Hide resolved
// Names of columns to read as datetime
std::vector<std::string> _parse_dates_names;
// Indexes of columns to read as datetime
Expand Down Expand Up @@ -375,6 +377,13 @@ class csv_reader_options {
*/
[[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; }

/**
 * @brief Reports whether quotes are detected even if surrounded by spaces e.g. ` "data" `
 *
 * @return `true` if updated_quotes_detection is enabled
 */
[[nodiscard]] bool is_enabled_updated_quotes_detection() const
{
  return _updated_quotes_detection;
}

/**
* @brief Returns names of columns to read as datetime.
*
Expand Down Expand Up @@ -698,6 +707,13 @@ class csv_reader_options {
*/
void enable_doublequote(bool val) { _doublequote = val; }

/**
 * @brief Enables or disables updated_quotes_detection
 *
 * @param val Boolean value to enable/disable
 */
void enable_updated_quotes_detection(bool val)
{
  _updated_quotes_detection = val;
}

/**
* @brief Sets names of columns to read as datetime.
*
Expand Down Expand Up @@ -1126,6 +1142,18 @@ class csv_reader_options_builder {
return *this;
}

/**
 * @brief Enables or disables updated_quotes_detection on the options being built
 *
 * @param val Boolean value to enable/disable
 * @return this for chaining
 */
csv_reader_options_builder& updated_quotes_detection(bool val)
{
  // Route through the options setter so the flag is written in a single place.
  options.enable_updated_quotes_detection(val);
  return *this;
}

/**
* @brief Sets names of columns to read as datetime.
*
Expand Down
17 changes: 14 additions & 3 deletions cpp/src/io/csv/csv_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -351,9 +351,20 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim)
if (dtypes[actual_col].id() == cudf::type_id::STRING) {
auto end = next_delimiter;
if (not options.keepquotes) {
if ((*field_start == options.quotechar) && (*(end - 1) == options.quotechar)) {
++field_start;
--end;
if (not options.updatedquotesdetection)
{
if ((*field_start == options.quotechar) && (*(end - 1) == options.quotechar)) {
++field_start;
--end;
}
} else {
// If the string is quoted, whitespace around the quotes gets removed as well
auto const trimmed_field =
shrshi marked this conversation as resolved.
Show resolved Hide resolved
trim_whitespaces(field_start, end);
if ((*trimmed_field.first == options.quotechar) && (*(trimmed_field.second-1) == options.quotechar)) {
field_start = trimmed_field.first+1;
end = trimmed_field.second-1;
}
}
}
auto str_list = static_cast<std::pair<char const*, size_t>*>(columns[actual_col]);
Expand Down
1 change: 1 addition & 0 deletions cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -953,6 +953,7 @@ parse_options make_parse_options(csv_reader_options const& reader_opts,
if (reader_opts.get_quotechar() != '\0' && reader_opts.get_quoting() != quote_style::NONE) {
parse_opts.quotechar = reader_opts.get_quotechar();
parse_opts.keepquotes = false;
parse_opts.updatedquotesdetection = reader_opts.is_enabled_updated_quotes_detection();
parse_opts.doublequote = reader_opts.is_enabled_doublequote();
} else {
parse_opts.quotechar = '\0';
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/io/utilities/parsing_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ struct parse_options_view {
char thousands;
char comment;
bool keepquotes;
bool updatedquotesdetection;
bool doublequote;
bool dayfirst;
bool skipblanklines;
Expand All @@ -80,6 +81,7 @@ struct parse_options {
char thousands;
char comment;
bool keepquotes;
bool updatedquotesdetection;
bool doublequote;
bool dayfirst;
bool skipblanklines;
Expand All @@ -105,6 +107,7 @@ struct parse_options {
thousands,
comment,
keepquotes,
updatedquotesdetection,
doublequote,
dayfirst,
skipblanklines,
Expand Down
41 changes: 41 additions & 0 deletions cpp/tests/io/csv_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1018,6 +1018,47 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored)
view.column(1));
}

// Verifies that, with updated_quotes_detection enabled, quotes are stripped from string
// fields even when the quoted value is surrounded by whitespace, while whitespace in
// unquoted fields and whitespace inside the quotes is preserved.
TEST_F(CsvReaderTest, StringsQuotesWhitespace)
{
  std::vector<std::string> names{"line", "verse"};

  // Use a file name unique to this test so it cannot collide with StringsQuotesIgnored.
  auto filepath = temp_env->get_temp_dir() + "StringsQuotesWhitespace.csv";
  {
    std::ofstream outfile(filepath, std::ofstream::out);
    outfile << names[0] << ',' << names[1] << '\n';
    outfile << "A,a" << '\n';         // unquoted no whitespace
    outfile << " B,b" << '\n';        // unquoted leading whitespace
    outfile << "C ,c" << '\n';        // unquoted trailing whitespace
    outfile << " D ,d" << '\n';       // unquoted leading and trailing whitespace
    outfile << "\"E\",e" << '\n';     // quoted no whitespace
    outfile << "\"F\" ,f" << '\n';    // quoted trailing whitespace
    outfile << " \"G\",g" << '\n';    // quoted leading whitespace
    outfile << " \"H\" ,h" << '\n';   // quoted leading and trailing whitespace
    // quoted leading and trailing whitespace with spaces inside quotes
    outfile << " \" I \" ,i" << '\n';
  }

  cudf::io::csv_reader_options in_opts =
    cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath})
      .names(names)
      .dtypes(std::vector<data_type>{dtype<cudf::string_view>(), dtype<cudf::string_view>()})
      .quoting(cudf::io::quote_style::ALL)
      .doublequote(false)
      .updated_quotes_detection(true);
  auto result = cudf::io::read_csv(in_opts);

  auto const view = result.tbl->view();
  // Fatal assertion: the column accesses below are only valid if both columns exist.
  ASSERT_EQ(2, view.num_columns());
  ASSERT_EQ(type_id::STRING, view.column(0).type().id());
  ASSERT_EQ(type_id::STRING, view.column(1).type().id());

  expect_column_data_equal(
    std::vector<std::string>{"A", " B", "C ", " D ", "E", "F", "G", "H", " I "},
    view.column(0));
  expect_column_data_equal(
    std::vector<std::string>{"a", "b", "c", "d", "e", "f", "g", "h", "i"},
    view.column(1));
}

TEST_F(CsvReaderTest, SkiprowsNrows)
{
auto filepath = temp_env->get_temp_dir() + "SkiprowsNrows.csv";
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/cpp/io/csv.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ cdef extern from "cudf/io/csv.hpp" \
bool is_enabled_skip_blank_lines() except +
cudf_io_types.quote_style get_quoting() except +
char get_quotechar() except +
bool is_enabled_doublequote() except +
bool is_enabled_updated_quotes_detection() except +
vector[string] get_parse_dates_names() except +
vector[int] get_parse_dates_indexes() except +
vector[string] get_parse_hex_names() except +
Expand Down Expand Up @@ -95,6 +95,7 @@ cdef extern from "cudf/io/csv.hpp" \
void set_quoting(cudf_io_types.quote_style style) except +
void set_quotechar(char val) except +
void set_doublequote(bool val) except +
void set_updated_quotes_detection(bool val) except +
void set_parse_dates(vector[string]) except +
void set_parse_dates(vector[int]) except +
void set_parse_hex(vector[string]) except +
Expand Down Expand Up @@ -163,6 +164,7 @@ cdef extern from "cudf/io/csv.hpp" \
) except +
csv_reader_options_builder& quotechar(char val) except +
csv_reader_options_builder& doublequote(bool val) except +
csv_reader_options_builder& updated_quotes_detection(bool val) except +
csv_reader_options_builder& parse_dates(vector[string]) except +
csv_reader_options_builder& parse_dates(vector[int]) except +

Expand Down
Loading