Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Whitespace normalization of nested column coerced as string column in JSONL inputs #16759

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
8e87b60
post tree whitespace normalization
shrshi Sep 5, 2024
30603f1
formatting
shrshi Sep 5, 2024
e45587d
Merge branch 'branch-24.10' into json-whitespace-normalization-post
shrshi Sep 5, 2024
a6ca1b8
removed unnecessary copy
shrshi Sep 5, 2024
88f06e4
formatting
shrshi Sep 5, 2024
b04c6a8
Merge branch 'json-whitespace-normalization-post' of github.com:shrsh…
shrshi Sep 5, 2024
d4189c5
addressed reviews - 1
shrshi Sep 6, 2024
cd8a840
added more null rows to the test example
shrshi Sep 6, 2024
81beb04
forced column as string impl
shrshi Sep 10, 2024
49b5f26
formatting
shrshi Sep 10, 2024
8f43a05
Merge branch 'branch-24.10' into json-whitespace-normalization-post
shrshi Sep 10, 2024
274f48f
replace mixed type as string with prune column
shrshi Sep 10, 2024
db9c783
formatting
shrshi Sep 10, 2024
70c6a70
Merge branch 'json-whitespace-normalization-post' of github.com:shrsh…
shrshi Sep 10, 2024
d801111
addressing pr reviews - part 1
shrshi Sep 10, 2024
e63faaa
formatting
shrshi Sep 10, 2024
110856d
addressing pr reviews - part 2
shrshi Sep 10, 2024
8728964
formatting
shrshi Sep 10, 2024
c1842e2
merge
shrshi Sep 10, 2024
a6d1646
added check for whitespace normalization
shrshi Sep 10, 2024
87fca8e
removing all old code
shrshi Sep 11, 2024
e76d74e
formatting
shrshi Sep 11, 2024
9921a26
Merge branch 'branch-24.10' into json-whitespace-normalization-post
shrshi Sep 11, 2024
55dbe92
addressing PR reviews
shrshi Sep 17, 2024
d4a2135
formatting
shrshi Sep 17, 2024
bdf3c19
Merge branch 'json-whitespace-normalization-post' of github.com:shrsh…
shrshi Sep 17, 2024
efd75b3
addressing PR reviews
shrshi Sep 17, 2024
85f5427
formatting
shrshi Sep 17, 2024
993eb15
more pr reviews
shrshi Sep 17, 2024
00a650e
formatting
shrshi Sep 17, 2024
39bedb8
merge
shrshi Sep 17, 2024
0c18a3a
formatting
shrshi Sep 17, 2024
7412bbd
simplifying namespace and variable names
shrshi Sep 18, 2024
b359e80
changing stencil to bool
shrshi Sep 18, 2024
d12ff96
Merge branch 'branch-24.10' into json-whitespace-normalization-post
karthikeyann Sep 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
formatting
  • Loading branch information
shrshi committed Sep 10, 2024
commit 49b5f26610ccfdb286c6e1c7a8fb88ef43e5347c
15 changes: 8 additions & 7 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,7 @@ void make_device_json_column(device_span<SymbolT const> input,
std::vector<uint8_t> is_mixed_type_column(num_columns, 0);
std::vector<uint8_t> is_pruned(num_columns, 0);
// for columns that are not mixed type but have been forced as string
std::vector<uint8_t> forced_as_string_column(num_columns, 0);
std::vector<uint8_t> forced_as_string_column(num_columns, 0);
shrshi marked this conversation as resolved.
Show resolved Hide resolved
columns.try_emplace(parent_node_sentinel, std::ref(root));

std::function<void(NodeIndexT, device_json_column&)> remove_child_columns =
Expand Down Expand Up @@ -703,10 +703,11 @@ void make_device_json_column(device_span<SymbolT const> input,
// Struct, List, String, Value
auto [name, parent_col_id] = name_and_parent_index(this_col_id);

// if parent is mixed type column or this column is pruned or if parent
// if parent is mixed type column or this column is pruned or if parent
// has been forced as string, ignore this column.
if (parent_col_id != parent_node_sentinel &&
(is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || forced_as_string_column[parent_col_id]) {
(is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) ||
forced_as_string_column[parent_col_id]) {
ignore_vals[this_col_id] = 1;
if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; }
if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = 1; }
shrshi marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -794,7 +795,7 @@ void make_device_json_column(device_span<SymbolT const> input,
if ((column_categories[this_col_id] == NC_STRUCT or
column_categories[this_col_id] == NC_LIST) and
user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) {
this_column_category = NC_STR;
this_column_category = NC_STR;
}

CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name);
Expand All @@ -804,8 +805,8 @@ void make_device_json_column(device_span<SymbolT const> input,
if ((column_categories[this_col_id] == NC_STRUCT or
column_categories[this_col_id] == NC_LIST) and
user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) {
//std::printf("this_col_id forced as string = %d\n", this_col_id);
col.forced_as_string_column = true;
// std::printf("this_col_id forced as string = %d\n", this_col_id);
shrshi marked this conversation as resolved.
Show resolved Hide resolved
col.forced_as_string_column = true;
forced_as_string_column[this_col_id] = 1;
shrshi marked this conversation as resolved.
Show resolved Hide resolved
}

Expand Down Expand Up @@ -842,7 +843,7 @@ void make_device_json_column(device_span<SymbolT const> input,
auto parent_col_id = column_parent_ids[this_col_id];
if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id] == 1) {
shrshi marked this conversation as resolved.
Show resolved Hide resolved
forced_as_string_column[this_col_id] = 1;
shrshi marked this conversation as resolved.
Show resolved Hide resolved
ignore_vals[this_col_id] = 1;
ignore_vals[this_col_id] = 1;
}
// Convert only mixed type columns to string (so as to copy them), but not their children
if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id] == 0 and
Expand Down
22 changes: 12 additions & 10 deletions cpp/tests/io/json/json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2784,9 +2784,9 @@ TEST_F(JsonReaderTest, JsonDtypeSchema)
{"a": 1, "b": {"0": "lolol "}, "c": true}
)";


std::map<std::string, cudf::io::schema_element> dtype_schema{
{"c", {data_type{type_id::STRING}}}, {"b", {data_type{type_id::STRING}}}, {"a", {dtype<double>()}}};
std::map<std::string, cudf::io::schema_element> dtype_schema{{"c", {data_type{type_id::STRING}}},
{"b", {data_type{type_id::STRING}}},
{"a", {dtype<double>()}}};
cudf::io::json_reader_options in_options =
cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
.dtypes(dtype_schema)
Expand All @@ -2806,15 +2806,17 @@ TEST_F(JsonReaderTest, JsonDtypeSchema)
EXPECT_EQ(result.metadata.schema_info[1].name, "b");
EXPECT_EQ(result.metadata.schema_info[2].name, "c");

//cudf::column::contents contents = result.tbl->get_column(1).release();
// cudf::column::contents contents = result.tbl->get_column(1).release();
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), float64_wrapper{{1, 1, 1}});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), cudf::test::strings_column_wrapper({
"{\"0\": \"abc\", \"1\": [\"a\", \"b\"]}",
"{\"0\": \"abc\" }",
"{\"0\": \"lolol \"}"
}), cudf::test::debug_output_level::ALL_ERRORS);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(
result.tbl->get_column(1),
cudf::test::strings_column_wrapper({"{\"0\": \"abc\", \"1\": [\"a\", \"b\"]}",
"{\"0\": \"abc\" }",
"{\"0\": \"lolol \"}"}),
cudf::test::debug_output_level::ALL_ERRORS);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2),
cudf::test::strings_column_wrapper({"true", "false", "true"}), cudf::test::debug_output_level::ALL_ERRORS);
cudf::test::strings_column_wrapper({"true", "false", "true"}),
cudf::test::debug_output_level::ALL_ERRORS);
}

CUDF_TEST_PROGRAM_MAIN()
Loading