Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix all null list column with missing child column in JSON reader #17348

Merged
merged 25 commits into from
Dec 6, 2024
Merged
Changes from 1 commit
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
67691c4
expose make_empty_column
karthikeyann Nov 16, 2024
d735731
fix all null list with empty nested child on missing list child column
karthikeyann Nov 16, 2024
d15300a
add unit test
karthikeyann Nov 16, 2024
9c7e692
Update cpp/src/io/json/json_column.cu
karthikeyann Nov 16, 2024
6e3e2bf
fix clang-format
karthikeyann Nov 18, 2024
6939da2
add comments on list, struct factories to avoid purge nulls
karthikeyann Nov 18, 2024
cdce1cb
cleanup tests
karthikeyann Nov 18, 2024
858687a
Merge branch 'branch-24.12' into fix-json_pruned_empty_list_column
karthikeyann Nov 18, 2024
ba5291d
fix unit test
karthikeyann Nov 18, 2024
39c24ff
return all_null list column, if no child column present
karthikeyann Nov 18, 2024
b8632e0
Merge branch 'branch-24.12' into fix-json_pruned_empty_list_column
karthikeyann Nov 20, 2024
0bcf84d
Merge branch 'branch-24.12' into fix-json_pruned_empty_list_column
karthikeyann Nov 25, 2024
9d475aa
make mixed child list as null (spark)
karthikeyann Nov 25, 2024
6c9083e
nullify mixed types only if experimental
karthikeyann Nov 25, 2024
ff80733
fix no child case, update unit test
karthikeyann Nov 26, 2024
78af57e
address review comments
karthikeyann Dec 3, 2024
0dbd99f
Merge branch 'branch-25.02' into fix-json_pruned_empty_list_column
karthikeyann Dec 3, 2024
9be71ca
empty column will skip purge nulls
karthikeyann Dec 4, 2024
862ad76
use thrust::distance
karthikeyann Dec 4, 2024
bcd9b28
style fix
karthikeyann Dec 4, 2024
0c181bb
Merge branch 'branch-25.02' into fix-json_pruned_empty_list_column
karthikeyann Dec 4, 2024
b983e53
Merge branch 'branch-25.02' into fix-json_pruned_empty_list_column
karthikeyann Dec 5, 2024
3d9bbda
fix typo and style
karthikeyann Dec 5, 2024
f76c53c
remove duplicate line
karthikeyann Dec 6, 2024
125fe35
Merge branch 'branch-25.02' into fix-json_pruned_empty_list_column
karthikeyann Dec 6, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 22 additions & 20 deletions cpp/src/io/json/host_tree_algorithms.cu
Original file line number Diff line number Diff line change
Expand Up @@ -947,10 +947,12 @@ void scatter_offsets(tree_meta_t const& tree,
});
// For children of list and in ignore_vals, find its parent node id, and set corresponding
// parent's null mask to null. Setting mixed type list rows to null.
auto const num_list_children = thrust::distance(
thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), list_children_end);
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
thrust::for_each_n(
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::distance(thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin(), list_children_end),
num_list_children,
[node_ids = node_ids.begin(),
parent_node_ids = tree.parent_node_ids.begin(),
column_categories = d_column_tree.node_categories.begin(),
Expand All @@ -959,13 +961,13 @@ void scatter_offsets(tree_meta_t const& tree,
d_is_mixed_pruned = d_is_mixed_pruned.begin(),
d_ignore_vals = d_ignore_vals.begin(),
d_columns_data = d_columns_data.begin()] __device__(size_type i) {
auto const node_id = node_ids[i];
auto const parent_node_id = parent_node_ids[node_id];
if (parent_node_id == parent_node_sentinel or d_ignore_vals[col_ids[parent_node_id]]) return;
if (column_categories[col_ids[parent_node_id]] == NC_LIST and
d_is_mixed_pruned[col_ids[node_id]]) {
clear_bit(d_columns_data[col_ids[parent_node_id]].validity, row_offsets[parent_node_id]);
}
auto const node_id = node_ids[i];
auto const parent_node_id = parent_node_ids[node_id];
if (parent_node_id == parent_node_sentinel or d_ignore_vals[col_ids[parent_node_id]]) return;
if (column_categories[col_ids[parent_node_id]] == NC_LIST and
d_is_mixed_pruned[col_ids[node_id]]) {
clear_bit(d_columns_data[col_ids[parent_node_id]].validity, row_offsets[parent_node_id]);
}
});

auto const num_list_children =
Expand All @@ -985,18 +987,18 @@ void scatter_offsets(tree_meta_t const& tree,
row_offsets = row_offsets.begin(),
d_columns_data = d_columns_data.begin(),
num_list_children] __device__(size_type i) {
auto const node_id = node_ids[i];
auto const parent_node_id = parent_node_ids[node_id];
// scatter to list_offset
if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) {
d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] =
row_offsets[node_id];
}
// last value of list child_offset is its size.
if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) {
d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] =
row_offsets[node_id] + 1;
}
auto const node_id = node_ids[i];
auto const parent_node_id = parent_node_ids[node_id];
// scatter to list_offset
if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) {
d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] =
row_offsets[node_id];
}
// last value of list child_offset is its size.
if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) {
d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] =
row_offsets[node_id] + 1;
}
});

// 5. scan on offsets.
Expand Down
Loading