Adding support for explode to cuDF (#7140)
This adds an operation that expands the elements of a list column into new rows and duplicates the corresponding values from the other columns. A fuller explanation can be found in issue #6151.

Partially fixes #6151.

pos_explode support is still missing and is required to completely close out #6151.

Authors:
  - Mike Wilson (@hyperbolic2346)

Approvers:
  - Robert (Bobby) Evans (@revans2)
  - Jake Hemstad (@jrhemstad)
  - Karthikeyan (@karthikeyann)
  - @nvdbaranec

URL: #7140
hyperbolic2346 authored Jan 25, 2021
1 parent 2e0889a commit f422391
Showing 4 changed files with 643 additions and 0 deletions.
42 changes: 42 additions & 0 deletions cpp/include/cudf/reshape.hpp
@@ -97,6 +97,48 @@ std::unique_ptr<column> byte_cast(
flip_endianness endian_configuration,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Explodes a list column's elements.
*
* Any list is exploded, which means the elements of the list in each row are expanded into new rows
* in the output. The corresponding rows for other columns in the input are duplicated. Example:
* ```
* [[5,10,15], 100],
* [[20,25], 200],
* [[30], 300],
* returns
* [5, 100],
* [10, 100],
* [15, 100],
* [20, 200],
* [25, 200],
* [30, 300],
* ```
*
* Nulls and empty lists propagate in different ways depending on what is null or empty.
*```
* [[5,null,15], 100],
* [null, 200],
* [[], 300],
* returns
* [5, 100],
* [null, 100],
* [15, 100],
* ```
* Note that null lists are completely removed from the output,
* while nulls and empty lists inside lists are pulled out and remain.
*
* @param input_table Table to explode.
* @param explode_column_idx Column index to explode inside the table.
* @param mr Device memory resource used to allocate the returned column's device memory.
*
* @return A new table with the column at explode_column_idx exploded.
*/
std::unique_ptr<table> explode(
table_view const& input_table,
size_type explode_column_idx,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group

} // namespace cudf
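
As a quick orientation for the header change above, here is a minimal usage sketch (not part of the commit). It builds the example table from the doxygen comment using cudf's test column wrappers; the wrapper names and the helper function are assumptions for illustration only:

```cpp
#include <cudf/reshape.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <memory>

// Hypothetical helper, only for illustrating the new cudf::explode API.
void explode_usage_sketch()
{
  // Input matching the doxygen example: [[5,10,15],100], [[20,25],200], [[30],300]
  cudf::test::lists_column_wrapper<int32_t> lists{{5, 10, 15}, {20, 25}, {30}};
  cudf::test::fixed_width_column_wrapper<int32_t> values{100, 200, 300};
  cudf::table_view input({lists, values});

  // Explode the list column at index 0. The result has six rows:
  // [5,100], [10,100], [15,100], [20,200], [25,200], [30,300]
  std::unique_ptr<cudf::table> exploded = cudf::explode(input, 0);
}
```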
147 changes: 147 additions & 0 deletions cpp/src/reshape/explode.cu
@@ -0,0 +1,147 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/reshape.hpp>
#include <cudf/table/table.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include <memory>
#include <type_traits>

namespace cudf {
namespace detail {
namespace {
/**
* @brief Function object for exploding a column.
*/
struct explode_functor {
template <typename T>
std::unique_ptr<table> operator()(table_view const& input_table,
size_type explode_column_idx,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr) const
{
CUDF_FAIL("Unsupported non-list column");

return std::make_unique<table>();
}
};

template <>
std::unique_ptr<table> explode_functor::operator()<list_view>(
table_view const& input_table,
size_type explode_column_idx,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr) const
{
/* we explode by building a gather map that includes the number of entries in each list inside
the column for each index. Interestingly, this can be done with lower_bound across the offsets
as values between the offsets will all map down to the index below. We have some off-by-one
manipulations we need to do with the output, but it's almost our gather map by itself. Once we
build the gather map we need to remove the explode column from the table and run gather on it.
Next we build the explode column, which turns out to be simply lifting the child column out of the
explode column. This unrolls the top level of lists. Then we need to insert the explode column
back into the table and return it. */
lists_column_view lc{input_table.column(explode_column_idx)};
auto sliced_child = lc.get_sliced_child(stream);
rmm::device_uvector<size_type> gather_map_indices(sliced_child.size(), stream, mr);

// sliced columns can make this a little tricky. We have to start iterating at the start of the
// offsets for this column, which could be > 0. Then we also have to handle rebasing the offsets
// as we go.
auto offsets = lc.offsets().begin<size_type>() + lc.offset();
auto offsets_minus_one = thrust::make_transform_iterator(
offsets, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; });
auto counting_iter = thrust::make_counting_iterator(0);

// This looks like an off-by-one bug, but what is going on here is that we need to reduce each
// result from `lower_bound` by 1 to build the correct gather map. It was pointed out that
// this can be accomplished by simply skipping the first entry and using the result of
// `lower_bound` directly.
thrust::lower_bound(rmm::exec_policy(stream),
offsets_minus_one + 1,
offsets_minus_one + lc.size() + 1,
counting_iter,
counting_iter + gather_map_indices.size(),
gather_map_indices.begin());

auto select_iter = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[explode_column_idx](size_type i) { return i >= explode_column_idx ? i + 1 : i; });
std::vector<size_type> selected_columns(select_iter, select_iter + input_table.num_columns() - 1);

auto gathered_table = cudf::detail::gather(
input_table.select(selected_columns),
column_view(data_type(type_to_id<size_type>()), sliced_child.size(), gather_map_indices.data()),
cudf::out_of_bounds_policy::DONT_CHECK,
cudf::detail::negative_index_policy::ALLOWED,
stream,
mr);

std::vector<std::unique_ptr<column>> columns = gathered_table.release()->release();

columns.insert(columns.begin() + explode_column_idx,
std::make_unique<column>(column(sliced_child, stream, mr)));

return std::make_unique<table>(std::move(columns));
}
} // namespace

/**
* @copydoc
* cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<table> explode(table_view const& input_table,
size_type explode_column_idx,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
return type_dispatcher(input_table.column(explode_column_idx).type(),
explode_functor{},
input_table,
explode_column_idx,
stream,
mr);
}

} // namespace detail

/**
* @copydoc cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource)
*/
std::unique_ptr<table> explode(table_view const& input_table,
size_type explode_column_idx,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::explode(input_table, explode_column_idx, rmm::cuda_stream_default, mr);
}

} // namespace cudf
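
The block comment inside explode_functor above describes the gather-map construction in prose. As a purely illustrative, host-side sketch (not part of the commit), the same lower_bound trick applied to the offsets from the reshape.hpp example produces one source-row index per child element:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
  // Offsets for [[5,10,15], [20,25], [30]]; the child column has 6 elements.
  std::vector<int> offsets{0, 3, 5, 6};

  // Rebase to the first offset and subtract one: [-1, 2, 4, 5].
  std::vector<int> offsets_minus_one(offsets.size());
  std::transform(offsets.begin(), offsets.end(), offsets_minus_one.begin(),
                 [&](int o) { return (o - offsets.front()) - 1; });

  // Skip the first entry and lower_bound each child index into the rest,
  // mirroring the thrust::lower_bound call in explode.cu.
  std::vector<int> gather_map(6);
  for (int i = 0; i < 6; ++i) {
    gather_map[i] = static_cast<int>(
      std::lower_bound(offsets_minus_one.begin() + 1, offsets_minus_one.end(), i) -
      (offsets_minus_one.begin() + 1));
  }

  // Prints "0 0 0 1 1 2": the first three child elements came from row 0,
  // the next two from row 1, and the last one from row 2.
  for (int idx : gather_map) { std::printf("%d ", idx); }
  std::printf("\n");
  return 0;
}
```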
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
@@ -516,6 +516,7 @@ ConfigureTest(SEARCH_TEST "${SEARCH_TEST_SRC}")

set(RESHAPE_TEST_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/reshape/byte_cast_tests.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/reshape/explode_tests.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/reshape/interleave_columns_tests.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/reshape/tile_tests.cpp")

