Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancement] handle const column in array_filter #51363

Merged
merged 2 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 114 additions & 57 deletions be/src/exprs/array_functions.tpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "column/column_viewer.h"
#include "column/json_column.h"
#include "column/type_traits.h"
#include "column/vectorized_fwd.h"
#include "exec/sorting/sorting.h"
#include "exprs/arithmetic_operation.h"
#include "exprs/function_context.h"
Expand Down Expand Up @@ -49,6 +50,7 @@ private:

size_t chunk_size = columns[0]->size();
ColumnPtr src_column = ColumnHelper::unpack_and_duplicate_const_column(chunk_size, columns[0]);

ColumnPtr dest_column = src_column->clone_empty();

HashSet hash_set;
Expand Down Expand Up @@ -904,65 +906,107 @@ private:
class ArrayFilter {
public:
static ColumnPtr process([[maybe_unused]] FunctionContext* ctx, const Columns& columns) {
return _array_filter(columns);
const auto& src_column = columns[0];
const auto& filter_column = columns[1];
return _array_filter(src_column, filter_column);
}

private:
static ColumnPtr _array_filter(const Columns& columns) {
if (columns[0]->only_null()) {
return columns[0];
}

size_t chunk_size = columns[0]->size();
ColumnPtr src_column = ColumnHelper::unpack_and_duplicate_const_column(chunk_size, columns[0]);
ColumnPtr dest_column = src_column->clone_empty();
if (columns[1]->only_null()) { // return empty array for non-null array by design, keep the same null with src.
auto data_column = dest_column;
if (dest_column->is_nullable()) {
// set null from src
auto* dest_nullable_column = down_cast<NullableColumn*>(dest_column.get());
const auto* src_nullable_column = down_cast<const NullableColumn*>(src_column.get());
dest_nullable_column->mutable_null_column()->get_data().assign(
src_nullable_column->null_column()->get_data().begin(),
src_nullable_column->null_column()->get_data().end());
dest_nullable_column->set_has_null(src_nullable_column->has_null());

data_column = dest_nullable_column->data_column();
static ColumnPtr _array_filter(const ColumnPtr& src_column, const ColumnPtr& filter_column) {
if (src_column->only_null()) {
return src_column;
}
bool is_src_const = src_column->is_constant();
bool is_filter_const = filter_column->is_constant();

size_t chunk_size = src_column->size();
if (filter_column->only_null()) {
if (is_src_const) {
trueeyu marked this conversation as resolved.
Show resolved Hide resolved
// return a const column with empty array
auto data_column = FunctionHelper::get_data_column_of_const(src_column);
auto dest_data_column = data_column->clone_empty();
dest_data_column->append_default();
return ConstColumn::create(std::move(dest_data_column), chunk_size);
} else {
// return a nullable column with only empty arrays, the null column shoule be same with src column.
ColumnPtr dest_column = src_column->clone_empty();
ColumnPtr data_column = dest_column;
if (src_column->is_nullable()) {
const auto src_null_column = down_cast<const NullableColumn*>(src_column.get())->null_column();
auto dest_nullable_column = down_cast<NullableColumn*>(dest_column.get());
auto dest_null_column = dest_nullable_column->mutable_null_column();
dest_null_column->get_data().assign(src_null_column->get_data().begin(),
src_null_column->get_data().end());
dest_nullable_column->set_has_null(src_column->has_null());
data_column = dest_nullable_column->data_column();
}
data_column->append_default(chunk_size);
return dest_column;
trueeyu marked this conversation as resolved.
Show resolved Hide resolved
}
data_column->append_default(chunk_size);
return dest_column;
}

ColumnPtr bool_column = ColumnHelper::unpack_and_duplicate_const_column(chunk_size, columns[1]);
ColumnPtr dest_column = is_src_const ? FunctionHelper::get_data_column_of_const(src_column)->clone_empty()
: src_column->clone_empty();

NullColumn* dest_null_column = nullptr;
if (src_column->is_nullable()) {
const auto* src_nullable_column = down_cast<const NullableColumn*>(src_column.get());
const auto& src_data_column = src_nullable_column->data_column();
const auto& src_null_column = src_nullable_column->null_column();

auto* dest_nullable_column = down_cast<NullableColumn*>(dest_column.get());
auto* dest_null_column = dest_nullable_column->mutable_null_column();
auto* dest_data_column = dest_nullable_column->mutable_data_column();
dest_null_column = dest_nullable_column->mutable_null_column();

if (src_column->has_null()) {
dest_null_column->get_data().assign(src_null_column->get_data().begin(),
src_null_column->get_data().end());
} else {
dest_null_column->get_data().resize(chunk_size, 0);
}
dest_null_column->get_data().assign(src_null_column->get_data().begin(), src_null_column->get_data().end());
dest_nullable_column->set_has_null(src_nullable_column->has_null());
}

_filter_array_items(down_cast<ArrayColumn*>(src_data_column.get()), bool_column,
down_cast<ArrayColumn*>(dest_data_column), dest_null_column);
ColumnPtr src_data_column = src_column;
ColumnPtr dest_data_column = dest_column;
if (is_src_const) {
src_data_column = FunctionHelper::get_data_column_of_const(src_column);
src_data_column = FunctionHelper::get_data_column_of_nullable(src_data_column);
dest_data_column = FunctionHelper::get_data_column_of_const(dest_column);
dest_data_column = FunctionHelper::get_data_column_of_nullable(dest_data_column);
} else {
src_data_column = FunctionHelper::get_data_column_of_nullable(src_data_column);
dest_data_column = FunctionHelper::get_data_column_of_nullable(dest_data_column);
}

ColumnPtr filter_data_column =
is_filter_const ? FunctionHelper::get_data_column_of_const(filter_column) : filter_column;
size_t num_rows = (is_src_const && is_filter_const) ? 1 : chunk_size;
if (is_src_const && is_filter_const) {
_filter_array_items<true, true>(down_cast<ArrayColumn*>(src_data_column.get()), filter_data_column,
down_cast<ArrayColumn*>(dest_data_column.get()), dest_null_column,
num_rows);
} else if (is_src_const && !is_filter_const) {
_filter_array_items<true, false>(down_cast<ArrayColumn*>(src_data_column.get()), filter_data_column,
down_cast<ArrayColumn*>(dest_data_column.get()), dest_null_column,
num_rows);
} else if (!is_src_const && is_filter_const) {
_filter_array_items<false, true>(down_cast<ArrayColumn*>(src_data_column.get()), filter_data_column,
down_cast<ArrayColumn*>(dest_data_column.get()), dest_null_column,
num_rows);
} else {
_filter_array_items<false, false>(down_cast<ArrayColumn*>(src_data_column.get()), filter_data_column,
down_cast<ArrayColumn*>(dest_data_column.get()), dest_null_column,
num_rows);
}
dest_column->check_or_die();
if (is_src_const && is_filter_const) {
auto const_column = ConstColumn::create(std::move(dest_column), chunk_size);
const_column->check_or_die();
return const_column;
} else {
_filter_array_items(down_cast<ArrayColumn*>(src_column.get()), bool_column,
down_cast<ArrayColumn*>(dest_column.get()), nullptr);
return dest_column;
}
return dest_column;
}

template <bool ConstSrc, bool ConstFilter>
static void _filter_array_items(const ArrayColumn* src_column, const ColumnPtr& raw_filter,
ArrayColumn* dest_column, NullColumn* dest_null_map) {
ArrayColumn* dest_column, NullColumn* dest_null_map, size_t src_rows) {
if constexpr (!ConstSrc) {
trueeyu marked this conversation as resolved.
Show resolved Hide resolved
DCHECK_EQ(src_column->size(), src_rows);
}
ArrayColumn* filter;
NullColumn* filter_null_map = nullptr;
auto& dest_offsets = dest_column->offsets_column()->get_data();
Expand All @@ -974,28 +1018,41 @@ private:
} else {
filter = down_cast<ArrayColumn*>(raw_filter.get());
}

std::vector<uint32_t> indexes;
// only keep the elements whose filter is not null and not 0.
for (size_t i = 0; i < src_column->size(); ++i) {
if (dest_null_map == nullptr || !dest_null_map->get_data()[i]) { // dest_null_map[i] is not null
if (filter_null_map == nullptr || !filter_null_map->get_data()[i]) { // filter_null_map[i] is not null
size_t elem_size = 0;
size_t filter_elem_id = filter->offsets().get_data()[i];
size_t filter_elem_limit = filter->offsets().get_data()[i + 1];
for (size_t src_elem_id = src_column->offsets().get_data()[i];
src_elem_id < src_column->offsets().get_data()[i + 1]; ++filter_elem_id, ++src_elem_id) {
// only keep the valid elements
if (filter_elem_id < filter_elem_limit && !filter->elements().is_null(filter_elem_id) &&
filter->elements().get(filter_elem_id).get_int8() != 0) {
indexes.emplace_back(src_elem_id);
++elem_size;
size_t num_rows = ConstSrc ? src_rows : src_column->size();
for (size_t i = 0; i < num_rows; i++) {
if (filter_null_map == nullptr || !filter_null_map->get_data()[i]) {
bool filter_is_not_null =
(filter_null_map == nullptr ||
(ConstFilter ? !filter_null_map->get_data()[0] : !filter_null_map->get_data()[i]));
if (filter_is_not_null) {
// if filter is not null, we should filter each elements in array
const auto& src_offsets = src_column->offsets().get_data();
size_t src_start = ConstSrc ? src_offsets[0] : src_offsets[i];
size_t src_end = ConstSrc ? src_offsets[1] : src_offsets[i + 1];
size_t src_elements_num = src_end - src_start;

const auto& filter_offsets = filter->offsets().get_data();
size_t filter_start = ConstFilter ? filter_offsets[0] : filter_offsets[i];
size_t filter_end = ConstFilter ? filter_offsets[1] : filter_offsets[i + 1];
size_t filter_elements_num = filter_end - filter_start;

const auto& filter_elements = filter->elements();
size_t valid_elements_num = 0;

for (size_t idx = 0; idx < src_elements_num; idx++) {
if (idx < filter_elements_num && !filter_elements.is_null(filter_start + idx) &&
filter_elements.get(filter_start + idx).get_int8() != 0) {
indexes.emplace_back(src_start + idx);
valid_elements_num++;
}
}
dest_offsets.emplace_back(dest_offsets.back() + elem_size);
} else { // filter_null_map[i] is null, empty the array by design[, alternatively keep all elements]
dest_offsets.emplace_back(dest_offsets.back() + valid_elements_num);
} else {
dest_offsets.emplace_back(dest_offsets.back());
}
} else { // dest_null_map[i] is null
} else {
dest_offsets.emplace_back(dest_offsets.back());
}
}
Expand Down
113 changes: 113 additions & 0 deletions test/sql/test_array_fn/R/test_array_filter
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
-- name: test_array_filter
CREATE TABLE `t` (
`k` bigint(20) NOT NULL COMMENT "",
`arr_0` array<bigint(20)> NOT NULL COMMENT "",
`arr_1` array<bigint(20)> NULL COMMENT "",
`arr_2` array<bigint(20)> NULL COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`k`)
DISTRIBUTED BY RANDOM BUCKETS 1
PROPERTIES (
"replication_num" = "1"
);
-- result:
-- !result
insert into t values (1,[1,2],[1,2],[0,1]),(2,[1,2],null,[1,1]),(3,[1,2],[1,2],null),(4,[1,2],[null,null],[null,null]),(5,[1],[1,2],[0,0,1]);
-- result:
-- !result
select array_filter(arr_0, arr_2) from t order by k;
-- result:
[2]
[1,2]
[]
[]
[]
-- !result
select array_filter(arr_1, arr_2) from t order by k;
-- result:
[2]
None
[]
[]
[]
-- !result
select array_filter(arr_0,[0,0,0,1]) from t order by k;
-- result:
[]
[]
[]
[]
[]
-- !result
select array_filter(arr_0,[1,0,1]) from t order by k;
-- result:
[1]
[1]
[1]
[1]
[1]
-- !result
select array_filter(arr_0,null) from t order by k;
-- result:
[]
[]
[]
[]
[]
-- !result
select array_filter(arr_1,null) from t order by k;
-- result:
[]
None
[]
[]
[]
-- !result
select array_filter([1,2,3,4],arr_2) from t order by k;
-- result:
[2]
[1,2]
[]
[]
[3]
-- !result
select array_filter(null, arr_2) from t order by k;
-- result:
None
None
None
None
None
-- !result
select array_filter([1,2,3,4],[0,0,1,1]) from t;
-- result:
[3,4]
[3,4]
[3,4]
[3,4]
[3,4]
-- !result
select array_filter(null, null) from t;
-- result:
None
None
None
None
None
-- !result
select array_filter([1,2,3,4],null) from t;
-- result:
[]
[]
[]
[]
[]
-- !result
select array_filter(null, [1,0,1,null]) from t;
-- result:
None
None
None
None
None
-- !result
26 changes: 26 additions & 0 deletions test/sql/test_array_fn/T/test_array_filter
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- name: test_array_filter
CREATE TABLE `t` (
`k` bigint(20) NOT NULL COMMENT "",
`arr_0` array<bigint(20)> NOT NULL COMMENT "",
`arr_1` array<bigint(20)> NULL COMMENT "",
`arr_2` array<bigint(20)> NULL COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`k`)
DISTRIBUTED BY RANDOM BUCKETS 1
PROPERTIES (
"replication_num" = "1"
);
insert into t values (1,[1,2],[1,2],[0,1]),(2,[1,2],null,[1,1]),(3,[1,2],[1,2],null),(4,[1,2],[null,null],[null,null]),(5,[1],[1,2],[0,0,1]);

select array_filter(arr_0, arr_2) from t order by k;
select array_filter(arr_1, arr_2) from t order by k;
select array_filter(arr_0,[0,0,0,1]) from t order by k;
select array_filter(arr_0,[1,0,1]) from t order by k;
select array_filter(arr_0,null) from t order by k;
select array_filter(arr_1,null) from t order by k;
select array_filter([1,2,3,4],arr_2) from t order by k;
select array_filter(null, arr_2) from t order by k;
select array_filter([1,2,3,4],[0,0,1,1]) from t;
select array_filter(null, null) from t;
select array_filter([1,2,3,4],null) from t;
select array_filter(null, [1,0,1,null]) from t;
Loading