Skip to content

Commit

Permalink
[Fix](orc-reader) Fix incorrect result if null partition fields in or…
Browse files Browse the repository at this point in the history
…c file. (apache#23369)

Fix incorrect results when an ORC file contains partition fields with null values.

### Root Cause
Theoretically, the underlying files of a Hive partitioned table should not contain the partition fields. However, we found that in some user scenarios the partition fields do exist in the underlying ORC/Parquet files, and their values are null. As a result, predicates on partition fields that are pushed down to the file reader are evaluated against those null values and filter rows incorrectly.

### Solution
We handle this case by only reading non-partition fields from the file. The Parquet reader already handles it this way; this PR applies the same handling to the ORC reader.
  • Loading branch information
kaka11chen authored Aug 25, 2023
1 parent a3a951c commit 8af1e7f
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 12 deletions.
29 changes: 17 additions & 12 deletions be/src/vec/exec/format/orc/vorc_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -632,19 +632,24 @@ bool OrcReader::_init_search_argument(
for (int i = 0; i < root_type.getSubtypeCount(); ++i) {
type_map.emplace(_get_field_name_lower_case(&root_type, i), root_type.getSubtype(i));
}
for (auto it = colname_to_value_range->begin(); it != colname_to_value_range->end(); ++it) {
auto type_it = type_map.find(it->first);
if (type_it != type_map.end()) {
std::visit(
[&](auto& range) {
std::vector<OrcPredicate> value_predicates =
value_range_to_predicate(range, type_it->second);
for (auto& range_predicate : value_predicates) {
predicates.emplace_back(range_predicate);
}
},
it->second);
for (auto& col_name : _lazy_read_ctx.all_read_columns) {
auto iter = colname_to_value_range->find(col_name);
if (iter == colname_to_value_range->end()) {
continue;
}
auto type_it = type_map.find(col_name);
if (type_it == type_map.end()) {
continue;
}
std::visit(
[&](auto& range) {
std::vector<OrcPredicate> value_predicates =
value_range_to_predicate(range, type_it->second);
for (auto& range_predicate : value_predicates) {
predicates.emplace_back(range_predicate);
}
},
iter->second);
}
if (predicates.empty()) {
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,39 @@ Z6n2t4XA2n7CXTECJ,PE,iBbsCh0RE1Dd2A,z48
-- !null_expr_dict_filter_parquet --
4844 4363

-- !par_fields_in_file_orc1 --
1 Alice 100.0 2023 8
2 Bob 150.0 2023 8

-- !par_fields_in_file_parquet1 --
1 Alice 100.0 2023 8
2 Bob 150.0 2023 8

-- !par_fields_in_file_orc2 --
1 Alice 100.0 2023 8
2 Bob 150.0 2023 8

-- !par_fields_in_file_parquet2 --
1 Alice 100.0 2023 8
2 Bob 150.0 2023 8

-- !par_fields_in_file_orc3 --
1 Alice 100.0 2023 8
2 Bob 150.0 2023 8

-- !par_fields_in_file_parquet3 --
1 Alice 100.0 2023 8
2 Bob 150.0 2023 8

-- !par_fields_in_file_orc4 --
1 Alice 100.0 2023 8
2 Bob 150.0 2023 8

-- !par_fields_in_file_parquet4 --
1 Alice 100.0 2023 8
2 Bob 150.0 2023 8

-- !par_fields_in_file_orc5 --

-- !par_fields_in_file_parquet5 --

Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,17 @@ suite("test_external_catalog_hive", "p2,external,hive,external_remote,external_r
qt_null_expr_dict_filter_orc """ select count(*), count(distinct user_no) from multi_catalog.dict_fitler_test_orc WHERE partitions in ('2023-08-21') and actual_intf_type = 'type1' and (REUSE_FLAG<> 'y' or REUSE_FLAG is null); """
qt_null_expr_dict_filter_parquet """ select count(*), count(distinct user_no) from multi_catalog.dict_fitler_test_parquet WHERE partitions in ('2023-08-21') and actual_intf_type = 'type1' and (REUSE_FLAG<> 'y' or REUSE_FLAG is null); """

// test par fields in file
qt_par_fields_in_file_orc1 """ select * from multi_catalog.par_fields_in_file_orc where year = 2023 and month = 8 order by id; """
qt_par_fields_in_file_parquet1 """ select * from multi_catalog.par_fields_in_file_parquet where year = 2023 and month = 8 order by id; """
qt_par_fields_in_file_orc2 """ select * from multi_catalog.par_fields_in_file_orc where year = 2023 order by id; """
qt_par_fields_in_file_parquet2 """ select * from multi_catalog.par_fields_in_file_parquet where year = 2023 order by id; """
qt_par_fields_in_file_orc3 """ select * from multi_catalog.par_fields_in_file_orc where month = 8 order by id; """
qt_par_fields_in_file_parquet3 """ select * from multi_catalog.par_fields_in_file_parquet where month = 8 order by id; """
qt_par_fields_in_file_orc4 """ select * from multi_catalog.par_fields_in_file_orc where month = 8 and year >= 2022 order by id; """
qt_par_fields_in_file_parquet4 """ select * from multi_catalog.par_fields_in_file_parquet where month = 8 and year >= 2022 order by id; """
qt_par_fields_in_file_orc5 """ select * from multi_catalog.par_fields_in_file_orc where month = 8 and year = 2022 order by id; """
qt_par_fields_in_file_parquet5 """ select * from multi_catalog.par_fields_in_file_parquet where month = 8 and year = 2022 order by id; """

// test remember last used database after switch / rename catalog
sql """switch ${catalog_name};"""
Expand Down

0 comments on commit 8af1e7f

Please sign in to comment.