Skip to content

Commit

Permalink
[fix](orc) fix the count(*) pushdown issue in orc format (apache#24446)
Browse files Browse the repository at this point in the history
Previously, when querying a Hive table in ORC format whose file was split,
the result of `select count(*)` could be a multiple of the real row count.

This is because the number of rows must be obtained after ORC stripe pruning;
otherwise, a wrong result may be returned.
  • Loading branch information
morningman authored Sep 16, 2023
1 parent cac089c commit 4dad7c9
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 4 deletions.
2 changes: 1 addition & 1 deletion be/src/apache-orc
5 changes: 3 additions & 2 deletions be/src/vec/exec/format/orc/vorc_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,6 @@ Status OrcReader::_create_file_reader() {
}
return Status::InternalError("Init OrcReader failed. reason = {}", _err_msg);
}
_remaining_rows = _reader->getNumberOfRows();

return Status::OK();
}

Expand Down Expand Up @@ -789,6 +787,9 @@ Status OrcReader::set_fill_columns(
auto& selected_type = _row_reader->getSelectedType();
int idx = 0;
_init_select_types(selected_type, idx);

_remaining_rows = _row_reader->getNumberOfRows();

} catch (std::exception& e) {
return Status::InternalError("Failed to create orc row reader. reason = {}", e.what());
}
Expand Down
2 changes: 2 additions & 0 deletions be/src/vec/exec/format/orc/vorc_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,8 @@ class OrcReader : public GenericReader {
void set_remaining_rows(int64_t rows) { _remaining_rows = rows; }

private:
// This is only for count(*) short circuit read.
// save the total number of rows in range
int64_t _remaining_rows = 0;
RuntimeProfile* _profile = nullptr;
RuntimeState* _state = nullptr;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public HdfsTableValuedFunction(Map<String, String> params) throws AnalysisExcept
// because HADOOP_FS_NAME contains upper and lower case
locationProperties.put(HdfsResource.HADOOP_FS_NAME, params.get(key));
} else {
throw new AnalysisException(key + " is invalid property");
locationProperties.put(key, params.get(key));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,3 +248,21 @@ c133 TEXT Yes false \N NONE

-- !plain_2 --

-- !count_parquet_0 --
1062734

-- !count_parquet_1 --
1062734

-- !count_orc_0 --
2777636

-- !count_orc_1 --
2777636

-- !count_text_0 --
144730

-- !count_text_1 --
144730

Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,70 @@ suite("test_hdfs_tvf_compression", "p2,external,tvf,external_remote,external_rem
"column_separator" = '\001',
"compress_type" = "plain") where c2="abc" order by c3,c4,c10 limit 5;
"""

// test count(*) push down
def test_data_dir = "hdfs://${nameNodeHost}:${hdfsPort}"
// parquet
sql """set file_split_size=0;"""
qt_count_parquet_0 """
select count(*) from
HDFS(
"uri" = "${test_data_dir}/test_data/ckbench_hits.part-00000.snappy.parquet",
"fs.defaultFS" = "${baseFs}",
"format" = "parquet"
);
"""

sql """set file_split_size=388608;"""
qt_count_parquet_1 """
select count(*) from
HDFS(
"uri" = "${test_data_dir}/test_data/ckbench_hits.part-00000.snappy.parquet",
"fs.defaultFS" = "${baseFs}",
"format" = "parquet"
);
"""

// orc
sql """set file_split_size=0;"""
qt_count_orc_0 """
select count(*) from
HDFS(
"uri" = "${test_data_dir}/test_data/ckbench_hits.000000_0.orc",
"fs.defaultFS" = "${baseFs}",
"format" = "orc"
);
"""

sql """set file_split_size=388608;"""
qt_count_orc_1 """
select count(*) from
HDFS(
"uri" = "${test_data_dir}/test_data/ckbench_hits.000000_0.orc",
"fs.defaultFS" = "${baseFs}",
"format" = "orc"
);
"""

// text
sql """set file_split_size=0;"""
qt_count_text_0 """
select count(*) from
HDFS(
"uri" = "${test_data_dir}/test_data/tpcds_catalog_returns_data-m-00000.txt",
"fs.defaultFS" = "${baseFs}",
"format" = "csv"
);
"""

sql """set file_split_size=388608;"""
qt_count_text_1 """
select count(*) from
HDFS(
"uri" = "${test_data_dir}/test_data/tpcds_catalog_returns_data-m-00000.txt",
"fs.defaultFS" = "${baseFs}",
"format" = "csv"
);
"""
}
}

0 comments on commit 4dad7c9

Please sign in to comment.