Skip to content

Commit

Permalink
[bugfix](paimon)fix paimon testcases (#30514)
Browse files Browse the repository at this point in the history
1. Set the default time zone to UTC when reading Paimon Parquet files.
2. Do not push down predicates on `char` columns, since Paimon does not support them.
  • Loading branch information
wuwenchi authored Jan 30, 2024
1 parent f3c8ee6 commit f6d4e41
Show file tree
Hide file tree
Showing 7 changed files with 256 additions and 175 deletions.
23 changes: 18 additions & 5 deletions be/src/vec/exec/format/orc/vorc_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ void ORCFileInputStream::read(void* buf, uint64_t length, uint64_t offset) {
OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* state,
const TFileScanRangeParams& params, const TFileRangeDesc& range,
size_t batch_size, const std::string& ctz, io::IOContext* io_ctx,
bool enable_lazy_mat)
bool enable_lazy_mat, std::vector<orc::TypeKind>* unsupported_pushdown_types)
: _profile(profile),
_state(state),
_scan_params(params),
Expand All @@ -148,7 +148,8 @@ OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* state,
_ctz(ctz),
_io_ctx(io_ctx),
_enable_lazy_mat(enable_lazy_mat),
_is_dict_cols_converted(false) {
_is_dict_cols_converted(false),
_unsupported_pushdown_types(unsupported_pushdown_types) {
TimezoneUtils::find_cctz_time_zone(ctz, _time_zone);
VecDateTimeValue t;
t.from_unixtime(0, ctz);
Expand Down Expand Up @@ -524,8 +525,20 @@ std::tuple<bool, orc::Literal> convert_to_orc_literal(const orc::Type* type, con

template <PrimitiveType primitive_type>
std::vector<OrcPredicate> value_range_to_predicate(
const ColumnValueRange<primitive_type>& col_val_range, const orc::Type* type) {
const ColumnValueRange<primitive_type>& col_val_range, const orc::Type* type,
std::vector<orc::TypeKind>* unsupported_pushdown_types) {
std::vector<OrcPredicate> predicates;

if (unsupported_pushdown_types != nullptr) {
for (vector<orc::TypeKind>::iterator it = unsupported_pushdown_types->begin();
it != unsupported_pushdown_types->end(); ++it) {
if (*it == type->getKind()) {
// Unsupported type
return predicates;
}
}
}

orc::PredicateDataType predicate_data_type;
auto type_it = TYPEKIND_TO_PREDICATE_TYPE.find(type->getKind());
if (type_it == TYPEKIND_TO_PREDICATE_TYPE.end()) {
Expand Down Expand Up @@ -667,8 +680,8 @@ bool OrcReader::_init_search_argument(
}
std::visit(
[&](auto& range) {
std::vector<OrcPredicate> value_predicates =
value_range_to_predicate(range, type_it->second);
std::vector<OrcPredicate> value_predicates = value_range_to_predicate(
range, type_it->second, _unsupported_pushdown_types);
for (auto& range_predicate : value_predicates) {
predicates.emplace_back(range_predicate);
}
Expand Down
4 changes: 3 additions & 1 deletion be/src/vec/exec/format/orc/vorc_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ class OrcReader : public GenericReader {

OrcReader(RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params,
const TFileRangeDesc& range, size_t batch_size, const std::string& ctz,
io::IOContext* io_ctx, bool enable_lazy_mat = true);
io::IOContext* io_ctx, bool enable_lazy_mat = true,
std::vector<orc::TypeKind>* unsupported_pushdown_types = nullptr);

OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& range,
const std::string& ctz, io::IOContext* io_ctx, bool enable_lazy_mat = true);
Expand Down Expand Up @@ -571,6 +572,7 @@ class OrcReader : public GenericReader {
std::unique_ptr<orc::StringDictFilter> _string_dict_filter;
bool _is_dict_cols_converted;
bool _has_complex_type = false;
std::vector<orc::TypeKind>* _unsupported_pushdown_types;
};

class ORCFileInputStream : public orc::InputStream {
Expand Down
25 changes: 22 additions & 3 deletions be/src/vec/exec/scan/vfile_scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -785,9 +785,20 @@ Status VFileScanner::_get_next_reader() {
break;
}
case TFileFormatType::FORMAT_PARQUET: {
static const cctz::time_zone utc0 = cctz::utc_time_zone();
cctz::time_zone* tz;
if (range.__isset.table_format_params &&
range.table_format_params.table_format_type == "paimon") {
// Timestamps written by paimon do not carry metadata information (e.g., isAdjustedToUTC),
// and the stored data is UTC0 by default, so the time zone is set directly to UTC.
// Paimon fixed this issue in version 0.7, after which this check can be removed.
tz = const_cast<cctz::time_zone*>(&utc0);
} else {
tz = const_cast<cctz::time_zone*>(&_state->timezone_obj());
}
std::unique_ptr<ParquetReader> parquet_reader = ParquetReader::create_unique(
_profile, *_params, range, _state->query_options().batch_size,
const_cast<cctz::time_zone*>(&_state->timezone_obj()), _io_ctx.get(), _state,
_profile, *_params, range, _state->query_options().batch_size, tz,
_io_ctx.get(), _state,
config::max_external_file_meta_cache_num <= 0
? nullptr
: ExecEnv::GetInstance()->file_meta_cache(),
Expand Down Expand Up @@ -825,9 +836,17 @@ Status VFileScanner::_get_next_reader() {
break;
}
case TFileFormatType::FORMAT_ORC: {
std::vector<orc::TypeKind>* unsupported_pushdown_types = nullptr;
if (range.__isset.table_format_params &&
range.table_format_params.table_format_type == "paimon") {
static std::vector<orc::TypeKind> paimon_unsupport_type =
std::vector<orc::TypeKind> {orc::TypeKind::CHAR};
unsupported_pushdown_types = &paimon_unsupport_type;
}
std::unique_ptr<OrcReader> orc_reader = OrcReader::create_unique(
_profile, _state, *_params, range, _state->query_options().batch_size,
_state->timezone(), _io_ctx.get(), _state->query_options().enable_orc_lazy_mat);
_state->timezone(), _io_ctx.get(), _state->query_options().enable_orc_lazy_mat,
unsupported_pushdown_types);
if (push_down_predicates) {
RETURN_IF_ERROR(_process_late_arrival_conjuncts());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@
import org.apache.doris.analysis.FloatLiteral;
import org.apache.doris.analysis.IntLiteral;
import org.apache.doris.analysis.LiteralExpr;
import org.apache.doris.analysis.StringLiteral;

import com.google.common.base.Strings;
import org.apache.paimon.data.BinaryString;
import org.apache.paimon.data.Decimal;
import org.apache.paimon.data.Timestamp;
Expand Down Expand Up @@ -64,10 +62,8 @@ public BinaryString visit(VarCharType varCharType) {
}

public BinaryString visit(CharType charType) {
if (expr instanceof StringLiteral) {
StringLiteral stringLiteral = (StringLiteral) expr;
return BinaryString.fromString(Strings.padEnd(stringLiteral.getStringValue(), charType.getLength(), ' '));
}
// Currently, Paimon does not support predicate push-down for char
// ref: org.apache.paimon.predicate.PredicateBuilder.convertJavaObject
return null;
}

Expand Down
Loading

0 comments on commit f6d4e41

Please sign in to comment.