From dd53bc1c8dbac94f945a24a442e1a24dab22ebbe Mon Sep 17 00:00:00 2001 From: yiguolei <676222867@qq.com> Date: Sun, 19 Mar 2023 14:05:02 +0800 Subject: [PATCH] [unify type system](remove unused type desc) remove some code (#17921) There are many type definitions in BE. Should unify the type system and simplify the development. --------- Co-authored-by: yiguolei --- be/src/exec/base_scanner.cpp | 1 - be/src/exec/schema_scanner.cpp | 74 ------- be/src/exec/schema_scanner.h | 3 - be/src/exec/tablet_info.cpp | 1 + be/src/exec/tablet_info.h | 1 - be/src/runtime/buffer_control_block.cpp | 1 - be/src/runtime/collection_value.cpp | 1 - be/src/runtime/collection_value.h | 1 - be/src/runtime/raw_value.h | 183 +----------------- be/src/runtime/result_buffer_mgr.cpp | 1 - be/src/runtime/result_queue_mgr.h | 1 - be/src/runtime/result_writer.h | 1 - be/src/runtime/struct_value.h | 2 - be/src/util/CMakeLists.txt | 1 - be/src/util/arrow/row_block.cpp | 127 ------------ be/src/util/arrow/row_block.h | 38 ---- be/src/util/hash_util.hpp | 1 + be/src/vec/columns/column_const.cpp | 3 +- be/src/vec/core/block.cpp | 11 -- be/src/vec/core/block.h | 3 - be/src/vec/exec/scan/vfile_scanner.cpp | 1 - be/src/vec/runtime/vdata_stream_mgr.cpp | 7 +- .../utils/arrow_column_to_doris_column.cpp | 55 ------ .../vec/utils/arrow_column_to_doris_column.h | 2 - be/test/olap/row_cursor_test.cpp | 2 - .../vec/exec/parquet/parquet_thrift_test.cpp | 83 +++++++- be/test/vec/exprs/vexpr_test.cpp | 76 +++++++- 27 files changed, 154 insertions(+), 527 deletions(-) delete mode 100644 be/src/util/arrow/row_block.cpp delete mode 100644 be/src/util/arrow/row_block.h diff --git a/be/src/exec/base_scanner.cpp b/be/src/exec/base_scanner.cpp index edb057a55d31db..84329c832d7128 100644 --- a/be/src/exec/base_scanner.cpp +++ b/be/src/exec/base_scanner.cpp @@ -23,7 +23,6 @@ #include "common/utils.h" #include "exec/exec_node.h" #include "runtime/descriptors.h" -#include "runtime/raw_value.h" #include "runtime/runtime_state.h" #include "vec/data_types/data_type_factory.hpp" diff --git a/be/src/exec/schema_scanner.cpp b/be/src/exec/schema_scanner.cpp index adc52d024d2b49..7b8e625d6131f7 100644 --- a/be/src/exec/schema_scanner.cpp +++ b/be/src/exec/schema_scanner.cpp @@ -84,8 +84,6 @@ Status SchemaScanner::init(SchemaScannerParam* param, ObjectPool* pool) { return Status::InternalError("invalid parameter"); } - RETURN_IF_ERROR(create_tuple_desc(pool)); - _param = param; _is_init = true; @@ -302,76 +300,4 @@ Status SchemaScanner::fill_dest_column_for_range(vectorized::Block* block, size_ return Status::OK(); } -Status SchemaScanner::create_tuple_desc(ObjectPool* pool) { - int null_column = 0; - for (int i = 0; i < _columns.size(); ++i) { - if (_columns[i].is_null) { - null_column++; - } - } - - int offset = (null_column + 7) / 8; - std::vector slots; - int null_byte = 0; - int null_bit = 0; - - for (int i = 0; i < _columns.size(); ++i) { - TSlotDescriptor t_slot_desc; - if (_columns[i].type == TYPE_DECIMALV2) { - t_slot_desc.__set_slotType(TypeDescriptor::create_decimalv2_type(27, 9).to_thrift()); - } else { - TypeDescriptor descriptor(_columns[i].type); - if (_columns[i].precision >= 0 && _columns[i].scale >= 0) { - descriptor.precision = _columns[i].precision; - descriptor.scale = _columns[i].scale; - } - t_slot_desc.__set_slotType(descriptor.to_thrift()); - } - t_slot_desc.__set_colName(_columns[i].name); - t_slot_desc.__set_columnPos(i); - t_slot_desc.__set_byteOffset(offset); - - if (_columns[i].is_null) { - t_slot_desc.__set_nullIndicatorByte(null_byte); - t_slot_desc.__set_nullIndicatorBit(null_bit); - null_bit = (null_bit + 1) % 8; - - if (0 == null_bit) { - null_byte++; - } - } else { - t_slot_desc.__set_nullIndicatorByte(0); - t_slot_desc.__set_nullIndicatorBit(-1); - } - - t_slot_desc.id = i; - t_slot_desc.__set_slotIdx(i); - t_slot_desc.__set_isMaterialized(true); - - SlotDescriptor* slot = pool->add(new (std::nothrow) SlotDescriptor(t_slot_desc)); - - if (nullptr == slot) { - return Status::InternalError("no memory for _tuple_desc."); - } - - slots.push_back(slot); - offset += _columns[i].size; - } - - TTupleDescriptor t_tuple_desc; - t_tuple_desc.__set_byteSize(offset); - t_tuple_desc.__set_numNullBytes((null_byte * 8 + null_bit + 7) / 8); - _tuple_desc = pool->add(new (std::nothrow) TupleDescriptor(t_tuple_desc)); - - if (nullptr == _tuple_desc) { - return Status::InternalError("no memory for _tuple_desc."); - } - - for (int i = 0; i < slots.size(); ++i) { - _tuple_desc->add_slot(slots[i]); - } - - return Status::OK(); -} - } // namespace doris diff --git a/be/src/exec/schema_scanner.h b/be/src/exec/schema_scanner.h index 4f7faac06c739a..ca62f5ac02d081 100644 --- a/be/src/exec/schema_scanner.h +++ b/be/src/exec/schema_scanner.h @@ -88,7 +88,6 @@ class SchemaScanner { const std::vector& get_column_desc() const { return _columns; } // factory function static SchemaScanner* create(TSchemaTableType::type type); - const TupleDescriptor* tuple_desc() const { return _tuple_desc; } TSchemaTableType::type type() const { return _schema_table_type; } static void set_doris_server(DorisServer* doris_server) { _s_doris_server = doris_server; } @@ -96,14 +95,12 @@ class SchemaScanner { protected: Status fill_dest_column_for_range(vectorized::Block* block, size_t pos, const std::vector& datas); - Status create_tuple_desc(ObjectPool* pool); bool _is_init; // this is used for sub class SchemaScannerParam* _param; // schema table's column desc std::vector _columns; - TupleDescriptor* _tuple_desc; static DorisServer* _s_doris_server; diff --git a/be/src/exec/tablet_info.cpp b/be/src/exec/tablet_info.cpp index d2d2e07a5c990d..a5c53294f14e63 100644 --- a/be/src/exec/tablet_info.cpp +++ b/be/src/exec/tablet_info.cpp @@ -18,6 +18,7 @@ #include "exec/tablet_info.h" #include "runtime/large_int_value.h" +#include "runtime/raw_value.h" #include "util/string_parser.hpp" namespace doris { diff --git a/be/src/exec/tablet_info.h b/be/src/exec/tablet_info.h index 9753b71349b62b..3556adb13fffc0 100644 --- a/be/src/exec/tablet_info.h +++ b/be/src/exec/tablet_info.h @@ -29,7 +29,6 @@ #include "gen_cpp/descriptors.pb.h" #include "olap/tablet_schema.h" #include "runtime/descriptors.h" -#include "runtime/raw_value.h" #include "vec/core/block.h" namespace doris { diff --git a/be/src/runtime/buffer_control_block.cpp b/be/src/runtime/buffer_control_block.cpp index 7fa0c50923d546..89ee93a0126582 100644 --- a/be/src/runtime/buffer_control_block.cpp +++ b/be/src/runtime/buffer_control_block.cpp @@ -20,7 +20,6 @@ #include "gen_cpp/PaloInternalService_types.h" #include "gen_cpp/internal_service.pb.h" #include "runtime/exec_env.h" -#include "runtime/raw_value.h" #include "runtime/thread_context.h" #include "service/brpc.h" #include "util/thrift_util.h" diff --git a/be/src/runtime/collection_value.cpp b/be/src/runtime/collection_value.cpp index 255fce195b0553..f4543d361d7d22 100644 --- a/be/src/runtime/collection_value.cpp +++ b/be/src/runtime/collection_value.cpp @@ -22,7 +22,6 @@ #include "common/object_pool.h" #include "common/utils.h" #include "runtime/mem_pool.h" -#include "runtime/raw_value.h" #include "runtime/types.h" #include "vec/common/string_ref.h" diff --git a/be/src/runtime/collection_value.h b/be/src/runtime/collection_value.h index 6b59066f97c340..02a51e4bd18f5c 100644 --- a/be/src/runtime/collection_value.h +++ b/be/src/runtime/collection_value.h @@ -36,7 +36,6 @@ struct ArrayIteratorFunctionsBase; class ArrayIterator; class Status; class ObjectPool; -struct TypeDescriptor; template struct ArrayIteratorFunctions; diff --git a/be/src/runtime/raw_value.h b/be/src/runtime/raw_value.h index 32633d0ae77128..b1e635ea1693a1 100644 --- a/be/src/runtime/raw_value.h +++ b/be/src/runtime/raw_value.h @@ -36,191 +36,14 @@ class SlotDescriptor; // Useful utility functions for runtime values (which are passed around as void*). class RawValue { public: - // Ascii output precision for double/float - static const int ASCII_PRECISION; - - static uint32_t get_hash_value(const void* value, const PrimitiveType& type) { - return get_hash_value(value, type, 0); - } - - static uint32_t get_hash_value(const void* value, const PrimitiveType& type, uint32_t seed); - - // Returns hash value for 'value' interpreted as 'type'. The resulting hash value - // is combined with the seed value. - static uint32_t get_hash_value(const void* value, const TypeDescriptor& type, uint32_t seed) { - return get_hash_value(value, type.type, seed); - } - - static uint32_t get_hash_value(const void* value, const TypeDescriptor& type) { - return get_hash_value(value, type.type, 0); - } - - // Get the hash value using the fvn hash function. Using different seeds with FVN - // results in different hash functions. get_hash_value() does not have this property - // and cannot be safely used as the first step in data repartitioning. - // However, get_hash_value() can be significantly faster. - // TODO: fix get_hash_value - static uint32_t zlib_crc32(const void* value, const TypeDescriptor& type, uint32_t seed); - // Same as the up function, only use in vec exec engine. - static uint32_t zlib_crc32(const void* value, size_t len, const TypeDescriptor& type, + static uint32_t zlib_crc32(const void* value, size_t len, const PrimitiveType& type, uint32_t seed); - - // Compares both values. - // Return value is < 0 if v1 < v2, 0 if v1 == v2, > 0 if v1 > v2. - static int compare(const void* v1, const void* v2, const TypeDescriptor& type); - - // Returns true if v1 == v2. - // This is more performant than compare() == 0 for string equality, mostly because of - // the length comparison check. - static bool eq(const void* v1, const void* v2, const TypeDescriptor& type); - - static bool lt(const void* v1, const void* v2, const TypeDescriptor& type); }; -// Use boost::hash_combine for corner cases. boost::hash_combine is reimplemented -// here to use int32t's (instead of size_t) -// boost::hash_combine does: -// seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2); -inline uint32_t RawValue::get_hash_value(const void* v, const PrimitiveType& type, uint32_t seed) { - // Hash_combine with v = 0 - if (v == nullptr) { - uint32_t value = 0x9e3779b9; - return seed ^ (value + (seed << 6) + (seed >> 2)); - } - - switch (type) { - case TYPE_VARCHAR: - case TYPE_CHAR: - case TYPE_HLL: - case TYPE_STRING: { - const StringRef* string_value = reinterpret_cast(v); - return HashUtil::hash(string_value->data, string_value->size, seed); - } - - case TYPE_BOOLEAN: { - uint32_t value = *reinterpret_cast(v) + 0x9e3779b9; - return seed ^ (value + (seed << 6) + (seed >> 2)); - } - - case TYPE_TINYINT: - return HashUtil::hash(v, 1, seed); - - case TYPE_SMALLINT: - return HashUtil::hash(v, 2, seed); - - case TYPE_INT: - return HashUtil::hash(v, 4, seed); - - case TYPE_BIGINT: - return HashUtil::hash(v, 8, seed); - - case TYPE_FLOAT: - return HashUtil::hash(v, 4, seed); - - case TYPE_DOUBLE: - return HashUtil::hash(v, 8, seed); - - case TYPE_DATE: - case TYPE_DATETIME: - return HashUtil::hash(v, 16, seed); - - case TYPE_DATEV2: - return HashUtil::hash(v, 4, seed); - - case TYPE_DATETIMEV2: - return HashUtil::hash(v, 8, seed); - - case TYPE_DECIMALV2: - return HashUtil::hash(v, 16, seed); - case TYPE_DECIMAL32: - return HashUtil::hash(v, 4, seed); - case TYPE_DECIMAL64: - return HashUtil::hash(v, 8, seed); - case TYPE_DECIMAL128I: - return HashUtil::hash(v, 16, seed); - - case TYPE_LARGEINT: - return HashUtil::hash(v, 16, seed); - - default: - DCHECK(false) << "invalid type: " << type; - return 0; - } -} - // NOTE: this is just for split data, decimal use old doris hash function // Because crc32 hardware is not equal with zlib crc32 -inline uint32_t RawValue::zlib_crc32(const void* v, const TypeDescriptor& type, uint32_t seed) { - // Hash_combine with v = 0 - if (v == nullptr) { - uint32_t value = 0x9e3779b9; - return seed ^ (value + (seed << 6) + (seed >> 2)); - } - - switch (type.type) { - case TYPE_VARCHAR: - case TYPE_HLL: - case TYPE_STRING: { - const StringRef* string_value = reinterpret_cast(v); - return HashUtil::zlib_crc_hash(string_value->data, string_value->size, seed); - } - case TYPE_CHAR: { - // TODO(zc): ugly, use actual value to compute hash value - const StringRef* string_value = reinterpret_cast(v); - int len = 0; - while (len < string_value->size) { - if (string_value->data[len] == '\0') { - break; - } - len++; - } - return HashUtil::zlib_crc_hash(string_value->data, len, seed); - } - case TYPE_BOOLEAN: - case TYPE_TINYINT: - return HashUtil::zlib_crc_hash(v, 1, seed); - case TYPE_SMALLINT: - return HashUtil::zlib_crc_hash(v, 2, seed); - case TYPE_INT: - case TYPE_DATEV2: - case TYPE_DECIMAL32: - return HashUtil::zlib_crc_hash(v, 4, seed); - case TYPE_BIGINT: - case TYPE_DATETIMEV2: - case TYPE_DECIMAL64: - return HashUtil::zlib_crc_hash(v, 8, seed); - case TYPE_LARGEINT: - case TYPE_DECIMAL128I: - return HashUtil::zlib_crc_hash(v, 16, seed); - case TYPE_FLOAT: - return HashUtil::zlib_crc_hash(v, 4, seed); - case TYPE_DOUBLE: - return HashUtil::zlib_crc_hash(v, 8, seed); - case TYPE_DATE: - case TYPE_DATETIME: { - const DateTimeValue* date_val = (const DateTimeValue*)v; - char buf[64]; - int len = date_val->to_buffer(buf); - return HashUtil::zlib_crc_hash(buf, len, seed); - } - - case TYPE_DECIMALV2: { - const DecimalV2Value* dec_val = (const DecimalV2Value*)v; - int64_t int_val = dec_val->int_value(); - int32_t frac_val = dec_val->frac_value(); - seed = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), seed); - return HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), seed); - } - default: - DCHECK(false) << "invalid type: " << type; - return 0; - } -} - -// NOTE: this is just for split data, decimal use old doris hash function -// Because crc32 hardware is not equal with zlib crc32 -inline uint32_t RawValue::zlib_crc32(const void* v, size_t len, const TypeDescriptor& type, +inline uint32_t RawValue::zlib_crc32(const void* v, size_t len, const PrimitiveType& type, uint32_t seed) { // Hash_combine with v = 0 if (v == nullptr) { @@ -228,7 +51,7 @@ inline uint32_t RawValue::zlib_crc32(const void* v, size_t len, const TypeDescri return seed ^ (value + (seed << 6) + (seed >> 2)); } - switch (type.type) { + switch (type) { case TYPE_VARCHAR: case TYPE_HLL: case TYPE_STRING: diff --git a/be/src/runtime/result_buffer_mgr.cpp b/be/src/runtime/result_buffer_mgr.cpp index 19fb2522b87c2f..02229efe23726d 100644 --- a/be/src/runtime/result_buffer_mgr.cpp +++ b/be/src/runtime/result_buffer_mgr.cpp @@ -22,7 +22,6 @@ #include "gen_cpp/PaloInternalService_types.h" #include "gen_cpp/types.pb.h" #include "runtime/buffer_control_block.h" -#include "runtime/raw_value.h" #include "util/doris_metrics.h" namespace doris { diff --git a/be/src/runtime/result_queue_mgr.h b/be/src/runtime/result_queue_mgr.h index 088ad787d3f7c3..28dcabfaa54f6d 100644 --- a/be/src/runtime/result_queue_mgr.h +++ b/be/src/runtime/result_queue_mgr.h @@ -24,7 +24,6 @@ #include "common/status.h" #include "runtime/primitive_type.h" -#include "runtime/raw_value.h" #include "runtime/record_batch_queue.h" #include "util/hash_util.hpp" diff --git a/be/src/runtime/result_writer.h b/be/src/runtime/result_writer.h index 1d0bfde5895c59..bd5a11cc8f36d5 100644 --- a/be/src/runtime/result_writer.h +++ b/be/src/runtime/result_writer.h @@ -24,7 +24,6 @@ namespace doris { class Status; class RuntimeState; -struct TypeDescriptor; namespace vectorized { class Block; diff --git a/be/src/runtime/struct_value.h b/be/src/runtime/struct_value.h index ec243d729c3cf2..d02ddc994caafa 100644 --- a/be/src/runtime/struct_value.h +++ b/be/src/runtime/struct_value.h @@ -45,8 +45,6 @@ class StructValue { void shallow_copy(const StructValue* other); - // size_t get_byte_size(const TypeDescriptor& type) const; - const void** values() const { return const_cast(_values); } void** mutable_values() { return _values; } void set_values(void** values) { _values = values; } diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt index 71ff0e089ffcc9..23e14e3b4c8d2a 100644 --- a/be/src/util/CMakeLists.txt +++ b/be/src/util/CMakeLists.txt @@ -23,7 +23,6 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/util") set(UTIL_FILES arrow/row_batch.cpp - arrow/row_block.cpp arrow/utils.cpp arrow/block_convertor.cpp bfd_parser.cpp diff --git a/be/src/util/arrow/row_block.cpp b/be/src/util/arrow/row_block.cpp deleted file mode 100644 index a7758707fbe5a0..00000000000000 --- a/be/src/util/arrow/row_block.cpp +++ /dev/null @@ -1,127 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "util/arrow/row_block.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gutil/strings/substitute.h" -#include "olap/field.h" -#include "olap/olap_common.h" -#include "olap/schema.h" -#include "olap/tablet_schema.h" -#include "util/arrow/utils.h" - -namespace doris { - -using strings::Substitute; - -Status convert_to_arrow_type(FieldType type, std::shared_ptr* result) { - switch (type) { - case OLAP_FIELD_TYPE_TINYINT: - *result = arrow::int8(); - break; - case OLAP_FIELD_TYPE_SMALLINT: - *result = arrow::int16(); - break; - case OLAP_FIELD_TYPE_INT: - *result = arrow::int32(); - break; - case OLAP_FIELD_TYPE_BIGINT: - *result = arrow::int64(); - break; - case OLAP_FIELD_TYPE_FLOAT: - *result = arrow::float32(); - break; - case OLAP_FIELD_TYPE_DOUBLE: - *result = arrow::float64(); - break; - default: - return Status::InvalidArgument("Unknown FieldType({})", type); - } - return Status::OK(); -} - -Status convert_to_arrow_field(uint32_t cid, const Field* field, - std::shared_ptr* result) { - std::shared_ptr type; - RETURN_IF_ERROR(convert_to_arrow_type(field->type(), &type)); - *result = arrow::field(strings::Substitute("Col$0", cid), type, field->is_nullable()); - return Status::OK(); -} - -Status convert_to_arrow_schema(const Schema& schema, std::shared_ptr* result) { - std::vector> fields; - size_t num_fields = schema.num_column_ids(); - fields.resize(num_fields); - for (int i = 0; i < num_fields; ++i) { - auto cid = schema.column_ids()[i]; - RETURN_IF_ERROR(convert_to_arrow_field(cid, schema.column(cid), &fields[i])); - } - *result = arrow::schema(std::move(fields)); - return Status::OK(); -} - -Status convert_to_type_name(const arrow::DataType& type, std::string* name) { - switch (type.id()) { - case arrow::Type::INT8: - *name = "TINYINT"; - break; - case arrow::Type::INT16: - *name = "SMALLINT"; - break; - case arrow::Type::INT32: - *name = "INT"; - break; - case arrow::Type::INT64: - *name = "BIGINT"; - break; - case arrow::Type::FLOAT: - *name = "FLOAT"; - break; - case arrow::Type::DOUBLE: - *name = "DOUBLE"; - break; - default: - return Status::InvalidArgument("Unknown arrow type id({})", type.id()); - } - return Status::OK(); -} - -Status convert_to_tablet_column(const arrow::Field& field, int32_t cid, TabletColumn* output) { - ColumnPB column_pb; - std::string type_name; - RETURN_IF_ERROR(convert_to_type_name(*field.type(), &type_name)); - - column_pb.set_unique_id(cid); - column_pb.set_name(field.name()); - column_pb.set_type(type_name); - column_pb.set_is_key(true); - column_pb.set_is_nullable(field.nullable()); - - output->init_from_pb(column_pb); - return Status::OK(); -} - -} // namespace doris diff --git a/be/src/util/arrow/row_block.h b/be/src/util/arrow/row_block.h deleted file mode 100644 index afdecdd52a7a85..00000000000000 --- a/be/src/util/arrow/row_block.h +++ /dev/null @@ -1,38 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "common/status.h" - -namespace arrow { - -class Schema; -class MemoryPool; -class RecordBatch; - -} // namespace arrow - -namespace doris { -class Schema; - -// Convert Doris Schema to Arrow Schema. -Status convert_to_arrow_schema(const Schema& row_desc, std::shared_ptr* result); - -} // namespace doris diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp index 726ad50f92d1c3..ecc0f04a191cb9 100644 --- a/be/src/util/hash_util.hpp +++ b/be/src/util/hash_util.hpp @@ -37,6 +37,7 @@ #include #include "gen_cpp/Types_types.h" +#include "runtime/define_primitive_type.h" #include "util/cpu_info.h" #include "util/murmur_hash3.h" diff --git a/be/src/vec/columns/column_const.cpp b/be/src/vec/columns/column_const.cpp index 8d79ec32f91d14..7b7fc4d753160c 100644 --- a/be/src/vec/columns/column_const.cpp +++ b/be/src/vec/columns/column_const.cpp @@ -131,8 +131,7 @@ void ColumnConst::update_crcs_with_value(std::vector& hashes, doris::P } } else { for (int i = 0; i < hashes.size(); ++i) { - hashes[i] = RawValue::zlib_crc32(real_data.data, real_data.size, TypeDescriptor {type}, - hashes[i]); + hashes[i] = RawValue::zlib_crc32(real_data.data, real_data.size, type, hashes[i]); } } } diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp index a5534f8075e959..ebd7f3726d7957 100644 --- a/be/src/vec/core/block.cpp +++ b/be/src/vec/core/block.cpp @@ -825,17 +825,6 @@ Status Block::serialize(int be_exec_version, PBlock* pblock, return Status::OK(); } -inline bool Block::is_column_data_null(const doris::TypeDescriptor& type_desc, - const StringRef& data_ref, const IColumn* column, int row) { - if (type_desc.type != TYPE_ARRAY) { - return data_ref.data == nullptr; - } else { - Field array; - column->get(row, array); - return array.is_null(); - } -} - MutableBlock::MutableBlock(const std::vector& tuple_descs, int reserve_size, bool ignore_trivial_slot) { for (auto tuple_desc : tuple_descs) { diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h index 35026b846d9e61..eef530659ccf44 100644 --- a/be/src/vec/core/block.h +++ b/be/src/vec/core/block.h @@ -42,7 +42,6 @@ namespace doris { class RowDescriptor; class Status; class TupleDescriptor; -struct TypeDescriptor; namespace vectorized { @@ -371,8 +370,6 @@ class Block { private: void erase_impl(size_t position); - bool is_column_data_null(const doris::TypeDescriptor& type_desc, const StringRef& data_ref, - const IColumn* column_with_type_and_name, int row); }; using Blocks = std::vector; diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index c9cade7ec86482..7ab2f5bc50c898 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -28,7 +28,6 @@ #include "io/cache/block/block_file_cache_profile.h" #include "olap/iterators.h" #include "runtime/descriptors.h" -#include "runtime/raw_value.h" #include "runtime/runtime_state.h" #include "vec/exec/format/csv/csv_reader.h" #include "vec/exec/format/json/new_json_reader.h" diff --git a/be/src/vec/runtime/vdata_stream_mgr.cpp b/be/src/vec/runtime/vdata_stream_mgr.cpp index 282d641431fc22..09c9cd77c71548 100644 --- a/be/src/vec/runtime/vdata_stream_mgr.cpp +++ b/be/src/vec/runtime/vdata_stream_mgr.cpp @@ -20,7 +20,6 @@ #include "gen_cpp/internal_service.pb.h" #include "runtime/descriptors.h" #include "runtime/primitive_type.h" -#include "runtime/raw_value.h" #include "runtime/runtime_state.h" #include "util/doris_metrics.h" #include "util/runtime_profile.h" @@ -39,9 +38,9 @@ VDataStreamMgr::~VDataStreamMgr() { inline uint32_t VDataStreamMgr::get_hash_value(const TUniqueId& fragment_instance_id, PlanNodeId node_id) { - uint32_t value = RawValue::get_hash_value(&fragment_instance_id.lo, TYPE_BIGINT, 0); - value = RawValue::get_hash_value(&fragment_instance_id.hi, TYPE_BIGINT, value); - value = RawValue::get_hash_value(&node_id, TYPE_INT, value); + uint32_t value = HashUtil::hash(&fragment_instance_id.lo, 8, 0); + value = HashUtil::hash(&fragment_instance_id.hi, 8, value); + value = HashUtil::hash(&node_id, 4, value); return value; } diff --git a/be/src/vec/utils/arrow_column_to_doris_column.cpp b/be/src/vec/utils/arrow_column_to_doris_column.cpp index 35d5d4956a7290..c0d64df21161f3 100644 --- a/be/src/vec/utils/arrow_column_to_doris_column.cpp +++ b/be/src/vec/utils/arrow_column_to_doris_column.cpp @@ -408,59 +408,4 @@ Status arrow_column_to_doris_column(const arrow::Array* arrow_column, size_t arr fmt::format("Not support arrow type:{}", arrow_column->type()->name())); } -Status arrow_type_to_doris_type(arrow::Type::type type, TypeDescriptor* return_type) { - switch (type) { - case arrow::Type::STRING: - case arrow::Type::BINARY: - case arrow::Type::FIXED_SIZE_BINARY: - return_type->type = TYPE_STRING; - break; - case arrow::Type::INT8: - return_type->type = TYPE_TINYINT; - break; - case arrow::Type::UINT8: - case arrow::Type::INT16: - return_type->type = TYPE_SMALLINT; - break; - case arrow::Type::UINT16: - case arrow::Type::INT32: - return_type->type = TYPE_INT; - break; - case arrow::Type::UINT32: - case arrow::Type::INT64: - return_type->type = TYPE_BIGINT; - break; - case arrow::Type::UINT64: - return_type->type = TYPE_LARGEINT; - break; - case arrow::Type::HALF_FLOAT: - case arrow::Type::FLOAT: - return_type->type = TYPE_FLOAT; - break; - case arrow::Type::DOUBLE: - return_type->type = TYPE_DOUBLE; - break; - case arrow::Type::BOOL: - return_type->type = TYPE_BOOLEAN; - break; - case arrow::Type::DATE32: - return_type->type = TYPE_DATEV2; - break; - case arrow::Type::DATE64: - return_type->type = TYPE_DATETIMEV2; - break; - case arrow::Type::TIMESTAMP: - return_type->type = TYPE_BIGINT; - break; - case arrow::Type::DECIMAL: - return_type->type = TYPE_DECIMALV2; - return_type->precision = 27; - return_type->scale = 9; - break; - default: - return Status::InternalError("unsupport type: {}", type); - } - return Status::OK(); -} - } // namespace doris::vectorized diff --git a/be/src/vec/utils/arrow_column_to_doris_column.h b/be/src/vec/utils/arrow_column_to_doris_column.h index 84e26a70117673..73704c95f5da7a 100644 --- a/be/src/vec/utils/arrow_column_to_doris_column.h +++ b/be/src/vec/utils/arrow_column_to_doris_column.h @@ -42,6 +42,4 @@ Status arrow_column_to_doris_column(const arrow::Array* arrow_column, size_t arr ColumnPtr& doris_column, const DataTypePtr& type, size_t num_elements, const cctz::time_zone& ctz); -Status arrow_type_to_doris_type(arrow::Type::type type, TypeDescriptor* return_type); - } // namespace doris::vectorized diff --git a/be/test/olap/row_cursor_test.cpp b/be/test/olap/row_cursor_test.cpp index 5899f557f3a11a..9ee6ae7943d4e2 100644 --- a/be/test/olap/row_cursor_test.cpp +++ b/be/test/olap/row_cursor_test.cpp @@ -286,8 +286,6 @@ TEST_F(TestRowCursor, InitRowCursorWithColumnCount) { EXPECT_EQ(res, Status::OK()); EXPECT_EQ(row.get_fixed_len(), 23); EXPECT_EQ(row.get_variable_len(), 0); - row.allocate_memory_for_string_type(tablet_schema); - EXPECT_EQ(row.get_variable_len(), 0); } TEST_F(TestRowCursor, InitRowCursorWithColIds) { diff --git a/be/test/vec/exec/parquet/parquet_thrift_test.cpp b/be/test/vec/exec/parquet/parquet_thrift_test.cpp index 94265054890809..484d5adbd6c9a1 100644 --- a/be/test/vec/exec/parquet/parquet_thrift_test.cpp +++ b/be/test/vec/exec/parquet/parquet_thrift_test.cpp @@ -26,6 +26,7 @@ #include "io/buffered_reader.h" #include "io/fs/local_file_system.h" #include "olap/iterators.h" +#include "runtime/descriptors.h" #include "util/runtime_profile.h" #include "util/timezone_utils.h" #include "vec/common/string_ref.h" @@ -226,6 +227,74 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column } } +// Only the unit test depend on this, but it is wrong, should not use TTupleDesc to create tuple desc, not +// use columndesc +static doris::TupleDescriptor* create_tuple_desc( + doris::ObjectPool* pool, std::vector& column_descs) { + using namespace doris; + int null_column = 0; + for (int i = 0; i < column_descs.size(); ++i) { + if (column_descs[i].is_null) { + null_column++; + } + } + + int offset = (null_column + 7) / 8; + std::vector slots; + int null_byte = 0; + int null_bit = 0; + + for (int i = 0; i < column_descs.size(); ++i) { + TSlotDescriptor t_slot_desc; + if (column_descs[i].type == TYPE_DECIMALV2) { + t_slot_desc.__set_slotType(TypeDescriptor::create_decimalv2_type(27, 9).to_thrift()); + } else { + TypeDescriptor descriptor(column_descs[i].type); + if (column_descs[i].precision >= 0 && column_descs[i].scale >= 0) { + descriptor.precision = column_descs[i].precision; + descriptor.scale = column_descs[i].scale; + } + t_slot_desc.__set_slotType(descriptor.to_thrift()); + } + t_slot_desc.__set_colName(column_descs[i].name); + t_slot_desc.__set_columnPos(i); + t_slot_desc.__set_byteOffset(offset); + + if (column_descs[i].is_null) { + t_slot_desc.__set_nullIndicatorByte(null_byte); + t_slot_desc.__set_nullIndicatorBit(null_bit); + null_bit = (null_bit + 1) % 8; + + if (0 == null_bit) { + null_byte++; + } + } else { + t_slot_desc.__set_nullIndicatorByte(0); + t_slot_desc.__set_nullIndicatorBit(-1); + } + + t_slot_desc.id = i; + t_slot_desc.__set_slotIdx(i); + t_slot_desc.__set_isMaterialized(true); + + SlotDescriptor* slot = pool->add(new (std::nothrow) SlotDescriptor(t_slot_desc)); + slots.push_back(slot); + offset += column_descs[i].size; + } + + TTupleDescriptor t_tuple_desc; + t_tuple_desc.__set_byteSize(offset); + t_tuple_desc.__set_numNullBytes((null_byte * 8 + null_bit + 7) / 8); + doris::TupleDescriptor* tuple_desc = + pool->add(new (std::nothrow) doris::TupleDescriptor(t_tuple_desc)); + + for (int i = 0; i < slots.size(); ++i) { + tuple_desc->add_slot(slots[i]); + } + + return tuple_desc; +} + static void create_block(std::unique_ptr& block) { // Current supported column type: std::vector column_descs = { @@ -247,11 +316,9 @@ static void create_block(std::unique_ptr& block) { {"date_col", TYPE_DATE, sizeof(DateTimeValue), true}, {"date_v2_col", TYPE_DATEV2, sizeof(uint32_t), true}, {"timestamp_v2_col", TYPE_DATETIMEV2, sizeof(DateTimeValue), true, 18, 0}}; - SchemaScanner schema_scanner(column_descs); ObjectPool object_pool; - SchemaScannerParam param; - schema_scanner.init(¶m, &object_pool); - auto tuple_slots = const_cast(schema_scanner.tuple_desc())->slots(); + doris::TupleDescriptor* tuple_desc = create_tuple_desc(&object_pool, column_descs); + auto tuple_slots = tuple_desc->slots(); block.reset(new vectorized::Block()); for (const auto& slot_desc : tuple_slots) { auto data_type = slot_desc->get_data_type_ptr(); @@ -347,7 +414,7 @@ TEST_F(ParquetThriftReaderTest, dict_decoder) { } TEST_F(ParquetThriftReaderTest, group_reader) { - std::vector column_descs = { + std::vector column_descs = { {"tinyint_col", TYPE_TINYINT, sizeof(int8_t), true}, {"smallint_col", TYPE_SMALLINT, sizeof(int16_t), true}, {"int_col", TYPE_INT, sizeof(int32_t), true}, @@ -362,11 +429,9 @@ TEST_F(ParquetThriftReaderTest, group_reader) { {"char_col", TYPE_CHAR, sizeof(StringRef), true}, {"varchar_col", TYPE_VARCHAR, sizeof(StringRef), true}, {"date_col", TYPE_DATE, sizeof(DateTimeValue), true}}; - SchemaScanner schema_scanner(column_descs); ObjectPool object_pool; - SchemaScannerParam param; - schema_scanner.init(¶m, &object_pool); - auto tuple_slots = const_cast(schema_scanner.tuple_desc())->slots(); + doris::TupleDescriptor* tuple_desc = create_tuple_desc(&object_pool, column_descs); + auto tuple_slots = tuple_desc->slots(); TSlotDescriptor tslot_desc; { diff --git a/be/test/vec/exprs/vexpr_test.cpp b/be/test/vec/exprs/vexpr_test.cpp index 0e91ba42e39731..45b7ca502545e6 100644 --- a/be/test/vec/exprs/vexpr_test.cpp +++ b/be/test/vec/exprs/vexpr_test.cpp @@ -26,6 +26,7 @@ #include "exec/schema_scanner.h" #include "gen_cpp/Exprs_types.h" #include "gen_cpp/Types_types.h" +#include "runtime/descriptors.h" #include "runtime/exec_env.h" #include "runtime/large_int_value.h" #include "runtime/memory/chunk_allocator.h" @@ -63,15 +64,80 @@ TEST(TEST_VEXPR, ABSTEST) { context->close(&runtime_stat); } +// Only the unit test depend on this, but it is wrong, should not use TTupleDesc to create tuple desc, not +// use columndesc +static doris::TupleDescriptor* create_tuple_desc( + doris::ObjectPool* pool, std::vector& column_descs) { + using namespace doris; + int null_column = 0; + for (int i = 0; i < column_descs.size(); ++i) { + if (column_descs[i].is_null) { + null_column++; + } + } + + int offset = (null_column + 7) / 8; + std::vector slots; + int null_byte = 0; + int null_bit = 0; + + for (int i = 0; i < column_descs.size(); ++i) { + TSlotDescriptor t_slot_desc; + if (column_descs[i].type == TYPE_DECIMALV2) { + t_slot_desc.__set_slotType(TypeDescriptor::create_decimalv2_type(27, 9).to_thrift()); + } else { + TypeDescriptor descriptor(column_descs[i].type); + if (column_descs[i].precision >= 0 && column_descs[i].scale >= 0) { + descriptor.precision = column_descs[i].precision; + descriptor.scale = column_descs[i].scale; + } + t_slot_desc.__set_slotType(descriptor.to_thrift()); + } + t_slot_desc.__set_colName(column_descs[i].name); + t_slot_desc.__set_columnPos(i); + t_slot_desc.__set_byteOffset(offset); + + if (column_descs[i].is_null) { + t_slot_desc.__set_nullIndicatorByte(null_byte); + t_slot_desc.__set_nullIndicatorBit(null_bit); + null_bit = (null_bit + 1) % 8; + + if (0 == null_bit) { + null_byte++; + } + } else { + t_slot_desc.__set_nullIndicatorByte(0); + t_slot_desc.__set_nullIndicatorBit(-1); + } + + t_slot_desc.id = i; + t_slot_desc.__set_slotIdx(i); + t_slot_desc.__set_isMaterialized(true); + + SlotDescriptor* slot = pool->add(new (std::nothrow) SlotDescriptor(t_slot_desc)); + slots.push_back(slot); + offset += column_descs[i].size; + } + + TTupleDescriptor t_tuple_desc; + t_tuple_desc.__set_byteSize(offset); + t_tuple_desc.__set_numNullBytes((null_byte * 8 + null_bit + 7) / 8); + doris::TupleDescriptor* tuple_desc = + pool->add(new (std::nothrow) doris::TupleDescriptor(t_tuple_desc)); + + for (int i = 0; i < slots.size(); ++i) { + tuple_desc->add_slot(slots[i]); + } + + return tuple_desc; +} + TEST(TEST_VEXPR, ABSTEST2) { using namespace doris; - std::vector column_descs = { + std::vector column_descs = { {"k1", TYPE_INT, sizeof(int32_t), false}}; - SchemaScanner schema_scanner(column_descs); ObjectPool object_pool; - SchemaScannerParam param; - schema_scanner.init(¶m, &object_pool); - auto tuple_desc = const_cast(schema_scanner.tuple_desc()); + doris::TupleDescriptor* tuple_desc = create_tuple_desc(&object_pool, column_descs); RowDescriptor row_desc(tuple_desc, false); std::string expr_json = R"|({"1":{"lst":["rec",2,{"1":{"i32":20},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":6}}}}]}}},"4":{"i32":1},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"abs"}}},"2":{"i32":0},"3":{"lst":["rec",1,{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":5}}}}]}}]},"4":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":6}}}}]}}},"5":{"tf":0},"7":{"str":"abs(INT)"},"9":{"rec":{"1":{"str":"_ZN5doris13MathFunctions3absEPN9doris_udf15FunctionContextERKNS1_6IntValE"}}},"11":{"i64":0}}}},{"1":{"i32":16},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":5}}}}]}}},"4":{"i32":0},"15":{"rec":{"1":{"i32":0},"2":{"i32":0}}},"20":{"i32":-1},"23":{"i32":-1}}]}})|";