From 9c9249e9112a79963ab1f29af15f264bc0b2400a Mon Sep 17 00:00:00 2001 From: qiye Date: Tue, 19 Dec 2023 18:54:36 +0800 Subject: [PATCH] =?UTF-8?q?[feature](inverted=20index)=20add=20ignore=5Fab?= =?UTF-8?q?ove=20property=20to=20prevent=20long=20s=E2=80=A6=20(#28585)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a string is too long, CLucene will throw an error, and such a string is too long to analyze anyway. So, by default, we ignore the string during indexing when it is longer than 256 bytes. We add a property `ignore_above` for users to customize this limit. --- be/src/olap/inverted_index_parser.cpp | 9 +++++++ be/src/olap/inverted_index_parser.h | 7 ++++++ .../segment_v2/inverted_index_writer.cpp | 24 ++++++++++++++++--- .../docs/data-table/index/inverted-index.md | 3 +++ .../docs/data-table/index/inverted-index.md | 3 +++ .../doris/analysis/InvertedIndexUtil.java | 13 ++++++++++ .../ddl/large_records_t1_dk.sql | 4 ++-- .../ddl/large_records_t1_uk.sql | 4 ++-- .../ddl/large_records_t2_dk.sql | 4 ++-- .../ddl/large_records_t2_uk.sql | 4 ++-- .../ddl/large_records_t3_dk.sql | 4 ++-- .../ddl/large_records_t3_uk.sql | 4 ++-- .../ddl/large_records_t4_dk.sql | 4 ++-- .../ddl/large_records_t4_uk.sql | 4 ++-- 14 files changed, 72 insertions(+), 19 deletions(-) diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index 5678a217b537f6..3d498ff5382518 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -119,4 +119,13 @@ CharFilterMap get_parser_char_filter_map_from_properties( return char_filter_map; } +std::string get_parser_ignore_above_value_from_properties( + const std::map& properties) { + if (properties.find(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY) != properties.end()) { + return properties.at(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY); + } else { + return INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE; + } +} + } // namespace doris diff --git
a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index bf931a3ce4773d..ca1efe773af558 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -69,6 +69,9 @@ const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type"; const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern"; const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement"; +const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY = "ignore_above"; +const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256"; + std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type); InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str); @@ -82,4 +85,8 @@ std::string get_parser_phrase_support_string_from_properties( CharFilterMap get_parser_char_filter_map_from_properties( const std::map& properties); +// get parser ignore_above value from properties +std::string get_parser_ignore_above_value_from_properties( + const std::map& properties); + } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 0724559895746b..d397910891fb52 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -285,7 +285,16 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { } auto* v = (Slice*)values; for (int i = 0; i < count; ++i) { - new_fulltext_field(v->get_data(), v->get_size()); + auto ignore_above_value = + get_parser_ignore_above_value_from_properties(_index_meta->properties()); + auto ignore_above = std::stoi(ignore_above_value); + if (v->get_size() > ignore_above) { + VLOG_DEBUG << "fulltext index value length can be at most 256, but got " + << "value length:" << v->get_size() << ", ignore this value"; + 
new_fulltext_field(empty_value.c_str(), 0); + } else { + new_fulltext_field(v->get_data(), v->get_size()); + } RETURN_IF_ERROR(add_document()); ++v; _rid++; @@ -325,9 +334,18 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { } auto value = join(strings, " "); - new_fulltext_field(value.c_str(), value.length()); + auto ignore_above_value = + get_parser_ignore_above_value_from_properties(_index_meta->properties()); + auto ignore_above = std::stoi(ignore_above_value); + if (value.length() > ignore_above) { + VLOG_DEBUG << "fulltext index value length can be at most 256, but got " + << "value length:" << value.length() << ", ignore this value"; + new_fulltext_field(empty_value.c_str(), 0); + } else { + new_fulltext_field(value.c_str(), value.length()); + } _rid++; - _index_writer->addDocument(_doc.get()); + RETURN_IF_ERROR(add_document()); } } else if constexpr (field_is_numeric_type(field_type)) { for (int i = 0; i < count; ++i) { diff --git a/docs/en/docs/data-table/index/inverted-index.md b/docs/en/docs/data-table/index/inverted-index.md index f86d47c8bbe167..f10b543807c13a 100644 --- a/docs/en/docs/data-table/index/inverted-index.md +++ b/docs/en/docs/data-table/index/inverted-index.md @@ -89,6 +89,9 @@ The features for inverted index is as follows: - char_replace: replace each char in the pattern with a char in the replacement - char_filter_pattern: character array to be replaced - char_filter_replacement: replaced character array, can be left unset, defaults to a space character + - ignore_above: Controls whether strings are indexed. + - Strings longer than the ignore_above setting will not be indexed. For arrays of strings, ignore_above will be applied for each array element separately and string elements longer than ignore_above will not be indexed. + - default value is 256 bytes. 
- COMMENT is optional ```sql diff --git a/docs/zh-CN/docs/data-table/index/inverted-index.md b/docs/zh-CN/docs/data-table/index/inverted-index.md index ad4c9a011d989e..e3cba26ed8f4ab 100644 --- a/docs/zh-CN/docs/data-table/index/inverted-index.md +++ b/docs/zh-CN/docs/data-table/index/inverted-index.md @@ -87,6 +87,9 @@ Doris倒排索引的功能简要介绍如下: - char_replace 将pattern中每个char替换为一个replacement中的char - char_filter_pattern:需要被替换掉的字符数组 - char_filter_replacement:替换后的字符数组,可以不用配置,默认为一个空格字符 + - ignore_above:控制字符串是否建索引。 + - 长度超过 ignore_above 设置的字符串不会被索引。对于字符串数组,ignore_above 将分别应用于每个数组元素,长度超过 ignore_above 的字符串元素将不被索引。 + - 默认为 256 字节 - COMMENT 是可选的,用于指定注释 ```sql diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index e6fcefb7e010b3..daeecede096aaa 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -43,6 +43,8 @@ public class InvertedIndexUtil { public static String INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace"; + public static String INVERTED_INDEX_PARSER_IGNORE_ABOVE = "ignore_above"; + public static String getInvertedIndexParser(Map properties) { String parser = properties == null ? 
null : properties.get(INVERTED_INDEX_PARSER_KEY); // default is "none" if not set @@ -98,6 +100,17 @@ public static void checkInvertedIndexParser(String indexColName, PrimitiveType c if (parser == null && !properties.isEmpty()) { throw new AnalysisException("invalid index properties, please check the properties"); } + String ignoreAbove = properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE); + if (ignoreAbove != null) { + try { + int ignoreAboveValue = Integer.parseInt(ignoreAbove); + if (ignoreAboveValue <= 0) { + throw new AnalysisException("invalid index properties, ignore_above must be positive"); + } + } catch (NumberFormatException e) { + throw new AnalysisException("invalid index properties, ignore_above must be integer"); + } + } } // default is "none" if not set diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t1_dk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t1_dk.sql index e15884d8c87c62..57164f24d643b2 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t1_dk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t1_dk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t1_dk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) DUPLICATE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3 diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t1_uk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t1_uk.sql index 4b921139046980..d4649382337ac7 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t1_uk.sql +++ 
b/regression-test/suites/mysql_fulltext/ddl/large_records_t1_uk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t1_uk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) UNIQUE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3 diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t2_dk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t2_dk.sql index 2a8954609d52bf..8974b5de553491 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t2_dk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t2_dk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t2_dk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) DUPLICATE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3 diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t2_uk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t2_uk.sql index 733c398ccc9247..2761ae00c57444 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t2_uk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t2_uk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t2_uk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED 
PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) UNIQUE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3 diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t3_dk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t3_dk.sql index 03e3099aed4561..4be7d0bbaa8321 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t3_dk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t3_dk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t3_dk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) DUPLICATE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3 diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t3_uk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t3_uk.sql index e46c254da76f12..019a470786c1c2 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t3_uk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t3_uk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t3_uk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 
'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) UNIQUE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3 diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t4_dk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t4_dk.sql index 5faf2da04bbb4b..ddf83cd7ae6c10 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t4_dk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t4_dk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t4_dk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) DUPLICATE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3 diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t4_uk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t4_uk.sql index b594d5cd3cb2a4..3f4df358c23e27 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t4_uk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t4_uk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t4_uk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) UNIQUE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3