Skip to content

Commit 1afc39e

Browse files
nicola-cabjedelbo
andauthored
RCORE-2064 String EQ/NEQ optimisations for compressed strings (#7820)
* find_first optimization for compressed strings * core test passing * compression tests for collection of strings * code review * Fixes (#7872) --------- Co-authored-by: Jørgen Edelbo <jorgen.edelbo@mongodb.com>
1 parent ce6f196 commit 1afc39e

File tree

6 files changed

+135
-26
lines changed

6 files changed

+135
-26
lines changed

src/realm/array_string.cpp

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,14 @@ StringData ArrayString::get(size_t ndx) const
192192
return {};
193193
}
194194

195+
std::optional<StringID> realm::ArrayString::get_string_id(size_t ndx) const
196+
{
197+
if (m_type == Type::interned_strings) {
198+
return StringID(static_cast<Array*>(m_arr)->get(ndx));
199+
}
200+
return m_string_interner->lookup(get(ndx));
201+
}
202+
195203
Mixed ArrayString::get_any(size_t ndx) const
196204
{
197205
return Mixed(get(ndx));
@@ -274,6 +282,16 @@ void ArrayString::clear()
274282
}
275283

276284
size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const noexcept
285+
{
286+
// This should only be called if we don't have a string id for this particular array (aka no string interner)
287+
std::optional<StringID> id;
288+
if (m_type == Type::interned_strings)
289+
id = m_string_interner->lookup(value);
290+
291+
return find_first(value, begin, end, id);
292+
}
293+
294+
size_t ArrayString::find_first(StringData value, size_t begin, size_t end, std::optional<StringID> id) const noexcept
277295
{
278296
switch (m_type) {
279297
case Type::small_strings:
@@ -289,14 +307,13 @@ size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const
289307
break;
290308
}
291309
case Type::interned_strings: {
292-
// we need a way to avoid this lookup for each leaf array. The lookup must appear
293-
// higher up the call stack and passed down.
294-
auto id = m_string_interner->lookup(value);
295310
if (id) {
296311
return static_cast<Array*>(m_arr)->find_first(*id, begin, end);
297312
}
298313
break;
299314
}
315+
default:
316+
break;
300317
}
301318
return not_found;
302319
}

src/realm/array_string.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <realm/array_string_short.hpp>
2323
#include <realm/array_blobs_small.hpp>
2424
#include <realm/array_blobs_big.hpp>
25+
#include <realm/string_interner.hpp>
2526

2627
namespace realm {
2728

@@ -74,6 +75,10 @@ class ArrayString : public ArrayPayload {
7475
{
7576
m_string_interner = string_interner;
7677
}
78+
bool is_compressed() const
79+
{
80+
return m_type == Type::interned_strings;
81+
}
7782

7883
void update_parent()
7984
{
@@ -99,6 +104,7 @@ class ArrayString : public ArrayPayload {
99104
}
100105
void insert(size_t ndx, StringData value);
101106
StringData get(size_t ndx) const;
107+
std::optional<StringID> get_string_id(size_t ndx) const;
102108
Mixed get_any(size_t ndx) const override;
103109
bool is_null(size_t ndx) const;
104110
void erase(size_t ndx);
@@ -107,6 +113,9 @@ class ArrayString : public ArrayPayload {
107113

108114
size_t find_first(StringData value, size_t begin, size_t end) const noexcept;
109115

116+
/// Special version for searching in an array or compressed strings.
117+
size_t find_first(StringData value, size_t begin, size_t end, std::optional<StringID>) const noexcept;
118+
110119
size_t lower_bound(StringData value);
111120

112121
/// Get the specified element without the cost of constructing an

src/realm/query_engine.cpp

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,7 @@ bool StringNode<Equal>::do_consume_condition(ParentNode& node)
453453
size_t StringNode<Equal>::_find_first_local(size_t start, size_t end)
454454
{
455455
if (m_needles.empty()) {
456-
return m_leaf->find_first(m_string_value, start, end);
456+
return m_leaf->find_first(m_string_value, start, end, m_interned_string_id);
457457
}
458458
else {
459459
if (end == npos)
@@ -505,7 +505,8 @@ size_t StringNode<EqualIns>::_find_first_local(size_t start, size_t end)
505505
}
506506

507507
StringNodeFulltext::StringNodeFulltext(StringData v, ColKey column, std::unique_ptr<LinkMap> lm)
508-
: StringNodeEqualBase(v, column)
508+
: m_value(v)
509+
, m_col(column)
509510
, m_link_map(std::move(lm))
510511
{
511512
if (!m_link_map)
@@ -518,17 +519,21 @@ void StringNodeFulltext::table_changed()
518519
}
519520

520521
StringNodeFulltext::StringNodeFulltext(const StringNodeFulltext& other)
521-
: StringNodeEqualBase(other)
522+
: ParentNode(other)
523+
, m_value(other.m_value)
524+
, m_col(other.m_col)
525+
, m_link_map(std::make_unique<LinkMap>(*other.m_link_map))
522526
{
523-
m_link_map = std::make_unique<LinkMap>(*other.m_link_map);
524527
}
525528

526-
void StringNodeFulltext::_search_index_init()
529+
void StringNodeFulltext::init(bool will_query_ranges)
527530
{
528-
StringIndex* index = m_link_map->get_target_table()->get_string_index(ParentNode::m_condition_column_key);
531+
ParentNode::init(will_query_ranges);
532+
533+
StringIndex* index = m_link_map->get_target_table()->get_string_index(m_col);
529534
REALM_ASSERT(index && index->is_fulltext_index());
530535
m_index_matches.clear();
531-
index->find_all_fulltext(m_index_matches, StringNodeBase::m_string_value);
536+
index->find_all_fulltext(m_index_matches, m_value);
532537

533538
// If links exists, use backlinks to find the original objects
534539
if (m_link_map->links_exist()) {
@@ -541,7 +546,7 @@ void StringNodeFulltext::_search_index_init()
541546
}
542547

543548
m_index_evaluator = IndexEvaluator{};
544-
m_index_evaluator->init(&m_index_matches);
549+
m_index_evaluator.init(&m_index_matches);
545550
}
546551

547552
std::unique_ptr<ArrayPayload> TwoColumnsNodeBase::update_cached_leaf_pointers_for_column(Allocator& alloc,

src/realm/query_engine.hpp

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,8 @@ class ParentNode {
151151
{
152152
m_dD = 100.0;
153153

154+
if (m_condition_column_key)
155+
m_table->check_column(m_condition_column_key);
154156
if (m_child)
155157
m_child->init(will_query_ranges);
156158
}
@@ -1647,6 +1649,11 @@ class StringNodeBase : public ParentNode {
16471649
m_dT = 10.0;
16481650
}
16491651

1652+
void table_changed() override
1653+
{
1654+
m_string_interner = m_table.unchecked_ptr()->get_string_interner(m_condition_column_key);
1655+
}
1656+
16501657
void cluster_changed() override
16511658
{
16521659
m_leaf.emplace(m_table.unchecked_ptr()->get_alloc());
@@ -1662,6 +1669,7 @@ class StringNodeBase : public ParentNode {
16621669
m_end_s = 0;
16631670
m_leaf_start = 0;
16641671
m_leaf_end = 0;
1672+
m_interned_string_id = m_string_interner->lookup(m_value);
16651673
}
16661674

16671675
virtual void clear_leaf_state()
@@ -1673,6 +1681,8 @@ class StringNodeBase : public ParentNode {
16731681
: ParentNode(from)
16741682
, m_value(from.m_value)
16751683
, m_string_value(m_value)
1684+
, m_string_interner(from.m_string_interner)
1685+
, m_interned_string_id(from.m_interned_string_id)
16761686
{
16771687
}
16781688

@@ -1687,6 +1697,8 @@ class StringNodeBase : public ParentNode {
16871697
std::optional<std::string> m_value;
16881698
std::optional<ArrayString> m_leaf;
16891699
StringData m_string_value;
1700+
StringInterner* m_string_interner = nullptr;
1701+
std::optional<StringID> m_interned_string_id;
16901702

16911703
size_t m_end_s = 0;
16921704
size_t m_leaf_start = 0;
@@ -1703,7 +1715,7 @@ template <class TConditionFunction>
17031715
class StringNode : public StringNodeBase {
17041716
public:
17051717
constexpr static bool case_sensitive_comparison =
1706-
is_any_v<TConditionFunction, Greater, GreaterEqual, Less, LessEqual>;
1718+
is_any_v<TConditionFunction, NotEqual, Greater, GreaterEqual, Less, LessEqual>;
17071719
StringNode(StringData v, ColKey column)
17081720
: StringNodeBase(v, column)
17091721
{
@@ -1732,8 +1744,21 @@ class StringNode : public StringNodeBase {
17321744
TConditionFunction cond;
17331745

17341746
for (size_t s = start; s < end; ++s) {
1747+
if constexpr (std::is_same_v<TConditionFunction, NotEqual>) {
1748+
if (m_leaf->is_compressed()) {
1749+
if (m_interned_string_id) {
1750+
// The search string has been interned, so there might be a match
1751+
// We can compare the string IDs directly
1752+
const auto id = m_leaf->get_string_id(s);
1753+
if (m_string_interner->compare(*m_interned_string_id, *id) == 0) {
1754+
// The value matched, so we continue to the next value
1755+
continue;
1756+
}
1757+
}
1758+
return s;
1759+
}
1760+
}
17351761
StringData t = get_string(s);
1736-
17371762
if constexpr (case_sensitive_comparison) {
17381763
// case insensitive not implemented for: >, >=, <, <=
17391764
if (cond(t, m_string_value))
@@ -2061,20 +2086,24 @@ class StringNode<EqualIns> : public StringNodeEqualBase {
20612086
size_t _find_first_local(size_t start, size_t end) override;
20622087
};
20632088

2064-
2065-
class StringNodeFulltext : public StringNodeEqualBase {
2089+
class StringNodeFulltext : public ParentNode {
20662090
public:
20672091
StringNodeFulltext(StringData v, ColKey column, std::unique_ptr<LinkMap> lm = {});
20682092

20692093
void table_changed() override;
20702094

2071-
void _search_index_init() override;
2095+
void init(bool will_query_ranges) override;
20722096

20732097
bool has_search_index() const override
20742098
{
20752099
return true; // it's a required precondition for fulltext queries
20762100
}
20772101

2102+
const IndexEvaluator* index_based_keys() override
2103+
{
2104+
return &m_index_evaluator;
2105+
}
2106+
20782107
std::unique_ptr<ParentNode> clone() const override
20792108
{
20802109
return std::unique_ptr<ParentNode>(new StringNodeFulltext(*this));
@@ -2086,13 +2115,16 @@ class StringNodeFulltext : public StringNodeEqualBase {
20862115
}
20872116

20882117
private:
2089-
std::vector<ObjKey> m_index_matches;
2118+
std::string m_value;
2119+
ColKey m_col;
20902120
std::unique_ptr<LinkMap> m_link_map;
2121+
IndexEvaluator m_index_evaluator;
2122+
std::vector<ObjKey> m_index_matches;
20912123
StringNodeFulltext(const StringNodeFulltext&);
20922124

2093-
size_t _find_first_local(size_t, size_t) override
2125+
size_t find_first_local(size_t start, size_t end) override
20942126
{
2095-
REALM_UNREACHABLE();
2127+
return m_index_evaluator.do_search_index(m_cluster, start, end);
20962128
}
20972129
};
20982130

src/realm/table.cpp

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1735,9 +1735,25 @@ ObjKey Table::find_first(ColKey col_key, T value) const
17351735
using LeafType = typename ColumnTypeTraits<T>::cluster_leaf_type;
17361736
LeafType leaf(get_alloc());
17371737

1738-
auto f = [&key, &col_key, &value, &leaf](const Cluster* cluster) {
1738+
// In case of a string column we can try to look up the StringID of the search string,
1739+
// and search for that in case the leaf is compressed.
1740+
std::optional<StringID> string_id;
1741+
if constexpr (std::is_same_v<T, StringData>) {
1742+
auto string_interner = get_string_interner(col_key);
1743+
REALM_ASSERT(string_interner != nullptr);
1744+
string_id = string_interner->lookup(value);
1745+
}
1746+
1747+
auto f = [&](const Cluster* cluster) {
17391748
cluster->init_leaf(col_key, &leaf);
1740-
size_t row = leaf.find_first(value, 0, cluster->node_size());
1749+
size_t row;
1750+
if constexpr (std::is_same_v<T, StringData>) {
1751+
row = leaf.find_first(value, 0, cluster->node_size(), string_id);
1752+
}
1753+
else {
1754+
row = leaf.find_first(value, 0, cluster->node_size());
1755+
}
1756+
17411757
if (row != realm::npos) {
17421758
key = cluster->get_real_key(row);
17431759
return IteratorControl::Stop;

test/test_query.cpp

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -330,10 +330,13 @@ columns or queries involved
330330
*/
331331

332332

333-
TEST(Query_NextGen_StringConditions)
333+
TEST_TYPES(Query_NextGen_StringConditions, std::true_type, std::false_type)
334334
{
335-
Group group;
336-
TableRef table1 = group.add_table("table1");
335+
SHARED_GROUP_TEST_PATH(path);
336+
337+
auto db = DB::create(make_in_realm_history(), path);
338+
auto wt = db->start_write();
339+
TableRef table1 = wt->add_table("table1");
337340
auto col_str1 = table1->add_column(type_String, "str1");
338341
auto col_str2 = table1->add_column(type_String, "str2");
339342

@@ -342,6 +345,11 @@ TEST(Query_NextGen_StringConditions)
342345
table1->create_object().set_all("!", "x").get_key();
343346
ObjKey key_1_2 = table1->create_object().set_all("bar", "r").get_key();
344347

348+
if (TEST_TYPE::value) {
349+
wt->commit_and_continue_as_read();
350+
wt->promote_to_write();
351+
}
352+
345353
ObjKey m;
346354
// Equal
347355
m = table1->column<String>(col_str1).equal("bar", false).find();
@@ -433,7 +441,7 @@ TEST(Query_NextGen_StringConditions)
433441
CHECK_EQUAL(m, null_key);
434442

435443
// Test various compare operations with null
436-
TableRef table2 = group.add_table("table2");
444+
TableRef table2 = wt->add_table("table2");
437445
auto col_str3 = table2->add_column(type_String, "str3", true);
438446

439447
ObjKey key_2_0 = table2->create_object().set(col_str3, "foo").get_key();
@@ -442,6 +450,11 @@ TEST(Query_NextGen_StringConditions)
442450
ObjKey key_2_3 = table2->create_object().set(col_str3, "bar").get_key();
443451
ObjKey key_2_4 = table2->create_object().set(col_str3, "").get_key();
444452

453+
if (TEST_TYPE::value) {
454+
wt->commit_and_continue_as_read();
455+
wt->promote_to_write();
456+
}
457+
445458
size_t cnt;
446459
cnt = table2->column<String>(col_str3).contains(StringData("")).count();
447460
CHECK_EQUAL(cnt, 4);
@@ -522,6 +535,12 @@ TEST(Query_NextGen_StringConditions)
522535
}
523536
};
524537

538+
// not equal
539+
check_results((table2->column<String>(col_str3) != StringData("")), {StringData(), "foo", "bar", "!"});
540+
check_results((table2->column<String>(col_str3) != StringData()), {"", "foo", "bar", "!"});
541+
check_results((table2->column<String>(col_str3) != StringData("foo")), {StringData(), "", "bar", "!"});
542+
check_results((table2->column<String>(col_str3) != StringData("barr")), {StringData(), "", "foo", "bar", "!"});
543+
525544
// greater
526545
check_results((table2->column<String>(col_str3) > StringData("")), {"foo", "bar", "!"});
527546
check_results((table2->column<String>(col_str3) > StringData("b")), {"foo", "bar"});
@@ -553,7 +572,7 @@ TEST(Query_NextGen_StringConditions)
553572
check_results((table2->column<String>(col_str3) <= StringData("barrrr")), {"bar", "", "!", StringData()});
554573
check_results((table2->column<String>(col_str3) <= StringData("z")), {"foo", "bar", "", "!", StringData()});
555574

556-
TableRef table3 = group.add_table(StringData("table3"));
575+
TableRef table3 = wt->add_table(StringData("table3"));
557576
auto col_link1 = table3->add_column(*table2, "link1");
558577

559578
table3->create_object().set(col_link1, key_2_0);
@@ -562,6 +581,11 @@ TEST(Query_NextGen_StringConditions)
562581
table3->create_object().set(col_link1, key_2_3);
563582
table3->create_object().set(col_link1, key_2_4);
564583

584+
if (TEST_TYPE::value) {
585+
wt->commit_and_continue_as_read();
586+
wt->promote_to_write();
587+
}
588+
565589
cnt = table3->link(col_link1).column<String>(col_str3).contains(StringData("")).count();
566590
CHECK_EQUAL(cnt, 4);
567591

@@ -638,8 +662,14 @@ TEST(Query_NextGen_StringConditions)
638662
"This is a long search string that does not contain the word being searched for!, "
639663
"This is a long search string that does not contain the word being searched for!, "
640664
"needle";
665+
641666
table2->create_object().set(col_str3, long_string).get_key();
642667

668+
if (TEST_TYPE::value) {
669+
wt->commit_and_continue_as_read();
670+
wt->promote_to_write();
671+
}
672+
643673
cnt = table2->column<String>(col_str3).contains(search_1, false).count();
644674
CHECK_EQUAL(cnt, 1);
645675

0 commit comments

Comments
 (0)