From 07e78807e6eb90c84d1d22559d971fd7d5dcfd11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Radwa=C5=84ski?= Date: Mon, 1 Nov 2021 15:31:59 +0100 Subject: [PATCH 1/5] range_tombstone_list: add lower_slice lower_slice returns the range tombstones which have end inside range [start, before). --- range_tombstone_list.cc | 19 +++++++++++++++++++ range_tombstone_list.hh | 9 +++++++-- utils/immutable-collection.hh | 1 + 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/range_tombstone_list.cc b/range_tombstone_list.cc index 9eeeb91d4e8f..ca4ea4b0e67d 100644 --- a/range_tombstone_list.cc +++ b/range_tombstone_list.cc @@ -345,6 +345,25 @@ range_tombstone_list::slice(const schema& s, position_in_partition_view start, p _tombstones.lower_bound(end, order_by_start{s})); } +range_tombstone_list::iterator_range +range_tombstone_list::lower_slice(const schema& s, bound_view start, position_in_partition_view before) const { + struct bv_order_by_end { + bound_view::compare less; + bv_order_by_end(const schema& s) : less(s) {} + bool operator()(bound_view v, const range_tombstone_entry& rt) const { return less(v, rt.end_bound()); } + bool operator()(const range_tombstone_entry& rt, bound_view v) const { return less(rt.end_bound(), v); } + }; + struct pos_order_by_end { + position_in_partition::less_compare less; + pos_order_by_end(const schema& s) : less(s) {} + bool operator()(position_in_partition_view v, const range_tombstone_entry& rt) const { return less(v, rt.end_position()); } + bool operator()(const range_tombstone_entry& rt, position_in_partition_view v) const { return less(rt.end_position(), v); } + }; + return boost::make_iterator_range( + _tombstones.lower_bound(start, bv_order_by_end{s}), + _tombstones.lower_bound(before, pos_order_by_end{s})); +} + range_tombstone_list::iterator_range range_tombstone_list::upper_slice(const schema& s, position_in_partition_view after, bound_view end) const { struct pos_order_by_start { diff --git 
a/range_tombstone_list.hh b/range_tombstone_list.hh index 74a169f5ed09..47b5289dc85c 100644 --- a/range_tombstone_list.hh +++ b/range_tombstone_list.hh @@ -230,11 +230,16 @@ public: tombstone search_tombstone_covering(const schema& s, const clustering_key_prefix& key) const; using iterator_range = boost::iterator_range; - // Returns range of tombstones which overlap with given range + // Returns range tombstones which overlap with given range iterator_range slice(const schema& s, const query::clustering_range&) const; // Returns range tombstones which overlap with [start, end) iterator_range slice(const schema& s, position_in_partition_view start, position_in_partition_view end) const; - iterator_range upper_slice(const schema& s, position_in_partition_view start, bound_view end) const; + + // Returns range tombstones with ends inside [start, before). + iterator_range lower_slice(const schema& s, bound_view start, position_in_partition_view before) const; + // Returns range tombstones with starts inside (after, end]. 
+ iterator_range upper_slice(const schema& s, position_in_partition_view after, bound_view end) const; + iterator erase(const_iterator, const_iterator); // Pops the first element and bans (in theory) further additions diff --git a/utils/immutable-collection.hh b/utils/immutable-collection.hh index e90746467535..189c84448da1 100644 --- a/utils/immutable-collection.hh +++ b/utils/immutable-collection.hh @@ -64,6 +64,7 @@ public: WRAP_METHOD(lower_bound) WRAP_METHOD(upper_bound) WRAP_METHOD(slice) + WRAP_METHOD(lower_slice) WRAP_METHOD(upper_slice) WRAP_CONST_METHOD(empty) From 94b263e356c6978a6133e481de41f3ec2a3526d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Radwa=C5=84ski?= Date: Mon, 1 Nov 2021 15:33:29 +0100 Subject: [PATCH 2/5] partition_snapshot_reader: fix obtaining rt_slice, if Reversing and _last_rts was set If Reversing and _last_rts was set, the created rt_slice still contained range tombstones between *_last_rts and (snapshot) clustering range end. This is wrong - the correct range is between (snapshot) clustering range begin and *_last_rts. 
--- partition_snapshot_reader.hh | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/partition_snapshot_reader.hh b/partition_snapshot_reader.hh index 2e0c6dca34c2..4a7be6520438 100644 --- a/partition_snapshot_reader.hh +++ b/partition_snapshot_reader.hh @@ -152,10 +152,15 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public } range_tombstone_list::iterator_range rt_slice = [&] () { + const auto& tombstones = v.partition().row_tombstones(); if (last_rts) { - return v.partition().row_tombstones().upper_slice(*_snapshot_schema, *last_rts, bound_view::from_range_end(ck_range_snapshot)); + if constexpr (Reversing) { + return tombstones.lower_slice(*_snapshot_schema, bound_view::from_range_start(ck_range_snapshot), *last_rts); + } else { + return tombstones.upper_slice(*_snapshot_schema, *last_rts, bound_view::from_range_end(ck_range_snapshot)); + } } else { - return v.partition().row_tombstones().slice(*_snapshot_schema, ck_range_snapshot); + return tombstones.slice(*_snapshot_schema, ck_range_snapshot); } }(); if (rt_slice.begin() != rt_slice.end()) { @@ -339,6 +344,9 @@ private: std::optional opt_reversed_range; std::optional _last_entry; + // When not Reversing, it's .position() of last emitted range tombstone. + // When Reversing, it's .position().reversed() of last emitted range tombstone, + // so that it is usable from functions expecting position in snapshot domain. std::optional _last_rts; mutation_fragment_opt _next_row; @@ -357,6 +365,18 @@ private: } } + // If `Reversing`, when we pop_range_tombstone(), a reversed rt is returned (the correct + // one in query clustering order). In order to save progress of reading from range_tombstone_list, + // we need to save the end position of rt (as it was stored in the list). This corresponds to + // the start position, with reversed bound weight. 
+ static position_in_partition rt_position_in_snapshot_order(const range_tombstone& rt) { + position_in_partition pos(rt.position()); + if constexpr (Reversing) { + pos = pos.reversed(); + } + return pos; + } + mutation_fragment_opt read_next() { // We use the names ck_range_snapshot and ck_range_query to denote clustering order. // ck_range_snapshot uses the snapshot order, while ck_range_query uses the @@ -374,7 +394,7 @@ private: auto mf = _reader.next_range_tombstone(ck_range_snapshot, ck_range_query, _last_entry, _last_rts, pos_view); if (mf) { - _last_rts = mf->as_range_tombstone().position(); + _last_rts = rt_position_in_snapshot_order(mf->as_range_tombstone()); return mf; } return std::exchange(_next_row, {}); @@ -382,7 +402,7 @@ private: _no_more_rows_in_current_range = true; auto mf = _reader.next_range_tombstone(ck_range_snapshot, ck_range_query, _last_entry, _last_rts, position_in_partition_view::for_range_end(ck_range_query)); if (mf) { - _last_rts = mf->as_range_tombstone().position(); + _last_rts = rt_position_in_snapshot_order(mf->as_range_tombstone()); } return mf; } From cac9ac512643019d0e6f54391f0a51bf77e29696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Radwa=C5=84ski?= Date: Mon, 1 Nov 2021 15:34:49 +0100 Subject: [PATCH 3/5] test: memtable: add full_compaction in background Add full compaction in test_memtable_with_many_versions_conforms_to_mutation_source in background. Without it, some paths in the partition snapshot reader weren't covered, as the tests always managed to read all range tombstones and rows which cover a given clustering range from just a single snapshot. Now, when full_compaction happens in process of reading from a clustering range, we can force state refresh with non-nullopt positions of last row and last range tombstone. Note: this inability to test affected only the reversing reader. 
--- test/boost/memtable_test.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/boost/memtable_test.cc b/test/boost/memtable_test.cc index da7e97389f06..8d6299c1ab36 100644 --- a/test/boost/memtable_test.cc +++ b/test/boost/memtable_test.cc @@ -100,6 +100,16 @@ SEASTAR_TEST_CASE(test_memtable_with_many_versions_conforms_to_mutation_source) }; auto cleanup_readers = defer([&] { clear_readers(); }); std::deque ranges_storage; + lw_shared_ptr finished = make_lw_shared(false); + auto full_compaction_in_background = seastar::do_until([finished] {return *finished;}, [] { + // do_refresh_state is called when we detect a new partition snapshot version. + // If snapshot version changes in process of reading mutation fragments from a + // clustering range, the partition_snapshot_reader state is refreshed with saved + // last position of emitted row and range tombstone. full_compaction increases the + // change mark. + logalloc::shard_tracker().full_compaction(); + return seastar::sleep(100us); + }); run_mutation_source_tests([&] (schema_ptr s, const std::vector& muts) { clear_readers(); mt = make_lw_shared(s); @@ -115,6 +125,8 @@ SEASTAR_TEST_CASE(test_memtable_with_many_versions_conforms_to_mutation_source) return mt->as_data_source(); }); + *finished = true; + full_compaction_in_background.get(); }); } From 35b1c3ff521b2027665b5b1292055d7e67bec64c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Radwa=C5=84ski?= Date: Mon, 1 Nov 2021 21:01:14 +0100 Subject: [PATCH 4/5] range_tombstone_list: {lower,upper,}slice share comparator implementation slice (2 overloads), upper_slice, lower_slice previously had implementations of a comparator. Move out the common structs, so that all 4 of them can share implementation. 
--- range_tombstone_list.cc | 84 ++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 52 deletions(-) diff --git a/range_tombstone_list.cc b/range_tombstone_list.cc index ca4ea4b0e67d..ca68701f1630 100644 --- a/range_tombstone_list.cc +++ b/range_tombstone_list.cc @@ -306,59 +306,51 @@ range_tombstone_list::reverter range_tombstone_list::apply_reversibly(const sche return rev; } +namespace { +struct bv_order_by_end { + bound_view::compare less; + bv_order_by_end(const schema& s) : less(s) {} + bool operator()(bound_view v, const range_tombstone_entry& rt) const { return less(v, rt.end_bound()); } + bool operator()(const range_tombstone_entry& rt, bound_view v) const { return less(rt.end_bound(), v); } +}; +struct bv_order_by_start { + bound_view::compare less; + bv_order_by_start(const schema& s) : less(s) {} + bool operator()(bound_view v, const range_tombstone_entry& rt) const { return less(v, rt.start_bound()); } + bool operator()(const range_tombstone_entry& rt, bound_view v) const { return less(rt.start_bound(), v); } +}; + +struct pos_order_by_end { + position_in_partition::less_compare less; + pos_order_by_end(const schema& s) : less(s) {} + bool operator()(position_in_partition_view v, const range_tombstone_entry& rt) const { return less(v, rt.end_position()); } + bool operator()(const range_tombstone_entry& rt, position_in_partition_view v) const { return less(rt.end_position(), v); } +}; +struct pos_order_by_start { + position_in_partition::less_compare less; + pos_order_by_start(const schema& s) : less(s) {} + bool operator()(position_in_partition_view v, const range_tombstone_entry& rt) const { return less(v, rt.position()); } + bool operator()(const range_tombstone_entry& rt, position_in_partition_view v) const { return less(rt.position(), v); } +}; +} // namespace + range_tombstone_list::iterator_range range_tombstone_list::slice(const schema& s, const query::clustering_range& r) const { auto bv_range = 
bound_view::from_range(r); - struct order_by_end { - bound_view::compare less; - order_by_end(const schema& s) : less(s) {} - bool operator()(bound_view v, const range_tombstone_entry& rt) const { return less(v, rt.end_bound()); } - bool operator()(const range_tombstone_entry& rt, bound_view v) const { return less(rt.end_bound(), v); } - }; - struct order_by_start { - bound_view::compare less; - order_by_start(const schema& s) : less(s) {} - bool operator()(bound_view v, const range_tombstone_entry& rt) const { return less(v, rt.start_bound()); } - bool operator()(const range_tombstone_entry& rt, bound_view v) const { return less(rt.start_bound(), v); } - }; return boost::make_iterator_range( - _tombstones.lower_bound(bv_range.first, order_by_end{s}), - _tombstones.upper_bound(bv_range.second, order_by_start{s})); + _tombstones.lower_bound(bv_range.first, bv_order_by_end{s}), + _tombstones.upper_bound(bv_range.second, bv_order_by_start{s})); } range_tombstone_list::iterator_range range_tombstone_list::slice(const schema& s, position_in_partition_view start, position_in_partition_view end) const { - struct order_by_end { - position_in_partition::less_compare less; - order_by_end(const schema& s) : less(s) {} - bool operator()(position_in_partition_view v, const range_tombstone_entry& rt) const { return less(v, rt.end_position()); } - bool operator()(const range_tombstone_entry& rt, position_in_partition_view v) const { return less(rt.end_position(), v); } - }; - struct order_by_start { - position_in_partition::less_compare less; - order_by_start(const schema& s) : less(s) {} - bool operator()(position_in_partition_view v, const range_tombstone_entry& rt) const { return less(v, rt.position()); } - bool operator()(const range_tombstone_entry& rt, position_in_partition_view v) const { return less(rt.position(), v); } - }; return boost::make_iterator_range( - _tombstones.upper_bound(start, order_by_end{s}), // end_position() is exclusive, hence upper_bound() - 
_tombstones.lower_bound(end, order_by_start{s})); + _tombstones.upper_bound(start, pos_order_by_end{s}), // end_position() is exclusive, hence upper_bound() + _tombstones.lower_bound(end, pos_order_by_start{s})); } range_tombstone_list::iterator_range range_tombstone_list::lower_slice(const schema& s, bound_view start, position_in_partition_view before) const { - struct bv_order_by_end { - bound_view::compare less; - bv_order_by_end(const schema& s) : less(s) {} - bool operator()(bound_view v, const range_tombstone_entry& rt) const { return less(v, rt.end_bound()); } - bool operator()(const range_tombstone_entry& rt, bound_view v) const { return less(rt.end_bound(), v); } - }; - struct pos_order_by_end { - position_in_partition::less_compare less; - pos_order_by_end(const schema& s) : less(s) {} - bool operator()(position_in_partition_view v, const range_tombstone_entry& rt) const { return less(v, rt.end_position()); } - bool operator()(const range_tombstone_entry& rt, position_in_partition_view v) const { return less(rt.end_position(), v); } - }; return boost::make_iterator_range( _tombstones.lower_bound(start, bv_order_by_end{s}), _tombstones.lower_bound(before, pos_order_by_end{s})); @@ -366,18 +358,6 @@ range_tombstone_list::lower_slice(const schema& s, bound_view start, position_in range_tombstone_list::iterator_range range_tombstone_list::upper_slice(const schema& s, position_in_partition_view after, bound_view end) const { - struct pos_order_by_start { - position_in_partition::less_compare less; - pos_order_by_start(const schema& s) : less(s) {} - bool operator()(position_in_partition_view v, const range_tombstone_entry& rt) const { return less(v, rt.position()); } - bool operator()(const range_tombstone_entry& rt, position_in_partition_view v) const { return less(rt.position(), v); } - }; - struct bv_order_by_start { - bound_view::compare less; - bv_order_by_start(const schema& s) : less(s) {} - bool operator()(bound_view v, const range_tombstone_entry& rt) 
const { return less(v, rt.start_bound()); } - bool operator()(const range_tombstone_entry& rt, bound_view v) const { return less(rt.start_bound(), v); } - }; return boost::make_iterator_range( _tombstones.upper_bound(after, pos_order_by_start{s}), _tombstones.upper_bound(end, bv_order_by_start{s})); From ee601b7d877dda385d93225812473dba269d9347 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Radwa=C5=84ski?= Date: Tue, 2 Nov 2021 11:17:10 +0100 Subject: [PATCH 5/5] partition_snapshot_reader: fix indentation in fill_buffer --- partition_snapshot_reader.hh | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/partition_snapshot_reader.hh b/partition_snapshot_reader.hh index 4a7be6520438..25431abc7c83 100644 --- a/partition_snapshot_reader.hh +++ b/partition_snapshot_reader.hh @@ -470,18 +470,17 @@ public: } virtual future<> fill_buffer() override { - // FIXME: indentation - return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] { - _reader.with_reserve([&] { - if (!_static_row_done) { - push_static_row(); - on_new_range(); - _static_row_done = true; - } - do_fill_buffer(); + return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] { + _reader.with_reserve([&] { + if (!_static_row_done) { + push_static_row(); + on_new_range(); + _static_row_done = true; + } + do_fill_buffer(); + }); + return make_ready_future<>(); }); - return make_ready_future<>(); - }); } virtual future<> next_partition() override { clear_buffer_to_next_partition();