Skip to content

Commit b22adbb

Browse files
committed
In the scanner, traverse docs by sequence instead of by ID.
Previously, the scanner traversed docs by id, skipping deleted documents. However, some plugins, like the conflict checker or the finder, may want to inspect deleted docs. To fix this, switch to using by-seq order during scanning. This will also let us add more precise checkpoints in the future; we could, for example, checkpoint during the db traversal, not just per-db. To let plugins customize the traversal, modify the db_opened/2 callback so it can return the changes start sequence and the changes folding options. Use this new feature in the finder, since there it makes sense to scan backwards to find the most recently added data first (e.g. someone just added something they shouldn't have to the db and we'd like to find it). Since we now consider deleted documents, adjust the QuickJS scanner to discard deleted FDIs before even opening the doc bodies. For the conflict checker, add a test to ensure we do catch deleted conflicts when all of them are deleted. As a minor tweak, improve the scanner tests by using `#doc{}` records instead of plain tuples.
1 parent bdc7904 commit b22adbb

File tree

4 files changed

+39
-13
lines changed

4 files changed

+39
-13
lines changed

src/couch_quickjs/src/couch_quickjs_scanner_plugin.erl

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
shards/2,
2424
db_opened/2,
2525
doc_id/3,
26+
doc_fdi/3,
2627
doc/3,
2728
db_closing/2
2829
]).
@@ -149,7 +150,7 @@ db_opened(#st{} = St, Db) ->
149150
#st{max_docs = MaxDocs, max_step = MaxStep} = St,
150151
{ok, DocTotal} = couch_db:get_doc_count(Db),
151152
Step = min(MaxStep, max(1, DocTotal div MaxDocs)),
152-
{ok, St#st{doc_cnt = 0, doc_step = Step, docs = []}}.
153+
{0, [], St#st{doc_cnt = 0, doc_step = Step, docs = []}}.
153154

154155
doc_id(#st{} = St, <<?DESIGN_DOC_PREFIX, _/binary>>, _Db) ->
155156
{skip, St};
@@ -162,6 +163,12 @@ doc_id(#st{doc_cnt = C, doc_step = S} = St, _DocId, _Db) when C rem S /= 0 ->
162163
doc_id(#st{doc_cnt = C} = St, _DocId, _Db) ->
163164
{ok, St#st{doc_cnt = C + 1}}.
164165

166+
doc_fdi(#st{} = St, #full_doc_info{deleted = true}, _Db) ->
167+
% Skip deleted; don't even open the doc body
168+
{stop, St};
169+
doc_fdi(#st{} = St, #full_doc_info{}, _Db) ->
170+
{ok, St}.
171+
165172
doc(#st{} = St, Db, #doc{id = DocId} = Doc) ->
166173
#st{sid = SId} = St,
167174
JsonDoc = couch_query_servers:json_doc(Doc),

src/couch_scanner/src/couch_scanner_plugin.erl

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -115,9 +115,14 @@
115115
-callback shards(St :: term(), [#shard{}]) ->
116116
{[#shard{}], St1 :: term()}.
117117

118-
% Optional
118+
% Optional. Called right after a shard file is opened so it gets a Db handle.
119+
% Should return the change feed start sequence and a list of options along with any changes
120+
% in a private context. The change feed start sequence should normally be 0 and the list
121+
% of options can be []. The list of options will be passed directly to couch_db:fold_changes,
122+
% so options such as {dir, Dir} or {end_key, EndSeq} can be used there.
123+
%
119124
-callback db_opened(St :: term(), Db :: term()) ->
120-
{ok, St :: term()}.
125+
{ChangesSeq :: non_neg_integer(), ChangesOpts :: [term()], St1 :: term()}.
121126

122127
% Optional. If doc and doc_fdi are not defined, then doc_id default
123128
% action is {skip, St}. If it is defined, the default action is {ok, St}.
@@ -178,6 +183,8 @@
178183
cursor,
179184
shards_db,
180185
db,
186+
changes_seq = 0,
187+
changes_opts = [],
181188
checkpoint_sec = 0,
182189
start_sec = 0,
183190
skip_dbs,
@@ -370,7 +377,8 @@ scan_docs(#st{} = St, #shard{name = ShardDbName}) ->
370377
try
371378
St2 = St1#st{db = Db},
372379
St3 = db_opened_callback(St2),
373-
{ok, St4} = couch_db:fold_docs(Db, fun scan_docs_fold/2, St3, []),
380+
#st{changes_seq = Seq, changes_opts = Opts} = St3,
381+
{ok, St4} = couch_db:fold_changes(Db, Seq, fun scan_docs_fold/2, St3, Opts),
374382
St5 = db_closing_callback(St4),
375383
erlang:garbage_collect(),
376384
St5#st{db = undefined}
@@ -521,13 +529,13 @@ resume_callback(#{} = Cbks, SId, #{} = EJsonPSt) when is_binary(SId) ->
521529

522530
db_opened_callback(#st{pst = PSt, callbacks = Cbks, db = Db} = St) ->
523531
#{db_opened := DbOpenedCbk} = Cbks,
524-
{ok, PSt1} = DbOpenedCbk(PSt, Db),
525-
St#st{pst = PSt1}.
532+
{Seq, Opts, PSt1} = DbOpenedCbk(PSt, Db),
533+
St#st{pst = PSt1, changes_seq = Seq, changes_opts = Opts}.
526534

527535
db_closing_callback(#st{pst = PSt, callbacks = Cbks, db = Db} = St) ->
528536
#{db_closing := DbClosingCbk} = Cbks,
529537
{ok, PSt1} = DbClosingCbk(PSt, Db),
530-
St#st{pst = PSt1}.
538+
St#st{pst = PSt1, changes_seq = 0, changes_opts = []}.
531539

532540
shards_callback(#st{pst = PSt, callbacks = Cbks} = St, Shards) ->
533541
#{shards := ShardsCbk} = Cbks,
@@ -601,7 +609,7 @@ default_shards(Mod, _F, _A) when is_atom(Mod) ->
601609
end.
602610

603611
default_db_opened(Mod, _F, _A) when is_atom(Mod) ->
604-
fun(St, _Db) -> {ok, St} end.
612+
fun(St, _Db) -> {0, [], St} end.
605613

606614
default_doc_id(Mod, _F, _A) when is_atom(Mod) ->
607615
case is_exported(Mod, doc, 3) orelse is_exported(Mod, doc_fdi, 3) of

src/couch_scanner/src/couch_scanner_plugin_find.erl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,9 @@ db_opened(#st{sid = SId} = St, Db) ->
8888
true -> ?DEBUG("", [], #{sid => SId, db => Db});
8989
false -> ok
9090
end,
91-
{ok, St}.
91+
% Search backwards with the idea that we may be looking for some recent
92+
% changes we just made to the database.
93+
{couch_db:get_update_seq(Db), [{dir, rev}], St}.
9294

9395
doc_id(#st{} = St, DocId, Db) ->
9496
#st{sid = SId, compiled_regexes = Pats} = St,

src/couch_scanner/test/eunit/couch_scanner_test.erl

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,8 @@ setup() ->
8585
ok = add_doc(DbName2, ?DOC3, #{foo3 => bax}),
8686
ok = add_doc(DbName2, ?DOC4, #{foo4 => baw, <<>> => this_is_ok_apparently}),
8787
add_docs(DbName3, [
88-
{doc, ?DOC5, {2, [<<"x">>, <<"z">>]}, {[]}, [], false, []},
89-
{doc, ?DOC5, {2, [<<"y">>, <<"z">>]}, {[]}, [], false, []}
88+
#doc{id = ?DOC5, revs = {2, [<<"x">>, <<"z">>]}, deleted = false},
89+
#doc{id = ?DOC5, revs = {2, [<<"y">>, <<"z">>]}, deleted = false}
9090
]),
9191
couch_scanner:reset_checkpoints(),
9292
{Ctx, {DbName1, DbName2, DbName3}}.
@@ -204,11 +204,20 @@ t_conflict_finder_works({_, {_, _, DbName3}}) ->
204204
% Add a deleted conflicting doc to the third database.
205205
% 3 reports are expected: 2 doc reports and 1 db report.
206206
add_docs(DbName3, [
207-
{doc, ?DOC6, {2, [<<"x">>, <<"z">>]}, {[]}, [], false, []},
208-
{doc, ?DOC6, {2, [<<"d">>, <<"z">>]}, {[]}, [], true, []}
207+
#doc{id = ?DOC6, revs = {2, [<<"x">>, <<"z">>]}, deleted = false},
208+
#doc{id = ?DOC6, revs = {2, [<<"d">>, <<"z">>]}, deleted = true}
209209
]),
210210
resume_couch_scanner(Plugin),
211211
?assertEqual(3, meck:num_calls(couch_scanner_util, log, LogArgs)),
212+
% Should work even if all revs are deleted (the whole FDI is deleted)
213+
add_docs(DbName3, [
214+
#doc{id = ?DOC6, revs = {3, [<<"a">>, <<"x">>, <<"z">>]}, deleted = true}
215+
]),
216+
% Confirm it's deleted (we did the revs paths manipulations correctly)
217+
?assertEqual({not_found, deleted}, fabric:open_doc(DbName3, ?DOC6, [])),
218+
% But we can still find the conflicts
219+
resume_couch_scanner(Plugin),
220+
?assertEqual(3, meck:num_calls(couch_scanner_util, log, LogArgs)),
212221
% Set doc_report to false to only have 1 db report.
213222
config:set(Plugin, "doc_report", "false", false),
214223
resume_couch_scanner(Plugin),

0 commit comments

Comments
 (0)