Skip to content
This repository was archived by the owner on Apr 4, 2023. It is now read-only.

Commit 249e051

Browse files
bors[bot]loiclec
andauthored
Merge #750
750: Fix hard-deletion of an external id that was soft-deleted and then reimported - main r=irevoire a=loiclec # Pull Request ## Related issue Fixes (when merged into meilisearch) meilisearch/meilisearch#3021 ## What does this PR do? There was a bug happening when: 1. Documents were added 2. Some of these documents were replaced using soft-deletion 3. A deletion of another non-replaced document takes place and triggers a hard-deletion 4. Documents with the same identifiers as the replaced documents are added again Then, search results would return duplicate documents. No crash would happen at any time (this is the reason it wasn't caught by the previous fuzz test. I have updated the new one such that it also checks the result of a placeholder search request, which then finds the bug immediately). The cause of the bug is: 1. When a hard-deletion is triggered, we try to retrieve the external document id associated with each soft-deleted document id. 2. Then, we take this list of external document ids and remove each of them from the `ExternalDocumentsIds` structure. 3. However, this is not correct in case an existing (non-deleted) document shares the external id of a soft-deleted document. ## Implementation of the fix 1. Before we process a permanent deletion, we update the list of soft-deleted document ids. 2. Then, the permanent deletion's job is to remove the soft-deleted documents from all data structures. Therefore, to update `ExternalDocumentsIds`, we can simply call the `delete_soft_deleted_documents_ids_from_fsts` method, which is faster and simpler. ## Correctness A unit test was added to reproduce the bug. The new fuzz test, when adjusted to check the correctness of a placeholder search, could also instantly reproduce the bug, but now does not find any other problem. Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
2 parents 97fb64e + fc0e738 commit 249e051

File tree

3 files changed

+116
-154
lines changed

3 files changed

+116
-154
lines changed

milli/src/external_documents_ids.rs

Lines changed: 0 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -47,30 +47,6 @@ impl<'a> ExternalDocumentsIds<'a> {
4747
}
4848
}
4949

50-
pub fn delete_ids<A: AsRef<[u8]>>(&mut self, other: fst::Set<A>) -> fst::Result<()> {
51-
let other = fst::Map::from(other.into_fst());
52-
let union_op = self.soft.op().add(&other).r#union();
53-
54-
let mut iter = union_op.into_stream();
55-
let mut new_soft_builder = fst::MapBuilder::memory();
56-
while let Some((external_id, docids)) = iter.next() {
57-
if docids.iter().any(|v| v.index == 1) {
58-
// If the `other` set returns a value here it means
59-
// that it must be marked as deleted.
60-
new_soft_builder.insert(external_id, DELETED_ID)?;
61-
} else {
62-
let value = docids.iter().find(|v| v.index == 0).unwrap().value;
63-
new_soft_builder.insert(external_id, value)?;
64-
}
65-
}
66-
67-
drop(iter);
68-
69-
// We save this new map as the new soft map.
70-
self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
71-
self.merge_soft_into_hard()
72-
}
73-
7450
/// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
7551
/// don't contain any soft deleted document id.
7652
pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
@@ -173,76 +149,3 @@ impl Default for ExternalDocumentsIds<'static> {
173149
fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
174150
indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value)
175151
}
176-
177-
#[cfg(test)]
178-
mod tests {
179-
use super::*;
180-
181-
#[test]
182-
fn simple_insert_delete_ids() {
183-
let mut external_documents_ids = ExternalDocumentsIds::default();
184-
185-
let new_ids = fst::Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3), ("d", 4)]).unwrap();
186-
external_documents_ids.insert_ids(&new_ids).unwrap();
187-
188-
assert_eq!(external_documents_ids.get("a"), Some(1));
189-
assert_eq!(external_documents_ids.get("b"), Some(2));
190-
assert_eq!(external_documents_ids.get("c"), Some(3));
191-
assert_eq!(external_documents_ids.get("d"), Some(4));
192-
193-
let new_ids = fst::Map::from_iter(vec![("e", 5), ("f", 6), ("g", 7)]).unwrap();
194-
external_documents_ids.insert_ids(&new_ids).unwrap();
195-
196-
assert_eq!(external_documents_ids.get("a"), Some(1));
197-
assert_eq!(external_documents_ids.get("b"), Some(2));
198-
assert_eq!(external_documents_ids.get("c"), Some(3));
199-
assert_eq!(external_documents_ids.get("d"), Some(4));
200-
assert_eq!(external_documents_ids.get("e"), Some(5));
201-
assert_eq!(external_documents_ids.get("f"), Some(6));
202-
assert_eq!(external_documents_ids.get("g"), Some(7));
203-
204-
let del_ids = fst::Set::from_iter(vec!["a", "c", "f"]).unwrap();
205-
external_documents_ids.delete_ids(del_ids).unwrap();
206-
207-
assert_eq!(external_documents_ids.get("a"), None);
208-
assert_eq!(external_documents_ids.get("b"), Some(2));
209-
assert_eq!(external_documents_ids.get("c"), None);
210-
assert_eq!(external_documents_ids.get("d"), Some(4));
211-
assert_eq!(external_documents_ids.get("e"), Some(5));
212-
assert_eq!(external_documents_ids.get("f"), None);
213-
assert_eq!(external_documents_ids.get("g"), Some(7));
214-
215-
let new_ids = fst::Map::from_iter(vec![("a", 5), ("b", 6), ("h", 8)]).unwrap();
216-
external_documents_ids.insert_ids(&new_ids).unwrap();
217-
218-
assert_eq!(external_documents_ids.get("a"), Some(5));
219-
assert_eq!(external_documents_ids.get("b"), Some(6));
220-
assert_eq!(external_documents_ids.get("c"), None);
221-
assert_eq!(external_documents_ids.get("d"), Some(4));
222-
assert_eq!(external_documents_ids.get("e"), Some(5));
223-
assert_eq!(external_documents_ids.get("f"), None);
224-
assert_eq!(external_documents_ids.get("g"), Some(7));
225-
assert_eq!(external_documents_ids.get("h"), Some(8));
226-
}
227-
228-
#[test]
229-
fn strange_delete_insert_ids() {
230-
let mut external_documents_ids = ExternalDocumentsIds::default();
231-
232-
let new_ids =
233-
fst::Map::from_iter(vec![("1", 0), ("123", 1), ("30", 2), ("456", 3)]).unwrap();
234-
external_documents_ids.insert_ids(&new_ids).unwrap();
235-
assert_eq!(external_documents_ids.get("1"), Some(0));
236-
assert_eq!(external_documents_ids.get("123"), Some(1));
237-
assert_eq!(external_documents_ids.get("30"), Some(2));
238-
assert_eq!(external_documents_ids.get("456"), Some(3));
239-
240-
let deleted_ids = fst::Set::from_iter(vec!["30"]).unwrap();
241-
external_documents_ids.delete_ids(deleted_ids).unwrap();
242-
assert_eq!(external_documents_ids.get("30"), None);
243-
244-
let new_ids = fst::Map::from_iter(vec![("30", 2)]).unwrap();
245-
external_documents_ids.insert_ids(&new_ids).unwrap();
246-
assert_eq!(external_documents_ids.get("30"), Some(2));
247-
}
248-
}

milli/src/index.rs

Lines changed: 95 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1181,6 +1181,7 @@ impl Index {
11811181

11821182
#[cfg(test)]
11831183
pub(crate) mod tests {
1184+
use std::collections::HashSet;
11841185
use std::ops::Deref;
11851186

11861187
use big_s::S;
@@ -1195,7 +1196,7 @@ pub(crate) mod tests {
11951196
self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig,
11961197
IndexDocumentsMethod, IndexerConfig, Settings,
11971198
};
1198-
use crate::{db_snap, obkv_to_json, Index};
1199+
use crate::{db_snap, obkv_to_json, Index, Search, SearchResult};
11991200

12001201
pub(crate) struct TempIndex {
12011202
pub inner: Index,
@@ -2188,4 +2189,97 @@ pub(crate) mod tests {
21882189
"###);
21892190
db_snap!(index, soft_deleted_documents_ids, 2, @"[]");
21902191
}
2192+
2193+
#[test]
2194+
fn bug_3021_fourth() {
2195+
// https://github.com/meilisearch/meilisearch/issues/3021
2196+
let mut index = TempIndex::new();
2197+
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
2198+
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
2199+
2200+
index
2201+
.update_settings(|settings| {
2202+
settings.set_primary_key("primary_key".to_owned());
2203+
})
2204+
.unwrap();
2205+
2206+
index
2207+
.add_documents(documents!([
2208+
{ "primary_key": 11 },
2209+
{ "primary_key": 4 },
2210+
]))
2211+
.unwrap();
2212+
2213+
db_snap!(index, documents_ids, @"[0, 1, ]");
2214+
db_snap!(index, external_documents_ids, @r###"
2215+
soft:
2216+
hard:
2217+
11 0
2218+
4 1
2219+
"###);
2220+
db_snap!(index, soft_deleted_documents_ids, @"[]");
2221+
2222+
index
2223+
.add_documents(documents!([
2224+
{ "primary_key": 4, "a": 0 },
2225+
{ "primary_key": 1 },
2226+
]))
2227+
.unwrap();
2228+
2229+
db_snap!(index, documents_ids, @"[0, 2, 3, ]");
2230+
db_snap!(index, external_documents_ids, @r###"
2231+
soft:
2232+
hard:
2233+
1 3
2234+
11 0
2235+
4 2
2236+
"###);
2237+
db_snap!(index, soft_deleted_documents_ids, @"[1, ]");
2238+
2239+
let mut wtxn = index.write_txn().unwrap();
2240+
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
2241+
delete.strategy(DeletionStrategy::AlwaysHard);
2242+
delete.execute().unwrap();
2243+
wtxn.commit().unwrap();
2244+
2245+
db_snap!(index, documents_ids, @"[0, 2, 3, ]");
2246+
db_snap!(index, external_documents_ids, @r###"
2247+
soft:
2248+
hard:
2249+
1 3
2250+
11 0
2251+
4 2
2252+
"###);
2253+
db_snap!(index, soft_deleted_documents_ids, @"[]");
2254+
2255+
index
2256+
.add_documents(documents!([
2257+
{ "primary_key": 4, "a": 1 },
2258+
{ "primary_key": 1, "a": 0 },
2259+
]))
2260+
.unwrap();
2261+
2262+
db_snap!(index, documents_ids, @"[0, 1, 4, ]");
2263+
db_snap!(index, external_documents_ids, @r###"
2264+
soft:
2265+
hard:
2266+
1 4
2267+
11 0
2268+
4 1
2269+
"###);
2270+
db_snap!(index, soft_deleted_documents_ids, @"[2, 3, ]");
2271+
2272+
let rtxn = index.read_txn().unwrap();
2273+
let search = Search::new(&rtxn, &index);
2274+
let SearchResult { matching_words: _, candidates: _, mut documents_ids } =
2275+
search.execute().unwrap();
2276+
let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap();
2277+
documents_ids.sort_unstable();
2278+
let docs = index.documents(&rtxn, documents_ids).unwrap();
2279+
let mut all_ids = HashSet::new();
2280+
for (_docid, obkv) in docs {
2281+
let id = obkv.get(primary_key_id).unwrap();
2282+
assert!(all_ids.insert(id));
2283+
}
2284+
}
21912285
}

milli/src/update/delete_documents.rs

Lines changed: 21 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,14 @@ use heed::types::{ByteSlice, DecodeIgnore, Str};
66
use heed::Database;
77
use roaring::RoaringBitmap;
88
use serde::{Deserialize, Serialize};
9-
use serde_json::Value;
109
use time::OffsetDateTime;
1110

1211
use super::facet::delete::FacetsDelete;
1312
use super::ClearDocuments;
14-
use crate::error::{InternalError, UserError};
13+
use crate::error::InternalError;
1514
use crate::facet::FacetType;
1615
use crate::heed_codec::facet::FieldDocIdFacetCodec;
1716
use crate::heed_codec::CboRoaringBitmapCodec;
18-
use crate::index::{db_name, main_key};
1917
use crate::{
2018
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec,
2119
SmallString32, BEU32,
@@ -186,6 +184,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
186184

187185
soft_deleted_docids |= &self.to_delete_docids;
188186

187+
// We always soft-delete the documents, even if they will be permanently
188+
// deleted immediately after.
189+
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
190+
189191
// decide for a hard or soft deletion depending on the strategy
190192
let soft_deletion = match self.strategy {
191193
DeletionStrategy::Dynamic => {
@@ -214,31 +216,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
214216

215217
if soft_deletion {
216218
// Keep the soft-deleted in the DB
217-
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
218219
return Ok(DetailedDocumentDeletionResult {
219220
deleted_documents: self.to_delete_docids.len(),
220221
remaining_documents: documents_ids.len(),
221222
soft_deletion_used: true,
222223
});
223224
}
224225

225-
// Erase soft-deleted from DB
226226
self.to_delete_docids = soft_deleted_docids;
227-
// and we can reset the soft deleted bitmap
228-
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
229-
230-
let primary_key =
231-
self.index.primary_key(self.wtxn)?.ok_or(InternalError::DatabaseMissingEntry {
232-
db_name: db_name::MAIN,
233-
key: Some(main_key::PRIMARY_KEY_KEY),
234-
})?;
235-
236-
// Since we already checked if the DB was empty, if we can't find the primary key, then
237-
// something is wrong, and we must return an error.
238-
let id_field = match fields_ids_map.id(primary_key) {
239-
Some(field) => field,
240-
None => return Err(UserError::MissingPrimaryKey.into()),
241-
};
242227

243228
let Index {
244229
env: _env,
@@ -262,33 +247,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
262247
documents,
263248
} = self.index;
264249

265-
// Retrieve the words and the external documents ids contained in the documents.
250+
// Retrieve the words contained in the documents.
266251
let mut words = Vec::new();
267-
let mut external_ids = Vec::new();
268252
for docid in &self.to_delete_docids {
269-
// We create an iterator to be able to get the content and delete the document
270-
// content itself. It's faster to acquire a cursor to get and delete,
271-
// as we avoid traversing the LMDB B-Tree two times but only once.
272-
let key = BEU32::new(docid);
273-
let mut iter = documents.range_mut(self.wtxn, &(key..=key))?;
274-
if let Some((_key, obkv)) = iter.next().transpose()? {
275-
if let Some(content) = obkv.get(id_field) {
276-
let external_id = match serde_json::from_slice(content).unwrap() {
277-
Value::String(string) => SmallString32::from(string.as_str()),
278-
Value::Number(number) => SmallString32::from(number.to_string()),
279-
document_id => {
280-
return Err(UserError::InvalidDocumentId { document_id }.into())
281-
}
282-
};
283-
external_ids.push(external_id);
284-
}
285-
// safety: we don't keep references from inside the LMDB database.
286-
unsafe { iter.del_current()? };
287-
}
288-
drop(iter);
253+
documents.delete(self.wtxn, &BEU32::new(docid))?;
289254

290-
// We iterate through the words positions of the document id,
291-
// retrieve the word and delete the positions.
255+
// We iterate through the words positions of the document id, retrieve the word and delete the positions.
256+
// We create an iterator to be able to get the content and delete the key-value itself.
257+
// It's faster to acquire a cursor to get and delete, as we avoid traversing the LMDB B-Tree two times but only once.
292258
let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?;
293259
while let Some(result) = iter.next() {
294260
let ((_docid, word), _positions) = result?;
@@ -298,17 +264,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
298264
unsafe { iter.del_current()? };
299265
}
300266
}
301-
302-
// We create the FST map of the external ids that we must delete.
303-
external_ids.sort_unstable();
304-
let external_ids_to_delete = fst::Set::from_iter(external_ids)?;
305-
306267
// We acquire the current external documents ids map...
268+
// Note that its soft-deleted document ids field will be equal to the `to_delete_docids`
307269
let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
308-
// ...and remove the to-delete external ids.
309-
new_external_documents_ids.delete_ids(external_ids_to_delete)?;
310-
311-
// We write the new external ids into the main database.
270+
// We then remove the soft-deleted docids from it
271+
new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
272+
// and write it back to the main database.
312273
let new_external_documents_ids = new_external_documents_ids.into_static();
313274
self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
314275

@@ -545,6 +506,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
545506
&self.to_delete_docids,
546507
)?;
547508

509+
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
510+
548511
Ok(DetailedDocumentDeletionResult {
549512
deleted_documents: self.to_delete_docids.len(),
550513
remaining_documents: documents_ids.len(),
@@ -1125,14 +1088,16 @@ mod tests {
11251088
id
11261089
);
11271090
}
1091+
wtxn.commit().unwrap();
1092+
1093+
let rtxn = index.read_txn().unwrap();
11281094

11291095
// get internal docids from deleted external document ids
1130-
let results = index.external_documents_ids(&wtxn).unwrap();
1096+
let results = index.external_documents_ids(&rtxn).unwrap();
11311097
for id in deleted_external_ids {
11321098
assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id);
11331099
}
1134-
1135-
wtxn.commit().unwrap();
1100+
drop(rtxn);
11361101

11371102
db_snap!(index, soft_deleted_documents_ids, deletion_strategy);
11381103
}

0 commit comments

Comments
 (0)