Skip to content
This repository was archived by the owner on Apr 4, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 0 additions & 97 deletions milli/src/external_documents_ids.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,30 +47,6 @@ impl<'a> ExternalDocumentsIds<'a> {
}
}

pub fn delete_ids<A: AsRef<[u8]>>(&mut self, other: fst::Set<A>) -> fst::Result<()> {
let other = fst::Map::from(other.into_fst());
let union_op = self.soft.op().add(&other).r#union();

let mut iter = union_op.into_stream();
let mut new_soft_builder = fst::MapBuilder::memory();
while let Some((external_id, docids)) = iter.next() {
if docids.iter().any(|v| v.index == 1) {
// If the `other` set returns a value here it means
// that it must be marked as deleted.
new_soft_builder.insert(external_id, DELETED_ID)?;
} else {
let value = docids.iter().find(|v| v.index == 0).unwrap().value;
new_soft_builder.insert(external_id, value)?;
}
}

drop(iter);

// We save this new map as the new soft map.
self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
self.merge_soft_into_hard()
}

/// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
/// don't contain any soft deleted document id.
pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
Expand Down Expand Up @@ -173,76 +149,3 @@ impl Default for ExternalDocumentsIds<'static> {
fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value)
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn simple_insert_delete_ids() {
let mut external_documents_ids = ExternalDocumentsIds::default();

let new_ids = fst::Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3), ("d", 4)]).unwrap();
external_documents_ids.insert_ids(&new_ids).unwrap();

assert_eq!(external_documents_ids.get("a"), Some(1));
assert_eq!(external_documents_ids.get("b"), Some(2));
assert_eq!(external_documents_ids.get("c"), Some(3));
assert_eq!(external_documents_ids.get("d"), Some(4));

let new_ids = fst::Map::from_iter(vec![("e", 5), ("f", 6), ("g", 7)]).unwrap();
external_documents_ids.insert_ids(&new_ids).unwrap();

assert_eq!(external_documents_ids.get("a"), Some(1));
assert_eq!(external_documents_ids.get("b"), Some(2));
assert_eq!(external_documents_ids.get("c"), Some(3));
assert_eq!(external_documents_ids.get("d"), Some(4));
assert_eq!(external_documents_ids.get("e"), Some(5));
assert_eq!(external_documents_ids.get("f"), Some(6));
assert_eq!(external_documents_ids.get("g"), Some(7));

let del_ids = fst::Set::from_iter(vec!["a", "c", "f"]).unwrap();
external_documents_ids.delete_ids(del_ids).unwrap();

assert_eq!(external_documents_ids.get("a"), None);
assert_eq!(external_documents_ids.get("b"), Some(2));
assert_eq!(external_documents_ids.get("c"), None);
assert_eq!(external_documents_ids.get("d"), Some(4));
assert_eq!(external_documents_ids.get("e"), Some(5));
assert_eq!(external_documents_ids.get("f"), None);
assert_eq!(external_documents_ids.get("g"), Some(7));

let new_ids = fst::Map::from_iter(vec![("a", 5), ("b", 6), ("h", 8)]).unwrap();
external_documents_ids.insert_ids(&new_ids).unwrap();

assert_eq!(external_documents_ids.get("a"), Some(5));
assert_eq!(external_documents_ids.get("b"), Some(6));
assert_eq!(external_documents_ids.get("c"), None);
assert_eq!(external_documents_ids.get("d"), Some(4));
assert_eq!(external_documents_ids.get("e"), Some(5));
assert_eq!(external_documents_ids.get("f"), None);
assert_eq!(external_documents_ids.get("g"), Some(7));
assert_eq!(external_documents_ids.get("h"), Some(8));
}

#[test]
fn strange_delete_insert_ids() {
let mut external_documents_ids = ExternalDocumentsIds::default();

let new_ids =
fst::Map::from_iter(vec![("1", 0), ("123", 1), ("30", 2), ("456", 3)]).unwrap();
external_documents_ids.insert_ids(&new_ids).unwrap();
assert_eq!(external_documents_ids.get("1"), Some(0));
assert_eq!(external_documents_ids.get("123"), Some(1));
assert_eq!(external_documents_ids.get("30"), Some(2));
assert_eq!(external_documents_ids.get("456"), Some(3));

let deleted_ids = fst::Set::from_iter(vec!["30"]).unwrap();
external_documents_ids.delete_ids(deleted_ids).unwrap();
assert_eq!(external_documents_ids.get("30"), None);

let new_ids = fst::Map::from_iter(vec![("30", 2)]).unwrap();
external_documents_ids.insert_ids(&new_ids).unwrap();
assert_eq!(external_documents_ids.get("30"), Some(2));
}
}
95 changes: 94 additions & 1 deletion milli/src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1181,6 +1181,7 @@ impl Index {

#[cfg(test)]
pub(crate) mod tests {
use std::collections::HashSet;
use std::ops::Deref;

use big_s::S;
Expand All @@ -1195,7 +1196,7 @@ pub(crate) mod tests {
self, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
IndexerConfig, Settings,
};
use crate::{db_snap, obkv_to_json, Index};
use crate::{db_snap, obkv_to_json, Index, Search, SearchResult};

pub(crate) struct TempIndex {
pub inner: Index,
Expand Down Expand Up @@ -2187,4 +2188,96 @@ pub(crate) mod tests {
"###);
db_snap!(index, soft_deleted_documents_ids, 2, @"[]");
}

#[test]
fn bug_3021_fourth() {
// https://github.com/meilisearch/meilisearch/issues/3021
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

index
.update_settings(|settings| {
settings.set_primary_key("primary_key".to_owned());
})
.unwrap();

index
.add_documents(documents!([
{ "primary_key": 11 },
{ "primary_key": 4 },
]))
.unwrap();

db_snap!(index, documents_ids, @"[0, 1, ]");
db_snap!(index, external_documents_ids, @r###"
soft:
hard:
11 0
4 1
"###);
db_snap!(index, soft_deleted_documents_ids, @"[]");

index
.add_documents(documents!([
{ "primary_key": 4, "a": 0 },
{ "primary_key": 1 },
]))
.unwrap();

db_snap!(index, documents_ids, @"[0, 2, 3, ]");
db_snap!(index, external_documents_ids, @r###"
soft:
hard:
1 3
11 0
4 2
"###);
db_snap!(index, soft_deleted_documents_ids, @"[1, ]");

let mut wtxn = index.write_txn().unwrap();
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
delete.disable_soft_deletion(true);
delete.execute().unwrap();
wtxn.commit().unwrap();

db_snap!(index, documents_ids, @"[0, 2, 3, ]");
db_snap!(index, external_documents_ids, @r###"
soft:
hard:
1 3
11 0
4 2
"###);
db_snap!(index, soft_deleted_documents_ids, @"[]");

index
.add_documents(documents!([
{ "primary_key": 4, "a": 1 },
{ "primary_key": 1, "a": 0 },
]))
.unwrap();

db_snap!(index, documents_ids, @"[0, 1, 4, ]");
db_snap!(index, external_documents_ids, @r###"
soft:
hard:
1 4
11 0
4 1
"###);
db_snap!(index, soft_deleted_documents_ids, @"[2, 3, ]");

let rtxn = index.read_txn().unwrap();
let search = Search::new(&rtxn, &index);
let SearchResult { matching_words: _, candidates: _, mut documents_ids } =
search.execute().unwrap();
let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap();
documents_ids.sort_unstable();
let docs = index.documents(&rtxn, documents_ids).unwrap();
let mut all_ids = HashSet::new();
for (_docid, obkv) in docs {
let id = obkv.get(primary_key_id).unwrap();
assert!(all_ids.insert(id));
}
}
}
78 changes: 21 additions & 57 deletions milli/src/update/delete_documents.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,14 @@ use heed::types::{ByteSlice, DecodeIgnore, Str};
use heed::Database;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use time::OffsetDateTime;

use super::facet::delete::FacetsDelete;
use super::ClearDocuments;
use crate::error::{InternalError, UserError};
use crate::error::InternalError;
use crate::facet::FacetType;
use crate::heed_codec::facet::FieldDocIdFacetCodec;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::index::{db_name, main_key};
use crate::{
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec,
SmallString32, BEU32,
Expand Down Expand Up @@ -169,6 +167,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
let percentage_used_by_soft_deleted_documents =
estimated_size_used_by_soft_deleted * 100 / map_size;

// We always soft-delete the documents, even if they will be permanently
// deleted immediately after.
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;

// if we have more than 10% of disk space available and the soft deleted
// documents uses less than 10% of the total space available,
// we skip the deletion. Eg.
Expand All @@ -183,31 +185,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
&& percentage_available > 10
&& percentage_used_by_soft_deleted_documents < 10
{
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
return Ok(DetailedDocumentDeletionResult {
deleted_documents: self.to_delete_docids.len(),
remaining_documents: documents_ids.len(),
soft_deletion_used: true,
});
}

// There is more than documents to delete than the threshold we needs to delete them all
self.to_delete_docids = soft_deleted_docids;
// and we can reset the soft deleted bitmap
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;

let primary_key =
self.index.primary_key(self.wtxn)?.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::MAIN,
key: Some(main_key::PRIMARY_KEY_KEY),
})?;

// Since we already checked if the DB was empty, if we can't find the primary key, then
// something is wrong, and we must return an error.
let id_field = match fields_ids_map.id(primary_key) {
Some(field) => field,
None => return Err(UserError::MissingPrimaryKey.into()),
};

let Index {
env: _env,
Expand All @@ -231,33 +216,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
documents,
} = self.index;

// Retrieve the words and the external documents ids contained in the documents.
// Retrieve the words contained in the documents.
let mut words = Vec::new();
let mut external_ids = Vec::new();
for docid in &self.to_delete_docids {
// We create an iterator to be able to get the content and delete the document
// content itself. It's faster to acquire a cursor to get and delete,
// as we avoid traversing the LMDB B-Tree two times but only once.
let key = BEU32::new(docid);
let mut iter = documents.range_mut(self.wtxn, &(key..=key))?;
if let Some((_key, obkv)) = iter.next().transpose()? {
if let Some(content) = obkv.get(id_field) {
let external_id = match serde_json::from_slice(content).unwrap() {
Value::String(string) => SmallString32::from(string.as_str()),
Value::Number(number) => SmallString32::from(number.to_string()),
document_id => {
return Err(UserError::InvalidDocumentId { document_id }.into())
}
};
external_ids.push(external_id);
}
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? };
}
drop(iter);
documents.delete(self.wtxn, &BEU32::new(docid))?;

// We iterate through the words positions of the document id,
// retrieve the word and delete the positions.
// We iterate through the words positions of the document id, retrieve the word and delete the positions.
// We create an iterator to be able to get the content and delete the key-value itself.
// It's faster to acquire a cursor to get and delete, as we avoid traversing the LMDB B-Tree two times but only once.
let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?;
while let Some(result) = iter.next() {
let ((_docid, word), _positions) = result?;
Expand All @@ -267,17 +233,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
unsafe { iter.del_current()? };
}
}

// We create the FST map of the external ids that we must delete.
external_ids.sort_unstable();
let external_ids_to_delete = fst::Set::from_iter(external_ids)?;

// We acquire the current external documents ids map...
// Note that its soft-deleted document ids field will be equal to the `to_delete_docids`
let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
// ...and remove the to-delete external ids.
new_external_documents_ids.delete_ids(external_ids_to_delete)?;

// We write the new external ids into the main database.
// We then remove the soft-deleted docids from it
new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
// and write it back to the main database.
let new_external_documents_ids = new_external_documents_ids.into_static();
self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;

Expand Down Expand Up @@ -514,6 +475,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
&self.to_delete_docids,
)?;

self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;

Ok(DetailedDocumentDeletionResult {
deleted_documents: self.to_delete_docids.len(),
remaining_documents: documents_ids.len(),
Expand Down Expand Up @@ -1085,15 +1048,16 @@ mod tests {
id
);
}
wtxn.commit().unwrap();

let rtxn = index.read_txn().unwrap();

// get internal docids from deleted external document ids
let results = index.external_documents_ids(&wtxn).unwrap();
let results = index.external_documents_ids(&rtxn).unwrap();
for id in deleted_external_ids {
assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id);
}

wtxn.commit().unwrap();

drop(rtxn);
db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion);
}

Expand Down