Skip to content
This repository was archived by the owner on Apr 4, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
c3f49f7
Prepare refactor of facets database
Aug 29, 2022
7913d63
Update Facets indexing to be compatible with new database structure
Aug 30, 2022
63ef0ab
Start porting facet distribution and sort to new database structure
Aug 30, 2022
b8a1caa
Add range search and incremental indexing algorithm
Aug 30, 2022
5a904cf
Reintroduce facet distribution functionality
Aug 31, 2022
6cc9182
Remove unused heed codec files
Aug 31, 2022
22d80ee
Reintroduce facet deletion functionality
Aug 31, 2022
39a4a0a
Reintroduce filter range search and facet extractors
Aug 31, 2022
bd2c0e1
Remove unused code
Aug 31, 2022
e570c23
Reintroduce asc/desc functionality
Aug 31, 2022
fb8d23d
Reintroduce db_snap! for facet databases
Aug 31, 2022
e8a156d
Reorganise facets database indexing code
Aug 31, 2022
d30c89e
Fix compile error+warnings in new tests
Aug 31, 2022
85824ee
Try to make facet indexing incremental
Sep 1, 2022
68cbcdf
Fix compile errors/warnings in http-ui and infos
Sep 1, 2022
6125224
Fix some facet indexing bugs
Sep 1, 2022
07ff92c
Add more snapshots from facet tests
Sep 1, 2022
36296bb
Add facet incremental indexing snapshot tests + fix bug
Sep 1, 2022
a7201ec
cargo fmt
Sep 1, 2022
afdf87f
Fix bugs in asc/desc criterion and facet indexing
Sep 1, 2022
079ed4a
Add more snapshots
Sep 1, 2022
982efab
Fix encoding bugs in facet databases
Sep 5, 2022
3d145d7
Merge the two <facetttype>_faceted_documents_ids methods into one
Sep 5, 2022
9b55e58
Add FacetsUpdate type that wraps incremental and bulk indexing methods
Sep 5, 2022
485a723
Refactor facet-related codecs
Sep 5, 2022
330c9eb
Rename facet codecs and refine FacetsUpdate API
Sep 5, 2022
9026867
Give same interface to bulk and incremental facet indexing types
Sep 5, 2022
b2f01ad
Refactor facet database tests
Sep 6, 2022
bee3c23
Add comparison benchmark between bulk and incremental facet indexing
Sep 6, 2022
27454e9
Document and refine facet indexing algorithms
Sep 7, 2022
fca4577
Return original string in facet distributions, work on facet tests
Sep 7, 2022
3d7ed32
Fix bug in string facet distribution with few candidates
Sep 7, 2022
b1ab091
Remove outdated TODOs
Sep 7, 2022
985a94a
cargo fmt
Sep 7, 2022
de52a9b
Improve documentation of some facet-related algorithms
Sep 8, 2022
86d9f50
Fix bugs in incremental facet indexing with variable parameters
Sep 8, 2022
3baa34d
Fix compiler errors/warnings
Sep 8, 2022
cb8442a
Further unify facet databases of f64s and strings
Sep 8, 2022
51961e1
Polish some details
Sep 8, 2022
1ecd3bb
Fix bug in FieldDocIdFacetCodec
Sep 21, 2022
a2270b7
Change fuzzcheck dependency to point to git repository
Sep 21, 2022
d010962
Fix a bug in facet_range_search and add documentation
Sep 21, 2022
0ade699
Don't crash when failing to decode using StrRef codec
Sep 21, 2022
1165ba2
Make facet deletion incremental
Sep 21, 2022
a034a1e
Move StrRefCodec and ByteSliceRefCodec to their own files
loiclec Oct 12, 2022
acc8cae
Add link to GitHub PR to document of update/facet module
loiclec Oct 12, 2022
2295e0e
Use real delete function in facet indexing fuzz tests
loiclec Oct 12, 2022
ee1abfd
Ignore files generated by fuzzcheck
loiclec Oct 12, 2022
d885de1
Add option to avoid soft deletion of documents
Sep 21, 2022
ab5e56f
Add document deletion snapshot tests and tests for hard-deletion
Aug 25, 2022
e3ba1fc
Make deletion tests for both soft-deletion and hard-deletion
Sep 22, 2022
f198b20
Add facet deletion tests that use both the incremental and bulk methods
loiclec Oct 12, 2022
206a3e0
cargo fmt
loiclec Oct 12, 2022
14ca804
Add some documentation on how to run the facet db fuzzer
loiclec Oct 17, 2022
3b1f908
Revert behaviour of facet distribution to what it was before
loiclec Oct 17, 2022
b7f2428
Fix formatting and warning after rebasing from main
loiclec Oct 26, 2022
2741756
Merge remote-tracking branch 'origin/main' into facet-levels-refactor
loiclec Oct 26, 2022
631e991
Depend on released version of fuzzcheck from crates.io
loiclec Oct 26, 2022
2fa85a2
Remove outdated files from http-ui/ and infos/
loiclec Oct 26, 2022
54c0cf9
Merge remote-tracking branch 'origin/main' into facet-levels-refactor
loiclec Oct 26, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
/target
/Cargo.lock

milli/target/

# datasets
*.csv
*.mmdb
Expand All @@ -11,6 +13,8 @@
# Snapshots
## ... large
*.full.snap

# ... unreviewed
## ... unreviewed
*.snap.new

# Fuzzcheck data for the facet indexing fuzz test
milli/fuzz/update::facet::incremental::fuzz::fuzz/
5 changes: 4 additions & 1 deletion milli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,10 @@ big_s = "1.0.2"
insta = "1.21.0"
maplit = "1.0.2"
md5 = "0.7.0"
rand = "0.8.5"
rand = {version = "0.8.5", features = ["small_rng"] }

[target.'cfg(fuzzing)'.dev-dependencies]
fuzzcheck = "0.12.1"

[features]
default = [ "charabia/default" ]
Expand Down
23 changes: 23 additions & 0 deletions milli/src/heed_codec/byte_slice_ref.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
use std::borrow::Cow;

use heed::{BytesDecode, BytesEncode};

/// A codec for values of type `&[u8]`.
///
/// Unlike `ByteSlice`, its `EItem` and `DItem` associated types are the same type
/// (`&'a [u8]`), and these values can reside within another structure.
pub struct ByteSliceRefCodec;

impl<'a> BytesEncode<'a> for ByteSliceRefCodec {
    type EItem = &'a [u8];

    /// Encodes `item` by borrowing the referenced slice as-is.
    ///
    /// No bytes are copied and the result is always `Some`.
    fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
        let borrowed: &'a [u8] = *item;
        Some(Cow::Borrowed(borrowed))
    }
}

impl<'a> BytesDecode<'a> for ByteSliceRefCodec {
    type DItem = &'a [u8];

    /// Decodes `bytes` by returning the very same slice.
    ///
    /// The result is always `Some`.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        let decoded: Self::DItem = bytes;
        Some(decoded)
    }
}
89 changes: 0 additions & 89 deletions milli/src/heed_codec/facet/facet_level_value_f64_codec.rs

This file was deleted.

53 changes: 0 additions & 53 deletions milli/src/heed_codec/facet/facet_level_value_u32_codec.rs

This file was deleted.

50 changes: 0 additions & 50 deletions milli/src/heed_codec/facet/facet_string_level_zero_codec.rs

This file was deleted.

90 changes: 0 additions & 90 deletions milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs
Original file line number Diff line number Diff line change
@@ -1,90 +0,0 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::{marker, str};

use crate::error::SerializationError;
use crate::heed_codec::RoaringBitmapCodec;
use crate::{try_split_array_at, try_split_at, Result};

/// A [`StringValueCodec`] whose trailing value is encoded/decoded with `RoaringBitmapCodec`.
pub type FacetStringLevelZeroValueCodec = StringValueCodec<RoaringBitmapCodec>;

/// A codec that encodes a string in front of a value.
///
/// The encoded layout is: the byte length of the string as a big-endian `u16`,
/// the UTF-8 bytes of the string, then the value as encoded by the inner codec `C`.
///
/// The use case is the facet string levels algorithm, where we must know the
/// original string of a normalized facet value. The original strings are stored
/// in the value so as not to break the lexicographical ordering of the LMDB keys.
pub struct StringValueCodec<C>(marker::PhantomData<C>);

impl<'a, C> heed::BytesDecode<'a> for StringValueCodec<C>
where
    C: heed::BytesDecode<'a>,
{
    type DItem = (&'a str, C::DItem);

    /// Splits the length-prefixed string off the front of `bytes`, then lets the
    /// inner codec `C` decode whatever remains.
    ///
    /// Returns `None` if either the prefix string or the inner value fails to decode.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        let (prefix, rest) = decode_prefix_string(bytes)?;
        let value = C::bytes_decode(rest)?;
        Some((prefix, value))
    }
}

impl<'a, C> heed::BytesEncode<'a> for StringValueCodec<C>
where
    C: heed::BytesEncode<'a>,
{
    type EItem = (&'a str, C::EItem);

    /// Writes the length-prefixed string followed by the value as encoded by `C`.
    ///
    /// Returns `None` if the inner codec fails to encode the value, or if the
    /// string cannot be written by `encode_prefix_string`.
    fn bytes_encode((string, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
        let encoded_value = C::bytes_encode(value)?;

        // 2 bytes for the `u16` length prefix, then the string, then the value.
        let capacity = 2 + string.len() + encoded_value.len();
        let mut buffer = Vec::with_capacity(capacity);
        encode_prefix_string(string, &mut buffer).ok()?;
        buffer.extend_from_slice(&*encoded_value);

        Some(Cow::Owned(buffer))
    }
}

/// Reads a length-prefixed string from the front of `value`.
///
/// The first two bytes are interpreted as a big-endian `u16` giving the byte
/// length of the string that follows. Returns the decoded string together with
/// the bytes located after it, or `None` when the input is too short or the
/// string bytes are not valid UTF-8.
pub fn decode_prefix_string(value: &[u8]) -> Option<(&str, &[u8])> {
    let (length_prefix, remainder) = try_split_array_at(value)?;
    let string_len = usize::from(u16::from_be_bytes(length_prefix));
    let (string_bytes, trailing) = try_split_at(remainder, string_len)?;
    str::from_utf8(string_bytes).ok().map(|string| (string, trailing))
}

/// Appends `string` to `buffer`, preceded by its byte length as a big-endian `u16`.
///
/// # Errors
///
/// Returns `SerializationError::InvalidNumberSerialization` when the byte length
/// of `string` does not fit in a `u16`; nothing is written to `buffer` in that case.
pub fn encode_prefix_string(string: &str, buffer: &mut Vec<u8>) -> Result<()> {
    let length_prefix = TryInto::<u16>::try_into(string.len())
        .map_err(|_| SerializationError::InvalidNumberSerialization)?
        .to_be_bytes();
    buffer.extend_from_slice(&length_prefix);
    buffer.extend_from_slice(string.as_bytes());
    Ok(())
}

#[cfg(test)]
mod tests {
    use heed::types::Unit;
    use heed::{BytesDecode, BytesEncode};
    use roaring::RoaringBitmap;

    use super::*;

    /// A string followed by a roaring bitmap must survive an encode/decode round trip.
    #[test]
    fn deserialize_roaring_bitmaps() {
        type Codec = StringValueCodec<RoaringBitmapCodec>;

        let expected_string = "abc";
        let expected_docids: RoaringBitmap = (0..100).chain(3500..4398).collect();

        let item = (expected_string, expected_docids.clone());
        let encoded = Codec::bytes_encode(&item).unwrap();
        let decoded = Codec::bytes_decode(&encoded).unwrap();

        assert_eq!(decoded, (expected_string, expected_docids));
    }

    /// A string followed by the unit value must survive an encode/decode round trip.
    #[test]
    fn deserialize_unit() {
        type Codec = StringValueCodec<Unit>;

        let expected_string = "def";

        let item = (expected_string, ());
        let encoded = Codec::bytes_encode(&item).unwrap();
        let decoded = Codec::bytes_decode(&encoded).unwrap();

        assert_eq!(decoded, (expected_string, ()));
    }
}
Loading