Skip to content
This repository was archived by the owner on Apr 4, 2023. It is now read-only.

Commit bc7b676

Browse files
committed
Merge remote-tracking branch 'origin/main' into obkv-documents
2 parents 8537bf8 + 5cbe879 commit bc7b676

30 files changed

+301
-99
lines changed

.github/workflows/benchmarks.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ on:
99
default: 'search_songs'
1010

1111
env:
12-
HOME: "/home/runner" # The actions-rs/toolchain@v1 can fail we have no $HOME defined
12+
BENCH_NAME: ${{ github.event.inputs.dataset_name }}
1313

1414
jobs:
1515
benchmarks:
@@ -38,14 +38,14 @@ jobs:
3838
id: commit_sha
3939
- name: Set file basename with format "dataset_branch_commitSHA"
4040
shell: bash
41-
run: echo "##[set-output name=basename;]$(echo ${{ github.event.inputs.dataset_name }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
41+
run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
4242
id: file
4343

4444
# Run benchmarks
45-
- name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
45+
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
4646
run: |
4747
cd benchmarks
48-
cargo bench --bench ${{ github.event.inputs.dataset_name }} -- --save-baseline ${{ steps.file.outputs.basename }}
48+
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
4949
5050
# Generate critcmp files
5151
- name: Install critcmp
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
name: Benchmarks indexing (cron)
2+
3+
on:
4+
schedule:
5+
- cron: "30 0 * * FRI" # every friday at 00:30
6+
7+
env:
8+
BENCH_NAME: "indexing"
9+
10+
jobs:
11+
benchmarks:
12+
name: Run and upload benchmarks
13+
runs-on: self-hosted
14+
steps:
15+
- uses: actions/checkout@v2
16+
- uses: actions-rs/toolchain@v1
17+
with:
18+
profile: minimal
19+
toolchain: stable
20+
override: true
21+
22+
# Set variables
23+
- name: Set current branch name
24+
shell: bash
25+
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})"
26+
id: current_branch
27+
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
28+
shell: bash
29+
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')"
30+
id: normalized_current_branch
31+
- name: Set shorter commit SHA
32+
shell: bash
33+
run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)"
34+
id: commit_sha
35+
- name: Set file basename with format "dataset_branch_commitSHA"
36+
shell: bash
37+
run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
38+
id: file
39+
40+
# Run benchmarks
41+
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
42+
run: |
43+
cd benchmarks
44+
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
45+
46+
# Generate critcmp files
47+
- name: Install critcmp
48+
run: cargo install critcmp
49+
- name: Export critcmp file
50+
run: |
51+
critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
52+
53+
# Upload benchmarks
54+
- name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
55+
uses: BetaHuhn/do-spaces-action@v2
56+
with:
57+
access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
58+
secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
59+
space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
60+
space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
61+
source: ${{ steps.file.outputs.basename }}.json
62+
out_dir: critcmp_results
63+
64+
# Helper
65+
- name: 'README: compare with another benchmark'
66+
run: |
67+
echo "${{ steps.file.outputs.basename }}.json has just been pushed."
68+
echo 'How to compare this benchmark with another one?'
69+
echo ' - Check the available files with: ./benchmarks/scripts/list.sh'
70+
echo " - Run the following command: ./benchmarks/scripts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
name: Benchmarks search songs (cron)
2+
3+
on:
4+
schedule:
5+
- cron: "30 08 * * FRI" # every friday at 08:30
6+
7+
env:
8+
BENCH_NAME: "search_songs"
9+
10+
jobs:
11+
benchmarks:
12+
name: Run and upload benchmarks
13+
runs-on: self-hosted
14+
steps:
15+
- uses: actions/checkout@v2
16+
- uses: actions-rs/toolchain@v1
17+
with:
18+
profile: minimal
19+
toolchain: stable
20+
override: true
21+
22+
# Set variables
23+
- name: Set current branch name
24+
shell: bash
25+
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})"
26+
id: current_branch
27+
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
28+
shell: bash
29+
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')"
30+
id: normalized_current_branch
31+
- name: Set shorter commit SHA
32+
shell: bash
33+
run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)"
34+
id: commit_sha
35+
- name: Set file basename with format "dataset_branch_commitSHA"
36+
shell: bash
37+
run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
38+
id: file
39+
40+
# Run benchmarks
41+
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
42+
run: |
43+
cd benchmarks
44+
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
45+
46+
# Generate critcmp files
47+
- name: Install critcmp
48+
run: cargo install critcmp
49+
- name: Export critcmp file
50+
run: |
51+
critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
52+
53+
# Upload benchmarks
54+
- name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
55+
uses: BetaHuhn/do-spaces-action@v2
56+
with:
57+
access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
58+
secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
59+
space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
60+
space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
61+
source: ${{ steps.file.outputs.basename }}.json
62+
out_dir: critcmp_results
63+
64+
# Helper
65+
- name: 'README: compare with another benchmark'
66+
run: |
67+
echo "${{ steps.file.outputs.basename }}.json has just been pushed."
68+
echo 'How to compare this benchmark with another one?'
69+
echo ' - Check the available files with: ./benchmarks/scripts/list.sh'
70+
echo " - Run the following command: ./benchmarks/scripts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
name: Benchmarks search wikipedia articles (cron)
2+
3+
on:
4+
schedule:
5+
- cron: "30 16 * * FRI" # every friday at 16:30 (it’s snacky snack-time!)
6+
7+
env:
8+
BENCH_NAME: "search_wiki"
9+
10+
jobs:
11+
benchmarks:
12+
name: Run and upload benchmarks
13+
runs-on: self-hosted
14+
steps:
15+
- uses: actions/checkout@v2
16+
- uses: actions-rs/toolchain@v1
17+
with:
18+
profile: minimal
19+
toolchain: stable
20+
override: true
21+
22+
# Set variables
23+
- name: Set current branch name
24+
shell: bash
25+
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})"
26+
id: current_branch
27+
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
28+
shell: bash
29+
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')"
30+
id: normalized_current_branch
31+
- name: Set shorter commit SHA
32+
shell: bash
33+
run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)"
34+
id: commit_sha
35+
- name: Set file basename with format "dataset_branch_commitSHA"
36+
shell: bash
37+
run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
38+
id: file
39+
40+
# Run benchmarks
41+
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
42+
run: |
43+
cd benchmarks
44+
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
45+
46+
# Generate critcmp files
47+
- name: Install critcmp
48+
run: cargo install critcmp
49+
- name: Export critcmp file
50+
run: |
51+
critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
52+
53+
# Upload benchmarks
54+
- name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
55+
uses: BetaHuhn/do-spaces-action@v2
56+
with:
57+
access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
58+
secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
59+
space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
60+
space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
61+
source: ${{ steps.file.outputs.basename }}.json
62+
out_dir: critcmp_results
63+
64+
# Helper
65+
- name: 'README: compare with another benchmark'
66+
run: |
67+
echo "${{ steps.file.outputs.basename }}.json has just been pushed."
68+
echo 'How to compare this benchmark with another one?'
69+
echo ' - Check the available files with: ./benchmarks/scripts/list.sh'
70+
echo " - Run the following command: ./benchmarks/scripts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"

Cargo.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

helpers/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "helpers"
3-
version = "0.11.0"
3+
version = "0.12.0"
44
authors = ["Clément Renault <clement@meilisearch.com>"]
55
edition = "2018"
66

http-ui/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[package]
22
name = "http-ui"
33
description = "The HTTP user interface of the milli search engine"
4-
version = "0.11.0"
4+
version = "0.12.0"
55
authors = ["Clément Renault <clement@meilisearch.com>"]
66
edition = "2018"
77

infos/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "infos"
3-
version = "0.11.0"
3+
version = "0.12.0"
44
authors = ["Clément Renault <clement@meilisearch.com>"]
55
edition = "2018"
66

milli/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "milli"
3-
version = "0.11.0"
3+
version = "0.12.0"
44
authors = ["Kerollmops <clement@meilisearch.com>"]
55
edition = "2018"
66

milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use std::{marker, str};
55
use crate::error::SerializationError;
66
use crate::heed_codec::RoaringBitmapCodec;
77
use crate::{try_split_array_at, try_split_at, Result};
8+
89
pub type FacetStringLevelZeroValueCodec = StringValueCodec<RoaringBitmapCodec>;
910

1011
/// A codec that encodes a string in front of a value.
@@ -22,7 +23,6 @@ where
2223

2324
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
2425
let (string, bytes) = decode_prefix_string(bytes)?;
25-
2626
C::bytes_decode(bytes).map(|item| (string, item))
2727
}
2828
}
@@ -35,7 +35,6 @@ where
3535

3636
fn bytes_encode((string, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
3737
let value_bytes = C::bytes_encode(value)?;
38-
3938
let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len());
4039
encode_prefix_string(string, &mut bytes).ok()?;
4140
bytes.extend_from_slice(&value_bytes[..]);

milli/src/search/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,8 @@ impl<'a> Search<'a> {
145145

146146
// We check that we are allowed to use the sort criteria, we check
147147
// that they are declared in the sortable fields.
148-
let sortable_fields = self.index.sortable_fields(self.rtxn)?;
149148
if let Some(sort_criteria) = &self.sort_criteria {
149+
let sortable_fields = self.index.sortable_fields(self.rtxn)?;
150150
for asc_desc in sort_criteria {
151151
let field = asc_desc.field();
152152
if !sortable_fields.contains(field) {

milli/src/update/facets.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
5757
self
5858
}
5959

60+
#[logging_timer::time("Facets::{}")]
6061
pub fn execute(self) -> Result<()> {
6162
self.index.set_updated_at(self.wtxn, &Utc::now())?;
6263
// We get the faceted fields to be able to create the facet levels.

milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,15 +58,16 @@ pub fn extract_fid_docid_facet_values<R: io::Read>(
5858
// insert facet numbers in sorter
5959
for number in numbers {
6060
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
61-
let value_bytes = f64_into_bytes(number).unwrap(); // invalid float
62-
key_buffer.extend_from_slice(&value_bytes);
63-
key_buffer.extend_from_slice(&number.to_be_bytes());
61+
if let Some(value_bytes) = f64_into_bytes(number) {
62+
key_buffer.extend_from_slice(&value_bytes);
63+
key_buffer.extend_from_slice(&number.to_be_bytes());
6464

65-
fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?;
65+
fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?;
66+
}
6667
}
6768

6869
// insert normalized and original facet string in sorter
69-
for (normalized, original) in strings {
70+
for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) {
7071
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
7172
key_buffer.extend_from_slice(normalized.as_bytes());
7273
fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?;

milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ use super::helpers::{
88
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
99
try_split_array_at, GrenadParameters, MergeFn,
1010
};
11+
use crate::error::SerializationError;
12+
use crate::index::db_name::DOCID_WORD_POSITIONS;
1113
use crate::proximity::extract_position;
1214
use crate::{DocumentId, FieldId, Result};
1315

@@ -36,7 +38,8 @@ pub fn extract_fid_word_count_docids<R: io::Read>(
3638
let mut current_document_id = None;
3739

3840
while let Some((key, value)) = docid_word_positions.next()? {
39-
let (document_id_bytes, _word_bytes) = try_split_array_at(key).unwrap();
41+
let (document_id_bytes, _word_bytes) = try_split_array_at(key)
42+
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
4043
let document_id = u32::from_be_bytes(document_id_bytes);
4144

4245
let curr_document_id = *current_document_id.get_or_insert(document_id);

0 commit comments

Comments
 (0)