Use Levenshtein distance to score documents in fuzzy term queries #998
@@ -1,12 +1,14 @@
-use crate::common::BitSet;
 use crate::core::SegmentReader;
-use crate::query::ConstScorer;
-use crate::query::{BitSetDocSet, Explanation};
+use crate::query::fuzzy_query::DFAWrapper;
+use crate::query::score_combiner::SumCombiner;
+use crate::query::Explanation;
+use crate::query::{ConstScorer, Union};
 use crate::query::{Scorer, Weight};
 use crate::schema::{Field, IndexRecordOption};
-use crate::termdict::{TermDictionary, TermStreamer};
+use crate::termdict::{TermDictionary, TermWithStateStreamer};
 use crate::TantivyError;
 use crate::{DocId, Score};
+use std::any::{Any, TypeId};
 use std::io;
 use std::sync::Arc;
 use tantivy_fst::Automaton;
@@ -33,9 +35,9 @@ where
     fn automaton_stream<'a>(
         &'a self,
         term_dict: &'a TermDictionary,
-    ) -> io::Result<TermStreamer<'a, &'a A>> {
+    ) -> io::Result<TermWithStateStreamer<'a, &'a A>> {
         let automaton: &A = &*self.automaton;
-        let term_stream_builder = term_dict.search(automaton);
+        let term_stream_builder = term_dict.search_with_state(automaton);
         term_stream_builder.into_stream()
     }
 }
@@ -46,35 +48,27 @@ where
     A::State: Clone,
 {
     fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
-        let max_doc = reader.max_doc();
-        let mut doc_bitset = BitSet::with_max_value(max_doc);
         let inverted_index = reader.inverted_index(self.field)?;
         let term_dict = inverted_index.terms();
         let mut term_stream = self.automaton_stream(term_dict)?;
-        while term_stream.advance() {
-            let term_info = term_stream.value();
-            let mut block_segment_postings = inverted_index
-                .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?;
-            loop {
-                let docs = block_segment_postings.docs();
-                if docs.is_empty() {
-                    break;
-                }
-                for &doc in docs {
-                    doc_bitset.insert(doc);
-                }
-                block_segment_postings.advance();
-            }
+
+        let mut scorers = vec![];
+        while let Some((_term, term_info, state)) = term_stream.next() {
+            let score = automaton_score(self.automaton.as_ref(), state);
+            let segment_postings =
+                inverted_index.read_postings_from_terminfo(term_info, IndexRecordOption::Basic)?;
+            let scorer = ConstScorer::new(segment_postings, boost * score);
+            scorers.push(scorer);
         }
-        let doc_bitset = BitSetDocSet::from(doc_bitset);
-        let const_scorer = ConstScorer::new(doc_bitset, boost);
-        Ok(Box::new(const_scorer))
+
+        let scorer = Union::<_, SumCombiner>::from(scorers);

Will this sum up the score of all …

As I understand it, yes. So if a doc contains …

Maybe I did not understand this method correctly then. I thought it would return a scorer for multiple different documents, and in this case sum the scores of different documents.

+        Ok(Box::new(scorer))
     }

     fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
         let mut scorer = self.scorer(reader, 1.0)?;
         if scorer.seek(doc) == doc {
-            Ok(Explanation::new("AutomatonScorer", 1.0))
+            Ok(Explanation::new("AutomatonScorer", scorer.score()))
         } else {
             Err(TantivyError::InvalidArgument(
                 "Document does not exist".to_string(),
@@ -83,6 +77,25 @@ where
     }
 }

+fn automaton_score<A>(automaton: &A, state: A::State) -> f32
+where
+    A: Automaton + Send + Sync + 'static,
+    A::State: Clone,
+{
+    if TypeId::of::<DFAWrapper>() == automaton.type_id() && TypeId::of::<u32>() == state.type_id() {
+        let dfa = automaton as *const A as *const DFAWrapper;
+        let dfa = unsafe { &*dfa };
+
+        let id = &state as *const A::State as *const u32;
+        let id = unsafe { *id };
+
+        let dist = dfa.0.distance(id).to_u8() as f32;
+        1.0 / (1.0 + dist)

Can we document this score somewhere in the doc of the FuzzyTermScorer? Ideally I think this should be implemented as an array … I'm ok with …

Just to be sure, isn't the max levenshtein distance 2? In this range …

Or does the transposition below multiply it by 2 again?

Ah yes, you are right!

+    } else {
+        1.0
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::AutomatonWeight;
@@ -4,7 +4,7 @@ use super::TermDictionary;
 use crate::postings::TermInfo;
 use crate::termdict::TermOrdinal;
 use tantivy_fst::automaton::AlwaysMatch;
-use tantivy_fst::map::{Stream, StreamBuilder};
+use tantivy_fst::map::{Stream, StreamBuilder, StreamWithState};
 use tantivy_fst::Automaton;
 use tantivy_fst::{IntoStreamer, Streamer};
@@ -149,3 +149,153 @@
         }
     }
 }
+
+/// `TermWithStateStreamerBuilder` is a helper object used to define
+/// a range of terms that should be streamed.
+pub struct TermWithStateStreamerBuilder<'a, A = AlwaysMatch>
+where
+    A: Automaton,
+    A::State: Clone,
+{
+    fst_map: &'a TermDictionary,
+    stream_builder: StreamBuilder<'a, A>,
+}

I think it is ok to just have a TermWithStateStreamer and avoid the code duplication. Most of the time the state is small and "Copy".

So is it OK if I remove the extra …

Yes
+impl<'a, A> TermWithStateStreamerBuilder<'a, A>
+where
+    A: Automaton,
+    A::State: Clone,
+{
+    pub(crate) fn new(fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a, A>) -> Self {
+        TermWithStateStreamerBuilder {
+            fst_map,
+            stream_builder,
+        }
+    }
+
+    /// Limit the range to terms greater or equal to the bound
+    pub fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
+        self.stream_builder = self.stream_builder.ge(bound);
+        self
+    }
+
+    /// Limit the range to terms strictly greater than the bound
+    pub fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
+        self.stream_builder = self.stream_builder.gt(bound);
+        self
+    }
+
+    /// Limit the range to terms lesser or equal to the bound
+    pub fn le<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
+        self.stream_builder = self.stream_builder.le(bound);
+        self
+    }
+
+    /// Limit the range to terms strictly lesser than the bound
+    pub fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
+        self.stream_builder = self.stream_builder.lt(bound);
+        self
+    }
+
+    /// Iterate over the range backwards.
+    pub fn backward(mut self) -> Self {
+        self.stream_builder = self.stream_builder.backward();
+        self
+    }
+
+    /// Creates the stream corresponding to the range
+    /// of terms defined using the `TermWithStateStreamerBuilder`.
+    pub fn into_stream(self) -> io::Result<TermWithStateStreamer<'a, A>> {
+        Ok(TermWithStateStreamer {
+            fst_map: self.fst_map,
+            stream: self.stream_builder.with_state().into_stream(),
+            term_ord: 0u64,
+            current_key: Vec::with_capacity(100),
+            current_value: TermInfo::default(),
+            current_state: None,
+        })
+    }
+}
+
+/// `TermWithStateStreamer` acts as a cursor over a range of terms of a segment.
+/// Terms are guaranteed to be sorted.
+pub struct TermWithStateStreamer<'a, A = AlwaysMatch>
+where
+    A: Automaton,
+    A::State: Clone,
+{
+    fst_map: &'a TermDictionary,
+    stream: StreamWithState<'a, A>,
+    term_ord: TermOrdinal,
+    current_key: Vec<u8>,
+    current_value: TermInfo,
+    current_state: Option<A::State>,
+}
+
+impl<'a, A> TermWithStateStreamer<'a, A>
+where
+    A: Automaton,
+    A::State: Clone,
+{
+    /// Advances the stream to the next item.
+    /// Before the first call to `.advance()`, the stream
+    /// is in an uninitialized state.
+    pub fn advance(&mut self) -> bool {
+        if let Some((term, term_ord, state)) = self.stream.next() {
+            self.current_key.clear();
+            self.current_key.extend_from_slice(term);
+            self.term_ord = term_ord;
+            self.current_value = self.fst_map.term_info_from_ord(term_ord);
+            self.current_state = Some(state);
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Returns the `TermOrdinal` of the current term.
+    ///
+    /// May panic if `.advance()` has never
+    /// been called before.
+    pub fn term_ord(&self) -> TermOrdinal {
+        self.term_ord
+    }
+
+    /// Accesses the current key.
+    ///
+    /// `.key()` returns the key that was returned
+    /// by the last call to the `.next()` method.
+    ///
+    /// If the end of the stream has been reached, and `.next()`
+    /// has been called and returned `None`, `.key()` keeps
+    /// the value of the last key encountered.
+    ///
+    /// Before any call to `.next()`, `.key()` returns an empty array.
+    pub fn key(&self) -> &[u8] {
+        &self.current_key
+    }
+
+    /// Accesses the current value.
+    ///
+    /// Calling `.value()` after the end of the stream will return the
+    /// last `.value()` encountered.
+    ///
+    /// Calling `.value()` before the first call to `.advance()` returns
+    /// `TermInfo::default()`.
+    pub fn value(&self) -> &TermInfo {
+        &self.current_value
+    }
+
+    /// Returns the next `(key, value, state)` triplet.
+    #[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
+    pub fn next(&mut self) -> Option<(&[u8], &TermInfo, A::State)> {
+        if self.advance() {
+            let state = self.current_state.take().unwrap(); // always Some(_) after advance
+            Some((self.key(), self.value(), state))
+        } else {
+            None
+        }
+    }
+}
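As a usage sketch of the new API (assumptions: a `term_dict: &TermDictionary` and the fuzzy query's `dfa_wrapper: DFAWrapper` are in scope; `TermInfo` is shown via `Debug`), iteration would look roughly like this:

```rust
// Hedged sketch: iterate all terms accepted by the Levenshtein DFA,
// recovering the automaton state (and hence the edit distance) per term.
let mut stream = term_dict
    .search_with_state(&dfa_wrapper)
    .into_stream()
    .expect("open term stream");
while let Some((term_bytes, term_info, state)) = stream.next() {
    // For DFAWrapper the state is a u32 id; the wrapped DFA maps it
    // back to a distance, exactly as automaton_score does above.
    let dist = dfa_wrapper.0.distance(state).to_u8();
    println!("term={:?} dist={} info={:?}", term_bytes, dist, term_info);
}
```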
This is impractical. It may end up being a list of way too many scorers. You should probably use a TAAT (term-at-a-time) strategy here.

Let's say you know how to compute the score associated with one term match, as a function of its doc_freq, term_freq and, most importantly, the distance (note that the doc_freq alone is not necessarily the best indication... If I am at levenshtein distance 1 of two words, the one with the highest document frequency is probably the one that I was shooting for).

You probably want the score for the fuzzy term query to be the max of the term scores of all of the terms that were found in the document, not the sum. So you could keep the bitset that was there before. In addition, you could allocate a Vec<Score> with a len equal to max_doc and initialize it to 0. As you go through the terms, you then update the score of the docs that match:

score[doc] = max(score[doc], term_score(lev_distance, term_frequency, doc_frequency, ...))

Once the bitset and the vector of scores have been populated, you can return a Scorer that iterates through the bitset and returns the computed scores. You can for instance implement a wrapper around the BitSetDocSet... (see the sketch below)
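A rough, self-contained sketch of that TAAT strategy, with postings simplified to plain doc-id lists and all names invented for illustration:

```rust
type DocId = u32;
type Score = f32;

// Rough sketch of the TAAT (term-at-a-time) strategy described above.
// One pass over the matching terms fills a set of matching docs and a
// per-doc score array, max-combining the per-term scores.
fn taat_scores(max_doc: DocId, term_matches: &[(Vec<DocId>, Score)]) -> (Vec<bool>, Vec<Score>) {
    // doc_id < max_doc is guaranteed, so dense arrays indexed by doc work.
    let mut matches = vec![false; max_doc as usize]; // stand-in for the BitSet
    let mut scores = vec![0f32; max_doc as usize];
    for (docs, term_score) in term_matches {
        for &doc in docs {
            matches[doc as usize] = true;
            // Max-combine: the best-matching term determines the doc's score.
            scores[doc as usize] = scores[doc as usize].max(*term_score);
        }
    }
    (matches, scores)
}
```

A scorer can then walk the bitset and look each doc's score up in the array, e.g. through a wrapper around BitSetDocSet.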
The AutomatonWeight is used by more than the fuzzy term query. For the regex query for instance, this extra vec with scores is too big an overhead. Please find a solution to avoid the extra cost.
My understanding was that the doc_id were not necessarily between 0 and max_doc, so I didn't consider using a score vector like scores[doc as usize]. Was I wrong, or should it be a HashMap instead of a Vec?

As for the scoring formula, I see how to get doc_freq from term_info for the current segment, but I still need to sum those across all segments, right? And as we only have access to a specific SegmentReader here, how do we get the others without a Searcher?

Moreover, to get the term_freq I still need the SegmentPostings, so I don't see how to reconcile that with the previous Bitset/BitSetDocSet approach. All this made me use the simple 1 / (1 + dist) scoring instead.

Finally, to avoid impacting regex queries I think we can get back to the previous implementation for the generic case, and just have a specialized impl Weight for AutomatonWeight<DFAWrapper> as @maufl suggested.
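For reference, that kind of specialization can be approximated on stable Rust with the same TypeId test the PR already uses in automaton_score. A minimal, safe illustration of the pattern (all names invented):

```rust
use std::any::Any;

// Sketch of the runtime-specialization pattern: test the concrete type at
// runtime, then take the specialized path, falling back to the generic one.
// This uses the safe Any downcast rather than the PR's raw-pointer cast.
fn describe<T: Any>(value: &T) -> &'static str {
    let value_any = value as &dyn Any;
    if value_any.downcast_ref::<u32>().is_some() {
        "a u32 (e.g. a Levenshtein DFA state id)"
    } else {
        "some other type, handled generically"
    }
}

fn main() {
    assert_eq!(describe(&7u32), "a u32 (e.g. a Levenshtein DFA state id)");
    assert_eq!(describe(&"regex"), "some other type, handled generically");
}
```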
You are guaranteed that doc_id < max_doc.
The scoring part seems very difficult. I commented on the issue. I'd like us to take a step back and think longer about what spec we want.
Here are my two stabs at the score problem.

I think it might be useful to approach the question of "how do you adjust scores in fuzzy searching" by asking "how likely was it that the initial query term was really an incorrect version of another term". If you knew the probability of an error you could weight the score of each fuzzy term: score_adj = score * P(T|Q), where P(T|Q) is "the probability that term T is being searched for given query Q". Just multiplying by P(T|Q) has issues (floating point error, a few high-probability events tend to dominate, etc.), so -log(P(T_i)), i.e. I(T_i) (the Shannon information), should be used instead. This leaves us with score_adjusted = score * (1 / I(T_i)) (I'm not entirely sure this is correct). This leaves the question of how you get P(T_i) or I(T_i). Long story short, I(T_i) can be approximated by the levenshtein distance. This leaves us at score_adj = score * (1 / (α + dist)), where α is I(Q) (the information given that an initial query with levenshtein distance = 0 is correct). It just so happens that score_adj = score * 1 / (1 + dist) is valid here! α = 1 implies that the probability that a lev distance of 0 is correct is 10%; α = 0.04575749056 would correspond to a 90% probability.

Another approach would be to try and fix the scoring function itself. BM25 uses inverse document frequency to compensate for words that rarely show up in documents. One interpretation is that the IDF function is a stand-in for the amount of information a document containing a term conveys. As explained above, lev distance can represent the information that a search being incorrect conveys. To incorporate lev distance into BM25, it might be useful to investigate the effect of subtracting the lev distance from the IDF:

score(D, Q) = Σ_{i=1}^{n} (IDF(q_i) - LevDist(q_i)) * (f(q_i, D) * (k1 + 1)) / (f(q_i, D) + k1 * (1 - b + b * |D| / avgdl))

This would leave the score of dist = 0 terms unchanged and would lower the scores of results with a higher lev dist.
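A quick numeric check of the first proposal, assuming base-10 logarithms as the 10%/90% figures above imply (`score_adj` is a hypothetical helper; exploration only, not tantivy code):

```rust
// Numeric sketch of score_adj = score / (alpha + dist).
fn score_adj(score: f32, dist: u8, alpha: f32) -> f32 {
    score / (alpha + dist as f32)
}

fn main() {
    // alpha = 1.0 corresponds to a 10% prior that the exact query is correct:
    // dist=0 -> 1.000, dist=1 -> 0.500, dist=2 -> 0.333.
    for dist in 0..=2u8 {
        println!("alpha=1.0 dist={} -> {:.3}", dist, score_adj(1.0, dist, 1.0));
    }
    // alpha = -log10(0.9) ≈ 0.04576 corresponds to a 90% prior.
    let alpha90 = -(0.9f32.log10());
    println!("alpha for 90% prior: {:.5}", alpha90);
}
```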
This would leave the score of dist=0 terms unchanged and would lower scores of results with higher Lev dist.