Skip to content

Commit e0f3313

Browse files
authored
Merge pull request #14 from vectorlessflow/dev
Dev
2 parents bb2f51d + 52cfbf1 commit e0f3313

File tree

5 files changed

+559
-206
lines changed

5 files changed

+559
-206
lines changed

Cargo.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "vectorless"
3-
version = "0.1.15"
3+
version = "0.1.16"
44
edition = "2024"
55
authors = ["zTgx <beautifularea@gmail.com>"]
66
description = "Hierarchical, reasoning-native document intelligence engine"
@@ -80,6 +80,9 @@ roxmltree = "0.20"
8080
# Random number generation (for sampling)
8181
rand = "0.8"
8282

83+
# BM25 scoring
84+
bm25 = { version = "2.3.2", features = ["parallelism"] }
85+
8386
[dev-dependencies]
8487
tempfile = "3.10"
8588
tokio-test = "0.4"

src/retrieval/content/scorer.rs

Lines changed: 10 additions & 155 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
use std::collections::HashMap;
1010

1111
use crate::document::NodeId;
12+
use crate::retrieval::search::{extract_keywords, Bm25Params, STOPWORDS};
1213
use crate::util::estimate_tokens;
1314

1415
use super::config::ScoringStrategyConfig;
@@ -130,8 +131,7 @@ pub struct RelevanceScorer {
130131
/// Scoring strategy to use.
131132
strategy: ScoringStrategyConfig,
132133
/// BM25 parameters.
133-
k1: f32,
134-
b: f32,
134+
params: Bm25Params,
135135
}
136136

137137
impl RelevanceScorer {
@@ -142,8 +142,7 @@ impl RelevanceScorer {
142142
Self {
143143
query_keywords,
144144
strategy,
145-
k1: 1.2,
146-
b: 0.75,
145+
params: Bm25Params::default(),
147146
}
148147
}
149148

@@ -153,8 +152,7 @@ impl RelevanceScorer {
153152
Self {
154153
query_keywords: keywords,
155154
strategy,
156-
k1: 1.2,
157-
b: 0.75,
155+
params: Bm25Params::default(),
158156
}
159157
}
160158

@@ -240,13 +238,15 @@ impl RelevanceScorer {
240238
continue;
241239
}
242240

243-
// IDF calculation
241+
// IDF calculation (smoothed, non-negative form: ln(1 + (N - df + 0.5) / (df + 0.5)))
244242
let df = ctx.doc_freq.get(&term_lower).copied().unwrap_or(1) as f32;
245243
let idf = ((ctx.doc_count as f32 - df + 0.5) / (df + 0.5) + 1.0).ln();
246244

247245
// BM25 formula
248-
let numerator = tf * (self.k1 + 1.0);
249-
let denominator = tf + self.k1 * (1.0 - self.b + self.b * doc_len / ctx.avg_doc_len);
246+
let k1 = self.params.k1;
247+
let b = self.params.b;
248+
let numerator = tf * (k1 + 1.0);
249+
let denominator = tf + k1 * (1.0 - b + b * doc_len / ctx.avg_doc_len);
250250

251251
score += idf * numerator / denominator;
252252
}
@@ -263,159 +263,14 @@ impl RelevanceScorer {
263263
}
264264
}
265265

266-
/// Extract keywords from a query string.
267-
fn extract_keywords(query: &str) -> Vec<String> {
268-
// Common English stop words
269-
const STOPWORDS: &[&str] = &[
270-
"a",
271-
"an",
272-
"the",
273-
"is",
274-
"are",
275-
"was",
276-
"were",
277-
"be",
278-
"been",
279-
"being",
280-
"have",
281-
"has",
282-
"had",
283-
"do",
284-
"does",
285-
"did",
286-
"will",
287-
"would",
288-
"could",
289-
"should",
290-
"may",
291-
"might",
292-
"must",
293-
"shall",
294-
"can",
295-
"need",
296-
"dare",
297-
"ought",
298-
"used",
299-
"to",
300-
"of",
301-
"in",
302-
"for",
303-
"on",
304-
"with",
305-
"at",
306-
"by",
307-
"from",
308-
"as",
309-
"into",
310-
"through",
311-
"during",
312-
"before",
313-
"after",
314-
"above",
315-
"below",
316-
"between",
317-
"under",
318-
"again",
319-
"further",
320-
"then",
321-
"once",
322-
"here",
323-
"there",
324-
"when",
325-
"where",
326-
"why",
327-
"how",
328-
"all",
329-
"each",
330-
"few",
331-
"more",
332-
"most",
333-
"other",
334-
"some",
335-
"such",
336-
"no",
337-
"nor",
338-
"not",
339-
"only",
340-
"own",
341-
"same",
342-
"so",
343-
"than",
344-
"too",
345-
"very",
346-
"just",
347-
"and",
348-
"but",
349-
"if",
350-
"or",
351-
"because",
352-
"until",
353-
"while",
354-
"about",
355-
"what",
356-
"which",
357-
"who",
358-
"whom",
359-
"this",
360-
"that",
361-
"these",
362-
"those",
363-
"i",
364-
"me",
365-
"my",
366-
"myself",
367-
"we",
368-
"our",
369-
"ours",
370-
"ourselves",
371-
"you",
372-
"your",
373-
"yours",
374-
"yourself",
375-
"yourselves",
376-
"he",
377-
"him",
378-
"his",
379-
"himself",
380-
"she",
381-
"her",
382-
"hers",
383-
"herself",
384-
"it",
385-
"its",
386-
"itself",
387-
"they",
388-
"them",
389-
"their",
390-
"theirs",
391-
"themselves",
392-
];
393-
394-
query
395-
.to_lowercase()
396-
.split(|c: char| !c.is_alphanumeric())
397-
.filter(|s| {
398-
let s = *s;
399-
!s.is_empty() && s.len() > 1 && !STOPWORDS.contains(&s)
400-
})
401-
.map(String::from)
402-
.collect()
403-
}
404-
405266
/// Compute information density of content.
406267
fn compute_density(content: &str) -> f32 {
407268
let words: Vec<&str> = content.split_whitespace().collect();
408269
if words.is_empty() {
409270
return 0.0;
410271
}
411272

412-
// Stopword ratio (lower is better)
413-
const STOPWORDS: &[&str] = &[
414-
"a", "an", "the", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
415-
"do", "does", "did", "will", "would", "could", "should", "may", "might", "must", "shall",
416-
"can", "to", "of", "in", "for", "on", "with", "at", "by", "from", "and", "but", "or", "as",
417-
];
418-
273+
// Count stopwords using the shared STOPWORDS list imported from crate::retrieval::search
419274
let stopword_count = words
420275
.iter()
421276
.filter(|w| STOPWORDS.contains(&w.to_lowercase().as_str()))

0 commit comments

Comments
 (0)