99use std:: collections:: HashMap ;
1010
1111use crate :: document:: NodeId ;
12+ use crate :: retrieval:: search:: { extract_keywords, Bm25Params , STOPWORDS } ;
1213use crate :: util:: estimate_tokens;
1314
1415use super :: config:: ScoringStrategyConfig ;
@@ -130,8 +131,7 @@ pub struct RelevanceScorer {
130131 /// Scoring strategy to use.
131132 strategy : ScoringStrategyConfig ,
132133 /// BM25 parameters.
133- k1 : f32 ,
134- b : f32 ,
134+ params : Bm25Params ,
135135}
136136
137137impl RelevanceScorer {
@@ -142,8 +142,7 @@ impl RelevanceScorer {
142142 Self {
143143 query_keywords,
144144 strategy,
145- k1 : 1.2 ,
146- b : 0.75 ,
145+ params : Bm25Params :: default ( ) ,
147146 }
148147 }
149148
@@ -153,8 +152,7 @@ impl RelevanceScorer {
153152 Self {
154153 query_keywords : keywords,
155154 strategy,
156- k1 : 1.2 ,
157- b : 0.75 ,
155+ params : Bm25Params :: default ( ) ,
158156 }
159157 }
160158
@@ -240,13 +238,15 @@ impl RelevanceScorer {
240238 continue ;
241239 }
242240
243- // IDF calculation
241+ // IDF calculation, +1-smoothed (Lucene-style) form: ln(1 + (N - df + 0.5) / (df + 0.5)); stays positive while df <= doc_count
244242 let df = ctx. doc_freq . get ( & term_lower) . copied ( ) . unwrap_or ( 1 ) as f32 ;
245243 let idf = ( ( ctx. doc_count as f32 - df + 0.5 ) / ( df + 0.5 ) + 1.0 ) . ln ( ) ;
246244
247245 // BM25 formula
248- let numerator = tf * ( self . k1 + 1.0 ) ;
249- let denominator = tf + self . k1 * ( 1.0 - self . b + self . b * doc_len / ctx. avg_doc_len ) ;
246+ let k1 = self . params . k1 ;
247+ let b = self . params . b ;
248+ let numerator = tf * ( k1 + 1.0 ) ;
249+ let denominator = tf + k1 * ( 1.0 - b + b * doc_len / ctx. avg_doc_len ) ;
250250
251251 score += idf * numerator / denominator;
252252 }
@@ -263,159 +263,14 @@ impl RelevanceScorer {
263263 }
264264}
265265
266- /// Extract keywords from a query string.
267- fn extract_keywords ( query : & str ) -> Vec < String > {
268- // Common English stop words
269- const STOPWORDS : & [ & str ] = & [
270- "a" ,
271- "an" ,
272- "the" ,
273- "is" ,
274- "are" ,
275- "was" ,
276- "were" ,
277- "be" ,
278- "been" ,
279- "being" ,
280- "have" ,
281- "has" ,
282- "had" ,
283- "do" ,
284- "does" ,
285- "did" ,
286- "will" ,
287- "would" ,
288- "could" ,
289- "should" ,
290- "may" ,
291- "might" ,
292- "must" ,
293- "shall" ,
294- "can" ,
295- "need" ,
296- "dare" ,
297- "ought" ,
298- "used" ,
299- "to" ,
300- "of" ,
301- "in" ,
302- "for" ,
303- "on" ,
304- "with" ,
305- "at" ,
306- "by" ,
307- "from" ,
308- "as" ,
309- "into" ,
310- "through" ,
311- "during" ,
312- "before" ,
313- "after" ,
314- "above" ,
315- "below" ,
316- "between" ,
317- "under" ,
318- "again" ,
319- "further" ,
320- "then" ,
321- "once" ,
322- "here" ,
323- "there" ,
324- "when" ,
325- "where" ,
326- "why" ,
327- "how" ,
328- "all" ,
329- "each" ,
330- "few" ,
331- "more" ,
332- "most" ,
333- "other" ,
334- "some" ,
335- "such" ,
336- "no" ,
337- "nor" ,
338- "not" ,
339- "only" ,
340- "own" ,
341- "same" ,
342- "so" ,
343- "than" ,
344- "too" ,
345- "very" ,
346- "just" ,
347- "and" ,
348- "but" ,
349- "if" ,
350- "or" ,
351- "because" ,
352- "until" ,
353- "while" ,
354- "about" ,
355- "what" ,
356- "which" ,
357- "who" ,
358- "whom" ,
359- "this" ,
360- "that" ,
361- "these" ,
362- "those" ,
363- "i" ,
364- "me" ,
365- "my" ,
366- "myself" ,
367- "we" ,
368- "our" ,
369- "ours" ,
370- "ourselves" ,
371- "you" ,
372- "your" ,
373- "yours" ,
374- "yourself" ,
375- "yourselves" ,
376- "he" ,
377- "him" ,
378- "his" ,
379- "himself" ,
380- "she" ,
381- "her" ,
382- "hers" ,
383- "herself" ,
384- "it" ,
385- "its" ,
386- "itself" ,
387- "they" ,
388- "them" ,
389- "their" ,
390- "theirs" ,
391- "themselves" ,
392- ] ;
393-
394- query
395- . to_lowercase ( )
396- . split ( |c : char | !c. is_alphanumeric ( ) )
397- . filter ( |s| {
398- let s = * s;
399- !s. is_empty ( ) && s. len ( ) > 1 && !STOPWORDS . contains ( & s)
400- } )
401- . map ( String :: from)
402- . collect ( )
403- }
404-
405266/// Compute information density of content.
406267fn compute_density ( content : & str ) -> f32 {
407268 let words: Vec < & str > = content. split_whitespace ( ) . collect ( ) ;
408269 if words. is_empty ( ) {
409270 return 0.0 ;
410271 }
411272
412- // Stopword ratio (lower is better)
413- const STOPWORDS : & [ & str ] = & [
414- "a" , "an" , "the" , "is" , "are" , "was" , "were" , "be" , "been" , "being" , "have" , "has" , "had" ,
415- "do" , "does" , "did" , "will" , "would" , "could" , "should" , "may" , "might" , "must" , "shall" ,
416- "can" , "to" , "of" , "in" , "for" , "on" , "with" , "at" , "by" , "from" , "and" , "but" , "or" , "as" ,
417- ] ;
418-
273+ // Stopword ratio (lower is better), using the shared STOPWORDS list imported from crate::retrieval::search
419274 let stopword_count = words
420275 . iter ( )
421276 . filter ( |w| STOPWORDS . contains ( & w. to_lowercase ( ) . as_str ( ) ) )
0 commit comments