@@ -16,21 +16,26 @@ export const getTermFrequency = (term: string, corpus: string) => {
16
16
} ;
17
17
18
18
/** Inverse document frequency. */
19
- export const getIDF = ( term : string , documents : string [ ] ) => {
19
+ export const getIDF = < T > ( term : string , documents : BMInputDocument < T > [ ] ) => {
20
20
// Number of relevant documents.
21
- const relevantDocuments = documents . filter ( ( document : string ) =>
22
- document . includes ( term )
21
+ const relevantDocuments = documents . filter ( ( document ) =>
22
+ document . text . includes ( term )
23
23
) . length ;
24
24
return Math . log (
25
25
( documents . length - relevantDocuments + 0.5 ) / ( relevantDocuments + 0.5 ) + 1
26
26
) ;
27
27
} ;
28
28
29
+ export interface BMInputDocument < T > {
30
+ text : string ;
31
+ docs : T ;
32
+ }
33
+
29
34
/** Represents a document; useful when sorting results.
30
35
*/
31
- export interface BMDocument {
32
- /** The document is originally scoreed. */
33
- document : string ;
36
+ export interface BMOutputDocument < T > {
37
+ /** The original source */
38
+ docs : T ;
34
39
/** The score that the document receives. */
35
40
score : number ;
36
41
}
@@ -44,7 +49,10 @@ export interface BMConstants {
44
49
}
45
50
46
51
/** If the return value is positive, secondEl is sorted before firstEl; otherwise, firstEl comes before secondEl. */
47
- export type BMSorter = ( firstEl : BMDocument , secondEl : BMDocument ) => number ;
52
+ export type BMSorter < T > = (
53
+ firstEl : BMOutputDocument < T > ,
54
+ secondEl : BMOutputDocument < T >
55
+ ) => number ;
48
56
49
57
/** Implementation of Okapi BM25 algorithm.
50
58
* @param documents: Collection of documents.
@@ -53,16 +61,16 @@ export type BMSorter = (firstEl: BMDocument, secondEl: BMDocument) => number;
53
61
* @param sort: A function that allows you to sort the scored results by a given rule. If not provided, results are returned in the same order as the input documents.
54
62
* Whether or not this option is provided, the return value is an array of documents with their scores (not a bare array of scores).
55
63
*/
56
- export function BM25 (
57
- documents : string [ ] ,
64
+ export function BM25 < T > (
65
+ documents : BMInputDocument < T > [ ] ,
58
66
keywords : string [ ] ,
59
67
constants ?: BMConstants ,
60
- sorter ?: BMSorter
61
- ) : number [ ] | BMDocument [ ] {
68
+ sorter ?: BMSorter < T >
69
+ ) : BMOutputDocument < T > [ ] {
62
70
const b = constants && constants . b ? constants . b : 0.75 ;
63
71
const k1 = constants && constants . k1 ? constants . k1 : 1.2 ;
64
- const documentLengths = documents . map ( ( document : string ) =>
65
- getWordCount ( document )
72
+ const documentLengths = documents . map ( ( document ) =>
73
+ getWordCount ( document . text )
66
74
) ;
67
75
const averageDocumentLength =
68
76
documentLengths . reduce ( ( a , b ) => a + b , 0 ) / documents . length ;
@@ -71,14 +79,14 @@ export function BM25(
71
79
return obj ;
72
80
} , new Map < string , number > ( ) ) ;
73
81
74
- const scores = documents . map ( ( document : string , index : number ) => {
82
+ const scoredDocs = documents . map ( ( { text , docs } , index ) => {
75
83
const score = keywords
76
84
. map ( ( keyword : string ) => {
77
85
const inverseDocumentFrequency = idfByKeyword . get ( keyword ) ;
78
86
if ( inverseDocumentFrequency === undefined ) {
79
87
throw new Error ( "Missing keyword." ) ;
80
88
}
81
- const termFrequency = getTermFrequency ( keyword , document ) ;
89
+ const termFrequency = getTermFrequency ( keyword , text ) ;
82
90
const documentLength = documentLengths [ index ] ;
83
91
return (
84
92
( inverseDocumentFrequency * ( termFrequency * ( k1 + 1 ) ) ) /
@@ -87,14 +95,11 @@ export function BM25(
87
95
) ;
88
96
} )
89
97
. reduce ( ( a : number , b : number ) => a + b , 0 ) ;
90
- if ( sorter ) {
91
- return { score, document } as BMDocument ;
92
- }
93
- return score ;
98
+ return { score, docs } as BMOutputDocument < T > ;
94
99
} ) ;
95
100
// sort the results
96
101
if ( sorter ) {
97
- return ( scores as BMDocument [ ] ) . sort ( sorter ) ;
102
+ return scoredDocs . sort ( sorter ) ;
98
103
}
99
- return scores as number [ ] ;
104
+ return scoredDocs ;
100
105
}
0 commit comments