@@ -16,21 +16,28 @@ export const getTermFrequency = (term: string, corpus: string) => {
16
16
} ;
17
17
18
18
/** Inverse document frequency. */
19
- export const getIDF = ( term : string , documents : string [ ] ) => {
19
+ export const getIDF = < T > ( term : string , documents : BMInputDocument < T > [ ] ) => {
20
20
// Number of relevant documents.
21
- const relevantDocuments = documents . filter ( ( document : string ) =>
22
- document . includes ( term )
21
+ const relevantDocuments = documents . filter ( ( document ) =>
22
+ document . text . includes ( term )
23
23
) . length ;
24
24
return Math . log (
25
25
( documents . length - relevantDocuments + 0.5 ) / ( relevantDocuments + 0.5 ) + 1
26
26
) ;
27
27
} ;
28
28
29
+ export interface BMInputDocument < T > {
30
+ /** The text from the original document */
31
+ text : string ;
32
+ /** The original document */
33
+ docs : T ;
34
+ }
35
+
29
36
/** Represents a document; useful when sorting results.
30
37
*/
31
- export interface BMDocument {
32
- /** The document is originally scoreed. */
33
- document : string ;
38
+ export interface BMOutputDocument < T > {
39
+ /** The original document */
40
+ docs : T ;
34
41
/** The score that the document receives. */
35
42
score : number ;
36
43
}
@@ -44,7 +51,10 @@ export interface BMConstants {
44
51
}
45
52
46
53
/** If it returns a positive number, secondEl is sorted before firstEl; otherwise, firstEl comes before secondEl. */
47
- export type BMSorter = ( firstEl : BMDocument , secondEl : BMDocument ) => number ;
54
+ export type BMSorter < T > = (
55
+ firstEl : BMOutputDocument < T > ,
56
+ secondEl : BMOutputDocument < T >
57
+ ) => number ;
48
58
49
59
/** Implementation of Okapi BM25 algorithm.
50
60
* @param documents: Collection of documents.
@@ -53,16 +63,16 @@ export type BMSorter = (firstEl: BMDocument, secondEl: BMDocument) => number;
53
63
* @param sorter: A function that allows you to sort the scored documents by a given rule. If not provided, results are returned in the original document order.
54
64
* The return value is always an array of documents paired with their scores, whether or not this option is provided.
55
65
*/
56
- export function BM25 (
57
- documents : string [ ] ,
66
+ export function BM25 < T > (
67
+ documents : BMInputDocument < T > [ ] ,
58
68
keywords : string [ ] ,
59
69
constants ?: BMConstants ,
60
- sorter ?: BMSorter
61
- ) : number [ ] | BMDocument [ ] {
70
+ sorter ?: BMSorter < T >
71
+ ) : BMOutputDocument < T > [ ] {
62
72
const b = constants && constants . b ? constants . b : 0.75 ;
63
73
const k1 = constants && constants . k1 ? constants . k1 : 1.2 ;
64
- const documentLengths = documents . map ( ( document : string ) =>
65
- getWordCount ( document )
74
+ const documentLengths = documents . map ( ( document ) =>
75
+ getWordCount ( document . text )
66
76
) ;
67
77
const averageDocumentLength =
68
78
documentLengths . reduce ( ( a , b ) => a + b , 0 ) / documents . length ;
@@ -71,14 +81,14 @@ export function BM25(
71
81
return obj ;
72
82
} , new Map < string , number > ( ) ) ;
73
83
74
- const scores = documents . map ( ( document : string , index : number ) => {
84
+ const scoredDocs = documents . map ( ( { text , docs } , index ) => {
75
85
const score = keywords
76
86
. map ( ( keyword : string ) => {
77
87
const inverseDocumentFrequency = idfByKeyword . get ( keyword ) ;
78
88
if ( inverseDocumentFrequency === undefined ) {
79
89
throw new Error ( "Missing keyword." ) ;
80
90
}
81
- const termFrequency = getTermFrequency ( keyword , document ) ;
91
+ const termFrequency = getTermFrequency ( keyword , text ) ;
82
92
const documentLength = documentLengths [ index ] ;
83
93
return (
84
94
( inverseDocumentFrequency * ( termFrequency * ( k1 + 1 ) ) ) /
@@ -87,14 +97,11 @@ export function BM25(
87
97
) ;
88
98
} )
89
99
. reduce ( ( a : number , b : number ) => a + b , 0 ) ;
90
- if ( sorter ) {
91
- return { score, document } as BMDocument ;
92
- }
93
- return score ;
100
+ return { score, docs } as BMOutputDocument < T > ;
94
101
} ) ;
95
102
// sort the results
96
103
if ( sorter ) {
97
- return ( scores as BMDocument [ ] ) . sort ( sorter ) ;
104
+ return scoredDocs . sort ( sorter ) ;
98
105
}
99
- return scores as number [ ] ;
106
+ return scoredDocs ;
100
107
}
0 commit comments