Skip to content

improve bm25 sorting function #8427

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions libs/langchain-community/src/retrievers/bm25.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,28 +47,28 @@ export class BM25Retriever extends BaseRetriever {

async _getRelevantDocuments(query: string) {
const processedQuery = this.preprocessFunc(query);
const documents = this.docs.map((doc) => doc.pageContent);
const scores = BM25(documents, processedQuery) as number[];

const scoredDocs = this.docs.map((doc, index) => ({
document: doc,
score: scores[index],
}));

scoredDocs.sort((a, b) => b.score - a.score);
const scoredDocs = BM25<Document>(
this.docs.map((d) => ({
text: d.pageContent,
docs: d,
})),
processedQuery,
undefined,
(a, b) => b.score - a.score
);

return scoredDocs.slice(0, this.k).map((item) => {
if (this.includeScore) {
return new Document({
...(item.document.id && { id: item.document.id }),
pageContent: item.document.pageContent,
...(item.docs.id && { id: item.docs.id }),
pageContent: item.docs.pageContent,
metadata: {
bm25Score: item.score,
...item.document.metadata,
...item.docs.metadata,
},
});
} else {
return item.document;
return item.docs;
}
});
}
Expand Down
49 changes: 28 additions & 21 deletions libs/langchain-community/src/utils/@furkantoprak/bm25/BM25.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,28 @@ export const getTermFrequency = (term: string, corpus: string) => {
};

/** Inverse document frequency. */
export const getIDF = (term: string, documents: string[]) => {
export const getIDF = <T>(term: string, documents: BMInputDocument<T>[]) => {
// Number of relevant documents.
const relevantDocuments = documents.filter((document: string) =>
document.includes(term)
const relevantDocuments = documents.filter((document) =>
document.text.includes(term)
).length;
return Math.log(
(documents.length - relevantDocuments + 0.5) / (relevantDocuments + 0.5) + 1
);
};

export interface BMInputDocument<T> {
/** The text from the original document */
text: string;
Copy link
Contributor Author

@AllenFang AllenFang Jun 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me know if the team would prefer to use a method to extract the text rather than adding another text field to BMInputDocument:

ex:

export interface BMInputDocument<T> {
  extractText: (docs: T) => string;
  /** The original document */
  docs: T;
}

/** The original document */
docs: T;
}

/** Represents a document; useful when sorting results.
*/
export interface BMDocument {
/** The document that was originally scored. */
document: string;
export interface BMOutputDocument<T> {
/** The original document */
docs: T;
/** The score that the document receives. */
score: number;
}
Expand All @@ -44,7 +51,10 @@ export interface BMConstants {
}

/** Comparator for scored documents: a positive return value sorts secondEl before firstEl, a negative value sorts firstEl before secondEl, and zero keeps their relative order. */
export type BMSorter = (firstEl: BMDocument, secondEl: BMDocument) => number;
export type BMSorter<T> = (
firstEl: BMOutputDocument<T>,
secondEl: BMOutputDocument<T>
) => number;

/** Implementation of Okapi BM25 algorithm.
* @param documents: Collection of documents.
Expand All @@ -53,16 +63,16 @@ export type BMSorter = (firstEl: BMDocument, secondEl: BMDocument) => number;
* @param sorter: A function that allows you to sort the results by a given rule. If not provided, results are returned in the original document order.
* Whether or not this option is provided, the return value is an array of the original documents paired with their scores.
*/
export function BM25(
documents: string[],
export function BM25<T>(
documents: BMInputDocument<T>[],
keywords: string[],
constants?: BMConstants,
sorter?: BMSorter
): number[] | BMDocument[] {
sorter?: BMSorter<T>
): BMOutputDocument<T>[] {
const b = constants && constants.b ? constants.b : 0.75;
const k1 = constants && constants.k1 ? constants.k1 : 1.2;
const documentLengths = documents.map((document: string) =>
getWordCount(document)
const documentLengths = documents.map((document) =>
getWordCount(document.text)
);
const averageDocumentLength =
documentLengths.reduce((a, b) => a + b, 0) / documents.length;
Expand All @@ -71,14 +81,14 @@ export function BM25(
return obj;
}, new Map<string, number>());

const scores = documents.map((document: string, index: number) => {
const scoredDocs = documents.map(({ text, docs }, index) => {
const score = keywords
.map((keyword: string) => {
const inverseDocumentFrequency = idfByKeyword.get(keyword);
if (inverseDocumentFrequency === undefined) {
throw new Error("Missing keyword.");
}
const termFrequency = getTermFrequency(keyword, document);
const termFrequency = getTermFrequency(keyword, text);
const documentLength = documentLengths[index];
return (
(inverseDocumentFrequency * (termFrequency * (k1 + 1))) /
Expand All @@ -87,14 +97,11 @@ export function BM25(
);
})
.reduce((a: number, b: number) => a + b, 0);
if (sorter) {
return { score, document } as BMDocument;
}
return score;
return { score, docs } as BMOutputDocument<T>;
});
// sort the results
if (sorter) {
return (scores as BMDocument[]).sort(sorter);
return scoredDocs.sort(sorter);
}
return scores as number[];
return scoredDocs;
}