Skip to content

Commit 69eb4d2

Browse files
committed
FIRST
0 parents  commit 69eb4d2

File tree

5 files changed

+227
-0
lines changed

5 files changed

+227
-0
lines changed

.travis.yml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
sudo: false
language: go
branches:
  only:
    - master

go:
  - 1.13.x
  - 1.14.x
  - tip

env:
  global:
    - GOARCH=amd64
    - TRAVISTEST=true

before_install:
  - go get github.com/mattn/goveralls

script:
  - go test -run=. -coverprofile=profile.cov
  - $HOME/gopath/bin/goveralls -coverprofile=profile.cov -service=travis-ci

matrix:
  allow_failures:
    - go: tip

bm25.go

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
// Package bm25 is a lingo-friendly BM25 library.
// BM25 is a scoring function that relies on TFIDF, and is useful for document retrieval.
3+
package bm25
4+
5+
import (
6+
"sort"
7+
8+
"github.com/go-nlp/tfidf"
9+
"github.com/xtgo/set"
10+
)
11+
12+
// DocScore is a tuple of the document ID and a score.
type DocScore struct {
	ID    int     // identifier of the scored document
	Score float64 // score assigned to that document
}

// DocScores is a list of DocScore. It implements sort.Interface and orders
// its entries by ascending Score; wrap it in sort.Reverse to get the highest
// scores first.
type DocScores []DocScore

// Len returns the number of entries in the list.
func (ds DocScores) Len() int { return len(ds) }

// Less reports whether entry i has a strictly lower score than entry j.
func (ds DocScores) Less(i, j int) bool { return ds[i].Score < ds[j].Score }

// Swap exchanges entries i and j as whole values, so ID and Score always
// travel together.
func (ds DocScores) Swap(i, j int) { ds[i], ds[j] = ds[j], ds[i] }
27+
28+
// BM25 is the scoring function.
29+
//
30+
// k1 should be between 1.2 and 2.
31+
// b should be around 0.75
32+
func BM25(tf *tfidf.TFIDF, query tfidf.Document, docs []tfidf.Document, k1, b float64) DocScores {
33+
q := tfidf.BOW(query)
34+
w := make([]int, len(q))
35+
copy(w, q)
36+
avgLen := float64(tf.Len) / float64(tf.Docs)
37+
38+
scores := make([]float64, 0, len(docs))
39+
for _, doc := range docs {
40+
//TF := tfidf.TF(doc)
41+
d := tfidf.BOW(doc)
42+
w = append(w, d...)
43+
size := set.Inter(sort.IntSlice(w), len(q))
44+
n := w[:size]
45+
46+
score := make([]float64, 0, len(n))
47+
docLen := float64(len(d))
48+
for _, id := range n {
49+
num := (tf.TF[id] * (k1 + 1))
50+
denom := (tf.TF[id] + k1*(1-b+b*docLen/avgLen))
51+
idf := tf.IDF[id]
52+
score = append(score, idf*num/denom)
53+
}
54+
scores = append(scores, sum(score))
55+
56+
// reset working vector
57+
copy(w, q)
58+
w = w[:len(q)]
59+
}
60+
var retVal DocScores
61+
for i := range docs {
62+
retVal = append(retVal, DocScore{i, scores[i]})
63+
}
64+
return retVal
65+
}
66+
67+
// sum returns the total of the values in a, added in slice order.
// It returns 0 for a nil or empty slice.
func sum(a []float64) float64 {
	var total float64
	for _, f := range a {
		total += f
	}
	return total
}

bm25_test.go

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
package bm25
2+
3+
import (
4+
"fmt"
5+
"sort"
6+
"strings"
7+
8+
"github.com/go-nlp/tfidf"
9+
)
10+
11+
// mobydick is the sample corpus used by the example: passages from the
// opening of Moby-Dick, one document per string. The text is already
// tokenized, with tokens (including punctuation) separated by spaces.
var mobydick = []string{
	"Call me Ishmael .",
	"Some years ago -- never mind how long precisely -- having little or no money in my purse , and nothing particular to interest me on shore , I thought I would sail about a little and see the watery part of the world .",
	"It is a way I have of driving off the spleen and regulating the circulation .",
	"Whenever I find myself growing grim about the mouth ; ",
	"whenever it is a damp , drizzly November in my soul ; ",
	"whenever I find myself involuntarily pausing before coffin warehouses , and bringing up the rear of every funeral I meet ; ",
	"and especially whenever my hypos get such an upper hand of me , that it requires a strong moral principle to prevent me from deliberately stepping into the street , and methodically knocking people's hats off -- then , I account it high time to get to sea as soon as I can .",
	"This is my substitute for pistol and ball . ",
	"With a philosophical flourish Cato throws himself upon his sword ; ",
	"I quietly take to the ship . There is nothing surprising in this .",
	"If they but knew it , almost all men in their degree , some time or other , cherish very nearly the same feelings towards the ocean with me .",
}
24+
25+
// doc is the minimal document type used by the tests: a list of term IDs.
type doc []int

// IDs returns the document's term IDs. The returned slice is d itself, not
// a copy.
func (d doc) IDs() []int { return []int(d) }
28+
29+
// makeCorpus builds a vocabulary from a. Every string is split on
// whitespace and each token is lower-cased; tokens receive consecutive IDs,
// starting at 0, in order of first appearance.
//
// It returns the token -> ID map and its inverse, a slice in which the
// element at index i is the token whose ID is i.
func makeCorpus(a []string) (map[string]int, []string) {
	retVal := make(map[string]int)
	invRetVal := make([]string, 0)
	for _, s := range a {
		for _, f := range strings.Fields(s) {
			f = strings.ToLower(f)
			if _, ok := retVal[f]; ok {
				continue
			}
			// The next free ID is the number of tokens recorded so far.
			retVal[f] = len(invRetVal)
			invRetVal = append(invRetVal, f)
		}
	}
	return retVal, invRetVal
}
45+
46+
func makeDocuments(a []string, c map[string]int) []tfidf.Document {
47+
retVal := make([]tfidf.Document, 0, len(a))
48+
for _, s := range a {
49+
var ts []int
50+
for _, f := range strings.Fields(s) {
51+
f = strings.ToLower(f)
52+
id := c[f]
53+
ts = append(ts, id)
54+
}
55+
retVal = append(retVal, doc(ts))
56+
}
57+
return retVal
58+
}
59+
60+
func Example_BM25() {
61+
corpus, _ := makeCorpus(mobydick)
62+
docs := makeDocuments(mobydick, corpus)
63+
tf := tfidf.New()
64+
65+
for _, doc := range docs {
66+
tf.Add(doc)
67+
}
68+
tf.CalculateIDF()
69+
70+
// now we search
71+
72+
// "ishmael" is a query
73+
ishmael := doc{corpus["ishmael"]}
74+
75+
// "whenever i find" is another query
76+
whenever := doc{corpus["whenever"]}
77+
78+
ishmaelScores := BM25(tf, ishmael, docs, 1.5, 0.75)
79+
wheneverScores := BM25(tf, whenever, docs, 1.5, 0.75)
80+
81+
sort.Sort(sort.Reverse(ishmaelScores))
82+
sort.Sort(sort.Reverse(wheneverScores))
83+
84+
fmt.Printf("Top 3 Relevant Docs to \"Ishmael\":\n")
85+
for _, d := range ishmaelScores[:3] {
86+
fmt.Printf("\tID : %d\n\tScore: %1.3f\n\tDoc : %q\n", d.ID, d.Score, mobydick[d.ID])
87+
}
88+
fmt.Println("")
89+
fmt.Printf("Top 3 Relevant Docs to \"whenever i find\":\n")
90+
for _, d := range wheneverScores[:3] {
91+
fmt.Printf("\tID : %d\n\tScore: %1.3f\n\tDoc : %q\n", d.ID, d.Score, mobydick[d.ID])
92+
}
93+
// Output:
94+
// Top 3 Relevant Docs to "Ishmael":
95+
// ID : 0
96+
// Score: 3.706
97+
// Doc : "Call me Ishmael ."
98+
// ID : 1
99+
// Score: 0.000
100+
// Doc : "Some years ago -- never mind how long precisely -- having little or no money in my purse , and nothing particular to interest me on shore , I thought I would sail about a little and see the watery part of the world ."
101+
// ID : 2
102+
// Score: 0.000
103+
// Doc : "It is a way I have of driving off the spleen and regulating the circulation ."
104+
//
105+
// Top 3 Relevant Docs to "whenever i find":
106+
// ID : 3
107+
// Score: 2.031
108+
// Doc : "Whenever I find myself growing grim about the mouth ; "
109+
// ID : 4
110+
// Score: 1.982
111+
// Doc : "whenever it is a damp , drizzly November in my soul ; "
112+
// ID : 5
113+
// Score: 1.810
114+
// Doc : "whenever I find myself involuntarily pausing before coffin warehouses , and bringing up the rear of every funeral I meet ; "
115+
116+
}

go.mod

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
module github.com/go-nlp/bm25

go 1.14

require (
	github.com/go-nlp/tfidf v1.1.0
	github.com/xtgo/set v1.0.0
)

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
github.com/go-nlp/tfidf v1.0.0 h1:usRGZjJO/MkU4Oq2Xa836MTpAgfhyb2kLyIkjnKfWY0=
github.com/go-nlp/tfidf v1.0.0/go.mod h1:FHOpf09wrdELx7OnbxywpW4Cs0Q3r15QzpOc73rmUTo=
github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=
github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8=

0 commit comments

Comments
 (0)