|
| 1 | +package bm25 |
| 2 | + |
| 3 | +import ( |
| 4 | + "fmt" |
| 5 | + "sort" |
| 6 | + "strings" |
| 7 | + |
| 8 | + "github.com/go-nlp/tfidf" |
| 9 | +) |
| 10 | + |
// mobydick is the sample text for the example: the opening lines of
// Moby-Dick, already split into short passages with punctuation separated
// from words by spaces. The example treats every element as one document
// and indexes back into this slice to print the matching text.
var mobydick = []string{
	"Call me Ishmael .",
	"Some years ago -- never mind how long precisely -- having little or no money in my purse , and nothing particular to interest me on shore , I thought I would sail about a little and see the watery part of the world .",
	"It is a way I have of driving off the spleen and regulating the circulation .",
	"Whenever I find myself growing grim about the mouth ; ",
	"whenever it is a damp , drizzly November in my soul ; ",
	"whenever I find myself involuntarily pausing before coffin warehouses , and bringing up the rear of every funeral I meet ; ",
	"and especially whenever my hypos get such an upper hand of me , that it requires a strong moral principle to prevent me from deliberately stepping into the street , and methodically knocking people's hats off -- then , I account it high time to get to sea as soon as I can .",
	"This is my substitute for pistol and ball . ",
	"With a philosophical flourish Cato throws himself upon his sword ; ",
	"I quietly take to the ship . There is nothing surprising in this .",
	"If they but knew it , almost all men in their degree , some time or other , cherish very nearly the same feelings towards the ocean with me .",
}
| 24 | + |
// doc is a tokenised text: the vocabulary IDs of its words, in order.
// Values of this type are stored as tfidf.Document by makeDocuments.
type doc []int

// IDs returns the document's token IDs. The result shares its backing
// array with d; no copy is made.
func (d doc) IDs() []int {
	return d
}
| 28 | + |
// makeCorpus builds a vocabulary from the texts in a.
//
// Each text is split on whitespace and every token is lower-cased. Distinct
// tokens are numbered 0, 1, 2, ... in order of first appearance. The first
// result maps a token to its ID; the second is the inverse, so that
// words[ids[w]] == w for every token w.
func makeCorpus(a []string) (map[string]int, []string) {
	ids := make(map[string]int)
	words := make([]string, 0)
	for _, text := range a {
		for _, tok := range strings.Fields(text) {
			word := strings.ToLower(tok)
			if _, seen := ids[word]; seen {
				continue
			}
			// The next free ID always equals the number of words recorded
			// so far, so the slice length doubles as the counter.
			ids[word] = len(words)
			words = append(words, word)
		}
	}
	return ids, words
}
| 45 | + |
| 46 | +func makeDocuments(a []string, c map[string]int) []tfidf.Document { |
| 47 | + retVal := make([]tfidf.Document, 0, len(a)) |
| 48 | + for _, s := range a { |
| 49 | + var ts []int |
| 50 | + for _, f := range strings.Fields(s) { |
| 51 | + f = strings.ToLower(f) |
| 52 | + id := c[f] |
| 53 | + ts = append(ts, id) |
| 54 | + } |
| 55 | + retVal = append(retVal, doc(ts)) |
| 56 | + } |
| 57 | + return retVal |
| 58 | +} |
| 59 | + |
| 60 | +func Example_BM25() { |
| 61 | + corpus, _ := makeCorpus(mobydick) |
| 62 | + docs := makeDocuments(mobydick, corpus) |
| 63 | + tf := tfidf.New() |
| 64 | + |
| 65 | + for _, doc := range docs { |
| 66 | + tf.Add(doc) |
| 67 | + } |
| 68 | + tf.CalculateIDF() |
| 69 | + |
| 70 | + // now we search |
| 71 | + |
| 72 | + // "ishmael" is a query |
| 73 | + ishmael := doc{corpus["ishmael"]} |
| 74 | + |
| 75 | + // "whenever i find" is another query |
| 76 | + whenever := doc{corpus["whenever"]} |
| 77 | + |
| 78 | + ishmaelScores := BM25(tf, ishmael, docs, 1.5, 0.75) |
| 79 | + wheneverScores := BM25(tf, whenever, docs, 1.5, 0.75) |
| 80 | + |
| 81 | + sort.Sort(sort.Reverse(ishmaelScores)) |
| 82 | + sort.Sort(sort.Reverse(wheneverScores)) |
| 83 | + |
| 84 | + fmt.Printf("Top 3 Relevant Docs to \"Ishmael\":\n") |
| 85 | + for _, d := range ishmaelScores[:3] { |
| 86 | + fmt.Printf("\tID : %d\n\tScore: %1.3f\n\tDoc : %q\n", d.ID, d.Score, mobydick[d.ID]) |
| 87 | + } |
| 88 | + fmt.Println("") |
| 89 | + fmt.Printf("Top 3 Relevant Docs to \"whenever i find\":\n") |
| 90 | + for _, d := range wheneverScores[:3] { |
| 91 | + fmt.Printf("\tID : %d\n\tScore: %1.3f\n\tDoc : %q\n", d.ID, d.Score, mobydick[d.ID]) |
| 92 | + } |
| 93 | + // Output: |
| 94 | + // Top 3 Relevant Docs to "Ishmael": |
| 95 | + // ID : 0 |
| 96 | + // Score: 3.706 |
| 97 | + // Doc : "Call me Ishmael ." |
| 98 | + // ID : 1 |
| 99 | + // Score: 0.000 |
| 100 | + // Doc : "Some years ago -- never mind how long precisely -- having little or no money in my purse , and nothing particular to interest me on shore , I thought I would sail about a little and see the watery part of the world ." |
| 101 | + // ID : 2 |
| 102 | + // Score: 0.000 |
| 103 | + // Doc : "It is a way I have of driving off the spleen and regulating the circulation ." |
| 104 | + // |
| 105 | + // Top 3 Relevant Docs to "whenever i find": |
| 106 | + // ID : 3 |
| 107 | + // Score: 2.031 |
| 108 | + // Doc : "Whenever I find myself growing grim about the mouth ; " |
| 109 | + // ID : 4 |
| 110 | + // Score: 1.982 |
| 111 | + // Doc : "whenever it is a damp , drizzly November in my soul ; " |
| 112 | + // ID : 5 |
| 113 | + // Score: 1.810 |
| 114 | + // Doc : "whenever I find myself involuntarily pausing before coffin warehouses , and bringing up the rear of every funeral I meet ; " |
| 115 | + |
| 116 | +} |
0 commit comments