@@ -3,7 +3,201 @@ package appliednlp.spell
3
3
/**
 * Command-line spelling corrector.
 *
 * Flags every token of an input sentence that is missing from the
 * vocabulary, prints the candidates proposed by each generator, and
 * prints the candidate with the highest unigram probability.
 */
object SpellingCorrector {

  /**
   * Program entry point.
   *
   * Arguments, by position:
   *  - `args(0)`: the sentence to check (tokens are separated by single spaces)
   *  - `args(1)`: a word-list file, one word per line
   *  - `args(2)`: a second word-list file, read only when more than two
   *    arguments are given
   *  - `args(3)`: a text file used to estimate unigram probabilities
   *
   * NOTE(review): `args(3)` is read unconditionally, so running with fewer
   * than four arguments fails with an index error even though the second
   * vocabulary is treated as optional.
   *
   * @param args the command-line arguments described above
   */
  def main(args: Array[String]): Unit = {

    // The input sentence
    val input = args(0)

    // Get the vocabulary from one or both input files (when available)
    val firstVocab = getVocab(args(1))
    val secondVocab = if (args.length > 2) getVocab(args(2)) else Set[String]()
    val vocab = firstVocab ++ secondVocab

    // Set up the candidate generators
    val vsCandGen = VectorSpaceCandidateGenerator(vocab)
    val editCandGen1 = EditDistanceCandidateGenerator(vocab)
    val editCandGen2 = EditDistanceCandidateGenerator(vocab, TwoEdits)

    // Get the language model
    val unigramProb = LanguageModel(readText(args(3)))

    // Process the input sentence.
    println(" Detecting spelling errors in: " + input)
    input.split(" ").filterNot(vocab.contains).foreach { token =>
      println(" ERROR: " + token)

      val vsCandidates = vsCandGen(token)
      println(" VS: " + vsCandidates.toSeq.sorted.mkString(" "))

      val ed1Candidates = editCandGen1(token)
      println(" ED1: " + ed1Candidates.toSeq.sorted.mkString(" "))

      // The two-edit candidates are reported but do not take part in the
      // selection of the best candidate below.
      val ed2Candidates = editCandGen2(token)
      println(" ED2: " + ed2Candidates.toSeq.sorted.mkString(" "))

      val allCandidates = vsCandidates ++ ed1Candidates
      if (allCandidates.isEmpty) {
        // Nothing to rank: report it instead of failing on an empty collection.
        println(" Best: (no candidates)")
      } else {
        // Rank by probability only. Sorting (word, probability) pairs would
        // order by the word first and return the alphabetically last word.
        // The candidates are sorted beforehand so that ties resolve the same
        // way on every run.
        val best = allCandidates.toSeq.sorted.maxBy(c => unigramProb(c))
        println(" Best: " + best)
      }
    }
  }

  /**
   * Get a word list from a file with one word per line.
   *
   * @param filename path of the word-list file
   * @return the distinct lines of the file
   */
  def getVocab(filename: String): Set[String] = {
    val source = io.Source.fromFile(filename)
    // toSet consumes the line iterator before the file is closed.
    try source.getLines().toSet finally source.close()
  }

  // Read a whole file into a string, closing the file afterwards.
  private def readText(filename: String): String = {
    val source = io.Source.fromFile(filename)
    try source.mkString finally source.close()
  }

}
50
+
51
+
52
/**
 * Builds a unigram probability table from raw text.
 */
object LanguageModel {

  /**
   * Estimate unigram probabilities from `text`.
   *
   * Every character that is neither an ASCII letter nor whitespace is
   * replaced by a space, and the result is split on runs of whitespace.
   * Tokens are case-sensitive. The denominator is the number of tokens
   * plus one, and words that were never seen are given probability
   * `1 / denominator`.
   *
   * @param text the text to count words in
   * @return a map from word to probability, with a default for unseen words
   */
  def apply(text: String): Map[String, Double] = {
    val tokens = text
      .replaceAll("""[^a-zA-Z\s]""", " ")
      .split("\\s+")
      // Text that starts with a separator produces a leading empty token,
      // which must not be counted as a word.
      .filter(_.nonEmpty)

    // The extra 1.0 keeps the denominator non-zero for empty text and leaves
    // probability mass for unseen words.
    val total = tokens.length + 1.0

    tokens
      .groupBy(identity)
      .map { case (word, occurrences) => word -> occurrences.length / total }
      .withDefaultValue(1.0 / total)
  }
}
68
+
69
/**
 * Common interface of the correction strategies: given a misspelled
 * token, an implementation proposes vocabulary words that lie near it
 * under whatever similarity measure that implementation uses.
 */
trait CandidateGenerator {

  /**
   * Propose corrections for a misspelled token.
   *
   * @param typo the misspelled token to find corrections for
   * @return the corrections proposed by this generator, as a set
   */
  def apply(typo: String): Set[String]

}
83
+
84
/**
 * How many rounds of single-character edits an edit-distance candidate
 * generator applies to a typo.
 */
sealed trait NumEdits

/** Apply the edit operations once. */
case object OneEdit extends NumEdits

/** Apply the edit operations to the results of a first round of edits. */
case object TwoEdits extends NumEdits
87
+
88
/**
 * Proposes vocabulary words that can be reached from a typo by one or two
 * rounds of single-character edits.
 *
 * @param vocab the set of valid words; only members of it are returned
 * @param distance whether to apply the edit operations once or twice
 */
class EditDistanceCandidateGenerator(vocab: Set[String], distance: NumEdits)
    extends CandidateGenerator {

  // Letters used for substitutions and insertions.
  val alpha = ('A' to 'Z') ++ ('a' to 'z')

  /**
   * Produce the vocabulary words within the configured number of edits.
   *
   * @param typo the typo that we need candidates for
   * @return every generated string that is a member of the vocabulary
   */
  def apply(typo: String): Set[String] = {
    val candidates = distance match {
      case OneEdit => edits(typo)
      case TwoEdits => edits(typo).flatMap(edits)
    }
    candidates.filter(vocab)
  }

  /**
   * Generate every string obtained from `typo` by one deletion, one
   * substitution, one insertion, or one swap of two adjacent characters.
   *
   * Substituting a character by itself is allowed, so the result can
   * contain `typo` itself. Results are not filtered by the vocabulary.
   *
   * @param typo the string to edit
   * @return the set of edited strings
   */
  def edits(typo: String): Set[String] = {
    // All ways of cutting the typo in two, including the cut after the last
    // character, so that insertions can also append at the end of the word.
    val splits = (0 to typo.length).map(i => typo.splitAt(i))

    val deletions =
      for ((left, right) <- splits if right.nonEmpty)
        yield left + right.tail

    val substitutions =
      for ((left, right) <- splits if right.nonEmpty; c <- alpha)
        yield left + c + right.tail

    val insertions =
      for ((left, right) <- splits; c <- alpha)
        yield left + c + right

    val transpositions =
      for ((left, right) <- splits if right.length > 1)
        yield left + right(1) + right(0) + right.drop(2)

    (deletions ++ substitutions ++ insertions ++ transpositions).toSet
  }

}
125
+
126
/**
 * Factory for edit-distance candidate generators.
 */
object EditDistanceCandidateGenerator {

  /**
   * Build a generator over `vocab`.
   *
   * @param vocab the set of valid words
   * @param distance how many rounds of edits to apply; one round when omitted
   * @return a new generator configured with the given arguments
   */
  def apply(vocab: Set[String], distance: NumEdits = OneEdit) = {
    val generator = new EditDistanceCandidateGenerator(vocab, distance)
    generator
  }

}
131
+
132
+
133
/**
 * Proposes the vocabulary words whose character n-gram vectors have the
 * highest cosine with the n-gram vector of the typo.
 *
 * @param vocabVectors the n-gram count vector of every vocabulary word
 * @param invertedIndex for each n-gram, the vocabulary words containing it
 * @param numCandidates the maximum number of scored words to keep
 */
class VectorSpaceCandidateGenerator(
  vocabVectors: Map[String, Map[String, Int]],
  invertedIndex: Map[String, Seq[String]],
  numCandidates: Int
) extends CandidateGenerator {

  import VectorSpaceCandidateGenerator._

  /**
   * Rank the words that share an n-gram with the typo and keep the top ones.
   *
   * @param typo the typo that we need candidates for
   * @return the highest-scoring words, as a set
   */
  def apply(typo: String): Set[String] = {
    val typoVector = getVector(typo)

    // Look up, through the inverted index, the words listed under any of the
    // typo's n-grams.
    val sharing = typoVector.keys.flatMap(invertedIndex).toSeq

    // Pair each of those words with its cosine against the typo.
    val scored = sharing.map(word => (word, cosine(typoVector, vocabVectors(word))))

    // Ascending order, so the best-scoring words sit at the end.
    val ranked = scored.sortBy { case (_, similarity) => similarity }

    ranked.takeRight(numCandidates).map { case (word, _) => word }.toSet
  }

}
155
+
156
/**
 * A companion object to help set up VS candidate generators and
 * provide helper functions.
 */
object VectorSpaceCandidateGenerator {
  import math.{sqrt, pow}

  /**
   * Build a generator from a vocabulary.
   *
   * @param vocab the set of valid words
   * @param numCandidates the maximum number of scored words the generator keeps
   * @return a generator holding the precomputed vectors and inverted index
   */
  def apply(vocab: Set[String], numCandidates: Int = 20): VectorSpaceCandidateGenerator = {

    // A map from words to their counts. Can be used later to look up
    // vectors for candidates without needing to recompute the counts.
    // (Trading use of more space to make cosine computations faster.)
    val vocabVectors: Map[String, Map[String, Int]] =
      vocab.map(word => (word, getVector(word))).toMap

    // Build the inverted index: n-gram -> words containing it. A strict
    // `map` is used so the posting lists are computed once, not on every
    // lookup; n-grams that occur in no word map to an empty list.
    val invertedIndex: Map[String, Seq[String]] = vocabVectors
      .toSeq
      .flatMap { case (word, ngrams) =>
        ngrams.toSeq.map { case (ngram, _) => (ngram, word) }
      }
      .groupBy(_._1)
      .map { case (ngram, pairs) => ngram -> pairs.map(_._2) }
      .withDefaultValue(Seq.empty[String])

    new VectorSpaceCandidateGenerator(vocabVectors, invertedIndex, numCandidates)
  }

  /**
   * Get the character ngrams in a word with their counts.
   *
   * The word is wrapped in boundary markers before the n-grams are taken.
   *
   * @param word the word to decompose
   * @param size the n-gram length
   * @return a map from n-gram to its count, defaulting to 0 for absent n-grams
   */
  def getVector(word: String, size: Int = 3): Map[String, Int] =
    (" #" + word + " #")
      .sliding(size)
      .toList
      .groupBy(identity)
      .map { case (ngram, occurrences) => ngram -> occurrences.length }
      .withDefaultValue(0)

  /**
   * Compute the cosine between two vectors.
   *
   * Keys missing from `y` count as 0, whether or not `y` carries a default
   * value. When either vector has zero norm the result is 0.0 rather than NaN.
   *
   * @param x the first vector
   * @param y the second vector
   * @return the dot product divided by the product of the norms
   */
  def cosine(x: Map[String, Int], y: Map[String, Int]): Double = {
    val dotProduct = x.iterator.map { case (k, v) => v * y.getOrElse(k, 0) }.sum
    val denominator = norm(x) * norm(y)
    if (denominator == 0.0) 0.0 else dotProduct / denominator
  }

  /**
   * Compute the Euclidean norm of a vector.
   *
   * @param x the vector
   * @return the square root of the sum of squared counts
   */
  def norm(x: Map[String, Int]): Double = sqrt(x.values.map(v => pow(v, 2)).sum)

}
203
+
0 commit comments