Commit ba2224b

Add answers for project phase 2.
1 parent ea3f4fc commit ba2224b

File tree: 2 files changed (+267, -1 lines)

project/phase2/answers_p2.txt

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
Name:
EID:

#### 1

* List the last five tweets from Austin, San Francisco, and New York City.

#### 2

The address for your Github fork:

#### 3

Show the command line calls requested.

* follow @wired, @theeconomist, @nytimes, and @wsj

* search for scala, java and python

* search the bounding boxes around Austin, San Francisco, and New York City

#### 4

* Authorization description

* Code description

* Questions

#### 5

* What you did to isEnglish

* Accuracy and description

#### 6

* Summary output

* List ten tweets and say whether their labels are correct.

* Remarks on anything extra you did.

#### 7

* Give each of the terms you compared and their summary output.

#### 8

* Summary output for the three cities.

#### Extra

Discuss what you did, including any additional programs you wrote and how to run them (if that is straightforward enough), and a discussion of the output you obtained.

src/main/scala/appliednlp/spell/Spelling.scala

Lines changed: 195 additions & 1 deletion
@@ -3,7 +3,201 @@ package appliednlp.spell
The stub body of main, println("Hello"), is removed; everything below it is new:

object SpellingCorrector {

  def main(args: Array[String]) {

    // The input sentence
    val input = args(0)

    // Get the vocabulary from one or both input files (when available)
    val firstVocab = getVocab(args(1))
    val secondVocab = if (args.length>2) getVocab(args(2)) else Set[String]()
    val vocab = firstVocab ++ secondVocab

    // Set up the vs candidate generator
    val vsCandGen = VectorSpaceCandidateGenerator(vocab)

    val editCandGen1 = EditDistanceCandidateGenerator(vocab)
    val editCandGen2 = EditDistanceCandidateGenerator(vocab, TwoEdits)

    // Get the language model
    val unigramProb = LanguageModel(io.Source.fromFile(args(3)).mkString)

    // Process the input sentence.
    println("Detecting spelling errors in: " + input)
    input.split(" ").foreach { token => {
      if (!vocab(token)) {
        println("ERROR: " + token)
        val vsCandidates = vsCandGen(token)
        println(" VS: " + vsCandidates.toSeq.sorted.mkString(" "))

        val ed1Candidates = editCandGen1(token)
        println(" ED1: " + ed1Candidates.toSeq.sorted.mkString(" "))

        val ed2Candidates = editCandGen2(token)
        println(" ED2: " + ed2Candidates.toSeq.sorted.mkString(" "))

        val allCandidates = vsCandidates ++ ed1Candidates
        // rank candidates by unigram probability and keep the most probable one
        val best = allCandidates.toSeq.map(c => (c, unigramProb(c))).maxBy(_._2)._1
println(" Best: " + best)
41+
}
42+
}}
43+
}
44+
45+
// Get a word list from a file with one word per line
46+
def getVocab(filename: String) =
47+
io.Source.fromFile(filename).getLines.toSet
48+
49+
}
50+
51+
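As a quick, hypothetical smoke test (file names below are placeholders, not part of the commit), main can be driven directly; note that the language-model text is read from args(3), so a second vocabulary file has to be supplied even if it just duplicates the first:

  // Illustrative invocation only; any word lists and plain-text corpus will do.
  SpellingCorrector.main(Array(
    "Ths is a tst",       // sentence to check
    "words-english.txt",  // first vocabulary, one word per line
    "words-extra.txt",    // second vocabulary (required, since the corpus is args(3))
    "corpus.txt"          // raw text used to build the unigram language model
  ))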
object LanguageModel {

  def apply(text: String) = {
    val unigramCounts = collection.mutable.HashMap[String,Double]().withDefault(x=>0.0)
    var numTokens = 1
    text
      .replaceAll("""[^a-zA-Z\s]""","")
      .replaceAll("\\s+"," ")
      .split(" ")
      .foreach { word => {
        unigramCounts(word) += 1
        numTokens += 1
      }}
    unigramCounts.mapValues(_/numTokens).toMap.withDefault(x=>1.0/numTokens)
  }
}
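A small worked example of the resulting unigram model (illustrative, not part of the commit); because numTokens starts at 1, the five tokens below give a denominator of 6:

  val lm = LanguageModel("the cat and the dog")
  lm("the")    // 2/6 ≈ 0.333
  lm("cat")    // 1/6 ≈ 0.167
  lm("zebra")  // unseen words fall back to 1/6 ≈ 0.167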
/**
 * Candidate generators produce valid words from the vocabulary that
 * are close (by some measure) to the typo.
 */
trait CandidateGenerator {

  /**
   * Produce a set of candidates for the typo.
   *
   * @param typo the typo that we need candidates for
   * @return the set of candidates as determined by this candidate generator
   */
  def apply(typo: String): Set[String]
}
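For illustration only, the simplest conceivable implementation of this contract would offer every vocabulary word that shares the typo's first letter (a toy sketch, not part of the commit):

  // Toy generator: crude, but it shows the shape of the CandidateGenerator contract.
  class FirstLetterCandidateGenerator(vocab: Set[String]) extends CandidateGenerator {
    def apply(typo: String): Set[String] =
      vocab.filter(word => word.nonEmpty && typo.nonEmpty && word.head == typo.head)
  }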
sealed trait NumEdits
object OneEdit extends NumEdits
object TwoEdits extends NumEdits

class EditDistanceCandidateGenerator(vocab: Set[String], distance: NumEdits)
  extends CandidateGenerator {

  val alpha = ('A' to 'Z') ++ ('a' to 'z')

  def apply(typo: String) = {
    val candidates = distance match {
      case OneEdit => edits(typo)
      case TwoEdits => for (e1 <- edits(typo); e2 <- edits(e1)) yield e2
    }
    candidates.filter(vocab)
  }

  def edits(typo: String) = {

    val typoSeq = typo.toSeq
    val typoLength = typoSeq.length
    val nonTranspositions = (0 until typoLength).flatMap { i => {
      val deletion = (typoSeq.take(i) ++ typoSeq.drop(i+1)).mkString
      val substitutions =
        for (c <- alpha)
          yield (typoSeq.take(i) ++ Seq(c) ++ typoSeq.drop(i+1)).mkString
      val insertions =
        for (c <- alpha)
          yield (typoSeq.take(i) ++ Seq(c) ++ typoSeq.drop(i)).mkString
      Seq(deletion) ++ substitutions ++ insertions
    }}
    val transpositions = (1 until typoLength).map { i => {
      (typoSeq.take(i-1)
        ++ Seq(typoSeq(i),typoSeq(i-1))
        ++ typoSeq.drop(i+1)).mkString
    }}

    (nonTranspositions ++ transpositions).toSet
  }

}

object EditDistanceCandidateGenerator {

  def apply(vocab: Set[String], distance: NumEdits = OneEdit) =
    new EditDistanceCandidateGenerator(vocab, distance)
}
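A quick check of what this generator produces for a classic transposition typo, assuming a toy vocabulary (illustrative, not part of the commit):

  // "teh" -> "the" by transposing 'e' and 'h', "tea"/"ten" by substitution.
  val oneEdit = EditDistanceCandidateGenerator(Set("the", "tea", "ten", "dog"))
  oneEdit("teh")   // Set(the, tea, ten)

  // With TwoEdits, every one-edit string is edited again, so words two edits
  // away (e.g. "toe") also survive the vocabulary filter.
  val twoEdits = EditDistanceCandidateGenerator(Set("the", "tea", "ten", "toe", "dog"), TwoEdits)
  twoEdits("teh")  // Set(the, tea, ten, toe)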
class VectorSpaceCandidateGenerator(
  vocabVectors: Map[String, Map[String, Int]],
  invertedIndex: Map[String, Seq[String]],
  numCandidates: Int
) extends CandidateGenerator {

  import VectorSpaceCandidateGenerator._

  // Retrieve, via the inverted index, every vocabulary word that shares at
  // least one character ngram with the typo, then keep the numCandidates
  // words with the highest cosine similarity to the typo's ngram vector.
  def apply(typo: String) = {
    val typoVector = getVector(typo)
    typoVector
      .keys
      .flatMap(invertedIndex)
      .toSeq
      .map(c => (c,cosine(typoVector,vocabVectors(c))))
      .sortBy(_._2)
      .takeRight(numCandidates)
      .map(_._1)
      .toSet
  }

}
/**
 * A companion object to help set up VS candidate generators and
 * provide helper functions.
 */
object VectorSpaceCandidateGenerator {
  import math.{sqrt,pow}

  def apply(vocab: Set[String], numCandidates: Int = 20) = {

    // A map from words to their counts. Can be used later to look up
    // vectors for candidates without needing to recompute the counts.
    // (Trading use of more space to make cosine computations faster.)
    val vocabVectors: Map[String,Map[String,Int]] =
      vocab.map(word => (word, getVector(word))).toMap

    // Build the inverted index.
    val invertedIndex = vocabVectors
      .toSeq
      .flatMap { case(word, ngrams) => {
        ngrams.map { case(ngram, count) => (ngram,word) }.toSeq
      }}
      .groupBy(x=>x._1)
      .mapValues(_.map(_._2))
      .withDefault(x=>Seq[String]())

    new VectorSpaceCandidateGenerator(vocabVectors, invertedIndex, numCandidates)
  }

  // Get the character ngrams in a word with their counts
  def getVector(word: String, size: Int = 3): Map[String,Int] =
    ("#"+word+"#")
      .sliding(size)
      .toSeq
      .groupBy(x=>x)
      .mapValues(_.length)
      .withDefault(x=>0)

  // Compute the cosine between two vectors
  def cosine(x: Map[String,Int], y: Map[String, Int]) = {
    val dotProduct = x.map { case(k,v) => v*y(k) }.sum
    dotProduct/(norm(x)*norm(y))
  }

  // Compute the Euclidean norm of a vector
  def norm(x: Map[String,Int]) = sqrt(x.values.map(pow(_,2)).sum)

}
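To make the ngram arithmetic concrete, a short illustrative sketch of what these helpers compute (not part of the commit):

  // "#the#" yields the trigrams #th, the, he#, each once.
  VectorSpaceCandidateGenerator.getVector("the")
  // -> Map(#th -> 1, the -> 1, he# -> 1)

  // "them" shares #th and the with "the": dot product 2, norms sqrt(3) and 2,
  // so the cosine is 2 / (sqrt(3) * 2) ≈ 0.577.
  import VectorSpaceCandidateGenerator.{getVector, cosine}
  cosine(getVector("the"), getVector("them"))   // ≈ 0.577

  // End to end: only words sharing a trigram with the typo are ever scored.
  val vsGen = VectorSpaceCandidateGenerator(Set("the", "three", "dog"))
  vsGen("thee")   // Set(the, three); "dog" shares no trigram with "thee"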