Skip to content

Commit

Permalink
many small changes, sorry future self
Browse files: browse the repository at this point in the history
  • Loading branch information
fgregg committed May 24, 2012
1 parent 8dcf307 commit 1c0dfc2
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 34 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,7 @@ logfile
*.py.*
*.*gz
*.html
.#*
*.*#
kernprof.py
possible_classifiers
7 changes: 3 additions & 4 deletions dedupe.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ def trainModel(training_data, iterations, data_model) :
return(data_model)



def findDuplicates(candidates, data_d, data_model, threshold) :
duplicateScores = []

Expand All @@ -90,7 +89,6 @@ def findDuplicates(candidates, data_d, data_model, threshold) :
#print (pair, score)
if score > threshold :
#print (data_d[pair[0]],data_d[pair[1]])
#print score
duplicateScores.append({ pair : score })

return duplicateScores
Expand All @@ -101,7 +99,7 @@ def findDuplicates(candidates, data_d, data_model, threshold) :
from test_data import init
num_training_dupes = 200
num_training_distinct = 16000
numIterations = 50
numIterations = 100

import time
t0 = time.time()
Expand Down Expand Up @@ -131,6 +129,7 @@ def findDuplicates(candidates, data_d, data_model, threshold) :
commonSixGram),
data_model, 1, 1)


blocked_data = blockingIndex(data_d, predicates)
candidates = mergeBlocks(blocked_data)

Expand Down Expand Up @@ -170,7 +169,7 @@ def findDuplicates(candidates, data_d, data_model, threshold) :

print "finding duplicates ..."
print ""
dupes = findDuplicates(candidates, data_d, data_model, .40)
dupes = findDuplicates(candidates, data_d, data_model, .60)

dupe_ids = set([frozenset(list(dupe_pair.keys()[0])) for dupe_pair in dupes])
true_positives = dupe_ids & duplicates_s
Expand Down
17 changes: 8 additions & 9 deletions lr.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def __init__(self):
self.rate = 0.01
self.weight = {}
self.bias = 0
self.alpha = 0.001
self.alpha = 0.0001
return
# data is a list of [label, feature]. label is an integer,
# 1 for positive instance, 0 for negative instance. feature is
Expand All @@ -28,21 +28,20 @@ def train(self, data, n):
max_update = 0
for [label, feature] in data:
predicted = self.classify(feature)

rate_n = self.rate - (self.rate * i)/float(n)

for f,v in feature.iteritems():
if f not in self.weight:
self.weight[f] = 0
print f
update = (label - predicted) * v - self.alpha * self.weight[f]
self.weight[f] += self.rate * update
self.weight[f] += rate_n * update
if abs(update * self.rate) > max_update :
max_update = abs(update * self.rate)
max_update = abs(update * rate_n)
bias_update = (label - predicted)
self.bias += self.rate * bias_update
print 'iteration', i, 'done. Max update:', max_update
if abs(max_update - old_update)/max_update < .0001 : return
else : old_update = max_update
self.bias += rate_n * bias_update
#print 'iteration', i, 'done. Max update:', max_update
if max_update < .0001 : return
#else : old_update = max_update
return
# feature is a dict object, the key is feature name, the value
# is feature weight. Return value is the probability of being
Expand Down
18 changes: 17 additions & 1 deletion test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,25 @@ def canonicalImport(filename) :
if header[j] == 'unique_id' :
duplicates_d.setdefault(col, []).append(i)
else :
# we may want to think about removing common stop
# words
#col = col.strip()
#col = re.sub('[^a-z0-9 ]', ' ', col)
#col = re.sub(' +', ' ', col)
#col = re.sub('\.', ' ', col)
#col = re.sub(r'\bthe\b', ' ', col)
#col = re.sub(r'restaurant', ' ', col)
#col = re.sub(r'cafe', ' ', col)
#col = re.sub(r'diner', ' ', col)
#col = re.sub(r'\(.*\)', ' ', col)

#col = re.sub(r'\bn\.', ' ', col)
#col = re.sub(r'\bs\.', ' ', col)
#col = re.sub(r'\be\.', ' ', col)
#col = re.sub(r'\bw\.', ' ', col)
col = re.sub(r'\broad\b', 'rd', col)
col = re.sub(' +', ' ', col)


instance[header[j]] = col.strip().strip('"').strip("'")

data_d[i] = instance
Expand Down
30 changes: 10 additions & 20 deletions testaffine.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import cProfile
from affinegap import affineGapDistance
from affinegap import affineGapDistance, normalizedAffineGapDistance

def performanceTest() :
for i in xrange(10000) :
Expand All @@ -10,29 +10,19 @@ def performanceTest() :
def correctnessTest() :
print affineGapDistance('a', 'b', -5, 5, 5, 1) == 5
print affineGapDistance('ab', 'cd', -5, 5, 5, 1) == 10
print affineGapDistance('ab', 'cde', -5, 5, 5, 1) == 15
print affineGapDistance('a', 'cde', -5, 5, 5, 1) == 12
print affineGapDistance('a', 'cd', -5, 5, 5, 1) == 11
print affineGapDistance('ab', 'cde', -5, 5, 5, 1) == 13
print affineGapDistance('a', 'cde', -5, 5, 5, 1) == 8.5
print affineGapDistance('a', 'cd', -5, 5, 5, 1) == 8
print affineGapDistance('b', 'a', -5, 5, 5, 1) == 5
print affineGapDistance('a', 'a', -5, 5, 5, 1) == -5
print affineGapDistance('a', '', -5, 5, 5, 1) == 6
print affineGapDistance('a', '', -5, 5, 5, 1) == 3
print affineGapDistance('aba', 'aaa', -5, 5, 5, 1) == -5
print affineGapDistance('aaa', 'aba', -5, 5, 5, 1) == -5
print affineGapDistance('aaa', 'aa', -5, 5, 5, 1) == -4
print affineGapDistance('aaa', 'a', -5, 5, 5, 1) == 2
print affineGapDistance('aaa', '', -5, 5, 5, 1) == 8
print affineGapDistance('aaa', 'abba', -5, 5, 5, 1) == 1
print affineGapDistance('0', '1', -5, 5, 5, 1) == 10
print affineGapDistance('0', '2', -5, 5, 5, 1) == 10
print affineGapDistance('0', '3', -5, 5, 5, 1) == 10
print affineGapDistance('0', '4', -5, 5, 5, 1) == 10
print affineGapDistance('0', '5', -5, 5, 5, 1) == 10
print affineGapDistance('0', '6', -5, 5, 5, 1) == 10
print affineGapDistance('0', '7', -5, 5, 5, 1) == 10
print affineGapDistance('0', '8', -5, 5, 5, 1) == 10
print affineGapDistance('0', '9', -5, 5, 5, 1) == 10
print affineGapDistance('0', 'a', -5, 5, 5, 1) == 5
print affineGapDistance('0', '', -5, 5, 5, 1) == 6
print affineGapDistance('aaa', 'aa', -5, 5, 5, 1) == -7
print affineGapDistance('aaa', 'a', -5, 5, 5, 1) == -1.5
print affineGapDistance('aaa', '', -5, 5, 5, 1) == 4
print affineGapDistance('aaa', 'abba', -5, 5, 5, 1) == 8
print normalizedAffineGapDistance("bone's", "bone's restaurant", -5, 5, 5, 1)



Expand Down

0 comments on commit 1c0dfc2

Please sign in to comment.