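many small changes, sorry future self

Summary of the changes in this commit:
- .gitignore: also ignore files matching `.#*` and `*.*#`
- dedupe.py: numIterations 50 -> 100; findDuplicates threshold .40 -> .60; remove a commented-out print
- lr.py: alpha 0.001 -> 0.0001; the step size now decays linearly with the iteration number (rate_n); debug prints removed or commented out; training stops once max_update < .0001
- test_data.py: replace the word "road" with "rd" and collapse repeated spaces; add commented-out stop-word ideas
- testaffine.py: import normalizedAffineGapDistance; update the expected affineGapDistance values; remove the digit-pair checks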

dedupeio · May 24, 2012 · 1c0dfc2
1 parent 8dcf307
commit 1c0dfc2
Show file tree

Hide file tree

Showing 5 changed files with 40 additions and 34 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,5 +8,7 @@ logfile
 *.py.*
 *.*gz
 *.html
+.#*
+*.*#
 kernprof.py
 possible_classifiers
diff --git a/dedupe.py b/dedupe.py
@@ -73,7 +73,6 @@ def trainModel(training_data, iterations, data_model) :
     return(data_model)
 
 
-
 def findDuplicates(candidates, data_d, data_model, threshold) :
   duplicateScores = []
 
@@ -90,7 +89,6 @@ def findDuplicates(candidates, data_d, data_model, threshold) :
     #print (pair, score)
     if score > threshold :
     #print (data_d[pair[0]],data_d[pair[1]])
-    #print score
       duplicateScores.append({ pair : score })
 
   return duplicateScores
@@ -101,7 +99,7 @@ def findDuplicates(candidates, data_d, data_model, threshold) :
   from test_data import init
   num_training_dupes = 200
   num_training_distinct = 16000
-  numIterations = 50
+  numIterations = 100
 
   import time
   t0 = time.time()
@@ -131,6 +129,7 @@ def findDuplicates(candidates, data_d, data_model, threshold) :
                              commonSixGram),
                              data_model, 1, 1)
 
+
   blocked_data = blockingIndex(data_d, predicates)
   candidates = mergeBlocks(blocked_data)
 
@@ -170,7 +169,7 @@ def findDuplicates(candidates, data_d, data_model, threshold) :
 
   print "finding duplicates ..."
   print ""
-  dupes = findDuplicates(candidates, data_d, data_model, .40)
+  dupes = findDuplicates(candidates, data_d, data_model, .60)
 
   dupe_ids = set([frozenset(list(dupe_pair.keys()[0])) for dupe_pair in dupes])
   true_positives = dupe_ids & duplicates_s

diff --git a/lr.py b/lr.py
@@ -12,7 +12,7 @@ def __init__(self):
         self.rate = 0.01
         self.weight = {}
         self.bias = 0
-        self.alpha = 0.001
+        self.alpha = 0.0001
         return
     # data is a list of [label, feature]. label is an integer,
     # 1 for positive instance, 0 for negative instance. feature is
@@ -28,21 +28,20 @@ def train(self, data, n):
             max_update = 0
             for [label, feature] in data:
                 predicted = self.classify(feature)
-
+                rate_n = self.rate - (self.rate * i)/float(n)
 
                 for f,v in feature.iteritems():
                     if f not in self.weight:
                         self.weight[f] = 0
-                        print f
                     update = (label - predicted) * v - self.alpha * self.weight[f]
-                    self.weight[f] += self.rate * update
+                    self.weight[f] += rate_n * update
                     if abs(update * self.rate) > max_update :
-                        max_update = abs(update * self.rate)
+                        max_update = abs(update * rate_n)
                 bias_update = (label - predicted) 
-                self.bias += self.rate * bias_update
-            print 'iteration', i, 'done. Max update:', max_update
-            if abs(max_update - old_update)/max_update < .0001 : return
-            else : old_update = max_update
+                self.bias += rate_n * bias_update
+            #print 'iteration', i, 'done. Max update:', max_update
+            if max_update < .0001 : return
+            #else : old_update = max_update
         return
     # feature is a dict object, the key is feature name, the value
     # is feature weight. Return value is the probability of being

diff --git a/test_data.py b/test_data.py
@@ -16,9 +16,25 @@ def canonicalImport(filename) :
               if header[j] == 'unique_id' :
                 duplicates_d.setdefault(col, []).append(i)
               else :
+                # we may want to think about removing common stop
+                # words
                 #col = col.strip()
                 #col = re.sub('[^a-z0-9 ]', ' ', col)
-                #col = re.sub('  +', ' ', col)
+                #col = re.sub('\.', ' ', col)
+                #col = re.sub(r'\bthe\b', ' ', col)
+                #col = re.sub(r'restaurant', ' ', col)
+                #col = re.sub(r'cafe', ' ', col)
+                #col = re.sub(r'diner', ' ', col)
+                #col = re.sub(r'\(.*\)', ' ', col)
+
+                #col = re.sub(r'\bn\.', ' ', col)
+                #col = re.sub(r'\bs\.', ' ', col)
+                #col = re.sub(r'\be\.', ' ', col)
+                #col = re.sub(r'\bw\.', ' ', col)
+                col = re.sub(r'\broad\b', 'rd', col)
+                col = re.sub('  +', ' ', col)
+
+
                 instance[header[j]] = col.strip().strip('"').strip("'")
 
             data_d[i] = instance

diff --git a/testaffine.py b/testaffine.py
@@ -1,5 +1,5 @@
 import cProfile
-from affinegap import affineGapDistance
+from affinegap import affineGapDistance, normalizedAffineGapDistance
 
 def performanceTest() :
   for i in xrange(10000) :
@@ -10,29 +10,19 @@ def performanceTest() :
 def correctnessTest() :
   print affineGapDistance('a', 'b', -5, 5, 5, 1) == 5
   print affineGapDistance('ab', 'cd', -5, 5, 5, 1) == 10
-  print affineGapDistance('ab', 'cde', -5, 5, 5, 1) == 15
-  print affineGapDistance('a', 'cde', -5, 5, 5, 1) == 12
-  print affineGapDistance('a', 'cd', -5, 5, 5, 1) == 11
+  print affineGapDistance('ab', 'cde', -5, 5, 5, 1) == 13
+  print affineGapDistance('a', 'cde', -5, 5, 5, 1) == 8.5
+  print affineGapDistance('a', 'cd', -5, 5, 5, 1) == 8
   print affineGapDistance('b', 'a', -5, 5, 5, 1) == 5
   print affineGapDistance('a', 'a', -5, 5, 5, 1) == -5
-  print affineGapDistance('a', '', -5, 5, 5, 1) == 6
+  print affineGapDistance('a', '', -5, 5, 5, 1) == 3
   print affineGapDistance('aba', 'aaa', -5, 5, 5, 1) == -5
   print affineGapDistance('aaa', 'aba', -5, 5, 5, 1) == -5
-  print affineGapDistance('aaa', 'aa', -5, 5, 5, 1) == -4
-  print affineGapDistance('aaa', 'a', -5, 5, 5, 1) == 2
-  print affineGapDistance('aaa', '', -5, 5, 5, 1) == 8
-  print affineGapDistance('aaa', 'abba', -5, 5, 5, 1) == 1
-  print affineGapDistance('0', '1', -5, 5, 5, 1) == 10
-  print affineGapDistance('0', '2', -5, 5, 5, 1) == 10
-  print affineGapDistance('0', '3', -5, 5, 5, 1) == 10
-  print affineGapDistance('0', '4', -5, 5, 5, 1) == 10
-  print affineGapDistance('0', '5', -5, 5, 5, 1) == 10
-  print affineGapDistance('0', '6', -5, 5, 5, 1) == 10
-  print affineGapDistance('0', '7', -5, 5, 5, 1) == 10
-  print affineGapDistance('0', '8', -5, 5, 5, 1) == 10
-  print affineGapDistance('0', '9', -5, 5, 5, 1) == 10
-  print affineGapDistance('0', 'a', -5, 5, 5, 1) == 5
-  print affineGapDistance('0', '', -5, 5, 5, 1) == 6
+  print affineGapDistance('aaa', 'aa', -5, 5, 5, 1) == -7
+  print affineGapDistance('aaa', 'a', -5, 5, 5, 1) == -1.5
+  print affineGapDistance('aaa', '', -5, 5, 5, 1) == 4
+  print affineGapDistance('aaa', 'abba', -5, 5, 5, 1) == 8
+  print normalizedAffineGapDistance("bone's", "bone's restaurant", -5, 5, 5, 1)
diff --git a/.gitignore b/.gitignore
@@ -8,5 +8,7 @@ logfile
 *.py.*
 *.*gz
 *.html
+.#*
+*.*#
 kernprof.py
 possible_classifiers