Skip to content

Commit

Permalink
continued replacing training data with numpy arrays, crossvalidation …
Browse files Browse the repository at this point in the history
…currently not working
  • Loading branch information
derekeder committed Sep 7, 2012
1 parent 7ea70c8 commit 97a9054
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 55 deletions.
25 changes: 17 additions & 8 deletions dedupe/crossvalidation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import core
from random import shuffle
import copy
import numpy

#http://code.activestate.com/recipes/521906-k-fold-cross-validation-partition/

Expand All @@ -17,28 +18,38 @@ def gridSearch(training_data,

print "using cross validation to find optimum alpha"
scores = []

fields = training_data[0][1][0]

for alpha in search_space :
all_score = 0
all_N = 0
for training, validation in kFolds(training_data, k) :

data_model = trainer(training, num_iterations, original_data_model, alpha)

weights = numpy.array([data_model['fields'][field]['weight'] for field in fields])
print weights,

(real_labels,
validation_distances) = zip(*[(label, distances)
for label, distances in validation])

predicted_labels = []

for pair in validation_distances :
prediction = data_model["bias"]
for name, distance in pair.iteritems() :
prediction += distance * data_model['fields'][name]["weight"]
predicted_labels = []
bias = data_model["bias"]
print bias
for example in validation_distances :
prediction = bias + numpy.dot(weights, example[1])
print prediction
#for name, distance in example.iteritems() :
# prediction += distance * data_model['fields'][name]["weight"]
if prediction > 0 :
predicted_labels.append(1)
else :
predicted_labels.append(0)

print predicted_labels

score = 0
for real_label, predicted_label in zip(real_labels, predicted_labels) :
if real_label == predicted_label :
Expand All @@ -55,9 +66,7 @@ def gridSearch(training_data,
return best_alpha

def kFolds(training_data, k):

slices = [training_data[i::k] for i in xrange(k)]

for i in xrange(k):
validation = slices[i]
training = [datum
Expand Down
17 changes: 14 additions & 3 deletions dedupe/dedupe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from predicates import *
import blocking
import clustering
import numpy

def sampleDict(d, sample_size) :

Expand Down Expand Up @@ -42,20 +43,30 @@ def initializeSettings(self, fields) :
self.data_model['fields'][k] = v

self.data_model['bias'] = 0

field_dtype = [('names', 'a20', (len(fields)),),
('values', 'f4', (len(fields)),)
]

training_dtype = [('label', 'i4'),
('field_distances', field_dtype)
]
self.training_data = numpy.zeros(0, dtype=training_dtype)


def trainingDistance(self) :
    # Recompute field distances for the labeled pairs and fold them into the
    # existing structured training array. (Diff residue left two versions of
    # the call here; this is the coherent post-change call, which passes the
    # numpy-backed self.training_data so new rows are appended to it.)
    self.training_data = training_sample.addTrainingData(self.training_pairs,
                                                         self.data_model,
                                                         self.training_data)

def findAlpha(self) :
    # Grid-search the regularization strength (alpha) via k-fold
    # cross-validation over the training data.
    # NOTE(review): diff residue left both `k = 10` and `k = 2` lines here;
    # k was dropped to 2 while debugging the numpy migration ("currently not
    # working" per the commit message) — restore a larger k once
    # crossvalidation works again.
    self.alpha = crossvalidation.gridSearch(self.training_data,
                                            core.trainModel,
                                            self.data_model,
                                            k = 2,
                                            num_iterations = self.num_iterations)

def train(self) :
self.findAlpha()
#self.findAlpha()
self.data_model = core.trainModel(self.training_data,
self.num_iterations,
self.data_model,
Expand Down
36 changes: 1 addition & 35 deletions dedupe/lr.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ def __init__(self):
#
# We use online update formula to train the model.
def train(self, data, n):
data = self.convert_to_numpy(data)
num_features = len(data[0][1][1])
self.weight = numpy.zeros(num_features)
self.feature_names = data[0][1][0]
Expand All @@ -33,20 +32,13 @@ def train(self, data, n):
for i in range(n):
max_update = 0
for [label, (_, feature)] in data:
#print feature
predicted = self.classify(feature)
rate_n = self.rate - (self.rate * i)/float(n)

update = (label - predicted) * feature - (self.alpha * self.weight)
#print update
self.weight += rate_n * update

# for f,v in feature.iteritems():
# if f not in self.weight:
# self.weight[f] = 0
# update = (label - predicted) * v - (self.alpha * self.weight[f])
# self.weight[f] += rate_n * update
# if abs(update * self.rate) > max_update :
# max_update = abs(update * rate_n)
bias_update = (label - predicted)
self.bias += rate_n * bias_update
#print 'iteration', i, 'done. Max update:', max_update
Expand All @@ -56,31 +48,5 @@ def train(self, data, n):
# a positive instance.
# Probability that `feature` describes a positive instance: the logistic
# sigmoid of the linear score  bias + weight · feature.
def classify(self, feature):
    score = self.bias + numpy.dot(self.weight, feature)
    return 1.0 / (1.0 + math.exp(-score))

def convert_to_numpy(self, training_data):
    # Convert a list of (label, {field_name: distance}) training examples
    # into a numpy structured array with one row per example.
    # NOTE(review): Python-2-era code — it relies on .keys() and .values()
    # of the same dict yielding aligned sequences, and on numpy accepting
    # them directly in a structured-row assignment. Under Python 3 these are
    # views; confirm behavior before porting.
    fields = training_data[0][1].keys()

    # Per-row compound column: fixed-width (20-byte) string per field name,
    # float32 per field distance, both sized to the number of fields.
    field_dtype = [('names', 'a20', (len(fields)),),
                   ('values', 'f4', (len(fields)),)
                  ]

    # Full row dtype: an int32 label plus the field-distance compound column.
    training_dtype = [('label', 'i4'),
                      ('field_distances', field_dtype)
                     ]

    training_array = numpy.zeros(len(training_data), dtype=training_dtype)

    for i, example in enumerate(training_data) :
        # example = (label, {field_name: distance}); store names and values
        # as parallel arrays inside the 'field_distances' compound column.
        training_array[i] = ((example[0]),
                             (example[1].keys(),
                              example[1].values())
                            )
    return training_array
50 changes: 41 additions & 9 deletions dedupe/training_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,27 +105,59 @@ def activeLearning(data_d, data_model, labelPairFunction, num_questions) :


# appends training data to the training data collection
def addTrainingData(labeled_pairs, data_model, old_training_data=None) :
    """Compute field distances for labeled record pairs and append them to
    the existing structured training array.

    labeled_pairs -- dict mapping label (0 = distinct, 1 = duplicate) to a
        list of record pairs.
    data_model -- dict whose 'fields' entry describes the comparison fields.
    old_training_data -- structured numpy array with dtype
        [('label', 'i4'), ('field_distances', field_dtype)]; may have zero
        rows. Its dtype is reused for the appended rows.

    Returns a new structured array: the old rows followed by one row per
    labeled pair.
    """
    if old_training_data is None :
        # The previous default ([]) crashed immediately on .dtype below, so
        # no working caller relied on it; fail with a clear message instead.
        raise ValueError("old_training_data must be a structured numpy array "
                         "(it may have zero rows)")

    fields = data_model['fields']

    # Reuse the field-distance sub-dtype so calculateDistance can fill a
    # scratch row of the right shape.
    field_dtype = old_training_data.dtype[1]
    distances = numpy.zeros(1, dtype=field_dtype)

    num_existing_examples = old_training_data.shape[0]
    num_training_pairs = len(labeled_pairs[0]) + len(labeled_pairs[1])
    training_data = numpy.zeros(num_existing_examples + num_training_pairs,
                                dtype=old_training_data.dtype)
    # BUGFIX(review): the original allocated room for the existing examples
    # but never copied them in, so the first rows stayed zeroed.
    training_data[:num_existing_examples] = old_training_data

    i = num_existing_examples
    for label, examples in labeled_pairs.items() :
        for pair in examples :
            c_distances = core.calculateDistance(pair[0],
                                                 pair[1],
                                                 fields,
                                                 distances)
            training_data[i] = (label, c_distances)
            i += 1

    return training_data

# def convert_to_numpy(self, training_data):
# fields = training_data[0][1].keys()

# field_dtype = [('names', 'a20', (len(fields)),),
# ('values', 'f4', (len(fields)),)
# ]

# training_dtype = [('label', 'i4'),
# ('field_distances', field_dtype)
# ]

# training_array = numpy.zeros(len(training_data), dtype=training_dtype)

# for i, example in enumerate(training_data) :
# training_array[i] = ((example[0]),
# (example[1].keys(),
# example[1].values())
# )
# return training_array

def consoleLabel(uncertain_pairs, data_d, data_model) :
duplicates = []
nonduplicates = []
Expand Down

0 comments on commit 97a9054

Please sign in to comment.