From 97a9054beb41a4c62824e597353c1a918f2ee47e Mon Sep 17 00:00:00 2001 From: Derek Eder Date: Fri, 7 Sep 2012 13:04:03 -0500 Subject: [PATCH] continued replacing training data with numpy arrays, crossvalidation currently not working --- dedupe/crossvalidation.py | 25 +++++++++++++------- dedupe/dedupe.py | 17 ++++++++++--- dedupe/lr.py | 36 +--------------------------- dedupe/training_sample.py | 50 ++++++++++++++++++++++++++++++++------- 4 files changed, 73 insertions(+), 55 deletions(-) diff --git a/dedupe/crossvalidation.py b/dedupe/crossvalidation.py index 5d1041a3e..535bb13b0 100644 --- a/dedupe/crossvalidation.py +++ b/dedupe/crossvalidation.py @@ -1,6 +1,7 @@ import core from random import shuffle import copy +import numpy #http://code.activestate.com/recipes/521906-k-fold-cross-validation-partition/ @@ -17,28 +18,38 @@ def gridSearch(training_data, print "using cross validation to find optimum alpha" scores = [] + + fields = training_data[0][1][0] + for alpha in search_space : all_score = 0 all_N = 0 for training, validation in kFolds(training_data, k) : - data_model = trainer(training, num_iterations, original_data_model, alpha) + + weights = numpy.array([data_model['fields'][field]['weight'] for field in fields]) + print weights, (real_labels, validation_distances) = zip(*[(label, distances) for label, distances in validation]) - predicted_labels = [] - for pair in validation_distances : - prediction = data_model["bias"] - for name, distance in pair.iteritems() : - prediction += distance * data_model['fields'][name]["weight"] + predicted_labels = [] + bias = data_model["bias"] + print bias + for example in validation_distances : + prediction = bias + numpy.dot(weights, example[1]) + print prediction + #for name, distance in example.iteritems() : + # prediction += distance * data_model['fields'][name]["weight"] if prediction > 0 : predicted_labels.append(1) else : predicted_labels.append(0) + print predicted_labels + score = 0 for real_label, predicted_label in zip(real_labels, predicted_labels) : if real_label == predicted_label : @@ -55,9 +66,7 @@ def gridSearch(training_data, return best_alpha def kFolds(training_data, k): - slices = [training_data[i::k] for i in xrange(k)] - for i in xrange(k): validation = slices[i] training = [datum diff --git a/dedupe/dedupe.py b/dedupe/dedupe.py index b686e3ded..4580166ea 100644 --- a/dedupe/dedupe.py +++ b/dedupe/dedupe.py @@ -7,6 +7,7 @@ from predicates import * import blocking import clustering +import numpy def sampleDict(d, sample_size) : @@ -42,20 +43,30 @@ def initializeSettings(self, fields) : self.data_model['fields'][k] = v self.data_model['bias'] = 0 + + field_dtype = [('names', 'a20', (len(fields)),), + ('values', 'f4', (len(fields)),) + ] + + training_dtype = [('label', 'i4'), + ('field_distances', field_dtype) + ] + self.training_data = numpy.zeros(0, dtype=training_dtype) + def trainingDistance(self) : self.training_data = training_sample.addTrainingData(self.training_pairs, - self.data_model) + self.data_model, self.training_data) def findAlpha(self) : self.alpha = crossvalidation.gridSearch(self.training_data, core.trainModel, self.data_model, - k = 10, + k = 2, num_iterations = self.num_iterations) def train(self) : - self.findAlpha() + #self.findAlpha() self.data_model = core.trainModel(self.training_data, self.num_iterations, self.data_model, diff --git a/dedupe/lr.py b/dedupe/lr.py index 53f375ac5..84b45c2dd 100644 --- a/dedupe/lr.py +++ b/dedupe/lr.py @@ -24,7 +24,6 @@ def __init__(self): # # We use online update formula to train the model. def train(self, data, n): - data = self.convert_to_numpy(data) num_features = len(data[0][1][1]) self.weight = numpy.zeros(num_features) self.feature_names = data[0][1][0] @@ -33,20 +32,13 @@ def train(self, data, n): for i in range(n): max_update = 0 for [label, (_, feature)] in data: - #print feature predicted = self.classify(feature) rate_n = self.rate - (self.rate * i)/float(n) update = (label - predicted) * feature - (self.alpha * self.weight) + #print update self.weight += rate_n * update - # for f,v in feature.iteritems(): - # if f not in self.weight: - # self.weight[f] = 0 - # update = (label - predicted) * v - (self.alpha * self.weight[f]) - # self.weight[f] += rate_n * update - # if abs(update * self.rate) > max_update : - # max_update = abs(update * rate_n) bias_update = (label - predicted) self.bias += rate_n * bias_update #print 'iteration', i, 'done. Max update:', max_update @@ -56,31 +48,5 @@ def train(self, data, n): # a positive instance. def classify(self, feature): logit = self.bias - logit += numpy.dot(self.weight, feature) - # for f,v in feature.iteritems(): - # coef = 0 - # if f in self.weight: - # coef = self.weight[f] - # logit += coef * v return 1.0 / (1.0 + math.exp(-logit)) - - def convert_to_numpy(self, training_data): - fields = training_data[0][1].keys() - - field_dtype = [('names', 'a20', (len(fields)),), - ('values', 'f4', (len(fields)),) - ] - - training_dtype = [('label', 'i4'), - ('field_distances', field_dtype) - ] - - training_array = numpy.zeros(len(training_data), dtype=training_dtype) - - for i, example in enumerate(training_data) : - training_array[i] = ((example[0]), - (example[1].keys(), - example[1].values()) - ) - return training_array diff --git a/dedupe/training_sample.py b/dedupe/training_sample.py index 2e64ff21b..697a8bc55 100644 --- a/dedupe/training_sample.py +++ b/dedupe/training_sample.py @@ -105,27 +105,59 @@ def activeLearning(data_d, data_model, labelPairFunction, num_questions) : # appends training data to the training data collection -def addTrainingData(labeled_pairs, data_model, training_data=[]) : +def addTrainingData(labeled_pairs, data_model, old_training_data=[]) : fields = data_model['fields'] - field_dtype = [('names', 'a10', (len(fields)),), - ('values', 'f4', (len(fields)),) - ] + # field_dtype = [('names', 'a10', (len(fields)),), + # ('values', 'f4', (len(fields)),) + # ] + field_dtype = old_training_data.dtype[1] distances = numpy.zeros(1, dtype=field_dtype) - + num_existing_examples = old_training_data.shape[0] + num_training_pairs = len(labeled_pairs[0]) + len(labeled_pairs[1]) + training_data = numpy.zeros(num_training_pairs + num_existing_examples, dtype=old_training_data.dtype) + + i = num_existing_examples for label, examples in labeled_pairs.items() : - for pair in examples : + for i, pair in enumerate(examples, i) : c_distances = core.calculateDistance(pair[0], pair[1], fields, distances) - c_distances = dict(zip(fields.keys(), c_distances[0]['values'])) - training_data.append((label, c_distances)) - + + example = ((label), c_distances) + training_data[i] = example + i += 1 + # print training_data + # raise + # c_distances = dict(zip(fields.keys(), c_distances[0]['values'])) + # training_data.append((label, c_distances)) + + #print c_distances return training_data + # def convert_to_numpy(self, training_data): + # fields = training_data[0][1].keys() + + # field_dtype = [('names', 'a20', (len(fields)),), + # ('values', 'f4', (len(fields)),) + # ] + + # training_dtype = [('label', 'i4'), + # ('field_distances', field_dtype) + # ] + + # training_array = numpy.zeros(len(training_data), dtype=training_dtype) + + # for i, example in enumerate(training_data) : + # training_array[i] = ((example[0]), + # (example[1].keys(), + # example[1].values()) + # ) + # return training_array + def consoleLabel(uncertain_pairs, data_d, data_model) : duplicates = [] nonduplicates = []