From fd0c75f809a61d3625a836d2e5dc813afcf2ecd9 Mon Sep 17 00:00:00 2001 From: Christopher Potts Date: Tue, 17 Mar 2020 13:44:26 -0700 Subject: [PATCH] Control of sampling_rate for experiment train and test --- rel_ext.py | 15 ++++++++++----- test/test_rel_ext.py | 2 ++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/rel_ext.py b/rel_ext.py index 337aea7..b49b904 100644 --- a/rel_ext.py +++ b/rel_ext.py @@ -450,8 +450,8 @@ def macro_average_results(results): return avg_result -def evaluate(splits, classifier, test_split='dev', verbose=True): - test_kbts_by_rel, true_labels_by_rel = splits[test_split].build_dataset() +def evaluate(splits, classifier, test_split='dev', sampling_rate=0.1, verbose=True): + test_kbts_by_rel, true_labels_by_rel = splits[test_split].build_dataset(sampling_rate=sampling_rate) results = {} if verbose: print_statistics_header() @@ -475,10 +475,11 @@ def train_models( split_name='train', model_factory=(lambda: LogisticRegression( fit_intercept=True, solver='liblinear', random_state=42)), + sampling_rate=0.1, vectorize=True, verbose=True): train_dataset = splits[split_name] - train_o, train_y = train_dataset.build_dataset() + train_o, train_y = train_dataset.build_dataset(sampling_rate=sampling_rate) train_X, vectorizer = train_dataset.featurize( train_o, featurizers, vectorize=vectorize) models = {} @@ -493,9 +494,9 @@ def train_models( 'vectorize': vectorize} -def predict(splits, train_result, split_name='dev', vectorize=True): +def predict(splits, train_result, split_name='dev', sampling_rate=0.1, vectorize=True): assess_dataset = splits[split_name] - assess_o, assess_y = assess_dataset.build_dataset() + assess_o, assess_y = assess_dataset.build_dataset(sampling_rate=sampling_rate) test_X, _ = assess_dataset.featurize( assess_o, featurizers=train_result['featurizers'], @@ -531,6 +532,8 @@ def experiment( test_split='dev', model_factory=(lambda: LogisticRegression( fit_intercept=True, solver='liblinear', random_state=42)), + train_sampling_rate=0.1, + test_sampling_rate=0.1, vectorize=True, verbose=True): train_result = train_models( @@ -538,12 +541,14 @@ def experiment( featurizers=featurizers, split_name=train_split, model_factory=model_factory, + sampling_rate=train_sampling_rate, vectorize=vectorize, verbose=verbose) predictions, test_y = predict( splits, train_result, split_name=test_split, + sampling_rate=test_sampling_rate, vectorize=vectorize) evaluate_predictions( predictions, diff --git a/test/test_rel_ext.py b/test/test_rel_ext.py index 7f16d03..9e0358d 100644 --- a/test/test_rel_ext.py +++ b/test/test_rel_ext.py @@ -84,6 +84,8 @@ def test_experiment(featurizer, vectorize, corpus, kb): train_split='tiny_train', test_split='tiny_dev', featurizers=[featurizer], + train_sampling_rate=0.2, + test_sampling_rate=0.2, vectorize=vectorize, verbose=False)