New vectorize=False option to support deep learning models
cgpotts committed Feb 5, 2020
1 parent 3e2b4d3 commit c5d12df
Showing 3 changed files with 218 additions and 14 deletions.
89 changes: 86 additions & 3 deletions hw_rel_ext.ipynb
@@ -25,7 +25,9 @@
"\n",
"1. [Overview](#Overview)\n",
"1. [Set-up](#Set-up)\n",
"1. [Baseline](#Baseline)\n",
"1. [Baselines](#Baselines)\n",
" 1. [Hand-build feature functions](#Hand-build-feature-functions)\n",
" 1. [Distributed representations](#Distributed-representations)\n",
"1. [Homework questions](#Homework-questions)\n",
" 1. [Different model factory [1 points]](#Different-model-factory-[1-points])\n",
" 1. [Directional unigram features [1.5 points]](#Directional-unigram-features-[1.5-points])\n",
@@ -61,9 +63,11 @@
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import os\n",
"import rel_ext\n",
"from sklearn.linear_model import LogisticRegression"
"from sklearn.linear_model import LogisticRegression\n",
"import utils"
]
},
{
@@ -141,7 +145,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Baseline"
"## Baselines"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Hand-build feature functions"
]
},
{
@@ -209,6 +220,78 @@
"rel_ext.examine_model_weights(baseline_results)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Distributed representations\n",
"\n",
"This simple baseline sums the GloVe vector representations for all of the words in the \"middle\" span and feeds those representations into the standard `LogisticRegression`-based `model_factory`. The crucial parameter that enables this is `vectorize=False`. This essentially says to `rel_ext.experiment` that your featurizer or your model will do the work of turning examples into vectors; in that case, `rel_ext.experiment` just organizes these representations by relation type."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"GLOVE_HOME = os.path.join('data', 'glove.6B')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"glove_lookup = utils.glove2dict(\n",
" os.path.join(GLOVE_HOME, 'glove.6B.300d.txt'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def glove_middle_featurizer(kbt, corpus, np_func=np.sum):\n",
" reps = []\n",
" for ex in corpus.get_examples_for_entities(kbt.sbj, kbt.obj):\n",
" for word in ex.middle.split():\n",
" rep = glove_lookup.get(word)\n",
" if rep is not None:\n",
" reps.append(rep)\n",
" # A random representation of the right dimensionality if the\n",
" # example happens not to overlap with GloVe's vocabulary:\n",
" if len(reps) == 0:\n",
" dim = len(next(iter(glove_lookup.values()))) \n",
" return utils.randvec(n=dim)\n",
" else:\n",
" return np_func(reps, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"glove_results = rel_ext.experiment(\n",
" splits,\n",
" train_split='train',\n",
" test_split='dev',\n",
" featurizers=[glove_middle_featurizer], \n",
" vectorize=False, # Crucial for this featurizer!\n",
" verbose=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With the same basic code design, one can also use the PyTorch models included in the course repo, or write new ones that are better aligned with the task. For those models, it's likely that the featurizer will just return a list of tokens (or perhaps a list of lists of tokens), and the model will map those into vectors using an embedding."
]
},
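
To make that contract concrete, here is a minimal sketch of a token-returning featurizer for use with `vectorize=False`. The function name is hypothetical and the downstream embedding-based model is not shown; the only fixed requirements, per `featurize` below, are the `(kbt, corpus)` signature and that it be the sole entry in `featurizers`:

def middle_tokens_featurizer(kbt, corpus):
    """Hypothetical featurizer: collect the raw tokens from the
    'middle' spans. An embedding-based model would map these
    tokens to vectors itself."""
    toks = []
    for ex in corpus.get_examples_for_entities(kbt.sbj, kbt.obj):
        toks.extend(ex.middle.split())
    return toks
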
{
"cell_type": "markdown",
"metadata": {},
65 changes: 56 additions & 9 deletions rel_ext.py
@@ -250,7 +250,7 @@ def find_unrelated_pairs(self):
unrelated_pairs.add((ex.entity_2, ex.entity_1))
return unrelated_pairs

def featurize(self, kbts_by_rel, featurizers, vectorizer=None):
def featurize(self, kbts_by_rel, featurizers, vectorizer=None, vectorize=True):
"""Featurize by relation.
Parameters
@@ -266,8 +266,37 @@ def featurize(self, kbts_by_rel, featurizers, vectorizer=None):
If None, a new `DictVectorizer` is created and used via
`fit`. This is primarily for training. If not None, then
`transform` is used. This is primarily for testing.
vectorize: bool
If True, the feature functions in `featurizers` are presumed
to create feature dicts, and a `DictVectorizer` is used. If
False, then `featurizers` is required to have exactly one
function in it, and that function must return exactly the
sort of objects that the models in the model factory take
as inputs.
Returns
-------
feat_matrices_by_rel, vectorizer
where (i) `feat_matrices_by_rel` is a dict mapping relation
names to lists of representations if `vectorize=False`, else
to `np.array`s, and (ii) `vectorizer` is a `DictVectorizer`
if `vectorize=True`, else None
"""
if not vectorize:

feat_matrices_by_rel = defaultdict(list)
if len(featurizers) != 1:
raise ValueError(
"If `vectorize=True`, the `featurizers` argument "
"must contain exactly one function.")
featurizer = featurizers[0]
for rel, kbts in kbts_by_rel.items():
for kbt in kbts:
rep = featurizer(kbt, self.corpus)
feat_matrices_by_rel[rel].append(rep)
return feat_matrices_by_rel, None

# Create feature counters for all instances (kbts).
feat_counters_by_rel = defaultdict(list)
for rel, kbts in kbts_by_rel.items():
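
As a quick illustration of the two return shapes documented above (an editorial sketch, not part of the commit, assuming a `Dataset` instance `dataset`, `kbts_by_rel` from `build_dataset()`, and the featurizers from the notebook):

# Default dict-based path: matrices by relation plus a DictVectorizer.
X_by_rel, vectorizer = dataset.featurize(kbts_by_rel, featurizers)

# Representation path: lists of representations by relation, vectorizer is None.
reps_by_rel, vectorizer = dataset.featurize(
    kbts_by_rel, [glove_middle_featurizer], vectorize=False)
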
@@ -446,10 +475,12 @@ def train_models(
split_name='train',
model_factory=(lambda: LogisticRegression(
fit_intercept=True, solver='liblinear', random_state=42)),
vectorize=True,
verbose=True):
train_dataset = splits[split_name]
train_o, train_y = train_dataset.build_dataset()
train_X, vectorizer = train_dataset.featurize(train_o, featurizers)
train_X, vectorizer = train_dataset.featurize(
train_o, featurizers, vectorize=vectorize)
models = {}
for rel in splits['all'].kb.all_relations:
models[rel] = model_factory()
Expand All @@ -458,16 +489,18 @@ def train_models(
'featurizers': featurizers,
'vectorizer': vectorizer,
'models': models,
'all_relations': splits['all'].kb.all_relations}
'all_relations': splits['all'].kb.all_relations,
'vectorize': vectorize}


def predict(splits, train_result, split_name='dev'):
def predict(splits, train_result, split_name='dev', vectorize=True):
assess_dataset = splits[split_name]
assess_o, assess_y = assess_dataset.build_dataset()
test_X, _ = assess_dataset.featurize(
assess_o,
featurizers=train_result['featurizers'],
vectorizer=train_result['vectorizer'])
vectorizer=train_result['vectorizer'],
vectorize=vectorize)
predictions = {}
for rel in train_result['all_relations']:
predictions[rel] = train_result['models'][rel].predict(test_X[rel])
@@ -498,17 +531,20 @@ def experiment(
test_split='dev',
model_factory=(lambda: LogisticRegression(
fit_intercept=True, solver='liblinear', random_state=42)),
vectorize=True,
verbose=True):
train_result = train_models(
splits,
featurizers=featurizers,
split_name=train_split,
model_factory=model_factory,
vectorize=vectorize,
verbose=verbose)
predictions, test_y = predict(
splits,
train_result,
split_name=test_split)
split_name=test_split,
vectorize=vectorize)
evaluate_predictions(
predictions,
test_y,
@@ -517,7 +553,14 @@


def examine_model_weights(train_result, k=3, verbose=True):
feature_names = train_result['vectorizer'].get_feature_names()
vectorizer = train_result['vectorizer']

if vectorizer is None:
print("Model weights can be examined only if the featurizers "
"are based in dicts (i.e., if `vectorize=True`).")
return

feature_names = vectorizer.get_feature_names()
for rel, model in train_result['models'].items():
print('Highest and lowest feature weights for relation {}:\n'.format(rel))
try:
@@ -541,6 +584,7 @@ def find_new_relation_instances(
model_factory=(lambda: LogisticRegression(
fit_intercept=True, solver='liblinear', random_state=42)),
k=10,
vectorize=True,
verbose=True):
splits = dataset.build_splits()
# train models
@@ -549,6 +593,7 @@
split_name=train_split,
featurizers=featurizers,
model_factory=model_factory,
vectorize=vectorize,
verbose=True)
test_split = splits[test_split]
neg_o, neg_y = test_split.build_dataset(
@@ -557,7 +602,8 @@
neg_X, _ = test_split.featurize(
neg_o,
featurizers=featurizers,
vectorizer=train_result['vectorizer'])
vectorizer=train_result['vectorizer'],
vectorize=vectorize)
# Report highest confidence predictions:
for rel, model in train_result['models'].items():
print('Highest probability examples for relation {}:\n'.format(rel))
@@ -579,7 +625,8 @@ def bake_off_experiment(train_result, rel_ext_data_home, verbose=True):
test_X, _ = test_dataset.featurize(
test_o,
featurizers=train_result['featurizers'],
vectorizer=train_result['vectorizer'])
vectorizer=train_result['vectorizer'],
vectorize=train_result['vectorize'])
predictions = {}
for rel in train_result['all_relations']:
predictions[rel] = train_result['models'][rel].predict(test_X[rel])
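
Taken together, the changes above give `rel_ext.experiment` two calling conventions. A minimal end-to-end sketch of both, assuming `splits` from `dataset.build_splits()` as in the notebook (the dict featurizer is illustrative, modeled on the test file's dummy below):

# Dict-based featurizers (the default, vectorize=True): each takes
# (kbt, corpus, feature_counter) and returns a feature dict.
def bias_featurizer(kbt, corpus, feature_counter):
    return {"bias": 1}

dict_results = rel_ext.experiment(
    splits,
    train_split='train',
    test_split='dev',
    featurizers=[bias_featurizer],
    vectorize=True,
    verbose=False)

# Representation-based featurizer (vectorize=False): exactly one
# function taking (kbt, corpus) and returning a model-ready representation.
glove_results = rel_ext.experiment(
    splits,
    train_split='train',
    test_split='dev',
    featurizers=[glove_middle_featurizer],
    vectorize=False,
    verbose=False)
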
78 changes: 76 additions & 2 deletions test/test_rel_ext.py
@@ -1,6 +1,8 @@
import os
import pytest
import rel_ext
from sklearn.linear_model import LogisticRegression
import utils

__author__ = "Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2020"
@@ -26,6 +28,14 @@ def kb():
return rel_ext.KB(src_filename)


def dummy_vectorizing_feature_function(kbt, corpus, feature_counter):
return {"bias": 1}


def dummy_nonvectorizing_feature_function(kbt, corpus):
return utils.randvec(10)


def test_corpus_length(corpus):
assert len(corpus) == 331696

@@ -44,8 +54,72 @@ def test_dataset_build_splits(corpus, kb):
dat = dataset.build_splits(seed=1)


def test_dataset_featurize(corpus, kb):
def test_dataset_featurize_vectorize(corpus, kb):
dataset = rel_ext.Dataset(corpus, kb)
kbts_by_rel, _ = dataset.build_dataset()
kbts_by_rel, _ = dataset.build_dataset(sampling_rate=0.1)
featurizers = [lambda kbt, corpus, feature_counter: {"bias": 1}]
dataset.featurize(kbts_by_rel, featurizers)


def test_dataset_featurize_no_vectorize(corpus, kb):
dataset = rel_ext.Dataset(corpus, kb)
kbts_by_rel, _ = dataset.build_dataset(sampling_rate=0.1)
def featurizer(kbt, corpus):
return utils.randvec(10)
dataset.featurize(kbts_by_rel, [featurizer], vectorize=False)


@pytest.mark.parametrize("featurizer, vectorize", [
[dummy_nonvectorizing_feature_function, False],
[dummy_vectorizing_feature_function, True]
])
def test_experiment(featurizer, vectorize, corpus, kb):
dataset = rel_ext.Dataset(corpus, kb)
splits = dataset.build_splits(
split_names=['tiny_train', 'tiny_dev', 'rest'],
split_fracs=[0.05, 0.05, 0.90],
seed=1)
results = rel_ext.experiment(
splits,
train_split='tiny_train',
test_split='tiny_dev',
featurizers=[featurizer],
vectorize=vectorize,
verbose=False)


@pytest.mark.parametrize("featurizer, vectorize", [
[dummy_nonvectorizing_feature_function, False],
[dummy_vectorizing_feature_function, True]
])
def test_examine_model_weights(featurizer, vectorize, corpus, kb):
dataset = rel_ext.Dataset(corpus, kb)
splits = dataset.build_splits(
split_names=['tiny_train', 'tiny_dev', 'rest'],
split_fracs=[0.05, 0.05, 0.90],
seed=1)
results = rel_ext.experiment(
splits,
train_split='tiny_train',
test_split='tiny_dev',
featurizers=[featurizer],
vectorize=vectorize,
verbose=False)
rel_ext.examine_model_weights(results)


@pytest.mark.parametrize("featurizer, vectorize", [
[dummy_nonvectorizing_feature_function, False],
[dummy_vectorizing_feature_function, True]
])
def test_find_new_relation_instances(corpus, kb, featurizer, vectorize):
dataset = rel_ext.Dataset(corpus, kb)
rel_ext.find_new_relation_instances(
dataset,
[featurizer],
train_split='train',
test_split='dev',
model_factory=lambda: LogisticRegression(solver='liblinear'),
k=10,
vectorize=vectorize,
verbose=False)
