Refactored reduce_assoc to take embeddings as inputs, added zero-vector cleanup to retrofit.
luminoso-beaudoin committed Apr 30, 2018
1 parent cf4f42a commit 943680b
Showing 5 changed files with 95 additions and 40 deletions.
6 changes: 3 additions & 3 deletions Snakefile
@@ -250,8 +250,7 @@ rule read_conceptnet4:
    output:
        DATA + "/edges/conceptnet4/conceptnet4_flat_{num}.msgpack"
    run:
-        single_input = input[0]
-        shell("cn5-read conceptnet4 {single_input} {output}")
+        shell("cn5-read conceptnet4 {input} {output}")

rule read_dbpedia:
    input:
@@ -554,7 +553,8 @@ rule assoc_uniq:

rule reduce_assoc:
    input:
-        DATA + "/assoc/assoc.csv"
+        DATA + "/assoc/assoc.csv",
+        expand(DATA + "/vectors/{name}.h5", name=INPUT_EMBEDDINGS)
    output:
        DATA + "/assoc/reduced.csv"
    shell:
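For context, the new second input to the reduce_assoc rule above is just a list of embedding files produced by Snakemake's expand() helper. A rough sketch of what it evaluates to (the INPUT_EMBEDDINGS values are hypothetical, not taken from this diff):

# Sketch only: what expand(DATA + "/vectors/{name}.h5", name=INPUT_EMBEDDINGS) produces,
# assuming DATA and INPUT_EMBEDDINGS are defined earlier in the Snakefile.
DATA = "data"
INPUT_EMBEDDINGS = ["glove12-840B", "w2v-google-news"]   # hypothetical names
vector_files = [DATA + "/vectors/{}.h5".format(name) for name in INPUT_EMBEDDINGS]
# -> ["data/vectors/glove12-840B.h5", "data/vectors/w2v-google-news.h5"]
# Snakemake appends these paths to the rule's input, after assoc.csv.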
9 changes: 5 additions & 4 deletions conceptnet5/builders/cli.py
@@ -24,15 +24,16 @@ def run_combine(input, output):


@cli.command(name='reduce_assoc')
-@click.argument('input', type=click.Path(readable=True, dir_okay=False))
+@click.argument('input_filenames', nargs=-1, type=click.Path(readable=True, dir_okay=False))
@click.argument('output', type=click.Path(writable=True, dir_okay=False))
-def run_reduce_assoc(input, output):
+def run_reduce_assoc(input_filenames, output):
    """
-    Takes in a file of tab-separated simple associations, and removes
+    Takes in a file of tab-separated simple associations, one or more
+    hdf5 files defining vector embeddings, and removes from the associations
    low-frequency terms and associations that are judged unlikely to be
    useful by various filters.
    """
-    reduce_assoc(input, output)
+    reduce_assoc(input_filenames[0], input_filenames[1:], output)


@cli.command('prepare_morphology')
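With the new signature of the reduce_assoc command above, the first path argument is the association file and any remaining paths are embedding files. A minimal sketch of the underlying call the command now makes (the file paths are hypothetical, for illustration only):

from conceptnet5.builders.reduce_assoc import reduce_assoc

reduce_assoc(
    'data/assoc/assoc.csv',              # tab-separated associations (first argument)
    ['data/vectors/glove12-840B.h5'],    # one or more embedding HDF5 files (the rest)
    'data/assoc/reduced.csv'             # filtered output
)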
82 changes: 54 additions & 28 deletions conceptnet5/builders/reduce_assoc.py
@@ -7,6 +7,8 @@

from conceptnet5.relations import is_negative_relation
from conceptnet5.uri import is_concept, uri_prefix
+from conceptnet5.vectors.formats import load_hdf
+import pandas as pd


def concept_is_bad(uri):
@@ -77,14 +79,14 @@ def find_components(self):
                        stack.append(neighbor)

        return component_labels


-def reduce_assoc(filename, output_filename, cutoff=3, en_cutoff=3):
-    """
-    Takes in a file of tab-separated simple associations, and removes
-    uncommon associations and associations unlikely to be useful.
-
+def make_filtered_concepts(filename, cutoff=3, en_cutoff=3):
+    """
+    Takes in a file of tab-separated associations, and returns a set of
+    concepts from which those which are unlikely to be useful have been
+    removed.
    All concepts that occur fewer than `cutoff` times will be removed.
    All English concepts that occur fewer than `en_cutoff` times will be removed.
    """
@@ -109,12 +111,19 @@ def reduce_assoc(filename, output_filename, cutoff=3, en_cutoff=3):
            (not is_concept(concept) and count >= cutoff)
        )
    }
+    return filtered_concepts


+def make_graph(filename, filtered_concepts, bad_concept=concept_is_bad,
+               bad_relation=is_negative_relation):
+    """
+    Reads an association file and builds an (undirected) graph from it,
+    """
    graph = Graph()
    with open(filename, encoding='utf-8') as file:
        for line in file:
            left, right, value, dataset, rel = line.rstrip().split('\t', 4)
-            if concept_is_bad(left) or concept_is_bad(right) or is_negative_relation(rel):
+            if bad_concept(left) or bad_concept(right) or bad_relation(rel):
                continue
            fvalue = float(value)
            gleft = uri_prefix(left)
Expand All @@ -126,33 +135,50 @@ def reduce_assoc(filename, output_filename, cutoff=3, en_cutoff=3):
):
if gleft != gright:
graph.add_edge(gleft, gright, value, dataset, rel)
return graph


def read_embedding_vocabularies(filenames):
result = pd.Index([])
for filename in filenames:
vectors = load_hdf(filename)
result = result.union(vectors.index)
return result



def reduce_assoc(assoc_filename, embedding_filenames, output_filename,
cutoff=3, en_cutoff=3):
"""
Takes in a file of tab-separated simple associations, and removes
uncommon associations and associations unlikely to be useful. Also
requires one or more vector embedding files (from which only the
vocabularies are used; associations involving terms that have no
connection, no matter how distant, to the union of those vocabularies
will be removed).
All concepts that occur fewer than `cutoff` times will be removed.
All English concepts that occur fewer than `en_cutoff` times will be removed.
"""

filtered_concepts = make_filtered_concepts(assoc_filename, cutoff=cutoff,
en_cutoff=en_cutoff)

graph = make_graph(assoc_filename, filtered_concepts)

component_labels = graph.find_components()

component_sizes = defaultdict(int)
max_component_size = 0
for vertex in graph.vertices():
component_sizes[component_labels[vertex]] += 1
if component_sizes[component_labels[vertex]] > max_component_size:
max_component_size = component_sizes[component_labels[vertex]]

max_size_labels = [label for label in component_sizes.keys()
if component_sizes[label] == max_component_size]
assert len(max_size_labels) > 0
if len(max_size_labels) != 1:
print('Warning: largest component of ConceptNet graph is not unique.')
max_size_label = min(max_size_labels)

print('The ConceptNet graph given has {} vertices and {} components, and the largest component has size {}.'.
format(len(graph.vertices()),
len(component_sizes),
max_component_size))

embedding_vocab = read_embedding_vocabularies(embedding_filenames)

good_component_labels = set(label for term, label
in component_labels.items()
if term in embedding_vocab)

with open(output_filename, 'w', encoding='utf-8') as out:
for gleft, gright, value, dataset, rel in graph.edges():
if component_labels[gleft] != max_size_label:
if component_labels[gleft] not in good_component_labels:
continue
if component_labels[gright] != max_size_label:
if component_labels[gright] not in good_component_labels:
continue
line = '\t'.join([gleft, gright, value, dataset, rel])
print(line, file=out)
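To make the new filtering criterion concrete: an edge is kept only when its endpoints' connected component contains at least one term from the union of the embedding vocabularies. A toy illustration of the good_component_labels filter above (the terms are hypothetical, not from this diff):

# term -> component id, as returned by graph.find_components()
component_labels = {
    '/c/en/dog': 0, '/c/en/puppy': 0,   # connected to each other
    '/c/en/zzyzx': 1,                   # an isolated term
}
# union of the vocabularies read from the .h5 files
embedding_vocab = {'/c/en/dog'}

good_component_labels = set(label for term, label
                            in component_labels.items()
                            if term in embedding_vocab)
# -> {0}: edges touching '/c/en/puppy' survive because its component also
#    contains '/c/en/dog'; the component holding only '/c/en/zzyzx' is dropped.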
6 changes: 4 additions & 2 deletions conceptnet5/vectors/cli.py
@@ -52,14 +52,16 @@ def filter_word_vectors(dense_hdf_filename, vocab_filename):
@click.option('--iterations', '-i', default=5)
@click.option('--nshards', '-s', default=6)
@click.option('--verbose', '-v', count=True)
+@click.option('--max_cleanup_iters', '-m', default=20)
def run_retrofit(dense_hdf_filename, conceptnet_filename, output_filename,
-                 iterations=5, nshards=6, verbose=0):
+                 iterations=5, nshards=6, verbose=0, max_cleanup_iters=20):
    """
    Run retrofit, operating on a part of a frame at a time.
    """
    sharded_retrofit(
        dense_hdf_filename, conceptnet_filename, output_filename,
-        iterations=iterations, nshards=nshards, verbosity=verbose
+        iterations=iterations, nshards=nshards, verbosity=verbose,
+        max_cleanup_iters=max_cleanup_iters
    )


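The new --max_cleanup_iters option above is simply threaded through to sharded_retrofit. A minimal sketch of the equivalent direct call (the file paths are hypothetical, and the meaning of each positional argument is inferred from the parameter names):

from conceptnet5.vectors.retrofit import sharded_retrofit

sharded_retrofit(
    'data/vectors/glove12-840B.h5',   # dense embedding to retrofit
    'data/assoc/reduced.csv',         # reduced ConceptNet associations
    'data/vectors/retrofitted.h5',    # output
    iterations=5, nshards=6, verbosity=0,
    max_cleanup_iters=20              # new: cap on zero-vector cleanup passes
)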
32 changes: 29 additions & 3 deletions conceptnet5/vectors/retrofit.py
@@ -6,7 +6,8 @@


def sharded_retrofit(dense_hdf_filename, conceptnet_filename, output_filename,
-                     iterations=5, nshards=6, verbosity=0):
+                     iterations=5, nshards=6, verbosity=0,
+                     max_cleanup_iters=20):
    # frame_box is basically a reference to a single large DataFrame. The
    # DataFrame will at times be present or absent. When it's present, the list
    # contains one item, which is the DataFrame. When it's absent, the list
@@ -27,7 +28,7 @@ def sharded_retrofit(dense_hdf_filename, conceptnet_filename, output_filename,
        # up a lot of memory and we can reload it from disk later.
        frame_box.clear()

-        retrofitted = retrofit(combined_index, dense_frame, sparse_csr, iterations, verbosity)
+        retrofitted = retrofit(combined_index, dense_frame, sparse_csr, iterations, verbosity, max_cleanup_iters)
        save_hdf(retrofitted, temp_filename)
        del retrofitted

@@ -49,7 +50,8 @@ def join_shards(output_filename, nshards=6):
    save_hdf(dframe, output_filename)


-def retrofit(row_labels, dense_frame, sparse_csr, iterations=5, verbosity=0):
+def retrofit(row_labels, dense_frame, sparse_csr,
+             iterations=5, verbosity=0, max_cleanup_iters=20):
    """
    Retrofitting is a process of combining information from a machine-learned
    space of term vectors with further structured information about those
@@ -112,5 +114,29 @@ def retrofit(row_labels, dense_frame, sparse_csr, iterations=5, verbosity=0):
        vecs += orig_vecs
        vecs /= (weight_array + 1.)

+    # Clean up as many all-zero vectors as possible. Zero vectors
+    # can either come from components of the conceptnet graph that
+    # don't contain any terms from the embedding we are currently
+    # retrofitting (and there is nothing we can do about those here,
+    # but when retrofitting is done on that embedding they should be
+    # taken care of then) or from terms whose distance in the graph is
+    # larger than the number of retrofitting iterations used above; we
+    # propagate non-zero values to those terms by averaging over their
+    # non-zero neighbors. Note that this propagation can never reach
+    # the first class of terms, so we can't necessarily expect the
+    # number of zero vectors to go to zero at any one invocation of
+    # this code.
+    n_zero_indices_old = -1
+    for iteration in range(max_cleanup_iters):
+        zero_indices = (np.abs(vecs).sum(1) == 0)
+        n_zero_indices = np.sum(zero_indices)
+        if n_zero_indices == 0 or n_zero_indices == n_zero_indices_old:
+            break
+        n_zero_indices_old = n_zero_indices
+        vecs[zero_indices, :] = sparse_csr[zero_indices, :].dot(vecs)
+        normalize(vecs, norm='l2', copy=False)
+    else:
+        print('Warning: cleanup iteration limit exceeded.')

    retroframe = pd.DataFrame(data=vecs, index=row_labels, columns=dense_frame.columns)
    return retroframe
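The propagation step in the cleanup loop above can be seen in isolation on a toy example (illustrative only, and assuming, as the surrounding names suggest, that normalize is sklearn.preprocessing.normalize and sparse_csr is a SciPy CSR matrix of row-normalized edge weights):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

# Three terms, two dimensions; term 2 still has an all-zero vector.
vecs = np.array([[1.0, 0.0],
                 [0.0, 1.0],
                 [0.0, 0.0]])
# Term 2 is adjacent to terms 0 and 1 with equal weight.
sparse_csr = csr_matrix([[0.0, 0.0, 0.0],
                         [0.0, 0.0, 0.0],
                         [0.5, 0.5, 0.0]])

zero_indices = (np.abs(vecs).sum(1) == 0)                    # [False, False, True]
vecs[zero_indices, :] = sparse_csr[zero_indices, :].dot(vecs)
normalize(vecs, norm='l2', copy=False)
print(vecs[2])   # ~[0.707, 0.707]: the average of its neighbors, re-normalized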
