Refactored reduce_assoc to take embeddings as inputs, added zero-vector cleanup to retrofit.
luminoso-beaudoin committed Apr 30, 2018
1 parent cf4f42a commit 943680b
Showing 5 changed files with 95 additions and 40 deletions.
6 changes: 3 additions & 3 deletions Snakefile
@@ -250,8 +250,7 @@ rule read_conceptnet4:
    output:
        DATA + "/edges/conceptnet4/conceptnet4_flat_{num}.msgpack"
    run:
-        single_input = input[0]
-        shell("cn5-read conceptnet4 {single_input} {output}")
+        shell("cn5-read conceptnet4 {input} {output}")

rule read_dbpedia:
    input:
@@ -554,7 +553,8 @@ rule assoc_uniq:

rule reduce_assoc:
    input:
-        DATA + "/assoc/assoc.csv"
+        DATA + "/assoc/assoc.csv",
+        expand(DATA + "/vectors/{name}.h5", name=INPUT_EMBEDDINGS)
    output:
        DATA + "/assoc/reduced.csv"
    shell:
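For context, the new second input to the reduce_assoc rule above is just a list of embedding files produced by Snakemake's expand() helper. A rough sketch of what it evaluates to (the INPUT_EMBEDDINGS values are hypothetical, not taken from this diff):

# Sketch only: what expand(DATA + "/vectors/{name}.h5", name=INPUT_EMBEDDINGS) produces,
# assuming DATA and INPUT_EMBEDDINGS are defined earlier in the Snakefile.
DATA = "data"
INPUT_EMBEDDINGS = ["glove12-840B", "w2v-google-news"]   # hypothetical names
vector_files = [DATA + "/vectors/{}.h5".format(name) for name in INPUT_EMBEDDINGS]
# -> ["data/vectors/glove12-840B.h5", "data/vectors/w2v-google-news.h5"]
# Snakemake appends these paths to the rule's input, after assoc.csv.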
9 changes: 5 additions & 4 deletions conceptnet5/builders/cli.py
@@ -24,15 +24,16 @@ def run_combine(input, output):


@cli.command(name='reduce_assoc')
-@click.argument('input', type=click.Path(readable=True, dir_okay=False))
+@click.argument('input_filenames', nargs=-1, type=click.Path(readable=True, dir_okay=False))
@click.argument('output', type=click.Path(writable=True, dir_okay=False))
-def run_reduce_assoc(input, output):
+def run_reduce_assoc(input_filenames, output):
    """
-    Takes in a file of tab-separated simple associations, and removes
+    Takes in a file of tab-separated simple associations, one or more
+    hdf5 files defining vector embeddings, and removes from the associations
    low-frequency terms and associations that are judged unlikely to be
    useful by various filters.
    """
-    reduce_assoc(input, output)
+    reduce_assoc(input_filenames[0], input_filenames[1:], output)


@cli.command('prepare_morphology')
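With the new signature of the reduce_assoc command above, the first path argument is the association file and any remaining paths are embedding files. A minimal sketch of the underlying call the command now makes (the file paths are hypothetical, for illustration only):

from conceptnet5.builders.reduce_assoc import reduce_assoc

reduce_assoc(
    'data/assoc/assoc.csv',              # tab-separated associations (first argument)
    ['data/vectors/glove12-840B.h5'],    # one or more embedding HDF5 files (the rest)
    'data/assoc/reduced.csv'             # filtered output
)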
82 changes: 54 additions & 28 deletions conceptnet5/builders/reduce_assoc.py
@@ -7,6 +7,8 @@

from conceptnet5.relations import is_negative_relation
from conceptnet5.uri import is_concept, uri_prefix
+from conceptnet5.vectors.formats import load_hdf
+import pandas as pd


def concept_is_bad(uri):
@@ -77,14 +79,14 @@ def find_components(self):
                        stack.append(neighbor)

        return component_labels


-def reduce_assoc(filename, output_filename, cutoff=3, en_cutoff=3):
-    """
-    Takes in a file of tab-separated simple associations, and removes
-    uncommon associations and associations unlikely to be useful.
-
+def make_filtered_concepts(filename, cutoff=3, en_cutoff=3):
+    """
+    Takes in a file of tab-separated associations, and returns a set of
+    concepts from which those which are unlikely to be useful have been
+    removed.
    All concepts that occur fewer than `cutoff` times will be removed.
    All English concepts that occur fewer than `en_cutoff` times will be removed.
    """
@@ -109,12 +111,19 @@ def reduce_assoc(filename, output_filename, cutoff=3, en_cutoff=3):
            (not is_concept(concept) and count >= cutoff)
        )
    }
+    return filtered_concepts


+def make_graph(filename, filtered_concepts, bad_concept=concept_is_bad,
+               bad_relation=is_negative_relation):
+    """
+    Reads an association file and builds an (undirected) graph from it,
+    """
    graph = Graph()
    with open(filename, encoding='utf-8') as file:
        for line in file:
            left, right, value, dataset, rel = line.rstrip().split('\t', 4)
-            if concept_is_bad(left) or concept_is_bad(right) or is_negative_relation(rel):
+            if bad_concept(left) or bad_concept(right) or bad_relation(rel):
                continue
            fvalue = float(value)
            gleft = uri_prefix(left)
Expand All @@ -126,33 +135,50 @@ def reduce_assoc(filename, output_filename, cutoff=3, en_cutoff=3):
):
if gleft != gright:
graph.add_edge(gleft, gright, value, dataset, rel)
return graph


def read_embedding_vocabularies(filenames):
result = pd.Index([])
for filename in filenames:
vectors = load_hdf(filename)
result = result.union(vectors.index)
return result



def reduce_assoc(assoc_filename, embedding_filenames, output_filename,
cutoff=3, en_cutoff=3):
"""
Takes in a file of tab-separated simple associations, and removes
uncommon associations and associations unlikely to be useful. Also
requires one or more vector embedding files (from which only the
vocabularies are used; associations involving terms that have no
connection, no matter how distant, to the union of those vocabularies
will be removed).
All concepts that occur fewer than `cutoff` times will be removed.
All English concepts that occur fewer than `en_cutoff` times will be removed.
"""

filtered_concepts = make_filtered_concepts(assoc_filename, cutoff=cutoff,
en_cutoff=en_cutoff)

graph = make_graph(assoc_filename, filtered_concepts)

component_labels = graph.find_components()

component_sizes = defaultdict(int)
max_component_size = 0
for vertex in graph.vertices():
component_sizes[component_labels[vertex]] += 1
if component_sizes[component_labels[vertex]] > max_component_size:
max_component_size = component_sizes[component_labels[vertex]]

max_size_labels = [label for label in component_sizes.keys()
if component_sizes[label] == max_component_size]
assert len(max_size_labels) > 0
if len(max_size_labels) != 1:
print('Warning: largest component of ConceptNet graph is not unique.')
max_size_label = min(max_size_labels)

print('The ConceptNet graph given has {} vertices and {} components, and the largest component has size {}.'.
format(len(graph.vertices()),
len(component_sizes),
max_component_size))

embedding_vocab = read_embedding_vocabularies(embedding_filenames)

good_component_labels = set(label for term, label
in component_labels.items()
if term in embedding_vocab)

with open(output_filename, 'w', encoding='utf-8') as out:
for gleft, gright, value, dataset, rel in graph.edges():
if component_labels[gleft] != max_size_label:
if component_labels[gleft] not in good_component_labels:
continue
if component_labels[gright] != max_size_label:
if component_labels[gright] not in good_component_labels:
continue
line = '\t'.join([gleft, gright, value, dataset, rel])
print(line, file=out)
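To make the new filtering criterion concrete: an edge is kept only when its endpoints' connected component contains at least one term from the union of the embedding vocabularies. A toy illustration of the good_component_labels filter above (the terms are hypothetical, not from this diff):

# term -> component id, as returned by graph.find_components()
component_labels = {
    '/c/en/dog': 0, '/c/en/puppy': 0,   # connected to each other
    '/c/en/zzyzx': 1,                   # an isolated term
}
# union of the vocabularies read from the .h5 files
embedding_vocab = {'/c/en/dog'}

good_component_labels = set(label for term, label
                            in component_labels.items()
                            if term in embedding_vocab)
# -> {0}: edges touching '/c/en/puppy' survive because its component also
#    contains '/c/en/dog'; the component holding only '/c/en/zzyzx' is dropped.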
6 changes: 4 additions & 2 deletions conceptnet5/vectors/cli.py
@@ -52,14 +52,16 @@ def filter_word_vectors(dense_hdf_filename, vocab_filename):
@click.option('--iterations', '-i', default=5)
@click.option('--nshards', '-s', default=6)
@click.option('--verbose', '-v', count=True)
+@click.option('--max_cleanup_iters', '-m', default=20)
def run_retrofit(dense_hdf_filename, conceptnet_filename, output_filename,
-                 iterations=5, nshards=6, verbose=0):
+                 iterations=5, nshards=6, verbose=0, max_cleanup_iters=20):
    """
    Run retrofit, operating on a part of a frame at a time.
    """
    sharded_retrofit(
        dense_hdf_filename, conceptnet_filename, output_filename,
-        iterations=iterations, nshards=nshards, verbosity=verbose
+        iterations=iterations, nshards=nshards, verbosity=verbose,
+        max_cleanup_iters=max_cleanup_iters
    )


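The new --max_cleanup_iters option above is simply threaded through to sharded_retrofit. A minimal sketch of the equivalent direct call (the file paths are hypothetical, and the meaning of each positional argument is inferred from the parameter names):

from conceptnet5.vectors.retrofit import sharded_retrofit

sharded_retrofit(
    'data/vectors/glove12-840B.h5',   # dense embedding to retrofit
    'data/assoc/reduced.csv',         # reduced ConceptNet associations
    'data/vectors/retrofitted.h5',    # output
    iterations=5, nshards=6, verbosity=0,
    max_cleanup_iters=20              # new: cap on zero-vector cleanup passes
)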
32 changes: 29 additions & 3 deletions conceptnet5/vectors/retrofit.py
@@ -6,7 +6,8 @@


def sharded_retrofit(dense_hdf_filename, conceptnet_filename, output_filename,
-                     iterations=5, nshards=6, verbosity=0):
+                     iterations=5, nshards=6, verbosity=0,
+                     max_cleanup_iters=20):
    # frame_box is basically a reference to a single large DataFrame. The
    # DataFrame will at times be present or absent. When it's present, the list
    # contains one item, which is the DataFrame. When it's absent, the list
@@ -27,7 +28,7 @@ def sharded_retrofit(dense_hdf_filename, conceptnet_filename, output_filename,
        # up a lot of memory and we can reload it from disk later.
        frame_box.clear()

-        retrofitted = retrofit(combined_index, dense_frame, sparse_csr, iterations, verbosity)
+        retrofitted = retrofit(combined_index, dense_frame, sparse_csr, iterations, verbosity, max_cleanup_iters)
        save_hdf(retrofitted, temp_filename)
        del retrofitted

@@ -49,7 +50,8 @@ def join_shards(output_filename, nshards=6):
    save_hdf(dframe, output_filename)


-def retrofit(row_labels, dense_frame, sparse_csr, iterations=5, verbosity=0):
+def retrofit(row_labels, dense_frame, sparse_csr,
+             iterations=5, verbosity=0, max_cleanup_iters=20):
    """
    Retrofitting is a process of combining information from a machine-learned
    space of term vectors with further structured information about those
@@ -112,5 +114,29 @@ def retrofit(row_labels, dense_frame, sparse_csr, iterations=5, verbosity=0):
        vecs += orig_vecs
        vecs /= (weight_array + 1.)

+    # Clean up as many all-zero vectors as possible. Zero vectors
+    # can either come from components of the conceptnet graph that
+    # don't contain any terms from the embedding we are currently
+    # retrofitting (and there is nothing we can do about those here,
+    # but when retrofitting is done on that embedding they should be
+    # taken care of then) or from terms whose distance in the graph is
+    # larger than the number of retrofitting iterations used above; we
+    # propagate non-zero values to those terms by averaging over their
+    # non-zero neighbors. Note that this propagation can never reach
+    # the first class of terms, so we can't necessarily expect the
+    # number of zero vectors to go to zero at any one invocation of
+    # this code.
+    n_zero_indices_old = -1
+    for iteration in range(max_cleanup_iters):
+        zero_indices = (np.abs(vecs).sum(1) == 0)
+        n_zero_indices = np.sum(zero_indices)
+        if n_zero_indices == 0 or n_zero_indices == n_zero_indices_old:
+            break
+        n_zero_indices_old = n_zero_indices
+        vecs[zero_indices, :] = sparse_csr[zero_indices, :].dot(vecs)
+        normalize(vecs, norm='l2', copy=False)
+    else:
+        print('Warning: cleanup iteration limit exceeded.')

    retroframe = pd.DataFrame(data=vecs, index=row_labels, columns=dense_frame.columns)
    return retroframe
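The propagation step in the cleanup loop above can be seen in isolation on a toy example (illustrative only, and assuming, as the surrounding names suggest, that normalize is sklearn.preprocessing.normalize and sparse_csr is a SciPy CSR matrix of row-normalized edge weights):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

# Three terms, two dimensions; term 2 still has an all-zero vector.
vecs = np.array([[1.0, 0.0],
                 [0.0, 1.0],
                 [0.0, 0.0]])
# Term 2 is adjacent to terms 0 and 1 with equal weight.
sparse_csr = csr_matrix([[0.0, 0.0, 0.0],
                         [0.0, 0.0, 0.0],
                         [0.5, 0.5, 0.0]])

zero_indices = (np.abs(vecs).sum(1) == 0)                    # [False, False, True]
vecs[zero_indices, :] = sparse_csr[zero_indices, :].dot(vecs)
normalize(vecs, norm='l2', copy=False)
print(vecs[2])   # ~[0.707, 0.707]: the average of its neighbors, re-normalized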
