Skip to content

Commit

Permalink
Add separate join_retrofit step that joins shards efficiently
Browse files Browse the repository at this point in the history
  • Loading branch information
Rob Speer committed May 24, 2016
1 parent f64bfb6 commit 3924ac6
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 15 deletions.
12 changes: 10 additions & 2 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ WIKTIONARY_LANGUAGES = sorted(list(WIKTIONARY_VERSIONS))
# Increment this number when we incompatibly change the parser
WIKT_PARSER_VERSION = "1"

CHUNKS_OF_100 = [0, 100, 200, 300, 400, 500]
RETROFIT_SHARDS = 6

# Dataset filenames
# =================
Expand Down Expand Up @@ -382,7 +382,15 @@ rule retrofit:
input:
"data/vectors/merged.h5",
"data/assoc/reduced.csv"
output:
expand("data/vectors/retrofit.h5.shard{n}", n=range(RETROFIT_SHARDS))
shell:
"cn5-vectors retrofit -s {RETROFIT_SHARDS} -v {input} {output}"

rule join_retrofit:
input:
expand("data/vectors/retrofit.h5.shard{n}", n=range(RETROFIT_SHARDS))
output:
"data/vectors/retrofit.h5"
shell:
"cn5-vectors retrofit -v {input} {output}"
"cn5-vectors join_retrofit -s {RETROFIT_SHARDS} {output}"
8 changes: 5 additions & 3 deletions TODO.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,18 @@ Design of ConceptNet 5.5:
- [x] Handle 'sw-maps' correctly, no proliferation of N-triples, no double-escaping
- [x] Store the index in a reasonable constant database
- [x] Use the blacklist when making assertions
- [ ] Use symmetrical relations
- [ ] Use JSON-LD when possible
- [ ] Optionally build ConceptNet vectors in build process
- [x] Build ConceptNet vectors in build process
- include word2vec and GloVe inputs in raw/
- [ ] Automatically test ConceptNet vectors
- [x] Automatically test ConceptNet vectors
- [x] Build vectors using less than 16 GB RAM
- [ ] fix 'FormOf' links to self

ConceptNet 5.6 perhaps:

- [ ] Nodes should have labels that are stored separately from their edges
- [ ] Break down the and-or trees into more machine-readable structures
- [ ] Use symmetrical relations

Cleaning up the data:

Expand Down
9 changes: 8 additions & 1 deletion conceptnet5/vectors/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import click
from .formats import convert_glove, convert_word2vec, load_hdf, save_hdf
from .sparse_matrix_builder import build_from_conceptnet_table
from .retrofit import sharded_retrofit
from .retrofit import sharded_retrofit, join_shards
from .interpolate import merge_interpolate
from .evaluation.wordsim import evaluate

Expand All @@ -26,6 +26,13 @@ def run_retrofit(dense_hdf_filename, conceptnet_filename, output_filename,
)


@cli.command(name='join_retrofit')
@click.argument('filename', type=click.Path(writable=True, dir_okay=False))
@click.option('--nshards', '-s', default=6)
def run_join_retrofit(filename, nshards=6):
    """
    CLI entry point that reassembles retrofitted shard files into one matrix.

    FILENAME is the final output path; the shards are expected to live at
    FILENAME + '.shard0' ... '.shard{nshards-1}' (the naming convention used
    by `sharded_retrofit`). The signature default mirrors the click option
    default so direct Python callers get the same behavior as the CLI.
    """
    join_shards(filename, nshards)


@cli.command(name='convert_glove')
@click.argument('glove_filename', type=click.Path(readable=True, dir_okay=False))
@click.argument('output_filename', type=click.Path(writable=True, dir_okay=False))
Expand Down
15 changes: 13 additions & 2 deletions conceptnet5/vectors/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ def save_hdf(table, filename):
return table.to_hdf(filename, 'mat', encoding='utf-8')


def save_npy_and_labels(table, filebase):
    """
    Save a DataFrame as two files derived from `filebase`: the raw matrix
    as '<filebase>.npy' and the row labels as '<filebase>.labels.txt',
    one label per line.
    """
    matrix_path = filebase + '.npy'
    labels_path = filebase + '.labels.txt'
    np.save(matrix_path, table.values)
    save_index_as_labels(table.index, labels_path)


def convert_glove(glove_filename, output_filename, nrows):
"""
Convert GloVe data from a gzipped text file to a Feather dataframe.
Expand Down Expand Up @@ -98,11 +103,17 @@ def load_labels_and_npy(label_file, npy_file):
return pd.DataFrame(npy, index=labels, dtype='f')


def load_labels_as_index(label_file):
labels = [line.rstrip('\n') for line in open(label_file, encoding='utf-8')]
def load_labels_as_index(label_filename):
    """
    Read a UTF-8 text file containing one label per line, returning the
    labels (with trailing newlines stripped) as a pandas Index.

    Fix: the file is now opened in a `with` block, so the handle is closed
    deterministically instead of leaking until garbage collection.
    """
    with open(label_filename, encoding='utf-8') as label_file:
        labels = [line.rstrip('\n') for line in label_file]
    return pd.Index(labels)


def save_index_as_labels(index, label_filename):
    """
    Write each entry of `index` to a UTF-8 text file, one label per line.
    """
    with open(label_filename, 'w', encoding='utf-8') as out:
        out.writelines(str(label) + '\n' for label in index)


def load_csr(filename):
with np.load(filename) as npz:
mat = sparse.csr_matrix((npz['data'], npz['indices'], npz['indptr']), shape=npz['shape'])
Expand Down
23 changes: 16 additions & 7 deletions conceptnet5/vectors/retrofit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import numpy as np
from sklearn.preprocessing import normalize
from .sparse_matrix_builder import build_from_conceptnet_table
from .transforms import l2_normalize_rows
from .formats import load_hdf, save_hdf
from sklearn.preprocessing import normalize


def sharded_retrofit(dense_hdf_filename, conceptnet_filename, output_filename,
Expand Down Expand Up @@ -32,13 +32,22 @@ def sharded_retrofit(dense_hdf_filename, conceptnet_filename, output_filename,
save_hdf(retrofitted, temp_filename)
del retrofitted

shards = [load_hdf(output_filename + '.shard%d' % i)
for i in range(nshards)]
joined = pd.concat(shards, axis=1, ignore_index=True)
del shards

save_hdf(joined, output_filename)
return joined
def join_shards(output_filename, nshards=6):
    """
    Reassemble a retrofitted matrix from its column shards.

    Loads `output_filename + '.shard0'` ... `'.shard{nshards-1}'` one at a
    time (so only one shard plus the joined matrix is in memory), places each
    shard's columns side by side, L2-normalizes the rows in place, and saves
    the joined DataFrame to `output_filename`.

    Raises ValueError if `nshards` is less than 1, or if a shard's shape
    differs from the first shard's — previously a mismatched shard produced
    an opaque numpy broadcast error (or silently wrong column slicing when
    only the column count differed).
    """
    if nshards < 1:
        raise ValueError('nshards must be at least 1, got %d' % nshards)
    joined_matrix = None
    joined_labels = None
    expected_shape = None
    for i in range(nshards):
        shard = load_hdf(output_filename + '.shard%d' % i)
        nrows, ncols = shard.shape
        if joined_matrix is None:
            # Allocate the full-width matrix once, sized from the first shard.
            # dtype 'f' (float32) halves memory relative to float64 — this
            # function exists to keep peak RAM low.
            expected_shape = (nrows, ncols)
            joined_matrix = np.zeros((nrows, ncols * nshards), dtype='f')
            joined_labels = shard.index
        elif shard.shape != expected_shape:
            raise ValueError(
                'shard %d has shape %r, expected %r'
                % (i, shard.shape, expected_shape)
            )
        joined_matrix[:, (ncols * i):(ncols * (i + 1))] = shard.values
        # Release the shard before loading the next one to keep peak RSS down.
        del shard

    normalize(joined_matrix, axis=1, norm='l2', copy=False)
    dframe = pd.DataFrame(joined_matrix, index=joined_labels)
    save_hdf(dframe, output_filename)


def retrofit(row_labels, dense_frame, sparse_csr, iterations=5, verbosity=1):
Expand Down

0 comments on commit 3924ac6

Please sign in to comment.