Skip to content

Commit

Permalink
Add separate join_retrofit step that joins shards efficiently
Browse files Browse the repository at this point in the history
  • Loading branch information
Rob Speer committed May 24, 2016
1 parent f64bfb6 commit 3924ac6
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 15 deletions.
12 changes: 10 additions & 2 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ WIKTIONARY_LANGUAGES = sorted(list(WIKTIONARY_VERSIONS))
# Increment this number when we incompatibly change the parser
WIKT_PARSER_VERSION = "1"

CHUNKS_OF_100 = [0, 100, 200, 300, 400, 500]
RETROFIT_SHARDS = 6

# Dataset filenames
# =================
Expand Down Expand Up @@ -382,7 +382,15 @@ rule retrofit:
input:
"data/vectors/merged.h5",
"data/assoc/reduced.csv"
output:
expand("data/vectors/retrofit.h5.shard{n}", n=range(RETROFIT_SHARDS))
shell:
"cn5-vectors retrofit -s {RETROFIT_SHARDS} -v {input} {output}"

rule join_retrofit:
input:
expand("data/vectors/retrofit.h5.shard{n}", n=range(RETROFIT_SHARDS))
output:
"data/vectors/retrofit.h5"
shell:
"cn5-vectors retrofit -v {input} {output}"
"cn5-vectors join_retrofit -s {RETROFIT_SHARDS} {output}"
8 changes: 5 additions & 3 deletions TODO.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,18 @@ Design of ConceptNet 5.5:
- [x] Handle 'sw-maps' correctly, no proliferation of N-triples, no double-escaping
- [x] Store the index in a reasonable constant database
- [x] Use the blacklist when making assertions
- [ ] Use symmetrical relations
- [ ] Use JSON-LD when possible
- [ ] Optionally build ConceptNet vectors in build process
- [x] Build ConceptNet vectors in build process
- include word2vec and GloVe inputs in raw/
- [ ] Automatically test ConceptNet vectors
- [x] Automatically test ConceptNet vectors
- [x] Build vectors using less than 16 GB RAM
- [ ] fix 'FormOf' links to self

ConceptNet 5.6 perhaps:

- [ ] Nodes should have labels that are stored separately from their edges
- [ ] Break down the and-or trees into more machine-readable structures
- [ ] Use symmetrical relations

Cleaning up the data:

Expand Down
9 changes: 8 additions & 1 deletion conceptnet5/vectors/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import click
from .formats import convert_glove, convert_word2vec, load_hdf, save_hdf
from .sparse_matrix_builder import build_from_conceptnet_table
from .retrofit import sharded_retrofit
from .retrofit import sharded_retrofit, join_shards
from .interpolate import merge_interpolate
from .evaluation.wordsim import evaluate

Expand All @@ -26,6 +26,13 @@ def run_retrofit(dense_hdf_filename, conceptnet_filename, output_filename,
)


@cli.command(name='join_retrofit')
@click.argument('filename', type=click.Path(writable=True, dir_okay=False))
@click.option('--nshards', '-s', default=6)
def run_join_retrofit(filename, nshards=6):
    """
    CLI entry point that reassembles retrofitted shard files into one matrix.

    FILENAME is the final output path; the shards are expected to live at
    FILENAME + '.shard0' ... '.shard{nshards-1}' (the naming convention used
    by `sharded_retrofit`). The signature default mirrors the click option
    default so direct Python callers get the same behavior as the CLI.
    """
    join_shards(filename, nshards)


@cli.command(name='convert_glove')
@click.argument('glove_filename', type=click.Path(readable=True, dir_okay=False))
@click.argument('output_filename', type=click.Path(writable=True, dir_okay=False))
Expand Down
15 changes: 13 additions & 2 deletions conceptnet5/vectors/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ def save_hdf(table, filename):
return table.to_hdf(filename, 'mat', encoding='utf-8')


def save_npy_and_labels(table, filebase):
    """
    Save a DataFrame as two files derived from `filebase`: the raw matrix
    as '<filebase>.npy' and the row labels as '<filebase>.labels.txt',
    one label per line.
    """
    matrix_path = filebase + '.npy'
    labels_path = filebase + '.labels.txt'
    np.save(matrix_path, table.values)
    save_index_as_labels(table.index, labels_path)


def convert_glove(glove_filename, output_filename, nrows):
"""
Convert GloVe data from a gzipped text file to a Feather dataframe.
Expand Down Expand Up @@ -98,11 +103,17 @@ def load_labels_and_npy(label_file, npy_file):
return pd.DataFrame(npy, index=labels, dtype='f')


def load_labels_as_index(label_file):
labels = [line.rstrip('\n') for line in open(label_file, encoding='utf-8')]
def load_labels_as_index(label_filename):
    """
    Read a UTF-8 text file containing one label per line, returning the
    labels (with trailing newlines stripped) as a pandas Index.

    Fix: the file is now opened in a `with` block, so the handle is closed
    deterministically instead of leaking until garbage collection.
    """
    with open(label_filename, encoding='utf-8') as label_file:
        labels = [line.rstrip('\n') for line in label_file]
    return pd.Index(labels)


def save_index_as_labels(index, label_filename):
    """
    Write each entry of `index` to a UTF-8 text file, one label per line.
    """
    with open(label_filename, 'w', encoding='utf-8') as out:
        out.writelines(str(label) + '\n' for label in index)


def load_csr(filename):
with np.load(filename) as npz:
mat = sparse.csr_matrix((npz['data'], npz['indices'], npz['indptr']), shape=npz['shape'])
Expand Down
23 changes: 16 additions & 7 deletions conceptnet5/vectors/retrofit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import numpy as np
from sklearn.preprocessing import normalize
from .sparse_matrix_builder import build_from_conceptnet_table
from .transforms import l2_normalize_rows
from .formats import load_hdf, save_hdf
from sklearn.preprocessing import normalize


def sharded_retrofit(dense_hdf_filename, conceptnet_filename, output_filename,
Expand Down Expand Up @@ -32,13 +32,22 @@ def sharded_retrofit(dense_hdf_filename, conceptnet_filename, output_filename,
save_hdf(retrofitted, temp_filename)
del retrofitted

shards = [load_hdf(output_filename + '.shard%d' % i)
for i in range(nshards)]
joined = pd.concat(shards, axis=1, ignore_index=True)
del shards

save_hdf(joined, output_filename)
return joined
def join_shards(output_filename, nshards=6):
    """
    Reassemble a retrofitted matrix from its column shards.

    Loads `output_filename + '.shard0'` ... `'.shard{nshards-1}'` one at a
    time (so only one shard plus the joined matrix is in memory), places each
    shard's columns side by side, L2-normalizes the rows in place, and saves
    the joined DataFrame to `output_filename`.

    Raises ValueError if `nshards` is less than 1, or if a shard's shape
    differs from the first shard's — previously a mismatched shard produced
    an opaque numpy broadcast error (or silently wrong column slicing when
    only the column count differed).
    """
    if nshards < 1:
        raise ValueError('nshards must be at least 1, got %d' % nshards)
    joined_matrix = None
    joined_labels = None
    expected_shape = None
    for i in range(nshards):
        shard = load_hdf(output_filename + '.shard%d' % i)
        nrows, ncols = shard.shape
        if joined_matrix is None:
            # Allocate the full-width matrix once, sized from the first shard.
            # dtype 'f' (float32) halves memory relative to float64 — this
            # function exists to keep peak RAM low.
            expected_shape = (nrows, ncols)
            joined_matrix = np.zeros((nrows, ncols * nshards), dtype='f')
            joined_labels = shard.index
        elif shard.shape != expected_shape:
            raise ValueError(
                'shard %d has shape %r, expected %r'
                % (i, shard.shape, expected_shape)
            )
        joined_matrix[:, (ncols * i):(ncols * (i + 1))] = shard.values
        # Release the shard before loading the next one to keep peak RSS down.
        del shard

    normalize(joined_matrix, axis=1, norm='l2', copy=False)
    dframe = pd.DataFrame(joined_matrix, index=joined_labels)
    save_hdf(dframe, output_filename)


def retrofit(row_labels, dense_frame, sparse_csr, iterations=5, verbosity=1):
Expand Down

0 comments on commit 3924ac6

Please sign in to comment.