Sharded processing in merge-intersect and debias; fixed snakefile dependencies.
luminoso-beaudoin committed Jun 6, 2018
1 parent a128e92 commit cbe2758
Showing 5 changed files with 121 additions and 59 deletions.
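The central change is a sharding pattern: instead of running a scikit-learn prediction or a pandas projection over an entire embedding frame at once, the new code in debias.py and merge.py walks the frame in fixed-size row ranges and fills a preallocated result, so only one shard's worth of intermediate data is alive at a time. The following condensed sketch shows that pattern; the helper is copied from the debias.py change below, while the small random frame and the classifier are toy stand-ins rather than ConceptNet data.

import numpy as np
import pandas as pd
from sklearn.svm import SVC


def make_shard_endpoints(total_length, shard_size=int(1e6)):
    # Same helper as the one added in conceptnet5/vectors/debias.py below.
    shard_end = 0
    shards = []
    while True:
        shard_start = shard_end
        shard_end = shard_start + shard_size
        if shard_end > total_length:
            shard_end = total_length
        if shard_start >= shard_end:
            break
        shards.append((shard_start, shard_end))
    return shards


# Toy stand-ins for the real embedding frame and the category classifier.
rng = np.random.RandomState(0)
frame = pd.DataFrame(rng.normal(size=(10000, 8)).astype(np.float32))
labels = (frame.values[:200, 0] > 0).astype(int)
category_predictor = SVC(probability=True).fit(frame.values[:200], labels)

# Sharded prediction: fill a preallocated vector one row range at a time
# instead of calling predict_proba on the whole frame at once.
applicability = np.zeros(shape=(len(frame),), dtype=np.float32)
for shard_start, shard_end in make_shard_endpoints(len(frame), shard_size=2048):
    applicability[shard_start:shard_end] = category_predictor.predict_proba(
        frame[shard_start:shard_end])[:, 1]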
64 changes: 41 additions & 23 deletions Snakefile
@@ -248,10 +248,12 @@ rule precompute_wiktionary:
rule read_conceptnet4:
input:
DATA + "/raw/conceptnet4/conceptnet4_flat_{num}.jsons",
DATA + "/db/wiktionary.db"
output:
DATA + "/edges/conceptnet4/conceptnet4_flat_{num}.msgpack"
run:
shell("cn5-read conceptnet4 {input} {output}")
single_input = input[0]
shell("cn5-read conceptnet4 {single_input} {output}")

rule read_dbpedia:
input:
@@ -276,11 +278,13 @@ rule read_jmdict:

rule read_nadya:
input:
DATA + "/raw/nadya/nadya-2017.csv"
DATA + "/raw/nadya/nadya-2017.csv",
DATA + "/db/wiktionary.db"
output:
DATA + "/edges/nadya/nadya.msgpack"
shell:
"cn5-read nadya {input} {output}"
run:
single_input = input[0]
shell("cn5-read nadya {single_input} {output}")

rule read_ptt_petgame:
input:
@@ -566,71 +570,85 @@ rule reduce_assoc:
# =========================
rule convert_word2vec:
input:
DATA + "/raw/vectors/GoogleNews-vectors-negative300.bin.gz"
DATA + "/raw/vectors/GoogleNews-vectors-negative300.bin.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/w2v-google-news.h5"
resources:
ram=24
shell:
"CONCEPTNET_DATA=data cn5-vectors convert_word2vec -n {SOURCE_EMBEDDING_ROWS} {input} {output}"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_word2vec -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")

rule convert_glove:
input:
DATA + "/raw/vectors/glove12.840B.300d.txt.gz"
DATA + "/raw/vectors/glove12.840B.300d.txt.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/glove12-840B.h5"
resources:
ram=24
shell:
"CONCEPTNET_DATA=data cn5-vectors convert_glove -n {SOURCE_EMBEDDING_ROWS} {input} {output}"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_glove -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")

rule convert_fasttext_crawl:
input:
DATA + "/raw/vectors/crawl-300d-2M.vec.gz"
DATA + "/raw/vectors/crawl-300d-2M.vec.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/crawl-300d-2M.h5"
resources:
ram=24
shell:
"CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} {input} {output}"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")

rule convert_fasttext:
input:
DATA + "/raw/vectors/fasttext-wiki-{lang}.vec.gz"
DATA + "/raw/vectors/fasttext-wiki-{lang}.vec.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/fasttext-wiki-{lang}.h5"
resources:
ram=24
shell:
"CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} -l {wildcards.lang} {input} {output}"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} -l {wildcards.lang} {single_input} {output}")

rule convert_lexvec:
input:
DATA + "/raw/vectors/lexvec.commoncrawl.300d.W+C.pos.vectors.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/lexvec-commoncrawl.h5"
resources:
ram=24
shell:
"CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} {input} {output}"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")

rule convert_opensubtitles_ft:
input:
DATA + "/raw/vectors/ft-opensubtitles.vec.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/fasttext-opensubtitles.h5"
resources:
ram=24
shell:
"CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {MULTILINGUAL_SOURCE_EMBEDDING_ROWS} {input} {output}"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {MULTILINGUAL_SOURCE_EMBEDDING_ROWS} {single_input} {output}")

rule convert_polyglot:
input:
DATA + "/raw/vectors/polyglot-{language}.pkl"
DATA + "/raw/vectors/polyglot-{language}.pkl",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/polyglot-{language}.h5"
shell:
"CONCEPTNET_DATA=data cn5-vectors convert_polyglot -l {wildcards.language} {input} {output}"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_polyglot -l {wildcards.language} {single_input} {output}")

rule retrofit:
input:
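Every Snakefile change above follows the same shape: DATA + "/db/wiktionary.db" is declared as an extra input so Snakemake reruns the rule when that database changes, and the command is issued from a run: block using input[0], because {input} in a shell: string expands to all declared inputs and would otherwise put the database path on the command line. Below is a sketch of the pattern with a hypothetical rule; the reader name and paths are placeholders, not rules from this Snakefile.

rule read_example:
    input:
        DATA + "/raw/example/example.csv",
        DATA + "/db/wiktionary.db"
    output:
        DATA + "/edges/example/example.msgpack"
    run:
        # input[0] is the CSV; input[1], the Wiktionary DB, is only a dependency.
        single_input = input[0]
        shell("cn5-read example {single_input} {output}")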
7 changes: 3 additions & 4 deletions conceptnet5/vectors/cli.py
@@ -104,8 +104,7 @@ def run_intersect(input_filenames, output_filename, projection_filename):
"""
Combine the vector knowledge contained in frames.
"""
frames = [load_hdf(filename) for filename in input_filenames]
intersected, projection = merge_intersect(frames)
intersected, projection = merge_intersect(input_filenames)
save_hdf(intersected, output_filename)
save_hdf(projection, projection_filename)

@@ -118,8 +117,8 @@ def run_debias(input_filename, output_filename):
Modify a frame to attempt to remove biases and prejudices.
"""
frame = load_hdf(input_filename)
debiased = de_bias_frame(frame)
save_hdf(debiased, output_filename)
de_bias_frame(frame)
save_hdf(frame, output_filename)


@cli.command(name='evaluate')
81 changes: 60 additions & 21 deletions conceptnet5/vectors/debias.py
@@ -359,6 +359,28 @@
]


def make_shard_endpoints(total_length, shard_size=int(1e6)):
"""
Partition the half-open integer interval [0, total_length) into a
sequence of half-open subintervals [s0,e0), [s1,e1), ... [s_n, e_n)
such that s0 = 0, s_(k+1) = e_k, e_n = total_length, and each of these
subintervals (except possibly the last) has length equal to the given
shard_size. Return the sequence of pairs of endpoints of the
subintervals.
"""
shard_end = 0
shards = []
while True:
shard_start = shard_end
shard_end = shard_start + shard_size
if shard_end > total_length:
shard_end = total_length
if shard_start >= shard_end:
break
shards.append((shard_start, shard_end))
return shards
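# Worked example of the helper above, with the default shard_size of 1,000,000:
#   make_shard_endpoints(2500000) == [(0, 1000000), (1000000, 2000000), (2000000, 2500000)]
#   make_shard_endpoints(0) == []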


def get_weighted_vector(frame, weighted_terms):
"""
Given a list of (term, weight) pairs, get a unit vector corresponding
@@ -473,30 +495,38 @@ def de_bias_binary(frame, pos_examples, neg_examples, left_examples, right_examples):
# The SVM can predict the probability, for each vector in the frame, that
# it's in each class. The positive class is column 1 of this prediction.
# This gives us a vector of how much each word in the vocabulary should be
# de-biased.
applicability = category_predictor.predict_proba(frame)[:, 1]
# de-biased. This is done on shards, to reduce peak memory consumption.
applicability = np.zeros(shape=(len(frame),), dtype=np.float32)
for shard_start, shard_end in make_shard_endpoints(len(frame)):
applicability[shard_start:shard_end] = category_predictor.predict_proba(
frame[shard_start:shard_end])[:, 1]
del category_predictor

# The bias axis is the vector difference between the average right example
# and the average left example.
bias_axis = get_category_axis(frame, right_examples) - get_category_axis(frame, left_examples)

# Make a modified version of the space that projects the bias axis to 0.
# Then weight each row of that space by "applicability", the probability
# that each row should be de-biased.
modified_component = reject_subspace(frame, [bias_axis]).mul(applicability, axis=0)
# that each row should be de-biased. This is also done on shards.
modified_component = np.zeros(shape=frame.values.shape, dtype=np.float32)
for shard_start, shard_end in make_shard_endpoints(len(frame)):
modified_component[shard_start:shard_end, :] = \
reject_subspace(frame[shard_start:shard_end], [bias_axis]).mul(
applicability[shard_start:shard_end], axis=0).values

# Make another component representing the vectors that should not be
# de-biased: the original space times (1 - applicability).
result = frame.mul(1 - applicability, axis=0)
np.multiply(1 - applicability.reshape((len(frame), 1)), frame.values,
out=frame.values)

# The sum of these two components is the de-biased space, where de-biasing
# applies to each row proportional to its applicability.
np.add(result.values, modified_component.values, out=result.values)
np.add(frame.values, modified_component, out=frame.values)
del modified_component

# L_2-normalize the resulting rows in-place.
normalize(result.values, norm='l2', copy=False)
return result
normalize(frame.values, norm='l2', copy=False)


def de_bias_category(frame, category_examples, bias_examples):
@@ -519,8 +549,11 @@ def de_bias_category(frame, category_examples, bias_examples):
category_predictor = two_class_svm(frame, category_examples, bias_examples)

# Predict the probability of each word in the vocabulary being in the
# category.
applicability = category_predictor.predict_proba(frame)[:, 1]
# category. This is done on shards, to reduce peak memory consumption.
applicability = np.zeros(shape=(len(frame),), dtype=np.float32)
for shard_start, shard_end in make_shard_endpoints(len(frame)):
applicability[shard_start:shard_end] = category_predictor.predict_proba(
frame[shard_start:shard_end])[:, 1]
del category_predictor

# Make a matrix of vectors representing the correlations to remove.
@@ -531,22 +564,26 @@ def de_bias_category(frame, category_examples, bias_examples):

# Make a modified version of the space that projects the bias vectors to 0.
# Then weight each row of that space by "applicability", the probability
# that each row should be de-biased.
modified_component = reject_subspace(frame, components_to_reject).mul(applicability, axis=0)
# that each row should be de-biased. This is also done on shards.
modified_component = np.zeros(shape=frame.values.shape, dtype=np.float32)
for shard_start, shard_end in make_shard_endpoints(len(frame)):
modified_component[shard_start:shard_end, :] = \
reject_subspace(frame[shard_start:shard_end], components_to_reject).mul(
applicability[shard_start:shard_end], axis=0).values
del components_to_reject

# Make another component representing the vectors that should not be
# de-biased: the original space times (1 - applicability).
result = frame.mul(1 - applicability, axis=0)
np.multiply(1 - applicability.reshape((len(frame), 1)), frame.values,
out=frame.values)

# The sum of these two components is the de-biased space, where de-biasing
# applies to each row proportional to its applicability.
np.add(result.values, modified_component.values, out=result.values)
np.add(frame.values, modified_component, out=frame.values)
del modified_component

# L_2-normalize the resulting rows in-place.
normalize(result.values, norm='l2', copy=False)
return result
normalize(frame.values, norm='l2', copy=False)


def de_bias_frame(frame):
Expand All @@ -558,9 +595,11 @@ def de_bias_frame(frame):
The resulting space attempts not to learn stereotyped associations with
anyone's race, color, religion, national origin, sex, gender presentation,
or sexual orientation.
The input frame is modified in-place; this can save considerable memory
with realistically sized semantic spaces.
"""
newframe = de_bias_category(frame, PEOPLE_BY_ETHNICITY, CULTURE_PREJUDICES + SEX_PREJUDICES)
newframe = de_bias_category(newframe, PEOPLE_BY_BELIEF, CULTURE_PREJUDICES + SEX_PREJUDICES)
newframe = de_bias_category(newframe, FEMALE_WORDS + MALE_WORDS + ORIENTATION_WORDS + AGE_WORDS, CULTURE_PREJUDICES + SEX_PREJUDICES)
newframe = de_bias_binary(newframe, GENDER_NEUTRAL_WORDS, GENDERED_WORDS, MALE_WORDS, FEMALE_WORDS)
return newframe
de_bias_category(frame, PEOPLE_BY_ETHNICITY, CULTURE_PREJUDICES + SEX_PREJUDICES)
de_bias_category(frame, PEOPLE_BY_BELIEF, CULTURE_PREJUDICES + SEX_PREJUDICES)
de_bias_category(frame, FEMALE_WORDS + MALE_WORDS + ORIENTATION_WORDS + AGE_WORDS, CULTURE_PREJUDICES + SEX_PREJUDICES)
de_bias_binary(frame, GENDER_NEUTRAL_WORDS, GENDERED_WORDS, MALE_WORDS, FEMALE_WORDS)
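Since de_bias_frame and its helpers now modify the frame in place and return None, callers keep using their own reference to the frame, as the updated run_debias in cli.py above does. A minimal usage sketch, assuming load_hdf and save_hdf are importable from conceptnet5.vectors.formats (as the merge.py import below suggests); the file paths are placeholders.

from conceptnet5.vectors.formats import load_hdf, save_hdf
from conceptnet5.vectors.debias import de_bias_frame

frame = load_hdf('data/vectors/example-input.h5')      # placeholder path
de_bias_frame(frame)                                   # edits frame in place, returns None
save_hdf(frame, 'data/vectors/example-debiased.h5')    # placeholder path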
26 changes: 16 additions & 10 deletions conceptnet5/vectors/merge.py
@@ -4,6 +4,7 @@

from conceptnet5.uri import get_uri_language
from conceptnet5.languages import CORE_LANGUAGES
from .formats import load_hdf


def dataframe_svd_projection(frame, k):
@@ -23,10 +24,11 @@ def dataframe_svd_projection(frame, k):
return uframe, Σ[:k], vframe


def concat_intersect(frames):
def concat_intersect(frame_filenames):
"""
Find the intersection of the labels of all the `frames`, and concatenate
the vectors that the frames have for each of those labels.
Find the intersection of the labels of all the frames in the given
files, and concatenate the vectors that the frames have for each of
those labels.
This is exactly what `pd.concat` is for. However, `pd.concat` uses too
much memory. We have to emulate what it does while building the result
@@ -36,14 +38,17 @@ def concat_intersect(frames):
# frame. As we scan through the frames, find out what the indices of those
# columns are.
frame_col_offsets = [0]
ncolumns = frames[0].shape[1]
assert len(frame_filenames) > 0
frame = load_hdf(frame_filenames[0])
ncolumns = frame.shape[1]

# Our label intersection starts out as the label set of the first frame.
label_intersection = set(frames[0].index)
label_intersection = set(frame.index)

# Narrow down the label intersection, and find the column offset of
# each subsequent frame.
for frame in frames[1:]:
for frame_filename in frame_filenames[1:]:
frame = load_hdf(frame_filename)
label_intersection &= set(frame.index)
frame_col_offsets.append(ncolumns)
ncolumns += frame.shape[1]
@@ -58,7 +63,8 @@ def concat_intersect(frames):

# Find the appropriate rows of each frame, extract them in the order of
# our labels, and set those as the appropriate columns of the merged array.
for frame, offset in zip(frames, frame_col_offsets):
for frame_filename, offset in zip(frame_filenames, frame_col_offsets):
frame = load_hdf(frame_filename)
width = frame.shape[1]
for i, label in enumerate(label_intersection):
joindata[i, offset:(offset + width)] = frame.loc[label].values
@@ -70,9 +76,9 @@ def concat_intersect(frames):
return joined


def merge_intersect(frames, subsample=20, k=300):
def merge_intersect(frame_filenames, subsample=20, k=300):
"""
Combine the vector knowledge contained in `frames` over the vocabulary
Combine the vector knowledge contained in the frames over the vocabulary
that they agree on, and use dimensionality reduction to mitigate the
redundancy of learning the same thing multiple ways.
Expand All @@ -82,7 +88,7 @@ def merge_intersect(frames, subsample=20, k=300):
"""
# Find the intersected vocabulary of the frames, and concatenate their
# vectors over that vocabulary.
joined = concat_intersect(frames)
joined = concat_intersect(frame_filenames)

# Find a subset of the labels that we'll use for calculating the
# dimensionality-reduced version. The labels we particularly care about
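After this change, merge_intersect and concat_intersect take HDF5 filenames rather than already-loaded DataFrames, loading each frame on demand (twice per file: once while computing the label intersection, once while filling the joined matrix) so that roughly one source frame is in memory at a time alongside the result. A minimal calling sketch mirroring run_intersect in cli.py above, again assuming save_hdf lives in conceptnet5.vectors.formats; the output paths are placeholders.

from conceptnet5.vectors.formats import save_hdf
from conceptnet5.vectors.merge import merge_intersect

input_filenames = [
    'data/vectors/glove12-840B.h5',
    'data/vectors/w2v-google-news.h5',
]
intersected, projection = merge_intersect(input_filenames)
save_hdf(intersected, 'data/vectors/merged.h5')             # placeholder path
save_hdf(projection, 'data/vectors/merged-projection.h5')   # placeholder path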
2 changes: 1 addition & 1 deletion conceptnet5/vectors/miniaturize.py
@@ -63,7 +63,7 @@ def miniaturize(frame, other_vocab=None, k=300, debias=True):
redecomposed = pd.DataFrame(U[:, :k], index=vocab, dtype='f')
del U, vocab
if debias:
redecomposed = de_bias_frame(redecomposed)
de_bias_frame(redecomposed)
mini = (redecomposed * 64).astype(np.int8)
mini.sort_index(inplace=True)
return mini
