Sharded processing in merge-intersect and debias; fixed snakefile dependencies.
luminoso-beaudoin committed Jun 6, 2018
1 parent a128e92 commit cbe2758
Showing 5 changed files with 121 additions and 59 deletions.
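The central change is a sharding pattern: instead of running a scikit-learn prediction or a pandas projection over an entire embedding frame at once, the new code in debias.py and merge.py walks the frame in fixed-size row ranges and fills a preallocated result, so only one shard's worth of intermediate data is alive at a time. The following condensed sketch shows that pattern; the helper is copied from the debias.py change below, while the small random frame and the classifier are toy stand-ins rather than ConceptNet data.

import numpy as np
import pandas as pd
from sklearn.svm import SVC


def make_shard_endpoints(total_length, shard_size=int(1e6)):
    # Same helper as the one added in conceptnet5/vectors/debias.py below.
    shard_end = 0
    shards = []
    while True:
        shard_start = shard_end
        shard_end = shard_start + shard_size
        if shard_end > total_length:
            shard_end = total_length
        if shard_start >= shard_end:
            break
        shards.append((shard_start, shard_end))
    return shards


# Toy stand-ins for the real embedding frame and the category classifier.
rng = np.random.RandomState(0)
frame = pd.DataFrame(rng.normal(size=(10000, 8)).astype(np.float32))
labels = (frame.values[:200, 0] > 0).astype(int)
category_predictor = SVC(probability=True).fit(frame.values[:200], labels)

# Sharded prediction: fill a preallocated vector one row range at a time
# instead of calling predict_proba on the whole frame at once.
applicability = np.zeros(shape=(len(frame),), dtype=np.float32)
for shard_start, shard_end in make_shard_endpoints(len(frame), shard_size=2048):
    applicability[shard_start:shard_end] = category_predictor.predict_proba(
        frame[shard_start:shard_end])[:, 1]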
64 changes: 41 additions & 23 deletions Snakefile
@@ -248,10 +248,12 @@ rule precompute_wiktionary:
rule read_conceptnet4:
input:
DATA + "/raw/conceptnet4/conceptnet4_flat_{num}.jsons",
DATA + "/db/wiktionary.db"
output:
DATA + "/edges/conceptnet4/conceptnet4_flat_{num}.msgpack"
run:
shell("cn5-read conceptnet4 {input} {output}")
single_input = input[0]
shell("cn5-read conceptnet4 {single_input} {output}")

rule read_dbpedia:
input:
@@ -276,11 +278,13 @@ rule read_jmdict:

rule read_nadya:
input:
DATA + "/raw/nadya/nadya-2017.csv"
DATA + "/raw/nadya/nadya-2017.csv",
DATA + "/db/wiktionary.db"
output:
DATA + "/edges/nadya/nadya.msgpack"
shell:
"cn5-read nadya {input} {output}"
run:
single_input = input[0]
shell("cn5-read nadya {single_input} {output}")

rule read_ptt_petgame:
input:
@@ -566,71 +570,85 @@ rule reduce_assoc:
# =========================
rule convert_word2vec:
input:
DATA + "/raw/vectors/GoogleNews-vectors-negative300.bin.gz"
DATA + "/raw/vectors/GoogleNews-vectors-negative300.bin.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/w2v-google-news.h5"
resources:
ram=24
shell:
"CONCEPTNET_DATA=data cn5-vectors convert_word2vec -n {SOURCE_EMBEDDING_ROWS} {input} {output}"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_word2vec -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")

rule convert_glove:
input:
DATA + "/raw/vectors/glove12.840B.300d.txt.gz"
DATA + "/raw/vectors/glove12.840B.300d.txt.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/glove12-840B.h5"
resources:
ram=24
shell:
"CONCEPTNET_DATA=data cn5-vectors convert_glove -n {SOURCE_EMBEDDING_ROWS} {input} {output}"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_glove -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")

rule convert_fasttext_crawl:
input:
DATA + "/raw/vectors/crawl-300d-2M.vec.gz"
DATA + "/raw/vectors/crawl-300d-2M.vec.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/crawl-300d-2M.h5"
resources:
ram=24
shell:
"CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} {input} {output}"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")

rule convert_fasttext:
input:
DATA + "/raw/vectors/fasttext-wiki-{lang}.vec.gz"
DATA + "/raw/vectors/fasttext-wiki-{lang}.vec.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/fasttext-wiki-{lang}.h5"
resources:
ram=24
shell:
"CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} -l {wildcards.lang} {input} {output}"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} -l {wildcards.lang} {single_input} {output}")

rule convert_lexvec:
input:
DATA + "/raw/vectors/lexvec.commoncrawl.300d.W+C.pos.vectors.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/lexvec-commoncrawl.h5"
resources:
ram=24
shell:
"CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} {input} {output}"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")

rule convert_opensubtitles_ft:
input:
DATA + "/raw/vectors/ft-opensubtitles.vec.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/fasttext-opensubtitles.h5"
resources:
ram=24
shell:
"CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {MULTILINGUAL_SOURCE_EMBEDDING_ROWS} {input} {output}"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {MULTILINGUAL_SOURCE_EMBEDDING_ROWS} {single_input} {output}")

rule convert_polyglot:
input:
DATA + "/raw/vectors/polyglot-{language}.pkl"
DATA + "/raw/vectors/polyglot-{language}.pkl",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/polyglot-{language}.h5"
shell:
"CONCEPTNET_DATA=data cn5-vectors convert_polyglot -l {wildcards.language} {input} {output}"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_polyglot -l {wildcards.language} {single_input} {output}")

rule retrofit:
input:
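Every Snakefile change above follows the same shape: DATA + "/db/wiktionary.db" is declared as an extra input so Snakemake reruns the rule when that database changes, and the command is issued from a run: block using input[0], because {input} in a shell: string expands to all declared inputs and would otherwise put the database path on the command line. Below is a sketch of the pattern with a hypothetical rule; the reader name and paths are placeholders, not rules from this Snakefile.

rule read_example:
    input:
        DATA + "/raw/example/example.csv",
        DATA + "/db/wiktionary.db"
    output:
        DATA + "/edges/example/example.msgpack"
    run:
        # input[0] is the CSV; input[1], the Wiktionary DB, is only a dependency.
        single_input = input[0]
        shell("cn5-read example {single_input} {output}")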
7 changes: 3 additions & 4 deletions conceptnet5/vectors/cli.py
@@ -104,8 +104,7 @@ def run_intersect(input_filenames, output_filename, projection_filename):
"""
Combine the vector knowledge contained in frames.
"""
frames = [load_hdf(filename) for filename in input_filenames]
intersected, projection = merge_intersect(frames)
intersected, projection = merge_intersect(input_filenames)
save_hdf(intersected, output_filename)
save_hdf(projection, projection_filename)

@@ -118,8 +117,8 @@ def run_debias(input_filename, output_filename):
Modify a frame to attempt to remove biases and prejudices.
"""
frame = load_hdf(input_filename)
debiased = de_bias_frame(frame)
save_hdf(debiased, output_filename)
de_bias_frame(frame)
save_hdf(frame, output_filename)


@cli.command(name='evaluate')
81 changes: 60 additions & 21 deletions conceptnet5/vectors/debias.py
@@ -359,6 +359,28 @@
]


def make_shard_endpoints(total_length, shard_size=int(1e6)):
"""
Partition the half-open integer interval [0, total_length) into a
sequence of half-open subintervals [s0,e0), [s1,e1), ... [s_n, e_n)
such that s0 = 0, s_(k+1) = e_k, e_n = total_length, and each of these
subintervals (except possibly the last) has length equal to the given
shard_size. Return the sequence of pairs of endpoints of the
subintervals.
"""
shard_end = 0
shards = []
while True:
shard_start = shard_end
shard_end = shard_start + shard_size
if shard_end > total_length:
shard_end = total_length
if shard_start >= shard_end:
break
shards.append((shard_start, shard_end))
return shards
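# Worked example of the helper above, with the default shard_size of 1,000,000:
#   make_shard_endpoints(2500000) == [(0, 1000000), (1000000, 2000000), (2000000, 2500000)]
#   make_shard_endpoints(0) == []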


def get_weighted_vector(frame, weighted_terms):
"""
Given a list of (term, weight) pairs, get a unit vector corresponding
@@ -473,30 +495,38 @@ def de_bias_binary(frame, pos_examples, neg_examples, left_examples, right_examples):
# The SVM can predict the probability, for each vector in the frame, that
# it's in each class. The positive class is column 1 of this prediction.
# This gives us a vector of how much each word in the vocabulary should be
# de-biased.
applicability = category_predictor.predict_proba(frame)[:, 1]
# de-biased. This is done on shards, to reduce peak memory consumption.
applicability = np.zeros(shape=(len(frame),), dtype=np.float32)
for shard_start, shard_end in make_shard_endpoints(len(frame)):
applicability[shard_start:shard_end] = category_predictor.predict_proba(
frame[shard_start:shard_end])[:, 1]
del category_predictor

# The bias axis is the vector difference between the average right example
# and the average left example.
bias_axis = get_category_axis(frame, right_examples) - get_category_axis(frame, left_examples)

# Make a modified version of the space that projects the bias axis to 0.
# Then weight each row of that space by "applicability", the probability
# that each row should be de-biased.
modified_component = reject_subspace(frame, [bias_axis]).mul(applicability, axis=0)
# that each row should be de-biased. This is also done on shards.
modified_component = np.zeros(shape=frame.values.shape, dtype=np.float32)
for shard_start, shard_end in make_shard_endpoints(len(frame)):
modified_component[shard_start:shard_end, :] = \
reject_subspace(frame[shard_start:shard_end], [bias_axis]).mul(
applicability[shard_start:shard_end], axis=0).values

# Make another component representing the vectors that should not be
# de-biased: the original space times (1 - applicability).
result = frame.mul(1 - applicability, axis=0)
np.multiply(1 - applicability.reshape((len(frame), 1)), frame.values,
out=frame.values)

# The sum of these two components is the de-biased space, where de-biasing
# applies to each row proportional to its applicability.
np.add(result.values, modified_component.values, out=result.values)
np.add(frame.values, modified_component, out=frame.values)
del modified_component

# L_2-normalize the resulting rows in-place.
normalize(result.values, norm='l2', copy=False)
return result
normalize(frame.values, norm='l2', copy=False)


def de_bias_category(frame, category_examples, bias_examples):
@@ -519,8 +549,11 @@ def de_bias_category(frame, category_examples, bias_examples):
category_predictor = two_class_svm(frame, category_examples, bias_examples)

# Predict the probability of each word in the vocabulary being in the
# category.
applicability = category_predictor.predict_proba(frame)[:, 1]
# category. This is done on shards, to reduce peak memory consumption.
applicability = np.zeros(shape=(len(frame),), dtype=np.float32)
for shard_start, shard_end in make_shard_endpoints(len(frame)):
applicability[shard_start:shard_end] = category_predictor.predict_proba(
frame[shard_start:shard_end])[:, 1]
del category_predictor

# Make a matrix of vectors representing the correlations to remove.
@@ -531,22 +564,26 @@ def de_bias_category(frame, category_examples, bias_examples):

# Make a modified version of the space that projects the bias vectors to 0.
# Then weight each row of that space by "applicability", the probability
# that each row should be de-biased.
modified_component = reject_subspace(frame, components_to_reject).mul(applicability, axis=0)
# that each row should be de-biased. This is also done on shards.
modified_component = np.zeros(shape=frame.values.shape, dtype=np.float32)
for shard_start, shard_end in make_shard_endpoints(len(frame)):
modified_component[shard_start:shard_end, :] = \
reject_subspace(frame[shard_start:shard_end], components_to_reject).mul(
applicability[shard_start:shard_end], axis=0).values
del components_to_reject

# Make another component representing the vectors that should not be
# de-biased: the original space times (1 - applicability).
result = frame.mul(1 - applicability, axis=0)
np.multiply(1 - applicability.reshape((len(frame), 1)), frame.values,
out=frame.values)

# The sum of these two components is the de-biased space, where de-biasing
# applies to each row proportional to its applicability.
np.add(result.values, modified_component.values, out=result.values)
np.add(frame.values, modified_component, out=frame.values)
del modified_component

# L_2-normalize the resulting rows in-place.
normalize(result.values, norm='l2', copy=False)
return result
normalize(frame.values, norm='l2', copy=False)


def de_bias_frame(frame):
Expand All @@ -558,9 +595,11 @@ def de_bias_frame(frame):
The resulting space attempts not to learn stereotyped associations with
anyone's race, color, religion, national origin, sex, gender presentation,
or sexual orientation.
The input frame is modified in-place; this can save considerable memory
with realistically sized semantic spaces.
"""
newframe = de_bias_category(frame, PEOPLE_BY_ETHNICITY, CULTURE_PREJUDICES + SEX_PREJUDICES)
newframe = de_bias_category(newframe, PEOPLE_BY_BELIEF, CULTURE_PREJUDICES + SEX_PREJUDICES)
newframe = de_bias_category(newframe, FEMALE_WORDS + MALE_WORDS + ORIENTATION_WORDS + AGE_WORDS, CULTURE_PREJUDICES + SEX_PREJUDICES)
newframe = de_bias_binary(newframe, GENDER_NEUTRAL_WORDS, GENDERED_WORDS, MALE_WORDS, FEMALE_WORDS)
return newframe
de_bias_category(frame, PEOPLE_BY_ETHNICITY, CULTURE_PREJUDICES + SEX_PREJUDICES)
de_bias_category(frame, PEOPLE_BY_BELIEF, CULTURE_PREJUDICES + SEX_PREJUDICES)
de_bias_category(frame, FEMALE_WORDS + MALE_WORDS + ORIENTATION_WORDS + AGE_WORDS, CULTURE_PREJUDICES + SEX_PREJUDICES)
de_bias_binary(frame, GENDER_NEUTRAL_WORDS, GENDERED_WORDS, MALE_WORDS, FEMALE_WORDS)
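Since de_bias_frame and its helpers now modify the frame in place and return None, callers keep using their own reference to the frame, as the updated run_debias in cli.py above does. A minimal usage sketch, assuming load_hdf and save_hdf are importable from conceptnet5.vectors.formats (as the merge.py import below suggests); the file paths are placeholders.

from conceptnet5.vectors.formats import load_hdf, save_hdf
from conceptnet5.vectors.debias import de_bias_frame

frame = load_hdf('data/vectors/example-input.h5')      # placeholder path
de_bias_frame(frame)                                   # edits frame in place, returns None
save_hdf(frame, 'data/vectors/example-debiased.h5')    # placeholder path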
26 changes: 16 additions & 10 deletions conceptnet5/vectors/merge.py
@@ -4,6 +4,7 @@

from conceptnet5.uri import get_uri_language
from conceptnet5.languages import CORE_LANGUAGES
from .formats import load_hdf


def dataframe_svd_projection(frame, k):
@@ -23,10 +24,11 @@ def dataframe_svd_projection(frame, k):
return uframe, Σ[:k], vframe


def concat_intersect(frames):
def concat_intersect(frame_filenames):
"""
Find the intersection of the labels of all the `frames`, and concatenate
the vectors that the frames have for each of those labels.
Find the intersection of the labels of all the frames in the given
files, and concatenate the vectors that the frames have for each of
those labels.
This is exactly what `pd.concat` is for. However, `pd.concat` uses too
much memory. We have to emulate what it does while building the result
@@ -36,14 +38,17 @@ def concat_intersect(frames):
# frame. As we scan through the frames, find out what the indices of those
# columns are.
frame_col_offsets = [0]
ncolumns = frames[0].shape[1]
assert len(frame_filenames) > 0
frame = load_hdf(frame_filenames[0])
ncolumns = frame.shape[1]

# Our label intersection starts out as the label set of the first frame.
label_intersection = set(frames[0].index)
label_intersection = set(frame.index)

# Narrow down the label intersection, and find the column offset of
# each subsequent frame.
for frame in frames[1:]:
for frame_filename in frame_filenames[1:]:
frame = load_hdf(frame_filename)
label_intersection &= set(frame.index)
frame_col_offsets.append(ncolumns)
ncolumns += frame.shape[1]
@@ -58,7 +63,8 @@ def concat_intersect(frames):

# Find the appropriate rows of each frame, extract them in the order of
# our labels, and set those as the appropriate columns of the merged array.
for frame, offset in zip(frames, frame_col_offsets):
for frame_filename, offset in zip(frame_filenames, frame_col_offsets):
frame = load_hdf(frame_filename)
width = frame.shape[1]
for i, label in enumerate(label_intersection):
joindata[i, offset:(offset + width)] = frame.loc[label].values
@@ -70,9 +76,9 @@ def concat_intersect(frames):
return joined


def merge_intersect(frames, subsample=20, k=300):
def merge_intersect(frame_filenames, subsample=20, k=300):
"""
Combine the vector knowledge contained in `frames` over the vocabulary
Combine the vector knowledge contained in the frames over the vocabulary
that they agree on, and use dimensionality reduction to mitigate the
redundancy of learning the same thing multiple ways.
Expand All @@ -82,7 +88,7 @@ def merge_intersect(frames, subsample=20, k=300):
"""
# Find the intersected vocabulary of the frames, and concatenate their
# vectors over that vocabulary.
joined = concat_intersect(frames)
joined = concat_intersect(frame_filenames)

# Find a subset of the labels that we'll use for calculating the
# dimensionality-reduced version. The labels we particularly care about
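After this change, merge_intersect and concat_intersect take HDF5 filenames rather than already-loaded DataFrames, loading each frame on demand (twice per file: once while computing the label intersection, once while filling the joined matrix) so that roughly one source frame is in memory at a time alongside the result. A minimal calling sketch mirroring run_intersect in cli.py above, again assuming save_hdf lives in conceptnet5.vectors.formats; the output paths are placeholders.

from conceptnet5.vectors.formats import save_hdf
from conceptnet5.vectors.merge import merge_intersect

input_filenames = [
    'data/vectors/glove12-840B.h5',
    'data/vectors/w2v-google-news.h5',
]
intersected, projection = merge_intersect(input_filenames)
save_hdf(intersected, 'data/vectors/merged.h5')             # placeholder path
save_hdf(projection, 'data/vectors/merged-projection.h5')   # placeholder path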
2 changes: 1 addition & 1 deletion conceptnet5/vectors/miniaturize.py
@@ -63,7 +63,7 @@ def miniaturize(frame, other_vocab=None, k=300, debias=True):
redecomposed = pd.DataFrame(U[:, :k], index=vocab, dtype='f')
del U, vocab
if debias:
redecomposed = de_bias_frame(redecomposed)
de_bias_frame(redecomposed)
mini = (redecomposed * 64).astype(np.int8)
mini.sort_index(inplace=True)
return mini
