Fixes to the ConceptNet build process and some readers #148
Changes from 11 commits
@@ -31,6 +31,9 @@ WIKTIONARY_VERSIONS = {
 }
 WIKTIONARY_LANGUAGES = sorted(list(WIKTIONARY_VERSIONS))
 
+# Languages where morphemes should not be split anywhere except at spaces
+ATOMIC_SPACE_LANGUAGES = {'vi'}
+
 # Languages that the CLDR emoji data is available in. These match the original
 # filenames, not ConceptNet language codes; they are turned into ConceptNet
 # language codes by the reader.
@@ -50,7 +53,34 @@ WIKT_PARSER_VERSION = "1"
 
 RETROFIT_SHARDS = 6
 
 RAW_DATA_URL = "https://conceptnet.s3.amazonaws.com/raw-data/2016"
+# Dataset filenames
+# =================
+# The goal of reader steps is to produce Msgpack files, and later CSV files,
+# with these names.
+#
+# We distinguish *core dataset names*, which collectively determine the set of
+# terms that ConceptNet will attempt to represent, from the additional datasets
+# that will mainly be used to find more information about those terms.
+
+CORE_DATASET_NAMES = [
+    "jmdict/jmdict",
+    "nadya/nadya",
+    "ptt_petgame/api",
+    "opencyc/opencyc",
+    "verbosity/verbosity",
+    "wordnet/wordnet",
+    "cedict/cedict"
+]
+CORE_DATASET_NAMES += ["conceptnet4/conceptnet4_flat_{}".format(num) for num in range(10)]
+CORE_DATASET_NAMES += ["ptt_petgame/part{}".format(num) for num in range(1, 13)]
+CORE_DATASET_NAMES += ["wiktionary/{}".format(lang) for lang in WIKTIONARY_LANGUAGES]
+CORE_DATASET_NAMES += ["emoji/{}".format(lang) for lang in EMOJI_LANGUAGES]
+
+DATASET_NAMES = CORE_DATASET_NAMES + ["dbpedia/dbpedia_en"]
+
+RAW_DATA_URL = "https://zenodo.org/record/998169/files/conceptnet-raw-data-5.5.zip"
 PRECOMPUTED_DATA_PATH = "/precomputed-data/2016"
 PRECOMPUTED_DATA_URL = "https://conceptnet.s3.amazonaws.com" + PRECOMPUTED_DATA_PATH
 PRECOMPUTED_S3_UPLOAD = "s3://conceptnet" + PRECOMPUTED_DATA_PATH

Review comment (on the DATASET_NAMES line): In the […]

Reply: Oh whoops, I should have paid more attention to the merge conflict.
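For orientation, each of these dataset names corresponds to an edge file produced by a reader step. A minimal sketch of the naming scheme, assuming path templates like the ones visible in the read rules later in this diff (the diff does not state them as a general rule):

    DATA = "data"              # placeholder for the build's data directory
    name = "wordnet/wordnet"   # one entry from DATASET_NAMES

    # Reader steps emit Msgpack edge files, which are later collated into CSV.
    msgpack_path = "{}/edges/{}.msgpack".format(DATA, name)
    print(msgpack_path)        # data/edges/wordnet/wordnet.msgpack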
@@ -143,8 +173,8 @@ rule webdata:
 rule clean:
     shell:
         "for subdir in assertions assoc collated db edges psql tmp vectors stats; "
-        "do echo Removing %(data)s/$subdir; "
-        "rm -rf %(data)s/$subdir; done" % {'data': DATA}
+        "do echo Removing {DATA}/$subdir; "
+        "rm -rf {DATA}/$subdir; done"
 
 rule test:
     input:
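One note that applies to this change and the similar ones below: Snakemake formats shell commands itself, str.format-style, with module-level variables such as DATA in scope, so the Python %-interpolation was redundant. A minimal Snakefile sketch (the rule name is hypothetical):

    DATA = "data"

    rule clean_tmp:  # hypothetical rule, for illustration only
        shell:
            # Snakemake substitutes {DATA} from the global namespace
            "echo Removing {DATA}/tmp && rm -rf {DATA}/tmp"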
@@ -156,11 +186,19 @@ rule test:
 
 # Downloaders
 # ===========
-rule download_raw:
+rule download_raw_package:
     output:
+        DATA + "/raw/conceptnet-raw-data-5.5.zip"
+    shell:
+        "wget -nv {RAW_DATA_URL} -O {output}"
+
+rule extract_raw:
+    input:
+        DATA + "/raw/conceptnet-raw-data-5.5.zip"
+    output:
         DATA + "/raw/{dirname}/{filename}"
     shell:
-        "wget -nv {RAW_DATA_URL}/{wildcards.dirname}/{wildcards.filename} -O {output}"
+        "unzip {input} raw/{wildcards.dirname}/{wildcards.filename} -d {DATA}"
 
 rule download_conceptnet_ppmi:
     output:

Review comment (on the extract_raw rule): It looks like every time a single new file is added to the process, one would have to re-download the entire […]
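To make the two-step download concrete, here is how one target would resolve under extract_raw; the file name below is hypothetical, purely for illustration:

    # Requesting data/raw/wiktionary/enwiktionary.xml.bz2 (illustrative name)
    # matches the output pattern DATA + "/raw/{dirname}/{filename}" with
    #   wildcards.dirname  = "wiktionary"
    #   wildcards.filename = "enwiktionary.xml.bz2"
    # so the rule runs:
    #   unzip data/raw/conceptnet-raw-data-5.5.zip raw/wiktionary/enwiktionary.xml.bz2 -d data
    # unzip extracts only the named member from the already-downloaded zip.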
@@ -243,9 +281,9 @@ rule read_dbpedia:
     output:
         DATA + "/edges/dbpedia/dbpedia_en.msgpack",
     shell:
-        "cn5-read dbpedia %(data)s/raw/dbpedia "
+        "cn5-read dbpedia {DATA}/raw/dbpedia "
         "{output} "
-        "%(data)s/stats/core_concepts.txt " % {'data': DATA}
+        "{DATA}/stats/core_concepts.txt "
 
 rule read_jmdict:
     input:
@@ -297,9 +335,9 @@ rule prescan_wiktionary:
     output:
         DATA + "/db/wiktionary.db"
     shell:
-        "mkdir -p %(data)s/tmp && "
-        "cn5-read wiktionary_pre {input} %(data)s/tmp/wiktionary.db && "
-        "mv %(data)s/tmp/wiktionary.db {output}" % {'data': DATA}
+        "mkdir -p {DATA}/tmp && "
+        "cn5-read wiktionary_pre {input} {DATA}/tmp/wiktionary.db && "
+        "mv {DATA}/tmp/wiktionary.db {output}"
 
 rule read_wiktionary:
     input:
@@ -360,7 +398,7 @@ rule sort_edges:
     output:
         DATA + "/collated/sorted/edges.csv"
     shell:
-        "mkdir -p %(data)s/tmp && cat {input} | LC_ALL=C sort -T %(data)s/tmp | LC_ALL=C uniq > {output}" % {'data': DATA}
+        "mkdir -p {DATA}/tmp && cat {input} | LC_ALL=C sort -T {DATA}/tmp | LC_ALL=C uniq > {output}"
 
 rule combine_assertions:
     input:
@@ -385,7 +423,7 @@ rule prepare_db:
         DATA + "/psql/sources.csv",
         DATA + "/psql/relations.csv"
     shell:
-        "cn5-db prepare_data {input} %(data)s/psql" % {'data': DATA}
+        "cn5-db prepare_data {input} {DATA}/psql"
 
 rule gzip_db:
     input:
@@ -407,7 +445,7 @@ rule load_db:
     output:
         DATA + "/psql/done"
     shell:
-        "cn5-db load_data %(data)s/psql && touch {output}" % {'data': DATA}
+        "cn5-db load_data {DATA}/psql && touch {output}"
 
 
 # Collecting statistics
@@ -597,7 +635,7 @@ rule retrofit:
     resources:
         ram=16
     shell:
-        "cn5-vectors retrofit -s {RETROFIT_SHARDS} {input} %(data)s/vectors/{wildcards.name}-retrofit.h5" % {'data': DATA}
+        "cn5-vectors retrofit -s {RETROFIT_SHARDS} {input} {DATA}/vectors/{wildcards.name}-retrofit.h5"
 
 rule join_retrofit:
     input:
@@ -659,6 +697,37 @@ rule export_english_text:
         "cn5-vectors export_text -l en {input} {output}"
 
 
+# Morphology
+# ==========
+
+rule prepare_vocab:
+    input:
+        DATA + "/stats/core_concept_counts.txt"
+    output:
+        DATA + "/morph/vocab/{language}.txt"
+    shell:
+        "cn5-build prepare_morphology {wildcards.language} {input} {output}"
+
+rule morfessor_segmentation:
+    input:
+        DATA + "/morph/vocab/{language}.txt"
+    output:
+        DATA + "/morph/segments/{language}.txt"
+    run:
+        if wildcards.language in ATOMIC_SPACE_LANGUAGES:
+            shell("morfessor-train {input} -S {output} --traindata-list --nosplit-re '[^_].'")
+        else:
+            shell("morfessor-train {input} -S {output} -f '_' --traindata-list")
+
+rule subwords:
+    input:
+        DATA + "/morph/segments/{language}.txt",
+    output:
+        DATA + "/edges/morphology/subwords-{language}.msgpack"
+    shell:
+        "cn5-build subwords {wildcards.language} {input} {output}"
+
+
 # Evaluation
 # ==========
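On the Vietnamese special case: if I read Morfessor's --nosplit-re option correctly, the regex is tested against the two characters surrounding a candidate split point, and a match forbids the split. Since ConceptNet writes spaces as underscores, '[^_].' permits splits only after an underscore. A small Python check of that reading:

    import re

    # A split between two adjacent characters is forbidden when this matches.
    nosplit = re.compile(r'[^_].')

    print(bool(nosplit.match('ab')))   # True  -> no split inside a word
    print(bool(nosplit.match('_b')))   # False -> a split after '_' is allowed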
@@ -677,7 +746,7 @@ rule compare_embeddings:
     run:
         input_embeddings = input[:-2]
         input_embeddings_str = ' '.join(input_embeddings)
-        shell("cn5-vectors compare_embeddings %s {output}" % input_embeddings_str)
+        shell("cn5-vectors compare_embeddings {input_embeddings_str} {output}")
 
 rule comparison_graph:
     input:
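This fix works because shell() inside a run block formats its command with the variables in scope, making %-interpolation unnecessary there as well. A standalone approximation of the substitution (not Snakemake's actual internals):

    input_embeddings_str = "a.h5 b.h5"   # placeholder values
    output = "comparison.h5"
    cmd = "cn5-vectors compare_embeddings {input_embeddings_str} {output}".format(**locals())
    print(cmd)   # cn5-vectors compare_embeddings a.h5 b.h5 comparison.h5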
Review comment: I think this is still incorrect, because CORE_DATASET_NAMES gets updated with emoji files before EMOJI_LANGUAGES is overwritten in line 107.
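A minimal sketch of the ordering problem the comment describes (the language codes are placeholders):

    EMOJI_LANGUAGES = ['af', 'am']
    CORE_DATASET_NAMES = ["emoji/{}".format(lang) for lang in EMOJI_LANGUAGES]

    EMOJI_LANGUAGES = ['af', 'am', 'ar']   # later reassignment, as in line 107
    print(CORE_DATASET_NAMES)              # still ['emoji/af', 'emoji/am']

The comprehension consumes EMOJI_LANGUAGES at the moment it runs, so reassigning the variable afterwards does not update the already-built list.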