Merge pull request #148 from commonsense/setup-fixes
Fixes to the ConceptNet build process and some readers
jlowryduda authored Feb 2, 2018
2 parents 709d757 + acc705e commit 1414015
Showing 6 changed files with 64 additions and 37 deletions.
65 changes: 38 additions & 27 deletions Snakefile
@@ -37,6 +37,9 @@ WIKTIONARY_VERSIONS = {
}
WIKTIONARY_LANGUAGES = sorted(list(WIKTIONARY_VERSIONS))

+# Languages where morphemes should not be split anywhere except at spaces
+ATOMIC_SPACE_LANGUAGES = {'vi'}
+
# Languages that the CLDR emoji data is available in. These match the original
# filenames, not ConceptNet language codes; they are turned into ConceptNet
# language codes by the reader.
@@ -56,7 +59,17 @@ WIKT_PARSER_VERSION = "1"

RETROFIT_SHARDS = 6

RAW_DATA_URL = "https://conceptnet.s3.amazonaws.com/raw-data/2016"
+# Dataset filenames
+# =================
+# The goal of reader steps is to produce Msgpack files, and later CSV files,
+# with these names.
+#
+# We distinguish *core dataset names*, which collectively determine the set of
+# terms that ConceptNet will attempt to represent, from the additional datasets
+# that will mainly be used to find more information about those terms.
+
+
+RAW_DATA_URL = "https://zenodo.org/record/998169/files/conceptnet-raw-data-5.5.zip"
PRECOMPUTED_DATA_PATH = "/precomputed-data/2016"
PRECOMPUTED_DATA_URL = "https://conceptnet.s3.amazonaws.com" + PRECOMPUTED_DATA_PATH
PRECOMPUTED_S3_UPLOAD = "s3://conceptnet" + PRECOMPUTED_DATA_PATH
@@ -85,16 +98,6 @@ if TESTMODE:
EMOJI_LANGUAGES = ['en', 'en_001']


-# Dataset filenames
-# =================
-# The goal of reader steps is to produce Msgpack files, and later CSV files,
-# with these names.
-#
-# We distinguish *core dataset names*, which collectively determine the set of
-# terms that ConceptNet will attempt to represent, from the additional datasets
-# that will mainly be used to find more information about those terms.
-
-
CORE_DATASET_NAMES = [
"jmdict/jmdict",
"nadya/nadya",
@@ -151,8 +154,8 @@ rule webdata:
rule clean:
shell:
"for subdir in assertions assoc collated db edges psql tmp vectors stats; "
"do echo Removing %(data)s/$subdir; "
"rm -rf %(data)s/$subdir; done" % {'data': DATA}
"do echo Removing {DATA}/$subdir; "
"rm -rf {DATA}/$subdir; done"

rule test:
input:
@@ -164,11 +167,19 @@ rule test:

# Downloaders
# ===========
-rule download_raw:
+rule download_raw_package:
+    output:
+        DATA + "/raw/conceptnet-raw-data-5.5.zip"
+    shell:
+        "wget -nv {RAW_DATA_URL} -O {output}"
+
+rule extract_raw:
+    input:
+        DATA + "/raw/conceptnet-raw-data-5.5.zip"
output:
DATA + "/raw/{dirname}/{filename}"
shell:
"wget -nv {RAW_DATA_URL}/{wildcards.dirname}/{wildcards.filename} -O {output}"
"unzip {input} raw/{wildcards.dirname}/{wildcards.filename} -d {DATA}"

rule download_conceptnet_ppmi:
output:
@@ -251,9 +262,9 @@ rule read_dbpedia:
output:
DATA + "/edges/dbpedia/dbpedia_en.msgpack",
shell:
"cn5-read dbpedia %(data)s/raw/dbpedia "
"cn5-read dbpedia {DATA}/raw/dbpedia "
"{output} "
"%(data)s/stats/core_concepts.txt " % {'data': DATA}
"{DATA}/stats/core_concepts.txt "

rule read_jmdict:
input:
@@ -305,9 +316,9 @@ rule prescan_wiktionary:
output:
DATA + "/db/wiktionary.db"
shell:
"mkdir -p %(data)s/tmp && "
"cn5-read wiktionary_pre {input} %(data)s/tmp/wiktionary.db && "
"mv %(data)s/tmp/wiktionary.db {output}" % {'data': DATA}
"mkdir -p {DATA}/tmp && "
"cn5-read wiktionary_pre {input} {DATA}/tmp/wiktionary.db && "
"mv {DATA}/tmp/wiktionary.db {output}"

rule read_wiktionary:
input:
@@ -368,7 +379,7 @@ rule sort_edges:
output:
DATA + "/collated/sorted/edges.csv"
shell:
"mkdir -p %(data)s/tmp && cat {input} | LC_ALL=C sort -T %(data)s/tmp | LC_ALL=C uniq > {output}" % {'data': DATA}
"mkdir -p {DATA}/tmp && cat {input} | LC_ALL=C sort -T {DATA}/tmp | LC_ALL=C uniq > {output}"

rule combine_assertions:
input:
@@ -393,7 +404,7 @@ rule prepare_db:
DATA + "/psql/sources.csv",
DATA + "/psql/relations.csv"
shell:
"cn5-db prepare_data {input} %(data)s/psql" % {'data': DATA}
"cn5-db prepare_data {input} {DATA}/psql"

rule gzip_db:
input:
@@ -415,7 +426,7 @@ rule load_db:
output:
DATA + "/psql/done"
shell:
"cn5-db load_data %(data)s/psql && touch {output}" % {'data': DATA}
"cn5-db load_data {DATA}/psql && touch {output}"


# Collecting statistics
@@ -623,11 +634,11 @@ rule retrofit:
DATA + "/vectors/{name}.h5",
DATA + "/assoc/reduced.csv"
output:
-        expand(DATA + "/vectors/{{name}}-retrofit.h5.shard{n}", n=range(RETROFIT_SHARDS))
+        temp(expand(DATA + "/vectors/{{name}}-retrofit.h5.shard{n}", n=range(RETROFIT_SHARDS)))
resources:
-        ram=16
+        ram=24
shell:
"cn5-vectors retrofit -s {RETROFIT_SHARDS} {input} %(data)s/vectors/{wildcards.name}-retrofit.h5" % {'data': DATA}
"cn5-vectors retrofit -s {RETROFIT_SHARDS} {input} {DATA}/vectors/{wildcards.name}-retrofit.h5"

rule join_retrofit:
input:
@@ -738,7 +749,7 @@ rule compare_embeddings:
run:
input_embeddings = input[:-2]
input_embeddings_str = ' '.join(input_embeddings)
shell("cn5-vectors compare_embeddings %s {output}" % input_embeddings_str)
shell("cn5-vectors compare_embeddings {input_embeddings_str} {output}")

rule comparison_graph:
input:
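
A note on the pattern recurring throughout this Snakefile diff: Snakemake formats each shell command itself when the job runs, and that formatting namespace can see module-level globals, so commands may reference {DATA} directly alongside {input}, {output} and {wildcards.*}. The old %-interpolation ran at parse time and had to be kept out of the way of Snakemake's own braces, which was error-prone. A minimal sketch of the new style, with an illustrative rule name and paths (not taken from the commit):

DATA = "data"  # module-level global; visible to shell-command formatting

rule sort_example:
    input:
        DATA + "/collated/unsorted/edges.csv"
    output:
        DATA + "/collated/sorted/edges.csv"
    shell:
        # Previously: "mkdir -p %(data)s/tmp && ..." % {'data': DATA}
        # Snakemake now substitutes {DATA}, {input} and {output} itself.
        "mkdir -p {DATA}/tmp && LC_ALL=C sort -T {DATA}/tmp {input} > {output}"
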
2 changes: 1 addition & 1 deletion build.sh
@@ -30,4 +30,4 @@ check_db () {
check_disk_space
pip install -e '.[vectors]'
check_db
-snakemake --resources 'ram=30' -j
+snakemake --resources 'ram=30' -j 2
5 changes: 4 additions & 1 deletion conceptnet5/db/config.py
@@ -12,6 +12,9 @@

DB_USERNAME = os.environ.get('CONCEPTNET_DB_USER', os.environ.get('USER', 'postgres'))
DB_NAME = os.environ.get('CONCEPTNET_DB_NAME', 'conceptnet5')
-DB_PASSWORD = os.environ.get('CONCEPTNET_DB_PASSWORD')
+DB_SOCKET = '/var/run/postgresql/.s.PGSQL.5432'
+
+# These will not be used if DB_PASSWORD is blank -- instead, we'll use DB_SOCKET
+DB_PASSWORD = os.environ.get('CONCEPTNET_DB_PASSWORD', '')
DB_HOSTNAME = os.environ.get('CONCEPTNET_DB_HOSTNAME', 'localhost')
DB_PORT = int(os.environ.get('CONCEPTNET_DB_PORT', '5432'))
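
With this configuration, DB_PASSWORD defaults to the empty string, so by default the build talks to Postgres over the local Unix socket; setting CONCEPTNET_DB_PASSWORD (and optionally CONCEPTNET_DB_HOSTNAME and CONCEPTNET_DB_PORT) switches it to TCP. A sketch of pointing the build at a remote database; the values are placeholders, and they must be in the environment before conceptnet5.db.config is first imported, since the module reads them at import time:

import os

# Placeholder values; in practice, export these in the shell instead.
os.environ['CONCEPTNET_DB_PASSWORD'] = 'example-password'
os.environ['CONCEPTNET_DB_HOSTNAME'] = 'db.example.internal'
os.environ['CONCEPTNET_DB_PORT'] = '5433'

from conceptnet5.db import config  # reads the environment at import time

assert config.DB_PASSWORD == 'example-password'
assert config.DB_PORT == 5433
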
21 changes: 15 additions & 6 deletions conceptnet5/db/connection.py
@@ -40,12 +40,21 @@ def get_db_connection(dbname=None, building=False):


def _get_db_connection_inner(dbname):
-    conn = pg8000.connect(
-        user=config.DB_USERNAME,
-        password=config.DB_PASSWORD or None,
-        unix_sock=config.DB_SOCKET,
-        database=dbname
-    )
+    if not config.DB_PASSWORD:
+        conn = pg8000.connect(
+            user=config.DB_USERNAME,
+            unix_sock=config.DB_SOCKET,
+            database=dbname
+        )
+    else:
+        conn = pg8000.connect(
+            user=config.DB_USERNAME,
+            password=config.DB_PASSWORD,
+            host=config.DB_HOSTNAME,
+            port=config.DB_PORT,
+            database=dbname
+        )

pg8000.paramstyle = 'named'
return conn

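
The new branch chooses the transport from the password alone: an empty DB_PASSWORD means peer/trust authentication over the Unix-domain socket, and a non-empty one means password authentication over TCP. A self-contained sketch of the same decision, assuming only that pg8000 is installed:

import pg8000

def connect(dbname, user, password='', host='localhost', port=5432,
            unix_sock='/var/run/postgresql/.s.PGSQL.5432'):
    """Use the local socket unless a password is configured."""
    if not password:
        # No password: peer/trust auth over the Unix-domain socket.
        return pg8000.connect(user=user, unix_sock=unix_sock, database=dbname)
    # Password set: authenticate over TCP to the configured host and port.
    return pg8000.connect(user=user, password=password, host=host,
                          port=port, database=dbname)
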
3 changes: 2 additions & 1 deletion conceptnet5/readers/conceptnet4.py
@@ -88,6 +88,7 @@
'/s/contributor/omcs/mrt',
'/s/contributor/omcs/humplik',
'/s/contributor/omcs/mickh',
+    '/s/contributor/omcs/visionsofkaos',
}
CONCEPT_BLACKLIST = {
# Too vague
@@ -112,7 +113,7 @@
"response to picture",
"response to diagram",
"commons2_reject",
"globalmind", # avoid double-counting with the GlobalMind reader
"globalmind",
"pycommons/question"
}

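
These blacklists are plain sets consulted while reading ConceptNet 4 assertions. A hypothetical sketch of how such a filter might look; the set and parameter names here are illustrative, not the reader's actual API:

CONTRIBUTOR_BLACKLIST = {'/s/contributor/omcs/visionsofkaos'}
ACTIVITY_BLACKLIST = {'globalmind', 'pycommons/question'}

def keep_assertion(contributor, activity):
    """Drop assertions from blacklisted contributors or collection activities."""
    return (contributor not in CONTRIBUTOR_BLACKLIST and
            activity not in ACTIVITY_BLACKLIST)

assert not keep_assertion('/s/contributor/omcs/visionsofkaos', 'omcs')
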
5 changes: 4 additions & 1 deletion conceptnet5/readers/wordnet.py
@@ -24,6 +24,7 @@

REL_MAPPING = {
'hypernym': ('IsA', '{0} is a type of {1}'),
+    'hypernym-v': ('MannerOf', '{0} is a way to {1}'),
'part_meronym': ('PartOf', '{0} is a part of {1}'),
'domain_category': ('HasContext', '{0} is used in the context of {1}'),
'domain_region': ('HasContext', '{0} is used in the region of {1}'),
@@ -199,6 +200,9 @@ def run_wordnet(input_file, output_file):
obj = obj_dict.get('url')
relname = resource_name(rel)
if relname in REL_MAPPING:
+            pos, sense = synset_disambig.get(subj, (None, None))
+            if relname == 'hypernym' and pos == 'v':
+                relname = 'hypernym-v'
rel, frame = REL_MAPPING[relname]
reversed_frame = False
if rel.startswith('~'):
@@ -217,7 +221,6 @@
if (not text) or '!' in text:
continue
lang = obj_dict['lang']
-            pos, sense = synset_disambig.get(subj, (None, None))
obj_uri = standardized_concept_uri(lang, text, pos, 'wn', sense)
obj_label = text

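
The effect of the wordnet.py change: 'hypernym' still maps to IsA for nouns ("a dog is a type of animal"), but verb hypernyms now become MannerOf edges ("to sprint is a way to run"), so the reader looks up the subject's part of speech before choosing a relation, which is why the synset_disambig lookup moved up. A small sketch of that dispatch, using the mapping shown above:

REL_MAPPING = {
    'hypernym': ('IsA', '{0} is a type of {1}'),
    'hypernym-v': ('MannerOf', '{0} is a way to {1}'),
}

def mapped_relation(relname, pos):
    """Pick the ConceptNet relation for a WordNet relation and a POS tag."""
    if relname == 'hypernym' and pos == 'v':
        relname = 'hypernym-v'
    return REL_MAPPING[relname]

assert mapped_relation('hypernym', 'n')[0] == 'IsA'
assert mapped_relation('hypernym', 'v')[0] == 'MannerOf'
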
