Merge pull request #148 from commonsense/setup-fixes
Fixes to the ConceptNet build process and some readers
jlowryduda authored Feb 2, 2018
2 parents 709d757 + acc705e commit 1414015
Showing 6 changed files with 64 additions and 37 deletions.
65 changes: 38 additions & 27 deletions Snakefile
@@ -37,6 +37,9 @@ WIKTIONARY_VERSIONS = {
}
WIKTIONARY_LANGUAGES = sorted(list(WIKTIONARY_VERSIONS))

+# Languages where morphemes should not be split anywhere except at spaces
+ATOMIC_SPACE_LANGUAGES = {'vi'}
+
# Languages that the CLDR emoji data is available in. These match the original
# filenames, not ConceptNet language codes; they are turned into ConceptNet
# language codes by the reader.
@@ -56,7 +59,17 @@ WIKT_PARSER_VERSION = "1"

RETROFIT_SHARDS = 6

RAW_DATA_URL = "https://conceptnet.s3.amazonaws.com/raw-data/2016"
+# Dataset filenames
+# =================
+# The goal of reader steps is to produce Msgpack files, and later CSV files,
+# with these names.
+#
+# We distinguish *core dataset names*, which collectively determine the set of
+# terms that ConceptNet will attempt to represent, from the additional datasets
+# that will mainly be used to find more information about those terms.
+
+
+RAW_DATA_URL = "https://zenodo.org/record/998169/files/conceptnet-raw-data-5.5.zip"
PRECOMPUTED_DATA_PATH = "/precomputed-data/2016"
PRECOMPUTED_DATA_URL = "https://conceptnet.s3.amazonaws.com" + PRECOMPUTED_DATA_PATH
PRECOMPUTED_S3_UPLOAD = "s3://conceptnet" + PRECOMPUTED_DATA_PATH
@@ -85,16 +98,6 @@ if TESTMODE:
EMOJI_LANGUAGES = ['en', 'en_001']


-# Dataset filenames
-# =================
-# The goal of reader steps is to produce Msgpack files, and later CSV files,
-# with these names.
-#
-# We distinguish *core dataset names*, which collectively determine the set of
-# terms that ConceptNet will attempt to represent, from the additional datasets
-# that will mainly be used to find more information about those terms.
-
-
CORE_DATASET_NAMES = [
"jmdict/jmdict",
"nadya/nadya",
@@ -151,8 +154,8 @@ rule webdata:
rule clean:
shell:
"for subdir in assertions assoc collated db edges psql tmp vectors stats; "
"do echo Removing %(data)s/$subdir; "
"rm -rf %(data)s/$subdir; done" % {'data': DATA}
"do echo Removing {DATA}/$subdir; "
"rm -rf {DATA}/$subdir; done"

rule test:
input:
@@ -164,11 +167,19 @@ rule test:

# Downloaders
# ===========
-rule download_raw:
+rule download_raw_package:
+    output:
+        DATA + "/raw/conceptnet-raw-data-5.5.zip"
+    shell:
+        "wget -nv {RAW_DATA_URL} -O {output}"
+
+rule extract_raw:
+    input:
+        DATA + "/raw/conceptnet-raw-data-5.5.zip"
output:
DATA + "/raw/{dirname}/{filename}"
shell:
"wget -nv {RAW_DATA_URL}/{wildcards.dirname}/{wildcards.filename} -O {output}"
"unzip {input} raw/{wildcards.dirname}/{wildcards.filename} -d {DATA}"

rule download_conceptnet_ppmi:
output:
@@ -251,9 +262,9 @@ rule read_dbpedia:
output:
DATA + "/edges/dbpedia/dbpedia_en.msgpack",
shell:
"cn5-read dbpedia %(data)s/raw/dbpedia "
"cn5-read dbpedia {DATA}/raw/dbpedia "
"{output} "
"%(data)s/stats/core_concepts.txt " % {'data': DATA}
"{DATA}/stats/core_concepts.txt "

rule read_jmdict:
input:
@@ -305,9 +316,9 @@ rule prescan_wiktionary:
output:
DATA + "/db/wiktionary.db"
shell:
"mkdir -p %(data)s/tmp && "
"cn5-read wiktionary_pre {input} %(data)s/tmp/wiktionary.db && "
"mv %(data)s/tmp/wiktionary.db {output}" % {'data': DATA}
"mkdir -p {DATA}/tmp && "
"cn5-read wiktionary_pre {input} {DATA}/tmp/wiktionary.db && "
"mv {DATA}/tmp/wiktionary.db {output}"

rule read_wiktionary:
input:
@@ -368,7 +379,7 @@ rule sort_edges:
output:
DATA + "/collated/sorted/edges.csv"
shell:
"mkdir -p %(data)s/tmp && cat {input} | LC_ALL=C sort -T %(data)s/tmp | LC_ALL=C uniq > {output}" % {'data': DATA}
"mkdir -p {DATA}/tmp && cat {input} | LC_ALL=C sort -T {DATA}/tmp | LC_ALL=C uniq > {output}"

rule combine_assertions:
input:
@@ -393,7 +404,7 @@ rule prepare_db:
DATA + "/psql/sources.csv",
DATA + "/psql/relations.csv"
shell:
"cn5-db prepare_data {input} %(data)s/psql" % {'data': DATA}
"cn5-db prepare_data {input} {DATA}/psql"

rule gzip_db:
input:
@@ -415,7 +426,7 @@ rule load_db:
output:
DATA + "/psql/done"
shell:
"cn5-db load_data %(data)s/psql && touch {output}" % {'data': DATA}
"cn5-db load_data {DATA}/psql && touch {output}"


# Collecting statistics
@@ -623,11 +634,11 @@ rule retrofit:
DATA + "/vectors/{name}.h5",
DATA + "/assoc/reduced.csv"
output:
-        expand(DATA + "/vectors/{{name}}-retrofit.h5.shard{n}", n=range(RETROFIT_SHARDS))
+        temp(expand(DATA + "/vectors/{{name}}-retrofit.h5.shard{n}", n=range(RETROFIT_SHARDS)))
resources:
-        ram=16
+        ram=24
shell:
"cn5-vectors retrofit -s {RETROFIT_SHARDS} {input} %(data)s/vectors/{wildcards.name}-retrofit.h5" % {'data': DATA}
"cn5-vectors retrofit -s {RETROFIT_SHARDS} {input} {DATA}/vectors/{wildcards.name}-retrofit.h5"

rule join_retrofit:
input:
@@ -738,7 +749,7 @@ rule compare_embeddings:
run:
input_embeddings = input[:-2]
input_embeddings_str = ' '.join(input_embeddings)
shell("cn5-vectors compare_embeddings %s {output}" % input_embeddings_str)
shell("cn5-vectors compare_embeddings {input_embeddings_str} {output}")

rule comparison_graph:
input:
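
A note on the pattern recurring throughout this Snakefile diff: Snakemake formats each shell command itself when the job runs, and that formatting namespace can see module-level globals, so commands may reference {DATA} directly alongside {input}, {output} and {wildcards.*}. The old %-interpolation ran at parse time and had to be kept out of the way of Snakemake's own braces, which was error-prone. A minimal sketch of the new style, with an illustrative rule name and paths (not taken from the commit):

DATA = "data"  # module-level global; visible to shell-command formatting

rule sort_example:
    input:
        DATA + "/collated/unsorted/edges.csv"
    output:
        DATA + "/collated/sorted/edges.csv"
    shell:
        # Previously: "mkdir -p %(data)s/tmp && ..." % {'data': DATA}
        # Snakemake now substitutes {DATA}, {input} and {output} itself.
        "mkdir -p {DATA}/tmp && LC_ALL=C sort -T {DATA}/tmp {input} > {output}"
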
2 changes: 1 addition & 1 deletion build.sh
@@ -30,4 +30,4 @@ check_db () {
check_disk_space
pip install -e '.[vectors]'
check_db
-snakemake --resources 'ram=30' -j
+snakemake --resources 'ram=30' -j 2
5 changes: 4 additions & 1 deletion conceptnet5/db/config.py
@@ -12,6 +12,9 @@

DB_USERNAME = os.environ.get('CONCEPTNET_DB_USER', os.environ.get('USER', 'postgres'))
DB_NAME = os.environ.get('CONCEPTNET_DB_NAME', 'conceptnet5')
-DB_PASSWORD = os.environ.get('CONCEPTNET_DB_PASSWORD')
+DB_SOCKET = '/var/run/postgresql/.s.PGSQL.5432'
+
+# These will not be used if DB_PASSWORD is blank -- instead, we'll use DB_SOCKET
+DB_PASSWORD = os.environ.get('CONCEPTNET_DB_PASSWORD', '')
DB_HOSTNAME = os.environ.get('CONCEPTNET_DB_HOSTNAME', 'localhost')
DB_PORT = int(os.environ.get('CONCEPTNET_DB_PORT', '5432'))
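
With this configuration, DB_PASSWORD defaults to the empty string, so by default the build talks to Postgres over the local Unix socket; setting CONCEPTNET_DB_PASSWORD (and optionally CONCEPTNET_DB_HOSTNAME and CONCEPTNET_DB_PORT) switches it to TCP. A sketch of pointing the build at a remote database; the values are placeholders, and they must be in the environment before conceptnet5.db.config is first imported, since the module reads them at import time:

import os

# Placeholder values; in practice, export these in the shell instead.
os.environ['CONCEPTNET_DB_PASSWORD'] = 'example-password'
os.environ['CONCEPTNET_DB_HOSTNAME'] = 'db.example.internal'
os.environ['CONCEPTNET_DB_PORT'] = '5433'

from conceptnet5.db import config  # reads the environment at import time

assert config.DB_PASSWORD == 'example-password'
assert config.DB_PORT == 5433
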
21 changes: 15 additions & 6 deletions conceptnet5/db/connection.py
@@ -40,12 +40,21 @@ def get_db_connection(dbname=None, building=False):


def _get_db_connection_inner(dbname):
-    conn = pg8000.connect(
-        user=config.DB_USERNAME,
-        password=config.DB_PASSWORD or None,
-        unix_sock=config.DB_SOCKET,
-        database=dbname
-    )
+    if not config.DB_PASSWORD:
+        conn = pg8000.connect(
+            user=config.DB_USERNAME,
+            unix_sock=config.DB_SOCKET,
+            database=dbname
+        )
+    else:
+        conn = pg8000.connect(
+            user=config.DB_USERNAME,
+            password=config.DB_PASSWORD,
+            host=config.DB_HOSTNAME,
+            port=config.DB_PORT,
+            database=dbname
+        )

pg8000.paramstyle = 'named'
return conn

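
The new branch chooses the transport from the password alone: an empty DB_PASSWORD means peer/trust authentication over the Unix-domain socket, and a non-empty one means password authentication over TCP. A self-contained sketch of the same decision, assuming only that pg8000 is installed:

import pg8000

def connect(dbname, user, password='', host='localhost', port=5432,
            unix_sock='/var/run/postgresql/.s.PGSQL.5432'):
    """Use the local socket unless a password is configured."""
    if not password:
        # No password: peer/trust auth over the Unix-domain socket.
        return pg8000.connect(user=user, unix_sock=unix_sock, database=dbname)
    # Password set: authenticate over TCP to the configured host and port.
    return pg8000.connect(user=user, password=password, host=host,
                          port=port, database=dbname)
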
3 changes: 2 additions & 1 deletion conceptnet5/readers/conceptnet4.py
@@ -88,6 +88,7 @@
'/s/contributor/omcs/mrt',
'/s/contributor/omcs/humplik',
'/s/contributor/omcs/mickh',
+    '/s/contributor/omcs/visionsofkaos',
}
CONCEPT_BLACKLIST = {
# Too vague
@@ -112,7 +113,7 @@
"response to picture",
"response to diagram",
"commons2_reject",
"globalmind", # avoid double-counting with the GlobalMind reader
"globalmind",
"pycommons/question"
}

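
These blacklists are plain sets consulted while reading ConceptNet 4 assertions. A hypothetical sketch of how such a filter might look; the set and parameter names here are illustrative, not the reader's actual API:

CONTRIBUTOR_BLACKLIST = {'/s/contributor/omcs/visionsofkaos'}
ACTIVITY_BLACKLIST = {'globalmind', 'pycommons/question'}

def keep_assertion(contributor, activity):
    """Drop assertions from blacklisted contributors or collection activities."""
    return (contributor not in CONTRIBUTOR_BLACKLIST and
            activity not in ACTIVITY_BLACKLIST)

assert not keep_assertion('/s/contributor/omcs/visionsofkaos', 'omcs')
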
5 changes: 4 additions & 1 deletion conceptnet5/readers/wordnet.py
@@ -24,6 +24,7 @@

REL_MAPPING = {
'hypernym': ('IsA', '{0} is a type of {1}'),
+    'hypernym-v': ('MannerOf', '{0} is a way to {1}'),
'part_meronym': ('PartOf', '{0} is a part of {1}'),
'domain_category': ('HasContext', '{0} is used in the context of {1}'),
'domain_region': ('HasContext', '{0} is used in the region of {1}'),
@@ -199,6 +200,9 @@ def run_wordnet(input_file, output_file):
obj = obj_dict.get('url')
relname = resource_name(rel)
if relname in REL_MAPPING:
+            pos, sense = synset_disambig.get(subj, (None, None))
+            if relname == 'hypernym' and pos == 'v':
+                relname = 'hypernym-v'
rel, frame = REL_MAPPING[relname]
reversed_frame = False
if rel.startswith('~'):
@@ -217,7 +221,6 @@
if (not text) or '!' in text:
continue
lang = obj_dict['lang']
-            pos, sense = synset_disambig.get(subj, (None, None))
obj_uri = standardized_concept_uri(lang, text, pos, 'wn', sense)
obj_label = text

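
The effect of the wordnet.py change: 'hypernym' still maps to IsA for nouns ("a dog is a type of animal"), but verb hypernyms now become MannerOf edges ("to sprint is a way to run"), so the reader looks up the subject's part of speech before choosing a relation, which is why the synset_disambig lookup moved up. A small sketch of that dispatch, using the mapping shown above:

REL_MAPPING = {
    'hypernym': ('IsA', '{0} is a type of {1}'),
    'hypernym-v': ('MannerOf', '{0} is a way to {1}'),
}

def mapped_relation(relname, pos):
    """Pick the ConceptNet relation for a WordNet relation and a POS tag."""
    if relname == 'hypernym' and pos == 'v':
        relname = 'hypernym-v'
    return REL_MAPPING[relname]

assert mapped_relation('hypernym', 'n')[0] == 'IsA'
assert mapped_relation('hypernym', 'v')[0] == 'MannerOf'
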
