Fixes to the ConceptNet build process and some readers #148
Changes from 11 commits
@@ -31,6 +31,9 @@ WIKTIONARY_VERSIONS = {
 }
 WIKTIONARY_LANGUAGES = sorted(list(WIKTIONARY_VERSIONS))
 
+# Languages where morphemes should not be split anywhere except at spaces
+ATOMIC_SPACE_LANGUAGES = {'vi'}
+
 # Languages that the CLDR emoji data is available in. These match the original
 # filenames, not ConceptNet language codes; they are turned into ConceptNet
 # language codes by the reader.
@@ -50,7 +53,34 @@ WIKT_PARSER_VERSION = "1"
 
 RETROFIT_SHARDS = 6
 
 RAW_DATA_URL = "https://conceptnet.s3.amazonaws.com/raw-data/2016"
+# Dataset filenames
+# =================
+# The goal of reader steps is to produce Msgpack files, and later CSV files,
+# with these names.
+#
+# We distinguish *core dataset names*, which collectively determine the set of
+# terms that ConceptNet will attempt to represent, from the additional datasets
+# that will mainly be used to find more information about those terms.
+
+CORE_DATASET_NAMES = [
+    "jmdict/jmdict",
+    "nadya/nadya",
+    "ptt_petgame/api",
+    "opencyc/opencyc",
+    "verbosity/verbosity",
+    "wordnet/wordnet",
+    "cedict/cedict"
+]
+CORE_DATASET_NAMES += ["conceptnet4/conceptnet4_flat_{}".format(num) for num in range(10)]
+CORE_DATASET_NAMES += ["ptt_petgame/part{}".format(num) for num in range(1, 13)]
+CORE_DATASET_NAMES += ["wiktionary/{}".format(lang) for lang in WIKTIONARY_LANGUAGES]
+CORE_DATASET_NAMES += ["emoji/{}".format(lang) for lang in EMOJI_LANGUAGES]
+
+DATASET_NAMES = CORE_DATASET_NAMES + ["dbpedia/dbpedia_en"]
+
+RAW_DATA_URL = "https://zenodo.org/record/998169/files/conceptnet-raw-data-5.5.zip"
 PRECOMPUTED_DATA_PATH = "/precomputed-data/2016"
 PRECOMPUTED_DATA_URL = "https://conceptnet.s3.amazonaws.com" + PRECOMPUTED_DATA_PATH
 PRECOMPUTED_S3_UPLOAD = "s3://conceptnet" + PRECOMPUTED_DATA_PATH

Review comment (on the DATASET_NAMES line): In the […]

Reply: Oh whoops, I should have paid more attention to the merge conflict.
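For orientation, each of these dataset names corresponds to an edge file produced by a reader step. A minimal sketch of the naming scheme, assuming path templates like the ones visible in the read rules later in this diff (the diff does not state them as a general rule):

    DATA = "data"              # placeholder for the build's data directory
    name = "wordnet/wordnet"   # one entry from DATASET_NAMES

    # Reader steps emit Msgpack edge files, which are later collated into CSV.
    msgpack_path = "{}/edges/{}.msgpack".format(DATA, name)
    print(msgpack_path)        # data/edges/wordnet/wordnet.msgpack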
@@ -143,8 +173,8 @@ rule webdata:
 rule clean:
     shell:
         "for subdir in assertions assoc collated db edges psql tmp vectors stats; "
-        "do echo Removing %(data)s/$subdir; "
-        "rm -rf %(data)s/$subdir; done" % {'data': DATA}
+        "do echo Removing {DATA}/$subdir; "
+        "rm -rf {DATA}/$subdir; done"
 
 rule test:
     input:
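One note that applies to this change and the similar ones below: Snakemake formats shell commands itself, str.format-style, with module-level variables such as DATA in scope, so the Python %-interpolation was redundant. A minimal Snakefile sketch (the rule name is hypothetical):

    DATA = "data"

    rule clean_tmp:  # hypothetical rule, for illustration only
        shell:
            # Snakemake substitutes {DATA} from the global namespace
            "echo Removing {DATA}/tmp && rm -rf {DATA}/tmp"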
@@ -156,11 +186,19 @@ rule test:
 
 # Downloaders
 # ===========
-rule download_raw:
+rule download_raw_package:
     output:
+        DATA + "/raw/conceptnet-raw-data-5.5.zip"
+    shell:
+        "wget -nv {RAW_DATA_URL} -O {output}"
+
+rule extract_raw:
+    input:
+        DATA + "/raw/conceptnet-raw-data-5.5.zip"
+    output:
         DATA + "/raw/{dirname}/{filename}"
     shell:
-        "wget -nv {RAW_DATA_URL}/{wildcards.dirname}/{wildcards.filename} -O {output}"
+        "unzip {input} raw/{wildcards.dirname}/{wildcards.filename} -d {DATA}"
 
 rule download_conceptnet_ppmi:
     output:

Review comment (on the extract_raw rule): It looks like every time a single new file is added to the process, one would have to re-download the entire […]
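To make the two-step download concrete, here is how one target would resolve under extract_raw; the file name below is hypothetical, purely for illustration:

    # Requesting data/raw/wiktionary/enwiktionary.xml.bz2 (illustrative name)
    # matches the output pattern DATA + "/raw/{dirname}/{filename}" with
    #   wildcards.dirname  = "wiktionary"
    #   wildcards.filename = "enwiktionary.xml.bz2"
    # so the rule runs:
    #   unzip data/raw/conceptnet-raw-data-5.5.zip raw/wiktionary/enwiktionary.xml.bz2 -d data
    # unzip extracts only the named member from the already-downloaded zip.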
@@ -243,9 +281,9 @@ rule read_dbpedia:
     output:
         DATA + "/edges/dbpedia/dbpedia_en.msgpack",
     shell:
-        "cn5-read dbpedia %(data)s/raw/dbpedia "
+        "cn5-read dbpedia {DATA}/raw/dbpedia "
         "{output} "
-        "%(data)s/stats/core_concepts.txt " % {'data': DATA}
+        "{DATA}/stats/core_concepts.txt "
 
 rule read_jmdict:
     input:
@@ -297,9 +335,9 @@ rule prescan_wiktionary:
     output:
         DATA + "/db/wiktionary.db"
     shell:
-        "mkdir -p %(data)s/tmp && "
-        "cn5-read wiktionary_pre {input} %(data)s/tmp/wiktionary.db && "
-        "mv %(data)s/tmp/wiktionary.db {output}" % {'data': DATA}
+        "mkdir -p {DATA}/tmp && "
+        "cn5-read wiktionary_pre {input} {DATA}/tmp/wiktionary.db && "
+        "mv {DATA}/tmp/wiktionary.db {output}"
 
 rule read_wiktionary:
     input:
@@ -360,7 +398,7 @@ rule sort_edges:
     output:
         DATA + "/collated/sorted/edges.csv"
     shell:
-        "mkdir -p %(data)s/tmp && cat {input} | LC_ALL=C sort -T %(data)s/tmp | LC_ALL=C uniq > {output}" % {'data': DATA}
+        "mkdir -p {DATA}/tmp && cat {input} | LC_ALL=C sort -T {DATA}/tmp | LC_ALL=C uniq > {output}"
 
 rule combine_assertions:
     input:
@@ -385,7 +423,7 @@ rule prepare_db:
         DATA + "/psql/sources.csv",
         DATA + "/psql/relations.csv"
     shell:
-        "cn5-db prepare_data {input} %(data)s/psql" % {'data': DATA}
+        "cn5-db prepare_data {input} {DATA}/psql"
 
 rule gzip_db:
     input:
@@ -407,7 +445,7 @@ rule load_db:
     output:
         DATA + "/psql/done"
     shell:
-        "cn5-db load_data %(data)s/psql && touch {output}" % {'data': DATA}
+        "cn5-db load_data {DATA}/psql && touch {output}"
 
 
 # Collecting statistics
@@ -597,7 +635,7 @@ rule retrofit:
     resources:
         ram=16
     shell:
-        "cn5-vectors retrofit -s {RETROFIT_SHARDS} {input} %(data)s/vectors/{wildcards.name}-retrofit.h5" % {'data': DATA}
+        "cn5-vectors retrofit -s {RETROFIT_SHARDS} {input} {DATA}/vectors/{wildcards.name}-retrofit.h5"
 
 rule join_retrofit:
     input:
@@ -659,6 +697,37 @@ rule export_english_text:
         "cn5-vectors export_text -l en {input} {output}"
 
 
+# Morphology
+# ==========
+
+rule prepare_vocab:
+    input:
+        DATA + "/stats/core_concept_counts.txt"
+    output:
+        DATA + "/morph/vocab/{language}.txt"
+    shell:
+        "cn5-build prepare_morphology {wildcards.language} {input} {output}"
+
+rule morfessor_segmentation:
+    input:
+        DATA + "/morph/vocab/{language}.txt"
+    output:
+        DATA + "/morph/segments/{language}.txt"
+    run:
+        if wildcards.language in ATOMIC_SPACE_LANGUAGES:
+            shell("morfessor-train {input} -S {output} --traindata-list --nosplit-re '[^_].'")
+        else:
+            shell("morfessor-train {input} -S {output} -f '_' --traindata-list")
+
+rule subwords:
+    input:
+        DATA + "/morph/segments/{language}.txt",
+    output:
+        DATA + "/edges/morphology/subwords-{language}.msgpack"
+    shell:
+        "cn5-build subwords {wildcards.language} {input} {output}"
+
+
 # Evaluation
 # ==========
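On the Vietnamese special case: if I read Morfessor's --nosplit-re option correctly, the regex is tested against the two characters surrounding a candidate split point, and a match forbids the split. Since ConceptNet writes spaces as underscores, '[^_].' permits splits only after an underscore. A small Python check of that reading:

    import re

    # A split between two adjacent characters is forbidden when this matches.
    nosplit = re.compile(r'[^_].')

    print(bool(nosplit.match('ab')))   # True  -> no split inside a word
    print(bool(nosplit.match('_b')))   # False -> a split after '_' is allowed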
@@ -677,7 +746,7 @@ rule compare_embeddings:
     run:
         input_embeddings = input[:-2]
         input_embeddings_str = ' '.join(input_embeddings)
-        shell("cn5-vectors compare_embeddings %s {output}" % input_embeddings_str)
+        shell("cn5-vectors compare_embeddings {input_embeddings_str} {output}")
 
 rule comparison_graph:
     input:
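This fix works because shell() inside a run block formats its command with the variables in scope, making %-interpolation unnecessary there as well. A standalone approximation of the substitution (not Snakemake's actual internals):

    input_embeddings_str = "a.h5 b.h5"   # placeholder values
    output = "comparison.h5"
    cmd = "cn5-vectors compare_embeddings {input_embeddings_str} {output}".format(**locals())
    print(cmd)   # cn5-vectors compare_embeddings a.h5 b.h5 comparison.h5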
Review comment: I think this is still incorrect, because CORE_DATASET_NAMES gets updated with emoji files before EMOJI_LANGUAGES is overwritten in line 107.
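A minimal sketch of the ordering problem the comment describes (the language codes are placeholders):

    EMOJI_LANGUAGES = ['af', 'am']
    CORE_DATASET_NAMES = ["emoji/{}".format(lang) for lang in EMOJI_LANGUAGES]

    EMOJI_LANGUAGES = ['af', 'am', 'ar']   # later reassignment, as in line 107
    print(CORE_DATASET_NAMES)              # still ['emoji/af', 'emoji/am']

The comprehension consumes EMOJI_LANGUAGES at the moment it runs, so reassigning the variable afterwards does not update the already-built list.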