Merge pull request #147 from commonsense/code-review-fixes-20171117
Fixes from the code review on 11/17
Rob Speer authored Dec 1, 2017
2 parents 620b713 + 039e8b5 commit 21d7fab
Showing 256 changed files with 303 additions and 6,336 deletions.
56 changes: 29 additions & 27 deletions Snakefile
@@ -50,33 +50,6 @@ WIKT_PARSER_VERSION = "1"

RETROFIT_SHARDS = 6

# Dataset filenames
# =================
# The goal of reader steps is to produce Msgpack files, and later CSV files,
# with these names.
#
# We distinguish *core dataset names*, which collectively determine the set of
# terms that ConceptNet will attempt to represent, from the additional datasets
# that will mainly be used to find more information about those terms.


CORE_DATASET_NAMES = [
"jmdict/jmdict",
"nadya/nadya",
"ptt_petgame/api",
"opencyc/opencyc",
"verbosity/verbosity",
"wordnet/wordnet",
"cedict/cedict"
]
CORE_DATASET_NAMES += ["conceptnet4/conceptnet4_flat_{}".format(num) for num in range(10)]
CORE_DATASET_NAMES += ["ptt_petgame/part{}".format(num) for num in range(1, 13)]
CORE_DATASET_NAMES += ["wiktionary/{}".format(lang) for lang in WIKTIONARY_LANGUAGES]
CORE_DATASET_NAMES += ["emoji/{}".format(lang) for lang in EMOJI_LANGUAGES]


DATASET_NAMES = CORE_DATASET_NAMES + ["dbpedia/dbpedia_en"]

RAW_DATA_URL = "https://conceptnet.s3.amazonaws.com/raw-data/2016"
PRECOMPUTED_DATA_PATH = "/precomputed-data/2016"
PRECOMPUTED_DATA_URL = "https://conceptnet.s3.amazonaws.com" + PRECOMPUTED_DATA_PATH
@@ -103,6 +76,35 @@ if TESTMODE:
HASH_WIDTH = 12
RAW_DATA_URL = "/missing/data"
PRECOMPUTED_DATA_URL = "/missing/data"
EMOJI_LANGUAGES = ['en', 'en_001']


# Dataset filenames
# =================
# The goal of reader steps is to produce Msgpack files, and later CSV files,
# with these names.
#
# We distinguish *core dataset names*, which collectively determine the set of
# terms that ConceptNet will attempt to represent, from the additional datasets
# that will mainly be used to find more information about those terms.


CORE_DATASET_NAMES = [
"jmdict/jmdict",
"nadya/nadya",
"ptt_petgame/api",
"opencyc/opencyc",
"verbosity/verbosity",
"wordnet/wordnet",
"cedict/cedict"
]
CORE_DATASET_NAMES += ["conceptnet4/conceptnet4_flat_{}".format(num) for num in range(10)]
CORE_DATASET_NAMES += ["ptt_petgame/part{}".format(num) for num in range(1, 13)]
CORE_DATASET_NAMES += ["wiktionary/{}".format(lang) for lang in WIKTIONARY_LANGUAGES]
CORE_DATASET_NAMES += ["emoji/{}".format(lang) for lang in EMOJI_LANGUAGES]


DATASET_NAMES = CORE_DATASET_NAMES + ["dbpedia/dbpedia_en"]

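A rough, hypothetical sketch (not part of this commit; the actual directory layout and variable names are assumptions) of how these names are typically consumed downstream, with each dataset name expanding into one expected reader output per format:

# Hypothetical illustration only; DATA and the path pattern are assumed, not taken from this diff.
DATA = "data"
READER_OUTPUTS = expand(DATA + "/edges/{name}.msgpack", name=DATASET_NAMES)
CSV_OUTPUTS = expand(DATA + "/edges/{name}.csv", name=DATASET_NAMES)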

rule all:
150 changes: 71 additions & 79 deletions conceptnet5/readers/cc_cedict.py
@@ -27,85 +27,80 @@
LICENSE = Licenses.cc_sharealike
SOURCE = [{'contributor': '/s/resource/cc_cedict/2017-10'}]

LINE_REGEX = re.compile(r'(.+)\s(.+)\[.+\]\s/(.+)/') # separate traditional and simplified words
ABBR_REGEX = re.compile(r'(\b|\s)abbr. (to|of|for)') # abbreviations
BRACKETS_REGEX = re.compile(r'\[.+?\]') # pronunciation
DATE_RANGE_REGEX = re.compile(r'(.+?)\s\(.+\d.+\),') # date range
DEFINITIONS_REGEX = re.compile(r'/|;') # separate definitions
HAN_CHAR_REGEX = regex.compile('([\p{IsIdeo}]+[\|·]?)+') # Han characters
LINE_REGEX = re.compile(r'(.+)\s(.+)\[.+\]\s/(.+)/') # separate traditional and simplified words
LIT_FIG_REGEX = re.compile(r'(\b|\s)(fig|lit).\s') # literally/figuratively
PAREN_REGEX = re.compile(r'\(.+?\)') # parenthesis
HAN_CHAR_REGEX = regex.compile('([\p{IsIdeo}]+[\|·]?)+') # Han characters
BRACKETS_REGEX = re.compile(r'\[.+?\]') # pronunciation
VARIANT_REGEX = re.compile(r'(see (also )?|(old )?variant of |archaic version of |also written)')
LIT_FIG_REGEX = re.compile(r'(\b|\s)(fig|lit).\s')
ABBR_REGEX = re.compile(r'(\b|\s)abbr. (to|of|for)')
SB_REGEX = re.compile(r'\b(sb)\b')
STH_REGEX = re.compile(r'\b(sth)\b')
SEE_ALSO_REGEX = re.compile(r'see( also)?') # see also
VARIANT_REGEX = re.compile(r'((old |Japanese )?variant of|archaic version of|also '
r'written|same as)\s') # variant syntax
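To make the intent of LINE_REGEX and DEFINITIONS_REGEX concrete, here is a small sketch (not part of this commit) applied to a made-up CC-CEDICT-style entry:

import re

LINE_REGEX = re.compile(r'(.+)\s(.+)\[.+\]\s/(.+)/')
DEFINITIONS_REGEX = re.compile(r'/|;')

line = '中國 中国 [Zhong1 guo2] /China/Middle Kingdom/'
match = LINE_REGEX.match(line)
# groups() yields the traditional form, the simplified form, and the
# slash-delimited definitions; stray whitespace is stripped here for clarity.
traditional, simplified, definitions = (g.strip() for g in match.groups())
print(traditional, simplified)                    # 中國 中国
print(re.split(DEFINITIONS_REGEX, definitions))   # ['China', 'Middle Kingdom']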


def remove_reference_syntax(definition):
"""
Example: Jiajiang county in Leshan 樂山|乐山[Le4 shan1]
Definitions in English may contain references to Chinese words. The reference syntax contains
vertical bar-separated Han characters as well as a pronunciation enclosed in brackets,
as in "Jiajiang county in Leshan 樂山|乐山[Le4 shan1]".
Remove the reference syntax.
"""
definition = HAN_CHAR_REGEX.sub('', definition)
return BRACKETS_REGEX.sub('', definition)
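For instance (a sketch, not part of this commit), the docstring's example loses both the Han characters and the bracketed pronunciation:

remove_reference_syntax('Jiajiang county in Leshan 樂山|乐山[Le4 shan1]')
# -> 'Jiajiang county in Leshan ' (any leftover whitespace is untouched)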


def remove_additional_info(definition):
"""
Remove the second sentence of the definition
Remove any information in a definition after the first comma. This part of the definition
usually provides additional details. For example, in a definition such as 'Salt Lake City,
capital of Utah', 'capital of Utah' is removed.
"""
return definition.split(',')[0]
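A quick illustration of the behavior described above (sketch, not part of this commit):

remove_additional_info('Salt Lake City, capital of Utah')   # -> 'Salt Lake City'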


def extract_person(match):
"""
Example: "Pierre-Auguste Renoir (1841-1919), French impressionist painter"
Check if a date range is mentioned in a definition. This is usually the case when a person is
being defined. In that case, we want to only extract the name, without the date range or the
second, CV sentence.
Returns:
a list of names extracted from a definition
Extract the name of a person mentioned in a definition. A person definition contains a
date range (e.g. when they were alive or active) and a biography sentence, for example:
"Pierre-Auguste Renoir (1841-1919), French impressionist painter". Occasionally, two forms of a
person's name are provided, as in "Maria Skłodowska-Curie or Marie Curie". In that case,
we return both names and make an edge for each of them.
"""
person = match.groups()[0]
if ',' in person:
person = remove_additional_info(person) # skip the second sentence

person = HAN_CHAR_REGEX.sub('', person)
person = BRACKETS_REGEX.sub('', person) # delete pronunciation
person = person.split(' or ') # Take care of "Frederic Chopin or Fryderyk Franciszek Chopin"
person = remove_additional_info(person)
person = remove_reference_syntax(person)
person = person.split(' or ') # get both versions of a person's name
return person
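As a sketch (not part of this commit), the docstring's example yields a single-name list once the date range match is passed in:

m = DATE_RANGE_REGEX.search('Pierre-Auguste Renoir (1841-1919), French impressionist painter')
extract_person(m)   # -> ['Pierre-Auguste Renoir']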


def extract_measure_words(definition):
"""
Example: "CL:枝[zhi1],根[gen1],個|个[ge4],把[ba3]"
Extract measure words (classifiers). Measure words are prefixed with "CL:" and separated by a
comma. For example: "CL:枝[zhi1],根[gen1],個|个[ge4],把[ba3]"
"""
words = definition[3:] # skip 'CL:'
words = words.split(',')
words = words.split(',') # separate each measure word
words = [BRACKETS_REGEX.sub('', word) for word in words]
measure_words = []
for word in words:
measure_words.extend(word.split('|'))
measure_words.extend(word.split('|')) # separate variants of a measure word
return measure_words
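The docstring's example expands as follows (sketch, not part of this commit):

extract_measure_words('CL:枝[zhi1],根[gen1],個|个[ge4],把[ba3]')
# -> ['枝', '根', '個', '个', '把']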


def extract_variants(definition):
def extract_han_characters(definition):
"""
Example: "variant of 齊大非偶|齐大非偶[qi2 da4 fei1 ou3]"
Extract han characters. This is used when extracting variants, abbreviations and references
to other characters.
"""
variants = VARIANT_REGEX.sub('', definition)
variants = BRACKETS_REGEX.sub('', variants)
variants = variants.split('|')
return variants


def extract_abbreviations(definition):
"""
abbr.for Luxembourg 盧森堡 | 卢森堡[Lu2 sen1 bao3]
Only return the Chinese word for which this word is an abbreviation.
"""
reference = regex.search(HAN_CHAR_REGEX, definition)
if reference:
reference = reference.group(0)
reference = reference.split('|')
return reference
return
chars = regex.search(HAN_CHAR_REGEX, definition)
if chars:
return chars.group(0).split('|')
return []
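For example (a sketch, not part of this commit), a 'variant of' definition gives back both the traditional and simplified references:

extract_han_characters('variant of 齊大非偶|齐大非偶[qi2 da4 fei1 ou3]')
# -> ['齊大非偶', '齐大非偶']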


def handle_file(filename, output_file):
@@ -129,8 +124,7 @@ def handle_file(filename, output_file):
sources=SOURCE)
out.write(edge)

definitions = re.split(r'\/|;', definitions)
for definition in definitions:
for definition in re.split(DEFINITIONS_REGEX, definitions):

# Skip pronunciation information
if 'Taiwan pr.' in definition or 'also pr.' in definition:
@@ -158,9 +152,6 @@ def handle_file(filename, output_file):
out.write(edge)
continue

# Remove clarifying information in parenthesis
definition = PAREN_REGEX.sub('', definition)

# Check if a word is a measure word
if definition.startswith('CL:'):
related_words = extract_measure_words(definition)
@@ -182,20 +173,22 @@ def handle_file(filename, output_file):
out.write(edge)
continue

# Check if a word is a form/variant of a different word
variant_match = re.match(VARIANT_REGEX, definition)
if variant_match:
variants = extract_variants(definition)
# Remove clarifying information in parenthesis
definition = PAREN_REGEX.sub('', definition)

# Handle variants/word forms and abbreviations
if re.match(VARIANT_REGEX, definition) or re.match(ABBR_REGEX, definition):
variants = extract_han_characters(definition)
for variant in variants:
edge = make_edge(rel='/r/RelatedTo',
edge = make_edge(rel='/r/Synonym',
start=standardized_concept_uri('zh-Hant', traditional),
end=standardized_concept_uri('zh', variant),
dataset=DATASET,
license=LICENSE,
sources=SOURCE)
out.write(edge)

edge = make_edge(rel='/r/RelatedTo',
edge = make_edge(rel='/r/Synonym',
start=standardized_concept_uri('zh-Hans', simplified),
end=standardized_concept_uri('zh', variant),
dataset=DATASET,
@@ -204,38 +197,37 @@ def handle_file(filename, output_file):
out.write(edge)
continue

# Handle abbreviations
if re.match(ABBR_REGEX, definition):
abbreviations = extract_abbreviations(definition)
if abbreviations:
for abbr in abbreviations:
edge = make_edge(rel='/r/RelatedTo',
start=standardized_concept_uri('zh-Hant', traditional),
end=standardized_concept_uri('zh', abbr),
dataset=DATASET,
license=LICENSE,
sources=SOURCE)
out.write(edge)

edge = make_edge(rel='/r/RelatedTo',
start=standardized_concept_uri('zh-Hans', simplified),
end=standardized_concept_uri('zh', abbr),
dataset=DATASET,
license=LICENSE,
sources=SOURCE)
out.write(edge)
continue
if re.match(SEE_ALSO_REGEX, definition):
references = extract_han_characters(definition)
for reference in references:
edge = make_edge(rel='/r/RelatedTo',
start=standardized_concept_uri('zh-Hant', traditional),
end=standardized_concept_uri('zh', reference),
dataset=DATASET,
license=LICENSE,
sources=SOURCE)
out.write(edge)

edge = make_edge(rel='/r/RelatedTo',
start=standardized_concept_uri('zh-Hans', simplified),
end=standardized_concept_uri('zh', reference),
dataset=DATASET,
license=LICENSE,
sources=SOURCE)
out.write(edge)

# Remove 'lit.', 'fig.'
definition = LIT_FIG_REGEX.sub('', definition)

# Expand sth and sb
definition = definition.replace('sth', 'something')
definition = definition.replace('sb', 'someone')
definition = SB_REGEX.sub('someone', definition)
definition = STH_REGEX.sub('something', definition)

# Additional cleanups
definition = remove_reference_syntax(definition)
definition = remove_additional_info(definition)

# Skip long definitions
# Skip long definitions and make an edge out of remaining information
if len(definition.split()) < 6:
edge = make_edge(rel='/r/Synonym',
start=standardized_concept_uri('zh-Hant', traditional),
@@ -245,7 +237,7 @@ def handle_file(filename, output_file):
sources=SOURCE)
out.write(edge)

edge = make_edge(rel='/r/RelatedTo',
edge = make_edge(rel='/r/Synonym',
start=standardized_concept_uri('zh-Hans', simplified),
end=standardized_concept_uri('en', definition),
dataset=DATASET,
9 changes: 4 additions & 5 deletions conceptnet5/readers/emoji.py
@@ -3,8 +3,7 @@
from conceptnet5.uri import Licenses
from conceptnet5.nodes import standardized_concept_uri

import xml.etree.ElementTree as ET

import xml.etree.ElementTree as ET

REL = '/r/SymbolOf'
DATASET = '/d/emoji'
@@ -27,11 +26,11 @@ def handle_file(input_file, output_file):
tree = ET.parse(input_file)
out = MsgpackStreamWriter(output_file)
root = tree.getroot()
lang = root[0][1].attrib['type']
lang = root[0][1].attrib['type'] # language is at position [1] within the child node [0]
for annotation in root[1]:

for word in strip_words(annotation.text):
start = standardized_concept_uri('mul', annotation.attrib['cp'])
start = standardized_concept_uri('mul', annotation.attrib['cp'])
end = standardized_concept_uri(lang, word)
edge = make_edge(REL, start, end, DATASET, LICENSE, SOURCE)
out.write(edge)
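The indexing root[0][1] assumes the CLDR annotation layout sketched below (a minimal, hypothetical sample, not from this commit): the identity block comes first with the language element as its second child, and the annotations follow.

import xml.etree.ElementTree as ET

sample = """
<ldml>
  <identity>
    <version number="$Revision$"/>
    <language type="en"/>
  </identity>
  <annotations>
    <annotation cp="🐱">cat | kitten</annotation>
  </annotations>
</ldml>
"""
root = ET.fromstring(sample)
print(root[0][1].attrib['type'])   # 'en'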
2 changes: 0 additions & 2 deletions conceptnet5/vectors/cli.py
@@ -271,8 +271,6 @@ def export_background(input_filename, output_dir, concepts_filename, language, t
big_frame = make_big_frame(frame, language)
small_frame = make_small_frame(big_frame, concepts_filename, language)
replacements = make_replacements_faster(small_frame, big_frame, tree_depth, language, verbose)
print('replacements: ', len(replacements))
print('labels: ', small_frame.shape)
save_replacements(path.join(output_dir, 'replacements.msgpack'.format(language)),
replacements)

26 changes: 16 additions & 10 deletions conceptnet5/vectors/evaluation/analogy.py
@@ -143,8 +143,8 @@ def read_bats(category):
For some questions, BATS contains multiple answers. For example, the answer to an
analogy question Nicaragua:Spanish::Switzerland:? could be German, French, or Italian. These
will all be supplied as a list if they are an answer. However, if they are a part of a
question, only the first one will be used.
will all be supplied as a list if they are an answer (b2). However, if they are a part of a
question (b1), only the first one will be used.
"""
filename = 'bats/{}.txt'.format(category)
pairs = []
@@ -163,16 +163,22 @@ def read_bats(category):

quads = []
for i in range(len(pairs)):
a_pair = pairs[i]
a_pair[1] = a_pair[1][0] # select only one term for b1, even if more may be available
b_pairs = [pair for j, pair in enumerate(pairs) if j != i]
for b_pair in b_pairs:
first_pair = pairs[i]
first_pair[1] = first_pair[1][0] # select only one term for b1, even if more may be available
second_pairs = [pair for j, pair in enumerate(pairs) if j != i]
for second_pair in second_pairs:
quad = []
quad.extend([standardized_uri('en', term) for term in a_pair + b_pair[:1]])
if isinstance(b_pair[1], list):
quad.append([standardized_uri('en', term) for term in b_pair[1]])

# the first three elements of a quad are the two terms in first_pair and the first
# term of the second_pair
quad.extend([standardized_uri('en', term) for term in first_pair + second_pair[:1]])

# if the second element of the second pair (b2) is a list, it means there are multiple
# correct answers for b2. We want to keep all of them.
if isinstance(second_pair[1], list):
quad.append([standardized_uri('en', term) for term in second_pair[1]])
else:
quad.append(standardized_uri('en', b_pair[1]))
quad.append(standardized_uri('en', second_pair[1]))
quads.append(quad)
return quads
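To make the quad layout concrete, a hypothetical example (not part of this commit, and assuming pairs have already been read as [word, answers] lists) built from the docstring's analogy:

# first_pair  = ['nicaragua', ['spanish']]
# second_pair = ['switzerland', ['german', 'french', 'italian']]
# resulting quad:
# ['/c/en/nicaragua', '/c/en/spanish', '/c/en/switzerland',
#  ['/c/en/german', '/c/en/french', '/c/en/italian']]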

1 change: 1 addition & 0 deletions conceptnet5/vectors/transforms.py
@@ -70,6 +70,7 @@ def l2_normalize_rows(frame):
L_2-normalize the rows of this DataFrame, so their lengths in Euclidean
distance are all 1. This enables cosine similarities to be computed as
dot-products between these rows.
A DataFrame of zeros will be normalized to zeros.
"""
index = frame.index
return pd.DataFrame(data=normalize(frame, norm='l2', copy=False, axis=1), index=index)
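A small usage sketch (not part of this commit) showing both the normalization and the all-zeros behavior noted in the docstring:

import pandas as pd
from conceptnet5.vectors.transforms import l2_normalize_rows

frame = pd.DataFrame([[3.0, 4.0], [0.0, 0.0]], index=['/c/en/cat', '/c/en/dog'])
l2_normalize_rows(frame)
# '/c/en/cat' becomes [0.6, 0.8]; the all-zero row stays all zeros.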