Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes from the code review on 11/17 #147

Merged
merged 11 commits into from
Dec 1, 2017
56 changes: 29 additions & 27 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -50,33 +50,6 @@ WIKT_PARSER_VERSION = "1"

RETROFIT_SHARDS = 6

# Dataset filenames
# =================
# The goal of reader steps is to produce Msgpack files, and later CSV files,
# with these names.
#
# We distinguish *core dataset names*, which collectively determine the set of
# terms that ConceptNet will attempt to represent, from the additional datasets
# that will mainly be used to find more information about those terms.


CORE_DATASET_NAMES = [
"jmdict/jmdict",
"nadya/nadya",
"ptt_petgame/api",
"opencyc/opencyc",
"verbosity/verbosity",
"wordnet/wordnet",
"cedict/cedict"
]
CORE_DATASET_NAMES += ["conceptnet4/conceptnet4_flat_{}".format(num) for num in range(10)]
CORE_DATASET_NAMES += ["ptt_petgame/part{}".format(num) for num in range(1, 13)]
CORE_DATASET_NAMES += ["wiktionary/{}".format(lang) for lang in WIKTIONARY_LANGUAGES]
CORE_DATASET_NAMES += ["emoji/{}".format(lang) for lang in EMOJI_LANGUAGES]


DATASET_NAMES = CORE_DATASET_NAMES + ["dbpedia/dbpedia_en"]

RAW_DATA_URL = "https://conceptnet.s3.amazonaws.com/raw-data/2016"
PRECOMPUTED_DATA_PATH = "/precomputed-data/2016"
PRECOMPUTED_DATA_URL = "https://conceptnet.s3.amazonaws.com" + PRECOMPUTED_DATA_PATH
Expand All @@ -103,6 +76,35 @@ if TESTMODE:
HASH_WIDTH = 12
RAW_DATA_URL = "/missing/data"
PRECOMPUTED_DATA_URL = "/missing/data"
EMOJI_LANGUAGES = ['en', 'en_001']


# Dataset filenames
# =================
# The goal of reader steps is to produce Msgpack files, and later CSV files,
# with these names.
#
# We distinguish *core dataset names*, which collectively determine the set of
# terms that ConceptNet will attempt to represent, from the additional datasets
# that will mainly be used to find more information about those terms.


CORE_DATASET_NAMES = [
"jmdict/jmdict",
"nadya/nadya",
"ptt_petgame/api",
"opencyc/opencyc",
"verbosity/verbosity",
"wordnet/wordnet",
"cedict/cedict"
]
CORE_DATASET_NAMES += ["conceptnet4/conceptnet4_flat_{}".format(num) for num in range(10)]
CORE_DATASET_NAMES += ["ptt_petgame/part{}".format(num) for num in range(1, 13)]
CORE_DATASET_NAMES += ["wiktionary/{}".format(lang) for lang in WIKTIONARY_LANGUAGES]
CORE_DATASET_NAMES += ["emoji/{}".format(lang) for lang in EMOJI_LANGUAGES]


DATASET_NAMES = CORE_DATASET_NAMES + ["dbpedia/dbpedia_en"]


rule all:
Expand Down
150 changes: 71 additions & 79 deletions conceptnet5/readers/cc_cedict.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,85 +27,80 @@
LICENSE = Licenses.cc_sharealike
SOURCE = [{'contributor': '/s/resource/cc_cedict/2017-10'}]

LINE_REGEX = re.compile(r'(.+)\s(.+)\[.+\]\s/(.+)/') # separate traditional and simplified words
ABBR_REGEX = re.compile(r'(\b|\s)abbr. (to|of|for)') # abbreviations
BRACKETS_REGEX = re.compile(r'\[.+?\]') # pronunciation
DATE_RANGE_REGEX = re.compile(r'(.+?)\s\(.+\d.+\),') # date range
DEFINITIONS_REGEX = re.compile(r'/|;') # separate definitions
HAN_CHAR_REGEX = regex.compile('([\p{IsIdeo}]+[\|·]?)+') # Han characters
LINE_REGEX = re.compile(r'(.+)\s(.+)\[.+\]\s/(.+)/') # separate traditional and simplified words
LIT_FIG_REGEX = re.compile(r'(\b|\s)(fig|lit).\s') # literally/figuratively
PAREN_REGEX = re.compile(r'\(.+?\)') # parenthesis
HAN_CHAR_REGEX = regex.compile('([\p{IsIdeo}]+[\|·]?)+') # Han characters
BRACKETS_REGEX = re.compile(r'\[.+?\]') # pronunciation
VARIANT_REGEX = re.compile(r'(see (also )?|(old )?variant of |archaic version of |also written)')
LIT_FIG_REGEX = re.compile(r'(\b|\s)(fig|lit).\s')
ABBR_REGEX = re.compile(r'(\b|\s)abbr. (to|of|for)')
SB_REGEX = re.compile(r'\b(sb)\b')
STH_REGEX = re.compile(r'\b(sth)\b')
SEE_ALSO_REGEX = re.compile(r'see( also)?') # see also
VARIANT_REGEX = re.compile(r'((old |Japanese )?variant of|archaic version of|also '
r'written|same as)\s') # variant syntax


def remove_reference_syntax(definition):
"""
Example: Jiajiang county in Leshan 樂山|乐山[Le4 shan1]
Definitions in English may contain references to Chinese words. The reference syntax contains
vertical bar-separated Han characters as well as a pronunciation enclosed in brackets,
as in "Jiajiang county in Leshan 樂山|乐山[Le4 shan1]".

Remove the reference syntax.
"""
definition = HAN_CHAR_REGEX.sub('', definition)
return BRACKETS_REGEX.sub('', definition)


def remove_additional_info(definition):
"""
Remove the second sentence of the definition
Remove any information in a definition after the first comma. This part of the definition
usually provides additional details. For example, in a definition such as 'Salt Lake City,
capital of Utah', 'capital of Utah' is removed.
"""
return definition.split(',')[0]


def extract_person(match):
"""
Example: "Pierre-Auguste Renoir (1841-1919), French impressionist painter"
Check if a date range is mentioned in a definition. This is usually the case when a person is
being defined. In that case, we want to only extract the name, without the date range or the
second, CV sentence.

Returns:
a list of names extracted from a definition
Extract the name of a person mentioned in a definition. A person definition contains a
date range (e.g. when they were alive or active) and a biography sentence, for example:
"Pierre-Auguste Renoir (1841-1919), French impressionist painter". Occasionally, two forms of a
person's name are provided, as in "Maria Skłodowska-Curie or Marie Curie". In that case,
we return both names and make an edge for each of them.
"""
person = match.groups()[0]
if ',' in person:
person = remove_additional_info(person) # skip the second sentence

person = HAN_CHAR_REGEX.sub('', person)
person = BRACKETS_REGEX.sub('', person) # delete pronunciation
person = person.split(' or ') # Take care of "Frederic Chopin or Fryderyk Franciszek Chopin"
person = remove_additional_info(person)
person = remove_reference_syntax(person)
person = person.split(' or ') # get both versions of a person's name
return person


def extract_measure_words(definition):
"""
Example: "CL:枝[zhi1],根[gen1],個|个[ge4],把[ba3]"
Extract measure words (classifiers). Measure words are prefixed with "CL:" and separated by a
comma. For example: "CL:枝[zhi1],根[gen1],個|个[ge4],把[ba3]"
"""
words = definition[3:] # skip 'CL:'
words = words.split(',')
words = words.split(',') # separate each measure word
words = [BRACKETS_REGEX.sub('', word) for word in words]
measure_words = []
for word in words:
measure_words.extend(word.split('|'))
measure_words.extend(word.split('|')) # separate variants of a measure word
return measure_words


def extract_variants(definition):
def extract_han_characters(definition):
"""
Example: "variant of 齊大非偶|齐大非偶[qi2 da4 fei1 ou3]"
Extract Han characters. This is used when extracting variants, abbreviations and references
to other characters.
"""
variants = VARIANT_REGEX.sub('', definition)
variants = BRACKETS_REGEX.sub('', variants)
variants = variants.split('|')
return variants


def extract_abbreviations(definition):
"""
abbr.for Luxembourg 盧森堡 | 卢森堡[Lu2 sen1 bao3]
Only return the Chinese word for which this word is an abbreviation.
"""
reference = regex.search(HAN_CHAR_REGEX, definition)
if reference:
reference = reference.group(0)
reference = reference.split('|')
return reference
return
chars = regex.search(HAN_CHAR_REGEX, definition)
if chars:
return chars.group(0).split('|')
return []


def handle_file(filename, output_file):
Expand All @@ -129,8 +124,7 @@ def handle_file(filename, output_file):
sources=SOURCE)
out.write(edge)

definitions = re.split(r'\/|;', definitions)
for definition in definitions:
for definition in re.split(DEFINITIONS_REGEX, definitions):

# Skip pronunciation information
if 'Taiwan pr.' in definition or 'also pr.' in definition:
Expand Down Expand Up @@ -158,9 +152,6 @@ def handle_file(filename, output_file):
out.write(edge)
continue

# Remove clarifying information in parentheses
definition = PAREN_REGEX.sub('', definition)

# Check if a word is a measure word
if definition.startswith('CL:'):
related_words = extract_measure_words(definition)
Expand All @@ -182,20 +173,22 @@ def handle_file(filename, output_file):
out.write(edge)
continue

# Check if a word is a form/variant of a different word
variant_match = re.match(VARIANT_REGEX, definition)
if variant_match:
variants = extract_variants(definition)
# Remove clarifying information in parentheses
definition = PAREN_REGEX.sub('', definition)

# Handle variants/word forms and abbreviations
if re.match(VARIANT_REGEX, definition) or re.match(ABBR_REGEX, definition):
variants = extract_han_characters(definition)
for variant in variants:
edge = make_edge(rel='/r/RelatedTo',
edge = make_edge(rel='/r/Synonym',
start=standardized_concept_uri('zh-Hant', traditional),
end=standardized_concept_uri('zh', variant),
dataset=DATASET,
license=LICENSE,
sources=SOURCE)
out.write(edge)

edge = make_edge(rel='/r/RelatedTo',
edge = make_edge(rel='/r/Synonym',
start=standardized_concept_uri('zh-Hans', simplified),
end=standardized_concept_uri('zh', variant),
dataset=DATASET,
Expand All @@ -204,38 +197,37 @@ def handle_file(filename, output_file):
out.write(edge)
continue

# Handle abbreviations
if re.match(ABBR_REGEX, definition):
abbreviations = extract_abbreviations(definition)
if abbreviations:
for abbr in abbreviations:
edge = make_edge(rel='/r/RelatedTo',
start=standardized_concept_uri('zh-Hant', traditional),
end=standardized_concept_uri('zh', abbr),
dataset=DATASET,
license=LICENSE,
sources=SOURCE)
out.write(edge)

edge = make_edge(rel='/r/RelatedTo',
start=standardized_concept_uri('zh-Hans', simplified),
end=standardized_concept_uri('zh', abbr),
dataset=DATASET,
license=LICENSE,
sources=SOURCE)
out.write(edge)
continue
if re.match(SEE_ALSO_REGEX, definition):
references = extract_han_characters(definition)
for reference in references:
edge = make_edge(rel='/r/RelatedTo',
start=standardized_concept_uri('zh-Hant', traditional),
end=standardized_concept_uri('zh', reference),
dataset=DATASET,
license=LICENSE,
sources=SOURCE)
out.write(edge)

edge = make_edge(rel='/r/RelatedTo',
start=standardized_concept_uri('zh-Hans', simplified),
end=standardized_concept_uri('zh', reference),
dataset=DATASET,
license=LICENSE,
sources=SOURCE)
out.write(edge)

# Remove 'lit.', 'fig.'
definition = LIT_FIG_REGEX.sub('', definition)

# Expand sth and sb
definition = definition.replace('sth', 'something')
definition = definition.replace('sb', 'someone')
definition = SB_REGEX.sub('someone', definition)
definition = STH_REGEX.sub('something', definition)

# Additional cleanups
definition = remove_reference_syntax(definition)
definition = remove_additional_info(definition)

# Skip long definitions
# Skip long definitions and make an edge out of remaining information
if len(definition.split()) < 6:
edge = make_edge(rel='/r/Synonym',
start=standardized_concept_uri('zh-Hant', traditional),
Expand All @@ -245,7 +237,7 @@ def handle_file(filename, output_file):
sources=SOURCE)
out.write(edge)

edge = make_edge(rel='/r/RelatedTo',
edge = make_edge(rel='/r/Synonym',
start=standardized_concept_uri('zh-Hans', simplified),
end=standardized_concept_uri('en', definition),
dataset=DATASET,
Expand Down
9 changes: 4 additions & 5 deletions conceptnet5/readers/emoji.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
from conceptnet5.uri import Licenses
from conceptnet5.nodes import standardized_concept_uri

import xml.etree.ElementTree as ET

import xml.etree.ElementTree as ET

REL = '/r/SymbolOf'
DATASET = '/d/emoji'
Expand All @@ -27,11 +26,11 @@ def handle_file(input_file, output_file):
tree = ET.parse(input_file)
out = MsgpackStreamWriter(output_file)
root = tree.getroot()
lang = root[0][1].attrib['type']
lang = root[0][1].attrib['type'] # language is at position [1] within the child node [0]
for annotation in root[1]:

for word in strip_words(annotation.text):
start = standardized_concept_uri('mul', annotation.attrib['cp'])
start = standardized_concept_uri('mul', annotation.attrib['cp'])
end = standardized_concept_uri(lang, word)
edge = make_edge(REL, start, end, DATASET, LICENSE, SOURCE)
out.write(edge)
2 changes: 0 additions & 2 deletions conceptnet5/vectors/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,6 @@ def export_background(input_filename, output_dir, concepts_filename, language, t
big_frame = make_big_frame(frame, language)
small_frame = make_small_frame(big_frame, concepts_filename, language)
replacements = make_replacements_faster(small_frame, big_frame, tree_depth, language, verbose)
print('replacements: ', len(replacements))
print('labels: ', small_frame.shape)
save_replacements(path.join(output_dir, 'replacements.msgpack'.format(language)),
replacements)

Expand Down
26 changes: 16 additions & 10 deletions conceptnet5/vectors/evaluation/analogy.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,8 @@ def read_bats(category):

For some questions, BATS contains multiple answers. For example, the answer to an
analogy question Nicaragua:Spanish::Switzerland:? could be German, French, or Italian. These
will all be supplied as a list if they are an answer. However, if they are a part of a
question, only the first one will be used.
will all be supplied as a list if they are an answer (b2). However, if they are a part of a
question (b1), only the first one will be used.
"""
filename = 'bats/{}.txt'.format(category)
pairs = []
Expand All @@ -163,16 +163,22 @@ def read_bats(category):

quads = []
for i in range(len(pairs)):
a_pair = pairs[i]
a_pair[1] = a_pair[1][0] # select only one term for b1, even if more may be available
b_pairs = [pair for j, pair in enumerate(pairs) if j != i]
for b_pair in b_pairs:
first_pair = pairs[i]
first_pair[1] = first_pair[1][0] # select only one term for b1, even if more may be available
second_pairs = [pair for j, pair in enumerate(pairs) if j != i]
for second_pair in second_pairs:
quad = []
quad.extend([standardized_uri('en', term) for term in a_pair + b_pair[:1]])
if isinstance(b_pair[1], list):
quad.append([standardized_uri('en', term) for term in b_pair[1]])

# the first three elements of a quad are the two terms in first_pair and the first
# term of the second_pair
quad.extend([standardized_uri('en', term) for term in first_pair + second_pair[:1]])

# if the second element of the second pair (b2) is a list, it means there are multiple
# correct answers for b2. We want to keep all of them.
if isinstance(second_pair[1], list):
quad.append([standardized_uri('en', term) for term in second_pair[1]])
else:
quad.append(standardized_uri('en', b_pair[1]))
quad.append(standardized_uri('en', second_pair[1]))
quads.append(quad)
return quads

Expand Down
1 change: 1 addition & 0 deletions conceptnet5/vectors/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def l2_normalize_rows(frame):
L_2-normalize the rows of this DataFrame, so their lengths in Euclidean
distance are all 1. This enables cosine similarities to be computed as
dot-products between these rows.
A DataFrame of zeros will be normalized to zeros.
"""
index = frame.index
return pd.DataFrame(data=normalize(frame, norm='l2', copy=False, axis=1), index=index)
Expand Down
Loading