Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cedict reader #138

Merged
merged 8 commits into from
Nov 15, 2017
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ CORE_DATASET_NAMES = [
"opencyc/opencyc",
"verbosity/verbosity",
"wordnet/wordnet",
"cedict/cedict"
]
CORE_DATASET_NAMES += ["conceptnet4/conceptnet4_flat_{}".format(num) for num in range(10)]
CORE_DATASET_NAMES += ["ptt_petgame/part{}".format(num) for num in range(1, 13)]
Expand Down Expand Up @@ -325,6 +326,14 @@ rule read_emoji:
shell:
"cn5-read emoji {input} {output}"

# Read the raw CC-CEDICT Chinese-English dictionary dump (gzipped text)
# and produce ConceptNet edges via the cc_cedict reader.
rule read_cc_cedict:
    input:
        DATA + "/raw/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz"
    output:
        DATA + "/edges/cedict/cedict.msgpack",
    shell:
        "cn5-read cc_cedict {input} {output}"


# Converting msgpack to csv
# =========================
Expand Down
252 changes: 252 additions & 0 deletions conceptnet5/readers/cc_cedict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
"""
CC-CEDICT is a continuation of the CEDICT project started by Paul Denisowski in 1997 with the aim
to provide a complete downloadable Chinese to English dictionary with pronunciation in pinyin
for the Chinese characters.

Creative Commons Attribution-Share Alike 3.0
http://creativecommons.org/licenses/by-sa/3.0/

Referenced works:
CEDICT - Copyright (C) 1997, 1998 Paul Andrew Denisowski

CC-CEDICT can be downloaded from:
http://www.mdbg.net/chindict/chindict.php?page=cc-cedict
"""

import gzip
import re

from conceptnet5.edges import make_edge
from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter
from conceptnet5.nodes import standardized_concept_uri
from conceptnet5.uri import Licenses

DATASET = '/d/cc_cedict'
LICENSE = Licenses.cc_sharealike
SOURCE = [{'contributor': '/s/resource/cc_cedict/2017-10'}]

# A dictionary entry looks like:
#   Traditional Simplified [pin1 yin1] /definition 1/definition 2/
LINE_REGEX = re.compile(r'(.+)\s(.+)\[.+\]\s/(.+)/')  # separate traditional and simplified words
DATE_RANGE_REGEX = re.compile(r'(.+?)\s\(.+\d.+\),')  # date range
PAREN_REGEX = re.compile(r'\(.+?\)')  # parenthesis
# Runs of Chinese characters, optionally joined by '|' (traditional|simplified
# pairs) or '·' (name separator). Covers CJK Unified Ideographs
# (U+4E00-U+9FFF) and Extension A (U+3400-U+4DBF), which appears in entries
# such as 㐌. Extensions at U+20000 and up are not covered, but CC-CEDICT
# does not use them.
CHINESE_CHAR_REGEX = re.compile(r'([\u3400-\u4dbf\u4e00-\u9fff]+[\|·]?)+')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This appears to exclude the range from U+3400..U+4DBF, which appears in definitions such as:

㐌 㐌 [ta1] /variant of 它[ta1]/

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For what it's worth, it also excludes the other CJK extensions with codepoints U+20000 and up, but CEDICT never uses those anyway.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That was correct for a couple of definitions, so I ended up switching to regex.compile('([\p{IsIdeo}]+[\|·]?)+'), per your suggestion.

BRACKETS_REGEX = re.compile(r'\[.+\]') # pronunciation
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this regex is too greedy and should have a ? on it like PAREN_REGEX does.

I believe that in this definition:

一甲 一甲 [yi1 jia3] /1st rank or top three candidates who passed the imperial examination (i.e. 狀元|状元[zhuang4 yuan2], 榜眼[bang3 yan3], and 探花[tan4 hua1], respectively)/

it will match this text:

[zhuang4 yuan2], 榜眼[bang3 yan3], and 探花[tan4 hua1]

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This particular definition would actually have everything inside the parentheses removed before matching the brackets, but the problem was true for a couple of other definitions, so I changed it.

# Phrases that introduce a cross-reference to another written form
VARIANT_REGEX = re.compile(r'(see (also )?|(old )?variant of |archaic version of |also written)')
# 'lit.' / 'fig.' markers; the period is escaped so only a literal dot is
# matched (unescaped, '.' would also strip e.g. the 's' in 'figs ')
LIT_FIG_REGEX = re.compile(r'(\b|\s)(fig|lit)\.\s')
# 'abbr. to/of/for' introduces an abbreviation definition
ABBR_REGEX = re.compile(r'(\b|\s)abbr\. (to|of|for)')


def remove_reference_syntax(definition):
    """
    Strip cross-reference syntax — runs of Chinese characters and the
    bracketed pinyin that follows them — from an English definition.

    Example: "Jiajiang county in Leshan 樂山|乐山[Le4 shan1]"
    """
    # Chinese characters first, then the leftover bracketed pronunciation
    return BRACKETS_REGEX.sub('', CHINESE_CHAR_REGEX.sub('', definition))


def remove_additional_info(definition):
    """
    Keep only the text before the first comma, dropping the trailing
    clause of the definition.
    """
    return definition.partition(',')[0]


def extract_person(match):
    """
    Extract person name(s) from a definition that matched DATE_RANGE_REGEX.

    Example: "Pierre-Auguste Renoir (1841-1919), French impressionist painter"
    A date range in a definition usually marks a person; we keep only the
    name, dropping the date range and the biographical clause after it.

    Returns:
        a list of names extracted from the definition
    """
    name = match.group(1)
    if ',' in name:
        # Drop the biographical clause after the first comma
        name = remove_additional_info(name)

    # Remove Chinese characters, then the bracketed pronunciation
    name = BRACKETS_REGEX.sub('', CHINESE_CHAR_REGEX.sub('', name))
    # Handle alternatives such as "Frederic Chopin or Fryderyk Franciszek Chopin"
    return name.split(' or ')


def extract_measure_words(definition):
    """
    Extract the measure words (classifiers) from a "CL:" definition.

    Example: "CL:枝[zhi1],根[gen1],個|个[ge4],把[ba3]"
    """
    measure_words = []
    for entry in definition[3:].split(','):  # strip the leading 'CL:'
        cleaned = BRACKETS_REGEX.sub('', entry)  # drop pronunciation
        # '|' separates the traditional and simplified forms
        measure_words.extend(cleaned.split('|'))
    return measure_words


def extract_variants(definition):
    """
    Extract the referenced word(s) from a variant-style definition.

    Example: "variant of 齊大非偶|齐大非偶[qi2 da4 fei1 ou3]"
    """
    # Drop the introducing phrase, then the bracketed pronunciation;
    # '|' separates the traditional and simplified forms
    stripped = BRACKETS_REGEX.sub('', VARIANT_REGEX.sub('', definition))
    return stripped.split('|')


def extract_abbreviations(definition):
    """
    Extract the Chinese word(s) this entry abbreviates, or None if the
    definition contains no Chinese characters.

    Example: "abbr. for Luxembourg 盧森堡|卢森堡[Lu2 sen1 bao3]"
    """
    found = CHINESE_CHAR_REGEX.search(definition)
    if not found:
        return None
    # '|' separates the traditional and simplified forms
    return found.group(0).split('|')


def _write_edge_pair(out, rel, traditional, simplified, end_uri):
    """
    Write two edges relating both written forms of a Chinese term to the
    same end node: one from the traditional form (zh-Hant), then one from
    the simplified form (zh-Hans).
    """
    for language, term in (('zh-Hant', traditional), ('zh-Hans', simplified)):
        edge = make_edge(rel=rel,
                         start=standardized_concept_uri(language, term),
                         end=end_uri,
                         dataset=DATASET,
                         license=LICENSE,
                         sources=SOURCE)
        out.write(edge)


def handle_file(filename, output_file):
    """
    Read a gzipped CC-CEDICT dictionary file and write its contents as
    ConceptNet edges to a msgpack stream.

    filename: path to the gzipped CC-CEDICT export
        (cedict_1_0_ts_utf-8_mdbg.txt.gz)
    output_file: path of the msgpack file of edges to write
    """
    out = MsgpackStreamWriter(output_file)

    # Use a context manager so the file handle is closed even on error
    with gzip.open(filename, 'rt') as infile:
        for line in infile:

            # skip the intro information (comment lines)
            if line.startswith('#'):
                continue

            # parse the line into the traditional form, simplified form and
            # the '/'-separated English definitions
            traditional, simplified, definitions = re.match(LINE_REGEX, line).groups()

            # Make an edge between the traditional and simplified version
            edge = make_edge(rel='/r/Synonym',
                             start=standardized_concept_uri('zh-Hant', traditional),
                             end=standardized_concept_uri('zh-Hans', simplified),
                             dataset=DATASET,
                             license=LICENSE,
                             sources=SOURCE)
            out.write(edge)

            for definition in re.split(r'\/|;', definitions):

                # Skip pronunciation information
                if 'Taiwan pr.' in definition or 'also pr.' in definition:
                    continue

                # A date range usually marks a person; link both Chinese
                # forms to the extracted English name(s)
                person_match = re.match(DATE_RANGE_REGEX, definition)
                if person_match:
                    for person in extract_person(person_match):
                        _write_edge_pair(out, '/r/Synonym', traditional, simplified,
                                         standardized_concept_uri('en', person))
                    continue

                # Remove clarifying information in parentheses
                definition = PAREN_REGEX.sub('', definition)

                # "CL:" definitions list this word's measure words
                if definition.startswith('CL:'):
                    for word in extract_measure_words(definition):
                        _write_edge_pair(out, '/r/RelatedTo', traditional, simplified,
                                         standardized_concept_uri('zh', word))
                    continue

                # "variant of ...", "see also ...", etc. reference another word
                if re.match(VARIANT_REGEX, definition):
                    for variant in extract_variants(definition):
                        _write_edge_pair(out, '/r/RelatedTo', traditional, simplified,
                                         standardized_concept_uri('zh', variant))
                    continue

                # "abbr. to/of/for ..." — link to the expanded Chinese term
                if re.match(ABBR_REGEX, definition):
                    abbreviations = extract_abbreviations(definition)
                    if abbreviations:
                        for abbr in abbreviations:
                            _write_edge_pair(out, '/r/RelatedTo', traditional, simplified,
                                             standardized_concept_uri('zh', abbr))
                    continue

                # Remove 'lit.', 'fig.' markers
                definition = LIT_FIG_REGEX.sub('', definition)

                # Expand sth and sb
                definition = definition.replace('sth', 'something')
                definition = definition.replace('sb', 'someone')
                definition = remove_reference_syntax(definition)
                definition = remove_additional_info(definition)

                # Skip long definitions; a short one acts as a translation
                if len(definition.split()) < 6:
                    # NOTE(review): these two edges use asymmetric relations
                    # (Synonym for the traditional form, RelatedTo for the
                    # simplified form), unlike every branch above — confirm
                    # this is intentional and not a copy-paste slip.
                    edge = make_edge(rel='/r/Synonym',
                                     start=standardized_concept_uri('zh-Hant', traditional),
                                     end=standardized_concept_uri('en', definition),
                                     dataset=DATASET,
                                     license=LICENSE,
                                     sources=SOURCE)
                    out.write(edge)

                    edge = make_edge(rel='/r/RelatedTo',
                                     start=standardized_concept_uri('zh-Hans', simplified),
                                     end=standardized_concept_uri('en', definition),
                                     dataset=DATASET,
                                     license=LICENSE,
                                     sources=SOURCE)
                    out.write(edge)
15 changes: 14 additions & 1 deletion conceptnet5/readers/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import click
from . import (
conceptnet4, dbpedia, emoji, jmdict, nadya, ptt_petgame,
cc_cedict, conceptnet4, dbpedia, emoji, jmdict, nadya, ptt_petgame,
opencyc, verbosity, wiktionary, wordnet
)

Expand Down Expand Up @@ -156,3 +156,16 @@ def run_emoji(input, output):
output: a msgpack file of edges
"""
emoji.handle_file(input, output)


@cli.command(name='cc_cedict')
@click.argument('input', type=click.Path(readable=True, dir_okay=False))
@click.argument('output', type=click.Path(writable=True, dir_okay=False))
def run_cedict(input, output):
    """
    Import data from CC-CEDICT.

    input: a gzipped text file containing CC-CEDICT data
           (cedict_1_0_ts_utf-8_mdbg.txt.gz)
    output: a msgpack file of edges
    """
    # Thin CLI wrapper; all parsing lives in readers/cc_cedict.py
    cc_cedict.handle_file(input, output)