Skip to content

Commit

Permalink
Treat synonyms as translations of concepts
Browse files Browse the repository at this point in the history
  • Loading branch information
lggruspe committed Jun 30, 2023
1 parent 9045bd7 commit febd346
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 3 deletions.
10 changes: 8 additions & 2 deletions colexification_graphs/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,16 @@ class TranslationSchema(t.TypedDict):
sense: t.NotRequired[str] # If `None`, treat as equal to `word`.


class SynonymSchema(t.TypedDict):
    """Schema for values inside .senses[*].synonyms."""
    # The synonym's text. The only field declared here; NOTE(review): the
    # underlying records may carry more keys -- confirm against the data.
    word: str


class SenseSchema(t.TypedDict):
    """Schema for values inside .senses."""
    # There are other fields, but we only need translations and synonyms.
    # Both keys are declared `NotRequired`, so a sense may omit either one.
    translations: t.NotRequired[list[TranslationSchema]]
    synonyms: t.NotRequired[list[SynonymSchema]]


class Schema(t.TypedDict):
Expand All @@ -31,4 +37,4 @@ class Schema(t.TypedDict):
translations: t.NotRequired[list[TranslationSchema]]


__all__ = ["Schema", "SenseSchema", "TranslationSchema"]
__all__ = ["Schema", "SenseSchema", "SynonymSchema", "TranslationSchema"]
42 changes: 41 additions & 1 deletion colexification_graphs/wordsenses.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@

from orjson import loads # pylint: disable=no-name-in-module

from colexification_graphs.schema import Schema, SenseSchema, TranslationSchema
from colexification_graphs.schema import (
Schema,
SenseSchema,
SynonymSchema,
TranslationSchema,
)


TranslationError: t.TypeAlias = t.Literal[
Expand Down Expand Up @@ -42,6 +47,18 @@ def get_word_senses(data: Schema) -> t.Iterator[SenseSchema]:
yield from data["senses"]


def get_synonyms(sense: SenseSchema) -> list[SynonymSchema]:
    """Collect the synonyms listed for a word sense.

    Returns the entries of ``sense["synonyms"]`` in their original order as
    a new list. The result is empty when the key is absent or when any entry
    has an empty ``word``.
    """
    candidates = list(sense.get("synonyms", []))
    # One broken entry suggests the remaining ones are unreliable as well,
    # so the whole batch is discarded rather than just the offender.
    if any(not entry["word"] for entry in candidates):
        return []
    return candidates


def warn(
kind: TranslationError,
language: str,
Expand Down Expand Up @@ -78,6 +95,7 @@ def get_translations(data: Schema) -> t.Iterator[TranslationSchema]:
word and sense values.
A word is not considered a translation of itself.
"""
language_name = data["lang"]
language = data["lang_code"]
word = data["word"]

Expand All @@ -90,6 +108,14 @@ def get_translations(data: Schema) -> t.Iterator[TranslationSchema]:
yield translation

for sense in get_word_senses(data):
# Get sense description of first translation.
# The translations may have different senses, but the assumption is
# they're all roughly the same.
# But to be sure, we won't change the sense values of translations.
# We'll only use `sense_description` for synonyms.
sense_description = None

# Yield translations.
for translation in sense.get("translations", []):
error = check_translation(translation)
if error is not None:
Expand All @@ -98,6 +124,20 @@ def get_translations(data: Schema) -> t.Iterator[TranslationSchema]:
continue
yield translation

# Set sense description.
if sense_description is None:
sense_description = translation.get("sense")

# Treat synonyms as translations.
if sense_description is not None:
for synonym in get_synonyms(sense):
yield {
"lang": language_name,
"code": language,
"word": synonym["word"],
"sense": sense_description,
}


def fix_whitespace(text: str) -> tuple[bool, str]:
"""Fix whitespace characters in text.
Expand Down

0 comments on commit febd346

Please sign in to comment.