
Commit

Initial commit
lggruspe committed May 23, 2023
0 parents commit e021263
Showing 10 changed files with 1,228 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
*.pyc
/build/
/dist/
/env/
674 changes: 674 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions Makefile
@@ -0,0 +1,31 @@
DATE=$(shell date '+%Y.%m.%d')
PARENT="colexification-graphs-$(DATE)"

# Expects build/kaikki.jsonl to exist.
# Download from https://kaikki.org/dictionary/All%20languages%20combined/kaikki.org-dictionary-all.json
.PHONY: build
build: build/graph.json

build/wordsenses.tsv: build/kaikki.jsonl
python -m colexification_graphs.wordsenses $< > $@ 2> build/wordsenses.err

build/graph.tsv: build/wordsenses.tsv
python -m colexification_graphs.graph $< > $@

build/graph.json: build/graph.tsv
python -m colexification_graphs.post $< > $@

.PHONY: check
check:
flake8 colexification_graphs
pylint colexification_graphs
mypy --strict colexification_graphs

.PHONY: dist
dist: build
mkdir -p dist
cd build; \
mkdir -p "$(PARENT)"; \
cp ../README.md graph.json graph.tsv "$(PARENT)"; \
tar -czvf "$(PARENT).tar.gz" "$(PARENT)"; \
mv "$(PARENT).tar.gz" ../dist
44 changes: 44 additions & 0 deletions README.md
@@ -0,0 +1,44 @@
# colexification-graphs

Colexification graphs extracted from Wiktionary.

## Interpretation

A colexification graph is an undirected, weighted graph.

- Nodes represent word senses (sense-annotated words).
- The weight of the edge between two nodes is the number of languages in which the same word is used for both meanings.

For example, an edge of weight 12 between the senses for "moon" and "month" would mean that twelve languages use the same word for both.

## Format

The graphs are available in TSV and JSON formats.

TSV columns:

1. node 1 word
2. node 1 sense
3. node 2 word
4. node 2 sense
5. weight of edge between node 1 and node 2

The JSON file is compatible with [Cytoscape.js](https://js.cytoscape.org/).
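As a rough illustration (not part of the repository), the TSV edges can be loaded into a `networkx` graph in a few lines; the file name `graph.tsv` matches the Makefile target, and the column order follows the list above:

```python
# Illustrative only; assumes the five-column layout described above.
import csv

import networkx as nx

graph = nx.Graph()
with open("graph.tsv", encoding="utf-8") as file:
    for word1, sense1, word2, sense2, weight in csv.reader(file, delimiter="\t"):
        graph.add_edge((word1, sense1), (word2, sense2), weight=int(weight))

print(graph.number_of_nodes(), "nodes,", graph.number_of_edges(), "edges")
```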

## Licenses

### Scripts

Copyright 2023 Levi Gruspe

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.

### Data

Copyright 2023 Levi Gruspe

The published colexification graphs are made available under the [Creative Commons Attribution-ShareAlike License](https://creativecommons.org/licenses/by-sa/3.0/).
This work is derived from Wiktionary.
The copyright of the original work belongs to Wiktionary's editors and contributors.
3 changes: 3 additions & 0 deletions colexification_graphs/__init__.py
@@ -0,0 +1,3 @@
# Copyright 2023 Levi Gruspe
# Licensed under GNU GPLv3 or later
# See https://www.gnu.org/licenses/gpl-3.0.en.html
129 changes: 129 additions & 0 deletions colexification_graphs/graph.py
@@ -0,0 +1,129 @@
# Copyright 2023 Levi Gruspe
# Licensed under GNU GPLv3 or later
# See https://www.gnu.org/licenses/gpl-3.0.en.html
"""Build colexification graph."""

from argparse import ArgumentParser, Namespace
from collections import Counter
from csv import reader, writer
from pathlib import Path
import sys
import typing as t


Language: t.TypeAlias = str
Word: t.TypeAlias = str
Translation: t.TypeAlias = tuple[Language, Word]
Sense: t.TypeAlias = tuple[str, str]
Edge: t.TypeAlias = tuple[Sense, Sense]


def parse_args() -> Namespace:
"""Parse command-line arguments."""
parser = ArgumentParser(description=__doc__)
parser.add_argument(
"--edge-cutoff",
dest="edge_cutoff",
nargs=1,
type=int,
help="minimum weight required for edges in graph (default: 4)",

# Removes > 2/3 of all edges
default=[4],
)

parser.add_argument(
"--sense-cutoff",
dest="sense_cutoff",
nargs=1,
type=int,
help="minimum languages required for word senses (default: 20)",

# Removes > 80% of all word senses.
default=[20],
)

parser.add_argument(
"tsv",
type=Path,
help="path to TSV file (columns: language, word, sense, gloss)",
)
args = parser.parse_args()
args.edge_cutoff = args.edge_cutoff[0]
args.sense_cutoff = args.sense_cutoff[0]
return args


class InvalidRecord(Exception):
"""Raised when an invalid record is found in a TSV file."""


def get_rows(
tsv: Path,
silent: bool = True,
) -> t.Iterator[tuple[str, str, str, str]]:
"""Read rows from TSV file.
If `silent` is `True`, silently ignores invalid rows.
If not, raises `InvalidRecord`.
"""
with open(tsv, encoding="utf-8") as file:
for row in reader(file, delimiter="\t"):
try:
language, word, sense, gloss = row
except ValueError as exc:
if silent:
continue
raise InvalidRecord from exc

yield language, word, sense, gloss


def write_graph(graph: Counter[Edge]) -> None:
"""Write graph to stdout in TSV format."""
out = writer(sys.stdout, delimiter="\t")
for (source, target), weight in graph.most_common():
sense_s, gloss_s = source
sense_t, gloss_t = target
row = (sense_s, gloss_s, sense_t, gloss_t, weight)
out.writerow(row)


def main(args: Namespace) -> None:
"""Script entrypoint."""
translation_senses: dict[Translation, set[Sense]] = {}
sense_languages: dict[Sense, set[Language]] = {}
for language, word, sense, gloss in get_rows(args.tsv, silent=True):
translation = (language, word)
translation_senses.setdefault(translation, set()).add((sense, gloss))
sense_languages.setdefault((sense, gloss), set()).add(language)

# Only include word senses that are translated in enough languages.
senses = {
sense
for sense, languages in sense_languages.items()
if len(languages) >= args.sense_cutoff
}

# Create colexification graph.
graph_languages: dict[Edge, set[Language]] = {}
for (language, _), word_senses in translation_senses.items():
nodes = sorted(sense for sense in word_senses if sense in senses)
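        # Record this language on the edge between every unordered pair of
        # retained senses that this word colexifies.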
for i in range(1, len(nodes)):
for j in range(i):
graph_languages.setdefault(
(nodes[j], nodes[i]),
set(),
).add(language)

write_graph(
Counter({
edge: weight
for edge, languages in graph_languages.items()
if (weight := len(languages)) >= args.edge_cutoff
}),
)


if __name__ == "__main__":
main(parse_args())
111 changes: 111 additions & 0 deletions colexification_graphs/post.py
@@ -0,0 +1,111 @@
# Copyright 2023 Levi Gruspe
# Licensed under GNU GPLv3 or later
# See https://www.gnu.org/licenses/gpl-3.0.en.html

# pylint: disable=too-few-public-methods
"""Turn TSV file of graph edges into a cytoscape JSON file."""

from argparse import ArgumentParser, Namespace
from csv import reader
from json import dumps
from pathlib import Path

import networkx as nx # type: ignore


def parse_args() -> Namespace:
"""Parse command-line arguments."""
parser = ArgumentParser(description=__doc__)
parser.add_argument(
"tsv",
type=Path,
help=(
"path to TSV file "
"(columns: word 1, sense 1, word 2, sense 2, weight)"
),
)
return parser.parse_args()


class NodeDirectory:
"""Maps word senses to IDs."""
def __init__(self) -> None:
self.counter = 0
self.ids: dict[tuple[str, str], int] = {}

def get(self, sense: tuple[str, str]) -> int:
"""Return sense ID."""
self.counter += 1
return self.ids.setdefault(sense, self.counter)


def load_graph(tsv: Path) -> nx.Graph:
"""Create graph from weighted edges in TSV file.
Minimum edge weight to include in the graph.
"""
graph = nx.Graph()
nodes = NodeDirectory()
with open(tsv, encoding="utf-8") as file:
for row in reader(file, delimiter="\t"):
(
source_word,
source_sense,
target_word,
target_sense,
weight,
) = row
source = nodes.get((source_word, source_sense))
target = nodes.get((target_word, target_sense))

graph.add_node(
source,
word=source_word,
sense=source_sense,
)
graph.add_node(
target,
word=target_word,
sense=target_sense,
)
graph.add_edge(source, target, weight=int(weight))
return graph


def to_cytoscape(graph: nx.Graph) -> str:
"""Convert networkx graph into a cytoscape JSON file."""
nodes = [
{
"data": {
**data,
"id": node,
},
}
for node, data in graph.nodes(data=True)
]
edges = [
{
"data": {
**data,
"source": source,
"target": target,
},
}
for source, target, data in graph.edges(data=True)
]
return dumps({
"elements": {
"nodes": nodes,
"edges": edges,
},
})


def main(args: Namespace) -> None:
"""Script entrypoint."""
graph = load_graph(args.tsv)
print(to_cytoscape(graph))


if __name__ == "__main__":
main(parse_args())
34 changes: 34 additions & 0 deletions colexification_graphs/schema.py
@@ -0,0 +1,34 @@
# Copyright 2023 Levi Gruspe
# Licensed under GNU GPLv3 or later
# See https://www.gnu.org/licenses/gpl-3.0.en.html
"""kaikki.org dictionary word schema."""

import typing as t


class TranslationSchema(t.TypedDict):
"""Schema for values inside .senses[*].translations."""
lang: str
code: t.NotRequired[str] # Some translations have missing codes :(
word: t.NotRequired[str] # Some translations have missing words :(
roman: t.NotRequired[str]
sense: t.NotRequired[str] # If `None`, treat as equal to `word`.


class SenseSchema(t.TypedDict):
"""Schema for values inside .senses."""
# There are other fields, but we only need the translations.
translations: t.NotRequired[list[TranslationSchema]]


class Schema(t.TypedDict):
"""Schema for each line in a kaikki.org dictionary."""
word: str
pos: str
lang: str
lang_code: str
senses: list[SenseSchema]
translations: t.NotRequired[list[TranslationSchema]]


__all__ = ["Schema", "SenseSchema", "TranslationSchema"]
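For reference, here is a made-up entry that satisfies `Schema`, showing only the fields declared above; it is an illustration, not data from the actual dump.

```python
# Hypothetical kaikki.org-style entry, used only to illustrate the schema above.
from colexification_graphs.schema import Schema

example: Schema = {
    "word": "Mond",
    "pos": "noun",
    "lang": "German",
    "lang_code": "de",
    "senses": [
        {
            "translations": [
                {"lang": "Tagalog", "code": "tl", "word": "buwan", "sense": "moon"},
            ],
        },
    ],
}
```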
