-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit e021263
Showing
10 changed files
with
1,228 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
*.pyc | ||
/build/ | ||
/dist/ | ||
/env/ |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Date-stamped name for the distribution archive directory.
DATE=$(shell date '+%Y.%m.%d')
PARENT="colexification-graphs-$(DATE)"

# Expects build/kaikki.jsonl to exist.
# Download from https://kaikki.org/dictionary/All%20languages%20combined/kaikki.org-dictionary-all.json
.PHONY: build
build: build/graph.json

# Extract word-sense rows from the kaikki dump; warnings go to a log file.
build/wordsenses.tsv: build/kaikki.jsonl
	python -m colexification_graphs.wordsenses $< > $@ 2> build/wordsenses.err

# Build the colexification graph (TSV edge list) from the word senses.
build/graph.tsv: build/wordsenses.tsv
	python -m colexification_graphs.graph $< > $@

# Convert the TSV edge list into a Cytoscape.js-compatible JSON file.
build/graph.json: build/graph.tsv
	python -m colexification_graphs.post $< > $@

# Lint and type-check the package.
.PHONY: check
check:
	flake8 colexification_graphs
	pylint colexification_graphs
	mypy --strict colexification_graphs

# Package the README and graph outputs into a tarball under dist/.
.PHONY: dist
dist: build
	mkdir -p dist
	cd build; \
	mkdir -p "$(PARENT)"; \
	cp ../README.md graph.json graph.tsv "$(PARENT)"; \
	tar -czvf "$(PARENT).tar.gz" "$(PARENT)"; \
	mv "$(PARENT).tar.gz" ../dist
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# colexification-graphs | ||
|
||
Colexification graphs extracted from Wiktionary. | ||
|
||
## Interpretation | ||
|
||
A colexification graph is an undirected graph. | ||
|
||
- Nodes represent meanings/sense-annotated words. | ||
- The weight of an edge between two nodes is the number of languages where the same word is used for both meanings. | ||
|
||
## Format | ||
|
||
The graphs are available in TSV and JSON formats. | ||
|
||
TSV columns: | ||
|
||
1. node 1 word | ||
2. node 1 sense | ||
3. node 2 word | ||
4. node 2 sense | ||
5. weight of edge between node 1 and node 2 | ||
|
||
The JSON file is compatible with [Cytoscape.js](https://js.cytoscape.org/). | ||
|
||
## Licenses | ||
|
||
### Scripts | ||
|
||
Copyright 2023 Levi Gruspe | ||
|
||
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. | ||
|
||
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. | ||
|
||
You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. | ||
|
||
### Data | ||
|
||
Copyright 2023 Levi Gruspe | ||
|
||
The published colexification graphs are made available under the [Creative Commons Attribution-ShareAlike License](https://creativecommons.org/licenses/by-sa/3.0/). | ||
This work is derived from Wiktionary. | ||
The copyright of the original work belongs to Wiktionary's editors and contributors. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Copyright 2023 Levi Gruspe | ||
# Licensed under GNU GPLv3 or later | ||
# See https://www.gnu.org/licenses/gpl-3.0.en.html |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
# Copyright 2023 Levi Gruspe | ||
# Licensed under GNU GPLv3 or later | ||
# See https://www.gnu.org/licenses/gpl-3.0.en.html | ||
"""Build colexification graph.""" | ||
|
||
from argparse import ArgumentParser, Namespace | ||
from collections import Counter | ||
from csv import reader, writer | ||
from pathlib import Path | ||
import sys | ||
import typing as t | ||
|
||
|
||
# Type aliases for the mappings built in `main`.
Language: t.TypeAlias = str
Word: t.TypeAlias = str
# A word in a specific language.
Translation: t.TypeAlias = tuple[Language, Word]
# A (sense, gloss) pair of strings; one node in the graph.
Sense: t.TypeAlias = tuple[str, str]
# An undirected edge between two senses (stored with its endpoints sorted).
Edge: t.TypeAlias = tuple[Sense, Sense]
|
||
|
||
def parse_args() -> Namespace:
    """Parse command-line arguments.

    Returns a `Namespace` with `edge_cutoff` (int), `sense_cutoff`
    (int), and `tsv` (Path).
    """
    parser = ArgumentParser(description=__doc__)
    # The original used `nargs=1` and unwrapped the singleton lists
    # after parsing; a plain scalar option with a default gives the
    # same CLI and the same Namespace without the post-processing.
    parser.add_argument(
        "--edge-cutoff",
        dest="edge_cutoff",
        type=int,
        # Removes > 2/3 of all edges
        default=4,
        help="minimum weight required for edges in graph (default: 4)",
    )
    parser.add_argument(
        "--sense-cutoff",
        dest="sense_cutoff",
        type=int,
        # Removes > 80% of all word senses.
        default=20,
        help="minimum languages required for word senses (default: 20)",
    )
    parser.add_argument(
        "tsv",
        type=Path,
        help="path to TSV file (columns: language, word, sense, gloss)",
    )
    return parser.parse_args()
|
||
|
||
class InvalidRecord(Exception):
    """Raised when an invalid record is found in a TSV file.

    A record is invalid when it does not have exactly four columns
    (language, word, sense, gloss).
    """
|
||
|
||
def get_rows(
    tsv: Path,
    silent: bool = True,
) -> t.Iterator[tuple[str, str, str, str]]:
    """Yield (language, word, sense, gloss) rows from a TSV file.

    Rows without exactly four columns are silently skipped when
    `silent` is `True`; otherwise they raise `InvalidRecord`.
    """
    with open(tsv, encoding="utf-8") as file:
        for record in reader(file, delimiter="\t"):
            try:
                lang, term, sense_id, gloss = record
            except ValueError as exc:
                # Wrong number of columns.
                if not silent:
                    raise InvalidRecord from exc
                continue
            yield lang, term, sense_id, gloss
|
||
|
||
def write_graph(graph: Counter[Edge]) -> None:
    """Write the graph to stdout as TSV rows, heaviest edges first.

    Output columns: sense 1, gloss 1, sense 2, gloss 2, edge weight.
    """
    tsv = writer(sys.stdout, delimiter="\t")
    for (node_a, node_b), weight in graph.most_common():
        sense_a, gloss_a = node_a
        sense_b, gloss_b = node_b
        tsv.writerow((sense_a, gloss_a, sense_b, gloss_b, weight))
|
||
|
||
def main(args: Namespace) -> None:
    """Script entrypoint.

    Reads (language, word, sense, gloss) rows from `args.tsv`, applies
    the sense and edge cutoffs, and writes the weighted edge list to
    stdout.
    """
    # Senses attached to each (language, word) pair, and the set of
    # languages in which each sense has a translation.
    senses_of: dict[Translation, set[Sense]] = {}
    languages_of: dict[Sense, set[Language]] = {}
    for language, word, sense, gloss in get_rows(args.tsv, silent=True):
        node = (sense, gloss)
        senses_of.setdefault((language, word), set()).add(node)
        languages_of.setdefault(node, set()).add(language)

    # Only include word senses that are translated in enough languages.
    frequent = {
        node
        for node, languages in languages_of.items()
        if len(languages) >= args.sense_cutoff
    }

    # For each edge, collect the languages in which one word covers
    # both senses.  Edge endpoints are kept in sorted order so each
    # undirected edge has a single canonical key.
    edge_languages: dict[Edge, set[Language]] = {}
    for (language, _), attached in senses_of.items():
        nodes = sorted(node for node in attached if node in frequent)
        for right in range(1, len(nodes)):
            for left in range(right):
                edge = (nodes[left], nodes[right])
                edge_languages.setdefault(edge, set()).add(language)

    # Drop edges below the weight cutoff and emit the rest.
    write_graph(
        Counter({
            edge: weight
            for edge, languages in edge_languages.items()
            if (weight := len(languages)) >= args.edge_cutoff
        }),
    )
|
||
|
||
# Run only when executed as a script (e.g. `python -m colexification_graphs.graph`).
if __name__ == "__main__":
    main(parse_args())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
# Copyright 2023 Levi Gruspe | ||
# Licensed under GNU GPLv3 or later | ||
# See https://www.gnu.org/licenses/gpl-3.0.en.html | ||
|
||
# pylint: disable=too-few-public-methods | ||
"""Turn TSV file of graph edges into a cytoscape JSON file.""" | ||
|
||
from argparse import ArgumentParser, Namespace | ||
from csv import reader | ||
from json import dumps | ||
from pathlib import Path | ||
|
||
import networkx as nx # type: ignore | ||
|
||
|
||
def parse_args() -> Namespace:
    """Parse command-line arguments.

    Returns a `Namespace` with a single `tsv` attribute (Path).
    """
    parser = ArgumentParser(description=__doc__)
    tsv_help = (
        "path to TSV file "
        "(columns: word 1, sense 1, word 2, sense 2, weight)"
    )
    parser.add_argument("tsv", type=Path, help=tsv_help)
    return parser.parse_args()
|
||
|
||
class NodeDirectory:
    """Maps word senses to small integer IDs (assigned 1, 2, 3, ...)."""
    def __init__(self) -> None:
        # Last ID handed out; IDs start at 1.
        self.counter = 0
        # (word, sense) -> ID
        self.ids: dict[tuple[str, str], int] = {}

    def get(self, sense: tuple[str, str]) -> int:
        """Return the ID for `sense`, assigning the next ID if unseen.

        The counter is only advanced for new senses, so IDs are
        consecutive.  (Previously the counter was incremented on every
        lookup, leaving gaps in the ID sequence whenever a sense was
        looked up more than once.)
        """
        if sense not in self.ids:
            self.counter += 1
            self.ids[sense] = self.counter
        return self.ids[sense]
|
||
|
||
def load_graph(tsv: Path) -> nx.Graph:
    """Create a graph from the weighted edges in a TSV file.

    Expected columns: word 1, sense 1, word 2, sense 2, weight.
    Nodes are numbered via a `NodeDirectory` and carry `word` and
    `sense` attributes; edges carry an integer `weight`.
    """
    result = nx.Graph()
    directory = NodeDirectory()
    with open(tsv, encoding="utf-8") as file:
        for record in reader(file, delimiter="\t"):
            word_a, sense_a, word_b, sense_b, weight = record
            node_a = directory.get((word_a, sense_a))
            node_b = directory.get((word_b, sense_b))

            # add_node is idempotent, so re-adding a known node just
            # refreshes its attributes.
            result.add_node(node_a, word=word_a, sense=sense_a)
            result.add_node(node_b, word=word_b, sense=sense_b)
            result.add_edge(node_a, node_b, weight=int(weight))
    return result
|
||
|
||
def to_cytoscape(graph: nx.Graph) -> str:
    """Serialize a networkx graph as a Cytoscape.js-compatible JSON string."""
    node_elements = []
    for node, attributes in graph.nodes(data=True):
        # Node attributes (word, sense) sit alongside the node ID.
        node_elements.append({"data": {**attributes, "id": node}})

    edge_elements = []
    for source, target, attributes in graph.edges(data=True):
        edge_elements.append(
            {"data": {**attributes, "source": source, "target": target}},
        )

    elements = {"nodes": node_elements, "edges": edge_elements}
    return dumps({"elements": elements})
|
||
|
||
def main(args: Namespace) -> None:
    """Script entrypoint: load the TSV graph and print it as JSON."""
    print(to_cytoscape(load_graph(args.tsv)))
|
||
|
||
# Run only when executed as a script (e.g. `python -m colexification_graphs.post`).
if __name__ == "__main__":
    main(parse_args())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Copyright 2023 Levi Gruspe | ||
# Licensed under GNU GPLv3 or later | ||
# See https://www.gnu.org/licenses/gpl-3.0.en.html | ||
"""kaikki.org dictionary word schema.""" | ||
|
||
import typing as t | ||
|
||
|
||
class TranslationSchema(t.TypedDict):
    """Schema for values inside .senses[*].translations."""
    # Language name; always present, unlike `code` and `word`.
    lang: str
    code: t.NotRequired[str]  # Some translations have missing codes :(
    word: t.NotRequired[str]  # Some translations have missing words :(
    # NOTE(review): presumably a romanization of `word` — confirm against kaikki docs.
    roman: t.NotRequired[str]
    sense: t.NotRequired[str]  # If `None`, treat as equal to `word`.
|
||
|
||
class SenseSchema(t.TypedDict):
    """Schema for values inside .senses.

    Only the fields this project reads are declared here.
    """
    # There are other fields, but we only need the translations.
    translations: t.NotRequired[list[TranslationSchema]]
|
||
|
||
class Schema(t.TypedDict):
    """Schema for each line in a kaikki.org dictionary (JSON Lines)."""
    word: str
    # Part of speech.
    pos: str
    lang: str
    lang_code: str
    senses: list[SenseSchema]
    # Word-level translations; sense-level ones live in .senses[*].translations.
    translations: t.NotRequired[list[TranslationSchema]]
|
||
|
||
__all__ = ["Schema", "SenseSchema", "TranslationSchema"] |
Oops, something went wrong.