
Commit

Initial commit
lggruspe committed May 23, 2023
0 parents commit e021263
Showing 10 changed files with 1,228 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
*.pyc
/build/
/dist/
/env/
674 changes: 674 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions Makefile
@@ -0,0 +1,31 @@
DATE=$(shell date '+%Y.%m.%d')
PARENT="colexification-graphs-$(DATE)"

# Expects build/kaikki.jsonl to exist.
# Download from https://kaikki.org/dictionary/All%20languages%20combined/kaikki.org-dictionary-all.json
.PHONY: build
build: build/graph.json

build/wordsenses.tsv: build/kaikki.jsonl
python -m colexification_graphs.wordsenses $< > $@ 2> build/wordsenses.err

build/graph.tsv: build/wordsenses.tsv
python -m colexification_graphs.graph $< > $@

build/graph.json: build/graph.tsv
python -m colexification_graphs.post $< > $@

.PHONY: check
check:
flake8 colexification_graphs
pylint colexification_graphs
mypy --strict colexification_graphs

.PHONY: dist
dist: build
mkdir -p dist
cd build; \
mkdir -p "$(PARENT)"; \
cp ../README.md graph.json graph.tsv "$(PARENT)"; \
tar -czvf "$(PARENT).tar.gz" "$(PARENT)"; \
mv "$(PARENT).tar.gz" ../dist
44 changes: 44 additions & 0 deletions README.md
@@ -0,0 +1,44 @@
# colexification-graphs

Colexification graphs extracted from Wiktionary.

## Interpretation

A colexification graph is an undirected, weighted graph.

- Nodes represent word senses (sense-annotated words).
- The weight of the edge between two nodes is the number of languages in which the same word is used for both meanings.

For example, an edge of weight 12 between the senses for "moon" and "month" would mean that twelve languages use the same word for both.

## Format

The graphs are available in TSV and JSON formats.

TSV columns:

1. node 1 word
2. node 1 sense
3. node 2 word
4. node 2 sense
5. weight of edge between node 1 and node 2

The JSON file is compatible with [Cytoscape.js](https://js.cytoscape.org/).
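As a rough illustration (not part of the repository), the TSV edges can be loaded into a `networkx` graph in a few lines; the file name `graph.tsv` matches the Makefile target, and the column order follows the list above:

```python
# Illustrative only; assumes the five-column layout described above.
import csv

import networkx as nx

graph = nx.Graph()
with open("graph.tsv", encoding="utf-8") as file:
    for word1, sense1, word2, sense2, weight in csv.reader(file, delimiter="\t"):
        graph.add_edge((word1, sense1), (word2, sense2), weight=int(weight))

print(graph.number_of_nodes(), "nodes,", graph.number_of_edges(), "edges")
```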

## Licenses

### Scripts

Copyright 2023 Levi Gruspe

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.

### Data

Copyright 2023 Levi Gruspe

The published colexification graphs are made available under the [Creative Commons Attribution-ShareAlike License](https://creativecommons.org/licenses/by-sa/3.0/).
This work is derived from Wiktionary.
The copyright of the original work belongs to Wiktionary's editors and contributors.
3 changes: 3 additions & 0 deletions colexification_graphs/__init__.py
@@ -0,0 +1,3 @@
# Copyright 2023 Levi Gruspe
# Licensed under GNU GPLv3 or later
# See https://www.gnu.org/licenses/gpl-3.0.en.html
129 changes: 129 additions & 0 deletions colexification_graphs/graph.py
@@ -0,0 +1,129 @@
# Copyright 2023 Levi Gruspe
# Licensed under GNU GPLv3 or later
# See https://www.gnu.org/licenses/gpl-3.0.en.html
"""Build colexification graph."""

from argparse import ArgumentParser, Namespace
from collections import Counter
from csv import reader, writer
from pathlib import Path
import sys
import typing as t


Language: t.TypeAlias = str
Word: t.TypeAlias = str
Translation: t.TypeAlias = tuple[Language, Word]
Sense: t.TypeAlias = tuple[str, str]
Edge: t.TypeAlias = tuple[Sense, Sense]


def parse_args() -> Namespace:
"""Parse command-line arguments."""
parser = ArgumentParser(description=__doc__)
parser.add_argument(
"--edge-cutoff",
dest="edge_cutoff",
nargs=1,
type=int,
help="minimum weight required for edges in graph (default: 4)",

# Removes > 2/3 of all edges
default=[4],
)

parser.add_argument(
"--sense-cutoff",
dest="sense_cutoff",
nargs=1,
type=int,
help="minimum languages required for word senses (default: 20)",

# Removes > 80% of all word senses.
default=[20],
)

parser.add_argument(
"tsv",
type=Path,
help="path to TSV file (columns: language, word, sense, gloss)",
)
args = parser.parse_args()
args.edge_cutoff = args.edge_cutoff[0]
args.sense_cutoff = args.sense_cutoff[0]
return args


class InvalidRecord(Exception):
"""Raised when an invalid record is found in a TSV file."""


def get_rows(
tsv: Path,
silent: bool = True,
) -> t.Iterator[tuple[str, str, str, str]]:
"""Read rows from TSV file.
If `silent` is `True`, silently ignores invalid rows.
If not, raises `InvalidRecord`.
"""
with open(tsv, encoding="utf-8") as file:
for row in reader(file, delimiter="\t"):
try:
language, word, sense, gloss = row
except ValueError as exc:
if silent:
continue
raise InvalidRecord from exc

yield language, word, sense, gloss


def write_graph(graph: Counter[Edge]) -> None:
"""Write graph to stdout in TSV format."""
out = writer(sys.stdout, delimiter="\t")
for (source, target), weight in graph.most_common():
sense_s, gloss_s = source
sense_t, gloss_t = target
row = (sense_s, gloss_s, sense_t, gloss_t, weight)
out.writerow(row)


def main(args: Namespace) -> None:
"""Script entrypoint."""
translation_senses: dict[Translation, set[Sense]] = {}
sense_languages: dict[Sense, set[Language]] = {}
for language, word, sense, gloss in get_rows(args.tsv, silent=True):
translation = (language, word)
translation_senses.setdefault(translation, set()).add((sense, gloss))
sense_languages.setdefault((sense, gloss), set()).add(language)

# Only include word senses that are translated in enough languages.
senses = {
sense
for sense, languages in sense_languages.items()
if len(languages) >= args.sense_cutoff
}

# Create colexification graph.
graph_languages: dict[Edge, set[Language]] = {}
for (language, _), word_senses in translation_senses.items():
nodes = sorted(sense for sense in word_senses if sense in senses)
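        # Record this language on the edge between every unordered pair of
        # retained senses that this word colexifies.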
for i in range(1, len(nodes)):
for j in range(i):
graph_languages.setdefault(
(nodes[j], nodes[i]),
set(),
).add(language)

write_graph(
Counter({
edge: weight
for edge, languages in graph_languages.items()
if (weight := len(languages)) >= args.edge_cutoff
}),
)


if __name__ == "__main__":
main(parse_args())
111 changes: 111 additions & 0 deletions colexification_graphs/post.py
@@ -0,0 +1,111 @@
# Copyright 2023 Levi Gruspe
# Licensed under GNU GPLv3 or later
# See https://www.gnu.org/licenses/gpl-3.0.en.html

# pylint: disable=too-few-public-methods
"""Turn TSV file of graph edges into a cytoscape JSON file."""

from argparse import ArgumentParser, Namespace
from csv import reader
from json import dumps
from pathlib import Path

import networkx as nx # type: ignore


def parse_args() -> Namespace:
"""Parse command-line arguments."""
parser = ArgumentParser(description=__doc__)
parser.add_argument(
"tsv",
type=Path,
help=(
"path to TSV file "
"(columns: word 1, sense 1, word 2, sense 2, weight)"
),
)
return parser.parse_args()


class NodeDirectory:
"""Maps word senses to IDs."""
def __init__(self) -> None:
self.counter = 0
self.ids: dict[tuple[str, str], int] = {}

def get(self, sense: tuple[str, str]) -> int:
"""Return sense ID."""
self.counter += 1
return self.ids.setdefault(sense, self.counter)


def load_graph(tsv: Path) -> nx.Graph:
"""Create graph from weighted edges in TSV file.
Minimum edge weight to include in the graph.
"""
graph = nx.Graph()
nodes = NodeDirectory()
with open(tsv, encoding="utf-8") as file:
for row in reader(file, delimiter="\t"):
(
source_word,
source_sense,
target_word,
target_sense,
weight,
) = row
source = nodes.get((source_word, source_sense))
target = nodes.get((target_word, target_sense))

graph.add_node(
source,
word=source_word,
sense=source_sense,
)
graph.add_node(
target,
word=target_word,
sense=target_sense,
)
graph.add_edge(source, target, weight=int(weight))
return graph


def to_cytoscape(graph: nx.Graph) -> str:
"""Convert networkx graph into a cytoscape JSON file."""
nodes = [
{
"data": {
**data,
"id": node,
},
}
for node, data in graph.nodes(data=True)
]
edges = [
{
"data": {
**data,
"source": source,
"target": target,
},
}
for source, target, data in graph.edges(data=True)
]
return dumps({
"elements": {
"nodes": nodes,
"edges": edges,
},
})


def main(args: Namespace) -> None:
"""Script entrypoint."""
graph = load_graph(args.tsv)
print(to_cytoscape(graph))


if __name__ == "__main__":
main(parse_args())
34 changes: 34 additions & 0 deletions colexification_graphs/schema.py
@@ -0,0 +1,34 @@
# Copyright 2023 Levi Gruspe
# Licensed under GNU GPLv3 or later
# See https://www.gnu.org/licenses/gpl-3.0.en.html
"""kaikki.org dictionary word schema."""

import typing as t


class TranslationSchema(t.TypedDict):
"""Schema for values inside .senses[*].translations."""
lang: str
code: t.NotRequired[str] # Some translations have missing codes :(
word: t.NotRequired[str] # Some translations have missing words :(
roman: t.NotRequired[str]
sense: t.NotRequired[str] # If `None`, treat as equal to `word`.


class SenseSchema(t.TypedDict):
"""Schema for values inside .senses."""
# There are other fields, but we only need the translations.
translations: t.NotRequired[list[TranslationSchema]]


class Schema(t.TypedDict):
"""Schema for each line in a kaikki.org dictionary."""
word: str
pos: str
lang: str
lang_code: str
senses: list[SenseSchema]
translations: t.NotRequired[list[TranslationSchema]]


__all__ = ["Schema", "SenseSchema", "TranslationSchema"]
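For reference, here is a made-up entry that satisfies `Schema`, showing only the fields declared above; it is an illustration, not data from the actual dump.

```python
# Hypothetical kaikki.org-style entry, used only to illustrate the schema above.
from colexification_graphs.schema import Schema

example: Schema = {
    "word": "Mond",
    "pos": "noun",
    "lang": "German",
    "lang_code": "de",
    "senses": [
        {
            "translations": [
                {"lang": "Tagalog", "code": "tl", "word": "buwan", "sense": "moon"},
            ],
        },
    ],
}
```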
