Skip to content

Commit

Permalink
Check for hash collisions
Browse files Browse the repository at this point in the history
  • Loading branch information
lggruspe committed Jul 13, 2023
1 parent f60487c commit 6bb1f01
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 0 deletions.
54 changes: 54 additions & 0 deletions colexification_graphs/collisions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright 2023 Levi Gruspe
# See https://www.gnu.org/licenses/gpl-3.0.en.html
"""Check for collisions in concept hashes."""

from argparse import ArgumentParser, Namespace
from csv import reader
from pathlib import Path
import sys

from base58 import b58encode

from colexification_graphs.fnv import fnv_1a_64 as fnv_hash


def parse_args() -> Namespace:
    """Read the input TSV path from the command line.

    The returned namespace has a single attribute, ``tsv``, which argparse
    converts to a ``Path``.
    """
    cli = ArgumentParser(description=__doc__)

    # One required positional argument: the file to check.
    tsv_help = "path to TSV file (columns: language, word, sense, gloss)"
    cli.add_argument("tsv", type=Path, help=tsv_help)

    namespace = cli.parse_args()
    return namespace


def main(args: Namespace) -> None:
    """Report distinct concepts whose hashed IDs collide.

    Reads the tab-separated file at ``args.tsv`` and treats the third and
    fourth column of every row as one concept.  Each distinct concept gets
    an ID: the two fields joined by a tab, encoded to bytes, passed through
    ``fnv_hash`` and then ``b58encode``.  Every set of concepts sharing an
    ID is printed to standard output, followed by the number of such sets
    when there is at least one.  Nothing is printed when no IDs collide.

    Raises ``ValueError`` if a row does not have exactly four columns.
    """
    # Get concepts.  newline="" leaves line-ending handling to the csv
    # module, as its documentation asks for when passing a file object.
    with open(args.tsv, encoding="utf-8", newline="") as file:
        # NOTE(review): the CLI help for ``tsv`` lists the columns as
        # language, word, sense, gloss, so these two fields may really be
        # (sense, gloss); the original local names are kept -- confirm.
        concepts = {
            (word, sense)
            for _, _, word, sense in reader(file, delimiter="\t")
        }

    # Compute IDs, grouping together the concepts that map to the same one.
    ids: dict[str, set[tuple[str, str]]] = {}
    for concept in concepts:
        key = f"{concept[0]}\t{concept[1]}".encode()
        id_ = b58encode(fnv_hash(key)).decode()
        ids.setdefault(id_, set()).add(concept)

    # Look for collisions.  Only the grouped concepts are reported, so the
    # IDs themselves are not needed here.
    colliding = [group for group in ids.values() if len(group) > 1]
    for group in colliding:
        print(group, file=sys.stdout)

    if colliding:
        print("Found", len(colliding), "collisions", file=sys.stdout)


# Run the collision check only when executed as a script, not on import.
if __name__ == "__main__":
    cli_args = parse_args()
    main(cli_args)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
astroid==2.15.5
base58==2.1.1
dill==0.3.6
flake8==6.0.0
isort==5.12.0
Expand Down

0 comments on commit 6bb1f01

Please sign in to comment.