Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

0.6.14 #388

Closed
wants to merge 11 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
0.6.14
  • Loading branch information
joocer committed Jul 15, 2023
commit 3c748b3cc5e2ca2659d5f588ebb189db1f75a342
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ pip install --upgrade git+https://github.com/mabel-dev/mabel
## Dependencies

>- **[orjson](https://github.com/ijl/orjson)** for JSON (de)serialization
>- **[siphashc](https://github.com/WeblateOrg/siphashc)** for non-cryptographic hashing
>- **[pydantic](https://pydantic-docs.helpmanual.io/)** to define internal data models
>- **[zstandard](https://github.com/indygreg/python-zstandard)** for real-time on disk compression
>- **[LZ4](https://github.com/python-lz4/python-lz4)** for real-time in memory compression
Expand Down
8 changes: 2 additions & 6 deletions mabel/data/internals/dictset.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from mabel.data.internals.storage_classes import StorageClassMemory
from mabel.errors import MissingDependencyError
from mabel.utils.ipython import is_running_from_ipython
from siphashc import siphash
from orso.cityhash import CityHash32


class STORAGE_CLASS(int, Enum):
Expand Down Expand Up @@ -522,15 +522,11 @@ def __hash__(self, seed: int = 703115) -> int:
Creates a consistent hash of the _DictSet_ regardless of the order of
the items in the _DictSet_.
"""

def sip(val):
return siphash("TheApolloMission", val)

# The seed is the mission duration of the Apollo 11 mission.
# 703115 = 8 days, 3 hours, 18 minutes, 35 seconds
ordered = map(lambda record: dict(sorted(record.items())), iter(self._iterator))
serialized = map(orjson.dumps, ordered)
hashed = map(sip, serialized)
hashed = map(CityHash32, serialized)
return reduce(lambda x, y: x ^ y, hashed, seed)

def __repr__(self): # pragma: no cover
Expand Down
10 changes: 3 additions & 7 deletions mabel/data/internals/group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from collections import defaultdict

import cython
from siphashc import siphash
from orso.cityhash import CityHash32


def summer(x, y):
Expand All @@ -26,8 +26,6 @@ def summer(x, y):
"AVG": lambda x, y: 1,
}

HASH_SEED = b"Anakin Skywalker"


class TooManyGroups(Exception):
pass
Expand Down Expand Up @@ -73,13 +71,11 @@ def _map(self, collect_columns):

for record in self._dictset:
try:
group_key: cython.uint64_t = siphash(
HASH_SEED,
group_key: cython.uint64_t = CityHash32(
"".join([str(record[column]) for column in self._columns]),
)
except KeyError:
group_key: cython.uint64_t = siphash(
HASH_SEED,
group_key: cython.uint64_t = CityHash32(
"".join([f"{record.get(column, '')}" for column in self._columns]),
)
if group_key not in self._group_keys.keys():
Expand Down
7 changes: 3 additions & 4 deletions mabel/data/internals/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@
from typing import Iterable

import orjson
from siphashc import siphash
from orso.cityhash import CityHash32

MAX_INDEX = 4294967295 # 2^32 - 1
SEED = "eschatologically" # needs to be 16 characters long

"""
There are overlapping terms because we're traversing a dataset so we can traverse a
Expand Down Expand Up @@ -68,7 +67,7 @@ def search(self, search_term) -> Iterable:
search_term = [search_term]
result: list = []
for term in search_term:
key = format(siphash(SEED, f"{term}") % MAX_INDEX, "x")
key = format(CityHash32(f"{term}") % MAX_INDEX, "x")
if key in self._index: # type:ignore
result[0:0] = self._index[key] # type:ignore
return result
Expand Down Expand Up @@ -100,7 +99,7 @@ def add(self, position, record):
if not isinstance(values, list):
values = [values]
for value in values:
entry = (format(siphash(SEED, f"{value}") % MAX_INDEX, "x"), position)
entry = (format(CityHash32(f"{value}") % MAX_INDEX, "x"), position)
ret_val.append(entry)
self.temporary_index += ret_val
return ret_val
Expand Down
5 changes: 2 additions & 3 deletions mabel/data/readers/internals/base_inner_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from mabel.logging import get_logger
from mabel.utils import dates
from mabel.utils import paths
from orso.cityhash import CityHash32

BUFFER_SIZE: int = 64 * 1024 * 1024 # 64Mb

Expand Down Expand Up @@ -124,9 +125,7 @@ def read_blob(self, blob: str) -> IOBase:
return io.BytesIO(result)

# hash the blob name for the lookup
from siphashc import siphash

blob_hash = str(siphash("RevengeOfTheBlob", blob))
blob_hash = str(CityHash32(blob))

# try to fetch the cached file
result = cache_server.get(blob_hash)
Expand Down
12 changes: 6 additions & 6 deletions mabel/data/readers/internals/cursor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
midway through the blob if required.
"""
import orjson
from siphashc import siphash
from orso.cityhash import CityHash32


class InvalidCursor(Exception):
Expand All @@ -29,7 +29,7 @@ def __init__(self, readable_blobs, cursor=None):
self.load_cursor(cursor)

def load_cursor(self, cursor):
from bitarray import bitarray
from orso.bitarray import bitarray

if cursor is None:
return
Expand All @@ -46,7 +46,7 @@ def load_cursor(self, cursor):

self.location = cursor["location"]
find_partition = [
blob for blob in self.readable_blobs if siphash("%" * 16, blob) == cursor["partition"]
blob for blob in self.readable_blobs if CityHash32(blob) == cursor["partition"]
]
if len(find_partition) == 1:
self.partition = find_partition[0]
Expand All @@ -66,7 +66,7 @@ def next_blob(self, previous_blob=None):
if self.partition in self.readable_blobs:
return self.partition
partition_finder = [
blob for blob in self.readable_blobs if siphash("%" * 16, blob) == self.partition
blob for blob in self.readable_blobs if CityHash32(blob) == self.partition
]
if len(partition_finder) != 1:
raise ValueError(f"Unable to determine current partition ({self.partition})")
Expand Down Expand Up @@ -94,15 +94,15 @@ def get(self):
}

def __getitem__(self, item):
from bitarray import bitarray
from orso.bitarray import bitarray

if item == "map":
blob_map = bitarray(
"".join(["1" if blob in self.read_blobs else "0" for blob in self.readable_blobs])
)
return blob_map.tobytes().hex()
if item == "partition":
return siphash("%" * 16, self.partition)
return CityHash32(self.partition)
if item == "location":
return self.location
return None
Expand Down
4 changes: 2 additions & 2 deletions mabel/data/readers/internals/inline_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import orjson
from mabel.utils.dates import parse_iso
from siphashc import siphash
from orso.cityhash import CityHash32


def get_year(input):
Expand Down Expand Up @@ -223,7 +223,7 @@ def get_md5(item):
"BOOLEAN": lambda x: str(x).upper() != "FALSE",
"ISNONE": lambda x: x is None,
# HASHING & ENCODING
"HASH": lambda x: format(siphash("INCOMPREHENSIBLE", str(x)), "X"),
"HASH": lambda x: format(CityHash32(str(x)), "X"),
"MD5": get_md5,
"RANDOM": get_random, # return a random number 0-99
# OTHER
Expand Down
Loading