Skip to content

Commit

Permalink
Add ability to get file hashes from inventory
Browse files Browse the repository at this point in the history
  • Loading branch information
marcpage authored and pagerk committed Aug 30, 2024
1 parent 4bed5e6 commit 0e7600a
Show file tree
Hide file tree
Showing 2 changed files with 194 additions and 27 deletions.
162 changes: 136 additions & 26 deletions genweb/inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,139 @@
""" Keep track of artifact files """


from os import walk
from os import walk, stat, makedirs
from os.path import join, relpath, isfile, basename, commonpath
from types import SimpleNamespace
from hashlib import new as Hasher
from json import load, dump


class Artifacts:
"""keep track of files used and unused"""

def __init__(self, directory: str):
HASH_FILE_CHUNK_SIZE_BYTES = 1 * 1024 * 1024 # 1 MiB

def __init__(self, directory: str, cache_dir: str = None):
self.cache_dir = cache_dir if cache_dir else join(directory, "metadata")
self.directory = directory
self.accounted = set()
self.unaccounted = set()
self.inventory = {}
self.refresh()
self._load_cache()

@staticmethod
def _new_entry(**kwargs) -> SimpleNamespace:
return SimpleNamespace(
size=kwargs.get("size", None),
modified=kwargs.get("modified", None),
hash=kwargs.get("hash", None),
accounted=kwargs.get("accounted", False),
)

def _cache_path(self) -> str:
return join(self.cache_dir, "artifact hash cache.json")

def _load_cache(self) -> None:
cache_path = self._cache_path()

if not isfile(cache_path):
return

with open(cache_path, "r", encoding="utf-8") as cache_file:
cached = load(cache_file)

for path, info in cached.items():
if path in self.inventory:
self.inventory[path].modified = info.get("modified", None)
self.inventory[path].size = info.get("size", None)
self.inventory[path].hash = info.get("hash", None)

def _save_cache(self) -> None:
cache_data = {
p: {"modified": i.modified, "size": i.size, "hash": i.hash}
for p, i in self.inventory.items()
if i.modified and i.hash and i.size is not None
}
makedirs(self.cache_dir, exist_ok=True)

with open(self._cache_path(), "w", encoding="utf-8") as cache_file:
dump(cache_data, cache_file)

@staticmethod
def _hash_file(path: str) -> str:
hasher = Hasher("sha256")

with open(path, "rb") as contents:
while True:
block = contents.read(Artifacts.HASH_FILE_CHUNK_SIZE_BYTES)

if not block:
break

hasher.update(block)

return hasher.hexdigest()

@staticmethod
def _hash_valid(path: str, entry: SimpleNamespace) -> bool:
if entry.size is None or not entry.modified or not entry.hash:
return False

file_info = stat(path)

if file_info.st_size != entry.size:
return False

def refresh(self):
if file_info.st_mtime != entry.modified:
return False

return True

@staticmethod
def _update_stat(entry: SimpleNamespace, path: str):
file_info = stat(path)
entry.size = file_info.st_size
entry.modified = file_info.st_mtime

def hash(self, path: str) -> str:
"""Gets the hash of the given file.
Hashes are cached along with mdoficiation timestamp and size.
If the size and modification timestamp from the cache match
the file, the cached hash is returned.
If the file has been modified since the hash was generated,
a new hash is generated and cached.
Args:
path (str): The relative path of the file
Returns:
str: The sha256 hash hex digest of the contents of the file
"""
full_path = join(self.directory, path)
assert isfile(full_path), full_path
self.inventory[path] = self.inventory.get(
path,
Artifacts._new_entry(),
)
entry = self.inventory[path]

if not Artifacts._hash_valid(full_path, entry):
entry.hash = Artifacts._hash_file(full_path)
Artifacts._update_stat(entry, full_path)
self._save_cache()

return entry.hash

def refresh(self) -> None:
"""Looks for new files in the artifacts directory"""
all_files = {
relpath(join(r, f), self.directory)
for r, _, fs in walk(self.directory)
for f in fs
}
self.unaccounted = all_files - self.accounted

for file in all_files:
if file not in self.inventory:
self.inventory[file] = Artifacts._new_entry()

def paths(self, filename: str) -> list[str]:
"""Returns all the paths for a given filename
Expand All @@ -36,7 +148,7 @@ def paths(self, filename: str) -> list[str]:
list[str]: The list of relative paths from the artifacts directory for
all files found with that filename
"""
return [f for f in self.accounted | self.unaccounted if basename(f) == filename]
return [f for f in self.inventory if basename(f) == filename]

def suffixed(self, suffix: str) -> list[str]:
"""Finds all relative file paths that end with the given suffix
Expand All @@ -47,7 +159,7 @@ def suffixed(self, suffix: str) -> list[str]:
Returns:
list[str]: The list of relative paths that end with the given suffix
"""
return [f for f in self.accounted | self.unaccounted if f.endswith(suffix)]
return [f for f in self.inventory if f.endswith(suffix)]

def has_file(self, file_path: str) -> bool:
"""See if this file exists
Expand All @@ -58,7 +170,7 @@ def has_file(self, file_path: str) -> bool:
Returns:
bool: Does it exist
"""
return file_path in self.unaccounted or file_path in self.accounted
return file_path in self.inventory

def has_dir(self, dir_path: str) -> bool:
"""Does a directory with a file in it exist
Expand All @@ -69,10 +181,7 @@ def has_dir(self, dir_path: str) -> bool:
Returns:
bool: We found the directory (must have a file in it)
"""
return any(
commonpath([dir_path, f]) == dir_path
for f in self.accounted | self.unaccounted
)
return any(commonpath([dir_path, f]) == dir_path for f in self.inventory)

def files_under(self, dir_path: str) -> list[str]:
"""Get the list of files in a directory (recursively)
Expand All @@ -83,32 +192,33 @@ def files_under(self, dir_path: str) -> list[str]:
Returns:
list[str]: The files under that directory
"""
return [
f
for f in self.accounted | self.unaccounted
if commonpath([dir_path, f]) == dir_path
]
return [f for f in self.inventory if commonpath([dir_path, f]) == dir_path]

def add(self, *relative_file_path: str):
def add(self, *relative_file_path: str) -> None:
"""Adds a file to the inventory
Args:
relative_file_path (str): Files to add
"""
not_found = []

for path in relative_file_path:
if path in self.accounted:
if path not in self.inventory and isfile(join(self.directory, path)):
self.inventory[path] = Artifacts._new_entry(accounted=True)
continue

if path not in self.inventory:
not_found.append(path)
continue

assert path in self.unaccounted or isfile(join(self.directory, path)), path
self.accounted.add(path)
self.inventory[path].accounted = True

if path in self.unaccounted:
self.unaccounted.remove(path)
assert len(not_found) == 0, not_found

def lost(self) -> set[str]:
def lost(self) -> list[str]:
"""Gets a list of all the files in the artifacts directory that have not been referenced
Returns:
set[str]: All unreferenced files
"""
return set(self.unaccounted)
return [p for p, i in self.inventory.items() if not i.accounted]
59 changes: 58 additions & 1 deletion tests/test_inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
""" Test Artifacts """


from os.path import dirname, relpath, basename
from os import makedirs, utime
from os.path import dirname, relpath, basename, join
from tempfile import TemporaryDirectory
from time import time

from genweb.inventory import Artifacts

Expand Down Expand Up @@ -61,6 +64,60 @@ def test_basic() -> None:
assert "data/test.xml" in artifacts.files_under("data")


def create_file(path: str, contents: str):
    """Write *contents* to *path*, creating parent directories as needed."""
    makedirs(dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as destination:
        destination.write(contents)


def test_add() -> None:
    """Verify add() marks files accounted and rejects unknown paths."""
    with TemporaryDirectory() as working_dir:
        create_file(join(working_dir, "file1.txt"), "file1")
        create_file(join(working_dir, "file2.txt"), "file2")
        create_file(join(working_dir, "dir/file3.txt"), "file3")
        artifacts = Artifacts(working_dir)
        create_file(join(working_dir, "dir/file4.txt"), "file4")
        artifacts.add("dir/file4.txt")
        artifacts.add("file1.txt")

        # BUGFIX: the sentinel failure must be raised OUTSIDE the try block;
        # previously `raise AssertionError(...)` inside the try was swallowed
        # by its own `except AssertionError`, so the check could never fail.
        raised = False

        try:
            artifacts.add("dir/file5.txt")
        except AssertionError:
            raised = True

        assert raised, "dir/file5.txt should have failed"

        lost = {"file2.txt", "dir/file3.txt"}
        assert set(artifacts.lost()) == lost, artifacts.lost()


def test_hash() -> None:
    """Hashes are computed, cached, and invalidated when contents change."""
    hash_empty = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
    hash_2 = "d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35"
    hash_2_space = "5749fdd6b67e4204b3047ba33540bc87f60c84d784a46c6307c78299f8fa67e9"

    with TemporaryDirectory() as working_dir:
        create_file(join(working_dir, "file1.txt"), "")
        create_file(join(working_dir, "dir/file2.txt"), "2")
        artifacts = Artifacts(working_dir)
        assert artifacts.hash("file1.txt") == hash_empty, artifacts.hash("file1.txt")
        assert artifacts.hash("dir/file2.txt") == hash_2

        # rewrite file2; a fresh Artifacts must pick up the new hash
        create_file(join(working_dir, "dir/file2.txt"), "2 ")
        artifacts = Artifacts(working_dir)
        assert artifacts.hash("file1.txt") == hash_empty, artifacts.hash("file1.txt")
        assert artifacts.hash("dir/file2.txt") == hash_2_space

        # backdating the mtime invalidates the cache but yields the same hash
        minute_ago = time() - 60
        utime(join(working_dir, "file1.txt"), (minute_ago, minute_ago))
        assert artifacts.hash("file1.txt") == hash_empty, artifacts.hash("file1.txt")


if __name__ == "__main__":
    # run every test case when executed directly (outside pytest)
    for case in (test_basic, test_suffixed, test_add, test_hash):
        case()

0 comments on commit 0e7600a

Please sign in to comment.