Skip to content

Commit

Permalink
feat: Add initial DataclassWriter implementation (#3)
Browse files Browse the repository at this point in the history
* chore: update gitignore

* wip: add assertions for writable file

* refactor: extract get_header()

* feat: add fieldnames()

* wip: add assert_file_is_appendable()

* wip: initial DataclassWriter

* wip: include/exclude fields

* refactor: clean up attributes and docs

* refactor: long-form mode

* doc: update docstring
  • Loading branch information
msto authored Apr 13, 2024
1 parent 1494824 commit 9d6a06f
Show file tree
Hide file tree
Showing 9 changed files with 573 additions and 68 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.vscode/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
97 changes: 97 additions & 0 deletions dataclass_io/_lib/assertions.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from dataclasses import is_dataclass
from os import R_OK
from os import W_OK
from os import access
from os import stat
from pathlib import Path

from dataclass_io._lib.dataclass_extensions import DataclassInstance
from dataclass_io._lib.dataclass_extensions import fieldnames
from dataclass_io._lib.file import get_header


def assert_file_is_readable(path: Path) -> None:
Expand All @@ -26,6 +30,78 @@ def assert_file_is_readable(path: Path) -> None:
raise PermissionError(f"The input file is not readable: {path}")


def assert_file_is_writable(path: Path, overwrite: bool = True) -> None:
    """
    Verify that an output file can be written at the given path.

    When the path does not exist yet, its parent directory must exist and be writable.
    When the path already exists, it must be a regular, writable file, and `overwrite`
    must be enabled.

    Args:
        path: The path of the output file.
        overwrite: Whether an existing file at `path` may be overwritten.

    Raises:
        FileExistsError: If the provided file path exists when `overwrite` is set to `False`.
        FileNotFoundError: If the provided file path's parent directory does not exist.
        IsADirectoryError: If the provided file path exists but is not a regular file.
        PermissionError: If the provided file path (or, for a new file, its parent directory)
            is not writable.
    """

    if not path.exists():
        # A new file will be created, so only the containing directory matters.
        if not path.parent.exists():
            raise FileNotFoundError(
                f"The specified directory for the output file path does not exist: {path.parent}"
            )

        if not access(path.parent, W_OK):
            raise PermissionError(
                f"The specified directory for the output file path is not writable: {path.parent}"
            )

        return

    # From here on the path is known to exist.
    if not overwrite:
        raise FileExistsError(
            f"The output file already exists: {path}\n"
            "Specify `overwrite=True` to overwrite the existing file."
        )

    if not path.is_file():
        raise IsADirectoryError(f"The output file path is a directory: {path}")

    if not access(path, W_OK):
        raise PermissionError(f"The output file is not writable: {path}")


def assert_file_is_appendable(
    path: Path,
    dataclass_type: type[DataclassInstance],
    delimiter: str = "\t",
    header_comment_char: str = "#",
) -> None:
    """
    Check that the output file exists and that records of the given dataclass can be appended.

    The file must already exist, be a regular non-empty file, and be both writable and
    readable. It must be readable because its existing header is parsed and compared against
    the field names of `dataclass_type` before anything is appended.

    Args:
        path: The path of the existing output file.
        dataclass_type: The dataclass whose field names the file's header must match.
        delimiter: The field delimiter used when splitting the header row.
        header_comment_char: The prefix marking comment lines that precede the header row.

    Raises:
        FileNotFoundError: If the provided file path does not exist.
        IsADirectoryError: If the provided file path is not a regular file.
        PermissionError: If the provided file is not writable or not readable.
        ValueError: If the file is empty, contains no header row, or its header field names
            differ from the field names of `dataclass_type`.
    """

    if not path.exists():
        raise FileNotFoundError(f"The specified output file does not exist: {path}")

    if not path.is_file():
        raise IsADirectoryError(f"The specified output file path is a directory: {path}")

    if not access(path, W_OK):
        raise PermissionError(f"The specified output file is not writable: {path}")

    if stat(path).st_size == 0:
        raise ValueError(f"The specified output file is empty: {path}")

    if not access(path, R_OK):
        raise PermissionError(
            f"The specified output file is not readable: {path}\n"
            "The output file must be readable to append to it. "
            "The header of the existing output file is checked for consistency with the provided "
            "dataclass before appending to it."
        )

    # Parse the header with the caller's format settings, and release the handle before
    # raising so no file is left open on the error paths.
    with path.open("r") as f:
        header = get_header(f, delimiter=delimiter, header_comment_char=header_comment_char)

    if header is None:
        raise ValueError(f"Could not find a header in the specified output file: {path}")

    if header.fieldnames != fieldnames(dataclass_type):
        raise ValueError(
            "The specified output file does not have the same field names as the provided "
            f"dataclass {path}"
        )


def assert_dataclass_is_valid(dataclass_type: type[DataclassInstance]) -> None:
"""
Check that the input type is a parseable dataclass.
Expand All @@ -36,3 +112,24 @@ def assert_dataclass_is_valid(dataclass_type: type[DataclassInstance]) -> None:

if not is_dataclass(dataclass_type):
raise TypeError(f"The provided type must be a dataclass: {dataclass_type.__name__}")


def assert_fieldnames_are_dataclass_attributes(
    specified_fieldnames: list[str],
    dataclass_type: type[DataclassInstance],
) -> None:
    """
    Check that all of the specified fields are attributes on the given dataclass.

    Args:
        specified_fieldnames: The field names to validate.
        dataclass_type: The dataclass the field names are validated against.

    Raises:
        ValueError: if any of the specified fieldnames are not an attribute on the given dataclass.
            The message lists the offending names in the order they were specified.
    """

    # Resolve the dataclass's field names once, rather than once per candidate name.
    valid_fieldnames = set(fieldnames(dataclass_type))

    invalid_fieldnames = [f for f in specified_fieldnames if f not in valid_fieldnames]

    if invalid_fieldnames:
        raise ValueError(
            "One or more of the specified fields are not attributes on the dataclass "
            + f"{dataclass_type.__name__}: "
            + ", ".join(invalid_fieldnames)
        )
14 changes: 13 additions & 1 deletion dataclass_io/_lib/dataclass_extensions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@

from dataclasses import fields
from dataclasses import is_dataclass
from typing import Any
from typing import ClassVar
from typing import Protocol
Expand All @@ -16,3 +17,14 @@ class DataclassInstance(Protocol):
"""

__dataclass_fields__: ClassVar[dict[str, Any]]


def fieldnames(dataclass_type: type[DataclassInstance]) -> list[str]:
    """
    List the names of the fields declared on a dataclass, in declaration order.

    Args:
        dataclass_type: The dataclass to inspect.

    Returns:
        The name of each field reported by `dataclasses.fields()`.

    Raises:
        TypeError: If the provided type is not a dataclass.
    """

    if is_dataclass(dataclass_type):
        return [field.name for field in fields(dataclass_type)]

    raise TypeError(f"The provided type must be a dataclass: {dataclass_type.__name__}")
67 changes: 67 additions & 0 deletions dataclass_io/_lib/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from dataclasses import dataclass
from io import TextIOWrapper
from typing import IO
from typing import Optional
from typing import TextIO
from typing import TypeAlias

ReadableFileHandle: TypeAlias = TextIOWrapper | IO | TextIO


@dataclass(frozen=True, kw_only=True)
class FileHeader:
"""
Header of a file.
A file's header contains an optional preface, consisting of lines prefixed by a comment
character and/or empty lines, and a required row of fieldnames before the data rows begin.
Attributes:
preface: A list of any lines preceding the fieldnames.
fieldnames: The field names specified in the final line of the header.
"""

preface: list[str]
fieldnames: list[str]


def get_header(
    reader: ReadableFileHandle,
    delimiter: str = "\t",
    header_comment_char: str = "#",
) -> Optional[FileHeader]:
    """
    Consume and parse the header at the current position of an open file.

    Lines are read until the first one that is neither blank nor prefixed by the comment
    character. That line is stripped and split on the delimiter to give the field names.
    Every line read before it is stripped and collected into the `preface`.

    NB: `None` is returned, rather than an error being raised, because the name of the source
    file is not in scope here, which makes a helpful error message difficult. Raising an
    error for a file without a header is the caller's responsibility.

    See original proof-of-concept here: https://github.com/fulcrumgenomics/fgpyo/pull/103

    Args:
        reader: An open, readable file handle.
        delimiter: The string separating the field names within the header row.
        header_comment_char: The prefix which marks a line as a comment line.

    Returns:
        A `FileHeader` containing the field names and any preceding lines.
        None if the file was empty or contained only comments or empty lines.
    """

    skipped_lines: list[str] = []

    for raw_line in reader:
        # The comment test is applied to the raw line, so an indented comment character
        # does not mark a comment line.
        is_comment = raw_line.startswith(header_comment_char)
        text = raw_line.strip()

        if not is_comment and text:
            return FileHeader(preface=skipped_lines, fieldnames=text.split(delimiter))

        skipped_lines.append(text)

    # The reader was exhausted without finding a row of field names.
    return None
77 changes: 10 additions & 67 deletions dataclass_io/reader.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,16 @@
from csv import DictReader
from dataclasses import dataclass
from dataclasses import fields
from io import TextIOWrapper
from pathlib import Path
from types import TracebackType
from typing import IO
from typing import Any
from typing import Optional
from typing import TextIO
from typing import Type
from typing import TypeAlias

from dataclass_io._lib.assertions import assert_dataclass_is_valid
from dataclass_io._lib.assertions import assert_file_is_readable
from dataclass_io._lib.dataclass_extensions import DataclassInstance

ReadableFileHandle: TypeAlias = TextIOWrapper | IO | TextIO


@dataclass(frozen=True, kw_only=True)
class FileHeader:
"""
Header of a file.
A file's header contains an optional preface, consisting of lines prefixed by a comment
character and/or empty lines, and a required row of fieldnames before the data rows begin.
Attributes:
preface: A list of any lines preceding the fieldnames.
fieldnames: The field names specified in the final line of the header.
"""

preface: list[str]
fieldnames: list[str]
from dataclass_io._lib.dataclass_extensions import fieldnames
from dataclass_io._lib.file import FileHeader
from dataclass_io._lib.file import get_header


class DataclassReader:
Expand Down Expand Up @@ -65,11 +43,16 @@ def __init__(

self._fin = path.open("r")

self._header = self._get_header(self._fin)
self._header: FileHeader = get_header(
self._fin,
delimiter=delimiter,
header_comment_char=header_comment_char,
)

if self._header is None:
raise ValueError(f"Could not find a header in the provided file: {path}")

if self._header.fieldnames != [f.name for f in fields(dataclass_type)]:
if self._header.fieldnames != fieldnames(dataclass_type):
raise ValueError(
"The provided file does not have the same field names as the provided dataclass:\n"
f"\tDataclass: {dataclass_type.__name__}\n"
Expand Down Expand Up @@ -116,43 +99,3 @@ def _row_to_dataclass(self, row: dict[str, str]) -> DataclassInstance:
coerced_values[field.name] = field.type(value)

return self.dataclass_type(**coerced_values)

def _get_header(
    self,
    reader: ReadableFileHandle,
) -> Optional[FileHeader]:
    """
    Consume and parse the header at the current position of an open file.

    Lines are read until the first one that is neither blank nor prefixed by
    `self.header_comment_char`. That line is stripped and split on `self.delimiter` to give
    the field names. Every line read before it is stripped and collected into the `preface`.

    NB: `None` is returned, rather than an error being raised, because the name of the source
    file is not in scope here, which makes a helpful error message difficult. Raising an
    error for a file without a header is the caller's responsibility.

    See original proof-of-concept here: https://github.com/fulcrumgenomics/fgpyo/pull/103

    Args:
        reader: An open, readable file handle.

    Returns:
        A `FileHeader` containing the field names and any preceding lines.
        None if the file was empty or contained only comments or empty lines.
    """

    skipped_lines: list[str] = []

    for raw_line in reader:
        # The comment test is applied to the raw line, so an indented comment character
        # does not mark a comment line.
        is_comment = raw_line.startswith(self.header_comment_char)
        text = raw_line.strip()

        if is_comment or not text:
            skipped_lines.append(text)
            continue

        return FileHeader(preface=skipped_lines, fieldnames=text.split(self.delimiter))

    # The reader was exhausted without finding a row of field names.
    return None
Loading

0 comments on commit 9d6a06f

Please sign in to comment.