feat: Add initial DataclassWriter implementation (#3)

* chore: update gitignore
* wip: add assertions for writable file
* refactor: extract get_header()
* feat: add fieldnames()
* wip: add assert_file_is_appendable()
* wip: initial DataclassWriter
* wip: include/exclude fields
* refactor: clean up attributes and docs
* refactor: long-form mode
* doc: update docstring
msto · Apr 13, 2024 · 9d6a06f · 9d6a06f
1 parent 1494824
commit 9d6a06f
Show file tree

Hide file tree

Showing 9 changed files with 573 additions and 68 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
+.vscode/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/dataclass_io/_lib/assertions.py b/dataclass_io/_lib/assertions.py
@@ -1,9 +1,13 @@
 from dataclasses import is_dataclass
 from os import R_OK
+from os import W_OK
 from os import access
+from os import stat
 from pathlib import Path
 
 from dataclass_io._lib.dataclass_extensions import DataclassInstance
+from dataclass_io._lib.dataclass_extensions import fieldnames
+from dataclass_io._lib.file import get_header
 
 
 def assert_file_is_readable(path: Path) -> None:
@@ -26,6 +30,78 @@ def assert_file_is_readable(path: Path) -> None:
         raise PermissionError(f"The input file is not readable: {path}")
 
 
+def assert_file_is_writable(path: Path, overwrite: bool = True) -> None:
+    """
+    Check that the output file path is writable.
+
+    Optionally, ensure the output file does not exist.
+
+    Raises:
+        FileExistsError: If the provided file path exists when `overwrite` is set to `False`.
+        FileNotFoundError: If the provided file path's parent directory does not exist.
+        IsADirectoryError: If the provided file path is a directory.
+        PermissionError: If the provided file path is not writable.
+    """
+
+    if path.exists():
+        if not overwrite:
+            raise FileExistsError(
+                f"The output file already exists: {path}\n"
+                "Specify `overwrite=True` to overwrite the existing file."
+            )
+
+        if not path.is_file():
+            raise IsADirectoryError(f"The output file path is a directory: {path}")
+
+        if not access(path, W_OK):
+            raise PermissionError(f"The output file is not writable: {path}")
+
+    else:
+        if not path.parent.exists():
+            raise FileNotFoundError(
+                f"The specified directory for the output file path does not exist: {path.parent}"
+            )
+
+        if not access(path.parent, W_OK):
+            raise PermissionError(
+                f"The specified directory for the output file path is not writable: {path.parent}"
+            )
+
+
+def assert_file_is_appendable(path: Path, dataclass_type: type[DataclassInstance]) -> None:
+    if not path.exists():
+        raise FileNotFoundError(f"The specified output file does not exist: {path}")
+
+    if not path.is_file():
+        raise IsADirectoryError(f"The specified output file path is a directory: {path}")
+
+    if not access(path, W_OK):
+        raise PermissionError(f"The specified output file is not writable: {path}")
+
+    if stat(path).st_size == 0:
+        raise ValueError(f"The specified output file is empty: {path}")
+
+    if not access(path, R_OK):
+        raise PermissionError(
+            f"The specified output file is not readable: {path}\n"
+            "The output file must be readable to append to it. "
+            "The header of the existing output file is checked for consistency with the provided "
+            "dataclass before appending to it."
+        )
+
+    # TODO: pass delimiter and header_comment_char to get_header
+    with path.open("r") as f:
+        header = get_header(f)
+        if header is None:
+            raise ValueError(f"Could not find a header in the specified output file: {path}")
+
+        if header.fieldnames != fieldnames(dataclass_type):
+            raise ValueError(
+                "The specified output file does not have the same field names as the provided "
+                f"dataclass {path}"
+            )
+
+
 def assert_dataclass_is_valid(dataclass_type: type[DataclassInstance]) -> None:
     """
     Check that the input type is a parseable dataclass.
@@ -36,3 +112,24 @@ def assert_dataclass_is_valid(dataclass_type: type[DataclassInstance]) -> None:
 
     if not is_dataclass(dataclass_type):
         raise TypeError(f"The provided type must be a dataclass: {dataclass_type.__name__}")
+
+
+def assert_fieldnames_are_dataclass_attributes(
+    specified_fieldnames: list[str],
+    dataclass_type: type[DataclassInstance],
+) -> None:
+    """
+    Check that all of the specified fields are attributes on the given dataclass.
+
+    Raises:
+        ValueError: If any of the specified fieldnames is not an attribute on the given dataclass.
+    """
+
+    invalid_fieldnames = [f for f in specified_fieldnames if f not in fieldnames(dataclass_type)]
+
+    if len(invalid_fieldnames) > 0:
+        raise ValueError(
+            "One or more of the specified fields are not attributes on the dataclass "
+            + f"{dataclass_type.__name__}: "
+            + ", ".join(invalid_fieldnames)
+        )
diff --git a/dataclass_io/_lib/dataclass_extensions.py b/dataclass_io/_lib/dataclass_extensions.py
@@ -1,4 +1,5 @@
-
+from dataclasses import fields
+from dataclasses import is_dataclass
 from typing import Any
 from typing import ClassVar
 from typing import Protocol
@@ -16,3 +17,14 @@ class DataclassInstance(Protocol):
     """
 
     __dataclass_fields__: ClassVar[dict[str, Any]]
+
+
+def fieldnames(dataclass_type: type[DataclassInstance]) -> list[str]:
+    """
+    Return the fieldnames of the specified dataclass.
+    """
+
+    if not is_dataclass(dataclass_type):
+        raise TypeError(f"The provided type must be a dataclass: {dataclass_type.__name__}")
+
+    return [f.name for f in fields(dataclass_type)]
diff --git a/dataclass_io/_lib/file.py b/dataclass_io/_lib/file.py
@@ -0,0 +1,67 @@
+from dataclasses import dataclass
+from io import TextIOWrapper
+from typing import IO
+from typing import Optional
+from typing import TextIO
+from typing import TypeAlias
+
+ReadableFileHandle: TypeAlias = TextIOWrapper | IO | TextIO
+
+
+@dataclass(frozen=True, kw_only=True)
+class FileHeader:
+    """
+    Header of a file.
+
+    A file's header contains an optional preface, consisting of lines prefixed by a comment
+    character and/or empty lines, and a required row of fieldnames before the data rows begin.
+
+    Attributes:
+        preface: A list of any lines preceding the fieldnames.
+        fieldnames: The field names specified in the final line of the header.
+    """
+
+    preface: list[str]
+    fieldnames: list[str]
+
+
+def get_header(
+    reader: ReadableFileHandle,
+    delimiter: str = "\t",
+    header_comment_char: str = "#",
+) -> Optional[FileHeader]:
+    """
+    Read the header from an open file.
+
+    The first row after any commented or empty lines will be used as the fieldnames.
+
+    Lines preceding the fieldnames will be returned in the `preface`.
+
+    NB: This function returns `Optional` instead of raising an error because the name of the
+    source file is not in scope, making it difficult to provide a helpful error message. It is
+    the responsibility of the caller to raise an error if the file is empty.
+
+    See original proof-of-concept here: https://github.com/fulcrumgenomics/fgpyo/pull/103
+
+    Args:
+        reader: An open, readable file handle.
+        header_comment_char: The character which indicates the start of a comment line.
+
+    Returns:
+        A `FileHeader` containing the field names and any preceding lines.
+        None if the file was empty or contained only comments or empty lines.
+    """
+
+    preface: list[str] = []
+
+    for line in reader:
+        if line.startswith(header_comment_char) or line.strip() == "":
+            preface.append(line.strip())
+        else:
+            break
+    else:
+        return None
+
+    fieldnames = line.strip().split(delimiter)
+
+    return FileHeader(preface=preface, fieldnames=fieldnames)
diff --git a/dataclass_io/reader.py b/dataclass_io/reader.py
@@ -1,38 +1,16 @@
 from csv import DictReader
-from dataclasses import dataclass
 from dataclasses import fields
-from io import TextIOWrapper
 from pathlib import Path
 from types import TracebackType
-from typing import IO
 from typing import Any
-from typing import Optional
-from typing import TextIO
 from typing import Type
-from typing import TypeAlias
 
 from dataclass_io._lib.assertions import assert_dataclass_is_valid
 from dataclass_io._lib.assertions import assert_file_is_readable
 from dataclass_io._lib.dataclass_extensions import DataclassInstance
-
-ReadableFileHandle: TypeAlias = TextIOWrapper | IO | TextIO
-
-
-@dataclass(frozen=True, kw_only=True)
-class FileHeader:
-    """
-    Header of a file.
-
-    A file's header contains an optional preface, consisting of lines prefixed by a comment
-    character and/or empty lines, and a required row of fieldnames before the data rows begin.
-
-    Attributes:
-        preface: A list of any lines preceding the fieldnames.
-        fieldnames: The field names specified in the final line of the header.
-    """
-
-    preface: list[str]
-    fieldnames: list[str]
+from dataclass_io._lib.dataclass_extensions import fieldnames
+from dataclass_io._lib.file import FileHeader
+from dataclass_io._lib.file import get_header
 
 
 class DataclassReader:
@@ -65,11 +43,16 @@ def __init__(
 
         self._fin = path.open("r")
 
-        self._header = self._get_header(self._fin)
+        self._header: FileHeader = get_header(
+            self._fin,
+            delimiter=delimiter,
+            header_comment_char=header_comment_char,
+        )
+
         if self._header is None:
             raise ValueError(f"Could not find a header in the provided file: {path}")
 
-        if self._header.fieldnames != [f.name for f in fields(dataclass_type)]:
+        if self._header.fieldnames != fieldnames(dataclass_type):
             raise ValueError(
                 "The provided file does not have the same field names as the provided dataclass:\n"
                 f"\tDataclass: {dataclass_type.__name__}\n"
@@ -116,43 +99,3 @@ def _row_to_dataclass(self, row: dict[str, str]) -> DataclassInstance:
             coerced_values[field.name] = field.type(value)
 
         return self.dataclass_type(**coerced_values)
-
-    def _get_header(
-        self,
-        reader: ReadableFileHandle,
-    ) -> Optional[FileHeader]:
-        """
-        Read the header from an open file.
-
-        The first row after any commented or empty lines will be used as the fieldnames.
-
-        Lines preceding the fieldnames will be returned in the `preface.`
-
-        NB: This function returns `Optional` instead of raising an error because the name of the
-        source file is not in scope, making it difficult to provide a helpful error message. It is
-        the responsibility of the caller to raise an error if the file is empty.
-
-        See original proof-of-concept here: https://github.com/fulcrumgenomics/fgpyo/pull/103
-
-        Args:
-            reader: An open, readable file handle.
-            comment_char: The character which indicates the start of a comment line.
-
-        Returns:
-            A `FileHeader` containing the field names and any preceding lines.
-            None if the file was empty or contained only comments or empty lines.
-        """
-
-        preface: list[str] = []
-
-        for line in reader:
-            if line.startswith(self.header_comment_char) or line.strip() == "":
-                preface.append(line.strip())
-            else:
-                break
-        else:
-            return None
-
-        fieldnames = line.strip().split(self.delimiter)
-
-        return FileHeader(preface=preface, fieldnames=fieldnames)