Princeton-CDH · tanhaow · Oct 20, 2025 · Oct 16, 2025 · Oct 16, 2025 · Oct 16, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # CHANGELOG
 
+## 0.3.0
+
+### Sentence corpus creation
+
+- Preliminary support for ALTO XML input as a zipfile of multiple pages
+
 ## [0.2.0] - 2025-10-15
 
 ### Application

diff --git a/src/remarx/sentence/corpus/__init__.py b/src/remarx/sentence/corpus/__init__.py
@@ -2,8 +2,17 @@
 Functionality for loading and chunking input files for sentence corpus creation.
 """
 
+from remarx.sentence.corpus.alto_input import ALTOInput
 from remarx.sentence.corpus.base_input import FileInput
 from remarx.sentence.corpus.tei_input import TEI_TAG, TEIDocument, TEIinput, TEIPage
 from remarx.sentence.corpus.text_input import TextInput
 
-__all__ = ["TEI_TAG", "FileInput", "TEIDocument", "TEIPage", "TEIinput", "TextInput"]
+__all__ = [
+    "TEI_TAG",
+    "ALTOInput",
+    "FileInput",
+    "TEIDocument",
+    "TEIPage",
+    "TEIinput",
+    "TextInput",
+]
diff --git a/src/remarx/sentence/corpus/alto_input.py b/src/remarx/sentence/corpus/alto_input.py
@@ -0,0 +1,114 @@
+"""
+Functionality related to parsing ALTO XML content packaged within a zipfile,
+with the goal of creating a sentence corpus with associated metadata from ALTO.
+"""
+
+import logging
+import xml.etree.ElementTree as ET
+from collections.abc import Generator
+from dataclasses import dataclass, field
+from typing import ClassVar
+from zipfile import ZipFile
+
+from remarx.sentence.corpus.base_input import FileInput, SectionType
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ALTOInput(FileInput):
+    """
+    Preliminary FileInput implementation for ALTO XML delivered as a zipfile.
+    Iterates through ALTO XML members and stubs out chunk yielding for future parsing.
+    """
+
+    ALTO_NAMESPACE: ClassVar[str] = "http://www.loc.gov/standards/alto/ns-v4#"
+
+    field_names: ClassVar[tuple[str, ...]] = (*FileInput.field_names, "section_type")
+    "List of field names for sentences originating from ALTO XML content."
+
+    file_type: ClassVar[str] = ".zip"
+    "Supported file extension for ALTO zipfiles."
+
+    _validated: bool = field(init=False, default=False)
+    "Flag indicating whether the input archive has already been validated."
+
+    _alto_members: list[str] = field(init=False, default_factory=list)
+    """Sorted list of ALTO XML filenames discovered during validation."""
+
+    def get_text(self) -> Generator[dict[str, str], None, None]:
+        """
+        Validate the zipfile, then yield the chunks for each ALTO XML member in
+        sorted filename order. Raises ValueError if the archive fails validation.
+        """
+        self.validate_archive()
+        with ZipFile(self.input_file) as archive:
+            for member_name in self._alto_members:
+                logger.info("Processing ALTO XML file: %s", member_name)
+                yield from self._yield_text_for_member(archive, member_name)
+
+    def validate_archive(self) -> None:
+        """
+        Validate the zipfile contents: every file member must be an XML file, parse
+        cleanly, and have an ALTO root element (un-namespaced or in the ALTO v4
+        namespace). Directory entries are ignored. Caches the confirmed filenames
+        so later `get_text` calls can skip rescanning large zipfiles.
+        Raises ValueError describing the first problem found.
+        """
+        if self._validated:
+            return
+
+        with ZipFile(self.input_file) as archive:
+            # Zip tools often store folders as their own entries (names ending in
+            # "/"); they hold no content, so only file members are validated.
+            member_filenames = [
+                zip_info.filename
+                for zip_info in archive.infolist()
+                if not zip_info.is_dir()
+            ]
+            for member_name in member_filenames:
+                if not member_name.lower().endswith(".xml"):
+                    raise ValueError(
+                        f"Non-XML file found in ALTO zipfile: {member_name}"
+                    )
+            if not member_filenames:
+                raise ValueError("ALTO zipfile does not contain any XML files")
+
+            for member_name in member_filenames:
+                with archive.open(member_name) as member_file:
+                    try:
+                        root = ET.parse(member_file).getroot()
+                    except ET.ParseError as exc:
+                        raise ValueError(
+                            f"Invalid XML in ALTO zipfile member: {member_name}"
+                        ) from exc
+
+                namespace, local_tag = self._split_tag(root.tag)
+                if local_tag.lower() != "alto":
+                    raise ValueError(
+                        f"File {member_name} is not an ALTO document (root tag {root.tag})"
+                    )
+                if namespace and namespace != self.ALTO_NAMESPACE:
+                    raise ValueError(
+                        f"Unsupported ALTO namespace in {member_name}: {namespace}"
+                    )
+
+        self._alto_members = sorted(member_filenames)
+        self._validated = True
+
+    def _yield_text_for_member(
+        self, archive: ZipFile, member_name: str
+    ) -> Generator[dict[str, str], None, None]:
+        """Hook for future ALTO parsing; yields one empty placeholder chunk for now."""
+        yield {
+            "text": "",
+            "section_type": SectionType.TEXT.value,
+        }
+
+    @staticmethod
+    def _split_tag(tag: str) -> tuple[str | None, str]:
+        """Split a potentially namespaced XML tag into (namespace, local_tag)."""
+        if tag.startswith("{"):
+            namespace, _, local = tag[1:].partition("}")
+            return namespace, local
+        return None, tag
diff --git a/tests/test_sentence/test_corpus/fixtures/alto_sample.zip b/tests/test_sentence/test_corpus/fixtures/alto_sample.zip
diff --git a/tests/test_sentence/test_corpus/test_alto_input.py b/tests/test_sentence/test_corpus/test_alto_input.py
@@ -0,0 +1,109 @@
+import logging
+import shutil
+from pathlib import Path
+from zipfile import ZipFile
+
+import pytest
+
+from remarx.sentence.corpus.alto_input import ALTOInput
+from remarx.sentence.corpus.base_input import FileInput, SectionType
+
+
+@pytest.fixture
+def alto_sample_zip(tmp_path: Path) -> Path:
+    """Copy the ALTO sample zipfile into tmp_path and return the copy's path."""
+    fixture_zip = Path(__file__).parent / "fixtures" / "alto_sample.zip"
+    zip_copy = tmp_path / fixture_zip.name
+    shutil.copy(fixture_zip, zip_copy)
+    return zip_copy
+
+
+def test_field_names():
+    assert (*FileInput.field_names, "section_type") == ALTOInput.field_names
+
+
+def test_get_text_iterates_xml(alto_sample_zip, caplog):
+    """get_text yields one placeholder chunk per ALTO page and logs each filename."""
+    log_name = "remarx.sentence.corpus.alto_input"
+    log_prefix = "Processing ALTO XML file: "
+    # the sample archive is expected to hold these five single-page files
+    expected_files = {
+        "1896-97a.pdf_page_1.xml",
+        "1896-97a.pdf_page_2.xml",
+        "1896-97a.pdf_page_3.xml",
+        "1896-97a.pdf_page_4.xml",
+        "1896-97a.pdf_page_5.xml",
+    }
+    placeholder = {"text": "", "section_type": SectionType.TEXT.value}
+
+    alto_input = ALTOInput(input_file=alto_sample_zip)
+    with caplog.at_level(logging.INFO, logger=log_name):
+        chunks = list(alto_input.get_text())
+
+    assert len(chunks) == len(expected_files)
+    assert all(chunk == placeholder for chunk in chunks)
+
+    processed_files = set()
+    for record in caplog.records:
+        message = record.getMessage()
+        if record.name == log_name and message.startswith(log_prefix):
+            processed_files.add(message.removeprefix(log_prefix).strip())
+    assert processed_files == expected_files
+
+
+def test_validate_archive_success(alto_sample_zip):
+    """A valid ALTO zipfile validates repeatedly and caches its sorted members."""
+    page_files = [
+        "1896-97a.pdf_page_1.xml",
+        "1896-97a.pdf_page_2.xml",
+        "1896-97a.pdf_page_3.xml",
+        "1896-97a.pdf_page_4.xml",
+        "1896-97a.pdf_page_5.xml",
+    ]
+    alto_input = ALTOInput(input_file=alto_sample_zip)
+    # neither the initial validation nor the repeat call may raise;
+    # the repeat call should reuse the cached validation flag
+    for _ in range(2):
+        alto_input.validate_archive()
+    assert alto_input._alto_members == sorted(page_files)
+
+
+def test_validate_archive_rejects_non_xml(tmp_path: Path):
+    """Validation fails when the zipfile holds a member without an .xml extension."""
+    zip_path = tmp_path / "invalid.zip"
+    with ZipFile(zip_path, "w") as zip_file:
+        zip_file.writestr("page1.txt", "not xml file")
+    alto_input = ALTOInput(input_file=zip_path)
+    with pytest.raises(ValueError, match="Non-XML file"):
+        alto_input.validate_archive()
+
+
+def test_validate_archive_rejects_non_alto_xml(tmp_path: Path):
+    """Validation fails when a member's root element is not an ALTO element."""
+    zip_path = tmp_path / "not_alto.zip"
+    with ZipFile(zip_path, "w") as zip_file:
+        zip_file.writestr("page1.xml", "<root></root>")
+    alto_input = ALTOInput(input_file=zip_path)
+    with pytest.raises(ValueError, match="not an ALTO document"):
+        alto_input.validate_archive()
+
+
+def test_validate_archive_rejects_unknown_namespace(tmp_path: Path):
+    """Validation fails when the ALTO root element uses an unexpected namespace."""
+    zip_path = tmp_path / "unknown_ns.zip"
+    bad_ns_xml = '<alto xmlns="http://unknown_namespace.com/alto/ns#"><Description></Description></alto>'
+    with ZipFile(zip_path, "w") as zip_file:
+        zip_file.writestr("page1.xml", bad_ns_xml)
+    alto_input = ALTOInput(input_file=zip_path)
+    with pytest.raises(ValueError, match="Unsupported ALTO namespace"):
+        alto_input.validate_archive()
+
+
+def test_validate_archive_rejects_empty_zip(tmp_path: Path):
+    """Validation fails when the zipfile has no members at all."""
+    zip_path = tmp_path / "empty.zip"
+    # opening in write mode and closing immediately leaves an archive with no members
+    ZipFile(zip_path, "w").close()
+    alto_input = ALTOInput(input_file=zip_path)
+    with pytest.raises(ValueError, match="does not contain any XML files"):
+        alto_input.validate_archive()
diff --git a/tests/test_sentence/test_corpus/test_base_input.py b/tests/test_sentence/test_corpus/test_base_input.py
@@ -11,7 +11,7 @@ def test_subclasses():
     subclass_names = [cls.__name__ for cls in FileInput.subclasses()]
     # NOTE: that we use names here rather than importing, to
     # confirm subclasses are found without a direct import
-    for input_cls_name in ["TextInput", "TEIinput"]:
+    for input_cls_name in ["TextInput", "TEIinput", "ALTOInput"]:
         assert input_cls_name in subclass_names
 
 
@@ -42,7 +42,7 @@ def test_field_names(tmp_path: pathlib.Path):
 def test_supported_types():
     # check for expected supported types
     # NOTE: checking directly to avoid importing input classes
-    assert set(FileInput.supported_types()) == {".txt", ".xml"}
+    assert set(FileInput.supported_types()) == {".txt", ".xml", ".zip"}
 
 
 def test_get_text(tmp_path: pathlib.Path):
@@ -113,10 +113,19 @@ def test_create_tei(mock_tei_doc, tmp_path: pathlib.Path):
     mock_tei_doc.init_from_file.assert_called_with(xml_input_file)
 
 
+def test_create_alto(tmp_path: pathlib.Path):
+    """FileInput.create returns an ALTOInput for a .zip input path."""
+    from remarx.sentence.corpus.alto_input import ALTOInput
+
+    zip_path = tmp_path / "input.zip"
+    zip_path.touch()  # an empty placeholder file is all this test creates
+    assert isinstance(FileInput.create(input_file=zip_path), ALTOInput)
+
+
 def test_create_unsupported(tmp_path: pathlib.Path):
     test_file = tmp_path / "input.test"
     with pytest.raises(
         ValueError,
-        match="\\.test is not a supported input type \\(must be one of \\.txt, \\.xml\\)",
+        match="\\.test is not a supported input type \\(must be one of \\.txt, \\.xml, \\.zip\\)",
     ):
         FileInput.create(input_file=test_file)