Princeton-CDH · rlskoeser · Oct 21, 2025 · Oct 17, 2025 · Oct 17, 2025 · Oct 20, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,7 +4,9 @@
 
 ### Sentence corpus creation
 
-- Preliminary support for ALTO XML input as a zipfile of multiple pages
+- Support for ALTO XML input as a zipfile with multiple pages
+    - Skips non-ALTO files, logs warnings for invalid or empty XML
+    - Yields sentence corpora indexed across pages; ordering based on natural sort of filenames
 
 ## [0.2.0] - 2025-10-15
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -35,6 +35,7 @@ dependencies = [
   "voyager>=2.1.0",
   "fastapi",
   "uvicorn",
+  "natsort>=8.4.0",
 ]
 
 [project.optional-dependencies]

diff --git a/src/remarx/sentence/corpus/alto_input.py b/src/remarx/sentence/corpus/alto_input.py
@@ -4,111 +4,210 @@
 """
 
 import logging
-import xml.etree.ElementTree as ET
+import pathlib
 from collections.abc import Generator
-from dataclasses import dataclass, field
+from dataclasses import dataclass
+from functools import cached_property
+from timeit import default_timer as time
 from typing import ClassVar
 from zipfile import ZipFile
 
+from lxml import etree
+from natsort import natsorted
+from neuxml import xmlmap
+
 from remarx.sentence.corpus.base_input import FileInput, SectionType
 
 logger = logging.getLogger(__name__)
 
 
-@dataclass
-class ALTOInput(FileInput):
+ALTO_NAMESPACE_V4: str = "http://www.loc.gov/standards/alto/ns-v4#"
+
+
+class AltoXmlObject(xmlmap.XmlObject):
     """
-    Preliminary FileInput implementation for ALTO XML delivered as a zipfile.
-    Iterates through ALTO XML members and stubs out chunk yielding for future parsing.
+    Base :class:`neuxml.xmlmap.XmlObject` class for ALTO-XML content.
     """
 
-    ALTO_NAMESPACE: ClassVar[str] = "http://www.loc.gov/standards/alto/ns-v4#"
+    # alto namespace v4; we may eventually need to support other versions
+    ROOT_NAMESPACES: ClassVar[dict[str, str]] = {"alto": ALTO_NAMESPACE_V4}
 
-    field_names: ClassVar[tuple[str, ...]] = (*FileInput.field_names, "section_type")
-    "List of field names for sentences originating from ALTO XML content."
 
-    file_type: ClassVar[str] = ".zip"
-    "Supported file extension for ALTO zipfiles."
+class AltoBlock(AltoXmlObject):
+    """
+    Base class for an ALTO element with position information.
+    """
 
-    _validated: bool = field(init=False, default=False)
-    "Flag indicating whether the input archive has already been validated."
+    vertical_position = xmlmap.FloatField("@VPOS")
+    horizontal_position = xmlmap.FloatField("@HPOS")
 
-    _alto_members: list[str] = field(init=False, default_factory=list)
-    """Sorted list of ALTO XML filenames discovered during validation."""
 
-    def get_text(self) -> Generator[dict[str, str], None, None]:
+class TextLine(AltoBlock):
+    """
+    Single line of text (`TextLine`) in an ALTO document
+    """
+
+    text_content = xmlmap.StringField("alto:String/@CONTENT")
+
+    def __str__(self) -> str:
         """
-        Iterate over ALTO XML files contained in the zipfile to get all the text content.
+        Override default string method to return text content of this line.
         """
-        self.validate_archive()
+        return self.text_content
 
-        with ZipFile(self.input_file) as archive:
-            for member_name in self._alto_members:
-                logger.info("Processing ALTO XML file: %s", member_name)
 
-                yield from self._yield_text_for_member(archive, member_name)
+class TextBlock(AltoBlock):
+    """
+    Block of text with one or more lines.
+    """
+
+    lines = xmlmap.NodeListField("alto:TextLine", TextLine)
 
-    def validate_archive(self) -> None:
+    @cached_property
+    def sorted_lines(self) -> list[TextLine]:
         """
-        Validate the zipfile contents: every member must be an XML file, parse
-        cleanly, and declare an ALTO v4 root element. Caches the confirmed filenames
-        so later `get_text` calls can skip rescanning large zipfiles.
+        Returns a list of TextLines for this block, sorted by vertical position.
         """
-        if self._validated:
-            return
+        # there's no guarantee that xml document order follows page order,
+        # so sort by @VPOS (may need further refinement for more complicated layouts)
+        return sorted(self.lines, key=lambda line: line.vertical_position)
 
-        with ZipFile(self.input_file) as archive:
-            # ALTO XML filenames discovered in the zipfile
-            member_filenames: list[str] = []
-            for zip_info in archive.infolist():
-                if not zip_info.filename.lower().endswith(".xml"):
-                    raise ValueError(
-                        f"Non-XML file found in ALTO zipfile: {zip_info.filename}"
-                    )
-                member_filenames.append(zip_info.filename)
+    @property
+    def text_content(self) -> str:
+        """
+        Text contents of this block; newline-delimited content of
+        each line within this block, sorted by vertical position.
+        """
+        return "\n".join([line.text_content for line in self.sorted_lines])
 
-            if not member_filenames:
-                raise ValueError("ALTO zipfile does not contain any XML files")
 
-            for member_name in member_filenames:
-                with archive.open(member_name) as member_file:
-                    try:
-                        root = ET.parse(member_file).getroot()
-                    except ET.ParseError as exc:
-                        raise ValueError(
-                            f"Invalid XML in ALTO zipfile member: {member_name}"
-                        ) from exc
-
-                namespace, local_tag = self._split_tag(root.tag)
-                if local_tag.lower() != "alto":
-                    raise ValueError(
-                        f"File {member_name} is not an ALTO document (root tag {root.tag})"
-                    )
-                if namespace and namespace != self.ALTO_NAMESPACE:
-                    raise ValueError(
-                        f"Unsupported ALTO namespace in {member_name}: {namespace}"
-                    )
+class AltoDocument(AltoXmlObject):
+    """
+    :class:`neuxml.xmlmap.XmlObject` instance for a single ALTO XML file
+    """
 
-        self._alto_members = sorted(member_filenames)
-        self._validated = True
+    blocks = xmlmap.NodeListField(".//alto:TextBlock", TextBlock)
+    lines = xmlmap.NodeListField(".//alto:TextLine", TextLine)
 
-    def _yield_text_for_member(
-        self, archive: ZipFile, member_name: str
-    ) -> Generator[dict[str, str], None, None]:
+    def is_alto(self) -> bool:
+        """
+        Check if this is an ALTO-XML document, based on the root element
+        """
+        # parse with QName to access namespace and tag name without namespace
+        root_element = etree.QName(self.node.tag)
+        # both must match
+        return (
+            root_element.namespace == ALTO_NAMESPACE_V4
+            and root_element.localname == "alto"
+        )
+
+    @property
+    def sorted_blocks(self) -> list[TextBlock]:
         """
-        Hook for future ALTO parsing.
+        Returns a list of TextBlocks for this page, sorted by vertical position.
+
+        Blocks without a usable `@VPOS` are positioned by their first line
+        (in vertical order); blocks with no usable position at all sort last.
         """
-        yield {
-            "text": "",
-            "section_type": SectionType.TEXT.value,
-        }
+        # there's no guarantee that xml document order follows page order,
+        # so sort by @VPOS (may need further refinement for more complicated layouts).
+        # NOTE: in some cases, a textblock may not have a VPOS attribute;
+        # in that case, use the position for the first line
+        # (text block id = eSc_dummyblock_, but appears to have real content)
+        # if block has no line, sort text block last
+        if not self.blocks:
+            return []
+
+        def block_position(block: TextBlock) -> float:
+            # compare against None explicitly: a block at the very top of the
+            # page has VPOS 0.0, which is falsy but is still a real position
+            if block.vertical_position is not None:
+                return block.vertical_position
+            if block.lines:
+                first_line_position = block.sorted_lines[0].vertical_position
+                if first_line_position is not None:
+                    return first_line_position
+            # no position available on the block or its first line: sort last
+            return float("inf")
+
+        return sorted(self.blocks, key=block_position)
+
+    def text_chunks(self) -> Generator[dict[str, str]]:
+        """
+        Returns a generator of a dictionary of text content and section type,
+        one dictionary per text block on the page.
+        """
+        # yield by block, since in future we may set section type
+        # based on block-level semantic tagging
+        for block in self.sorted_blocks:
+            yield {"text": block.text_content, "section_type": SectionType.TEXT.value}
+
+
+@dataclass
+class ALTOInput(FileInput):
+    """
+    FileInput implementation for ALTO XML delivered as a zipfile.
+    Iterates through ALTO XML members in natural sort order and yields one text chunk per text block.
+    """
 
-    @staticmethod
-    def _split_tag(tag: str) -> tuple[str | None, str]:
+    field_names: ClassVar[tuple[str, ...]] = (*FileInput.field_names, "section_type")
+    "List of field names for sentences originating from ALTO XML content."
+
+    file_type: ClassVar[str] = ".zip"
+    "Supported file extension for ALTO zipfiles (.zip)"
+
+    def get_text(self) -> Generator[dict[str, str], None, None]:
         """
-        Split a potentially namespaced XML tag into (namespace, local_tag).
+        Iterate over ALTO XML files contained in the zipfile and return
+        a generator of text content.
         """
-        if tag.startswith("{"):
-            namespace, _, local = tag[1:].partition("}")
-            return namespace, local
-        return None, tag
+        num_files = 0
+        num_valid_files = 0
+
+        start = time()
+        with ZipFile(self.input_file) as archive:
+            # iterate over all files in the zipfile;
+            # use natural sorting to process in logical order
+            for zip_filepath in natsorted(archive.namelist()):
+                num_files += 1
+                base_filename = pathlib.Path(zip_filepath).name
+                # ignore & log non-xml files
+                if not base_filename.lower().endswith(".xml"):
+                    logger.info(
+                        f"Ignoring non-xml file included in ALTO zipfile: {zip_filepath}"
+                    )
+                    continue
+                # if the file is .xml, attempt to open as an ALTO XML
+                with archive.open(zip_filepath) as xmlfile:
+                    logger.info(f"Processing XML file {zip_filepath}")
+                    # zipfile archive open returns a file-like object
+                    try:
+                        alto_xmlobj = xmlmap.load_xmlobject_from_file(
+                            xmlfile, AltoDocument
+                        )
+                    except etree.XMLSyntaxError as err:
+                        logger.warning(f"Skipping {zip_filepath} : invalid XML")
+                        logger.debug(f"XML syntax error: {err}", exc_info=err)
+                        continue
+
+                if not alto_xmlobj.is_alto():
+                    # TODO: add unit test for this case
+                    logger.warning(
+                        f"Skipping non-ALTO XML file {zip_filepath} (root element {alto_xmlobj.node.tag})"
+                    )
+                    continue
+
+                num_valid_files += 1
+                # report total # blocks, lines for each file as processed
+                logger.debug(
+                    f"{base_filename}: {len(alto_xmlobj.blocks)} blocks, {len(alto_xmlobj.lines)} lines"
+                )
+
+                # use the base xml file as filename here, rather than zipfile for all
+                for chunk in alto_xmlobj.text_chunks():
+                    yield chunk | {"file": base_filename}
+
+                # warn if a document has no lines
+                if len(alto_xmlobj.lines) == 0:
+                    logger.warning(
+                        f"No text lines found in ALTO XML file: {base_filename}"
+                    )
+
+        elapsed_time = time() - start
+        logger.info(
+            f"Processed {self.file_name} with {num_files} files ({num_valid_files} valid ALTO) in {elapsed_time:.1f} seconds"
+        )
+
+        # error if no valid files were found
+        if num_valid_files == 0:
+            raise ValueError(f"No valid ALTO XML files found in {self.file_name}")
diff --git a/src/remarx/sentence/corpus/base_input.py b/src/remarx/sentence/corpus/base_input.py
@@ -82,12 +82,18 @@ def get_sentences(self) -> Generator[dict[str, Any]]:
 
                 # character index is not included in output,
                 # but may be useful for sub-chunk metadata (e.g., line number)
-                yield chunk_info | {
-                    "text": sentence,
-                    "file": self.file_name,
-                    "sent_index": sentence_index,
-                    "sent_id": f"{self.file_name}:{sentence_index}",
-                }
+
+                # specify input file name first;
+                # chunk-specific filename takes precedence (e.g. alto file within zip)
+                yield (
+                    {"file": self.file_name}
+                    | chunk_info
+                    | {
+                        "text": sentence,
+                        "sent_index": sentence_index,
+                        "sent_id": f"{self.file_name}:{sentence_index}",
+                    }
+                )
 
                 # increment sentence index
                 sentence_index += 1

diff --git a/src/remarx/sentence/corpus/tei_input.py b/src/remarx/sentence/corpus/tei_input.py
@@ -39,7 +39,7 @@ class BaseTEIXmlObject(xmlmap.XmlObject):
 
 class TEIPage(BaseTEIXmlObject):
     """
-    Custom :class:`eulxml.xmlmap.XmlObject` instance for a page
+    Custom :class:`neuxml.xmlmap.XmlObject` instance for a page
     of content within a TEI XML document.
     """
 
@@ -155,7 +155,7 @@ def __str__(self) -> str:
 
 class TEIDocument(BaseTEIXmlObject):
     """
-    Custom :class:`eulxml.xmlmap.XmlObject` instance for TEI XML document.
+    Custom :class:`neuxml.xmlmap.XmlObject` instance for TEI XML document.
     Customized for MEGA TEI XML.
     """
 

diff --git a/tests/test_sentence/__init__.py b/tests/test_sentence/__init__.py
diff --git a/tests/test_sentence/test_corpus/__init__.py b/tests/test_sentence/test_corpus/__init__.py