13 changes: 12 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,16 @@
# CHANGELOG

## 0.2.0
## 0.3.0

### Sentence corpus creation

- Sentence corpora generated from TEI now include a line number field (`line_number`), derived from the `n` attribute of line-beginning (`<lb>`) tags
- Support for ALTO XML input as a zipfile with multiple pages
- Skips non-ALTO files; logs warnings for invalid or empty XML
- Yields a sentence corpus indexed across pages; ordering is based on a natural sort of filenames
- Improved logging output for the `remarx-create-corpus` script, with optional verbose mode

## [0.2.0] - 2025-10-15

### Application

@@ -59,3 +69,4 @@ _Initial release._
- Add GitHub Actions workflow to build and publish python package on PyPI when a new GitHub release created

[0.1.0]: https://github.com/Princeton-CDH/remarx/tree/0.1
[0.2.0]: https://github.com/Princeton-CDH/remarx/tree/0.2
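
To make the new ALTO zipfile support concrete, here is a minimal usage sketch; the zip path is hypothetical, and it assumes `input_file` is the first (and only required) dataclass field of `FileInput`, as suggested by the `self.input_file` references in `alto_input.py` below.

import pathlib

from remarx.sentence.corpus import ALTOInput

# hypothetical path; any zipfile of ALTO XML page files should work
alto_zip = pathlib.Path("pages.zip")

# assumes input_file is the first positional field of the FileInput dataclass
corpus = ALTOInput(alto_zip)
for sentence in corpus.get_sentences():
    # each row is a dict with file, text, sent_index, sent_id, and section_type
    print(sentence["sent_id"], sentence["section_type"])
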
6 changes: 6 additions & 0 deletions pyproject.toml
@@ -35,6 +35,7 @@ dependencies = [
"voyager>=2.1.0",
"fastapi",
"uvicorn",
"natsort>=8.4.0",
]

[project.optional-dependencies]
@@ -81,6 +82,11 @@ omit = [

[tool.coverage.report]
show_missing = true # Helpful for debugging
exclude_lines = [
"# pragma: no cover",
# skip command-line configuration for main method on scripts
"if __name__ == .__main__.:"
]

[tool.coverage.html]
directory = "coverage_html_report"
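
The new `natsort` dependency is what provides the changelog's "natural sort of filenames" ordering. A quick illustrative comparison with plain lexicographic sorting, using made-up page filenames:

from natsort import natsorted

pages = ["page_10.xml", "page_2.xml", "page_1.xml"]
print(sorted(pages))     # ['page_1.xml', 'page_10.xml', 'page_2.xml']
print(natsorted(pages))  # ['page_1.xml', 'page_2.xml', 'page_10.xml']
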
2 changes: 1 addition & 1 deletion src/remarx/__init__.py
@@ -5,6 +5,6 @@

from remarx import app, sentence

__version__ = "0.2"
__version__ = "0.3rc1"

__all__ = ["__version__", "app", "sentence"]
11 changes: 10 additions & 1 deletion src/remarx/sentence/corpus/__init__.py
@@ -2,8 +2,17 @@
Functionality for loading and chunking input files for sentence corpus creation.
"""

from remarx.sentence.corpus.alto_input import ALTOInput
from remarx.sentence.corpus.base_input import FileInput
from remarx.sentence.corpus.tei_input import TEI_TAG, TEIDocument, TEIinput, TEIPage
from remarx.sentence.corpus.text_input import TextInput

__all__ = ["TEI_TAG", "FileInput", "TEIDocument", "TEIPage", "TEIinput", "TextInput"]
__all__ = [
"TEI_TAG",
"ALTOInput",
"FileInput",
"TEIDocument",
"TEIPage",
"TEIinput",
"TextInput",
]
213 changes: 213 additions & 0 deletions src/remarx/sentence/corpus/alto_input.py
@@ -0,0 +1,213 @@
"""
Functionality related to parsing ALTO XML content packaged within a zipfile,
with the goal of creating a sentence corpus with associated metadata from ALTO.
"""

import logging
import pathlib
from collections.abc import Generator
from dataclasses import dataclass
from functools import cached_property
from timeit import default_timer as time
from typing import ClassVar
from zipfile import ZipFile

from lxml import etree
from natsort import natsorted
from neuxml import xmlmap

from remarx.sentence.corpus.base_input import FileInput, SectionType

logger = logging.getLogger(__name__)


ALTO_NAMESPACE_V4: str = "http://www.loc.gov/standards/alto/ns-v4#"


class AltoXmlObject(xmlmap.XmlObject):
"""
Base :class:`neuxml.xmlmap.XmlObject` class for ALTO-XML content.
"""

# alto namespace v4; we may eventually need to support other versions
ROOT_NAMESPACES: ClassVar[dict[str, str]] = {"alto": ALTO_NAMESPACE_V4}


class AltoBlock(AltoXmlObject):
"""
Base class for an ALTO element with position information.
"""

vertical_position = xmlmap.FloatField("@VPOS")
horizontal_position = xmlmap.FloatField("@HPOS")


class TextLine(AltoBlock):
"""
Single line of text (`TextLine`) in an ALTO document
"""

text_content = xmlmap.StringField("alto:String/@CONTENT")

def __str__(self) -> str:
"""
Override default string method to return text content of this line.
"""
return self.text_content


class TextBlock(AltoBlock):
"""
Block of text with one or more lines.
"""

lines = xmlmap.NodeListField("alto:TextLine", TextLine)

@cached_property
def sorted_lines(self) -> list[TextLine]:
"""
Returns a list of TextLines for this block, sorted by vertical position.
"""
# there's no guarantee that xml document order follows page order,
# so sort by @VPOS (may need further refinement for more complicated layouts)
return sorted(self.lines, key=lambda line: line.vertical_position)

@property
def text_content(self) -> str:
"""
Text contents of this block; newline-delimited content of
each line within this block, sorted by vertical position.
"""
return "\n".join([line.text_content for line in self.sorted_lines])


class AltoDocument(AltoXmlObject):
"""
:class:`neuxml.xmlmap.XmlObject` instance for a single ALTO XML file
"""

blocks = xmlmap.NodeListField(".//alto:TextBlock", TextBlock)
lines = xmlmap.NodeListField(".//alto:TextLine", TextLine)

def is_alto(self) -> bool:
"""
Check if this is an ALTO-XML document, based on the root element
"""
# parse with QName to access namespace and tag name without namespace
root_element = etree.QName(self.node.tag)
# both must match
return (
root_element.namespace == ALTO_NAMESPACE_V4
and root_element.localname == "alto"
)

@property
def sorted_blocks(self) -> list[TextBlock]:
"""
Returns a list of TextBlocks for this page, sorted by vertical position.
"""
# there's no guarantee that xml document order follows page order,
# so sort by @VPOS (may need further refinement for more complicated layouts).
# NOTE: in some cases, a textblock may not have a VPOS attribute;
# in that case, use the position for the first line
# (text block id = eSc_dummyblock_, but appears to have real content)
# if the block has no lines, sort it last
if not self.blocks:
return []
return sorted(
self.blocks,
key=lambda block: block.vertical_position
or (
block.sorted_lines[0].vertical_position if block.lines else float("inf")
),
)

def text_chunks(self) -> Generator[dict[str, str]]:
"""
Returns a generator of dictionaries of text content and section type,
one dictionary per text block on the page.
"""
# yield by block, since in future we may set section type
# based on block-level semantic tagging
for block in self.sorted_blocks:
yield {"text": block.text_content, "section_type": SectionType.TEXT.value}


@dataclass
class ALTOInput(FileInput):
"""
Preliminary FileInput implementation for ALTO XML delivered as a zipfile.
Iterates through ALTO XML members and yields text content by block, with section type metadata.
"""

field_names: ClassVar[tuple[str, ...]] = (*FileInput.field_names, "section_type")
"List of field names for sentences originating from ALTO XML content."

file_type: ClassVar[str] = ".zip"
"Supported file extension for ALTO zipfiles (.zip)"

def get_text(self) -> Generator[dict[str, str], None, None]:
"""
Iterate over ALTO XML files contained in the zipfile and return
a generator of text content.
"""
num_files = 0
num_valid_files = 0

start = time()
with ZipFile(self.input_file) as archive:
# iterate over all files in the zipfile;
# use natural sorting to process in logical order
for zip_filepath in natsorted(archive.namelist()):
num_files += 1
base_filename = pathlib.Path(zip_filepath).name
# ignore & log non-xml files
if not base_filename.lower().endswith(".xml"):
logger.info(
f"Ignoring non-xml file included in ALTO zipfile: {zip_filepath}"
)
continue
# if the file is .xml, attempt to open as an ALTO XML
with archive.open(zip_filepath) as xmlfile:
logger.info(f"Processing XML file {zip_filepath}")
# zipfile archive open returns a file-like object
try:
alto_xmlobj = xmlmap.load_xmlobject_from_file(
xmlfile, AltoDocument
)
except etree.XMLSyntaxError as err:
logger.warning(f"Skipping {zip_filepath} : invalid XML")
logger.debug(f"XML syntax error: {err}", exc_info=err)
continue

if not alto_xmlobj.is_alto():
# TODO: add unit test for this case
logger.warning(
f"Skipping non-ALTO XML file {zip_filepath} (root element {alto_xmlobj.node.tag})"
)
continue

num_valid_files += 1
# report total # blocks, lines for each file as processed
logger.debug(
f"{base_filename}: {len(alto_xmlobj.blocks)} blocks, {len(alto_xmlobj.lines)} lines"
)

# use the base xml filename here, rather than the zipfile name for all chunks
for chunk in alto_xmlobj.text_chunks():
yield chunk | {"file": base_filename}

# warn if a document has no lines
if len(alto_xmlobj.lines) == 0:
logger.warning(
f"No text lines found in ALTO XML file: {base_filename}"
)

elapsed_time = time() - start
logger.info(
f"Processed {self.file_name} with {num_files} files ({num_valid_files} valid ALTO) in {elapsed_time:.1f} seconds"
)

# error if no valid files were found
if num_valid_files == 0:
raise ValueError(f"No valid ALTO XML files found in {self.file_name}")
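
To see how `AltoDocument` reorders content by position, here is a small sketch that parses an inline, made-up ALTO fragment; it assumes `neuxml` provides `load_xmlobject_from_string` alongside the `load_xmlobject_from_file` call used above.

from neuxml import xmlmap

from remarx.sentence.corpus.alto_input import AltoDocument

# made-up two-line fragment; the lines appear out of page order in the XML
ALTO_SAMPLE = """<alto xmlns="http://www.loc.gov/standards/alto/ns-v4#">
  <Layout><Page><PrintSpace>
    <TextBlock HPOS="0" VPOS="100">
      <TextLine HPOS="0" VPOS="200"><String CONTENT="second line"/></TextLine>
      <TextLine HPOS="0" VPOS="120"><String CONTENT="first line"/></TextLine>
    </TextBlock>
  </PrintSpace></Page></Layout>
</alto>"""

doc = xmlmap.load_xmlobject_from_string(ALTO_SAMPLE, AltoDocument)
assert doc.is_alto()
for chunk in doc.text_chunks():
    # lines are sorted by VPOS, so the text reads "first line\nsecond line"
    print(chunk["section_type"], repr(chunk["text"]))
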
30 changes: 24 additions & 6 deletions src/remarx/sentence/corpus/base_input.py
@@ -60,6 +60,16 @@ def get_text(self) -> Generator[dict[str, str]]:
"""
raise NotImplementedError

def get_extra_metadata(
self, chunk_info: dict[str, Any], _char_idx: int, sentence: str
) -> dict[str, Any]:
"""
Hook method for subclasses to override to provide extra metadata for a sentence (e.g. line number).

:returns: Dictionary of additional metadata fields to include, or empty dict
"""
return {}

def get_sentences(self) -> Generator[dict[str, Any]]:
"""
Get sentences for this file, with associated metadata.
@@ -82,12 +92,20 @@ def get_sentences(self) -> Generator[dict[str, Any]]:

# character index is not included in output,
# but may be useful for sub-chunk metadata (e.g., line number)
yield chunk_info | {
"text": sentence,
"file": self.file_name,
"sent_index": sentence_index,
"sent_id": f"{self.file_name}:{sentence_index}",
}

# specify input file name first;
# chunk-specific filename takes precedence (e.g. ALTO file within zip)
yield (
{"file": self.file_name}
| chunk_info
| {
"text": sentence,
"sent_index": sentence_index,
"sent_id": f"{self.file_name}:{sentence_index}",
}
# Include any extra metadata (subclass specific)
| self.get_extra_metadata(chunk_info, _char_idx, sentence)
)

# increment sentence index
sentence_index += 1
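
The new `get_extra_metadata` hook is what lets a subclass attach fields such as the changelog's `line_number`. A minimal sketch of an override, using a hypothetical subclass and an assumed `line_offsets` entry in `chunk_info`:

from typing import Any, ClassVar

from remarx.sentence.corpus.base_input import FileInput


class LineAwareInput(FileInput):
    # hypothetical subclass; a real one would also implement get_text()
    field_names: ClassVar[tuple[str, ...]] = (*FileInput.field_names, "line_number")

    def get_extra_metadata(
        self, chunk_info: dict[str, Any], char_idx: int, sentence: str
    ) -> dict[str, Any]:
        # line_offsets is an assumed per-chunk list of character offsets where
        # each line starts; count how many lines begin at or before the
        # sentence's starting character index
        offsets = chunk_info.get("line_offsets")
        if not offsets:
            return {}
        return {"line_number": sum(1 for offset in offsets if offset <= char_idx)}
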
13 changes: 13 additions & 0 deletions src/remarx/sentence/corpus/create.py
@@ -12,9 +12,12 @@

import argparse
import csv
import logging
import pathlib
import sys

from remarx.sentence.corpus.base_input import FileInput
from remarx.utils import configure_logging


def create_corpus(
@@ -56,9 +59,19 @@ def main() -> None:
parser.add_argument(
"output_csv", type=pathlib.Path, help="Path to output sentence corpus (CSV)"
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
help="Verbose output (debug logging)",
default=False,
)

args = parser.parse_args()

log_level = logging.DEBUG if args.verbose else logging.INFO

configure_logging(sys.stdout, log_level=log_level)
create_corpus(
args.input_file,
args.output_csv,
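
For callers building a corpus from Python rather than the `remarx-create-corpus` script, the same logging setup can be applied before `create_corpus`. A minimal sketch, assuming the paths are placeholders and that the two positional arguments visible in the script's call (input file, output CSV) are sufficient:

import logging
import pathlib
import sys

from remarx.sentence.corpus.create import create_corpus
from remarx.utils import configure_logging

# mirror the script's --verbose flag when calling the API directly
configure_logging(sys.stdout, log_level=logging.DEBUG)

create_corpus(pathlib.Path("pages.zip"), pathlib.Path("sentences.csv"))
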