Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# CHANGELOG

## 0.3.0

### Sentence corpus creation

- Preliminary support for ALTO XML input as a zipfile of multiple pages

## [0.2.0] - 2025-10-15

### Application
Expand Down
11 changes: 10 additions & 1 deletion src/remarx/sentence/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,17 @@
Functionality for loading and chunking input files for sentence corpus creation.
"""

from remarx.sentence.corpus.alto_input import ALTOInput
from remarx.sentence.corpus.base_input import FileInput
from remarx.sentence.corpus.tei_input import TEI_TAG, TEIDocument, TEIinput, TEIPage
from remarx.sentence.corpus.text_input import TextInput

# Names exported by `from remarx.sentence.corpus import *`; kept in sync
# with the input classes imported above.
__all__ = [
    "TEI_TAG",
    "ALTOInput",
    "FileInput",
    "TEIDocument",
    "TEIPage",
    "TEIinput",
    "TextInput",
]
114 changes: 114 additions & 0 deletions src/remarx/sentence/corpus/alto_input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
"""
Functionality related to parsing ALTO XML content packaged within a zipfile,
with the goal of creating a sentence corpus with associated metadata from ALTO.
"""

import logging
import xml.etree.ElementTree as ET
from collections.abc import Generator
from dataclasses import dataclass, field
from typing import ClassVar
from zipfile import ZipFile

from remarx.sentence.corpus.base_input import FileInput, SectionType

logger = logging.getLogger(__name__)


@dataclass
class ALTOInput(FileInput):
    """
    Preliminary FileInput implementation for ALTO XML delivered as a zipfile.

    Validates that the archive holds only ALTO XML documents, then iterates
    through the members in sorted filename order. Text extraction is still a
    stub: each member currently yields a single chunk with empty text.
    """

    ALTO_NAMESPACE: ClassVar[str] = "http://www.loc.gov/standards/alto/ns-v4#"
    "XML namespace accepted on the root element of each ALTO file."

    field_names: ClassVar[tuple[str, ...]] = (*FileInput.field_names, "section_type")
    "Tuple of field names for sentences originating from ALTO XML content."

    file_type: ClassVar[str] = ".zip"
    "Supported file extension for ALTO zipfiles."

    _validated: bool = field(init=False, default=False)
    "Flag indicating whether the input archive has already been validated."

    _alto_members: list[str] = field(init=False, default_factory=list)
    """Sorted list of ALTO XML filenames discovered during validation."""

    def get_text(self) -> Generator[dict[str, str], None, None]:
        """
        Iterate over ALTO XML files contained in the zipfile to get all the text content.

        The archive is validated first (this returns immediately once a
        previous validation has succeeded); chunks are then yielded member by
        member, in sorted filename order.
        """
        self.validate_archive()

        with ZipFile(self.input_file) as archive:
            for member_name in self._alto_members:
                logger.info("Processing ALTO XML file: %s", member_name)

                yield from self._yield_text_for_member(archive, member_name)

    def validate_archive(self) -> None:
        """
        Validate the zipfile contents and cache the confirmed filenames.

        Every file in the archive must have an `.xml` extension, parse as
        well-formed XML, and have a root element named `alto` (compared
        case-insensitively). When the root declares a namespace it must equal
        `ALTO_NAMESPACE`; a root without a namespace is accepted. Explicit
        directory entries are ignored. On success the sorted filenames are
        stored in `_alto_members` so later `get_text` calls can skip
        rescanning large zipfiles.

        Raises ValueError if the archive contains a non-XML file, contains no
        XML files at all, or has a member that is malformed XML, is not an
        ALTO document, or uses an unsupported namespace.
        """
        if self._validated:
            return

        with ZipFile(self.input_file) as archive:
            # ALTO XML filenames discovered in the zipfile
            member_filenames: list[str] = []
            for zip_info in archive.infolist():
                # Zipfiles built from a folder include explicit directory
                # entries (names ending in "/"); they carry no content, so
                # they must not be reported as non-XML files.
                if zip_info.is_dir():
                    continue
                if not zip_info.filename.lower().endswith(".xml"):
                    raise ValueError(
                        f"Non-XML file found in ALTO zipfile: {zip_info.filename}"
                    )
                member_filenames.append(zip_info.filename)

            if not member_filenames:
                raise ValueError("ALTO zipfile does not contain any XML files")

            for member_name in member_filenames:
                with archive.open(member_name) as member_file:
                    try:
                        root = ET.parse(member_file).getroot()
                    except ET.ParseError as exc:
                        raise ValueError(
                            f"Invalid XML in ALTO zipfile member: {member_name}"
                        ) from exc

                namespace, local_tag = self._split_tag(root.tag)
                if local_tag.lower() != "alto":
                    raise ValueError(
                        f"File {member_name} is not an ALTO document (root tag {root.tag})"
                    )
                # Only a declared namespace is checked; an un-namespaced
                # <alto> root passes this test.
                if namespace and namespace != self.ALTO_NAMESPACE:
                    raise ValueError(
                        f"Unsupported ALTO namespace in {member_name}: {namespace}"
                    )

        # Cache only after every member passed, so a failed validation is
        # re-run in full on the next call.
        self._alto_members = sorted(member_filenames)
        self._validated = True

    def _yield_text_for_member(
        self, archive: ZipFile, member_name: str
    ) -> Generator[dict[str, str], None, None]:
        """
        Hook for future ALTO parsing.

        Currently ignores both arguments and yields one placeholder chunk
        with empty text and the `SectionType.TEXT` section type.
        """
        yield {
            "text": "",
            "section_type": SectionType.TEXT.value,
        }

    @staticmethod
    def _split_tag(tag: str) -> tuple[str | None, str]:
        """
        Split a potentially namespaced XML tag into (namespace, local_tag).

        ElementTree writes namespaced tags as `{namespace}local`; a tag
        without the leading brace is returned with a namespace of None.
        """
        if tag.startswith("{"):
            namespace, _, local = tag[1:].partition("}")
            return namespace, local
        return None, tag
Binary file not shown.
109 changes: 109 additions & 0 deletions tests/test_sentence/test_corpus/test_alto_input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import logging
import shutil
from pathlib import Path
from zipfile import ZipFile

import pytest

from remarx.sentence.corpus.alto_input import ALTOInput
from remarx.sentence.corpus.base_input import FileInput, SectionType


@pytest.fixture
def alto_sample_zip(tmp_path: Path) -> Path:
    """
    Copy the sample ALTO zipfile from the fixtures directory next to this
    test module into the temporary directory and return the copy's path.
    """
    sample = Path(__file__).parent / "fixtures" / "alto_sample.zip"
    target = tmp_path / sample.name
    shutil.copy(sample, target)
    return target


def test_field_names():
    """ALTOInput extends the base FileInput field names with section_type."""
    expected = (*FileInput.field_names, "section_type")
    assert ALTOInput.field_names == expected


def test_get_text_iterates_xml(alto_sample_zip, caplog):
    """get_text yields one stub chunk per ALTO member and logs each filename."""
    logger_name = "remarx.sentence.corpus.alto_input"
    log_prefix = "Processing ALTO XML file: "
    stub_chunk = {"text": "", "section_type": SectionType.TEXT.value}
    expected_files = {
        "1896-97a.pdf_page_1.xml",
        "1896-97a.pdf_page_2.xml",
        "1896-97a.pdf_page_3.xml",
        "1896-97a.pdf_page_4.xml",
        "1896-97a.pdf_page_5.xml",
    }

    alto_input = ALTOInput(input_file=alto_sample_zip)
    with caplog.at_level(logging.INFO, logger=logger_name):
        chunks = list(alto_input.get_text())

    # One placeholder chunk per file in the sample archive
    assert len(chunks) == len(expected_files)
    for chunk in chunks:
        assert chunk == stub_chunk

    # Recover the processed filenames from the module's log messages
    processed_files = set()
    for record in caplog.records:
        message = record.getMessage()
        if record.name == logger_name and message.startswith(log_prefix):
            processed_files.add(message.removeprefix(log_prefix).strip())
    assert processed_files == expected_files


def test_validate_archive_success(alto_sample_zip):
    """Validating the sample archive succeeds, repeatedly, and caches the sorted members."""
    expected_members = [
        "1896-97a.pdf_page_1.xml",
        "1896-97a.pdf_page_2.xml",
        "1896-97a.pdf_page_3.xml",
        "1896-97a.pdf_page_4.xml",
        "1896-97a.pdf_page_5.xml",
    ]
    alto_input = ALTOInput(input_file=alto_sample_zip)

    # The first call validates; the second should reuse the cached
    # validation flag. Neither call should raise.
    for _ in range(2):
        alto_input.validate_archive()

    assert alto_input._alto_members == sorted(expected_members)


def test_validate_archive_rejects_non_xml(tmp_path: Path):
    """A zipfile containing a non-XML member fails validation."""
    zip_path = tmp_path / "invalid.zip"
    with ZipFile(zip_path, "w") as zf:
        zf.writestr("page1.txt", "not xml file")

    subject = ALTOInput(input_file=zip_path)
    with pytest.raises(ValueError, match="Non-XML file"):
        subject.validate_archive()


def test_validate_archive_rejects_non_alto_xml(tmp_path: Path):
    """Well-formed XML whose root element is not alto fails validation."""
    zip_path = tmp_path / "not_alto.zip"
    with ZipFile(zip_path, "w") as zf:
        zf.writestr("page1.xml", "<root></root>")

    subject = ALTOInput(input_file=zip_path)
    with pytest.raises(ValueError, match="not an ALTO document"):
        subject.validate_archive()


def test_validate_archive_rejects_unknown_namespace(tmp_path: Path):
    """An alto root declared in an unrecognized namespace fails validation."""
    foreign_ns_alto = '<alto xmlns="http://unknown_namespace.com/alto/ns#"><Description></Description></alto>'
    zip_path = tmp_path / "unknown_ns.zip"
    with ZipFile(zip_path, "w") as zf:
        zf.writestr("page1.xml", foreign_ns_alto)

    subject = ALTOInput(input_file=zip_path)
    with pytest.raises(ValueError, match="Unsupported ALTO namespace"):
        subject.validate_archive()


def test_validate_archive_rejects_empty_zip(tmp_path: Path):
    """A zipfile with no members at all fails validation."""
    zip_path = tmp_path / "empty.zip"
    # Opening in write mode and closing immediately produces a valid, empty archive
    ZipFile(zip_path, "w").close()

    subject = ALTOInput(input_file=zip_path)
    with pytest.raises(ValueError, match="does not contain any XML files"):
        subject.validate_archive()
15 changes: 12 additions & 3 deletions tests/test_sentence/test_corpus/test_base_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def test_subclasses():
subclass_names = [cls.__name__ for cls in FileInput.subclasses()]
# NOTE: that we use names here rather than importing, to
# confirm subclasses are found without a direct import
for input_cls_name in ["TextInput", "TEIinput"]:
for input_cls_name in ["TextInput", "TEIinput", "ALTOInput"]:
assert input_cls_name in subclass_names


Expand Down Expand Up @@ -42,7 +42,7 @@ def test_field_names(tmp_path: pathlib.Path):
def test_supported_types():
    """FileInput.supported_types() reports the expected file extensions."""
    # check for expected supported types
    # NOTE: checking directly to avoid importing input classes
    assert set(FileInput.supported_types()) == {".txt", ".xml", ".zip"}


def test_get_text(tmp_path: pathlib.Path):
Expand Down Expand Up @@ -113,10 +113,19 @@ def test_create_tei(mock_tei_doc, tmp_path: pathlib.Path):
mock_tei_doc.init_from_file.assert_called_with(xml_input_file)


def test_create_alto(tmp_path: pathlib.Path):
    """FileInput.create returns an ALTOInput for a path ending in .zip."""
    from remarx.sentence.corpus.alto_input import ALTOInput

    # An empty placeholder file is enough here: only the type of the
    # created input object is checked.
    zip_path = tmp_path / "input.zip"
    zip_path.touch()

    created = FileInput.create(input_file=zip_path)
    assert isinstance(created, ALTOInput)


def test_create_unsupported(tmp_path: pathlib.Path):
    """FileInput.create raises ValueError naming the supported types for an unknown extension."""
    test_file = tmp_path / "input.test"
    with pytest.raises(
        ValueError,
        match="\\.test is not a supported input type \\(must be one of \\.txt, \\.xml, \\.zip\\)",
    ):
        FileInput.create(input_file=test_file)