Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

### Sentence corpus creation

- Preliminary support for ALTO XML input as a zipfile of multiple pages
- Support for ALTO XML input as a zipfile with multiple pages
- Skips non-ALTO files, logs warnings for invalid or empty XML
- Yields sentence corpora indexed across pages; ordering based on natural sort of filenames

## [0.2.0] - 2025-10-15

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dependencies = [
"voyager>=2.1.0",
"fastapi",
"uvicorn",
"natsort>=8.4.0",
]

[project.optional-dependencies]
Expand Down
247 changes: 173 additions & 74 deletions src/remarx/sentence/corpus/alto_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,111 +4,210 @@
"""

import logging
import xml.etree.ElementTree as ET
import pathlib
from collections.abc import Generator
from dataclasses import dataclass, field
from dataclasses import dataclass
from functools import cached_property
from timeit import default_timer as time
from typing import ClassVar
from zipfile import ZipFile

from lxml import etree
from natsort import natsorted
from neuxml import xmlmap

from remarx.sentence.corpus.base_input import FileInput, SectionType

logger = logging.getLogger(__name__)


@dataclass
class ALTOInput(FileInput):
ALTO_NAMESPACE_V4: str = "http://www.loc.gov/standards/alto/ns-v4#"


class AltoXmlObject(xmlmap.XmlObject):
"""
Preliminary FileInput implementation for ALTO XML delivered as a zipfile.
Iterates through ALTO XML members and stubs out chunk yielding for future parsing.
Base :class:`neuxml.xmlmap.XmlObject` class for ALTO-XML content.
"""

ALTO_NAMESPACE: ClassVar[str] = "http://www.loc.gov/standards/alto/ns-v4#"
# alto namespace v4; we may eventually need to support other versions
ROOT_NAMESPACES: ClassVar[dict[str, str]] = {"alto": ALTO_NAMESPACE_V4}

field_names: ClassVar[tuple[str, ...]] = (*FileInput.field_names, "section_type")
"List of field names for sentences originating from ALTO XML content."

file_type: ClassVar[str] = ".zip"
"Supported file extension for ALTO zipfiles."
class AltoBlock(AltoXmlObject):
"""
Base class for an ALTO element with position information.
"""

_validated: bool = field(init=False, default=False)
"Flag indicating whether the input archive has already been validated."
vertical_position = xmlmap.FloatField("@VPOS")
horizontal_position = xmlmap.FloatField("@HPOS")

_alto_members: list[str] = field(init=False, default_factory=list)
"""Sorted list of ALTO XML filenames discovered during validation."""

def get_text(self) -> Generator[dict[str, str], None, None]:
class TextLine(AltoBlock):
"""
Single line of text (`TextLine`) in an ALTO document
"""

text_content = xmlmap.StringField("alto:String/@CONTENT")

def __str__(self) -> str:
    """
    Return the text content of this line.

    Overrides the default string method. An empty string is returned when
    the line has no text content, since ``__str__`` must return a ``str``.
    """
    # NOTE(review): a StringField is documented (eulxml, which neuxml forks)
    # to evaluate to None when its XPath matches nothing, e.g. a TextLine
    # with no String child; confirm for neuxml. Returning None from
    # __str__ raises TypeError, so fall back to an empty string.
    return self.text_content or ""

with ZipFile(self.input_file) as archive:
for member_name in self._alto_members:
logger.info("Processing ALTO XML file: %s", member_name)

yield from self._yield_text_for_member(archive, member_name)
class TextBlock(AltoBlock):
"""
Block of text with one or more lines.
"""

lines = xmlmap.NodeListField("alto:TextLine", TextLine)

def validate_archive(self) -> None:
@cached_property
def sorted_lines(self) -> list[TextLine]:
    """
    Return the TextLines of this block, sorted by vertical position.

    Lines without a vertical position are sorted last, after all
    positioned lines, keeping their relative document order.
    """

    def line_position(line: TextLine) -> float:
        # A line without @VPOS has no usable position; comparing None with
        # a float would raise TypeError, so map it to +inf (sort last).
        position = line.vertical_position
        return float("inf") if position is None else position

    # there's no guarantee that xml document order follows page order,
    # so sort by @VPOS (may need further refinement for more complicated layouts)
    return sorted(self.lines, key=line_position)

with ZipFile(self.input_file) as archive:
# ALTO XML filenames discovered in the zipfile
member_filenames: list[str] = []
for zip_info in archive.infolist():
if not zip_info.filename.lower().endswith(".xml"):
raise ValueError(
f"Non-XML file found in ALTO zipfile: {zip_info.filename}"
)
member_filenames.append(zip_info.filename)
@property
def text_content(self) -> str:
    """
    Text contents of this block: the content of each line within the
    block, sorted by vertical position and joined with newlines.

    A line with no text content contributes an empty string, so the
    number of newline-delimited entries matches the number of lines.
    """
    # NOTE(review): a line's text_content is presumably None when the
    # TextLine has no String/@CONTENT (StringField with no XPath match);
    # str.join raises TypeError on None, so substitute an empty string.
    return "\n".join(line.text_content or "" for line in self.sorted_lines)

if not member_filenames:
raise ValueError("ALTO zipfile does not contain any XML files")

for member_name in member_filenames:
with archive.open(member_name) as member_file:
try:
root = ET.parse(member_file).getroot()
except ET.ParseError as exc:
raise ValueError(
f"Invalid XML in ALTO zipfile member: {member_name}"
) from exc

namespace, local_tag = self._split_tag(root.tag)
if local_tag.lower() != "alto":
raise ValueError(
f"File {member_name} is not an ALTO document (root tag {root.tag})"
)
if namespace and namespace != self.ALTO_NAMESPACE:
raise ValueError(
f"Unsupported ALTO namespace in {member_name}: {namespace}"
)
class AltoDocument(AltoXmlObject):
"""
:class:`neuxml.xmlmap.XmlObject` instance for a single ALTO XML file
"""

self._alto_members = sorted(member_filenames)
self._validated = True
blocks = xmlmap.NodeListField(".//alto:TextBlock", TextBlock)
lines = xmlmap.NodeListField(".//alto:TextLine", TextLine)

def _yield_text_for_member(
self, archive: ZipFile, member_name: str
) -> Generator[dict[str, str], None, None]:
def is_alto(self) -> bool:
    """
    Report whether this document is ALTO XML, judged by its root element:
    the root must be named ``alto`` and be in the ALTO v4 namespace.
    """
    # QName splits the root tag into its namespace and local name
    qname = etree.QName(self.node.tag)
    if qname.namespace != ALTO_NAMESPACE_V4:
        return False
    return qname.localname == "alto"

@property
def sorted_blocks(self) -> list[TextBlock]:
    """
    Return the TextBlocks for this page, sorted by vertical position.

    A block without its own vertical position is placed according to the
    position of its first line (by vertical order); a block with no
    usable position at all is sorted last.
    """

    def block_position(block: TextBlock) -> float:
        position = block.vertical_position
        # NOTE: in some cases, a textblock may not have a VPOS attribute;
        # in that case, use the position for the first line
        # (text block id = eSc_dummyblock_, but appears to have real content).
        # Test against None explicitly: a VPOS of 0.0 is a valid position
        # at the top of the page and must not trigger the fallback.
        if position is None and block.lines:
            position = block.sorted_lines[0].vertical_position
        # no position on the block or its first line: sort text block last
        return float("inf") if position is None else position

    # there's no guarantee that xml document order follows page order,
    # so sort by @VPOS (may need further refinement for more complicated layouts).
    # sorted() of an empty block list is simply an empty list.
    return sorted(self.blocks, key=block_position)

def text_chunks(self) -> Generator[dict[str, str]]:
    """
    Generate one dictionary per text block on the page, in sorted block
    order, holding the block's text content and its section type.
    """
    # chunks are produced per block because block-level semantic tagging
    # may determine the section type in future; for now every block is TEXT
    yield from (
        {"text": block.text_content, "section_type": SectionType.TEXT.value}
        for block in self.sorted_blocks
    )


@dataclass
class ALTOInput(FileInput):
"""
FileInput implementation for ALTO XML delivered as a zipfile.
Iterates through ALTO XML members in natural filename order and yields text chunks for each text block.
"""

@staticmethod
def _split_tag(tag: str) -> tuple[str | None, str]:
field_names: ClassVar[tuple[str, ...]] = (*FileInput.field_names, "section_type")
"List of field names for sentences originating from ALTO XML content."

file_type: ClassVar[str] = ".zip"
"Supported file extension for ALTO zipfiles (.zip)"

def get_text(self) -> Generator[dict[str, str], None, None]:
    """
    Generate text chunks from the ALTO XML files contained in the zipfile.

    Zip members are visited in natural sort order of their paths. Members
    whose filename does not end in ``.xml`` are ignored (logged at info
    level); XML members that cannot be parsed, or whose root element is
    not ALTO, are skipped with a warning. Every chunk from a page is
    extended with a ``file`` key set to the base filename of that page.

    :raises ValueError: after all members have been processed, if none of
        them was a valid ALTO XML file
    """
    num_files = 0
    num_valid_files = 0
    start = time()

    with ZipFile(self.input_file) as archive:
        # natural sorting processes numbered pages in logical order;
        # the counter ends up as the total number of zip members seen
        for num_files, zip_filepath in enumerate(
            natsorted(archive.namelist()), start=1
        ):
            # filename without any directory portion of the member path
            base_filename = pathlib.Path(zip_filepath).name

            has_xml_extension = base_filename.lower().endswith(".xml")
            if not has_xml_extension:
                # ignore & log non-xml files
                logger.info(
                    f"Ignoring non-xml file included in ALTO zipfile: {zip_filepath}"
                )
                continue

            # archive.open returns a file-like object for the member
            with archive.open(zip_filepath) as xmlfile:
                logger.info(f"Processing XML file {zip_filepath}")
                try:
                    alto_xmlobj = xmlmap.load_xmlobject_from_file(
                        xmlfile, AltoDocument
                    )
                except etree.XMLSyntaxError as err:
                    logger.warning(f"Skipping {zip_filepath} : invalid XML")
                    logger.debug(f"XML syntax error: {err}", exc_info=err)
                    continue

                if not alto_xmlobj.is_alto():
                    # TODO: add unit test for this case
                    logger.warning(
                        f"Skipping non-ALTO XML file {zip_filepath} (root element {alto_xmlobj.node.tag})"
                    )
                    continue

                num_valid_files += 1
                # per-file totals of blocks and lines, reported as processed
                logger.debug(
                    f"{base_filename}: {len(alto_xmlobj.blocks)} blocks, {len(alto_xmlobj.lines)} lines"
                )

                # label chunks with the page's own xml filename rather
                # than the name of the zipfile that contains every page
                file_info = {"file": base_filename}
                yield from (
                    chunk | file_info for chunk in alto_xmlobj.text_chunks()
                )

                # a valid ALTO file without any lines is worth a warning
                if len(alto_xmlobj.lines) == 0:
                    logger.warning(
                        f"No text lines found in ALTO XML file: {base_filename}"
                    )

    elapsed_time = time() - start
    logger.info(
        f"Processed {self.file_name} with {num_files} files ({num_valid_files} valid ALTO) in {elapsed_time:.1f} seconds"
    )

    # an archive with no usable ALTO content is an error
    if not num_valid_files:
        raise ValueError(f"No valid ALTO XML files found in {self.file_name}")
18 changes: 12 additions & 6 deletions src/remarx/sentence/corpus/base_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,18 @@ def get_sentences(self) -> Generator[dict[str, Any]]:

# character index is not included in output,
# but may be useful for sub-chunk metadata (e.g., line number)
yield chunk_info | {
"text": sentence,
"file": self.file_name,
"sent_index": sentence_index,
"sent_id": f"{self.file_name}:{sentence_index}",
}

# specify input file name first;
# chunk-specific filename takes precedence (e.g. alto file within zip)
yield (
{"file": self.file_name}
| chunk_info
| {
"text": sentence,
"sent_index": sentence_index,
"sent_id": f"{self.file_name}:{sentence_index}",
}
)

# increment sentence index
sentence_index += 1
Expand Down
4 changes: 2 additions & 2 deletions src/remarx/sentence/corpus/tei_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class BaseTEIXmlObject(xmlmap.XmlObject):

class TEIPage(BaseTEIXmlObject):
"""
Custom :class:`eulxml.xmlmap.XmlObject` instance for a page
Custom :class:`neuxml.xmlmap.XmlObject` instance for a page
of content within a TEI XML document.
"""

Expand Down Expand Up @@ -155,7 +155,7 @@ def __str__(self) -> str:

class TEIDocument(BaseTEIXmlObject):
"""
Custom :class:`eulxml.xmlmap.XmlObject` instance for TEI XML document.
Custom :class:`neuxml.xmlmap.XmlObject` instance for TEI XML document.
Customized for MEGA TEI XML.
"""

Expand Down
Empty file added tests/test_sentence/__init__.py
Empty file.
Empty file.
Loading