13 changes: 12 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,16 @@
# CHANGELOG

## 0.2.0
## 0.3.0

### Sentence corpus creation

- Sentence corpora generated from TEI now include a line number field (`line_number`), derived from the `n` attribute of line-beginning (`<lb>`) tags
- Support for ALTO XML input as a zipfile with multiple pages
- Skips non-ALTO files; logs warnings for invalid or empty XML
- Yields a sentence corpus indexed across pages; ordering is based on a natural sort of filenames
- Improved logging output for the `remarx-create-corpus` script, with optional verbose mode

## [0.2.0] - 2025-10-15

### Application

@@ -59,3 +69,4 @@ _Initial release._
- Add GitHub Actions workflow to build and publish python package on PyPI when a new GitHub release created

[0.1.0]: https://github.com/Princeton-CDH/remarx/tree/0.1
[0.2.0]: https://github.com/Princeton-CDH/remarx/tree/0.2
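
To make the new ALTO zipfile support concrete, here is a minimal usage sketch; the zip path is hypothetical, and it assumes `input_file` is the first (and only required) dataclass field of `FileInput`, as suggested by the `self.input_file` references in `alto_input.py` below.

import pathlib

from remarx.sentence.corpus import ALTOInput

# hypothetical path; any zipfile of ALTO XML page files should work
alto_zip = pathlib.Path("pages.zip")

# assumes input_file is the first positional field of the FileInput dataclass
corpus = ALTOInput(alto_zip)
for sentence in corpus.get_sentences():
    # each row is a dict with file, text, sent_index, sent_id, and section_type
    print(sentence["sent_id"], sentence["section_type"])
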
6 changes: 6 additions & 0 deletions pyproject.toml
@@ -35,6 +35,7 @@ dependencies = [
"voyager>=2.1.0",
"fastapi",
"uvicorn",
"natsort>=8.4.0",
]

[project.optional-dependencies]
@@ -81,6 +82,11 @@ omit = [

[tool.coverage.report]
show_missing = true # Helpful for debugging
exclude_lines = [
"# pragma: no cover",
# skip command-line configuration for main method on scripts
"if __name__ == .__main__.:"
]

[tool.coverage.html]
directory = "coverage_html_report"
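
The new `natsort` dependency is what provides the changelog's "natural sort of filenames" ordering. A quick illustrative comparison with plain lexicographic sorting, using made-up page filenames:

from natsort import natsorted

pages = ["page_10.xml", "page_2.xml", "page_1.xml"]
print(sorted(pages))     # ['page_1.xml', 'page_10.xml', 'page_2.xml']
print(natsorted(pages))  # ['page_1.xml', 'page_2.xml', 'page_10.xml']
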
2 changes: 1 addition & 1 deletion src/remarx/__init__.py
@@ -5,6 +5,6 @@

from remarx import app, sentence

__version__ = "0.2"
__version__ = "0.3rc1"

__all__ = ["__version__", "app", "sentence"]
11 changes: 10 additions & 1 deletion src/remarx/sentence/corpus/__init__.py
@@ -2,8 +2,17 @@
Functionality for loading and chunking input files for sentence corpus creation.
"""

from remarx.sentence.corpus.alto_input import ALTOInput
from remarx.sentence.corpus.base_input import FileInput
from remarx.sentence.corpus.tei_input import TEI_TAG, TEIDocument, TEIinput, TEIPage
from remarx.sentence.corpus.text_input import TextInput

__all__ = ["TEI_TAG", "FileInput", "TEIDocument", "TEIPage", "TEIinput", "TextInput"]
__all__ = [
"TEI_TAG",
"ALTOInput",
"FileInput",
"TEIDocument",
"TEIPage",
"TEIinput",
"TextInput",
]
213 changes: 213 additions & 0 deletions src/remarx/sentence/corpus/alto_input.py
@@ -0,0 +1,213 @@
"""
Functionality related to parsing ALTO XML content packaged within a zipfile,
with the goal of creating a sentence corpus with associated metadata from ALTO.
"""

import logging
import pathlib
from collections.abc import Generator
from dataclasses import dataclass
from functools import cached_property
from timeit import default_timer as time
from typing import ClassVar
from zipfile import ZipFile

from lxml import etree
from natsort import natsorted
from neuxml import xmlmap

from remarx.sentence.corpus.base_input import FileInput, SectionType

logger = logging.getLogger(__name__)


ALTO_NAMESPACE_V4: str = "http://www.loc.gov/standards/alto/ns-v4#"


class AltoXmlObject(xmlmap.XmlObject):
"""
Base :class:`neuxml.xmlmap.XmlObject` class for ALTO-XML content.
"""

# alto namespace v4; we may eventually need to support other versions
ROOT_NAMESPACES: ClassVar[dict[str, str]] = {"alto": ALTO_NAMESPACE_V4}


class AltoBlock(AltoXmlObject):
"""
Base class for an ALTO element with position information.
"""

vertical_position = xmlmap.FloatField("@VPOS")
horizontal_position = xmlmap.FloatField("@HPOS")


class TextLine(AltoBlock):
"""
Single line of text (`TextLine`) in an ALTO document
"""

text_content = xmlmap.StringField("alto:String/@CONTENT")

def __str__(self) -> str:
"""
Override default string method to return text content of this line.
"""
return self.text_content


class TextBlock(AltoBlock):
"""
Block of text with one or more lines.
"""

lines = xmlmap.NodeListField("alto:TextLine", TextLine)

@cached_property
def sorted_lines(self) -> list[TextLine]:
"""
Returns a list of TextLines for this block, sorted by vertical position.
"""
# there's no guarantee that xml document order follows page order,
# so sort by @VPOS (may need further refinement for more complicated layouts)
return sorted(self.lines, key=lambda line: line.vertical_position)

@property
def text_content(self) -> str:
"""
Text contents of this block; newline-delimited content of
each line within this block, sorted by vertical position.
"""
return "\n".join([line.text_content for line in self.sorted_lines])


class AltoDocument(AltoXmlObject):
"""
:class:`neuxml.xmlmap.XmlObject` instance for a single ALTO XML file
"""

blocks = xmlmap.NodeListField(".//alto:TextBlock", TextBlock)
lines = xmlmap.NodeListField(".//alto:TextLine", TextLine)

def is_alto(self) -> bool:
"""
Check if this is an ALTO-XML document, based on the root element
"""
# parse with QName to access namespace and tag name without namespace
root_element = etree.QName(self.node.tag)
# both must match
return (
root_element.namespace == ALTO_NAMESPACE_V4
and root_element.localname == "alto"
)

@property
def sorted_blocks(self) -> list[TextBlock]:
"""
Returns a list of TextBlocks for this page, sorted by vertical position.
"""
# there's no guarantee that xml document order follows page order,
# so sort by @VPOS (may need further refinement for more complicated layouts).
# NOTE: in some cases, a textblock may not have a VPOS attribute;
# in that case, use the position for the first line
# (text block id = eSc_dummyblock_, but appears to have real content)
# if the block has no lines, sort it last
if not self.blocks:
return []
return sorted(
self.blocks,
key=lambda block: block.vertical_position
or (
block.sorted_lines[0].vertical_position if block.lines else float("inf")
),
)

def text_chunks(self) -> Generator[dict[str, str]]:
"""
Returns a generator of dictionaries of text content and section type,
one dictionary per text block on the page.
"""
# yield by block, since in future we may set section type
# based on block-level semantic tagging
for block in self.sorted_blocks:
yield {"text": block.text_content, "section_type": SectionType.TEXT.value}


@dataclass
class ALTOInput(FileInput):
"""
Preliminary FileInput implementation for ALTO XML delivered as a zipfile.
Iterates through ALTO XML members and yields text content by block, with section type metadata.
"""

field_names: ClassVar[tuple[str, ...]] = (*FileInput.field_names, "section_type")
"List of field names for sentences originating from ALTO XML content."

file_type: ClassVar[str] = ".zip"
"Supported file extension for ALTO zipfiles (.zip)"

def get_text(self) -> Generator[dict[str, str], None, None]:
"""
Iterate over ALTO XML files contained in the zipfile and return
a generator of text content.
"""
num_files = 0
num_valid_files = 0

start = time()
with ZipFile(self.input_file) as archive:
# iterate over all files in the zipfile;
# use natural sorting to process in logical order
for zip_filepath in natsorted(archive.namelist()):
num_files += 1
base_filename = pathlib.Path(zip_filepath).name
# ignore & log non-xml files
if not base_filename.lower().endswith(".xml"):
logger.info(
f"Ignoring non-xml file included in ALTO zipfile: {zip_filepath}"
)
continue
# if the file is .xml, attempt to open as an ALTO XML
with archive.open(zip_filepath) as xmlfile:
logger.info(f"Processing XML file {zip_filepath}")
# zipfile archive open returns a file-like object
try:
alto_xmlobj = xmlmap.load_xmlobject_from_file(
xmlfile, AltoDocument
)
except etree.XMLSyntaxError as err:
logger.warning(f"Skipping {zip_filepath} : invalid XML")
logger.debug(f"XML syntax error: {err}", exc_info=err)
continue

if not alto_xmlobj.is_alto():
# TODO: add unit test for this case
logger.warning(
f"Skipping non-ALTO XML file {zip_filepath} (root element {alto_xmlobj.node.tag})"
)
continue

num_valid_files += 1
# report total # blocks, lines for each file as processed
logger.debug(
f"{base_filename}: {len(alto_xmlobj.blocks)} blocks, {len(alto_xmlobj.lines)} lines"
)

# use the base xml filename here, rather than the zipfile name for all chunks
for chunk in alto_xmlobj.text_chunks():
yield chunk | {"file": base_filename}

# warn if a document has no lines
if len(alto_xmlobj.lines) == 0:
logger.warning(
f"No text lines found in ALTO XML file: {base_filename}"
)

elapsed_time = time() - start
logger.info(
f"Processed {self.file_name} with {num_files} files ({num_valid_files} valid ALTO) in {elapsed_time:.1f} seconds"
)

# error if no valid files were found
if num_valid_files == 0:
raise ValueError(f"No valid ALTO XML files found in {self.file_name}")
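
To see how `AltoDocument` reorders content by position, here is a small sketch that parses an inline, made-up ALTO fragment; it assumes `neuxml` provides `load_xmlobject_from_string` alongside the `load_xmlobject_from_file` call used above.

from neuxml import xmlmap

from remarx.sentence.corpus.alto_input import AltoDocument

# made-up two-line fragment; the lines appear out of page order in the XML
ALTO_SAMPLE = """<alto xmlns="http://www.loc.gov/standards/alto/ns-v4#">
  <Layout><Page><PrintSpace>
    <TextBlock HPOS="0" VPOS="100">
      <TextLine HPOS="0" VPOS="200"><String CONTENT="second line"/></TextLine>
      <TextLine HPOS="0" VPOS="120"><String CONTENT="first line"/></TextLine>
    </TextBlock>
  </PrintSpace></Page></Layout>
</alto>"""

doc = xmlmap.load_xmlobject_from_string(ALTO_SAMPLE, AltoDocument)
assert doc.is_alto()
for chunk in doc.text_chunks():
    # lines are sorted by VPOS, so the text reads "first line\nsecond line"
    print(chunk["section_type"], repr(chunk["text"]))
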
30 changes: 24 additions & 6 deletions src/remarx/sentence/corpus/base_input.py
@@ -60,6 +60,16 @@ def get_text(self) -> Generator[dict[str, str]]:
"""
raise NotImplementedError

def get_extra_metadata(
self, chunk_info: dict[str, Any], _char_idx: int, sentence: str
) -> dict[str, Any]:
"""
Hook method for subclasses to override to provide extra metadata for a sentence (e.g. line number).

:returns: Dictionary of additional metadata fields to include, or empty dict
"""
return {}

def get_sentences(self) -> Generator[dict[str, Any]]:
"""
Get sentences for this file, with associated metadata.
@@ -82,12 +92,20 @@ def get_sentences(self) -> Generator[dict[str, Any]]:

# character index is not included in output,
# but may be useful for sub-chunk metadata (e.g., line number)
yield chunk_info | {
"text": sentence,
"file": self.file_name,
"sent_index": sentence_index,
"sent_id": f"{self.file_name}:{sentence_index}",
}

# specify input file name first;
# chunk-specific filename takes precedence (e.g. ALTO file within zip)
yield (
{"file": self.file_name}
| chunk_info
| {
"text": sentence,
"sent_index": sentence_index,
"sent_id": f"{self.file_name}:{sentence_index}",
}
# Include any extra metadata (subclass specific)
| self.get_extra_metadata(chunk_info, _char_idx, sentence)
)

# increment sentence index
sentence_index += 1
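
The new `get_extra_metadata` hook is what lets a subclass attach fields such as the changelog's `line_number`. A minimal sketch of an override, using a hypothetical subclass and an assumed `line_offsets` entry in `chunk_info`:

from typing import Any, ClassVar

from remarx.sentence.corpus.base_input import FileInput


class LineAwareInput(FileInput):
    # hypothetical subclass; a real one would also implement get_text()
    field_names: ClassVar[tuple[str, ...]] = (*FileInput.field_names, "line_number")

    def get_extra_metadata(
        self, chunk_info: dict[str, Any], char_idx: int, sentence: str
    ) -> dict[str, Any]:
        # line_offsets is an assumed per-chunk list of character offsets where
        # each line starts; count how many lines begin at or before the
        # sentence's starting character index
        offsets = chunk_info.get("line_offsets")
        if not offsets:
            return {}
        return {"line_number": sum(1 for offset in offsets if offset <= char_idx)}
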
13 changes: 13 additions & 0 deletions src/remarx/sentence/corpus/create.py
@@ -12,9 +12,12 @@

import argparse
import csv
import logging
import pathlib
import sys

from remarx.sentence.corpus.base_input import FileInput
from remarx.utils import configure_logging


def create_corpus(
@@ -56,9 +59,19 @@ def main() -> None:
parser.add_argument(
"output_csv", type=pathlib.Path, help="Path to output sentence corpus (CSV)"
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
help="Verbose output (debug logging)",
default=False,
)

args = parser.parse_args()

log_level = logging.DEBUG if args.verbose else logging.INFO

configure_logging(sys.stdout, log_level=log_level)
create_corpus(
args.input_file,
args.output_csv,
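
For callers building a corpus from Python rather than the `remarx-create-corpus` script, the same logging setup can be applied before `create_corpus`. A minimal sketch, assuming the paths are placeholders and that the two positional arguments visible in the script's call (input file, output CSV) are sufficient:

import logging
import pathlib
import sys

from remarx.sentence.corpus.create import create_corpus
from remarx.utils import configure_logging

# mirror the script's --verbose flag when calling the API directly
configure_logging(sys.stdout, log_level=logging.DEBUG)

create_corpus(pathlib.Path("pages.zip"), pathlib.Path("sentences.csv"))
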