Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@

### Sentence corpus creation

- Sentence corpora generated from TEI now include line number field (`line_number`) based on line begin tag (`<lb>` n attribute
- Sentence corpora generated from TEI now include line number field (`line_number`) based on line begin tag (`<lb>` n attribute)
- Support for ALTO XML input as a zipfile with multiple pages
- Skips non-ALTO files, logs warnings for invalid or empty xml
- Yields sentence corpora indexed across pages; ordering based on natural sort of filenames
- Improved logging output for `remarx-create-corpus` script, with optional verbose mode

## [0.2.0] - 2025-10-15

Expand Down
13 changes: 13 additions & 0 deletions src/remarx/sentence/corpus/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@

import argparse
import csv
import logging
import pathlib
import sys

from remarx.sentence.corpus.base_input import FileInput
from remarx.utils import configure_logging


def create_corpus(
Expand Down Expand Up @@ -56,9 +59,19 @@ def main() -> None:
parser.add_argument(
"output_csv", type=pathlib.Path, help="Path to output sentence corpus (CSV)"
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
help="Verbose output (debug logging)",
default=False,
)

args = parser.parse_args()

log_level = logging.DEBUG if args.verbose else logging.INFO

configure_logging(sys.stdout, log_level=log_level)
create_corpus(
args.input_file,
args.output_csv,
Expand Down
18 changes: 15 additions & 3 deletions src/remarx/sentence/corpus/tei_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,11 @@ def pages(self) -> list[TEIPage]:
# it's more efficient to filter in python than in xpath
return [page for page in self.all_pages if page.edition != "manuscript"]

@cached_property
def pages_by_number(self) -> dict[str, TEIPage]:
    """
    Map each standard page's number to the corresponding page object.

    The mapping is built from ``self.pages`` and keeps the order of that
    list. If two pages report the same number, the later page replaces the
    earlier one under that key (ordinary dict assignment semantics).
    """
    lookup = {}
    for tei_page in self.pages:
        lookup[tei_page.number] = tei_page
    return lookup

@classmethod
def init_from_file(cls, path: pathlib.Path) -> Self:
"""
Expand Down Expand Up @@ -317,7 +322,9 @@ def get_text(self) -> Generator[dict[str, str]]:
"""
# yield body text and footnotes content chunked by page with page number
start = time()
for page in self.xml_doc.pages:
for page in self.xml_doc.pages_by_number.values():
page_start = time()

body_text = page.get_body_text()
if body_text:
yield {
Expand All @@ -336,9 +343,14 @@ def get_text(self) -> Generator[dict[str, str]]:
"line_number": footnote.line_number,
}

page_elapsed_time = time() - page_start
logger.debug(
f"Processing page {page.number} in {page_elapsed_time:.2f} seconds"
)

elapsed_time = time() - start
logger.info(
f"Processed {self.file_name} with {len(self.xml_doc.pages)} in {elapsed_time:.1f} seconds"
f"Processed {self.file_name} with {len(self.xml_doc.pages)} pages in {elapsed_time:.1f} seconds"
)

def get_extra_metadata(
Expand All @@ -356,7 +368,7 @@ def get_extra_metadata(

# Otherwise, calculate it for body text based on character position
page_number = chunk_info["page_number"]
page = next((p for p in self.xml_doc.pages if p.number == page_number), None)
page = self.xml_doc.pages_by_number.get(page_number)

if page:
line_number = page.get_body_text_line_number(char_idx)
Expand Down
12 changes: 11 additions & 1 deletion tests/test_sentence/test_corpus/test_create.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import logging
import pathlib
import sys
from unittest.mock import Mock, patch

import pytest
Expand Down Expand Up @@ -53,10 +55,18 @@ def test_create_corpus_filename_override(mock_file_input, tmp_path: pathlib.Path
)


@patch("remarx.sentence.corpus.create.configure_logging")
@patch("remarx.sentence.corpus.create.create_corpus", spec=create_corpus)
def test_main(mock_create_corpus):
def test_main(mock_create_corpus, mock_config_logging):
with patch("sys.argv", ["create_corpus.py", "input", "output"]):
main()
mock_create_corpus.assert_called_once_with(
pathlib.Path("input"), pathlib.Path("output")
)
mock_config_logging.assert_called_once_with(sys.stdout, log_level=logging.INFO)

mock_config_logging.reset_mock()
# test verbose arg
with patch("sys.argv", ["create_corpus.py", "input", "output", "-v"]):
main()
mock_config_logging.assert_called_once_with(sys.stdout, log_level=logging.DEBUG)
8 changes: 8 additions & 0 deletions tests/test_sentence/test_corpus/test_tei_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,14 @@ def test_pages(self):
# for these pages, edition attribute is not present
assert all(p.edition is None for p in tei_doc.pages)

def test_pages_by_number(self):
    """Check that pages_by_number mirrors pages, in order, keyed by number."""
    doc = TEIDocument.init_from_file(TEST_TEI_FILE)
    lookup = doc.pages_by_number
    assert isinstance(lookup, dict)
    # values and keys follow the same order as the page list
    assert list(lookup.values()) == doc.pages
    assert list(lookup.keys()) == [pg.number for pg in doc.pages]
    # a page can be retrieved by its own page number
    page_one = doc.pages[0]
    assert lookup[page_one.number] == page_one


class TestTEIPage:
def test_attributes(self):
Expand Down
Loading