Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
9ca2a24
add line numbers to each sentence when build the corpus
tanhaow Oct 15, 2025
3a51f08
change to hook method
tanhaow Oct 15, 2025
e20829c
refactor sentence line number
tanhaow Oct 16, 2025
5036aab
use regex to clean text part
tanhaow Oct 16, 2025
013e65d
Update tei_input.py
tanhaow Oct 16, 2025
c1652b3
Merge remote-tracking branch 'origin/develop' into feature/add-line-n…
tanhaow Oct 16, 2025
92494fc
Update CHANGELOG.md
tanhaow Oct 16, 2025
a561915
Update tei_input.py
tanhaow Oct 16, 2025
60d6c5d
increase code coverage
tanhaow Oct 16, 2025
8264a84
Update tei_input.py
tanhaow Oct 16, 2025
61c4703
Update tei_input.py
tanhaow Oct 16, 2025
8695dcf
Update tei_input.py
tanhaow Oct 16, 2025
183a8e3
Update base_input.py
tanhaow Oct 17, 2025
fada6de
Update tei_input.py
tanhaow Oct 20, 2025
aa9f3e3
Update src/remarx/sentence/corpus/tei_input.py
tanhaow Oct 20, 2025
dd4b0b7
revise per suggesitons
tanhaow Oct 20, 2025
3e87114
Add comment noting why we need to call lstrip() first
tanhaow Oct 21, 2025
72885af
fix the multiple lb tags bug found by laure
tanhaow Oct 21, 2025
0d5df55
handle the case where text immediately after an <lb/> nested inside i…
tanhaow Oct 21, 2025
83f84ed
handle inline-markup cases
tanhaow Oct 21, 2025
5e2d218
Merge branch 'develop' into feature/add-line-numbers-to-corpus
tanhaow Oct 21, 2025
c6de485
Update test_tei_input.py
tanhaow Oct 21, 2025
752fe06
Merge branch 'feature/add-line-numbers-to-corpus' of https://github.c…
tanhaow Oct 21, 2025
6f43d4e
Revert "Update test_tei_input.py"
tanhaow Oct 21, 2025
7357091
add more tests to pass coverage check
tanhaow Oct 21, 2025
a21842f
Refactor preceding lb method; remove unnecessary footnote method
rlskoeser Oct 21, 2025
1cab2c1
Add unit test for find preceding lb method
rlskoeser Oct 21, 2025
8b1c659
Merge branch 'develop' into feature/add-line-numbers-to-corpus
rlskoeser Oct 21, 2025
d3d24b0
Add time logging for TEI input handling
rlskoeser Oct 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### Sentence corpus creation

- Sentence corpora generated from TEI now include a line number field (`line_number`) based on the `n` attribute of the line beginning tag (`<lb>`)
- Support for ALTO XML input as a zipfile with multiple pages
- Skips non-ALTO files, logs warnings for invalid or empty xml
- Yields sentence corpora indexed across pages; ordering based on natural sort of filenames
Expand Down
12 changes: 12 additions & 0 deletions src/remarx/sentence/corpus/base_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,16 @@ def get_text(self) -> Generator[dict[str, str]]:
"""
raise NotImplementedError

def get_extra_metadata(
    self, chunk_info: dict[str, Any], _char_idx: int, sentence: str
) -> dict[str, Any]:
    """
    Extension point for attaching additional per-sentence metadata.

    This base implementation contributes no fields; subclasses may override
    it to supply extra values for a sentence (e.g. a line number).

    :param chunk_info: Dictionary of information for the text chunk being processed
    :param _char_idx: Character index supplied by the caller; unused here
    :param sentence: Sentence text; unused here
    :returns: Dictionary of additional metadata fields to include; always
        empty in this base implementation
    """
    no_extra_fields: dict[str, Any] = {}
    return no_extra_fields

def get_sentences(self) -> Generator[dict[str, Any]]:
"""
Get sentences for this file, with associated metadata.
Expand Down Expand Up @@ -93,6 +103,8 @@ def get_sentences(self) -> Generator[dict[str, Any]]:
"sent_index": sentence_index,
"sent_id": f"{self.file_name}:{sentence_index}",
}
# Include any extra metadata (subclass specific)
| self.get_extra_metadata(chunk_info, _char_idx, sentence)
)

# increment sentence index
Expand Down
171 changes: 146 additions & 25 deletions src/remarx/sentence/corpus/tei_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,30 @@
from the TEI.
"""

import logging
import pathlib
import re
from collections import namedtuple
from collections.abc import Generator
from dataclasses import dataclass, field
from functools import cached_property
from typing import ClassVar, NamedTuple, Self
from timeit import default_timer as time
from typing import Any, ClassVar, NamedTuple, Self

from lxml.etree import XMLSyntaxError, _Element
from neuxml import xmlmap

from remarx.sentence.corpus.base_input import FileInput, SectionType

logger = logging.getLogger(__name__)


TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"

# namespaced tags look like {http://www.tei-c.org/ns/1.0}tagname
# create a named tuple of short tag name -> namespaced tag name
TagNames: NamedTuple = namedtuple(
"TagNames", ("pb", "lb", "note", "add", "label", "ref", "div3")
"TagNames", ("pb", "lb", "note", "add", "label", "ref", "div3", "text", "p")
)
TEI_TAG = TagNames(**{tag: f"{{{TEI_NAMESPACE}}}{tag}" for tag in TagNames._fields})
"Convenience access to namespaced TEI tag names"
Expand All @@ -37,6 +42,17 @@ class BaseTEIXmlObject(xmlmap.XmlObject):
ROOT_NAMESPACES: ClassVar[dict[str, str]] = {"t": TEI_NAMESPACE}


class TEIFootnote(BaseTEIXmlObject):
"""
XmlObject class for footnotes.

Maps two values from a footnote node: the line number where the footnote
begins and the footnote's whitespace-normalized text content.
"""

# XPath selects the `n` attribute of the first `lb` element that is a
# direct child of the footnote node, mapped as an integer
line_number = xmlmap.IntegerField("./t:lb[1]/@n")
"Line number where this footnote begins, based on first TEI line beginning (`lb`) within this note"

# use xmlmap StringField method with normalize=True to collapse whitespace in the footnote text;
# the "." XPath maps the string value of the footnote node itself
text = xmlmap.StringField(".", normalize=True)
"Normalized text content for the footnote (collapses whitespace)"


class TEIPage(BaseTEIXmlObject):
"""
Custom :class:`neuxml.xmlmap.XmlObject` instance for a page
Expand All @@ -56,7 +72,7 @@ class TEIPage(BaseTEIXmlObject):
# fetch footnotes after the current page break; will filter them in Python later
# pb is a delimiter (not a container), so "following::note" returns all later footnotes
following_footnotes = xmlmap.NodeListField(
"following::t:note[@type='footnote']", xmlmap.XmlObject
"following::t:note[@type='footnote']", TEIFootnote
)
"list of footnote elements within this page and following pages"

Expand All @@ -80,12 +96,12 @@ def is_footnote_content(el: _Element) -> bool:
TEIPage.is_footnote_content(ancestor) for ancestor in el.iterancestors()
)

def get_page_footnotes(self) -> list[xmlmap.XmlObject]:
def get_page_footnotes(self) -> list[TEIFootnote]:
"""
Filters footnotes to keep only the footnotes that belong to this page.
Only includes footnotes that occur between this pb and the next standard pb[not(@ed)].
"""
page_footnotes: list[xmlmap.XmlObject] = []
page_footnotes: list[TEIFootnote] = []

for footnote in self.following_footnotes:
# If we have a next page and this footnote belongs to it, we're done
Expand All @@ -95,11 +111,60 @@ def get_page_footnotes(self) -> list[xmlmap.XmlObject]:

return page_footnotes

def get_body_text_line_number(self, char_pos: int) -> int | None:
"""
Return the TEI line number for the line at or before `char_pos`.
Returns None if no line number can be determined.
"""
if not hasattr(self, "line_number_by_offset"):
self.get_body_text()

# When there are no line breaks with line numbers, return None
if not self.line_number_by_offset:
return None

line_number = None
for offset, ln in self.line_number_by_offset.items():
if offset > char_pos:
break
line_number = ln
return line_number

@staticmethod
def find_preceding_lb(element: _Element) -> _Element | None:
    """
    Locate the nearest <lb> element that precedes the given element.

    Needed when a line beginning is immediately followed by inline
    markup, e.g. <lb n="31"/><hi>text ...</hi>, so the text inside the
    inline element can still be associated with that <lb/>.

    The search checks preceding siblings (closest first) and then repeats
    the same check on each ancestor in turn.

    :param element: Element to search backwards from
    :returns: The closest preceding <lb> element, or None if the search
        reaches the top of the tree or a TEI text element first
    """
    # restrict sibling iteration to TEI-namespaced elements so that
    # non-tag nodes such as comments are never visited
    tei_elements_only = f"{{{TEI_NAMESPACE}}}*"

    current = element
    while True:
        for candidate in current.itersiblings(tei_elements_only, preceding=True):
            if candidate.tag == TEI_TAG.lb:
                return candidate
            # a preceding paragraph means we are past inline content;
            # stop scanning siblings at this level
            if candidate.tag == TEI_TAG.p:
                break

        # nothing at this level: move up one ancestor and try again
        parent = current.getparent()
        # no parent, or reached a text element: gone too far, give up
        if parent is None or parent.tag == TEI_TAG.text:
            return None
        current = parent

def get_body_text(self) -> str:
"""
Extract body text content for this page, excluding footnotes and editorial content.
While collecting the text, build a mapping of character offsets to TEI line numbers.
"""
body_text_parts = []
body_text_parts: list[str] = []
self.line_number_by_offset: dict[int, int] = {}
char_offset = 0

last_lb_el = None

for text in self.text_nodes:
# text here is an lxml smart string, which preserves context
# in the xml tree and is associated with a parent tag.
Expand All @@ -122,29 +187,54 @@ def get_body_text(self) -> str:
# OR if text comes immediately after (is_tail) and is whitespace only
continue

body_text_parts.append(text)
cleaned_text = re.sub(r"\s*\n\s*", "\n", str(text))

# Use lstrip() to strip leading whitespace from the very first text fragment
# before concatenation to avoid counting leading newlines toward `char_offset`,
# skewing line lookups.
if not body_text_parts:
cleaned_text = cleaned_text.lstrip()

# check for line begin tag; could be direct parent
# but in cases where <lb> is immediately followed by inline markup,
# it may be skipped due to having no tail text
preceding_lb = None
if parent.tag == TEI_TAG.lb:
preceding_lb = parent
else:
preceding_lb = self.find_preceding_lb(parent)

if preceding_lb is not None and preceding_lb is not last_lb_el:
# store the line number and character offset
line_number = preceding_lb.get("n")
self.line_number_by_offset[char_offset] = (
int(line_number) if line_number else None
)
# ensure text separated by <lb/> has a newline
# if there is a preceding text segment and it does not end
# with a newline, add one to the current text
if body_text_parts and not body_text_parts[-1].endswith("\n"):
cleaned_text = f"\n{cleaned_text}"

# set this element as the last lb handled, so we don't duplicate
last_lb_el = preceding_lb

if not cleaned_text:
continue

# consolidate whitespace once after joining all parts
# (i.e., space between indented tags in the XML)
return re.sub(r"\s*\n\s*", "\n", "".join(body_text_parts)).strip()
body_text_parts.append(cleaned_text)
char_offset += len(cleaned_text)

def get_individual_footnotes(self) -> Generator[str]:
"""
Get individual footnote content as a generator.
Yields each footnote's text content individually as a separate string element.
Each yielded element corresponds to one complete footnote from the page.
"""
for footnote in self.get_page_footnotes():
footnote_text = str(footnote).strip()
# consolidate whitespace for footnotes
footnote_text = re.sub(r"\s*\n\s*", "\n", footnote_text)
yield footnote_text
# join text parts and trim trailing whitespace
body_text = "".join(body_text_parts).rstrip()

return body_text

def get_footnote_text(self) -> str:
    """
    Get all footnote content for this page as a single string, with
    footnotes separated by double newlines.

    :returns: The `text` of each footnote returned by
        `get_page_footnotes()`, joined with a blank line between
        footnotes; an empty string when there are no footnotes
    """
    # Single return path: the earlier variant that joined the output of
    # get_individual_footnotes() made this statement unreachable.
    return "\n\n".join(fn.text for fn in self.get_page_footnotes())

def __str__(self) -> str:
"""
Expand Down Expand Up @@ -200,6 +290,7 @@ class TEIinput(FileInput):
*FileInput.field_names,
"page_number",
"section_type",
"line_number",
)
"List of field names for sentences from TEI XML input files"

Expand All @@ -225,6 +316,7 @@ def get_text(self) -> Generator[dict[str, str]]:
:returns: Generator with dictionaries of text content, with page number and section_type ("text" or "footnote").
"""
# yield body text and footnotes content chunked by page with page number
start = time()
for page in self.xml_doc.pages:
body_text = page.get_body_text()
if body_text:
Expand All @@ -235,10 +327,39 @@ def get_text(self) -> Generator[dict[str, str]]:
}

# Yield each footnote individually to enforce separate sentence segmentation
# so that separate footnotes cannot be combined into a single sentence by segmentation.
for footnote_text in page.get_individual_footnotes():
# so that separate footnotes cannot be combined into a single sentence
for footnote in page.get_page_footnotes():
yield {
"text": footnote_text,
"text": footnote.text,
"page_number": page.number,
"section_type": SectionType.FOOTNOTE.value,
"line_number": footnote.line_number,
}

elapsed_time = time() - start
logger.info(
f"Processed {self.file_name} with {len(self.xml_doc.pages)} in {elapsed_time:.1f} seconds"
)

def get_extra_metadata(
    self, chunk_info: dict[str, Any], char_idx: int, sentence: str
) -> dict[str, Any]:
    """
    Calculate extra metadata including line number for a sentence in TEI documents
    based on the character position within the text chunk (page body or footnote).

    :param chunk_info: Dictionary for the text chunk the sentence came from;
        must contain "page_number" unless it already contains "line_number"
    :param char_idx: Character position of the sentence within the chunk text
    :param sentence: Sentence text; not used for the lookup
    :returns: Dictionary with line_number for the sentence (None if not found)
    :raises KeyError: if `chunk_info` has neither "line_number" nor "page_number"
    """
    # If line_number is already in chunk_info (e.g., for footnotes), use it directly
    if "line_number" in chunk_info:
        return {"line_number": chunk_info["line_number"]}

    # Otherwise, calculate it for body text based on character position;
    # the first page whose number matches the chunk is used
    page_number = chunk_info["page_number"]
    page = next((p for p in self.xml_doc.pages if p.number == page_number), None)

    # Compare against None explicitly: only "no matching page" should skip
    # the lookup, regardless of how the page object evaluates as a boolean
    if page is None:
        return {"line_number": None}

    return {"line_number": page.get_body_text_line_number(char_idx)}
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,44 @@
<label type="footnote">
<hi rendition="sup">5</hi>)</label> Erste Anmerkung auf Seite 19.
</note>

<!-- A real example from Das Kapital where <lb/> tag has no line number -->
<pb n="20"/>
<figure type="facsimile">
<graphic url="facsimiles/sample-caption-page.jpg" mimeType="image/jpeg"/>
<ab type="caption">
<lb/>Erste Umschlagseite der Erstausgabe
<lb n="2"/>Second line with an explicit line number.
</ab>
</figure>

<!-- An example where we have multiple line breaks on the same line -->
<pb n="21"/>
<p>
<lb n="27"/>Zur Vorbereitung auf die folgenden Abschnitte fasse ich den bisherigen Gang:
<lb n="28"/>Das bisher Entwickelte gilt unter der Voraussetzung, daß im Fortgang der<lb n="29"/>Accumulation <hi rendition="i">das Verhältniß zwischen der Masse der Produktionsmittel und</hi>
<lb n="30"/>der Beschäftigten stets neu austariert werden muß.
</p>
<p>
<lb n="31"/><hi rendition="i">Schiedensten Proportionen</hi> mit andern Artikeln aus.
<lb n="32"/>Ihr Tauschwerth bleibt unverändert, egal in welchen Kombinationen er erscheint.
</p>

<!-- Mixed inline/tail <lb/> tags example (continuing same page) -->
<p>
<lb n="33"/>
<hi rendition="i">Schiedensten Proportionen</hi> mit andern Artikeln aus. Dennoch bleibt sein
<lb n="34"/>Tauschwerth <hi rendition="i">unverändert</hi>, ob in x Stiefelwichse, y Seide, z Gold u. s. w.
<lb n="35"/>ausgedrückt. Er muß also von diesen seinen verschiedenen <hi rendition="i">Ausdrucks-
<lb n="36"/>weisen</hi> unterscheidbar sein.
</p>
<p>
<lb n="37"/><lb n="38"/>
<lb n="39"/>Leerzeile als separates Beispiel.
</p>

<pb n="22"/>
<p>Eine Seite ohne markierte Zeilenumbrüche.</p>
</text>
</group>
</text>
Expand Down
Loading