Princeton-CDH · tanhaow · Oct 22, 2025 · Oct 15, 2025 · Oct 15, 2025 · Oct 16, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### Sentence corpus creation
 
+- Sentence corpora generated from TEI now include a line number field (`line_number`) based on the `n` attribute of the line beginning tag (`<lb>`)
 - Support for ALTO XML input as a zipfile with multiple pages
     - Skips non-ALTO files, logs warnings for invalid or empty xml
     - Yields sentence corpora indexed across pages; ordering based on natural sort of filenames

diff --git a/src/remarx/sentence/corpus/base_input.py b/src/remarx/sentence/corpus/base_input.py
@@ -60,6 +60,16 @@ def get_text(self) -> Generator[dict[str, str]]:
         """
         raise NotImplementedError
 
+    def get_extra_metadata(
+        self, chunk_info: dict[str, Any], _char_idx: int, sentence: str
+    ) -> dict[str, Any]:
+        """
+        Hook method for subclasses to override to provide extra metadata for a sentence (e.g. line number).
+
+        :returns: Dictionary of additional metadata fields to include, or empty dict
+        """
+        return {}
+
     def get_sentences(self) -> Generator[dict[str, Any]]:
         """
         Get sentences for this file, with associated metadata.
@@ -93,6 +103,8 @@ def get_sentences(self) -> Generator[dict[str, Any]]:
                         "sent_index": sentence_index,
                         "sent_id": f"{self.file_name}:{sentence_index}",
                     }
+                    # Include any extra metadata (subclass specific)
+                    | self.get_extra_metadata(chunk_info, _char_idx, sentence)
                 )
 
                 # increment sentence index

diff --git a/src/remarx/sentence/corpus/tei_input.py b/src/remarx/sentence/corpus/tei_input.py
@@ -4,25 +4,30 @@
 from the TEI.
 """
 
+import logging
 import pathlib
 import re
 from collections import namedtuple
 from collections.abc import Generator
 from dataclasses import dataclass, field
 from functools import cached_property
-from typing import ClassVar, NamedTuple, Self
+from timeit import default_timer as time
+from typing import Any, ClassVar, NamedTuple, Self
 
 from lxml.etree import XMLSyntaxError, _Element
 from neuxml import xmlmap
 
 from remarx.sentence.corpus.base_input import FileInput, SectionType
 
+logger = logging.getLogger(__name__)
+
+
 TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"
 
 # namespaced tags look like {http://www.tei-c.org/ns/1.0}tagname
 # create a named tuple of short tag name -> namespaced tag name
 TagNames: NamedTuple = namedtuple(
-    "TagNames", ("pb", "lb", "note", "add", "label", "ref", "div3")
+    "TagNames", ("pb", "lb", "note", "add", "label", "ref", "div3", "text", "p")
 )
 TEI_TAG = TagNames(**{tag: f"{{{TEI_NAMESPACE}}}{tag}" for tag in TagNames._fields})
 "Convenience access to namespaced TEI tag names"
@@ -37,6 +42,17 @@ class BaseTEIXmlObject(xmlmap.XmlObject):
     ROOT_NAMESPACES: ClassVar[dict[str, str]] = {"t": TEI_NAMESPACE}
 
 
+class TEIFootnote(BaseTEIXmlObject):
+    """XmlObject class for footnotes."""
+
+    line_number = xmlmap.IntegerField("./t:lb[1]/@n")
+    "Line number where this footnote begins, based on first TEI line beginning (`lb`) within this note"
+
+    # use xmlmap StringField method with normalize=True to collapse whitespace in the footnote text
+    text = xmlmap.StringField(".", normalize=True)
+    "Normalized text content for the footnote (collapses whitespace)"
+
+
 class TEIPage(BaseTEIXmlObject):
     """
     Custom :class:`neuxml.xmlmap.XmlObject` instance for a page
@@ -56,7 +72,7 @@ class TEIPage(BaseTEIXmlObject):
     # fetch footnotes after the current page break; will filter them in Python later
     # pb is a delimiter (not a container), so "following::note" returns all later footnotes
     following_footnotes = xmlmap.NodeListField(
-        "following::t:note[@type='footnote']", xmlmap.XmlObject
+        "following::t:note[@type='footnote']", TEIFootnote
     )
     "list of footnote elements within this page and following pages"
 
@@ -80,12 +96,12 @@ def is_footnote_content(el: _Element) -> bool:
             TEIPage.is_footnote_content(ancestor) for ancestor in el.iterancestors()
         )
 
-    def get_page_footnotes(self) -> list[xmlmap.XmlObject]:
+    def get_page_footnotes(self) -> list[TEIFootnote]:
         """
         Filters footnotes to keep only the footnotes that belong to this page.
         Only includes footnotes that occur between this pb and the next standard pb[not(@ed)].
         """
-        page_footnotes: list[xmlmap.XmlObject] = []
+        page_footnotes: list[TEIFootnote] = []
 
         for footnote in self.following_footnotes:
             # If we have a next page and this footnote belongs to it, we're done
@@ -95,11 +111,60 @@ def get_page_footnotes(self) -> list[xmlmap.XmlObject]:
 
         return page_footnotes
 
+    def get_body_text_line_number(self, char_pos: int) -> int | None:
+        """
+        Return the TEI line number for the line at or before `char_pos`.
+        Returns None if no line number can be determined.
+        """
+        if not hasattr(self, "line_number_by_offset"):
+            self.get_body_text()
+
+        # When there are no line breaks with line numbers, return None
+        if not self.line_number_by_offset:
+            return None
+
+        line_number = None
+        for offset, ln in self.line_number_by_offset.items():
+            if offset > char_pos:
+                break
+            line_number = ln
+        return line_number
+
+    @staticmethod
+    def find_preceding_lb(element: _Element) -> _Element | None:
+        """
+        Find the closest preceding <lb> element for an element.
+        Needed to find the <lb/> relative to immediately following
+        inline markup, e.g. <lb n="31"/><hi>text ...</hi>
+        """
+
+        # First, iterate over preceding siblings;
+        # Limit to TEI nodes to avoid iterating over non-tag nodes like comments
+        for sibling in element.itersiblings(f"{{{TEI_NAMESPACE}}}*", preceding=True):
+            if sibling.tag == TEI_TAG.lb:
+                return sibling
+            # if we hit a preceding paragraph, stop iterating (beyond inline text)
+            if sibling.tag == TEI_TAG.p:
+                break
+
+        # if not found, try on the parent element
+        parent = element.getparent()
+        # if no parent, or hit a text element, we've gone too far; bail out
+        if parent is None or parent.tag == TEI_TAG.text:
+            return None
+        return TEIPage.find_preceding_lb(element.getparent())
+
     def get_body_text(self) -> str:
         """
         Extract body text content for this page, excluding footnotes and editorial content.
+        While collecting the text, build a mapping of character offsets to TEI line numbers.
         """
-        body_text_parts = []
+        body_text_parts: list[str] = []
+        self.line_number_by_offset: dict[int, int] = {}
+        char_offset = 0
+
+        last_lb_el = None
+
         for text in self.text_nodes:
             # text here is an lxml smart string, which preserves context
             # in the xml tree and is associated with a parent tag.
@@ -122,29 +187,54 @@ def get_body_text(self) -> str:
                 # OR if text comes immediately after (is_tail) and is whitespace only
                 continue
 
-            body_text_parts.append(text)
+            cleaned_text = re.sub(r"\s*\n\s*", "\n", str(text))
+
+            # Strip leading whitespace from the very first text fragment before
+            # concatenation, so leading newlines are not counted toward `char_offset`
+            # (which would skew the offset-based line number lookups).
+            if not body_text_parts:
+                cleaned_text = cleaned_text.lstrip()
+
+            # check for a line begin tag; it may be the direct parent of this text,
+            # but when an <lb> is immediately followed by inline markup it has no
+            # tail text of its own, so look for the closest preceding <lb> instead
+            preceding_lb = None
+            if parent.tag == TEI_TAG.lb:
+                preceding_lb = parent
+            else:
+                preceding_lb = self.find_preceding_lb(parent)
+
+            if preceding_lb is not None and preceding_lb is not last_lb_el:
+                # store the line number and character offset
+                line_number = preceding_lb.get("n")
+                self.line_number_by_offset[char_offset] = (
+                    int(line_number) if line_number else None
+                )
+                # ensure text separated by <lb/> has a newline:
+                # if there is a preceding text segment and it does not end
+                # with a newline, prepend one to the current text
+                if body_text_parts and not body_text_parts[-1].endswith("\n"):
+                    cleaned_text = f"\n{cleaned_text}"
+
+                # set this element as the last lb handled, so we don't duplicate
+                last_lb_el = preceding_lb
+
+            if not cleaned_text:
+                continue
 
-        # consolidate whitespace once after joining all parts
-        # (i.e., space between indented tags in the XML)
-        return re.sub(r"\s*\n\s*", "\n", "".join(body_text_parts)).strip()
+            body_text_parts.append(cleaned_text)
+            char_offset += len(cleaned_text)
 
-    def get_individual_footnotes(self) -> Generator[str]:
-        """
-        Get individual footnote content as a generator.
-        Yields each footnote's text content individually as a separate string element.
-        Each yielded element corresponds to one complete footnote from the page.
-        """
-        for footnote in self.get_page_footnotes():
-            footnote_text = str(footnote).strip()
-            # consolidate whitespace for footnotes
-            footnote_text = re.sub(r"\s*\n\s*", "\n", footnote_text)
-            yield footnote_text
+        # join text parts and trim trailing whitespace
+        body_text = "".join(body_text_parts).rstrip()
+
+        return body_text
 
     def get_footnote_text(self) -> str:
         """
         Get all footnote content as a single string, with footnotes separated by double newlines.
         """
-        return "\n\n".join(self.get_individual_footnotes())
+        return "\n\n".join(fn.text for fn in self.get_page_footnotes())
 
     def __str__(self) -> str:
         """
@@ -200,6 +290,7 @@ class TEIinput(FileInput):
         *FileInput.field_names,
         "page_number",
         "section_type",
+        "line_number",
     )
     "List of field names for sentences from TEI XML input files"
 
@@ -225,6 +316,7 @@ def get_text(self) -> Generator[dict[str, str]]:
         :returns: Generator with dictionaries of text content, with page number and section_type ("text" or "footnote").
         """
         # yield body text and footnotes content chunked by page with page number
+        start = time()
         for page in self.xml_doc.pages:
             body_text = page.get_body_text()
             if body_text:
@@ -235,10 +327,39 @@ def get_text(self) -> Generator[dict[str, str]]:
                 }
 
             # Yield each footnote individually to enforce separate sentence segmentation
-            # so that separate footnotes cannot be combined into a single sentence by segmentation.
-            for footnote_text in page.get_individual_footnotes():
+            # so that separate footnotes cannot be combined into a single sentence
+            for footnote in page.get_page_footnotes():
                 yield {
-                    "text": footnote_text,
+                    "text": footnote.text,
                     "page_number": page.number,
                     "section_type": SectionType.FOOTNOTE.value,
+                    "line_number": footnote.line_number,
                 }
+
+        elapsed_time = time() - start
+        logger.info(
+            f"Processed {self.file_name} with {len(self.xml_doc.pages)} in {elapsed_time:.1f} seconds"
+        )
+
+    def get_extra_metadata(
+        self, chunk_info: dict[str, Any], char_idx: int, sentence: str
+    ) -> dict[str, Any]:
+        """
+        Calculate extra metadata including line number for a sentence in TEI documents
+        based on the character position within the text chunk (page body or footnote).
+
+        :returns: Dictionary with line_number for the sentence (None if not found)
+        """
+        # If line_number is already in chunk_info (e.g., for footnotes), use it directly
+        if "line_number" in chunk_info:
+            return {"line_number": chunk_info["line_number"]}
+
+        # Otherwise, calculate it for body text based on character position
+        page_number = chunk_info["page_number"]
+        page = next((p for p in self.xml_doc.pages if p.number == page_number), None)
+
+        if page:
+            line_number = page.get_body_text_line_number(char_idx)
+            return {"line_number": line_number}
+
+        return {"line_number": None}
diff --git a/tests/test_sentence/test_corpus/fixtures/sample_tei_with_footnotes.xml b/tests/test_sentence/test_corpus/fixtures/sample_tei_with_footnotes.xml
@@ -80,6 +80,44 @@
                     <label type="footnote">
                         <hi rendition="sup">5</hi>)</label> Erste Anmerkung auf Seite 19.
                 </note>
+
+                <!-- A real example from Das Kapital where <lb/> tag has no line number -->
+                <pb n="20"/>
+                <figure type="facsimile">
+                    <graphic url="facsimiles/sample-caption-page.jpg" mimeType="image/jpeg"/>
+                    <ab type="caption">
+                        <lb/>Erste Umschlagseite der Erstausgabe
+                        <lb n="2"/>Second line with an explicit line number.
+                    </ab>
+                </figure>
+
+                <!-- An example where we have multiple line breaks on the same line -->
+                <pb n="21"/>
+                <p>
+                    <lb n="27"/>Zur Vorbereitung auf die folgenden Abschnitte fasse ich den bisherigen Gang:
+                    <lb n="28"/>Das bisher Entwickelte gilt unter der Voraussetzung, daß im Fortgang der<lb n="29"/>Accumulation <hi rendition="i">das Verhältniß zwischen der Masse der Produktionsmittel und</hi>
+                    <lb n="30"/>der Beschäftigten stets neu austariert werden muß.
+                </p>
+                <p>
+                    <lb n="31"/><hi rendition="i">Schiedensten Proportionen</hi> mit andern Artikeln aus.
+                    <lb n="32"/>Ihr Tauschwerth bleibt unverändert, egal in welchen Kombinationen er erscheint.
+                </p>
+
+                <!-- Mixed inline/tail <lb/> tags example (continuing same page) -->
+                <p>
+                    <lb n="33"/>
+                    <hi rendition="i">Schiedensten Proportionen</hi> mit andern Artikeln aus. Dennoch bleibt sein
+                    <lb n="34"/>Tauschwerth <hi rendition="i">unverändert</hi>, ob in x Stiefelwichse, y Seide, z Gold u. s. w.
+                    <lb n="35"/>ausgedrückt. Er muß also von diesen seinen verschiedenen <hi rendition="i">Ausdrucks-
+                    <lb n="36"/>weisen</hi> unterscheidbar sein.
+                </p>
+                <p>
+                    <lb n="37"/><lb n="38"/>
+                    <lb n="39"/>Leerzeile als separates Beispiel.
+                </p>
+
+                <pb n="22"/>
+                <p>Eine Seite ohne markierte Zeilenumbrüche.</p>
             </text>
         </group>
     </text>