Skip to content

Commit

Permalink
Merge pull request #8 from freud-digital/dev
Browse files Browse the repository at this point in the history
merging class=ff paragraphs with previous paragraph, adapting tests
  • Loading branch information
linxOD authored Jun 20, 2022
2 parents a9f718d + c8c05da commit 5627e75
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 15 deletions.
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ exclude =
build
dist
env
venv
49 changes: 46 additions & 3 deletions freud_api_crawler/fixtures/make_tei.xslt
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,51 @@
####################
-->

<xsl:template match="tei:body">
<!-- Rebuild the body: emit a body element holding exactly one div, and
     place the transformed content of the source tei:body inside it. -->
<body>
<div>
<xsl:apply-templates/>
</div>
</body>
</xsl:template>
<xsl:template match="tei:div">
<!-- Unwrap: no element is emitted for the tei:div itself, only the results
     of processing its children, so the source divs are flattened into
     whatever the parent template outputs. -->
<xsl:apply-templates/>
</xsl:template>
<xsl:template match="tei:div/tei:p[position() = last()]">
    <!-- Last paragraph of a div. If the immediately following sibling div
         contains class='ff' paragraphs, their content is merged into this
         paragraph, preceded by that div's page break(s) and page number, so
         that a paragraph running across two divs ends up as one element.
         Otherwise the paragraph is just copied with its processed content. -->

    <!-- The div right after the current paragraph's parent div; evaluated
         once instead of repeating the path in every expression below. -->
    <xsl:variable name="next-div"
        select="parent::tei:div/following-sibling::tei:div[1]"/>
    <!-- Continuation paragraphs of that div (empty node-set if none). -->
    <xsl:variable name="continuations" select="$next-div/tei:p[@class='ff']"/>

    <!-- xsl:copy copies the element only; attributes of the source paragraph
         are not selected by the bare apply-templates and so are not copied. -->
    <xsl:copy>
        <xsl:apply-templates/>
        <xsl:if test="$continuations">
            <!-- Keep the page boundary inline, at the point where the text
                 crosses over to the next div. -->
            <xsl:copy-of select="$next-div/tei:pb"/>
            <!-- NOTE(review): fw is written even when the next div has no
                 pagenumber span, which yields an empty fw element; confirm
                 whether that is wanted. -->
            <fw type="pageNum">
                <xsl:value-of select="$next-div/tei:p/tei:span[@class='pagenumber']"/>
            </fw>
            <!-- Emit only the processed children of each continuation
                 paragraph, not a p wrapper of its own. -->
            <xsl:for-each select="$continuations">
                <xsl:apply-templates/>
            </xsl:for-each>
        </xsl:if>
    </xsl:copy>
</xsl:template>
<xsl:template match="tei:pb[following-sibling::tei:p[@class='ff']]">
<!-- Intentionally empty: a page break that has a class='ff' paragraph among
     its following siblings produces no output at its original position. The
     template for the last paragraph of a div copies the pb of the following
     div into that paragraph instead. -->

</xsl:template>
<xsl:template match="tei:p[@class='ff']">
<!-- Intentionally empty: class='ff' paragraphs produce no output at their
     original position. Their content is emitted by the template for the last
     paragraph of a div, which iterates over the class='ff' paragraphs of the
     following div. -->

</xsl:template>
<xsl:template match="tei:space">
<!-- Replace each tei:space element with a single no-break space (U+00A0). -->
<xsl:text>&#x00A0;</xsl:text>
</xsl:template>
<xsl:template match="tei:p[./tei:span[@class='pagenumber']]">
<fw type="pageNum"><xsl:value-of select=".//text()"/></fw>
<xsl:choose>
<xsl:when test="following-sibling::tei:p[@class='ff']">

</xsl:when>
<xsl:otherwise>
<fw type="pageNum"><xsl:value-of select=".//text()"/></fw>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<xsl:template match="tei:p[@class='marginalie_place']">
<p rendition="#marginalie_place"><xsl:apply-templates/></p>
Expand All @@ -35,9 +78,9 @@
<xsl:template match="tei:p[@class='footnote footnote-ff']">
<!-- A paragraph whose class attribute is exactly 'footnote footnote-ff'
     becomes a note of type "footnote" flagged prev="true", wrapping the
     processed content of the paragraph.
     NOTE(review): prev="true" presumably marks a footnote continued from the
     previous page; confirm against the consumer of this TEI. -->
<note type="footnote" prev="true"><xsl:apply-templates/></note>
</xsl:template>
<xsl:template match="tei:p[@class='ff']">
<!-- <xsl:template match="tei:p[@class='ff']">
<p prev="true"><xsl:apply-templates/></p>
</xsl:template>
</xsl:template> -->

<!--
####################
Expand Down
6 changes: 1 addition & 5 deletions freud_api_crawler/freud_api_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,11 +426,7 @@ def make_xml(self, save=False, limit=True):
page_json = self.get_page(x['id'])
pp = self.process_page(page_json)
div = ET.fromstring(pp['body'])
pb_el = make_pb(
pp['page_nr'],
f"{FRD_BASE}{pp['faks__payload']}",
pp['faks__id']
)
pb_el = make_pb(pp)
cur_div = div.xpath('//tei:div', namespaces=self.nsmap)[0]
cur_div.insert(0, pb_el)
body.append(div)
Expand Down
3 changes: 2 additions & 1 deletion freud_api_crawler/string_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
('\n', ''),
('-<br />', '<lb break="no"/>'),
('<br />', '<lb />\n'),
('</p>', '</p>\n'),
('</p>', '<lb break="paragraph"/></p>\n'),
('-<lb break="paragraph"/></p>', '<lb break="no"/></p>'),
('‚', ','),
('ı', 'i')
]
Expand Down
7 changes: 3 additions & 4 deletions freud_api_crawler/tei_utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import lxml.etree as ET


def make_pb(n, faks_url, faks_id):
def make_pb(json):
""" returns a tei:pb
"""
pb_el = ET.Element("{http://www.tei-c.org/ns/1.0}pb")
pb_el.attrib['n'] = f"{n}"
pb_el.attrib['facs'] = f"{faks_url}"
pb_el.attrib['n'] = f"{json['page_nr']}"
pb_el.attrib[
"{http://www.w3.org/XML/1998/namespace}id"
] = f"faks__{faks_id}"
] = f"page__{json['id']}"

return pb_el
7 changes: 5 additions & 2 deletions tests/test_freud_api_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,10 +205,13 @@ def tearDown(self):
def test_001_make_pg(self):
""" Test make_pb"""
pb_el = tei_utils.make_pb(
1, 'https://whatever.com', "1234sieben"
{
"page_nr": 1,
"id": "xyz"
}
)
pb_str = ET.tostring(pb_el).decode('utf-8')
self.assertEqual(
pb_str,
'<ns0:pb xmlns:ns0="http://www.tei-c.org/ns/1.0" n="1" facs="https://whatever.com" xml:id="faks__1234sieben"/>' # noqa: E501
'<ns0:pb xmlns:ns0="http://www.tei-c.org/ns/1.0" n="1" xml:id="page__xyz"/>' # noqa: E501
)

0 comments on commit 5627e75

Please sign in to comment.