Skip to content

Commit

Permalink
Add inflections
Browse files Browse the repository at this point in the history
  • Loading branch information
djuretic committed Jun 26, 2021
1 parent 8f27c1f commit d191966
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 3 deletions.
46 changes: 46 additions & 0 deletions add_inflections.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import sys
from bs4 import BeautifulSoup

# from https://github.com/coljac/kindle_eo_eng/blob/master/src/inflect.py
def inflect(key):
    """Return the inflected forms of a single-word Esperanto headword.

    The final letter of ``key`` selects which endings are substituted for it:
    "o" and "a" get the plural/accusative endings, "e" gets "en", and "i"
    gets the verb endings plus participle forms. Headwords that contain a
    space, or that end in any other letter, yield an empty list.

    Args:
        key: The headword to inflect.

    Returns:
        A list of inflected forms, in a fixed order; empty when the headword
        is not inflected.
    """
    # Multi-word headwords (e.g. "blankkola maraglo") are not inflected.
    if " " in key:
        return []

    # Final letter -> endings that replace it.
    endings_by_final_letter = {
        "o": ("on", "oj", "ojn"),
        "a": ("aj", "an", "ajn"),
        "e": ("en",),
        "i": (
            "as", "os", "is", "us", "u",
            "ita", "ata", "ota",
            "inta", "anta", "onta",
            "intan", "antan", "ontan",
            "intaj", "antaj", "ontaj",
            "intajn", "antajn", "ontajn",
        ),
    }

    stem, final_letter = key[:-1], key[-1:]
    return [stem + ending for ending in endings_by_final_letter.get(final_letter, ())]

def main(path: str, output_path: str):
    """Add inflection markup to every headword of a dictionary HTML file.

    Parses the file at ``path`` and, for each ``idx:orth`` tag, normalizes its
    ``value`` attribute (surrounding whitespace and commas removed) and
    appends an ``idx:infl`` child containing one ``idx:iform`` tag per form
    returned by ``inflect``. Tags whose headword has no inflections are only
    normalized. The resulting document is written to ``output_path``.

    Args:
        path: HTML file to read.
        output_path: File to write the result to (overwritten if it exists).
    """
    # Use an explicit encoding rather than the platform default so that
    # Esperanto letters such as "ĉ" or "ŭ" survive on non-UTF-8 locales.
    # NOTE(review): assumes the input files are UTF-8 -- confirm against
    # whatever produces them.
    with open(path, encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    for tag in soup.find_all('idx:orth'):
        value = tag.get('value')
        if value is None:
            # Nothing to normalize or inflect without a headword.
            continue

        # The final strip() removes whitespace exposed by dropping the commas
        # (e.g. "hundo ,"), which inflect() would otherwise mistake for a
        # multi-word headword.
        headword = value.strip().strip(',').strip()
        tag['value'] = headword

        inflections = inflect(headword)
        if not inflections:
            continue

        inflection_tag = soup.new_tag("idx:infl")
        for inflection in inflections:
            inflection_tag.append(
                soup.new_tag("idx:iform", attrs={'value': inflection})
            )
        tag.append(inflection_tag)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(str(soup))

if __name__ == '__main__':
    # Exit with a usage message instead of an IndexError traceback when the
    # input or output path is missing. Extra arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} INPUT_HTML OUTPUT_HTML")
    main(sys.argv[1], sys.argv[2])
5 changes: 2 additions & 3 deletions generate_opf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ def main():
lines = f.readlines()

html_dir = Path('./output')
html_files = list(html_dir.glob("content*.html"))
html_files = list(html_dir.glob("inflected_content*.html"))
html_files.sort()

n = 0
Expand All @@ -14,8 +14,7 @@ def main():
for line in lines:
if "<item id" in line:
for n, html in enumerate(html_files):
# TODO test media-type="application/xhtml+xml"
f.write(f""" <item id="dictionary{n}" href="{html.name}" media-type="text/x-oeb1-document"/>\n""")
f.write(f""" <item id="dictionary{n}" href="{html.name}" media-type="application/xhtml+xml"/>\n""")
elif "<itemref" in line:
# the previous loop will set "n"
for n in range(n + 1):
Expand Down
5 changes: 5 additions & 0 deletions process.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
#!/usr/bin/env bash
# Build pipeline: transform the XML sources into one HTML file, run
# split_html.py, add inflections to each output/content*.html file, then run
# generate_opf.py (which picks up the inflected_content*.html files).
xsltproc --path voko-grundo/dtd:revo-fonto/cfg xsl/revohtml.xsl revo-fonto/revo/*.xml > output/vortaro.html
python split_html.py
for HTML in output/content*.html
do
    # When the glob matches nothing, bash leaves the pattern unexpanded;
    # skip it rather than passing a non-existent path to Python.
    [ -e "$HTML" ] || continue
    # Prefix only the file name, so a "content" elsewhere in the path is
    # never rewritten.
    HTML_INFLECTED="$(dirname "$HTML")/inflected_$(basename "$HTML")"
    python add_inflections.py "$HTML" "$HTML_INFLECTED"
done
python generate_opf.py

0 comments on commit d191966

Please sign in to comment.