-
Notifications
You must be signed in to change notification settings - Fork 1
/
ungreek.py
80 lines (70 loc) · 2.09 KB
/
ungreek.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import sys
import pathlib
# Maps each lowercase Greek letter (a single character) to its spelled-out
# name. Only lowercase forms are listed because _ungreek() lowercases the
# text before substituting.
greek_letters = {
    "α": "alpha",
    "β": "beta",
    "γ": "gamma",
    "δ": "delta",
    "ε": "epsilon",
    "ζ": "zeta",
    "η": "eta",
    "θ": "theta",
    "ι": "iota",
    "κ": "kappa",
    "λ": "lambda",
    "μ": "mu",
    "ν": "nu",
    "ξ": "xi",
    "ο": "omicron",
    "π": "pi",
    "ρ": "rho",
    "σ": "sigma",
    # str.lower() maps a word-final capital sigma to the final form "ς",
    # so it must be listed too or it would survive the conversion.
    "ς": "sigma",
    "τ": "tau",
    "υ": "upsilon",
    "φ": "phi",
    "χ": "chi",
    "ψ": "psi",
    "ω": "omega",
    # MICRO SIGN (U+00B5): renders like mu but is a distinct code point
    # that lowercasing leaves unchanged.
    "\u00b5": "mu",
}
def ungreek(in_file, out_dir):
    """Convert one file and store the result under the same name in out_dir.

    The input is read as UTF-8 and passed through ``_ungreek``. If the
    converted text is empty, no output file is created.

    Args:
        in_file: path of the file to read; its ``name`` attribute is used
            to build the output path.
        out_dir: directory path object; joined with the file name via ``/``.
    """
    with open(in_file, encoding="utf8") as source:
        converted = _ungreek(source.read())

    # Nothing to write for an empty result.
    if not converted:
        return

    target = out_dir / in_file.name
    with open(target, "w", encoding="utf8") as sink:
        sink.write(converted)
def _ungreek(contents):
    """Replace Greek letters with their names (lowercases the whole text)."""
    lowered = contents.lower()
    # Every key of greek_letters is a single character and no replacement
    # name contains a Greek letter, so one translate() pass gives the same
    # result as replacing the letters one after another.
    return lowered.translate(str.maketrans(greek_letters))
def _clean_html(contents):
    """Remove a fixed set of HTML tags and decode a few named entities.

    Steps, applied in this order:
      1. Named entities for the Greek letter names in ``greek_letters``
         (ampersand + name + semicolon) become the bare name.
      2. The attribute-less tags sub, sup, i and a (opening and closing)
         are removed. Tags carrying attributes are left untouched.
      3. The named entities gt, lt, amp, mgr and uuml are decoded.

    Args:
        contents: text to clean.

    Returns:
        The cleaned text.
    """
    for name in greek_letters.values():
        contents = contents.replace("&" + name + ";", name)

    for tag in ("<sub>", "</sub>", "<sup>", "</sup>", "<i>", "</i>", "<a>", "</a>"):
        contents = contents.replace(tag, "")

    # The previous code replaced ">", "<", "&" and "ü" with themselves,
    # which did nothing; the entity forms are what has to be matched.
    # "amp" is decoded after "gt"/"lt" so that an escaped entity name is
    # turned into literal entity text rather than being decoded twice.
    entity_text = (
        ("gt", ">"),
        ("lt", "<"),
        ("amp", "&"),
        ("mgr", "mu"),
        ("uuml", "ü"),
    )
    for entity, text in entity_text:
        contents = contents.replace("&" + entity + ";", text)
    return contents
if __name__ == "__main__":
    # Batch mode: convert every regular file directly inside the input
    # directory (argv[1]) and write the results into the output directory
    # (argv[2]).
    if len(sys.argv) < 3:
        # sys.exit() instead of the site-provided exit() helper, which is
        # not guaranteed to exist (e.g. when Python runs with -S).
        sys.exit("Modo de uso: {} dir_entrada dir_salida".format(sys.argv[0]))

    out_dir = pathlib.Path(sys.argv[2])
    # Creates missing parent directories and tolerates an existing
    # directory, without a separate check-then-create step.
    out_dir.mkdir(parents=True, exist_ok=True)

    for p in pathlib.Path(sys.argv[1]).iterdir():
        # Sub-directories and other non-file entries are skipped.
        if p.is_file():
            ungreek(p, out_dir)
# NOTE(review): the guard compares against "__main__2", which is not the
# name a script runs under, so this single-file mode does not execute as
# written. Presumably a deliberately disabled alternative entry point —
# confirm before removing or re-enabling.
if __name__ == "__main__2":
    # Single-file mode: convert argv[1], clean the HTML remnants and print
    # the result to standard output.
    if len(sys.argv) < 2:
        sys.exit("Modo de uso: {} in_file".format(sys.argv[0]))

    # Read as UTF-8 explicitly, like ungreek() does, instead of relying on
    # the platform default encoding.
    with open(sys.argv[1], encoding="utf8") as f:
        c = f.read()

    c = _ungreek(c)
    c = _clean_html(c)
    print(c)