-
Notifications
You must be signed in to change notification settings - Fork 1
/
chars.py
87 lines (75 loc) · 3.04 KB
/
chars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# -*- encoding: utf-8 -*-
import re
# Common character sets.
#
# Each constant is a unicode string listing the characters of one set; the
# composite sets further down are plain concatenations, so character order
# (and any character that occurs in more than one component) is preserved
# exactly as written here.
digits = u"0123456789"
letters = u"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
# ASCII punctuation (same characters, same order as string.punctuation).
# Written as an ordinary u"" literal with explicit escapes: the former
# ur"""...""" prefix is a SyntaxError on Python 3, while this spelling
# yields the identical value on both Python 2 and Python 3.
symbols = u"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
# NOTE: this name shadows the Python 3 builtin ascii() inside this module;
# it is kept unchanged because it is part of the module's public names.
ascii = digits + letters + symbols
# Extra typographic and currency symbols outside ASCII.
xsymbols = u"""€¢£»«›‹÷©®†‡°∙•◦‣¶§÷¡¿▪▫"""
# Language-specific additions to the ASCII letters.
german = u"ÄäÖöÜüß"
french = u"ÀàÂâÆæÇçÉéÈèÊêËëÎîÏïÔôŒœÙùÛûÜüŸÿ"
turkish = u"ĞğŞşıſ"
greek = u"ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω"
portuguese = u"ÁÃÌÍÒÓÕÚáãìíòóõú"
# The first character of this string is a plain space.
telugu = u" ఁంఃఅఆఇఈఉఊఋఌఎఏఐఒఓఔకఖగఘఙచఛజఝఞటఠడఢణతథదధనపఫబభమయరఱలళవశషసహఽాిీుూృౄెేైొోౌ్ౘౙౠౡౢౣ౦౧౨౩౪౫౬౭౮౯"
# Composite sets.
default = ascii + xsymbols + german + french + portuguese
european = default + turkish + greek
# List of (regular expression, replacement) pairs for normalizing Unicode
# text by folding common homographs into one canonical spelling. This is
# mostly used for training text.
#
# The first element of each pair is a regular expression (several entries
# are character classes); the second is the text substituted for a match.
# Presumably the pairs are applied one after another, in list order, by the
# caller -- verify against the code that consumes this list.
#
# Note that the replacement of pretty much all quotes with ASCII straight
# quotes and commas requires some postprocessing to figure out which of
# those symbols represent typographic quotes. See `requote`.
#
# TODO: We may want to try to preserve more shape; unfortunately, there are
# lots of inconsistencies between fonts. Generally, there seems to be left
# vs right leaning, and top-heavy vs bottom-heavy.
replacements = [
    (u'[_~#]', u"~"),  # OCR control characters
    (u'"', u"''"),  # typewriter double quote
    (u"`", u"'"),  # grave accent
    # Left and right double quotation marks. The left mark also serves as
    # the German closing quote, so no separate rule is needed for it.
    (u'[“”]', u"''"),
    (u"´", u"'"),  # acute accent
    (u"[‘’]", u"'"),  # left and right single quotation marks
    (u"„", u",,"),  # German opening (low) double quote
    (u"…", u"..."),  # ellipsis
    (u"′", u"'"),  # prime
    (u"″", u"''"),  # double prime
    (u"‴", u"'''"),  # triple prime
    (u"〃", u"''"),  # ditto mark
    (u"µ", u"μ"),  # replace micro unit with greek character
    (u"[–—]", u"-"),  # variant length hyphens
    # Expand Unicode ligatures. The patterns are written as \uXXXX escapes
    # on purpose: a literal ligature is easily folded to plain ASCII letters
    # by text tooling, which silently turns the rule into a no-op such as
    # "fl" -> "fl".
    (u"\ufb02", u"fl"),  # LATIN SMALL LIGATURE FL
    (u"\ufb01", u"fi"),  # LATIN SMALL LIGATURE FI
    (u"\ufb00", u"ff"),  # LATIN SMALL LIGATURE FF
    (u"\ufb03", u"ffi"),  # LATIN SMALL LIGATURE FFI
    (u"\ufb04", u"ffl"),  # LATIN SMALL LIGATURE FFL
]
def requote(s):
    """Turn every pair of ASCII apostrophes in *s* into one double quote.

    The argument is first converted to text (``unicode`` on Python 2,
    ``str`` on Python 3), so non-string values are accepted as well. Each
    non-overlapping occurrence of ``''`` -- scanning left to right -- is
    then replaced by a single ``"`` character.

    Returns the converted text.
    """
    try:
        text_type = unicode  # Python 2 text type
    except NameError:
        text_type = str  # Python 3: ``str`` is already unicode text
    # The pattern is a fixed two-character string, so a plain substring
    # replacement gives the same result as the regular expression did.
    return text_type(s).replace(u"''", u'"')
def requote_fancy(s, germanic=0):
    """Replace ASCII straight quotes in *s* with typographic quote marks.

    The argument is first converted to text (``unicode`` on Python 2,
    ``str`` on Python 3). A quote is classified by the whitespace next to
    it:

    * ``''`` or ``'`` preceded by whitespace becomes a left (opening) mark;
    * ``''`` or ``'`` followed by whitespace becomes a right (closing) mark;
    * ``,,`` or ``,`` preceded by whitespace becomes the low double/single
      quotation mark.

    Quotes with no adjacent whitespace (for example at the very start or
    end of the text, or an apostrophe inside a word) are left untouched.

    The whitespace used for the classification is kept in the result.
    Earlier versions of this function replaced it together with the quote,
    which glued the neighbouring words onto the quotation mark.

    Parameters:
        s: the text to convert (any value accepted by the text type).
        germanic: if true, the left and right shapes are swapped, as the
            germanic quoting style reverses them.

    Returns the converted text.
    """
    try:
        text_type = unicode  # Python 2 text type
    except NameError:
        text_type = str  # Python 3: ``str`` is already unicode text
    s = text_type(s)

    left_double, right_double = u"\u201c", u"\u201d"
    left_single, right_single = u"\u2018", u"\u2019"
    low_double, low_single = u"\u201e", u"\u201a"

    if germanic:
        # germanic quoting style reverses the shapes
        after_space_double, before_space_double = right_double, left_double
        after_space_single, before_space_single = right_single, left_single
    else:
        after_space_double, before_space_double = left_double, right_double
        after_space_single, before_space_single = left_single, right_single

    # Order matters: the two-character forms ('' and ,,) have to be consumed
    # before the single-character rules, which would otherwise split them.
    # Group 1 captures the neighbouring whitespace so it can be re-emitted.
    rules = (
        (u"(\\s+)''", u"\\1" + after_space_double),
        (u"''(\\s+)", before_space_double + u"\\1"),
        (u"(\\s+),,", u"\\1" + low_double),
        (u"(\\s+)'", u"\\1" + after_space_single),
        (u"'(\\s+)", before_space_single + u"\\1"),
        (u"(\\s+),", u"\\1" + low_single),
    )
    for pattern, replacement in rules:
        s = re.sub(pattern, replacement, s)
    return s