-
Notifications
You must be signed in to change notification settings - Fork 4
/
acroclass.py
executable file
·380 lines (362 loc) · 11.2 KB
/
acroclass.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
import re
class core(object):
    """Wrap all-caps terms found in text with informative ``<abbr>`` tags.

    Term expansions are loaded from a comma-separated definition file
    (one ``key,alternate,expansion`` record per line, ``#`` starts a
    comment line).  Records whose key is ``*`` define electronic
    component designators: ``alternate`` is a prefix pattern handed to
    ``re.match`` and ``expansion`` is the component name, so that e.g.
    ``R12`` can become "Resistor 12".  Several meanings of one term are
    separated by ``|`` in ``expansion``.

    The code keeps the Python 2.7 behaviour it was written for and also
    runs on Python 3, where ``str`` is the one and only text type.
    """

    # Project Info:
    # =============
    # Written by: fyngyrz - codes with magnetic needle
    # Incep date: November 24th, 2018
    # Last Update: January 28th, 2019 (this code file only)
    # Environment: Python 2.7
    # Source Files: acroclass.py, acrobase.txt
    # Tab Spacing: Set to 4 for sane readability of Python source
    # Security: Suitable for benign users only (IOW, me.)
    # Purpose: Creates informative <abbr> tag wraps around
    #          all-caps terms in the source text. Written
    #          to support use of <abbr> on soylentnews.org
    # License: None. Use as you will. PD, free, etc.
    # Dependencies: standard Python re import library
    # ----------------------------------------------------------

    # Native text type: ``unicode`` on Python 2, ``str`` on Python 3.
    _text_type = type(u'')

    # A decimal numeric character reference such as ``&#233;``.
    _ENTITY = re.compile(u'&#([0-9]+);')

    def version_set(self):
        """Return the version string of this class."""
        return '0.0.7 Beta'

    def __init__(self, detectterms=True,    # disable class.makeacros() = False
                 numberterms=False,          # disable detecting terms incorporating numbers
                 detectcomps=True,           # detect electronic components
                 iglist=None,                # terms to ignore
                 acrofile='acrobase.txt',    # file to load term expansions from
                 editor=False,               # use editor's marks
                 inquotes=False,             # use editor's marks only within blockquote spans
                 astyle='',                  # style content for <abbr> tags
                 edpre='',                   # editor prefix
                 edpost=''):                 # editor postfix
        """Configure the converter and load the term definitions.

        ``iglist`` defaults to ``None`` (no ignored terms) rather than a
        shared mutable list; passing a list works exactly as before.
        Problems met while loading ``acrofile`` are appended to
        ``self.errors`` instead of being raised.
        """
        self.version = self.version_set()
        self.detectterms = detectterms
        self.numberterms = numberterms
        self.detectcomps = detectcomps
        self.acrofile = acrofile
        self.setstyle(astyle)
        self.igdict = {}    # upper-cased ignored term -> True
        self.undict = {}    # terms seen in text that had no expansion -> 1
        self.editor = editor
        self.inquotes = inquotes
        self.inspan = 0     # current <blockquote> nesting depth
        self.edpre = edpre
        self.edpost = edpost
        self.acros = {}     # term -> expansion text
        self.rmlist = []    # component names, parallel to relist
        self.relist = []    # component prefix patterns, parallel to rmlist
        self.errors = u''   # note that errors are unicode strings!
        self.setacros(acrofile)
        self.geniglist(iglist)

    def setstyle(self, astyle):
        """Store ``astyle`` as a ready-to-insert `` style="..."`` attribute.

        An empty ``astyle`` yields an empty attribute string.
        """
        if astyle == '':
            self.astyle = astyle
        else:
            self.astyle = ' style="%s"' % (astyle)

    # Generate ignore list, remove items from main list
    # -------------------------------------------------
    def geniglist(self, iglist):
        """Register every entry of ``iglist`` as a term to be ignored.

        Entries are upper-cased, recorded in ``self.igdict`` and removed
        from ``self.acros`` when present.  ``None`` means "nothing".
        """
        for el in (iglist or ()):
            el = str(el).upper()
            self.igdict[el] = True
            self.acros.pop(el, None)

    # Convert a unicode string to an ASCII string, replacing any
    # characters > 127 with the appropriate character entity.
    # That in turn makes the text compatible with the macro
    # processor, as character entities are 100% ASCII.
    # ----------------------------------------------------------
    def makeascii(self, text):
        """Return ``text`` as a native ASCII string with ``&#N;`` entities."""
        pieces = []
        for ch in text:
            code = ord(ch)
            if code < 128:
                pieces.append(str(ch))
            else:
                pieces.append('&#{:d};'.format(code))
        return ''.join(pieces)

    # Convert HTML character entities into unicode
    # --------------------------------------------
    def subents(self, text):
        """Replace decimal ``&#N;`` entities in ``text`` by their characters.

        Anything that is not a complete, valid decimal entity (a lone
        ``&``, ``&#12x``, ``&#;``, a truncated entity at the end of the
        text, a code point beyond U+10FFFF) is passed through verbatim.
        """
        def entity_char(match):
            try:
                code = int(match.group(1))
            except ValueError:      # absurdly long digit run
                return match.group(0)
            if code > 0x10FFFF:     # not a Unicode code point
                return match.group(0)
            # The unicode-escape round trip works on Python 2 (including
            # narrow builds, where unichr() would fail) and on Python 3.
            return ('\\U%08x' % code).encode('ascii').decode('unicode-escape')

        # u'' + ... keeps the result unicode on Python 2 even when an
        # ASCII byte string was passed in.
        return u'' + self._ENTITY.sub(entity_char, text)

    # Read term expansion file into memory
    # ------------------------------------
    def setacros(self, acrofile):
        """Load ``acrofile`` into ``self.acrobase`` and rebuild the dictionary.

        On any failure ``self.acrobase`` becomes empty, a message is
        appended to ``self.errors`` and the dictionary is left untouched.
        """
        try:
            with open(acrofile) as fh:
                data = fh.read()
            if isinstance(data, bytes):     # Python 2 hands back bytes
                data = data.decode('utf-8')
        except Exception as e:              # best effort: record, never raise
            self.acrobase = u''
            self.errors += u'failed to read file'+str(e)+u'\n'
        else:
            # can't have quotes in abbr tags
            self.acrobase = data.replace(u'"', u'&quot;')
            self.makedict()

    # Test string for integer representation
    # --------------------------------------
    def chkint(self, text):
        """Return True when ``int(text)`` succeeds, False otherwise."""
        try:
            int(text)
        except (TypeError, ValueError):
            return False
        return True

    # Create dictionary from the acronym / abbreviation file contents
    # ---------------------------------------------------------------
    def makedict(self):
        """Parse ``self.acrobase`` into ``acros``, ``relist`` and ``rmlist``.

        All three containers are rebuilt from scratch so that loading a
        file twice does not duplicate component designators.  Malformed
        lines, duplicate keys and expansions containing ``<`` or ``>``
        are reported through ``self.errors``.
        """
        self.acros = {}
        self.rmlist = []
        self.relist = []
        for lineno, line in enumerate(self.acrobase.split(u'\n'), 1):
            if len(line) == 0 or line[0:1] == u'#':
                continue
            try:
                key, alternate, expansion = line.split(u',', 2)
            except ValueError as e:         # fewer than three fields
                self.errors += u'line %d: "%s"\n%s\n' % (lineno, line, str(e))
                continue
            if u'<' in expansion or u'>' in expansion:
                self.errors += u'< or > found in ACRO: ' + self._text_type(key) + u'\n'
                continue
            if key == u'*':                 # component designator
                if self.detectcomps:
                    self.rmlist.append(expansion)
                    self.relist.append(alternate)
                continue
            if not self.numberterms and self.chkint(key):
                continue                    # purely numeric keys are disabled
            if key in self.acros:
                self.errors += u'Duplicate ACRO key: ' + self._text_type(key) + u'\n'
            meanings = expansion.split(u'|')
            if len(meanings) == 1:
                self.acros[key] = expansion
            else:
                # several meanings: "(1): first (2): second ..."
                self.acros[key] = u' '.join(
                    u'(%d): %s' % (n, meaning)
                    for n, meaning in enumerate(sorted(meanings), 1))

    def _edmarks(self):
        """Return the ``(prefix, postfix)`` editor marks that apply right now.

        Marks are empty unless ``editor`` is set; with ``inquotes`` set
        they are additionally suppressed outside ``<blockquote>`` spans.
        """
        if not self.editor:
            return u'', u''
        if self.inquotes and self.inspan == 0:
            return u'', u''
        return self.edpre, self.edpost

    # Match term against component encodings
    # --------------------------------------
    def compmatch(self, term):
        """Return ``<abbr>`` markup for a component designator such as ``R12``.

        ``term`` is returned unchanged when component detection is off,
        when the term is ignored or fully numeric, or when no component
        prefix followed by an integer matches it.
        """
        if not self.detectcomps:
            return term
        if self.igdict.get(term, False):
            return term
        if self.isnumeric(term):
            return term
        for prefix, comp in zip(self.relist, self.rmlist):
            match = re.match(prefix, term)
            if match is None:
                continue
            try:
                # what follows the prefix must be the component number
                number = int(term[match.end():])
            except ValueError:              # not a number, try the next one
                continue
            names = comp.split('|')
            if len(names) == 1:
                title = u'%s %d' % (comp, number)
            else:                           # multiple elements
                title = u' '.join(
                    u'(%d): %s %d' % (n, name, number)
                    for n, name in enumerate(sorted(names), 1))
            smark, emark = self._edmarks()
            return u'<abbr%s title="%s%s%s">%s</abbr>' % (
                self.astyle, smark, title, emark, term)
        return term

    # Explicit match against numerals 0...9
    # -------------------------------------
    def isnumeric(self, text):
        """Return True when every character of ``text`` is one of 0-9.

        An empty ``text`` counts as numeric.
        """
        return all(u'0' <= c <= u'9' for c in text)

    # Conversion including translation to unicode
    # -------------------------------------------
    def a2u(self, text):    # ASCII in
        """Tag an ASCII native string; return unicode markup.

        A non-string argument is reported in ``self.errors`` and yields
        an empty unicode string.
        """
        if not isinstance(text, str):
            self.errors += u'class function a2u() requires ASCII input\n'
            return u''
        return self.makeacros(self._text_type(text))

    def a2a(self, text):    # ASCII in
        """Tag an ASCII native string; return entity-encoded ASCII markup.

        A non-string argument is reported in ``self.errors`` and yields
        an empty unicode string.
        """
        if not isinstance(text, str):
            self.errors += u'class function a2a() requires ASCII input\n'
            return u''
        return self.makeascii(self.makeacros(self._text_type(text)))

    # Conversion including translation from unicode
    # ---------------------------------------------
    def u2a(self, text):    # unicode in
        """Tag a unicode string; return entity-encoded ASCII markup.

        A non-unicode argument is reported in ``self.errors`` and yields
        an empty string.
        """
        if not isinstance(text, self._text_type):
            self.errors += u'class function u2a() requires unicode input\n'
            return ''
        return self.makeascii(self.makeacros(text))

    def cleanbraces(self, text):
        """Return ``text`` with ``<`` and ``>`` replaced by HTML entities."""
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')
        return text

    def makeacros(self, text):  # for compatibility only
        """Alias of ``u2u`` kept for older callers."""
        return self.u2u(text)

    def _wrapterm(self, term):
        """Return the replacement text for one accumulated candidate term.

        Candidates shorter than two characters are returned unchanged.
        A known term becomes ``<abbr>`` markup, a component designator
        is delegated to ``compmatch``, and an unknown, non-ignored,
        non-numeric term is recorded in ``self.undict``.
        """
        if len(term) < 2:
            return term
        expansion = self.acros.get(term, term)
        if expansion != term:               # we found it
            smark, emark = self._edmarks()
            return u'<abbr%s title="%s%s%s">%s</abbr>' % (
                self.astyle, smark, expansion, emark, term)
        if self.isnumeric(term):
            return term
        wrapped = self.compmatch(term)
        if wrapped == term and term not in self.igdict:
            self.undict[term] = 1           # we don't know this one
        return wrapped

    # Convert all instances of TERM to <abbr title="expansion">TERM</abbr>
    # where TERM is capAlpha or some combination of capAlpha and Numeric
    # This is unicode in, unicode out
    # --------------------------------------------------------------------
    def u2u(self, text):
        """Return ``text`` with every known term wrapped in ``<abbr>`` tags.

        Runs of ``A-Z`` / ``0-9`` are candidate terms.  Nothing inside an
        HTML tag or between ``<abbr>`` and ``</abbr>`` is touched.
        ``<blockquote>`` nesting is tracked in ``self.inspan``, which is
        instance state and therefore carries over between calls.

        With ``detectterms`` off the text is returned untouched; a
        non-unicode argument is reported in ``self.errors`` and yields
        an empty string.
        """
        if not self.detectterms:
            return text
        if not isinstance(text, self._text_type):
            self.errors += u'class function makeacros() requires unicode input\n'
            return ''
        out = []
        accum = u''         # candidate term collected so far
        ctag = u''          # lower-cased start of the current tag, for abbr
        btag = u''          # same, for blockquote
        intag = False       # between '<' and '>'
        inabbr = False      # between <abbr ...> and </abbr>
        for c in text:
            if c == u'<':
                intag = True
                ctag = u''
                btag = u''
            elif c == u'>':
                intag = False
            low = c.lower()
            # Only the leading characters are ever compared, so there is
            # no need to let the detectors grow with the whole text.
            if len(ctag) < 6:
                ctag += low
            if len(btag) < 12:
                btag += low
            if btag[:11] == u'<blockquote':
                self.inspan += 1
                btag = u''
            elif btag[:12] == u'</blockquote':
                self.inspan = max(self.inspan - 1, 0)
                btag = u''
            if ctag[:5] == u'<abbr':
                inabbr = True
                ctag = u''
            elif ctag[:6] == u'</abbr':
                inabbr = False
                ctag = u''
            is_termchar = (u'A' <= c <= u'Z') or (u'0' <= c <= u'9')
            if is_termchar and not intag and not inabbr:
                accum += c
            else:           # candidate (if any) ends here
                out.append(self._wrapterm(accum))
                out.append(c)
                accum = u''
        out.append(self._wrapterm(accum))   # any pending on end of post
        return u''.join(out)