forked from sanyaade-machine-learning/Transana
-
Notifications
You must be signed in to change notification settings - Fork 0
/
PyRTFParser.py
2667 lines (2350 loc) · 135 KB
/
PyRTFParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -*- coding: utf8 -*-
# Copyright (C) 2009-2015 The Board of Regents of the University of Wisconsin System
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
""" An XML - RTF export / import Parser for the wxPython RichTextCtrl """
__author__ = 'David Woods <dwoods@wcer.wisc.edu>'
# Based on work by Donald N. Allingham and Gary Shao in 2000 and 2002 respectively
# Thanks to Tim Morton for help with output optimization
# Module-level switches.  All of the debugging switches are off by default.
# DEBUG also controls whether the "time" module is imported below and whether
# XMLToRTFHandler.startElement() reports XML attributes it does not recognize.
DEBUG = False # Shows debug messages
DEBUG2 = False # Shows unknown control words, not those explicitly ignored
DEBUG3 = False # SHOW CHARACTERS ONLY
# Transana is my program, and has some special requirements. These can be skipped using this GLOBAL.
# (In the code visible here, it enables the tracking of Transana Time Codes in XMLToRTFHandler.)
IN_TRANSANA = True
# Import wxPython and the wxPython wxRichTextCtrl
import wx
import wx.richtext as richtext
# import Python's cStringIO, os, and string modules
import cStringIO, os, string
# import Python's XML Sax handler
import xml.sax.handler
if DEBUG:
import time
class PyRichTextRTFHandler(richtext.RichTextFileHandler):
    """ A RichTextFileHandler that can handle Rich Text Format files,
        at least to the extent that Transana needs Rich Text Format.
        by David K. Woods (dwoods@wcer.wisc.edu)

        Loading is delegated to RTFTowxRichTextCtrlParser and saving to XMLToRTFHandler.
        The stream-based DoLoadFile() / DoSaveFile() entry points are not supported and
        always report failure; use LoadFile(), LoadString(), and SaveFile() instead. """

    def __init__(self, name='RTF', ext='rtf'):
        """ Initialize the RichTextRTF Handler.
            Parameters:  name='RTF'   the handler name
                         ext='rtf'    the file extension (without the leading period) """
        # NOTE(review): the base class initializer is not called here -- presumably on purpose; confirm.
        # Save the Handler Name
        self._name = name
        # Save the Handler Extension
        self._ext = ext

    def CanHandle(self, filename):
        """ Can this File Handler handle a particular file?
            The file's extension is compared to the handler's extension, ignoring case. """
        # Lower-case BOTH sides of the comparison.  (Lower-casing only the file's extension would
        # prevent a handler created with an upper-case extension, e.g. ext='RTF', from ever matching.)
        return os.path.splitext(filename)[1].lower() == ('.' + self._ext).lower()

    def CanLoad(self):
        """ Can you load an RTF File with this handler? """
        return True

    def CanSave(self):
        """ Can you save an RTF File with this handler? """
        return True

    def DoLoadFile(self, buf, stream):
        """ Stream-based loading is not implemented.  Always returns False. """
        return False

    def DoSaveFile(self, buf, stream):
        """ Stream-based saving is not implemented.  Always returns False. """
        return False

    def GetEncoding(self):
        """ Get the encoding set for this handler """
        # NOTE:  I've only tried UTF8 encoding, which is currently hard-coded into the load and save classes.
        return 'utf8'

    def GetExtension(self):
        """ Get the handler file extension """
        return self._ext

    def GetName(self):
        """ Get the handler name """
        return self._name

    def GetType(self):
        """ Get the handler file type """
        return richtext.RICHTEXT_TYPE_RTF

    def IsVisible(self):
        """ Always returns True """
        return True

    def LoadFile(self, ctrl, filename):
        """ Load the contents of a Rich Text Format file into a wxRichTextCtrl.
            Parameters:  ctrl       a wxRichTextCtrl.  (NOT a wxRichTextBuffer.  The wxRichTextBuffer lacks methods for direct manipulation.)
                         filename   the name of the file to be loaded
            Returns:     True if the file exists and ctrl is a wxRichTextCtrl, False otherwise """
        # We can only proceed if the file exists and we have a RichTextCtrl to populate
        if not (os.path.exists(filename) and isinstance(ctrl, richtext.RichTextCtrl)):
            return False
        # Use the RTFToRichTextCtrlParser to handle the file load
        RTFTowxRichTextCtrlParser(ctrl, filename=filename, encoding=self.GetEncoding())
        # There's no feedback from the Parser, so we'll just assume things loaded.
        return True

    def LoadString(self, ctrl, buf, insertionPoint=None, displayProgress=True):
        """ Load the contents of a Rich Text Format string buffer into a wxRichTextCtrl.
            Parameters:  ctrl             a wxRichTextCtrl.  (NOT a wxRichTextBuffer.  The wxRichTextBuffer lacks methods for direct manipulation.)
                         buf              the RTF string data to be loaded
                         insertionPoint   passed through, unchanged, to the RTFTowxRichTextCtrlParser
                         displayProgress  passed through, unchanged, to the RTFTowxRichTextCtrlParser
            Returns:     True if buf is not empty and ctrl is a wxRichTextCtrl, False otherwise """
        # We can only proceed if there is data and we have a RichTextCtrl to populate
        if not ((len(buf) > 0) and isinstance(ctrl, richtext.RichTextCtrl)):
            return False
        # At least in Transana, if the buffer is a unicode object, processing is MUCH, MUCH slower, like more than
        # 20 TIMES slower, than if the object is a string.  Converting from unicode to string speeds things up incredibly.
        # At least for Transana, this causes no problems.
        if isinstance(buf, unicode):
            buf = buf.encode(self.GetEncoding())
        # Use the RTFToRichTextCtrlParser to handle the data load
        RTFTowxRichTextCtrlParser(ctrl, buf=buf, insertionPoint=insertionPoint, encoding=self.GetEncoding(), displayProgress=displayProgress)
        # There's no feedback from the Parser, so we'll just assume things loaded.
        return True

    def SaveFile(self, buf, filename=None):
        """ Save the contents of a wxRichTextBuffer to a Rich Text Format file,
            OR, if filename is omitted, return a string with the appropriate RTF information
            Parameters:  buf       a wxRichTextBuffer or a wxRichTextCtrl
                         filename  the name of the file to be created or overwritten
            Returns:     the RTF string if filename is None, True if a file name was given,
                         or False if the XML could not be extracted from the buffer """
        # If we're passed a wxRichTextCtrl, we can get the control's buffer, which is what we need.
        if isinstance(buf, richtext.RichTextCtrl):
            buf = buf.GetBuffer()
        # Get a Rich Text XML Handler to extract the data from the wxRichTextBuffer in XML.
        # NOTE:  buf.Dump() just returns the text contents of the buffer, not any formatting information.
        xmlHandler = richtext.RichTextXMLHandler()
        # Create a stream object that can hold the XML data
        stream = cStringIO.StringIO()
        # Extract the wxRichTextBuffer data to the stream object.  If we couldn't extract the XML ...
        if not xmlHandler.SaveStream(buf, stream):
            # ... signal failure
            return False
        # Get the XML to RTF File Handler
        fileHandler = XMLToRTFHandler()
        # Use xml.sax, with the XML to RTF File Handler, to parse the XML and create an RTF Output string.
        xml.sax.parseString(stream.getvalue(), fileHandler)
        # If no file name is specified, we should return a string instead of saving to a file!
        if filename is None:
            # This creates a string-like object that behaves like a file too.
            fileobj = cStringIO.StringIO()
            # Populate the StringIO object with the RTF Output String ...
            fileHandler.saveFile(fileobj)
            # ... and return the converted RTF String
            return fileobj.getvalue()
        # If a filename is specified, we can just pass the file name to the file handler ...
        fileHandler.saveFile(filename)
        # ... and indicate success in saving
        return True

    def SetName(self, name):
        """ Set the name of the File Handler """
        self._name = name
class XMLToRTFHandler(xml.sax.handler.ContentHandler):
""" An xml.sax handler designed to convert wxRichTextCtrl's internal XML format data into
Rich Text Format data that can be saved to *.rtf files, at least to the extent that
Transana (http://www.transana.org) needs Rich Text Format features supported.
by David K. Woods (dwoods@wcer.wisc.edu) """
def __init__(self, encoding='utf8'):
    """ Initialize the XMLToRTFHandler
        Parameters:  encoding='utf8'  Character Encoding to use (only utf8 has been tested, and I don't
                                      think the RTF Parser decodes yet.) """
    # Initialize the xml.sax ContentHandler we inherit from
    xml.sax.handler.ContentHandler.__init__(self)
    # Remember the encoding to use
    self.encoding = encoding

    # Define the initial Font.  NOTE:  the font WEIGHT default comes from the FONTWEIGHT constants
    # (not the FONTSTYLE constants) so it belongs to the same family as the wx.FONTWEIGHT_BOLD value
    # it is later compared against.
    defaultFont = {u'bgcolor'        : '#FFFFFF',
                   u'fontface'       : 'Courier New',
                   u'fontpointsize'  : 12,
                   u'fontstyle'      : wx.FONTSTYLE_NORMAL,
                   u'fontunderlined' : u'0',
                   u'fontweight'     : wx.FONTWEIGHT_NORMAL,
                   u'textcolor'      : '#000000'}
    # We define multiple levels of fonts to handle cascading styles.  Each level gets its OWN copy
    # of the defaults so that changing one level does not change the others.
    self.fontAttributes = {}
    for level in (u'text', u'symbol', u'paragraph', u'paragraphlayout'):
        self.fontAttributes[level] = dict(defaultFont)

    # Define the initial Paragraph attributes.
    defaultParagraph = {u'alignment'        : u'1',
                        u'linespacing'      : u'10',
                        u'leftindent'       : u'0',
                        u'rightindent'      : u'0',
                        u'leftsubindent'    : u'0',
                        u'parspacingbefore' : u'0',
                        u'parspacingafter'  : u'0',
                        u'bulletnumber'     : None,
                        u'bulletstyle'      : None,
                        u'bulletfont'       : None,
                        u'bulletsymbol'     : None,
                        u'bullettext'       : None,
                        u'tabs'             : None}
    # We define multiple levels to handle cascading styles, again with one copy per level.
    self.paragraphAttributes = {}
    for level in (u'paragraph', u'paragraphlayout'):
        self.paragraphAttributes[level] = dict(defaultParagraph)

    # Define an initial font table
    self.fontTable = [u'Courier New']
    # define an initial color table
    self.colorTable = ['#000000', '#FF0000', '#00FF00', '#0000FF', '#FFFFFF']

    # Define the parsed text output (cStringIO used for the speed improvements it provides!)
    self.outputString = cStringIO.StringIO()
    # Define a variable for tracking what element we are changing
    self.element = ''

    if IN_TRANSANA:
        # Track whether we're inside a Transana time code
        self.inTimeCode = False

    # Handling a URL.  (This is needed whether or not we are in Transana, as it is read when
    # the characters of every text element are processed.)
    self.url = ''
def startElement(self, name, attributes):
    """ xml.sax required method for handling the starting XML element.

        Parameters:  name        the name of the XML element being started
                     attributes  the element's attributes, as provided by xml.sax

        For paragraphlayout, paragraph, symbol, and text elements, the font and paragraph
        settings are cascaded down from the level above, updated from the element's attributes,
        and cascaded on to the levels below.  Paragraph elements then add their RTF paragraph
        formatting (list prefix, alignment, line spacing, indents, spacing, tabs) to the output
        string, and text and symbol elements open an RTF block and add their font formatting. """

    # We need roman numerals for list processing
    # Copied from http://www.daniweb.com/code/snippet216865.html on 2/3/2010
    def int2roman(number):
        """ Convert a positive integer to an upper-case roman numeral string """
        numerals = { 1 : "I", 4 : "IV", 5 : "V", 9 : "IX", 10 : "X", 40 : "XL",
                     50 : "L", 90 : "XC", 100 : "C", 400 : "CD", 500 : "D", 900 : "CM", 1000 : "M" }
        result = ""
        # Work from the largest value down, using each numeral as often as it fits
        for value, numeral in sorted(numerals.items(), reverse=True):
            while number >= value:
                result += numeral
                number -= value
        return result

    def cascade(charTargets, paraTargets, source):
        """ Copy the font settings of level "source" to each level in charTargets, and the
            paragraph settings of level "source" to each level in paraTargets """
        # For each type of character style we need to cascade ...
        for target in charTargets:
            # ... iterate through the dictionary elements ...
            for key in self.fontAttributes[target].keys():
                # ... and assign the cascade source styles to the destination level
                self.fontAttributes[target][key] = self.fontAttributes[source][key]
        # For each type of paragraph style we need to cascade ...
        for target in paraTargets:
            # ... iterate through the dictionary elements ...
            for key in self.paragraphAttributes[target].keys():
                # ... and assign the cascade source styles to the destination level
                self.paragraphAttributes[target][key] = self.paragraphAttributes[source][key]

    # The attributes that describe paragraph formatting
    paragraphKeys = (u'alignment', u'linespacing', u'leftindent', u'rightindent', u'leftsubindent',
                     u'parspacingbefore', u'parspacingafter', u'bulletnumber', u'bulletstyle',
                     u'bulletfont', u'bulletsymbol', u'bullettext', u'tabs')
    # The attributes that describe font formatting
    fontKeys = (u'bgcolor', u'fontface', u'fontpointsize', u'fontstyle', u'fontunderlined',
                u'fontweight', u'textcolor')
    # A shortcut for adding to the RTF output string
    write = self.outputString.write

    # Remember the element's name
    self.element = name

    # If the element is a paragraphlayout, paragraph, symbol, or text element ...
    if name in [u'paragraphlayout', u'paragraph', u'symbol', u'text']:
        # Let's cascade the font and paragraph settings from a level up BEFORE we change things to reset the font and
        # paragraph settings to the proper initial state.
        # If we're in a Paragraph spec ...
        if name == u'paragraph':
            # ... we need to cascade paragraph, symbol, and text styles for characters and paragraph styles for
            #     paragraphs from the paragraph layout style
            cascade([u'paragraph', u'symbol', u'text'], [u'paragraph'], u'paragraphlayout')
        # If we're in a Text spec ...
        elif name == u'text':
            # ... we need to cascade text styles for characters from the paragraph style
            cascade([u'text'], [], u'paragraph')
        # If we're in a Symbol spec ...
        elif name == u'symbol':
            # ... we need to cascade symbol styles for characters from the paragraph style
            cascade([u'symbol'], [], u'paragraph')

        # If the element is a paragraph element or a paragraph layout element, there is extra processing to do at the start
        if name in [u'paragraph', u'paragraphlayout']:
            # ... iterate through the element attributes looking for paragraph attributes
            for attrName in attributes.keys():
                # If the attribute is a paragraph format attribute ...
                if attrName in paragraphKeys:
                    # ... update the current paragraph dictionary
                    self.paragraphAttributes[name][attrName] = attributes[attrName]

        # ... iterate through the element attributes looking for font attributes
        for attrName in attributes.keys():
            # Get the attribute's value BEFORE any renaming.  (Looking the value up under the new name
            # raises a KeyError when only the old name is present.)
            attrValue = attributes[attrName]
            fontKey = attrName
            # A "fontsize" attribute is stored as "fontpointsize"
            if attrName == u'fontsize':
                fontKey = u'fontpointsize'
                # If an explicit "fontpointsize" attribute is present too, it takes precedence.
                attrValue = attributes.get(u'fontpointsize', attrValue)
            # If the attribute is a font format attribute ...
            if fontKey in fontKeys:
                # ... update the current font dictionary
                self.fontAttributes[name][fontKey] = attrValue
                # If the attribute is a font name that is not already in the font table ...
                if (fontKey == u'fontface') and (attrValue not in self.fontTable):
                    # ... add the font name to the font table list
                    self.fontTable.append(attrValue)
            # If the element is a text element and the attribute is a url attribute ...
            if (name == u'text') and (attrName == u'url'):
                # ... capture the URL data.
                self.url = attrValue

        # If the URL is a Transana Object link ...
        # (This should be done after all text attributes are processed so formatting can be corrected.)
        if (len(self.url) > 9) and (self.url[:9].lower() == 'transana:'):
            # ... completely remove the URL value
            self.url = ''
            # Let's remove the Hyperlink formatting too!
            self.fontAttributes[u'text'][u'textcolor'] = '#000000'
            self.fontAttributes[u'text'][u'fontunderlined'] = u'0'

        # Let's cascade the font and paragraph settings we've just changed.
        # If we're in a Paragraph Layout spec ...
        if name == u'paragraphlayout':
            # ... we need to cascade paragraph, symbol, and text styles for characters and paragraph styles for paragraphs
            cascade([u'paragraph', u'symbol', u'text'], [u'paragraph'], name)
        # If we're in a Paragraph spec ...
        elif name == u'paragraph':
            # ... we need to cascade symbol and text styles for characters
            cascade([u'symbol', u'text'], [], name)

        if DEBUG:
            # List unknown attributes
            knownKeys = fontKeys + paragraphKeys + (u'fontsize', u'url')
            for attrName in attributes.keys():
                if attrName not in knownKeys:
                    print("Unknown %s attribute: %s %s" % (name, attrName, attributes[attrName]))

    # If the element is an image element ...
    elif name in [u'image']:
        # ... if we have a PNG graphic ...
        if attributes[u'imagetype'] == u'15':    # wx.BITMAP_TYPE_PNG = 15
            # ... signal that we have a PNG image to process ...
            self.elementType = "ImagePNG"
            # ... and start the RTF code for a PNG image block
            write('{\\pict\\pngblip ')
        # It appears to me that all images will be PNG images coming from the RichTextCtrl.
        else:
            # if not, signal a unknown image type
            self.elementType = 'ImageUnknown'
            print("Image of UNKNOWN TYPE!! %s" % (attributes.keys(),))

    # If the element is a data or richtext element ...
    elif name in [u'data', u'richtext']:
        # ... we should do nothing here at this time
        pass

    # If we have an unhandled element ...
    else:
        # ... output a message and the element attributes.
        print("PyRTFParser.XMLToRTFHandler.startElement(): Unknown XML tag: %s" % (name,))
        for attrName in attributes.keys():
            print("%s %s" % (attrName, attributes[attrName]))
        print("")

    # If the element is a paragraph element ...
    if name in [u'paragraph']:
        # The paragraph settings that are now in effect
        para = self.paragraphAttributes[u'paragraph']

        # Code for handling bullet lists and numbered lists is preliminary and probably very buggy
        # If we have a bullet or numbered list specification ...
        if para[u'bulletstyle'] is not None:
            # ... indicate that in the RTF output string
            write('{\\listtext\\pard\\plain')
            # Convert the Bullet Style to a hex string so we can interpret it correctly.
            # NOTE(review):  comparing whole hex digits only recognizes a style flag when it is the only
            #                flag set in its group of four bits.  Bit masks would be more robust.
            styleHexStr = "%04x" % int(para[u'bulletstyle'])
            # If we have a known symbol bullet (TEXT_ATTR_BULLET_STYLE_SYMBOL and defined bulletsymbol) ...
            if (styleHexStr[2] == '2') and (para[u'bulletsymbol'] is not None):
                # ... add that to the RTF Output String
                write("\\f%s %s\\tab}" % (self.fontTable.index(self.fontAttributes[name][u'fontface']), chr(int(para[u'bulletsymbol']))))
            # if the second characters is a "2", we have richtext.TEXT_ATTR_BULLET_STYLE_STANDARD
            elif styleHexStr[1] == '2':
                # If Symbol font is not yet in the Font Table ...
                if 'Symbol' not in self.fontTable:
                    # ... then add it now.
                    self.fontTable.append('Symbol')
                # add the bullet symbol in Symbol font to the RTF Output String
                write("\\f%s \\'b7\\tab}" % self.fontTable.index('Symbol'))
            # If we have a known bullet NUMBER (i.e. a numbered list) ...
            elif para[u'bulletnumber'] is not None:
                # Initialize variables used for presenting the proper "number" style and punctuation
                numberChar = ''
                numberLeadingChar = ''
                numberTrailingChar = ''
                # Put the bullet "number" into the correct format
                # TEXT_ATTR_BULLET_STYLE_ARABIC
                if styleHexStr[3] == '1':
                    numberChar = para[u'bulletnumber']
                # TEXT_ATTR_BULLET_STYLE_LETTERS_UPPER
                elif styleHexStr[3] == '2':
                    # (ascii_uppercase is always exactly A through Z, regardless of the locale)
                    numberChar = string.ascii_uppercase[int(para[u'bulletnumber']) - 1]
                # TEXT_ATTR_BULLET_STYLE_LETTERS_LOWER
                elif styleHexStr[3] == '4':
                    numberChar = string.ascii_lowercase[int(para[u'bulletnumber']) - 1]
                # TEXT_ATTR_BULLET_STYLE_ROMAN_UPPER
                elif styleHexStr[3] == '8':
                    numberChar = int2roman(int(para[u'bulletnumber']))
                # TEXT_ATTR_BULLET_STYLE_ROMAN_LOWER
                elif styleHexStr[2] == '1':
                    numberChar = int2roman(int(para[u'bulletnumber'])).lower()
                # Put the bullet "number" into the correct punctuation structure
                # TEXT_ATTR_BULLET_STYLE_PERIOD
                if styleHexStr[1] == '1':
                    numberTrailingChar = '.'
                # TEXT_ATTR_BULLET_STYLE_RIGHT_PARENTHESIS
                elif styleHexStr[1] == '4':
                    numberTrailingChar = ')'
                # TEXT_ATTR_BULLET_STYLE_PARENTHESIS
                elif styleHexStr[2] == '8':
                    numberLeadingChar = '('
                    numberTrailingChar = ')'
                # ... add that to the RTF Output String
                write("\\f%s %s%s%s\\tab}" % (self.fontTable.index(self.fontAttributes[name][u'fontface']), numberLeadingChar, numberChar, numberTrailingChar))
            # If we have a known bullet symbol ...
            elif para[u'bulletsymbol'] is not None:
                # ... add that to the RTF Output String
                write("\\f%s %s\\tab}" % (self.fontTable.index(self.fontAttributes[name][u'fontface']), unichr(int(para[u'bulletsymbol']))))
            # If we still don't know what kind of bullet we have, we're in trouble.
            else:
                print("PyRTFParser.startElement() SYMBOL INSERTION FAILURE")

        # Signal the start of a new paragraph in the RTF output string
        write('\\pard')

        # Paragraph alignment:  left is u'1', centered is u'2', right is u'3'
        alignmentCodes = {u'1' : '\\ql', u'2' : '\\qc', u'3' : '\\qr'}
        alignment = para[u'alignment']
        if alignment in alignmentCodes:
            write(alignmentCodes[alignment])
        else:
            print("Unknown alignment: %s %s" % (alignment, type(alignment)))

        # Line spacing is expressed in tenths of a line.  The RTF values are the ones Word seems to use.
        lineSpacingCodes = {u'11' : '\\sl264\\slmult1',    # 11 point line spacing
                            u'12' : '\\sl288\\slmult1',    # 12 point line spacing
                            u'15' : '\\sl360\\slmult1',    # 1.5 line spacing
                            u'20' : '\\sl480\\slmult1',    # double line spacing
                            u'25' : '\\sl600\\slmult1',    # 2.5 line spacing
                            u'30' : '\\sl720\\slmult1'}    # triple line spacing
        linespacing = para[u'linespacing']
        # line spacing u'10' is single line spacing, which is NOT included in the RTF as it is the default.
        if linespacing in [u'0', u'10']:
            pass
        elif linespacing in lineSpacingCodes:
            write(lineSpacingCodes[linespacing])
        else:
            print("Unknown linespacing: %s %s" % (linespacing, type(linespacing)))

        # Paragraph Margins and first-line indents
        # First, let's convert the unicode strings we got from the XML to integers and translate from wxRichTextCtrl's
        # system to RTF's system.
        # Left Indent in RTF is the sum of wxRichTextCtrl's left indent and left subindent
        leftindent = int(para[u'leftindent']) + int(para[u'leftsubindent'])
        # The First Line Indent in RTF is the wxRichTextCtrl's left indent minus the left indent calculated above.
        firstlineindent = int(para[u'leftindent']) - leftindent
        # The Right Indent translates directly
        rightindent = int(para[u'rightindent'])
        # Now let's convert what we got from the conversions above to twips.
        leftMargin = self.twips(leftindent / 100.0)
        rightMargin = self.twips(rightindent / 100.0)
        firstIndent = self.twips(firstlineindent / 100.0)
        # Now add these values to the RTF output string
        write('\\li%d\\ri%d\\fi%d' % (leftMargin, rightMargin, firstIndent))

        # Add non-zero Spacing before and after paragraphs to the RTF output String
        spaceBefore = int(para[u'parspacingbefore'])
        spaceAfter = int(para[u'parspacingafter'])
        if spaceBefore != 0:
            write('\\sb%d' % self.twips(spaceBefore / 100.0))
        if spaceAfter > 0:
            write('\\sa%d' % self.twips(spaceAfter / 100.0))
        # Due to a bug in the RichTextEditCtrl, the parspacingafter value may sometimes be NEGATIVE, which of course doesn't
        # make sense outside of the RichTextEditCtrl.  This adjusts for that.
        else:
            write('\\sa%d' % self.twips(max(spaceAfter + spaceBefore, 0) / 100.0))

        # If Tabs are defined ...
        if para[u'tabs'] is not None:
            # ... break the tab data into its component pieces.  For each tab stop ...
            for tabStop in para[u'tabs'].split(','):
                # ... (assuming the data isn't empty) ...
                if tabStop != u'':
                    # ... add the tab stop data to the RTF output string
                    write('\\tx%d' % self.twips(int(tabStop) / 100.0))

    # Add Font formatting when we process text or symbol tags, as text and symbol specs can modify paragraph-level font specifications
    if name in [u'text', u'symbol']:
        # The font settings that are now in effect for this element
        font = self.fontAttributes[name]
        # Begin an RTF block
        write('{')
        # Add Font Face information
        write('\\f%d' % self.fontTable.index(font[u'fontface']))
        # Add Font Size information.  (RTF font sizes are in half-points.)
        write('\\fs%d' % (int(font[u'fontpointsize']) * 2))
        # If bold, add Bold
        if font[u'fontweight'] == str(wx.FONTWEIGHT_BOLD):
            write('\\b')
        # If Italics, add Italics
        if font[u'fontstyle'] == str(wx.FONTSTYLE_ITALIC):
            write('\\i')
        # If Underline, add Underline
        if font[u'fontunderlined'] == u'1':
            write('\\ul')
        # If Text Color is not black ...
        if font[u'textcolor'] != '#000000':
            # Check the color table.  If the color is not there ...
            if font[u'textcolor'] not in self.colorTable:
                # ... add it to the color table
                self.colorTable.append(font[u'textcolor'])
            # ... Add text foreground color
            write('\\cf%d' % self.colorTable.index(font[u'textcolor']))
        # If Text Background Color is not White ...
        if font[u'bgcolor'] != '#FFFFFF':
            # Check the color table.  If the color is not there ...
            if font[u'bgcolor'] not in self.colorTable:
                # ... add it to the color table
                self.colorTable.append(font[u'bgcolor'])
            # ... Add text background color to the RTF output string
            # "highlight" is used rather than "cb" for WORD compatibility.  ("cb" works in OS X TextEdit.)
            write('\\highlight%d' % self.colorTable.index(font[u'bgcolor']))
        # Done with formatting string.  Add a space to terminate the formatting block, but don't close the text block yet.
        write(' ')
def characters(self, data):
    """ xml.sax content handler callback.  Receives the character data found inside the current XML element
        and adds its RTF representation to self.outputString.

        Parameters:  data   the chunk of character data handed over by the SAX parser

        What is done with the data depends on self.element:
          'text'     the data is escaped for RTF and written, wrapped in a HYPERLINK field if self.url is set
          'symbol'   the data is read as a character number and the matching character is written
          'data'     the data is written unchanged if self.elementType is 'ImagePNG'
          other      anything that is not pure whitespace is reported on stdout as unhandled """
    currentElement = self.element

    # ----- Character data inside a text element -----
    if currentElement == 'text':
        emit = self.outputString.write
        # Escape the data for RTF.  Backslashes are doubled first, then backslash-n pairs become the '\line'
        # specification, and finally the curly brackets are escaped.
        data = (data.replace('\\', '\\\\')
                    .replace('\\n', '\\line')
                    .replace('{', '\\{')
                    .replace('}', '\\}'))
        # A value in self.url means this text is the visible part of a hyperlink, so the RTF field
        # has to be opened before the text and closed after it.
        inHyperlink = (self.url != '')
        if inHyperlink:
            emit('{\\field{\\*\\fldinst HYPERLINK "%s"}{\\fldrslt ' % self.url)

        if len(data) == 1:
            # Single characters get special treatment.
            charNumber = ord(data)
            if data in ('<', '>') or charNumber > 127:
                # Angle brackets and characters above 127 are written as RTF unicode character NUMBERS
                emit("\\u%d\\'3f" % charNumber)
            elif data != '"':
                # Anything else, except for a lone quotation mark, is written in encoded form
                emit(data.encode(self.encoding))
            # Inside Transana, keep track of Time Codes and their "hidden" data
            if IN_TRANSANA:
                if charNumber == 164:
                    # Character 164 is the Transana Time Code character.  It starts a time code.
                    self.inTimeCode = True
                elif self.inTimeCode and data == '>':
                    # The closing angle bracket ends the time code data.  A space finishes the specification.
                    self.inTimeCode = False
                    emit(' ')
        else:
            # Text with leading or trailing spaces arrives wrapped in quotation marks, which must be dropped.
            # The (space)(quotationmark) combination is left alone here and dealt with below.
            if data != ' "' and (data[0] == '"' or data[-1] == '"'):
                if data[0] == '"':
                    data = data[1:]
                if data[-1] == '"':
                    data = data[:-1]
            # In Transana, the (space)(quotationmark) combination is skipped.  Everything else is
            # encoded and written.
            if not IN_TRANSANA or data != ' "':
                emit(data.encode(self.encoding))

        if inHyperlink:
            # Close the hyperlink field block and forget the URL, as it has now been used
            emit('}}')
            self.url = ''
        return

    # ----- Character data inside a symbol element -----
    if currentElement == 'symbol':
        # Skip data that is only whitespace (the single-newline test is kept from the original condition)
        if data.strip() and (len(data) != 1 or ord(data) != 10):
            # The data is a character number.  Convert it to the character and write it in encoded form.
            data = unichr(int(data))
            self.outputString.write(data.encode(self.encoding))
        return

    # ----- Character data inside a data element -----
    if currentElement == 'data':
        if self.elementType == 'ImagePNG':
            # PNG image data can be passed straight through to the RTF output string
            self.outputString.write(data)
        else:
            # No other kind of data is supported, so just tell the developer about it
            print("I don't know how to handle the data!!")
        return

    # ----- Character data outside of any element handled above -----
    # Pure whitespace is ignored.  Anything else is reported to the developer.
    if data.strip() != '':
        print("PyRTFParser.characters(): Unhandled text.")
        print('"%s"' % data)
def endElement(self, name):
    """ xml.sax content handler callback, called when the parser reaches the close tag of an XML element.

        Parameters:  name   the name of the XML element that is ending """
    # The end of a text, data, or symbol element closes the current RTF block ...
    if name in (u'text', u'data', u'symbol'):
        self.outputString.write('}')
    # ... while the end of a paragraph element adds the RTF end-of-paragraph specification.
    elif name == u'paragraph':
        self.outputString.write('\\par\n')

    # Once one of these elements has ended, we are no longer processing that type of element, so the
    # current element gets cleared.  (NOTE:  symbol is not in this list, so self.element is left unchanged
    # when a symbol element ends.)
    if name in (u'text', u'data', u'paragraph', u'paragraphlayout', u'richtext'):
        self.element = None

    # NOTE:  saveFile() is not called when the richtext element ends.  The caller may want the RTF output
    #        string or a data stream rather than a file, so saving is left to the caller.
def saveFile(self, filename):
    """ Write the complete RTF document to a file or to a file-like object.  The document is made up of
        the RTF header, the font table built from self.fontTable, the color table built from self.colorTable,
        generic page setup information, and the RTF body accumulated in self.outputString.

        Parameters:  filename   either a file name, in which case the file is opened for writing and is
                                always closed again, even if writing fails, or an object with a write()
                                method (such as a StringIO object), which is written to and left open
                                for the caller. """
    # Decide ONCE whether we were given something to write to or the name of a file to create.  An object
    # that has a write() method is used as is.  (This replaces an isinstance() test on the string types,
    # which an earlier comment here noted could not be used to identify a StringIO object.)
    openedHere = not hasattr(filename, 'write')
    if openedHere:
        f = open(filename, 'w')
    else:
        f = filename
    try:
        # Add the appropriate RTF header information to the file.  This is VERY generic RTF information.
        f.write('{\\rtf1\\ansi\\ansicpg1252\\deff0\n')
        # Write the Font Table.  A font's position in self.fontTable is its RTF font number.
        f.write('{\\fonttbl\n')
        for fontNumber, fontName in enumerate(self.fontTable):
            f.write('{\\f%d\\fmodern\\fcharset1\\fprq1 %s;}\n' % (fontNumber, fontName))
        f.write('}\n')
        # Write the Color Table.  Colors are stored as '#RRGGBB' strings, so each pair of hex digits
        # is converted to the decimal value RTF expects.
        f.write('{\\colortbl\n')
        for color in self.colorTable:
            f.write('\\red%d\\green%d\\blue%d;' % (int(color[1:3], 16), int(color[3:5], 16), int(color[5:7], 16)))
        f.write('}\n')
        # Write the page definition information.  This is VERY GENERIC information, and probably should
        # be converted to pull data from the default printer definition or something.
        # Start with no kerning, and a normal data view
        f.write('\\kerning0\\cf0\\viewkind1')
        # paper width of 8.5 inches (21.59 cm) and paper height of 11 inches (27.94 cm)
        f.write('\\paperw%d' % self.twips(21.59))
        f.write('\\paperh%d' % self.twips(27.94))
        # Margins of 1 inch (2.54 cm) all around
        f.write('\\margl%d' % self.twips(2.54))
        f.write('\\margr%d' % self.twips(2.54))
        f.write('\\margt%d' % self.twips(2.54))
        f.write('\\margb%d' % self.twips(2.54))
        # Specify widow/orphan control
        f.write('\\widowctrl\n')
        # Now add the RTF output string from the XML parser
        f.write(self.outputString.getvalue())
        # Close the RTF document block
        f.write('}')
    finally:
        # Only close what was opened here.  A file-like object passed in belongs to the caller.
        if openedHere:
            f.close()
def twips(self, cm):
    """ Convert a measurement in centimeters to twips, the measurement unit of the RTF specification.
        A twip is 1/20th of a point, and a point is 1/72nd of an inch, so there are 1440 twips to the inch.

        Parameters:  cm   the measurement in centimeters

        The measurement is converted to whole points first (by adding 0.5 and truncating) and only then
        scaled to twips, so the value returned is always a multiple of 20. """
    # centimeters to inches
    inches = cm / 2.54
    # inches to whole points
    points = int((inches * 72) + 0.5)
    # points to twips
    return points * 20
class RTFTowxRichTextCtrlParser:
""" An RTF Parser designed to convert Rich Text Format data from *.rtf files to
wxRichTextCtrl's internal format, at least to the extent that
Transana (http://www.transana.org) needs Rich Text Format features supported.
by David K. Woods (dwoods@wcer.wisc.edu) """
def __init__(self, txtCtrl, filename=None, buf=None, insertionPoint=None, encoding='utf8', displayProgress=True):
    """ Initialize the RTFToRichTextCtrlParser and immediately process the RTF document.

        Parameters:  txtCtrl               a wx.RichTextCtrl, NOT a wx.RichTextBuffer.  The buffer doesn't provide
                                           an easy way to add text!
                     filename=None         a Rich Text Format (*.rtf) file name
                     buf=None              a string with RTF-encoded data
                     insertionPoint=None   if given, the position in txtCtrl that is set as the insertion point
                                           before the document is processed
                     encoding='utf8'       Character Encoding to use (only utf8 has been tested, and I don't
                                           think the RTF Parser decodes yet.)
                     displayProgress=True  passed on to process_doc()

        You can pass in either a filename or a buffer string.  If both are passed, only the file will be imported,
        provided the file exists.  If the file does not exist, the buffer string is used.  If there is neither,
        an empty document is processed. """
    # Remember the wxRichTextCtrl to populate
    self.txtCtrl = txtCtrl
    # Remember the insertion point, and how far it is from the current end of the control's contents
    self.insertionPoint = insertionPoint
    if insertionPoint is not None:
        self.insertionOffset = self.txtCtrl.GetLastPosition() - insertionPoint
    else:
        self.insertionOffset = 0
    # Initialize the Code Page setting
    self.codePage = 0
    # At present, encoding is not used!
    self.encoding = encoding

    # Create a default font specification.  I've chosen Courier New, 12 point, black on white.
    self.font = {'fontfacename'  : 'Courier New',
                 'fontpointsize' : 12,
                 'fontcolor'     : wx.Colour(0, 0, 0),
                 'fontbgcolor'   : wx.Colour(255, 255, 255)}
    # Create an object to hold font specifications for the current font
    self.txtAttr = richtext.RichTextAttr()
    # Apply the default font specifications to the current font object
    self.SetTxtStyle(fontFace = self.font['fontfacename'], fontSize = self.font['fontpointsize'],
                     fontColor = self.font['fontcolor'], fontBgColor = self.font['fontbgcolor'],
                     fontBold = False, fontItalic = False, fontUnderline = False)

    # If a file name was passed in and the file exists ...
    if (filename is not None) and os.path.exists(filename):
        # ... read its contents into a buffer.  The with block closes the file even if the read fails.
        with open(filename, "r") as f:
            self.buffer = f.read()
    # If there's a buffer string passed in ...
    elif buf is not None:
        self.buffer = buf
    # If there's nothing to read ...
    else:
        # ... create an empty buffer variable
        self.buffer = ''
    # Set the processing index to the start of the buffer
    self.index = 0

    # Initialize variables related to the Font Table
    self.in_font_table = False
    self.in_font_block = False
    self.fontName = ""
    self.fontNumber = -1
    self.fontCharSet = -1
    self.fontEncoding = 'utf8'
    self.fontTable = {}
    # Initialize the Default Font Number
    self.defaultFontNumber = 0

    # Initialize variables related to the Color Table
    self.in_color_table = False
    self.colorIndex = 0
    self.colorTable = [0x000000]

    # Initialize Paragraph settings
    self.paragraph = {'alignment'       : 'left',
                      'linespacing'     : wx.TEXT_ATTR_LINE_SPACING_NORMAL,
                      'leftindent'      : 0,
                      'rightindent'     : 0,
                      'firstlineindent' : 0,
                      'spacingbefore'   : 0,
                      'spacingafter'    : 0,
                      'tabs'            : []}

    # Initialize variables related to processing images
    self.in_image = False
    self.image_type = None
    self.image_loaded = False

    # Initialize variables related to processing URLs
    self.in_field = 0
    self.in_url = False
    self.in_link = False
    self.url = ''

    # Initialize variables related to list processing
    self.in_list = False
    self.list_txt = ''

    # Initialize RTF block nesting counter
    self.nest = 0

    # Set the insertion point, if one is passed in
    if self.insertionPoint is not None:
        self.txtCtrl.SetInsertionPoint(insertionPoint)

    # Process the RTF document.  This has to come last, as it relies on the settings made above.
    self.process_doc(displayProgress)
def SetTxtStyle(self, fontColor = None, fontBgColor = None, fontFace = None, fontSize = None,
fontBold = None, fontItalic = None, fontUnderline = None,
parAlign = None, parLeftIndent = None, parRightIndent = None,
parTabs = None, parLineSpacing = None, parSpacingBefore = None, parSpacingAfter = None):
""" I find some of the RichTextCtrl method names to be misleading. Some character styles are stacked in the RichTextCtrl,
and they are removed in the reverse order from how they are added, regardless of the method called.
For example, starting with plain text, BeginBold() makes it bold, and BeginItalic() makes it bold-italic. EndBold()
should make it italic but instead makes it bold. EndItalic() takes us back to plain text by removing the bold.
According to Julian, this functions "as expected" because of the way the RichTextCtrl is written.