Skip to content

Commit 8c686dc

Browse files
author
Brendon
committed
Get default charset from rtf header instead of just using cp1252
1 parent 7f601c4 commit 8c686dc

File tree

1 file changed

+35
-7
lines changed

1 file changed

+35
-7
lines changed

pyth/plugins/rtf15/reader.py

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,18 @@
7373
255: "cp850", # OEM
7474
}
7575

76+
# Code page number -> Python codec name, for the codecs that the standard
# ``encodings`` package names "cp<number>".
_CODEPAGES_BY_NUMBER = dict(
    (x, "cp%s" % x) for x in (1006, 1026, 1140, 1250, 1251, 1252, 1253, 1254, 1255,
                              1256, 1257, 1258, 424, 437, 500, 737, 775, 850, 852, 855,
                              856, 857, 860, 861, 862, 863, 864, 865, 866, 869, 874,
                              875, 932, 949, 950))

# Code page 37 cannot go through the "cp%s" template above: the codec is
# spelled "cp037" (zero-padded) and "cp37" is not a registered name or alias,
# so decoding with it would raise LookupError.
_CODEPAGES_BY_NUMBER[37] = "cp037"

# Miscellaneous, incomplete
# Macintosh code pages, numbered as in the Windows code page identifier list
# (10006 is Greek and 10007 is Cyrillic).
_CODEPAGES_BY_NUMBER.update({
    10000: "mac-roman",
    10006: "mac-greek",
    10007: "mac-cyrillic",
    10029: "mac-latin2",
    10079: "mac-iceland",
    10081: "mac-turkish",
})
7888

7989

8090
class BackslashEscape(Exception):
@@ -107,8 +117,9 @@ def go(self):
107117

108118
self.source.seek(0)
109119

110-
self.group = Group()
111120
self.charsetTable = None
121+
self.charset = 'cp1252'
122+
self.group = Group(self)
112123
self.stack = [self.group]
113124
self.parse()
114125
return self.build()
@@ -124,14 +135,15 @@ def parse(self):
124135
if next in '\r\n':
125136
continue
126137
if next == '{':
127-
subGroup = Group(self.group, self.charsetTable)
138+
subGroup = Group(self, self.group, self.charsetTable)
128139
self.stack.append(subGroup)
129140
self.group = subGroup
130141
elif next == '}':
131142
subGroup = self.stack.pop()
132143
self.group = self.stack[-1]
133144

134145
subGroup.finalize()
146+
135147
if subGroup.specialMeaning == 'FONT_TABLE':
136148
self.charsetTable = subGroup.charsetTable
137149
self.group.content.append(subGroup)
@@ -331,15 +343,16 @@ def handle_ReadableMarker(self, marker):
331343

332344
class Group(object):
333345

334-
def __init__(self, parent=None, charsetTable=None):
346+
def __init__(self, reader, parent=None, charsetTable=None):
347+
self.reader = reader
335348
self.parent = parent
336349

337350
if parent:
338351
self.props = parent.props.copy()
339352
self.charset = self.parent.charset
340353
else:
341354
self.props = {}
342-
self.charset = 'cp1252' # ?
355+
self.charset = self.reader.charset
343356

344357
self.specialMeaning = None
345358
self.skip = False
@@ -416,6 +429,20 @@ def flatten(self):
416429
return stuff
417430

418431

432+
# Header stuff
433+
def handle_ansi(self): self.charset = self.reader.charset = 'cp1252'
434+
def handle_mac(self): self.charset = self.reader.charset = 'mac-roman'
435+
def handle_pc(self): self.charset = self.reader.charset = 'cp437'
436+
def handle_pca(self): self.charset = self.reader.charset = 'cp850'
437+
438+
def handle_ansicpg(self, codepage):
    """Header keyword ``ansicpg``: select the charset for a numeric code page.

    ``codepage`` is converted with ``int()`` and looked up in
    ``_CODEPAGES_BY_NUMBER``; the resulting codec name is stored on both
    this group and the reader.

    Raises ValueError if the number is not in the table (or if ``codepage``
    cannot be converted to an int).
    """
    number = int(codepage)
    encoding = _CODEPAGES_BY_NUMBER.get(number)
    if encoding is None:
        raise ValueError("Unknown codepage %s" % number)
    self.charset = self.reader.charset = encoding
444+
445+
419446
def handle_fonttbl(self):
420447
self.specialMeaning = 'FONT_TABLE'
421448
self.charsetTable = {}
@@ -449,7 +476,7 @@ def handle_ansi_escape(self, code):
449476
if isinstance(self.charset, dict):
450477
uni_code = self.charset.get(code)
451478
if uni_code is None:
452-
char = '?'
479+
char = u'?'
453480
else:
454481
char = unichr(uni_code)
455482

@@ -458,7 +485,8 @@ def handle_ansi_escape(self, code):
458485
try:
459486
char = chr(code).decode(self.charset)
460487
except UnicodeDecodeError:
461-
char = '?'
488+
print code, self.charset
489+
char = u'?'
462490

463491
self.content.append(char)
464492

0 commit comments

Comments
 (0)