Skip to content

Commit de9f070

Browse files
committed
Detect non-RTF files early, and handle symbol font
1 parent ba62512 commit de9f070

File tree

2 files changed

+54
-7
lines changed

2 files changed

+54
-7
lines changed

pyth/errors.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
class WrongFileType(ValueError):
    """Raised when input does not look like the file type a reader expects.

    The RTF reader raises it when the source does not start with the RTF
    header. It subclasses ValueError, so callers that already catch
    ValueError also catch this exception.
    """
3+

pyth/plugins/rtf15/reader.py

Lines changed: 51 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,25 +11,46 @@
1111
from pyth import document
1212
from pyth.format import PythReader
1313

14-
1514
_CONTROLCHARS = set(string.ascii_letters + string.digits + "-*")
1615
_DIGITS = set(string.digits)
1716

17+
# Maps Symbol typeface character codes to Unicode code points; extracted from http://en.wikipedia.org/wiki/Symbol_(typeface)
18+
# Keys are character codes in the Symbol font; values are the Unicode code
# points of the matching characters. Codes missing from the table (for example
# 127-159, 240 and 255) have no entry, so lookups with .get() return None.
# Code 32 (space) is included so an escaped space is not treated as unknown.
# NOTE(review): 226-228 map to the same code points as 210-212; confirm this
# is intended against the Symbol encoding table.
symbolTable = {
    32: 32,
    33: 33, 34: 8704, 35: 35, 36: 8707, 37: 37, 38: 38, 39: 8717, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46,
    47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61,
    62: 62, 63: 63, 64: 8773, 65: 913, 66: 914, 67: 935, 68: 916, 69: 917, 70: 934, 71: 915, 72: 919, 73: 921, 74: 977,
    75: 922, 76: 923, 77: 924, 78: 925, 79: 927, 80: 928, 81: 920, 82: 929, 83: 931, 84: 932, 85: 933, 86: 962, 87: 937,
    88: 926, 89: 936, 90: 918, 91: 91, 92: 8756, 93: 93, 94: 8869, 95: 95, 96: 63717, 97: 945, 98: 946, 99: 967, 100: 948,
    101: 949, 102: 966, 103: 947, 104: 951, 105: 953, 106: 981, 107: 954, 108: 955, 109: 956, 110: 957, 111: 959, 112: 960,
    113: 952, 114: 961, 115: 963, 116: 964, 117: 965, 118: 982, 119: 969, 120: 958, 121: 968, 122: 950, 123: 123, 124: 124,
    125: 125, 126: 126, 160: 8364, 161: 978, 162: 697, 163: 8804, 164: 8260, 165: 8734, 166: 402, 167: 9827, 168: 9830,
    169: 9829, 170: 9824, 171: 8596, 172: 8592, 173: 8593, 174: 8594, 175: 8595, 176: 176, 177: 177, 178: 698, 179: 8805,
    180: 215, 181: 8733, 182: 8706, 183: 8226, 184: 247, 185: 8800, 186: 8801, 187: 8776, 188: 8230, 189: 9168, 190: 9135,
    191: 8629, 192: 8501, 193: 8465, 194: 8476, 195: 8472, 196: 8855, 197: 8853, 198: 8709, 199: 8745, 200: 8746, 201: 8835,
    202: 8839, 203: 8836, 204: 8834, 205: 8838, 206: 8712, 207: 8713, 208: 8736, 209: 8711, 210: 174, 211: 169, 212: 8482,
    213: 8719, 214: 8730, 215: 8901, 216: 172, 217: 8743, 218: 8744, 219: 8660, 220: 8656, 221: 8657, 222: 8658, 223: 8659,
    224: 9674, 225: 12296, 226: 174, 227: 169, 228: 8482, 229: 8721, 230: 9115, 231: 9116, 232: 9117, 233: 9121, 234: 9122,
    235: 9123, 236: 9127, 237: 9128, 238: 9129, 239: 9130, 241: 12297, 242: 8747, 243: 8992, 244: 9134, 245: 8993, 246: 9118,
    247: 9119, 248: 9120, 249: 9124, 250: 9125, 251: 9126, 252: 9131, 253: 9132, 254: 9133,
}
35+
1836
_CODEPAGES = {
1937
0: "cp1252", # ANSI
2038
1: "cp1252", # Default (this is wrong, but there is no right)
2139

22-
# Does Python have built-in support for these? What is it?
23-
# 2: "42", # Symbol
40+
2: symbolTable, # Symbol
2441
77: "mac-roman", # Mac Roman
42+
43+
# Does Python have built-in support for these? What is it?
2544
# 78: "10001", # Mac Shift Jis
2645
# 79: "10003", # Mac Hangul
2746
# 80: "10008", # Mac GB2312
2847
# 81: "10002", # Mac Big5
2948
# 83: "10005", # Mac Hebrew
49+
3050
84: "mac-arabic", # Mac Arabic
3151
85: "mac-greek", # Mac Greek
3252
86: "mac-turkish", # Mac Turkish
53+
3354
# 87: "10021", # Mac Thai
3455
# 88: "10029", # Mac East Europe
3556
# 89: "10007", # Mac Russian
@@ -53,6 +74,9 @@
5374
}
5475

5576

77+
78+
79+
5680
class BackslashEscape(Exception):
    """Exception type defined by the RTF reader module.

    NOTE(review): the raise and catch sites are outside this excerpt;
    presumably it signals an escaped backslash while scanning -- confirm
    against the parser before relying on that.
    """
5882

@@ -76,6 +100,13 @@ def __init__(self, source):
76100

77101
def go(self):
78102
self.source.seek(0)
103+
104+
if self.source.read(5) != r"{\rtf":
105+
from pyth.errors import WrongFileType
106+
raise WrongFileType("Doesn't look like an RTF file")
107+
108+
self.source.seek(0)
109+
79110
self.group = Group()
80111
self.charsetTable = None
81112
self.stack = [self.group]
@@ -408,10 +439,23 @@ def handle_fcharset(self, charsetNum):
408439

409440

410441
def handle_ansi_escape(self, code):
    """Decode one hex-encoded character escape and append it to self.content.

    ``code`` is the hexadecimal text of the escape. ``self.charset`` is
    either a dict mapping character codes to Unicode code points (as used
    for the Symbol font) or the name of a codec to decode a single byte with.

    Whatever cannot be decoded is appended as '?': malformed hex, a code
    missing from the table, or a byte the codec rejects.
    """
    try:
        byte_value = int(code, 16)
    except ValueError:
        # Malformed hex takes the same '?' fallback as an undecodable byte,
        # so a bad escape does not abort reading the whole document.
        self.content.append('?')
        return

    if isinstance(self.charset, dict):
        # Table-driven charset: translate the code through the mapping.
        uni_code = self.charset.get(byte_value)
        char = '?' if uni_code is None else unichr(uni_code)
    else:
        # Codec-driven charset: decode the single byte with the named codec.
        try:
            char = chr(byte_value).decode(self.charset)
        except UnicodeDecodeError:
            char = '?'

    self.content.append(char)
415459

416460

417461
def handle_control_symbol(self, symbol):

0 commit comments

Comments
 (0)