7373 255 : "cp850" , # OEM
7474}
7575
# Code pages whose Python codec name is "cp" followed by the number.
# The number is zero-padded to three digits because code page 37 is
# registered as the codec "cp037"; "cp37" is not a known codec name.
# Written as dict(<generator>) rather than a dict comprehension so the
# module keeps working on Python 2.6.
_CODEPAGES_BY_NUMBER = dict(
    (number, "cp%03d" % number)
    for number in (
        37, 424, 437, 500, 737, 775, 850, 852, 855, 856, 857, 860, 861, 862,
        863, 864, 865, 866, 869, 874, 875, 932, 949, 950, 1006, 1026, 1140,
        1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258,
    )
)

# Code pages whose codec name cannot be derived from the number.
# Still incomplete.  Note that 10006 is Mac Greek and 10007 is Mac Cyrillic.
_CODEPAGES_BY_NUMBER.update({
    936: "gbk",
    1361: "johab",
    10000: "mac-roman",
    10006: "mac-greek",
    10007: "mac-cyrillic",
    10029: "mac-latin2",
    10079: "mac-iceland",
    10081: "mac-turkish",
    65001: "utf-8",
})
7888
7989
8090class BackslashEscape (Exception ):
@@ -107,8 +117,9 @@ def go(self):
107117
108118 self .source .seek (0 )
109119
110- self .group = Group ()
111120 self .charsetTable = None
121+ self .charset = 'cp1252'
122+ self .group = Group (self )
112123 self .stack = [self .group ]
113124 self .parse ()
114125 return self .build ()
@@ -124,14 +135,15 @@ def parse(self):
124135 if next in '\r \n ' :
125136 continue
126137 if next == '{' :
127- subGroup = Group (self .group , self .charsetTable )
138+ subGroup = Group (self , self .group , self .charsetTable )
128139 self .stack .append (subGroup )
129140 self .group = subGroup
130141 elif next == '}' :
131142 subGroup = self .stack .pop ()
132143 self .group = self .stack [- 1 ]
133144
134145 subGroup .finalize ()
146+
135147 if subGroup .specialMeaning == 'FONT_TABLE' :
136148 self .charsetTable = subGroup .charsetTable
137149 self .group .content .append (subGroup )
@@ -331,15 +343,16 @@ def handle_ReadableMarker(self, marker):
331343
332344class Group (object ):
333345
334- def __init__ (self , parent = None , charsetTable = None ):
346+ def __init__ (self , reader , parent = None , charsetTable = None ):
347+ self .reader = reader
335348 self .parent = parent
336349
337350 if parent :
338351 self .props = parent .props .copy ()
339352 self .charset = self .parent .charset
340353 else :
341354 self .props = {}
342- self .charset = 'cp1252' # ?
355+ self .charset = self . reader . charset
343356
344357 self .specialMeaning = None
345358 self .skip = False
@@ -416,6 +429,20 @@ def flatten(self):
416429 return stuff
417430
418431
432+ # Header stuff
433+ def handle_ansi (self ): self .charset = self .reader .charset = 'cp1252'
434+ def handle_mac (self ): self .charset = self .reader .charset = 'mac-roman'
435+ def handle_pc (self ): self .charset = self .reader .charset = 'cp437'
436+ def handle_pca (self ): self .charset = self .reader .charset = 'cp850'
437+
438+ def handle_ansicpg (self , codepage ):
439+ codepage = int (codepage )
440+ if codepage in _CODEPAGES_BY_NUMBER :
441+ self .charset = self .reader .charset = _CODEPAGES_BY_NUMBER [codepage ]
442+ else :
443+ raise ValueError ("Unknown codepage %s" % codepage )
444+
445+
419446 def handle_fonttbl (self ):
420447 self .specialMeaning = 'FONT_TABLE'
421448 self .charsetTable = {}
@@ -449,7 +476,7 @@ def handle_ansi_escape(self, code):
449476 if isinstance (self .charset , dict ):
450477 uni_code = self .charset .get (code )
451478 if uni_code is None :
452- char = '?'
479+ char = u '?'
453480 else :
454481 char = unichr (uni_code )
455482
@@ -458,7 +485,8 @@ def handle_ansi_escape(self, code):
458485 try :
459486 char = chr (code ).decode (self .charset )
460487 except UnicodeDecodeError :
461- char = '?'
488+ print code , self .charset
489+ char = u'?'
462490
463491 self .content .append (char )
464492
0 commit comments