@@ -307,7 +307,7 @@ def test_invalid_encoding2(self):
307
307
self .assertEqual (og , {"og:title" : "ÿÿ Foo" , "og:description" : "Some text." })
308
308
309
309
def test_windows_1252 (self ):
310
- """A body which uses windows-1252 , but doesn't declare that."""
310
+ """A body which uses cp1252 , but doesn't declare that."""
311
311
html = b"""
312
312
<html>
313
313
<head><title>\xf3 </title></head>
@@ -333,7 +333,7 @@ def test_meta_charset(self):
333
333
""" ,
334
334
"text/html" ,
335
335
)
336
- self .assertEqual (list (encodings ), ["ascii" , "utf-8" , "windows-1252 " ])
336
+ self .assertEqual (list (encodings ), ["ascii" , "utf-8" , "cp1252 " ])
337
337
338
338
# A less well-formed version.
339
339
encodings = get_html_media_encodings (
@@ -345,7 +345,7 @@ def test_meta_charset(self):
345
345
""" ,
346
346
"text/html" ,
347
347
)
348
- self .assertEqual (list (encodings ), ["ascii" , "utf-8" , "windows-1252 " ])
348
+ self .assertEqual (list (encodings ), ["ascii" , "utf-8" , "cp1252 " ])
349
349
350
350
def test_meta_charset_underscores (self ):
351
351
"""A character encoding contains underscore."""
@@ -358,7 +358,7 @@ def test_meta_charset_underscores(self):
358
358
""" ,
359
359
"text/html" ,
360
360
)
361
- self .assertEqual (list (encodings ), ["shift_jis" , "utf-8" , "windows-1252 " ])
361
+ self .assertEqual (list (encodings ), ["shift_jis" , "utf-8" , "cp1252 " ])
362
362
363
363
def test_xml_encoding (self ):
364
364
"""A character encoding is found via the meta tag."""
@@ -370,7 +370,7 @@ def test_xml_encoding(self):
370
370
""" ,
371
371
"text/html" ,
372
372
)
373
- self .assertEqual (list (encodings ), ["ascii" , "utf-8" , "windows-1252 " ])
373
+ self .assertEqual (list (encodings ), ["ascii" , "utf-8" , "cp1252 " ])
374
374
375
375
def test_meta_xml_encoding (self ):
376
376
"""Meta tags take precedence over XML encoding."""
@@ -384,7 +384,7 @@ def test_meta_xml_encoding(self):
384
384
""" ,
385
385
"text/html" ,
386
386
)
387
- self .assertEqual (list (encodings ), ["utf-16" , "ascii" , "utf-8" , "windows-1252 " ])
387
+ self .assertEqual (list (encodings ), ["utf-16" , "ascii" , "utf-8" , "cp1252 " ])
388
388
389
389
def test_content_type (self ):
390
390
"""A character encoding is found via the Content-Type header."""
@@ -399,12 +399,12 @@ def test_content_type(self):
399
399
)
400
400
for header in headers :
401
401
encodings = get_html_media_encodings (b"" , header )
402
- self .assertEqual (list (encodings ), ["ascii" , "utf-8" , "windows-1252 " ])
402
+ self .assertEqual (list (encodings ), ["ascii" , "utf-8" , "cp1252 " ])
403
403
404
404
def test_fallback (self ):
405
405
"""A character encoding cannot be found in the body or header."""
406
406
encodings = get_html_media_encodings (b"" , "text/html" )
407
- self .assertEqual (list (encodings ), ["utf-8" , "windows-1252 " ])
407
+ self .assertEqual (list (encodings ), ["utf-8" , "cp1252 " ])
408
408
409
409
def test_duplicates (self ):
410
410
"""Ensure each encoding is only attempted once."""
@@ -418,4 +418,17 @@ def test_duplicates(self):
418
418
""" ,
419
419
'text/html; charset="UTF_8"' ,
420
420
)
421
- self .assertEqual (list (encodings ), ["utf-8" , "windows-1252" ])
421
+ self .assertEqual (list (encodings ), ["utf-8" , "cp1252" ])
422
+
423
def test_unknown_invalid(self):
    """An unknown or invalid declared charset is ignored.

    Both the ``<meta charset>`` tag and the Content-Type header declare
    ``invalid``; the expected candidates are just ``utf-8`` and ``cp1252``.
    """
    # NOTE(review): leading whitespace inside this literal is not visible in
    # the diff view this was taken from -- confirm against the real file.
    body = b"""
<html>
<head><meta charset="invalid">
</head>
</html>
"""
    content_type = 'text/html; charset="invalid"'

    candidates = list(get_html_media_encodings(body, content_type))

    self.assertEqual(candidates, ["utf-8", "cp1252"])
0 commit comments