1
1
from __future__ import annotations
2
2
3
3
import codecs
4
- import unittest
5
4
from typing import Any
6
5
7
6
from w3lib .encoding import (
14
13
)
15
14
16
15
17
- class RequestEncodingTests ( unittest . TestCase ) :
16
+ class TestRequestEncoding :
18
17
utf8_fragments = [
19
18
# Content-Type as meta http-equiv
20
19
b"""<meta http-equiv="content-type" content="text/html;charset=UTF-8" />""" ,
@@ -44,84 +43,84 @@ def test_bom(self):
44
43
assert bom_encoding is not None
45
44
assert bom is not None
46
45
decoded = string [len (bom ) :].decode (bom_encoding )
47
- self . assertEqual ( water_unicode , decoded )
46
+ assert water_unicode == decoded
48
47
# Body without BOM
49
48
enc , bom = read_bom (b"foo" )
50
- self . assertEqual ( enc , None )
51
- self . assertEqual ( bom , None )
49
+ assert enc is None
50
+ assert bom is None
52
51
# Empty body
53
52
enc , bom = read_bom (b"" )
54
- self . assertEqual ( enc , None )
55
- self . assertEqual ( bom , None )
53
+ assert enc is None
54
+ assert bom is None
56
55
57
56
def test_http_encoding_header (self ):
58
57
header_value = "Content-Type: text/html; charset=ISO-8859-4"
59
58
extracted = http_content_type_encoding (header_value )
60
- self . assertEqual ( extracted , "iso8859-4" )
61
- self . assertEqual ( None , http_content_type_encoding ("something else" ))
59
+ assert extracted == "iso8859-4"
60
+ assert http_content_type_encoding ("something else" ) is None
62
61
63
62
def test_html_body_declared_encoding (self ):
64
63
for fragment in self .utf8_fragments :
65
64
encoding = html_body_declared_encoding (fragment )
66
- self .assertEqual (encoding , "utf-8" , fragment )
67
- self .assertEqual (None , html_body_declared_encoding (b"something else" ))
68
- self .assertEqual (
69
- None ,
65
+ assert encoding == "utf-8" , fragment
66
+ assert None is html_body_declared_encoding (b"something else" )
67
+ assert (
70
68
html_body_declared_encoding (
71
69
b"""
72
70
<head></head><body>
73
71
this isn't searched
74
72
<meta charset="utf-8">
75
73
"""
76
- ),
74
+ )
75
+ is None
77
76
)
78
- self .assertEqual (
79
- None ,
77
+ assert (
80
78
html_body_declared_encoding (
81
79
b"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""
82
- ),
80
+ )
81
+ is None
83
82
)
84
83
85
84
def test_html_body_declared_encoding_unicode (self ):
86
85
# html_body_declared_encoding should work when unicode body is passed
87
- self . assertEqual ( None , html_body_declared_encoding ("something else" ))
86
+ assert html_body_declared_encoding ("something else" ) is None
88
87
89
88
for fragment in self .utf8_fragments :
90
89
encoding = html_body_declared_encoding (fragment .decode ("utf8" ))
91
- self . assertEqual ( encoding , "utf-8" , fragment )
90
+ assert encoding == "utf-8" , fragment
92
91
93
- self .assertEqual (
94
- None ,
92
+ assert (
95
93
html_body_declared_encoding (
96
94
"""
97
95
<head></head><body>
98
96
this isn't searched
99
97
<meta charset="utf-8">
100
98
"""
101
- ),
99
+ )
100
+ is None
102
101
)
103
- self .assertEqual (
104
- None ,
102
+ assert (
105
103
html_body_declared_encoding (
106
104
"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""
107
- ),
105
+ )
106
+ is None
108
107
)
109
108
110
109
111
- class CodecsEncodingTestCase ( unittest . TestCase ) :
110
+ class TestCodecsEncoding :
112
111
def test_resolve_encoding (self ):
113
- self . assertEqual ( resolve_encoding ("latin1" ), "cp1252" )
114
- self . assertEqual ( resolve_encoding (" Latin-1" ), "cp1252" )
115
- self . assertEqual ( resolve_encoding ("gb_2312-80" ), "gb18030" )
116
- self . assertEqual ( resolve_encoding ("unknown encoding" ), None )
112
+ assert resolve_encoding ("latin1" ) == "cp1252"
113
+ assert resolve_encoding (" Latin-1" ) == "cp1252"
114
+ assert resolve_encoding ("gb_2312-80" ) == "gb18030"
115
+ assert resolve_encoding ("unknown encoding" ) is None
117
116
118
117
119
- class UnicodeDecodingTestCase ( unittest . TestCase ) :
118
+ class TestUnicodeDecoding :
120
119
def test_utf8 (self ):
121
- self . assertEqual ( to_unicode (b"\xc2 \xa3 " , "utf-8" ), "\xa3 " )
120
+ assert to_unicode (b"\xc2 \xa3 " , "utf-8" ) == "\xa3 "
122
121
123
122
def test_invalid_utf8 (self ):
124
- self . assertEqual ( to_unicode (b"\xc2 \xc2 \xa3 " , "utf-8" ), "\ufffd \xa3 " )
123
+ assert to_unicode (b"\xc2 \xc2 \xa3 " , "utf-8" ) == "\ufffd \xa3 "
125
124
126
125
127
126
def ct (charset : str | None ) -> str | None :
@@ -132,14 +131,14 @@ def norm_encoding(enc: str) -> str:
132
131
return codecs .lookup (enc ).name
133
132
134
133
135
- class HtmlConversionTests ( unittest . TestCase ) :
134
+ class TestHtmlConversion :
136
135
def test_unicode_body (self ):
137
136
unicode_string = "\u043a \u0438 \u0440 \u0438 \u043b \u043b \u0438 \u0447 \u0435 \u0441 \u043a \u0438 \u0439 \u0442 \u0435 \u043a \u0441 \u0442 "
138
137
original_string = unicode_string .encode ("cp1251" )
139
138
encoding , body_unicode = html_to_unicode (ct ("cp1251" ), original_string )
140
139
# check body_as_unicode
141
- self . assertTrue ( isinstance (body_unicode , str ) )
142
- self . assertEqual ( body_unicode , unicode_string )
140
+ assert isinstance (body_unicode , str )
141
+ assert body_unicode == unicode_string
143
142
144
143
def _assert_encoding (
145
144
self ,
@@ -150,15 +149,14 @@ def _assert_encoding(
150
149
) -> None :
151
150
assert not isinstance (body , str )
152
151
encoding , body_unicode = html_to_unicode (ct (content_type ), body )
153
- self . assertTrue ( isinstance (body_unicode , str ) )
154
- self . assertEqual ( norm_encoding (encoding ), norm_encoding (expected_encoding ) )
152
+ assert isinstance (body_unicode , str )
153
+ assert norm_encoding (encoding ) == norm_encoding (expected_encoding )
155
154
156
155
if isinstance (expected_unicode , str ):
157
- self . assertEqual ( body_unicode , expected_unicode )
156
+ assert body_unicode == expected_unicode
158
157
else :
159
- self .assertTrue (
160
- body_unicode in expected_unicode ,
161
- f"{ body_unicode } is not in { expected_unicode } " ,
158
+ assert body_unicode in expected_unicode , (
159
+ f"{ body_unicode } is not in { expected_unicode } "
162
160
)
163
161
164
162
def test_content_type_and_conversion (self ):
@@ -227,8 +225,8 @@ def _assert_encoding_detected(
227
225
) -> None :
228
226
assert not isinstance (body , str )
229
227
encoding , body_unicode = html_to_unicode (ct (content_type ), body , ** kwargs )
230
- self . assertTrue ( isinstance (body_unicode , str ) )
231
- self . assertEqual ( norm_encoding (encoding ), norm_encoding (expected_encoding ) )
228
+ assert isinstance (body_unicode , str )
229
+ assert norm_encoding (encoding ) == norm_encoding (expected_encoding )
232
230
233
231
def test_BOM (self ):
234
232
# utf-16 cases already tested, as is the BOM detection function
0 commit comments