Skip to content

Remove Python 2.7 and 3.5 support, add Python 3.9 support #168

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Mar 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ jobs:
- python-version: 3.7
env:
TOXENV: docs
- python-version: 3.8
- python-version: 3.9
env:
TOXENV: flake8
- python-version: 3.8
- python-version: 3.9
env:
TOXENV: pylint
- python-version: 3.8
- python-version: 3.9
env:
TOXENV: security

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ jobs:
steps:
- uses: actions/checkout@v2

- name: Set up Python 3.8
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.8
python-version: 3.9

- name: Check Tag
id: check-release-tag
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: [2.7, 3.5, 3.6, 3.7, 3.8, pypy3]
python-version: [3.6, 3.7, 3.8, 3.9, pypy3]

steps:
- uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ This is a Python library of web-related functions, such as:
Requirements
============

Python 2.7 or Python 3.5+
Python 3.6+

Install
=======
Expand Down
18 changes: 8 additions & 10 deletions docs/conf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-
#
# w3lib documentation build configuration file, created by
# sphinx-quickstart on Sun Jan 26 22:19:38 2014.
#
Expand Down Expand Up @@ -47,8 +45,8 @@
master_doc = 'index'

# General information about the project.
project = u'w3lib'
copyright = u'2014, w3lib developers'
project = 'w3lib'
copyright = '2014, w3lib developers'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
Expand Down Expand Up @@ -190,8 +188,8 @@
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
('index', 'w3lib.tex', u'w3lib Documentation',
u'w3lib developers', 'manual'),
('index', 'w3lib.tex', 'w3lib Documentation',
'w3lib developers', 'manual'),
]

# The name of an image file (relative to this directory) to place at the top of
Expand Down Expand Up @@ -220,8 +218,8 @@
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'w3lib', u'w3lib Documentation',
[u'w3lib developers'], 1)
('index', 'w3lib', 'w3lib Documentation',
['w3lib developers'], 1)
]

# If true, show URL addresses after external links.
Expand All @@ -234,8 +232,8 @@
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'w3lib', u'w3lib Documentation',
u'w3lib developers', 'w3lib', 'One line description of project.',
('index', 'w3lib', 'w3lib Documentation',
'w3lib developers', 'w3lib', 'One line description of project.',
'Miscellaneous'),
]

Expand Down
2 changes: 1 addition & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Modules
Requirements
============

Python 2.7 or Python 3.3+
Python 3.6+

Install
=======
Expand Down
2 changes: 2 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
[pytest]
doctest_optionflags = ALLOW_UNICODE ALLOW_BYTES
flake8-ignore =
W503 # https://www.flake8rules.com/rules/W503.html

docs/conf.py E121 E122 E265 E401 E501
tests/test_encoding.py E128 E221 E241 E302 E401 E501 E731
tests/test_form.py E265 E501
Expand Down
5 changes: 1 addition & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,13 @@
'License :: OSI Approved :: BSD License',
'Operating System :: OS Independent',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: Implementation :: CPython',
'Programming Language :: Python :: Implementation :: PyPy',
'Topic :: Internet :: WWW/HTTP',
],
install_requires=['six >= 1.4.1'],
)
2 changes: 0 additions & 2 deletions stdeb.cfg

This file was deleted.

5 changes: 0 additions & 5 deletions tests/py3-ignores.txt

This file was deleted.

111 changes: 59 additions & 52 deletions tests/test_encoding.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
import unittest, codecs
import six
from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode,
http_content_type_encoding, resolve_encoding, html_to_unicode)
import codecs
import unittest

from w3lib.encoding import (
html_body_declared_encoding,
http_content_type_encoding,
html_to_unicode,
read_bom,
resolve_encoding,
to_unicode,
)

class RequestEncodingTests(unittest.TestCase):
utf8_fragments = [
Expand All @@ -22,7 +29,7 @@ class RequestEncodingTests(unittest.TestCase):

def test_bom(self):
# cjk water character in unicode
water_unicode = u'\u6C34'
water_unicode = '\u6C34'
# BOM + water character encoded
utf16be = b'\xfe\xff\x6c\x34'
utf16le = b'\xff\xfe\x34\x6c'
Expand Down Expand Up @@ -62,19 +69,19 @@ def test_html_body_declared_encoding(self):

def test_html_body_declared_encoding_unicode(self):
# html_body_declared_encoding should work when unicode body is passed
self.assertEqual(None, html_body_declared_encoding(u"something else"))
self.assertEqual(None, html_body_declared_encoding("something else"))

for fragment in self.utf8_fragments:
encoding = html_body_declared_encoding(fragment.decode('utf8'))
self.assertEqual(encoding, 'utf-8', fragment)

self.assertEqual(None, html_body_declared_encoding(u"""
self.assertEqual(None, html_body_declared_encoding("""
<head></head><body>
this isn't searched
<meta charset="utf-8">
"""))
self.assertEqual(None, html_body_declared_encoding(
u"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""))
"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""))


class CodecsEncodingTestCase(unittest.TestCase):
Expand All @@ -88,10 +95,10 @@ def test_resolve_encoding(self):
class UnicodeDecodingTestCase(unittest.TestCase):

def test_utf8(self):
self.assertEqual(to_unicode(b'\xc2\xa3', 'utf-8'), u'\xa3')
self.assertEqual(to_unicode(b'\xc2\xa3', 'utf-8'), '\xa3')

def test_invalid_utf8(self):
self.assertEqual(to_unicode(b'\xc2\xc2\xa3', 'utf-8'), u'\ufffd\xa3')
self.assertEqual(to_unicode(b'\xc2\xc2\xa3', 'utf-8'), '\ufffd\xa3')


def ct(charset):
Expand All @@ -103,22 +110,22 @@ def norm_encoding(enc):
class HtmlConversionTests(unittest.TestCase):

def test_unicode_body(self):
unicode_string = u'\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
unicode_string = '\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
original_string = unicode_string.encode('cp1251')
encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string)
# check body_as_unicode
self.assertTrue(isinstance(body_unicode, six.text_type))
self.assertTrue(isinstance(body_unicode, str))
self.assertEqual(body_unicode, unicode_string)

def _assert_encoding(self, content_type, body, expected_encoding,
expected_unicode):
assert not isinstance(body, six.text_type)
assert not isinstance(body, str)
encoding, body_unicode = html_to_unicode(ct(content_type), body)
self.assertTrue(isinstance(body_unicode, six.text_type))
self.assertTrue(isinstance(body_unicode, str))
self.assertEqual(norm_encoding(encoding),
norm_encoding(expected_encoding))

if isinstance(expected_unicode, six.string_types):
if isinstance(expected_unicode, str):
self.assertEqual(body_unicode, expected_unicode)
else:
self.assertTrue(
Expand All @@ -130,23 +137,23 @@ def test_content_type_and_conversion(self):
"""Test content type header is interpreted and text converted as
expected
"""
self._assert_encoding('utf-8', b"\xc2\xa3", 'utf-8', u"\xa3")
self._assert_encoding('utf-8', b"\xc2\xa3", 'utf-8', "\xa3")
# something like this in the scrapy tests - but that's invalid?
# self._assert_encoding('', "\xa3", 'utf-8', u"\xa3")
# self._assert_encoding('', "\xa3", 'utf-8', "\xa3")
# iso-8859-1 is overridden to cp1252
self._assert_encoding('iso-8859-1', b"\xa3", 'cp1252', u"\xa3")
self._assert_encoding('', b"\xc2\xa3", 'utf-8', u"\xa3")
self._assert_encoding('none', b"\xc2\xa3", 'utf-8', u"\xa3")
self._assert_encoding('gb2312', b"\xa8D", 'gb18030', u"\u2015")
self._assert_encoding('gbk', b"\xa8D", 'gb18030', u"\u2015")
self._assert_encoding('big5', b"\xf9\xda", 'big5hkscs', u"\u6052")
self._assert_encoding('iso-8859-1', b"\xa3", 'cp1252', "\xa3")
self._assert_encoding('', b"\xc2\xa3", 'utf-8', "\xa3")
self._assert_encoding('none', b"\xc2\xa3", 'utf-8', "\xa3")
self._assert_encoding('gb2312', b"\xa8D", 'gb18030', "\u2015")
self._assert_encoding('gbk', b"\xa8D", 'gb18030', "\u2015")
self._assert_encoding('big5', b"\xf9\xda", 'big5hkscs', "\u6052")

def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
# unlike scrapy, the BOM is stripped
self._assert_encoding('utf-8', b"\xef\xbb\xbfWORD\xe3\xabWORD2",
'utf-8', u'WORD\ufffdWORD2')
'utf-8', 'WORD\ufffdWORD2')
self._assert_encoding(None, b"\xef\xbb\xbfWORD\xe3\xabWORD2",
'utf-8', u'WORD\ufffdWORD2')
'utf-8', 'WORD\ufffdWORD2')

def test_utf8_unexpected_end_of_data_with_valid_utf8_BOM(self):
# Python implementations handle unexpected end of UTF8 data
Expand All @@ -156,69 +163,69 @@ def test_utf8_unexpected_end_of_data_with_valid_utf8_BOM(self):

# unlike scrapy, the BOM is stripped
self._assert_encoding('utf-8', b"\xef\xbb\xbfWORD\xe3\xab",
'utf-8', [u'WORD\ufffd\ufffd', u'WORD\ufffd'])
'utf-8', ['WORD\ufffd\ufffd', 'WORD\ufffd'])
self._assert_encoding(None, b"\xef\xbb\xbfWORD\xe3\xab",
'utf-8', [u'WORD\ufffd\ufffd', u'WORD\ufffd'])
'utf-8', ['WORD\ufffd\ufffd', 'WORD\ufffd'])

def test_replace_wrong_encoding(self):
"""Test invalid chars are replaced properly"""
encoding, body_unicode = html_to_unicode(ct('utf-8'),
b'PREFIX\xe3\xabSUFFIX')
# XXX: Policy for replacing invalid chars may suffer minor variations
# but it should always contain the unicode replacement char (u'\ufffd')
assert u'\ufffd' in body_unicode, repr(body_unicode)
assert u'PREFIX' in body_unicode, repr(body_unicode)
assert u'SUFFIX' in body_unicode, repr(body_unicode)
# but it should always contain the unicode replacement char ('\ufffd')
assert '\ufffd' in body_unicode, repr(body_unicode)
assert 'PREFIX' in body_unicode, repr(body_unicode)
assert 'SUFFIX' in body_unicode, repr(body_unicode)

# Do not destroy html tags due to encoding bugs
encoding, body_unicode = html_to_unicode(ct('utf-8'),
b'\xf0<span>value</span>')
assert u'<span>value</span>' in body_unicode, repr(body_unicode)
assert '<span>value</span>' in body_unicode, repr(body_unicode)

def _assert_encoding_detected(self, content_type, expected_encoding, body,
**kwargs):
assert not isinstance(body, six.text_type)
assert not isinstance(body, str)
encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
self.assertTrue(isinstance(body_unicode, six.text_type))
self.assertTrue(isinstance(body_unicode, str))
self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))

def test_BOM(self):
# utf-16 cases already tested, as is the BOM detection function

# http header takes precedence, irrespective of BOM
bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
expected = u'\ufffd\ufffd\x00h\x00i'
bom_be_str = codecs.BOM_UTF16_BE + "hi".encode('utf-16-be')
expected = '\ufffd\ufffd\x00h\x00i'
self._assert_encoding('utf-8', bom_be_str, 'utf-8', expected)

# BOM is stripped when it agrees with the encoding, or used to
# determine encoding
bom_utf8_str = codecs.BOM_UTF8 + b'hi'
self._assert_encoding('utf-8', bom_utf8_str, 'utf-8', u"hi")
self._assert_encoding(None, bom_utf8_str, 'utf-8', u"hi")
self._assert_encoding('utf-8', bom_utf8_str, 'utf-8', "hi")
self._assert_encoding(None, bom_utf8_str, 'utf-8', "hi")

def test_utf16_32(self):
# tools.ietf.org/html/rfc2781 section 4.3

# USE BOM and strip it
bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
self._assert_encoding('utf-16', bom_be_str, 'utf-16-be', u"hi")
self._assert_encoding(None, bom_be_str, 'utf-16-be', u"hi")
bom_be_str = codecs.BOM_UTF16_BE + "hi".encode('utf-16-be')
self._assert_encoding('utf-16', bom_be_str, 'utf-16-be', "hi")
self._assert_encoding(None, bom_be_str, 'utf-16-be', "hi")

bom_le_str = codecs.BOM_UTF16_LE + u"hi".encode('utf-16-le')
self._assert_encoding('utf-16', bom_le_str, 'utf-16-le', u"hi")
self._assert_encoding(None, bom_le_str, 'utf-16-le', u"hi")
bom_le_str = codecs.BOM_UTF16_LE + "hi".encode('utf-16-le')
self._assert_encoding('utf-16', bom_le_str, 'utf-16-le', "hi")
self._assert_encoding(None, bom_le_str, 'utf-16-le', "hi")

bom_be_str = codecs.BOM_UTF32_BE + u"hi".encode('utf-32-be')
self._assert_encoding('utf-32', bom_be_str, 'utf-32-be', u"hi")
self._assert_encoding(None, bom_be_str, 'utf-32-be', u"hi")
bom_be_str = codecs.BOM_UTF32_BE + "hi".encode('utf-32-be')
self._assert_encoding('utf-32', bom_be_str, 'utf-32-be', "hi")
self._assert_encoding(None, bom_be_str, 'utf-32-be', "hi")

bom_le_str = codecs.BOM_UTF32_LE + u"hi".encode('utf-32-le')
self._assert_encoding('utf-32', bom_le_str, 'utf-32-le', u"hi")
self._assert_encoding(None, bom_le_str, 'utf-32-le', u"hi")
bom_le_str = codecs.BOM_UTF32_LE + "hi".encode('utf-32-le')
self._assert_encoding('utf-32', bom_le_str, 'utf-32-le', "hi")
self._assert_encoding(None, bom_le_str, 'utf-32-le', "hi")

# if there is no BOM, big endian should be chosen
self._assert_encoding('utf-16', u"hi".encode('utf-16-be'), 'utf-16-be', u"hi")
self._assert_encoding('utf-32', u"hi".encode('utf-32-be'), 'utf-32-be', u"hi")
self._assert_encoding('utf-16', "hi".encode('utf-16-be'), 'utf-16-be', "hi")
self._assert_encoding('utf-32', "hi".encode('utf-32-be'), 'utf-32-be', "hi")

def test_python_crash(self):
import random
Expand Down
Loading