Skip to content

Commit

Permalink
much better encoding detection
Browse files Browse the repository at this point in the history
Signed-off-by: Kenneth Reitz <me@kennethreitz.org>
  • Loading branch information
kennethreitz committed Feb 26, 2018
1 parent 53d1e2a commit 8f30583
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 16 deletions.
3 changes: 2 additions & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ pyquery = "*"
fake-useragent = "*"
parse = "*"
"bs4" = "*"
"PyQt5" = "*"
"pyqt5" = "*"
"w3lib" = "*"


[dev-packages]
Expand Down
16 changes: 15 additions & 1 deletion Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 2 additions & 13 deletions requests_html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import sys
from urllib.parse import urlparse, urlunparse

import requests
Expand All @@ -9,6 +8,7 @@
from lxml.html.soupparser import fromstring
from parse import search as parse_search
from parse import findall
from w3lib.encoding import html_to_unicode

try:
from PyQt5.QtWidgets import QApplication
Expand Down Expand Up @@ -50,18 +50,7 @@ def encoding(self):
return self._encoding

# Scan meta tags for charset.
for meta_tag in self.find('meta', _encoding=self.default_encoding):

# HTML 5 support.
if 'charset' in meta_tag.attrs:
self._encoding = meta_tag.attrs['charset']

# HTML 4 support.
if 'content' in meta_tag.attrs:
try:
self._encoding = meta_tag.attrs['content'].split('charset=')[1]
except IndexError:
pass
self._encoding = html_to_unicode(self.default_encoding, self.html)[0]

return self._encoding if self._encoding else self.default_encoding

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

# What packages are required for this module to be executed?
REQUIRED = [
'requests', 'pyquery', 'fake-useragent', 'parse', 'bs4'
'requests', 'pyquery', 'fake-useragent', 'parse', 'bs4', 'w3lib'
]

# The rest you shouldn't have to touch too much :)
Expand Down

0 comments on commit 8f30583

Please sign in to comment.