Skip to content

Commit

Permalink
Merge pull request sethblack#51 from Benoss/patch-1
Browse files — browse the repository at this point in the history
Give the ability to send the raw html
  • Loading branch information
sethblack authored Feb 5, 2020
2 parents d3a7602 + df35e6c commit f0c1eb8
Showing 1 changed file with 28 additions and 27 deletions.
55 changes: 28 additions & 27 deletions seoanalyzer/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,43 +121,44 @@ def populate(self, bs):
if len(keywords) > 0:
self.warn(f'Keywords should be avoided as they are a spam indicator and no longer used by Search Engines: {keywords}')

def analyze(self):
def analyze(self, raw_html=None):
"""
Analyze the page and populate the warnings list
"""

if not raw_html:
valid_prefixes = []

valid_prefixes = []
# only allow http:// https:// and //
for s in ['http://', 'https://', '//',]:
valid_prefixes.append(self.url.startswith(s))

# only allow http:// https:// and //
for s in ['http://', 'https://', '//',]:
valid_prefixes.append(self.url.startswith(s))

if True not in valid_prefixes:
self.warn(f'{self.url} does not appear to have a valid protocol.')
return
if True not in valid_prefixes:
self.warn(f'{self.url} does not appear to have a valid protocol.')
return

if self.url.startswith('//'):
self.url = f'{self.base_domain.scheme}:{self.url}'
if self.url.startswith('//'):
self.url = f'{self.base_domain.scheme}:{self.url}'

try:
page = http.get(self.url)
except requests.exceptions.HTTPError as e:
self.warn(f'Returned {page.status_code}')
return
try:
page = http.get(self.url)
except requests.exceptions.HTTPError as e:
self.warn(f'Returned {page.status_code}')
return

encoding = 'ascii'
encoding = 'ascii'

if 'content-type' in page.headers:
encoding = page.headers['content-type'].split('charset=')[-1]
if 'content-type' in page.headers:
encoding = page.headers['content-type'].split('charset=')[-1]

if encoding.lower() not in ('text/html', 'text/plain', 'utf-8'):
try:
raw_html = unicode(page.read(), encoding)
except:
self.warn(f'Can not read {encoding}')
return
else:
raw_html = page.data.decode('utf-8')
if encoding.lower() not in ('text/html', 'text/plain', 'utf-8'):
try:
raw_html = unicode(page.read(), encoding)
except:
self.warn(f'Can not read {encoding}')
return
else:
raw_html = page.data.decode('utf-8')

# remove comments, they screw with BeautifulSoup
clean_html = re.sub(r'<!--.*?-->', r'', raw_html, flags=re.DOTALL)
Expand Down

0 comments on commit f0c1eb8

Please sign in to comment.