From df35e6c46bb9e714c085b35b3e9e5c75f9914f2c Mon Sep 17 00:00:00 2001
From: Benoit Chabord
Date: Wed, 5 Feb 2020 14:43:37 +1300
Subject: [PATCH] Give the ability to send the raw html

Add an option to analyze with Raw HTML as a parameter
---
 seoanalyzer/page.py | 55 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/seoanalyzer/page.py b/seoanalyzer/page.py
index eaaef3d..296d908 100644
--- a/seoanalyzer/page.py
+++ b/seoanalyzer/page.py
@@ -121,43 +121,44 @@ def populate(self, bs):
         if len(keywords) > 0:
             self.warn(f'Keywords should be avoided as they are a spam indicator and no longer used by Search Engines: {keywords}')
 
-    def analyze(self):
+    def analyze(self, raw_html=None):
         """
         Analyze the page and populate the warnings list
         """
+
+        if not raw_html:
+            valid_prefixes = []
 
-        valid_prefixes = []
+            # only allow http:// https:// and //
+            for s in ['http://', 'https://', '//',]:
+                valid_prefixes.append(self.url.startswith(s))
 
-        # only allow http:// https:// and //
-        for s in ['http://', 'https://', '//',]:
-            valid_prefixes.append(self.url.startswith(s))
-
-        if True not in valid_prefixes:
-            self.warn(f'{self.url} does not appear to have a valid protocol.')
-            return
+            if True not in valid_prefixes:
+                self.warn(f'{self.url} does not appear to have a valid protocol.')
+                return
 
-        if self.url.startswith('//'):
-            self.url = f'{self.base_domain.scheme}:{self.url}'
+            if self.url.startswith('//'):
+                self.url = f'{self.base_domain.scheme}:{self.url}'
 
-        try:
-            page = http.get(self.url)
-        except requests.exceptions.HTTPError as e:
-            self.warn(f'Returned {page.status_code}')
-            return
+            try:
+                page = http.get(self.url)
+            except requests.exceptions.HTTPError as e:
+                self.warn(f'Returned {page.status_code}')
+                return
 
-        encoding = 'ascii'
+            encoding = 'ascii'
 
-        if 'content-type' in page.headers:
-            encoding = page.headers['content-type'].split('charset=')[-1]
+            if 'content-type' in page.headers:
+                encoding = page.headers['content-type'].split('charset=')[-1]
 
-        if encoding.lower() not in ('text/html', 'text/plain', 'utf-8'):
-            try:
-                raw_html = unicode(page.read(), encoding)
-            except:
-                self.warn(f'Can not read {encoding}')
-                return
-        else:
-            raw_html = page.data.decode('utf-8')
+            if encoding.lower() not in ('text/html', 'text/plain', 'utf-8'):
+                try:
+                    raw_html = unicode(page.read(), encoding)
+                except:
+                    self.warn(f'Can not read {encoding}')
+                    return
+            else:
+                raw_html = page.data.decode('utf-8')
 
         # remove comments, they screw with BeautifulSoup
         clean_html = re.sub(r'<!--.*?-->', r'', raw_html, flags=re.DOTALL)
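
Usage note (not part of the patch): a minimal sketch of how the new raw_html parameter could be exercised. It assumes the Page constructor still accepts url and base_domain keyword arguments and that warnings end up on page.warnings, as the docstring above suggests; neither of those is touched by this patch, so check seoanalyzer/page.py for the exact signatures.

    from seoanalyzer.page import Page

    # HTML obtained elsewhere: a crawler, a cache, a test fixture, ...
    raw_html = '<html><head><title>Example</title></head><body><p>Hello world</p></body></html>'

    # Assumed constructor arguments; this patch only changes analyze().
    page = Page(url='https://example.com/', base_domain='https://example.com/')

    # With raw_html supplied, analyze() skips protocol validation and the HTTP
    # fetch and parses the given markup directly; without it, the original
    # fetch-then-parse path runs unchanged.
    page.analyze(raw_html=raw_html)

    # Warnings collected by self.warn(); attribute name assumed from the
    # "populate the warnings list" docstring.
    print(page.warnings)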