Merge pull request sethblack#51 from Benoss/patch-1

Add the ability to pass raw HTML directly to analyze() instead of fetching it from the URL
dotdesh71 · Feb 5, 2020 · f0c1eb8 · f0c1eb8
2 parents d3a7602 + df35e6c
commit f0c1eb8
Showing 1 changed file with 28 additions and 27 deletions.
diff --git a/seoanalyzer/page.py b/seoanalyzer/page.py
@@ -121,43 +121,44 @@ def populate(self, bs):
         if len(keywords) > 0:
             self.warn(f'Keywords should be avoided as they are a spam indicator and no longer used by Search Engines: {keywords}')
 
-    def analyze(self):
+    def analyze(self, raw_html=None):
         """
         Analyze the page and populate the warnings list
         """
+
+        if not raw_html:
+            valid_prefixes = []
 
-        valid_prefixes = []
+            # only allow http:// https:// and //
+            for s in ['http://', 'https://', '//',]:
+                valid_prefixes.append(self.url.startswith(s))
 
-        # only allow http:// https:// and //
-        for s in ['http://', 'https://', '//',]:
-            valid_prefixes.append(self.url.startswith(s))
-
-        if True not in valid_prefixes:
-            self.warn(f'{self.url} does not appear to have a valid protocol.')
-            return
+            if True not in valid_prefixes:
+                self.warn(f'{self.url} does not appear to have a valid protocol.')
+                return
 
-        if self.url.startswith('//'):
-            self.url = f'{self.base_domain.scheme}:{self.url}'
+            if self.url.startswith('//'):
+                self.url = f'{self.base_domain.scheme}:{self.url}'
 
-        try:
-            page = http.get(self.url)
-        except requests.exceptions.HTTPError as e:
-            self.warn(f'Returned {page.status_code}')
-            return
+            try:
+                page = http.get(self.url)
+            except requests.exceptions.HTTPError as e:
+                self.warn(f'Returned {page.status_code}')
+                return
 
-        encoding = 'ascii'
+            encoding = 'ascii'
 
-        if 'content-type' in page.headers:
-            encoding = page.headers['content-type'].split('charset=')[-1]
+            if 'content-type' in page.headers:
+                encoding = page.headers['content-type'].split('charset=')[-1]
 
-        if encoding.lower() not in ('text/html', 'text/plain', 'utf-8'):
-            try:
-                raw_html = unicode(page.read(), encoding)
-            except:
-                self.warn(f'Can not read {encoding}')
-                return
-        else:
-            raw_html = page.data.decode('utf-8')
+            if encoding.lower() not in ('text/html', 'text/plain', 'utf-8'):
+                try:
+                    raw_html = unicode(page.read(), encoding)
+                except:
+                    self.warn(f'Can not read {encoding}')
+                    return
+            else:
+                raw_html = page.data.decode('utf-8')
 
         # remove comments, they screw with BeautifulSoup
         clean_html = re.sub(r'<!--.*?-->', r'', raw_html, flags=re.DOTALL)