
Commit

Add duplicate page audit.
sethblack committed Feb 11, 2020
1 parent cb6260d commit 19e91a7
Showing 3 changed files with 12 additions and 1 deletion.
2 changes: 2 additions & 0 deletions seoanalyzer/analyzer.py
@@ -19,6 +19,8 @@ def calc_total_time():
     for p in site.crawled_pages:
         output['pages'].append(p.talk())
 
+    output['duplicate_pages'] = [list(site.content_hashes[p]) for p in site.content_hashes if len(site.content_hashes[p]) > 1]
+
     sorted_words = sorted(site.wordcount.items(), key=itemgetter(1), reverse=True)
     sorted_bigrams = sorted(site.bigrams.items(), key=itemgetter(1), reverse=True)
     sorted_trigrams = sorted(site.trigrams.items(), key=itemgetter(1), reverse=True)
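
For context, the new duplicate_pages entry collapses the site's hash-to-URLs map into groups of URLs whose raw HTML hashed identically. A minimal sketch of that transformation, with made-up digests and URLs standing in for a real crawl:

from collections import defaultdict

# Hypothetical state of site.content_hashes after a crawl: each key is
# a SHA-1 hex digest, mapping to the set of URLs that produced it.
content_hashes = defaultdict(set)
content_hashes['digest-1'] = {'https://example.com/a', 'https://example.com/a?ref=nav'}
content_hashes['digest-2'] = {'https://example.com/b'}

# The same comprehension as the diff: keep only hashes shared by 2+ URLs.
duplicate_pages = [list(content_hashes[h]) for h in content_hashes
                   if len(content_hashes[h]) > 1]
# -> [['https://example.com/a', 'https://example.com/a?ref=nav']] (set order varies)
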
7 changes: 6 additions & 1 deletion seoanalyzer/page.py
@@ -1,5 +1,6 @@
-import re
+import hashlib
 import json
+import re
 
 from bs4 import BeautifulSoup
 from collections import Counter
@@ -86,6 +87,7 @@ def __init__(self, url='', base_domain=''):
         self.bigrams = Counter()
         self.trigrams = Counter()
         self.stem_to_word = {}
+        self.content_hash = None
 
     def talk(self):
         """
@@ -102,6 +104,7 @@ def talk(self):
             'trigrams': self.trigrams,
             'warnings': self.warnings,
             'social': self.social,
+            'content_hash': self.content_hash
         }
 
     def populate(self, bs):
@@ -168,6 +171,8 @@ def analyze(self, raw_html=None):
         else:
             raw_html = page.data.decode('utf-8')
 
+        self.content_hash = hashlib.sha1(raw_html.encode('utf-8')).hexdigest()
+
         # remove comments, they screw with BeautifulSoup
         clean_html = re.sub(r'<!--.*?-->', r'', raw_html, flags=re.DOTALL)
 
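
The hash itself is just the SHA-1 hex digest of the page's raw HTML, computed before comments are stripped out for BeautifulSoup. A quick sketch of the stdlib call the diff relies on:

import hashlib

raw_html = '<html><body><p>hello</p></body></html>'
content_hash = hashlib.sha1(raw_html.encode('utf-8')).hexdigest()
# A 40-character hex string. Two pages get the same digest only when
# their raw HTML is byte-for-byte identical, so pages differing by a
# timestamp or nonce will not be flagged as duplicates.
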
4 changes: 4 additions & 0 deletions seoanalyzer/website.py
@@ -1,4 +1,5 @@
 from collections import Counter
+from collections import defaultdict
 from urllib.parse import urlsplit
 from xml.dom import minidom
 
@@ -17,6 +18,7 @@ def __init__(self, base_url, sitemap):
         self.wordcount = Counter()
         self.bigrams = Counter()
         self.trigrams = Counter()
+        self.content_hashes = defaultdict(set)
 
     def check_dns(self, url_to_check):
         try:
@@ -62,6 +64,8 @@ def crawl(self):
 
             page.analyze()
 
+            self.content_hashes[page.content_hash].add(page.url)
+
             for w in page.wordcount:
                 self.wordcount[w] += page.wordcount[w]
 
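
Taken together, the three changes form one pipeline: each crawled page hashes its raw HTML, the website groups URLs by hash in a defaultdict(set), and the analyzer reports every group with more than one URL. A condensed, self-contained sketch of that flow, with a hypothetical find_duplicate_pages helper and a plain dict of url -> html standing in for the real crawl:

import hashlib
from collections import defaultdict

def find_duplicate_pages(pages):
    """pages maps URL -> raw HTML; a stand-in for Website.crawl()."""
    content_hashes = defaultdict(set)
    for url, raw_html in pages.items():
        # Same hashing step as Page.analyze() in the diff above.
        digest = hashlib.sha1(raw_html.encode('utf-8')).hexdigest()
        content_hashes[digest].add(url)
    # Same filter as analyzer.py: only hashes shared by 2+ URLs.
    return [list(urls) for urls in content_hashes.values() if len(urls) > 1]

pages = {
    'https://example.com/': '<html>same</html>',
    'https://example.com/index.html': '<html>same</html>',
    'https://example.com/about': '<html>different</html>',
}
print(find_duplicate_pages(pages))
# [['https://example.com/', 'https://example.com/index.html']] (order may vary)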
