
Commit

Add duplicate page audit.
sethblack committed Feb 11, 2020
1 parent cb6260d commit 19e91a7
Showing 3 changed files with 12 additions and 1 deletion.
2 changes: 2 additions & 0 deletions seoanalyzer/analyzer.py
@@ -19,6 +19,8 @@ def calc_total_time():
     for p in site.crawled_pages:
         output['pages'].append(p.talk())
 
+    output['duplicate_pages'] = [list(site.content_hashes[p]) for p in site.content_hashes if len(site.content_hashes[p]) > 1]
+
     sorted_words = sorted(site.wordcount.items(), key=itemgetter(1), reverse=True)
     sorted_bigrams = sorted(site.bigrams.items(), key=itemgetter(1), reverse=True)
     sorted_trigrams = sorted(site.trigrams.items(), key=itemgetter(1), reverse=True)
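
For context, the new duplicate_pages entry collapses the site's hash-to-URLs map into groups of URLs whose raw HTML hashed identically. A minimal sketch of that transformation, with made-up digests and URLs standing in for a real crawl:

from collections import defaultdict

# Hypothetical state of site.content_hashes after a crawl: each key is
# a SHA-1 hex digest, mapping to the set of URLs that produced it.
content_hashes = defaultdict(set)
content_hashes['digest-1'] = {'https://example.com/a', 'https://example.com/a?ref=nav'}
content_hashes['digest-2'] = {'https://example.com/b'}

# The same comprehension as the diff: keep only hashes shared by 2+ URLs.
duplicate_pages = [list(content_hashes[h]) for h in content_hashes
                   if len(content_hashes[h]) > 1]
# -> [['https://example.com/a', 'https://example.com/a?ref=nav']] (set order varies)
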
7 changes: 6 additions & 1 deletion seoanalyzer/page.py
@@ -1,5 +1,6 @@
-import re
+import hashlib
 import json
+import re
 
 from bs4 import BeautifulSoup
 from collections import Counter
@@ -86,6 +87,7 @@ def __init__(self, url='', base_domain=''):
         self.bigrams = Counter()
         self.trigrams = Counter()
         self.stem_to_word = {}
+        self.content_hash = None
 
     def talk(self):
         """
@@ -102,6 +104,7 @@ def talk(self):
             'trigrams': self.trigrams,
             'warnings': self.warnings,
             'social': self.social,
+            'content_hash': self.content_hash
         }
 
     def populate(self, bs):
@@ -168,6 +171,8 @@ def analyze(self, raw_html=None):
         else:
             raw_html = page.data.decode('utf-8')
 
+        self.content_hash = hashlib.sha1(raw_html.encode('utf-8')).hexdigest()
+
         # remove comments, they screw with BeautifulSoup
         clean_html = re.sub(r'<!--.*?-->', r'', raw_html, flags=re.DOTALL)
 
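
The hash itself is just the SHA-1 hex digest of the page's raw HTML, computed before comments are stripped out for BeautifulSoup. A quick sketch of the stdlib call the diff relies on:

import hashlib

raw_html = '<html><body><p>hello</p></body></html>'
content_hash = hashlib.sha1(raw_html.encode('utf-8')).hexdigest()
# A 40-character hex string. Two pages get the same digest only when
# their raw HTML is byte-for-byte identical, so pages differing by a
# timestamp or nonce will not be flagged as duplicates.
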
4 changes: 4 additions & 0 deletions seoanalyzer/website.py
@@ -1,4 +1,5 @@
 from collections import Counter
+from collections import defaultdict
 from urllib.parse import urlsplit
 from xml.dom import minidom
 
@@ -17,6 +18,7 @@ def __init__(self, base_url, sitemap):
         self.wordcount = Counter()
         self.bigrams = Counter()
         self.trigrams = Counter()
+        self.content_hashes = defaultdict(set)
 
     def check_dns(self, url_to_check):
         try:
@@ -62,6 +64,8 @@ def crawl(self):
 
             page.analyze()
 
+            self.content_hashes[page.content_hash].add(page.url)
+
             for w in page.wordcount:
                 self.wordcount[w] += page.wordcount[w]
 
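
Taken together, the three changes form one pipeline: each crawled page hashes its raw HTML, the website groups URLs by hash in a defaultdict(set), and the analyzer reports every group with more than one URL. A condensed, self-contained sketch of that flow, with a hypothetical find_duplicate_pages helper and a plain dict of url -> html standing in for the real crawl:

import hashlib
from collections import defaultdict

def find_duplicate_pages(pages):
    """pages maps URL -> raw HTML; a stand-in for Website.crawl()."""
    content_hashes = defaultdict(set)
    for url, raw_html in pages.items():
        # Same hashing step as Page.analyze() in the diff above.
        digest = hashlib.sha1(raw_html.encode('utf-8')).hexdigest()
        content_hashes[digest].add(url)
    # Same filter as analyzer.py: only hashes shared by 2+ URLs.
    return [list(urls) for urls in content_hashes.values() if len(urls) > 1]

pages = {
    'https://example.com/': '<html>same</html>',
    'https://example.com/index.html': '<html>same</html>',
    'https://example.com/about': '<html>different</html>',
}
print(find_duplicate_pages(pages))
# [['https://example.com/', 'https://example.com/index.html']] (order may vary)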
