Feature/2024 updates (sethblack#101)
* Use new pyproject.toml

* Remove old setup.py

* Update README to reflect new cli script name seoanalyzer

* Update requirements.txt to include much newer versions of necessary packages and remove requests

* Refactor everything into pyseoanalyzer directory
sethblack authored Apr 21, 2024
1 parent 55fd7b3 commit 7ab5b35
Showing 20 changed files with 1,167 additions and 737 deletions.
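
The refactor leaves the repository laid out roughly as follows (inferred from the diffs below; `__init__.py` is assumed to be the file renamed without changes, and `templates/index.html` is the template `__main__.py` loads):

```
python-seo-analyzer/
├── pyproject.toml
├── README.md
├── requirements.txt
└── pyseoanalyzer/
    ├── __init__.py
    ├── __main__.py
    ├── analyzer.py
    ├── http.py
    └── templates/
        └── index.html
```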
1 change: 1 addition & 0 deletions .gitignore
@@ -1,5 +1,6 @@
 # I don't want the python virtual env in github!
 venv
+env
 
 # nor visual
 .vscode
2 changes: 1 addition & 1 deletion README.md
@@ -78,7 +78,7 @@ print(output)
Alternatively, you can run the analysis as a script from the seoanalyzer folder.

```sh
-python analyzer.py https://www.sethserver.com/ -f html > results.html
+python -m seoanalyzer https://www.sethserver.com/ -f html > results.html
```
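
The same analysis is available programmatically through the `analyze` function added in `pyseoanalyzer/analyzer.py` (shown below). A minimal sketch, assuming the package is installed:

```python
import json

from pyseoanalyzer.analyzer import analyze

# follow_links=False restricts the crawl to the start page.
output = analyze("https://www.sethserver.com/", follow_links=False)
print(json.dumps(output, indent=4))
```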

Notes
54 changes: 54 additions & 0 deletions pyproject.toml
@@ -0,0 +1,54 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "pyseoanalyzer"
version = "2024.04.21"
authors = [
{name = "Seth Black", email = "sblack@sethserver.com"},
]
dependencies = [
"beautifulsoup4>=4.12.3",
"certifi>=2024.2.2",
"Jinja2>=3.1.3",
"lxml>=5.2.1",
"MarkupSafe>=2.1.5",
"urllib3>=2.2.1",
]
requires-python = ">= 3.8"
description = "An SEO tool that analyzes the structure of a site, crawls the site, counts words in the body of the site, and warns of any technical SEO issues."
readme = "README.md"
license = {file = "LICENSE"}
keywords = [
"search engine optimization",
"seo",
"website parser",
"crawler",
"scraper",
"site analyzer",
"site parser",
"site crawler",
]
classifiers = [
"Development Status :: 5 - Production/Stable",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Environment :: Console",
"Intended Audience :: Developers",
"License :: OSI Approved :: BSD License",
"Operating System :: OS Independent",
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Text Processing",
"Topic :: Internet :: WWW/HTTP",
]

[project.scripts]
seoanalyze = "pyseoanalyzer.__main__:main"

[project.urls]
Homepage = "https://github.com/sethblack/python-seo-analyzer"
Repository = "https://github.com/sethblack/python-seo-analyzer.git"
Issues = "https://github.com/sethblack/python-seo-analyzer/issues"
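
The `[project.scripts]` table tells the installer to generate a `seoanalyze` console command that calls `pyseoanalyzer.__main__:main`. The generated wrapper is roughly equivalent to this sketch:

```python
# Approximately what pip installs on PATH as `seoanalyze`.
import sys

from pyseoanalyzer.__main__ import main

if __name__ == "__main__":
    sys.exit(main())
```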
File renamed without changes.
75 changes: 75 additions & 0 deletions pyseoanalyzer/__main__.py
@@ -0,0 +1,75 @@
#!/usr/bin/env python3

import argparse
import inspect
import json
import os

from .analyzer import analyze


def main():
module_path = os.path.dirname(inspect.getfile(analyze))

arg_parser = argparse.ArgumentParser()

arg_parser.add_argument("site", help="URL of the site you are wanting to analyze.")
arg_parser.add_argument(
"-s", "--sitemap", help="URL of the sitemap to seed the crawler with."
)
arg_parser.add_argument(
"-f",
"--output-format",
help="Output format.",
choices=[
"json",
"html",
],
default="json",
)

arg_parser.add_argument(
"--analyze-headings",
default=False,
action="store_true",
help="Analyze heading tags (h1-h6).",
)
arg_parser.add_argument(
"--analyze-extra-tags",
default=False,
action="store_true",
help="Analyze other extra additional tags.",
)
arg_parser.add_argument(
"--no-follow-links",
default=True,
action="store_false",
help="Analyze all the existing inner links as well (might be time consuming).",
)

args = arg_parser.parse_args()

output = analyze(
args.site,
args.sitemap,
analyze_headings=args.analyze_headings,
analyze_extra_tags=args.analyze_extra_tags,
follow_links=args.no_follow_links,
)

if args.output_format == "html":
from jinja2 import Environment
from jinja2 import FileSystemLoader

env = Environment(
loader=FileSystemLoader(os.path.join(module_path, "templates"))
)
template = env.get_template("index.html")
output_from_parsed_template = template.render(result=output)
print(output_from_parsed_template)
elif args.output_format == "json":
print(json.dumps(output, indent=4, separators=(",", ": ")))


if __name__ == "__main__":
main()
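
Note the inverted flag wiring above: `--no-follow-links` is declared with `default=True` and `action="store_false"`, so omitting it leaves `args.no_follow_links` as `True` (links are followed) and passing it stores `False`. A standalone sketch of just that argparse behavior:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--no-follow-links", default=True, action="store_false")

# Flag omitted: the default True survives, so analyze() follows links.
print(parser.parse_args([]).no_follow_links)                     # True
# Flag passed: store_false flips it, so only the start page is crawled.
print(parser.parse_args(["--no-follow-links"]).no_follow_links)  # False
```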
85 changes: 85 additions & 0 deletions pyseoanalyzer/analyzer.py
@@ -0,0 +1,85 @@
import time

from operator import itemgetter
from .website import Website


def analyze(
url,
sitemap_url=None,
analyze_headings=False,
analyze_extra_tags=False,
follow_links=True,
):
start_time = time.time()

def calc_total_time():
return time.time() - start_time

output = {
"pages": [],
"keywords": [],
"errors": [],
"total_time": calc_total_time(),
}

site = Website(
url,
sitemap_url,
analyze_headings,
analyze_extra_tags,
follow_links,
)

site.crawl()

for p in site.crawled_pages:
output["pages"].append(p.talk())

output["duplicate_pages"] = [
list(site.content_hashes[p])
for p in site.content_hashes
if len(site.content_hashes[p]) > 1
]

sorted_words = sorted(site.wordcount.items(), key=itemgetter(1), reverse=True)
sorted_bigrams = sorted(site.bigrams.items(), key=itemgetter(1), reverse=True)
sorted_trigrams = sorted(site.trigrams.items(), key=itemgetter(1), reverse=True)

output["keywords"] = []

for w in sorted_words:
if w[1] > 4:
output["keywords"].append(
{
"word": w[0],
"count": w[1],
}
)

for w, v in sorted_bigrams:
if v > 4:
output["keywords"].append(
{
"word": w,
"count": v,
}
)

for w, v in sorted_trigrams:
if v > 4:
output["keywords"].append(
{
"word": w,
"count": v,
}
)

# Sort one last time...
output["keywords"] = sorted(
output["keywords"], key=itemgetter("count"), reverse=True
)

output["total_time"] = calc_total_time()

return output
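
The keyword pass keeps only terms counted more than four times, then merges words, bigrams, and trigrams into a single list sorted by count. A small worked example of that filter-and-sort step, using hypothetical counts in place of `site.wordcount`:

```python
from operator import itemgetter

# Hypothetical counts standing in for site.wordcount after a crawl.
wordcount = {"python": 12, "seo": 7, "the": 3}

sorted_words = sorted(wordcount.items(), key=itemgetter(1), reverse=True)
keywords = [{"word": w, "count": c} for w, c in sorted_words if c > 4]

print(keywords)  # [{'word': 'python', 'count': 12}, {'word': 'seo', 'count': 7}]
```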
13 changes: 7 additions & 6 deletions seoanalyzer/http.py → pyseoanalyzer/http.py
@@ -3,18 +3,19 @@
from urllib3 import Timeout


-class Http():
+class Http:
     def __init__(self):
-        user_agent = {'User-Agent': 'Mozilla/5.0'}
+        user_agent = {"User-Agent": "Mozilla/5.0"}
 
         self.http = PoolManager(
             timeout=Timeout(connect=1.0, read=2.0),
-            cert_reqs='CERT_REQUIRED',
+            cert_reqs="CERT_REQUIRED",
             ca_certs=certifi.where(),
-            headers=user_agent
+            headers=user_agent,
         )
 
     def get(self, url):
-        return self.http.request('GET', url)
+        return self.http.request("GET", url)
 
 
 http = Http()
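
The module-level `http = Http()` gives the rest of the package one shared urllib3 connection pool. `get` returns a standard urllib3 `HTTPResponse`, so `.status` and `.data` are available; a usage sketch:

```python
from pyseoanalyzer.http import http

response = http.get("https://www.sethserver.com/")
print(response.status)      # e.g. 200
print(response.data[:100])  # first bytes of the raw response body
```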
