Feature/2024 updates (sethblack#101)
* Use new pyproject.toml

* Remove old setup.py

* Update README to reflect new cli script name seoanalyzer

* Update requirements.txt to include much newer versions of necessary packages and remove requests

* Refactor everything into pyseoanalyzer directory
sethblack authored Apr 21, 2024
1 parent 55fd7b3 commit 7ab5b35
Showing 20 changed files with 1,167 additions and 737 deletions.
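
The refactor leaves the repository laid out roughly as follows (inferred from the diffs below; `__init__.py` is assumed to be the file renamed without changes, and `templates/index.html` is the template `__main__.py` loads):

```
python-seo-analyzer/
├── pyproject.toml
├── README.md
├── requirements.txt
└── pyseoanalyzer/
    ├── __init__.py
    ├── __main__.py
    ├── analyzer.py
    ├── http.py
    └── templates/
        └── index.html
```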
1 change: 1 addition & 0 deletions .gitignore
@@ -1,5 +1,6 @@
 # I don't want the python virtual env in github!
 venv
+env
 
 # nor visual
 .vscode
2 changes: 1 addition & 1 deletion README.md
@@ -78,7 +78,7 @@ print(output)
Alternatively, you can run the analysis as a script from the seoanalyzer folder.

```sh
-python analyzer.py https://www.sethserver.com/ -f html > results.html
+python -m seoanalyzer https://www.sethserver.com/ -f html > results.html
```
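
The same analysis is available programmatically through the `analyze` function added in `pyseoanalyzer/analyzer.py` (shown below). A minimal sketch, assuming the package is installed:

```python
import json

from pyseoanalyzer.analyzer import analyze

# follow_links=False restricts the crawl to the start page.
output = analyze("https://www.sethserver.com/", follow_links=False)
print(json.dumps(output, indent=4))
```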

Notes
54 changes: 54 additions & 0 deletions pyproject.toml
@@ -0,0 +1,54 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "pyseoanalyzer"
version = "2024.04.21"
authors = [
{name = "Seth Black", email = "sblack@sethserver.com"},
]
dependencies = [
"beautifulsoup4>=4.12.3",
"certifi>=2024.2.2",
"Jinja2>=3.1.3",
"lxml>=5.2.1",
"MarkupSafe>=2.1.5",
"urllib3>=2.2.1",
]
requires-python = ">= 3.8"
description = "An SEO tool that analyzes the structure of a site, crawls the site, counts words in the body of the site, and warns of any technical SEO issues."
readme = "README.md"
license = {file = "LICENSE"}
keywords = [
"search engine optimization",
"seo",
"website parser",
"crawler",
"scraper",
"site analyzer",
"site parser",
"site crawler",
]
classifiers = [
"Development Status :: 5 - Production/Stable",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Environment :: Console",
"Intended Audience :: Developers",
"License :: OSI Approved :: BSD License",
"Operating System :: OS Independent",
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Text Processing",
"Topic :: Internet :: WWW/HTTP",
]

[project.scripts]
seoanalyze = "pyseoanalyzer.__main__:main"

[project.urls]
Homepage = "https://github.com/sethblack/python-seo-analyzer"
Repository = "https://github.com/sethblack/python-seo-analyzer.git"
Issues = "https://github.com/sethblack/python-seo-analyzer/issues"
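
The `[project.scripts]` table tells the installer to generate a `seoanalyze` console command that calls `pyseoanalyzer.__main__:main`. The generated wrapper is roughly equivalent to this sketch:

```python
# Approximately what pip installs on PATH as `seoanalyze`.
import sys

from pyseoanalyzer.__main__ import main

if __name__ == "__main__":
    sys.exit(main())
```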
File renamed without changes.
75 changes: 75 additions & 0 deletions pyseoanalyzer/__main__.py
@@ -0,0 +1,75 @@
#!/usr/bin/env python3

import argparse
import inspect
import json
import os

from .analyzer import analyze


def main():
module_path = os.path.dirname(inspect.getfile(analyze))

arg_parser = argparse.ArgumentParser()

arg_parser.add_argument("site", help="URL of the site you are wanting to analyze.")
arg_parser.add_argument(
"-s", "--sitemap", help="URL of the sitemap to seed the crawler with."
)
arg_parser.add_argument(
"-f",
"--output-format",
help="Output format.",
choices=[
"json",
"html",
],
default="json",
)

arg_parser.add_argument(
"--analyze-headings",
default=False,
action="store_true",
help="Analyze heading tags (h1-h6).",
)
arg_parser.add_argument(
"--analyze-extra-tags",
default=False,
action="store_true",
help="Analyze other extra additional tags.",
)
arg_parser.add_argument(
"--no-follow-links",
default=True,
action="store_false",
help="Analyze all the existing inner links as well (might be time consuming).",
)

args = arg_parser.parse_args()

output = analyze(
args.site,
args.sitemap,
analyze_headings=args.analyze_headings,
analyze_extra_tags=args.analyze_extra_tags,
follow_links=args.no_follow_links,
)

if args.output_format == "html":
from jinja2 import Environment
from jinja2 import FileSystemLoader

env = Environment(
loader=FileSystemLoader(os.path.join(module_path, "templates"))
)
template = env.get_template("index.html")
output_from_parsed_template = template.render(result=output)
print(output_from_parsed_template)
elif args.output_format == "json":
print(json.dumps(output, indent=4, separators=(",", ": ")))


if __name__ == "__main__":
main()
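
Note the inverted flag wiring above: `--no-follow-links` is declared with `default=True` and `action="store_false"`, so omitting it leaves `args.no_follow_links` as `True` (links are followed) and passing it stores `False`. A standalone sketch of just that argparse behavior:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--no-follow-links", default=True, action="store_false")

# Flag omitted: the default True survives, so analyze() follows links.
print(parser.parse_args([]).no_follow_links)                     # True
# Flag passed: store_false flips it, so only the start page is crawled.
print(parser.parse_args(["--no-follow-links"]).no_follow_links)  # False
```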
85 changes: 85 additions & 0 deletions pyseoanalyzer/analyzer.py
@@ -0,0 +1,85 @@
import time

from operator import itemgetter
from .website import Website


def analyze(
url,
sitemap_url=None,
analyze_headings=False,
analyze_extra_tags=False,
follow_links=True,
):
start_time = time.time()

def calc_total_time():
return time.time() - start_time

output = {
"pages": [],
"keywords": [],
"errors": [],
"total_time": calc_total_time(),
}

site = Website(
url,
sitemap_url,
analyze_headings,
analyze_extra_tags,
follow_links,
)

site.crawl()

for p in site.crawled_pages:
output["pages"].append(p.talk())

output["duplicate_pages"] = [
list(site.content_hashes[p])
for p in site.content_hashes
if len(site.content_hashes[p]) > 1
]

sorted_words = sorted(site.wordcount.items(), key=itemgetter(1), reverse=True)
sorted_bigrams = sorted(site.bigrams.items(), key=itemgetter(1), reverse=True)
sorted_trigrams = sorted(site.trigrams.items(), key=itemgetter(1), reverse=True)

output["keywords"] = []

for w in sorted_words:
if w[1] > 4:
output["keywords"].append(
{
"word": w[0],
"count": w[1],
}
)

for w, v in sorted_bigrams:
if v > 4:
output["keywords"].append(
{
"word": w,
"count": v,
}
)

for w, v in sorted_trigrams:
if v > 4:
output["keywords"].append(
{
"word": w,
"count": v,
}
)

# Sort one last time...
output["keywords"] = sorted(
output["keywords"], key=itemgetter("count"), reverse=True
)

output["total_time"] = calc_total_time()

return output
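
The keyword pass keeps only terms counted more than four times, then merges words, bigrams, and trigrams into a single list sorted by count. A small worked example of that filter-and-sort step, using hypothetical counts in place of `site.wordcount`:

```python
from operator import itemgetter

# Hypothetical counts standing in for site.wordcount after a crawl.
wordcount = {"python": 12, "seo": 7, "the": 3}

sorted_words = sorted(wordcount.items(), key=itemgetter(1), reverse=True)
keywords = [{"word": w, "count": c} for w, c in sorted_words if c > 4]

print(keywords)  # [{'word': 'python', 'count': 12}, {'word': 'seo', 'count': 7}]
```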
13 changes: 7 additions & 6 deletions seoanalyzer/http.py → pyseoanalyzer/http.py
@@ -3,18 +3,19 @@
from urllib3 import Timeout


-class Http():
+class Http:
     def __init__(self):
-        user_agent = {'User-Agent': 'Mozilla/5.0'}
+        user_agent = {"User-Agent": "Mozilla/5.0"}
 
         self.http = PoolManager(
             timeout=Timeout(connect=1.0, read=2.0),
-            cert_reqs='CERT_REQUIRED',
+            cert_reqs="CERT_REQUIRED",
             ca_certs=certifi.where(),
-            headers=user_agent
+            headers=user_agent,
         )
 
     def get(self, url):
-        return self.http.request('GET', url)
+        return self.http.request("GET", url)
 
 
 http = Http()
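
The module-level `http = Http()` gives the rest of the package one shared urllib3 connection pool. `get` returns a standard urllib3 `HTTPResponse`, so `.status` and `.data` are available; a usage sketch:

```python
from pyseoanalyzer.http import http

response = http.get("https://www.sethserver.com/")
print(response.status)      # e.g. 200
print(response.data[:100])  # first bytes of the raw response body
```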
