Feature/2024 updates (sethblack#101)
* Use new pyproject.toml
* Remove old setup.py
* Update README to reflect new cli script name seoanalyzer
* Update requirements.txt to include much newer versions of necessary packages and remove requests
* Refactor everything into pyseoanalyzer directory (import change sketched below)
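Since the package directory moves from seoanalyzer/ to pyseoanalyzer/, downstream imports change with this commit. A minimal sketch of the new import path, assuming analyze stays the public entry point as the diffs below show:

# The package rename means imports move from seoanalyzer to pyseoanalyzer.
from pyseoanalyzer.analyzer import analyze  # previously imported from the seoanalyzer package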
Showing 20 changed files with 1,167 additions and 737 deletions.
.gitignore

@@ -1,5 +1,6 @@
# I don't want the python virtual env in github!
venv
env

# nor visual
.vscode
pyproject.toml

@@ -0,0 +1,54 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "pyseoanalyzer"
version = "2024.04.21"
authors = [
    {name = "Seth Black", email = "sblack@sethserver.com"},
]
dependencies = [
    "beautifulsoup4>=4.12.3",
    "certifi>=2024.2.2",
    "Jinja2>=3.1.3",
    "lxml>=5.2.1",
    "MarkupSafe>=2.1.5",
    "urllib3>=2.2.1",
]
requires-python = ">= 3.8"
description = "An SEO tool that analyzes the structure of a site, crawls the site, counts words in the body of the site, and warns of any technical SEO issues."
readme = "README.md"
license = {file = "LICENSE"}
keywords = [
    "search engine optimization",
    "seo",
    "website parser",
    "crawler",
    "scraper",
    "site analyzer",
    "site parser",
    "site crawler",
]
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3 :: Only",
    "Environment :: Console",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: BSD License",
    "Operating System :: OS Independent",
    "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: Text Processing",
    "Topic :: Internet :: WWW/HTTP",
]

[project.scripts]
seoanalyze = "pyseoanalyzer.__main__:main"

[project.urls]
Homepage = "https://github.com/sethblack/python-seo-analyzer"
Repository = "https://github.com/sethblack/python-seo-analyzer.git"
Issues = "https://github.com/sethblack/python-seo-analyzer/issues"
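The [project.scripts] table above is what makes the seoanalyze command exist after installation. A minimal sketch of the wrapper the build backend generates, assuming the package is installed (for example via pip install .):

# Rough equivalent of the generated `seoanalyze` console script: import the
# entry point named in [project.scripts] and call it.
from pyseoanalyzer.__main__ import main

if __name__ == "__main__":
    main()  # argument parsing happens inside main(), shown in the next file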
seoanalyzer/__init__.py → pyseoanalyzer/__init__.py (mode 100755 → 100644)
File renamed without changes.
pyseoanalyzer/__main__.py

@@ -0,0 +1,75 @@
#!/usr/bin/env python3

import argparse
import inspect
import json
import os

from .analyzer import analyze


def main():
    module_path = os.path.dirname(inspect.getfile(analyze))

    arg_parser = argparse.ArgumentParser()

    arg_parser.add_argument("site", help="URL of the site you want to analyze.")
    arg_parser.add_argument(
        "-s", "--sitemap", help="URL of the sitemap to seed the crawler with."
    )
    arg_parser.add_argument(
        "-f",
        "--output-format",
        help="Output format.",
        choices=[
            "json",
            "html",
        ],
        default="json",
    )

    arg_parser.add_argument(
        "--analyze-headings",
        default=False,
        action="store_true",
        help="Analyze heading tags (h1-h6).",
    )
    arg_parser.add_argument(
        "--analyze-extra-tags",
        default=False,
        action="store_true",
        help="Analyze additional tags.",
    )
    # store_false: passing --no-follow-links sets this to False; the default
    # (True) makes the crawler follow internal links, which can be time consuming.
    arg_parser.add_argument(
        "--no-follow-links",
        default=True,
        action="store_false",
        help="Do not follow and analyze the site's internal links.",
    )

    args = arg_parser.parse_args()

    output = analyze(
        args.site,
        args.sitemap,
        analyze_headings=args.analyze_headings,
        analyze_extra_tags=args.analyze_extra_tags,
        follow_links=args.no_follow_links,
    )

    if args.output_format == "html":
        from jinja2 import Environment
        from jinja2 import FileSystemLoader

        env = Environment(
            loader=FileSystemLoader(os.path.join(module_path, "templates"))
        )
        template = env.get_template("index.html")
        output_from_parsed_template = template.render(result=output)
        print(output_from_parsed_template)
    elif args.output_format == "json":
        print(json.dumps(output, indent=4, separators=(",", ": ")))


if __name__ == "__main__":
    main()
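As a hedged usage sketch of the module above: main() can be driven from Python by setting sys.argv, equivalent to running seoanalyze https://example.com -f json from a shell (https://example.com is a placeholder and the crawl needs network access):

# Sketch: invoke the CLI entry point programmatically.
import sys

from pyseoanalyzer.__main__ import main

sys.argv = ["seoanalyze", "https://example.com", "--output-format", "json"]
main()  # prints the JSON report to stdout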
pyseoanalyzer/analyzer.py

@@ -0,0 +1,85 @@
import time

from operator import itemgetter
from .website import Website


def analyze(
    url,
    sitemap_url=None,
    analyze_headings=False,
    analyze_extra_tags=False,
    follow_links=True,
):
    start_time = time.time()

    def calc_total_time():
        return time.time() - start_time

    output = {
        "pages": [],
        "keywords": [],
        "errors": [],
        "total_time": calc_total_time(),
    }

    site = Website(
        url,
        sitemap_url,
        analyze_headings,
        analyze_extra_tags,
        follow_links,
    )

    site.crawl()

    # each crawled page reports its analysis as a dict
    for p in site.crawled_pages:
        output["pages"].append(p.talk())

    # pages whose content hashes collide are reported as duplicates
    output["duplicate_pages"] = [
        list(site.content_hashes[p])
        for p in site.content_hashes
        if len(site.content_hashes[p]) > 1
    ]

    sorted_words = sorted(site.wordcount.items(), key=itemgetter(1), reverse=True)
    sorted_bigrams = sorted(site.bigrams.items(), key=itemgetter(1), reverse=True)
    sorted_trigrams = sorted(site.trigrams.items(), key=itemgetter(1), reverse=True)

    output["keywords"] = []

    # keep only words and n-grams that occur more than four times site-wide
    for w in sorted_words:
        if w[1] > 4:
            output["keywords"].append(
                {
                    "word": w[0],
                    "count": w[1],
                }
            )

    for w, v in sorted_bigrams:
        if v > 4:
            output["keywords"].append(
                {
                    "word": w,
                    "count": v,
                }
            )

    for w, v in sorted_trigrams:
        if v > 4:
            output["keywords"].append(
                {
                    "word": w,
                    "count": v,
                }
            )

    # Sort one last time...
    output["keywords"] = sorted(
        output["keywords"], key=itemgetter("count"), reverse=True
    )

    output["total_time"] = calc_total_time()

    return output
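Library callers can skip the CLI and call analyze() directly. A minimal sketch against the function above; https://example.com is a placeholder and the call performs a live crawl:

# follow_links=False limits the crawl to the seed URL(s), mirroring the
# CLI's --no-follow-links flag.
from pyseoanalyzer.analyzer import analyze

report = analyze("https://example.com", follow_links=False)
print(f"{len(report['pages'])} page(s) analyzed in {report['total_time']:.2f}s")
for keyword in report["keywords"][:10]:  # ten most frequent words and n-grams
    print(keyword["count"], keyword["word"])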