Skip to content

Commit

Permalink
Parallelized updating alexa ranking
Browse files — browse the repository at this point in the history
  • Loading branch information
avinashshenoy97 committed Jan 27, 2019
1 parent 1442f33 commit 3db3f45
Showing 1 changed file with 23 additions and 7 deletions.
30 changes: 23 additions & 7 deletions site_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import threading
from bs4 import BeautifulSoup as bs
from datetime import datetime
from argparse import ArgumentParser, RawDescriptionHelpFormatter

pool = list()

Expand All @@ -22,7 +23,15 @@ def get_rank(domain_to_query, dest):
if strong.has_attr("class"):
if "metrics-data" in strong["class"]:
result = int(strong.text.strip().replace(',', ''))
dest = result
dest['rank'] = result

# Command-line interface: a single optional flag controlling whether the
# (slow) Alexa rank refresh is performed. Default is to skip it.
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
parser.add_argument(
    "--rank", "-r",
    action="store_true",
    dest="rank",
    default=False,
    help="Update all website ranks (not recommended).",
)
args = parser.parse_args()

with open("data.json", "r", encoding="utf-8") as data_file:
data = json.load(data_file)
Expand All @@ -33,19 +42,26 @@ def get_rank(domain_to_query, dest):

for social_network in data:
url_main = data.get(social_network).get("urlMain")
th = threading.Thread(target=get_rank, args=(url_main, data.get(social_network)["rank"]))
pool.append((url_main, th))
th.start()
data.get(social_network)["rank"] = 0
if args.rank:
th = threading.Thread(target=get_rank, args=(url_main, data.get(social_network)))
else:
th = None
pool.append((url_main, url_main, th))
if args.rank:
th.start()

index = 1
for social_network, th in pool:
th.join()
for social_network, url_main, th in pool:
if args.rank:
th.join()
site_file.write(f'{index}. [{social_network}]({url_main})\n')
sys.stdout.write("\r{0}".format(f"Updated {index} out of {data_length} entries"))
sys.stdout.flush()
index = index + 1

site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n')
if args.rank:
site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n')

sorted_json_data = json.dumps(data, indent=2, sort_keys=True)

Expand Down

0 comments on commit 3db3f45

Please sign in to comment.