forked from james-see/python-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
google-result-counts.py
46 lines (43 loc) · 1.56 KB
/
google-result-counts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import re
import time
import requests
import random
from bs4 import BeautifulSoup
from tqdm import tqdm
from tabulate import tabulate
from proxies import *
from useragents import *
# Look up Google's reported result count for every site listed in `fname`
# and print a table of the per-site answers plus a grand total.

fname = 'active.txt'      # input file: one site per line
REQUEST_TIMEOUT = 30      # seconds before a stalled request is abandoned
NO_RESULT = 'n/a'         # table cell used when no result text was obtained

# First run of digits in a string, allowing thousands separators
# (comma, dot or whitespace) between the digit groups.
_FIRST_NUMBER = re.compile(r"\d[\d,.\s]*")


def load_sites(path):
    """Return the non-blank lines of *path*, stripped of surrounding whitespace."""
    with open(path) as handle:
        stripped = (line.strip() for line in handle)
        return [site for site in stripped if site]


def parse_hit_count(stats_text):
    """Return the hit count contained in a result-stats string as an int.

    Only the first number in the text is used. The stats text may carry
    further digits after the count (for example a query time such as
    "(0.45 seconds)"); stripping every non-digit from the whole string
    would glue those digits onto the count.

    Returns 0 when *stats_text* is empty, None, or contains no digits.
    """
    match = _FIRST_NUMBER.search(stats_text or "")
    if match is None:
        return 0
    return int(re.sub(r"[^0-9]", "", match.group()))


def fetch_result_stats(site):
    """Search Google for *site* and return the text of the result-stats div.

    The request goes through a randomly chosen entry of `proxies` and uses
    a randomly chosen entry of `useragents` as the User-Agent header.

    Returns None when the response has no element with id "resultStats".
    Raises requests.RequestException on connection errors or timeouts.
    """
    proxy = random.choice(proxies)
    response = requests.get(
        'http://www.google.com/search',
        params={'q': '"site:' + site + '"', 'tbs': 'li:1'},
        # requests selects the proxy by URL scheme, so register it for both
        # the plain-http request and any redirect to https.
        proxies={'http': proxy, 'https': proxy},
        headers={'User-Agent': random.choice(useragents)},
        timeout=REQUEST_TIMEOUT,
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    stats = soup.find('div', {'id': 'resultStats'})
    return None if stats is None else stats.text


def main():
    """Query every site, print the result table and total, then exit."""
    sites = load_sites(fname)
    reporter = {"site_name": [], "results": []}
    totaler = 0  # running sum of the parsed hit counts

    progress = tqdm(sites, total=len(sites))
    for site in progress:
        progress.set_description("Processing %s" % site)
        try:
            results = fetch_result_stats(site)
        except requests.RequestException as err:
            # One bad proxy or timeout should not discard the results
            # already collected for the other sites.
            tqdm.write("request for {} failed: {}".format(site, err))
            results = None

        randwait = random.uniform(5, 10)
        # tqdm.write keeps the message from overlapping the progress bar.
        tqdm.write("waiting {} seconds...".format(randwait))
        time.sleep(randwait)

        reporter["site_name"].append(site)
        reporter["results"].append(NO_RESULT if results is None else results)
        totaler += parse_hit_count(results)

    print(tabulate(reporter, headers="keys", tablefmt="pipe"))
    print('total pages: {}'.format(totaler))
    # Same effect as the former exit("finis."): the message goes to stderr
    # and the process exits with status 1, without relying on the `exit`
    # helper that the `site` module installs.
    raise SystemExit("finis.")


if __name__ == "__main__":
    main()