-
Notifications
You must be signed in to change notification settings - Fork 6
/
get-bing-cache.py
75 lines (57 loc) · 2.05 KB
/
get-bing-cache.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from robobrowser import RoboBrowser
import re
import json
import time
def store_json(obj, fname="cache.json"):
    """Serialize *obj* to a pretty-printed JSON file.

    Parameters
    ----------
    obj : object
        Any JSON-serializable object (here: the list of listing dicts).
    fname : str, optional
        Path of the output file. Defaults to "cache.json" to stay
        backward-compatible with existing callers.
    """
    # explicit encoding avoids platform-dependent default text encodings
    with open(fname, 'w', encoding='utf-8') as outfile:
        # pretty-printed, keys sorted for stable diffs
        json.dump(obj, outfile, sort_keys=True, indent=2, separators=(',', ': '))
def scrape_cache(query, total):
    """Scrape Bing's HTML search results and collect cached-page URLs.

    Parameters
    ----------
    query : str
        URL-encoded search query (e.g. "site%3Aexample.com").
    total : int
        Total number of result listings to walk through.

    Returns
    -------
    list of dict
        One dict per result with keys 'orig_url' (original site URL),
        'desc' (result snippet), and 'cache_url' (Bing cache URL, or
        None when Bing exposes no cache id for that result).
    """
    browser = RoboBrowser()
    listings = []
    # Bing paginates via the `first` offset; ~14 listings per page,
    # so step the offset by 14 (was a redundant `offset = i` alias).
    for offset in range(1, total, 14):
        browser.open('http://www.bing.com/search?q=%s&first=%d' % (query, offset))
        # grab all search attribute strings
        capt_list = browser.select('.b_caption')
        for capt in capt_list:
            # start a new listing
            listing = {}
            # original URL: strip any HTML tags from the <cite> element
            listing['orig_url'] = re.sub('<[^>]*>', '', str(capt.select('cite')[0]))
            # quick description (snippet) of the site
            listing['desc'] = capt.p.string
            # '|'-delimited list containing the ids needed to build the cache URL
            id_string = capt.select('div.b_attribution')[0].get('u')
            print(id_string)
            # identity test is the idiomatic None check (was `!= None`)
            if id_string is not None:
                ids = id_string.split('|')
                # ids[2]/ids[3] are Bing's cache document id and witness token
                # — TODO confirm against a live result; format inherited as-is
                listing['cache_url'] = "http://cc.bingj.com/cache.aspx?q=%s&d=%s&mkt=en-US&setlang=en-US&w=%s" % (query, ids[2], ids[3])
            else:
                listing['cache_url'] = None
            print(listing)
            listings.append(listing)
        print(":: End of dump %d" % offset)
        # be polite: delay between page grabs
        time.sleep(1)
    # listings is given as an output object
    return listings
def main():
    """Scrape Bing's cached listings for one site and dump them to JSON."""
    target_site = "rockenflac.blogspot.com"     # site to search for
    search_query = "site%3A" + target_site      # URL-encoded "site:" query
    listing_count = 419                         # total listings Bing reports for this search
    # scrape the search results, then persist them to JSON
    store_json(scrape_cache(search_query, listing_count))


if __name__ == "__main__":
    main()