Skip to content

Commit c812ae2

Browse files
author
Danny Sauer
committed
Initial commit
Recurse over a site and open new browser tabs for every link
1 parent 5642d18 commit c812ae2

File tree

1 file changed

+46
-0
lines changed

1 file changed

+46
-0
lines changed

misc/recurse.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/usr/bin/env python3
2+
3+
import webbrowser
4+
import httplib2
5+
from urllib.parse import urlparse, urljoin
6+
from bs4 import BeautifulSoup, SoupStrainer
7+
8+
# Shared crawler state. All of these are (re)assigned by the __main__
# block below; the functions access them via ``global`` declarations.
site = None          # netloc of the site being crawled (set from the start URL)
browser = None       # webbrowser controller used to open new tabs
http = None          # httplib2.Http client used to fetch pages
known_links = set()  # every URL already visited (or queued for a visit)
12+
13+
def local_link(tag, site_netloc=None):
    """Return True if *tag* has an href that points at the target site.

    A link counts as local when its netloc matches *site_netloc* OR when the
    href is relative (empty netloc) -- relative hrefs resolve to the same
    site once the caller applies urljoin().  The original comparison
    ``netloc == site`` silently dropped every relative link, which made the
    crawler miss most of a typical site.

    Parameters:
        tag: a BeautifulSoup Tag (anything exposing has_attr/__getitem__).
        site_netloc: netloc to compare against; defaults to the module-level
            ``site`` global so the function still works unchanged as a
            ``soup.find_all`` filter.
    """
    if site_netloc is None:
        site_netloc = site  # fall back to the global set in __main__
    if not tag.has_attr('href'):
        return False
    netloc = urlparse(tag['href']).netloc
    # Empty netloc == relative link == same site by definition.
    return netloc == '' or netloc == site_netloc
17+
18+
def get_links(url):
    """Fetch *url*, record it as visited, and return the set of same-site
    links on the page that have not been seen before.

    Side effects: adds *url* to the module-level ``known_links`` set and
    performs one HTTP request via the module-level ``http`` client.
    """
    global known_links
    global http
    known_links.add(url)
    # httplib2 returns (response-headers, body-bytes) for a request.
    headers, body = http.request(url)
    soup = BeautifulSoup(body, 'html.parser')
    # Resolve every same-site href against the page URL so relative links
    # become absolute before dedup.
    found = {urljoin(url, anchor['href']) for anchor in soup.find_all(local_link)}
    print("found {} dup links".format(len(found & known_links)))
    return found - known_links
27+
28+
def recurse(url):
    """Open a browser tab for *url* and for every reachable same-site link.

    Implemented with an explicit worklist instead of recursion: the original
    recursive version's call depth grew with the number of pages crawled and
    would raise RecursionError on any real site.  The set of pages visited
    (and tabs opened) is the same as with the depth-first recursion.

    Side effects: opens one browser tab per visited URL via the module-level
    ``browser`` and grows the module-level ``known_links`` set.
    """
    global browser
    global known_links
    pending = [url]
    while pending:
        current = pending.pop()
        browser.open_new_tab(current)
        print(f"recursing over {current}")
        new_links = get_links(current)
        print( "Found {} new links on {}".format(len(new_links), current) )
        # Mark links known *before* visiting so pages that link to each
        # other are only queued once.
        known_links.update(new_links)
        pending.extend(new_links)
38+
39+
if __name__ == '__main__':
    # Seed the crawl and initialize the module-level globals that the
    # functions above rely on.
    url = 'http://www.google.com/'
    site = urlparse(url).netloc
    http = httplib2.Http()
    # To route requests through a proxy instead, swap in the two lines below:
    #proxy = httplib2.proxy_info_from_url('http://squid:3128')
    #http = httplib2.Http(proxy_info=proxy)
    browser = webbrowser.get()
    recurse(url)

0 commit comments

Comments
 (0)