1 file changed, 46 insertions(+), 0 deletions(-)
#!/usr/bin/env python3

import webbrowser
import httplib2
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

site = None
browser = None
http = None
known_links = set()

def local_link(tag):
    # Keep only <a href=...> tags pointing at the crawled site;
    # relative links (empty netloc) count as local too, since
    # get_links() resolves them against the page URL with urljoin().
    return tag.has_attr('href') and \
        urlparse(tag['href']).netloc in ('', site)

def get_links(url):
    # Fetch the page and collect absolute URLs for every local link,
    # returning only the ones not seen before.
    known_links.add(url)
    response, content = http.request(url)
    soup = BeautifulSoup(content, 'html.parser')
    links = set(urljoin(url, x['href']) for x in soup.find_all(local_link))
    print("found {} dup links".format(len(links.intersection(known_links))))
    return links.difference(known_links)

def recurse(url):
    # Open each page in a browser tab, then crawl every new link on it.
    browser.open_new_tab(url)
    print(f"recursing over {url}")
    new_links = get_links(url)
    print("Found {} new links on {}".format(len(new_links), url))
    known_links.update(new_links)
    for link in new_links:
        recurse(link)

if __name__ == '__main__':
    url = 'http://www.google.com/'
    site = urlparse(url).netloc
    http = httplib2.Http()
    # To route requests through an HTTP proxy instead:
    #proxy = httplib2.proxy_info_from_url('http://squid:3128')
    #http = httplib2.Http(proxy_info=proxy)
    browser = webbrowser.get()
    recurse(url)
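One caveat with recurse() as written: Python's default recursion limit (roughly 1000 frames) can be exceeded on a site with long chains of nested links. A minimal iterative sketch that sidesteps the limit, reusing the script's own get_links() and module-level globals; the name crawl_iterative is hypothetical, not part of this commit:

from collections import deque

def crawl_iterative(start_url):
    # Breadth-first replacement for recurse(): a deque holds the frontier,
    # so crawl depth is bounded by memory rather than the call stack.
    queue = deque([start_url])
    while queue:
        url = queue.popleft()
        browser.open_new_tab(url)
        new_links = get_links(url)
        known_links.update(new_links)
        queue.extend(new_links)

The deduplication behaves the same as in recurse(): get_links() only ever returns URLs absent from known_links, so no page is queued or fetched twice.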