-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathcrawl.py
More file actions
executable file
·40 lines (31 loc) · 928 Bytes
/
crawl.py
File metadata and controls
executable file
·40 lines (31 loc) · 928 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/usr/bin/env python
"""Simple same-site breadth-first web crawler (usage: crawl.py START_URL)."""
import collections
import sys

import requests
from lxml import html, etree

# `urlparse` is a top-level module in Python 2; in Python 3 the same
# functions live in `urllib.parse`.  Alias it so the rest of the script
# can keep calling `urlparse.urljoin` on either interpreter.
try:
    import urlparse
except ImportError:
    from urllib import parse as urlparse

# Disable SSL-certificate warnings emitted by requests' vendored urllib3.
# Best-effort: newer requests releases no longer vendor urllib3, so guard
# narrowly instead of with a bare `except:` (which would also swallow
# KeyboardInterrupt/SystemExit).
try:
    import requests.packages.urllib3
    requests.packages.urllib3.disable_warnings()
except (ImportError, AttributeError):
    pass
# Breadth-first crawl starting from the URL given on the command line,
# restricted to pages whose absolute URL begins with the start URL
# (i.e. the crawl never leaves the starting site/prefix).
START = sys.argv[1]

urlq = collections.deque()  # FIFO frontier of URLs still to fetch
urlq.append(START)
found = set()               # every URL ever enqueued, to avoid refetching
found.add(START)

while urlq:
    url = urlq.popleft()
    try:
        response = requests.get(url)
    except requests.RequestException as exc:
        # One unreachable page should not abort the whole crawl.
        sys.stderr.write("failed to fetch %s: %s\n" % (url, exc))
        continue
    body = html.fromstring(response.content)
    # Print the page title(s) — xpath returns a list, empty if no <title>.
    print(body.xpath('//title/text()'))
    # Dump the full (re-serialized) page for inspection.
    print(etree.tostring(body, pretty_print=True, method="html"))
    # Resolve every href against the final response URL (so relative links
    # and redirects are handled), keeping only same-site links.  The join
    # is computed once per href instead of twice as before.
    links = set()
    for href in body.xpath('//a/@href'):
        absolute = urlparse.urljoin(response.url, href)
        if absolute.startswith(START):
            links.add(absolute)
    # Set difference keeps only URLs never seen before.
    for link in links - found:
        found.add(link)
        urlq.append(link)