-
-
Notifications
You must be signed in to change notification settings - Fork 1.5k
/
zap.py
54 lines (53 loc) · 2.25 KB
/
zap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re
import requests
from core.requester import requester
from core.utils import verb, xmlParser
from core.colors import run, good
from plugins.wayback import time_machine
def zap(inputUrl, archive, domain, host, internal, robots):
"""Extract links from robots.txt and sitemap.xml."""
if archive:
from plugins.wayback import time_machine
print('%s Fetching URLs from archive.org' % run)
if False:
archived_urls = time_machine(domain, 'domain')
else:
archived_urls = time_machine(host, 'host')
print('%s Retrieved %i URLs from archive.org' % (
good, len(archived_urls) - 1))
for url in archived_urls:
verb('Internal page', url)
internal.add(url)
# Makes request to robots.txt
response = requests.get(inputUrl + '/robots.txt').text
# Making sure robots.txt isn't some fancy 404 page
if '<body' not in response:
# If you know it, you know it
matches = re.findall(r'Allow: (.*)|Disallow: (.*)', response)
if matches:
# Iterating over the matches, match is a tuple here
for match in matches:
# One item in match will always be empty so will combine both
# items
match = ''.join(match)
# If the URL doesn't use a wildcard
if '*' not in match:
url = inputUrl + match
# Add the URL to internal list for crawling
internal.add(url)
# Add the URL to robots list
robots.add(url)
print('%s URLs retrieved from robots.txt: %s' % (good, len(robots)))
# Makes request to sitemap.xml
response = requests.get(inputUrl + '/sitemap.xml').text
# Making sure robots.txt isn't some fancy 404 page
if '<body' not in response:
matches = xmlParser(response)
if matches: # if there are any matches
print('%s URLs retrieved from sitemap.xml: %s' % (
good, len(matches)))
for match in matches:
verb('Internal page', match)
# Cleaning up the URL and adding it to the internal list for
# crawling
internal.add(match)