zap.py (forked from s0md3v/Photon)
import re
import requests
import random

from core.utils import verb, xml_parser
from core.colors import run, good
from plugins.wayback import time_machine


def zap(input_url, archive, domain, host, internal, robots, proxies):
"""Extract links from robots.txt and sitemap.xml."""
if archive:
print('%s Fetching URLs from archive.org' % run)
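        # The domain-wide lookup is disabled (the condition below is always
        # False), so only the exact host is queried against archive.org.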
        if False:
            archived_urls = time_machine(domain, 'domain')
        else:
            archived_urls = time_machine(host, 'host')
        print('%s Retrieved %i URLs from archive.org' % (
            good, len(archived_urls) - 1))
        for url in archived_urls:
            verb('Internal page', url)
            internal.add(url)
    # Request robots.txt
    response = requests.get(input_url + '/robots.txt',
                            proxies=random.choice(proxies)).text
    # Make sure robots.txt isn't some fancy 404 page
    if '<body' not in response:
        # Extract the path from every Allow/Disallow rule
        matches = re.findall(r'Allow: (.*)|Disallow: (.*)', response)
        if matches:
            # Each match is a tuple of the two capture groups; one of them is
            # always empty, so joining both yields the rule's path
            for match in matches:
                match = ''.join(match)
                # Skip rules that contain a wildcard
                if '*' not in match:
                    url = input_url + match
                    # Add the URL to the internal set for crawling
                    internal.add(url)
                    # Add the URL to the robots set
                    robots.add(url)
            print('%s URLs retrieved from robots.txt: %s' % (good, len(robots)))
    # Request sitemap.xml
    response = requests.get(input_url + '/sitemap.xml',
                            proxies=random.choice(proxies)).text
    # Make sure sitemap.xml isn't some fancy 404 page
    if '<body' not in response:
        matches = xml_parser(response)
        if matches:
            print('%s URLs retrieved from sitemap.xml: %s' % (
                good, len(matches)))
            for match in matches:
                verb('Internal page', match)
                # Add each sitemap URL to the internal set for crawling
                internal.add(match)
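

# Usage sketch (assumption, not part of the upstream plugin). Photon's crawler
# normally calls zap() with its shared result sets and proxy list; the target
# URL and the single empty proxy entry below are placeholders for illustration.
if __name__ == '__main__':
    internal_pages, robots_rules = set(), set()
    zap('http://example.com', archive=False, domain='example.com',
        host='example.com', internal=internal_pages, robots=robots_rules,
        proxies=[None])
    print('%s Crawlable URLs collected: %i' % (good, len(internal_pages)))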