# This script fetches the URL and subdomains of adult websites and catches any URLs that may be used for age verification or GeoIP checks.
# This script is experimental and a work in progress.
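# Third-party dependencies (assumed to be installed separately): requests, beautifulsoup4 (imported as bs4), tqdm, validators.
# They can typically be installed with: pip install requests beautifulsoup4 tqdm validators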
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from tqdm import tqdm
import urllib3
import validators
# Suppress InsecureRequestWarning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Set to store unique domains
unique_domains = set()
def get_base_domain(url):
    parsed_url = urlparse(url)
    netloc_parts = parsed_url.netloc.split('.')
    if len(netloc_parts) > 2:
        # Handles subdomains like sub.example.com -> example.com
        return '.'.join(netloc_parts[-2:])
    return parsed_url.netloc  # Returns the original if no subdomain exists
def log_unique_url(url, file_name):
    # Extract the domain of the URL
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    if len(domain) > 2:
        # Check if the domain is already in the set
        if domain not in unique_domains:
            unique_domains.add(domain)  # Add domain to the set
            with open(file_name, "a") as file:
                if file.tell() == 0:  # Check if file is empty
                    file.write(domain)  # Write without newline if file is empty
                else:
                    file.write("\n" + domain)  # Append with newline
            print(f"Added new domain: {domain}")
def fetch_resources(url, keyword, grab_all, go_nuclear):
    base_domain = get_base_domain(url)
    # Dictionary to store resources in memory
    resource_cache = {}
    print(f"Fetching URL: {url}")
    try:
        response = requests.get(url, verify=False)
        response.raise_for_status()
        html_content = response.text
    except Exception as e:
        print(f"Error fetching the main page: {url} - {e}")
        return
    print("Parsing HTML...")
    soup = BeautifulSoup(html_content, 'html.parser')
    # Collect resource URLs
    resources = []
    nuclear_count = 0
    for tag, attr in [('script', 'src'), ('link', 'href'), ('img', 'src')]:
        for resource in soup.find_all(tag):
            resource_url = resource.get(attr)
            if resource_url:
                resources.append(urljoin(url, resource_url))
    # Fetch and store resources with progress bar
    print("Fetching resources...")
    for resource_url in tqdm(resources, desc="Fetching", unit="file"):
        try:
            res = requests.get(resource_url, verify=False)
            res.raise_for_status()
            # Store content in memory
            resource_cache[resource_url] = res.text
        except Exception as e:
            print(f"Error fetching resource: {resource_url} - {e}")
print("Scanning resources...")
for name, content in tqdm(resource_cache.items(), desc="Scanning", unit="file"):
if keyword.lower() in content.lower():
log_unique_url(name, f"{base_domain}.txt")
elif grab_all or go_nuclear:
parsed_url = urlparse(name)
full_domain = parsed_url.netloc
if full_domain.endswith(base_domain):
log_unique_url(resource_url, f"{base_domain}.txt")
elif go_nuclear:
if full_domain not in unique_domains:
nuclear_count += 1
log_unique_url(resource_url, f"{base_domain}.nuclear.txt")
print(f"{len(unique_domains)} domain(s) has been added to {base_domain}.txt")
if go_nuclear and nuclear_count != 0:
print(f"\nSince you chose to go nuclear on querying, it is separated at {base_domain}.nuclear.txt\n{nuclear_count} Unrelated domain(s) has been added to the list.")
elif go_nuclear and nuclear_count == 0:
print("You chose to go nuclear on querying, but no unrelated domains was found.")
if __name__ == "__main__":
try:
website_url = None
while not website_url:
website_url = input("Enter website URL (must include the http(s)://): ")
if not validators.url(website_url):
print("Error: Invalid URL format.")
exit("\n")
search_keyword = input("Enter keyword to search for (optional, by default it would search for \"age\"): ")
grab_all = None
while not grab_all:
grab_all_choice = input(f"(Recommended) Grab all domains and subdomains related to {website_url}, regardless of the search keyword? (Y/N): ")
if "y" in grab_all_choice.lower():
grab_all = True
break
if "n" in grab_all_choice.lower():
grab_all = False
break
else:
print("Invalid option, please type (Y)es or (N)o")
go_nuclear = None
while not go_nuclear:
go_nuclear_choice = input(f"(Not Recommended, Nuclear/Aggressive option) Grab ALL domains found upon query, regardless of relation or keyword (THIS WILL CATCH **EVERYTHING** IT FINDS) (Y/N): ")
if "y" in go_nuclear_choice.lower():
go_nuclear = True
break
if "n" in go_nuclear_choice.lower():
go_nuclear = False
break
else:
print("Invalid option, please type (Y)es or (N)o")
if not search_keyword:
search_keyword = "age"
fetch_resources(website_url, search_keyword, grab_all, go_nuclear)
except KeyboardInterrupt:
exit("\n")