
Commit 529ff63

updates helper function to defang urls

1 parent 558f530 · commit 529ff63

3 files changed, +15 -10 lines changed


crawler/processing/payload_processing.py (+1 -1)

@@ -87,7 +87,7 @@ def process(self, url, content):
                     shasum + file_ext, content)
                 self.logger.info(SubCrawlColors.CYAN +
                                  "[PAYLOAD] Saved file " +
-                                 SubCrawlHelpers.make_safe_http(url) +
+                                 SubCrawlHelpers.defang_url(url) +
                                  SubCrawlColors.RESET)
             except Exception as e:
                 self.logger.error("[PAYLOAD] " + str(e))
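For reference, here is what the old and new helpers produce for a sample URL. This is a hypothetical interactive session: the sample URL is invented and the import path is assumed from the repo layout; the behaviour follows the implementations shown in crawler/utils/helpers.py below.

# Hypothetical interactive session; module path and sample URL are assumptions
>>> from utils.helpers import SubCrawlHelpers
>>> SubCrawlHelpers.make_safe_http("http://evil.example.com/payload.exe")  # old helper, removed by this commit
'hxxp://evil.example.com/payload.exe'
>>> SubCrawlHelpers.defang_url("http://evil.example.com/payload.exe")      # new helper
'hxxp://evil.example[.]com/payload.exe'

The new helper defangs the last dot of the host as well as the scheme, so logged URLs are no longer clickable and no longer resolve as live indicators.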

crawler/subcrawl.py (+8 -7)

@@ -155,7 +155,7 @@ def main(argv):
                 scraped_domains.add(parsed.netloc)
             else:
                 logger.debug("[~] Domain already added to the scanning queue: "
-                             + str(parsed.netloc))
+                             + SubCrawlHelpers.defang_url(str(parsed.netloc)))
     else:
         logger.info("[ENGINE] Using file input for URL processing...")
         try:

@@ -172,7 +172,8 @@
                     scrape_urls.add(parsed_url)
                     scraped_domains.add(parsed.netloc)
                 else:
-                    logger.debug("[ENGINE] Domain already added to the scanning queue: " + str(parsed.netloc))
+                    logger.debug("[ENGINE] Domain already added to the scanning queue: "
+                                 + str(parsed.netloc))
         except Exception as e:
             logger.error("[ENGINE] Error reading input file for URL processing: " + str(e))
             sys.exit(-1)

@@ -188,7 +189,7 @@
     for start_url in scrape_urls:
         # This will add the full URL if it ends with an extension, then passes it along for parsing
         if start_url.endswith('.exe'):
-            logger.debug("[ENGINGE] Adding EXE URL directly: " + start_url)
+            logger.debug("[ENGINGE] Adding EXE URL directly: " + SubCrawlHelpers.defang_url(start_url))
         if start_url not in distinct_urls:
             distinct_urls.append(start_url)
             domain_urls.setdefault(parsed.netloc, []).append(start_url)

@@ -205,7 +206,7 @@
             for path in paths:
                 tmp_url = urljoin(tmp_url, path) + "/"

-                logger.debug("Generated new URL: " + tmp_url)
+                logger.debug("Generated new URL: " + SubCrawlHelpers.defang_url(tmp_url))

                 if tmp_url not in distinct_urls:
                     distinct_urls.append(tmp_url)

@@ -264,7 +265,7 @@ def scrape_manager(data):
     init_pages = domain_urls
     process_processing_modules = processing_modules

-    logger.debug("[ENGINE] Starting down path... " + domain_urls[0])
+    logger.debug("[ENGINE] Starting down path... " + SubCrawlHelpers.defang_url(domain_urls[0]))

     result_dicts = list()
     for url in domain_urls:

@@ -285,7 +286,7 @@ def scrape(start_url, s_data):
     try:
         scrape_domain = dict()
         request_start = datetime.datetime.now()
-        logger.debug("[ENGINE] Scanning URL: " + start_url)
+        logger.debug("[ENGINE] Scanning URL: " + SubCrawlHelpers.defang_url(start_url))
         resp = requests.get(start_url, timeout=SubCrawlHelpers.get_config(
             process_cfg, "crawler", "http_request_timeout"),
             headers=SubCrawlHelpers.get_config(process_cfg, "crawler",

@@ -338,7 +339,7 @@
         if next_page not in crawl_pages and next_page not in init_pages \
                 and not next_page.lower().endswith(tuple(SubCrawlHelpers.get_config(process_cfg, "crawler", "ext_exclude"))):
-            logger.debug("[ENGINE] Discovered: " + next_page)
+            logger.debug("[ENGINE] Discovered: " + SubCrawlHelpers.defang_url(next_page))
             crawl_pages.append(next_page)
             scrape(next_page, s_data)
         else:
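One of the touched hunks (around line 205) builds intermediate directory URLs with urljoin before logging them in defanged form. A minimal, self-contained sketch of that expansion follows; the start URL and the way the path segments are derived are assumptions, since the surrounding code is not shown in this diff.

from urllib.parse import urljoin, urlparse

# Invented example URL; the real crawler takes these from its input list
start_url = "http://malicious.example.com/a/b/payload.exe"

# Assumption: 'paths' holds the directory segments of the URL path
paths = [p for p in urlparse(start_url).path.split("/") if p][:-1]

tmp_url = "http://" + urlparse(start_url).netloc + "/"
for path in paths:
    tmp_url = urljoin(tmp_url, path) + "/"
    # After this commit the debug line is defanged, e.g.
    # "Generated new URL: hxxp://malicious.example[.]com/a/"
    print("Generated new URL: " + tmp_url)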

crawler/utils/helpers.py (+6 -2)

@@ -2,6 +2,7 @@
 import hashlib
 import re
 import sys
+from urllib.parse import urlparse

 # Source: https://codereview.stackexchange.com/questions/19663/http-url-validating
 valid_url = re.compile(

@@ -24,8 +25,11 @@ def save_content(file_name, data):
         with open(file_name, "wb") as file:
             file.write(data)

-    def make_safe_http(url):
-        return url.replace('http', 'hxxp')
+    def defang_url(url):
+        parsed_url = urlparse(url)
+        last_dot = parsed_url.netloc.rindex('.')
+        defanged = parsed_url.netloc[0:last_dot] + '[.]' + parsed_url.netloc[last_dot + 1:]
+        return url.replace(parsed_url.netloc, defanged).replace('http', 'hxxp')

     def is_valid_url(url):
         if valid_url.match(url):
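The committed implementation works for typical hostnames, but two edge cases are worth noting: netloc.rindex('.') raises ValueError when the host contains no dot (e.g. http://localhost/), and the trailing replace('http', 'hxxp') also rewrites any 'http' that happens to appear in the path or query string. A minimal sketch of a more defensive variant is shown below; it is purely illustrative and not part of this commit, and the name defang_url_safe is made up.

from urllib.parse import urlparse

def defang_url_safe(url):
    """Hypothetical, more defensive variant (not part of this commit)."""
    parsed = urlparse(url)
    host = parsed.netloc
    if '.' in host:
        # Bracket only the last dot of the host, as the committed helper does
        last_dot = host.rindex('.')
        host = host[:last_dot] + '[.]' + host[last_dot + 1:]
        url = url.replace(parsed.netloc, host, 1)
    # Defang only the scheme at the start of the URL, not 'http' elsewhere
    if url.startswith('https://'):
        url = 'hxxps://' + url[len('https://'):]
    elif url.startswith('http://'):
        url = 'hxxp://' + url[len('http://'):]
    return url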
