Skip to content

Commit

Permalink
fix #12, fix #10
Browse files Browse the repository at this point in the history
  • Loading branch information
s0md3v authored Feb 27, 2022
1 parent 96e1079 commit 31ba4a7
Showing 1 changed file with 31 additions and 45 deletions.
76 changes: 31 additions & 45 deletions uro/uro.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
import re
import sys
import argparse
from urllib.parse import urlparse

from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE, SIG_DFL)

urlmap = {}
params_seen = []
patterns_seen = []
content_patterns = []

blacklist = r'(post|blog)s?|docs|support/|/(\d{4}|pages?)/\d+/'
static_exts = ('js', 'css', 'png', 'pdf', 'jpg', 'jpeg', 'ico', 'bmp', 'svg', 'gif')
re_int = re.compile(r'/\d+([?/]|$)')
re_content = re.compile(r'(post|blog)s?|docs|support/|/(\d{4}|pages?)/\d+/')
static_exts = ('js', 'css', 'png', 'jpg', 'jpeg', 'svg',
'ico','webp', 'ttf', 'otf', 'woff', 'gif',
'pdf', 'bmp', 'eot', 'mp3', 'woff2', 'mp4', 'avi'
)


def params_to_dict(params: str) -> list:
Expand Down Expand Up @@ -47,29 +51,14 @@ def compare_params(og_params: list, new_params: dict) -> bool:
return set(new_params.keys()) - og_set


def is_seen(path: str) -> bool:
    """
    checks if a url matches any recorded patterns

    path: the path component of the url to test
    returns True when at least one regex in patterns_seen matches
    the path, False otherwise (including when nothing is recorded yet)
    """
    # only the path is available here, so there are no query params to hand
    # to compare_params(og_params, new_params); report the pattern match
    # itself and always return a real bool rather than falling through to None
    return any(re.search(pattern, path) for pattern in patterns_seen)

def is_content(path: str) -> bool:
"""
checks if a path is likely to contain
human written content e.g. a blog
"""
if path.count('-') > 3:
new_parts = []
for part in re.escape(path).split('/'):
if part.count('-') > 3:
new_parts.append('[^/]+')
else:
new_parts.append(part)
content_patterns.append('/'.join(new_parts))
return True
for part in path.split('/'):
if part.count('-') > 3:
return True
return False


Expand Down Expand Up @@ -109,53 +98,50 @@ def matches_patterns(path: str) -> bool:
return False


def is_blacklisted(path: str) -> bool:
def has_bad_ext(path: str) -> bool:
"""
checks if the url matches the blacklist regex
checks if a url has a blacklisted extension
"""
return re.search(blacklist, path)
return False if '/' in path.split('.')[-1] else path.lower().endswith(static_exts)


def has_bad_ext(path: str) -> bool:
def is_new_param(params: list) -> bool:
"""
checks if a url has a blacklisted extension
checks if there's an unseen param within given params
"""
return False if '/' in path.split('.')[-1] else path.lower().endswith(static_exts)
for param in params:
if param in params_seen:
return False
return True


def main():
if not sys.stdin.isatty():
for line in sys.stdin:
parsed = urlparse(line.strip())
host = parsed.scheme + '://' + parsed.netloc
path, params = parsed.path, params_to_dict(parsed.query)
if host not in urlmap:
urlmap[host] = {}
if has_bad_ext(path):
path, params = parsed.path, params_to_dict(parsed.query)
has_new_param = False if not params else is_new_param(params.keys())
new_params = [param for param in params.keys() if param not in params_seen]
params_seen.extend(new_params)
if has_bad_ext(path) or re_content.search(path) or is_content(path):
continue
if not params:
if is_content(path) or is_blacklisted(path):
continue
if (not params or has_new_param) and re_int.match(path):
pattern = create_pattern(path)
if matches_patterns(path):
continue
if '\\d+' in pattern and not pattern_exists(pattern):
if not pattern_exists(pattern):
patterns_seen.append(pattern)
if path not in urlmap[host]:
elif path not in urlmap[host]:
urlmap[host][path] = [params] if params else []
elif params and compare_params(urlmap[host][path], params):
elif has_new_param or compare_params(urlmap[host][path], params):
urlmap[host][path].append(params)
for host, value in urlmap.items():
for path, params in value.items():
if params:
for param in params:
print(host + path + dict_to_params(param))
elif '-' in path:
matched = False
for pattern in content_patterns:
if re.search(pattern, path):
matched = True
break
if not matched:
print(host + path)
else:
print(host + path)

0 comments on commit 31ba4a7

Please sign in to comment.