-
Notifications
You must be signed in to change notification settings - Fork 0
/
sub_count.py
31 lines (28 loc) · 1.09 KB
/
sub_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Prints the number of distinct subdomains associated with each root domain.
from collections import defaultdict
import csv
import sys
def count_subdomains(file_path, keywords):
    """Print how many distinct subdomain labels were seen per root domain.

    The third column of every row in the CSV file at ``file_path`` is read
    as a host name. Each host name is stripped of surrounding whitespace,
    collapsed to a single copy when it consists of the same text written
    twice back to back, and skipped when it contains any of ``keywords`` as
    a substring. For names with more than two dot-separated labels, the
    last two labels form the root domain and the first label is recorded as
    the subdomain.

    One line per root domain is printed as ``"<count> <domain>"``.

    Args:
        file_path: Path of the CSV file to read.
        keywords: Substrings to filter on; a host name containing any of
            them is ignored.

    Returns:
        None. Results are written to standard output.

    Raises:
        OSError: If ``file_path`` cannot be opened.
    """
    # newline='' is what the csv module expects so it can handle line
    # endings (including those inside quoted fields) itself.
    with open(file_path, 'r', newline='') as f:
        # Blank lines come back from csv.reader as [] and short rows have no
        # third column; skip both instead of failing with IndexError.
        hosts = [row[2] for row in csv.reader(f) if len(row) > 2]

    # Materialise once so a one-shot iterable is not exhausted after the
    # first host name.
    keywords = tuple(keywords)

    domains = defaultdict(set)
    for host in hosts:
        host = host.strip()

        # Collapse entries where the name appears twice in a row,
        # e.g. "a.b.ca.b.c" -> "a.b.c".
        half, remainder = divmod(len(host), 2)
        if remainder == 0 and host[:half] == host[half:]:
            host = host[:half]

        # Ignore any host name that contains one of the keywords.
        if any(keyword in host for keyword in keywords):
            continue

        parts = host.split('.')
        if len(parts) > 2:
            root_domain = '.'.join(parts[-2:])
            # NOTE(review): only the leftmost label is recorded, so
            # "a.b.example.com" and "a.c.example.com" count as one
            # subdomain ("a") -- confirm this is the intended metric.
            domains[root_domain].add(parts[0])

    for domain, subdomains in domains.items():
        print(f'{len(subdomains)} {domain}')
if __name__ == '__main__':
    # The CSV path is the only required argument; exit with a usage message
    # (written to stderr, non-zero status) instead of an IndexError traceback
    # when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} <csv_file>')
    file_path = sys.argv[1]
    # Host names containing any of these substrings are left out of the count.
    ignore_list = ['word1', 'word2']
    count_subdomains(file_path, ignore_list)