scraper.py
"""Extracts URLs found in the pdf file located at
https://www.emta.ee/ariklient/registreerimine-ettevotlus/hasartmangukorraldajale/blokeeritud-hasartmangu
and writes them to a .txt blocklist
"""

import datetime
import logging
import re

import requests
import tldextract
from pdfminer.high_level import extract_text

logger = logging.getLogger()
logging.basicConfig(level=logging.INFO, format="%(message)s")


def current_datetime_str() -> str:
    """Current time's datetime string in UTC.

    Returns:
        str: Timestamp in strftime format "%d_%b_%Y_%H_%M_%S-UTC"
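
    Example (illustrative doctest; the timestamp below is assumed, and the
    check is skipped because the value depends on the current time):
        >>> current_datetime_str()  # doctest: +SKIP
        '01_Jan_2025_00_00_00-UTC'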
"""
return datetime.datetime.now(datetime.UTC).strftime("%d_%b_%Y_%H_%M_%S-UTC")


def clean_url(url: str) -> str:
    """Remove zero-width spaces, leading/trailing whitespace, trailing slashes,
    and HTTP(S) scheme prefixes from a URL.

    Args:
        url (str): URL

    Returns:
        str: URL without zero-width spaces, leading/trailing whitespace,
        trailing slashes, and HTTP(S) scheme prefixes
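
    Example (illustrative doctest; the inputs are assumed, not taken from the
    source PDF):
        >>> clean_url("  HTTPS://Example.com/path/  ")
        'Example.com/path'
        >>> clean_url("http://example.org//")
        'example.org'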
"""
removed_zero_width_spaces = re.sub(r"[\u200B-\u200D\uFEFF]", "", url)
removed_leading_and_trailing_whitespaces = removed_zero_width_spaces.strip()
removed_trailing_slashes = removed_leading_and_trailing_whitespaces.rstrip("/")
removed_https = re.sub(r"^[Hh][Tt][Tt][Pp][Ss]:\/\/", "", removed_trailing_slashes)
removed_http = re.sub(r"^[Hh][Tt][Tt][Pp]:\/\/", "", removed_https)
return removed_http


def extract_urls() -> set[str]:
    """Extract URLs found in
    https://ncfailid.emta.ee/s/6BEtzQAgFH4y349/download/Blokeeritud_domeeninimed.pdf

    Returns:
        set[str]: Unique URLs
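
    Example (illustrative; the domains shown are hypothetical, and the check
    is skipped because it requires network access):
        >>> extract_urls()  # doctest: +SKIP
        {'casino-one.com', 'casino-two.net'}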
"""
    try:
        endpoint: str = "https://ncfailid.emta.ee/s/6BEtzQAgFH4y349/download/Blokeeritud_domeeninimed.pdf"
        res = requests.get(endpoint, verify=True, timeout=120)
        res.raise_for_status()  # surface HTTP errors instead of parsing an error page
        with open("source.pdf", "wb") as f:
            f.write(res.content)
        text = extract_text("source.pdf")
        # Keep non-numeric lines; purely numeric lines are page numbers
        entries = [
            f
            for e in text.split("\n")
            if not (f := e.strip()).isnumeric()  # type: ignore
        ]
        # Keep cleaned entries that parse to a registrable domain
        urls = set(
            maybe_url_cleaned
            for maybe_url in entries
            if (maybe_url_cleaned := clean_url(maybe_url))  # type: ignore
            and len(tldextract.extract(maybe_url_cleaned).registered_domain)
        )
        return urls
    except Exception as error:
        logger.error(error)
        return set()


if __name__ == "__main__":
    urls: set[str] = extract_urls()
    if not urls:
        raise ValueError("URL extraction failed")
    registered_domains: set[str] = set(
        tldextract.extract(url).registered_domain for url in urls
    )
    timestamp: str = current_datetime_str()

    filename = "blocklist.txt"
    with open(filename, "w") as f:
        f.write("\n".join(sorted(urls)))
    logger.info("%d URLs written to %s at %s", len(urls), filename, timestamp)

    filename = "blocklist_UBL.txt"
    with open(filename, "w") as f:
        f.write("\n".join(f"*://*.{r}/*" for r in sorted(registered_domains)))
    logger.info(
        "%d Registered Domains written to %s at %s",
        len(registered_domains),
        filename,
        timestamp,
    )
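
# Illustrative note (an assumption, not stated in the source): blocklist_UBL.txt
# appears to use WebExtension-style match patterns, so a hypothetical registered
# domain "casino.example" would become the line "*://*.casino.example/*".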