parser.py
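"""Tor hidden-service crawler.

Starting from a seed .onion page, follows <a> links through the local Tor
SOCKS proxy, records each discovered .onion domain and its page <title> in a
SQLite database, and reports unhandled errors to Sentry.
"""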
import sqlite3
from urllib.parse import urlparse

import requests
import sentry_sdk
from dotenv import dotenv_values
from requests.exceptions import (
    SSLError,
    ConnectTimeout,
    ConnectionError,
    ReadTimeout,
    InvalidSchema,
)
from bs4 import BeautifulSoup

CONFIG = dotenv_values()

sentry_sdk.init(
    CONFIG["SENTRY_URL"],
    traces_sample_rate=1.0,
)

# Route every request through the local Tor SOCKS proxy.
PROXIES = {
    "http": "socks5h://127.0.0.1:9050",
    "https": "socks5h://127.0.0.1:9050",
}

# Links that have already been queued at least once.
GLOBAL_UNIQ = set()


class BadContentType(Exception):
    pass


def create_schema(connect):
    cur = connect.cursor()
    cur.execute(
        """
        CREATE TABLE IF NOT EXISTS sites
        (id integer primary key autoincrement, parent_id integer, domain text unique, title text)
        """
    )
    connect.commit()


def get_first_domain(link):
    # Returns the last label of the host name (the top-level domain),
    # used below to keep only .onion links.
    parse_result = urlparse(link)
    domain = parse_result.netloc.split(".")[-1]
    return domain


def prepare_link(raw, response):
    href = raw.get("href", "")
    if not href:
        return ""
    link = href
    # Resolve relative hrefs against the page that was just fetched.
    if not href.startswith(("http://", "https://")):
        parse_result = urlparse(response.request.url)
        link = f"{parse_result.scheme}://{parse_result.netloc}{href}"
    # Stay inside the Tor network: drop anything that is not a .onion link.
    if get_first_domain(link) != "onion":
        return ""
    return link


def fetch_links(response):
    soup = BeautifulSoup(response.content, "html.parser")
    links = []
    for link in soup.find_all("a"):
        link = prepare_link(link, response)
        if link:
            links.append(link)
    return links


def parse_title(response):
    soup = BeautifulSoup(response.content, "html.parser")
    title = soup.find("title")
    if title:
        return title.text
    return None


def request_page(url):
    # Cheap HEAD request first: skip downloads that are not HTML pages.
    # Content-Type may carry a charset suffix, e.g. "text/html; charset=utf-8".
    response = requests.head(url, timeout=20, proxies=PROXIES, allow_redirects=False)
    ct = response.headers.get("Content-Type", "")
    if not ct.startswith("text/html"):
        raise BadContentType(ct)
    return requests.get(url, timeout=20, proxies=PROXIES, allow_redirects=False)


def add_site(cur, domain, parent_id=None):
    cur.execute(
        """
        INSERT INTO sites(parent_id, domain) VALUES (?, ?)
        """,
        (parent_id, domain),
    )


def get_site(cur, domain):
    cur.execute("select id from sites where domain=?", (domain,))
    result = cur.fetchone()
    if result is None:
        return None
    return result[0]


def set_title(cur, id, title):
    cur.execute("update sites set title=? where id=?", (title, id))


def parser(connect):
    url = "http://s4k4ceiapwwgcm3mkb6e4diqecpo7kvdnfr5gg7sph7jjppqkvwwqtyd.onion"
    queue = set()
    cur = connect.cursor()
    # Seed the queue from the start page and register every linked domain.
    for link in fetch_links(request_page(url)):
        queue.add(link)
        GLOBAL_UNIQ.add(link)
        parse_result = urlparse(link)
        if not get_site(cur, parse_result.netloc):
            add_site(cur, parse_result.netloc)
    connect.commit()
    # Crawl loop: pop a link, store its title, queue any newly discovered .onion links.
    while queue:
        print("queue size", len(queue))
        link = queue.pop()
        db_id = get_site(cur, urlparse(link).netloc)
        print("work with", link)
        try:
            page = request_page(link)
        except (SSLError, ConnectionError, ConnectTimeout, ReadTimeout, InvalidSchema):
            print("skip with error")
            continue
        except BadContentType:
            print("skip page with bad content type")
            continue
        title = parse_title(page)
        set_title(cur, db_id, title)
        for link in fetch_links(page):
            if link in GLOBAL_UNIQ:
                continue
            GLOBAL_UNIQ.add(link)
            queue.add(link)
            if not get_site(cur, urlparse(link).netloc):
                add_site(cur, urlparse(link).netloc, db_id)
        connect.commit()


if __name__ == "__main__":
    con = sqlite3.connect("parser.db")
    try:
        create_schema(con)
        parser(con)
    finally:
        con.close()