-- crawl.sql
.load sqlitepipe/sqlitepipe
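-- as used throughout this script, sqlitepipe's pipe() SQL function:
--   pipe(cmd) runs cmd through the shell and returns its stdout;
--   pipe(cmd, input) additionally feeds input to the command's stdin.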
/*
-- For manually testing a single url, comment out the matching CTE below and use this block instead:
with whitelisted_url(id,domain,url,path,rank) as (
select
1,
'https://www.example.com',
'https://www.example.com/?var=val',
'/?var=val',
0.00015
),
*/
with recursive whitelisted_url(id,domain,url,path,rank) as (
select
whitelist.id,
whitelist.domain,
--page.id,
page.url,
page.path,
page_rank.rank
from whitelist
left outer join (
select
id,
url,
printf("%s",pipe('uri-parser/uri-parser --protocol --host ' || quote(page.url) || ' \
| awk ''BEGIN{FS=" "} {printf $1 "://" $2}''')
) as domain,
printf("%s",pipe('uri-parser/uri-parser --path --query ' || quote(page.url) || ' \
| awk ''BEGIN{FS=" "} {if($1==$2){print $1;}else{print $1 " " $2;}}'' \
| tr -d ''\n'' | tr '' '' ?')
) as path
from page
where content is null -- TODO add a last_fetched field and query by it instead
and is_retired is not 1
) page on page.domain = whitelist.domain
left join page_rank on page_rank.id = page.id
where page.url is not null
order by page_rank.rank desc
limit 1
),
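-- e.g. for url 'https://www.example.com/?var=val' the pipes above should yield
-- domain 'https://www.example.com' and path '/?var=val' (cf. the test block at the top of this file)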
-- TODO these rules are only checked against whitelisted_url, NOT against every link we extract from it #fixme
-- maybe we could avoid returning the globbed pattern here and instead join on it as-needed?
disallow_rules(pattern,is_match) as (
select
pattern,
glob(pattern || '*' ,(select path from whitelisted_url)) as is_match
from rule
where
whitelist_id = (select id from whitelisted_url)
and is_allowed = 0
),
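-- e.g. a stored (robots.txt-style) disallow pattern '/private' gives
-- glob('/private*','/private/page') = 1, flagging the candidate path as disallowed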
disallow_match(num_matches) as (
select
count(is_match)
from
disallow_rules
where is_match = 1 -- glob() returns an integer; comparing against the string '1' never matches
),
fetch_content(url,full_content) as (
select
url,
case when (
(select num_matches from disallow_match) >= 1
) then printf("%s","") -- just return the empty str and we'll catch it later
else
-- the '-nonumbers' option might help w/ full-text search,
-- but it removes the 'References' delimiter that makes separating links & content easy
-- TODO we might want to add in hiddenlinks later
-- TODO handle response code
pipe('lynx -dump -nostatus -notitle -unique_urls --hiddenlinks=ignore ' || quote(url))
end as full_content
from whitelisted_url
),
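-- lynx -dump output looks roughly like:
--   ...page text with links marked [1]like this...
--   References
--      1. https://www.example.com/somewhere
-- the two CTEs below split that output on the 'References' line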
-- gets the content from lynx's output
parse_content(url,content) as (
select
url,
-- full_content is passed on stdin (pipe's 2nd arg) rather than quoted into the shell command
-- reverse the lines, delete up to and including the 'References' line (i.e. the
-- original trailing link list), then reverse back: only the page text survives
printf("%s",pipe('tac | sed ''0,/^References/d'' | tac',full_content)) as content
from fetch_content
),
-- gets the links from lynx's output
parse_links(url,links) as (
select
url,
printf("%s",pipe('tac | sed ''/^References/q'' | head -n -2 | sed ''s/\s\+[0-9]\+\.\s//''',full_content)) as links
from fetch_content
),
-- https://stackoverflow.com/questions/34659643/split-a-string-into-rows-using-pure-sqlite
-- https://stackoverflow.com/users/11654/cl
-- creates a tmp tbl w/ the parsed links
split_links(word,str,hasnewline) as (
values('',(select links from parse_links),1)
union all
select
substr(str, 0,
case when instr(str, x'0a')
then instr(str, x'0a')
else length(str)+1 end),
ltrim(substr(str, instr(str, x'0a')), x'0a'),
instr(str, x'0a')
from split_links
where hasnewline
),
link_tbl(link) as (
select trim(word) from split_links where word != ''
),
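-- e.g. links = 'https://a.example/' || x'0a' || 'https://b.example/'
-- splits into one link_tbl row per url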
defragment_links(url) as (
select
-- get the distinct, defragmented urls
distinct(printf("%s",pipe('xargs uri-parser/uri-parser --defragment | tr -d ''\n''',link))) as url
from link_tbl
),
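-- e.g. 'https://www.example.com/page#section' becomes 'https://www.example.com/page',
-- so a page linked under several fragments is only recorded once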
-- validates that we have a protocol and a host (avoid javascript:void(0) for example)
validate_links(url) as (
select
case when (
pipe('xargs uri-parser/uri-parser --protocol --host \
| awk ''{if(NF==2){printf "1"}}''',url)
) then url
end as url
from defragment_links
),
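-- e.g. 'javascript:void(0)' has no host, so the awk prints nothing, the case
-- yields null, and the coalesce() filter below drops the row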
-- this does not check that this inserted link is allowed to be crawled
-- only creates a new record for it
link_inserts(link_insert_queries) as (
select
printf("%s",x'0a' || 'insert into page (url) values (' || quote(url) || ');')
|| printf("%s",x'0a' || 'insert into link (src_page_id,dest_page_id) values ('
|| '(select id from page where url = ' || (select quote(url) from whitelisted_url) || '),'
|| '(select id from page where url = ' || quote(url) || '));') as link_insert_queries
from validate_links
where coalesce(url,'') <> ''
),
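-- a generated pair looks like (values illustrative):
--   insert into page (url) values ('https://www.example.com/about');
--   insert into link (src_page_id,dest_page_id) values (
--     (select id from page where url = 'https://www.example.com/?var=val'),
--     (select id from page where url = 'https://www.example.com/about'));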
content_insert(content_insert_query) as (
select
printf("%s",'update page set content = ' || quote(content) || ' where url = ' || quote(url) || ';')
from
parse_content
),
full_insert(insert_queries) as (
select
-- if the url was disallowed or yielded no content, retire the page instead of updating it
case when (
coalesce(whitelisted_url.url,'') = ''
or coalesce(parse_content.content,'') = ''
) then printf("%s",'update page set is_retired = 1 where url = ' || quote(whitelisted_url.url) || ';')
else
printf("%s",x'0a' || 'begin transaction;')
|| x'0a' || content_insert.content_insert_query
-- aggregate the link inserts in a subquery: cross-joining link_inserts here
-- would yield no usable row at all when a page has no valid outbound links
|| coalesce((select group_concat(link_insert_queries,'') from link_inserts),'')
|| printf("%s",x'0a' || 'commit;')
end as insert_queries
from content_insert,whitelisted_url,parse_content
)
select
--whitelisted_url.*,
--disallow_rules.*,
--fetch_content.*,
--parse_content.*
--parse_links.*
--link_tbl.*,
--validate_links.*,
--link_inserts.*,
--content_insert.*,
-- for normal operation, return only the full_insert queries; uncomment the CTEs above and in the from-clause below to debug
full_insert.insert_queries
from
--whitelisted_url,
--disallow_rules,
--fetch_content,
--parse_content,
--parse_links,
--link_tbl,
--validate_links,
--link_inserts,
--content_insert,
full_insert
--limit 10
;
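-- One crawl step: run this script and execute the SQL it emits against the same
-- database, e.g. (a sketch; assumes the sqlite3 CLI and a database file crawl.db):
--   sqlite3 crawl.db < crawl.sql | sqlite3 crawl.db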