-
Notifications
You must be signed in to change notification settings - Fork 0
/
oeis_drafts.pl
173 lines (135 loc) · 5.29 KB
/
oeis_drafts.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/perl
# Daniel "Trizen" Șuteu
# Date: 07 April 2019
# https://github.com/trizen
# Get the list of OEIS drafts and generate an HTML file, highlighting the sequences that need more terms.
use 5.014;
use strict;
use warnings;
use LWP::UserAgent::Cached;
use HTML::Entities qw(decode_entities encode_entities);
require LWP::UserAgent;
require HTTP::Message;
# Compile-time switch: set to a true value to send all requests through
# the Tor proxy (127.0.0.1:9050).
use constant USE_TOR_PROXY => 0;
# Directory handed to the caching user agent as its `cache_dir`.
my $cache_dir = 'cache';

# Create the directory on first run.  Fail loudly when it cannot be
# created, instead of continuing with a cache location that does not exist.
if (not -d $cache_dir) {
    mkdir($cache_dir)
      or die "Cannot create cache directory '$cache_dir': $!\n";
}
# Callback for the `nocache_if` option: tells the caching agent which
# responses must not be stored.  Returns 1 for server errors (5xx), for
# unauthorized (401) responses and for anything that is not a GET request;
# returns nothing otherwise.
my $skip_cache_for = sub {
    my ($response) = @_;
    my $status = $response->code;

    if (   $status >= 500                             # bad response
        or $status == 401                             # unauthorized
        or $response->request->method ne 'GET') {     # cache only GET requests
        return 1;
    }

    return;
};

# Caching user agent, used for the individual draft pages.
my $lwp = LWP::UserAgent::Cached->new(
    timeout       => 60,
    show_progress => 1,
    agent         => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
    cache_dir     => $cache_dir,
    ssl_opts      => {verify_hostname => 1, SSL_version => 'TLSv1_3'},
    nocache_if    => $skip_cache_for,
);
# Options for the plain (non-caching) user agent; it is used for the draft
# listing pages.
my %uncached_agent_opts = (
    timeout       => 60,
    show_progress => 1,
    agent         => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
    ssl_opts      => {verify_hostname => 1, SSL_version => 'TLSv1_3'},
);

my $lwp_uc = LWP::UserAgent->new(%uncached_agent_opts);
# Settings shared by both user agents: advertise every content encoding
# HTTP::Message can decode, and reuse connections through one common
# connection cache.
{
    require LWP::ConnCache;

    my $encodings = HTTP::Message::decodable();

    my $shared_conn_cache = LWP::ConnCache->new;
    $shared_conn_cache->total_capacity(undef);    # no limit

    for my $agent ($lwp, $lwp_uc) {
        $agent->default_header('Accept-Encoding' => $encodings);
        $agent->conn_cache($shared_conn_cache);
    }
}
# When the Tor switch is enabled, send HTTP and HTTPS traffic of both user
# agents through the local SOCKS proxy.
if (USE_TOR_PROXY) {
    for my $agent ($lwp, $lwp_uc) {
        $agent->proxy(['http', 'https'], "socks://127.0.0.1:9050");
    }
}
# IDs (A-numbers) of all draft sequences, in the order the listing returns them.
my @all_ids;

# Offset of the next listing page to request; advanced by 100 per page.
my $start = 0;

# Walk the paginated draft listing with the non-caching agent until a page
# yields no draft IDs.
while (1) {
    my $page_url = "https://oeis.org/draft?start=$start";
    my $response = $lwp_uc->get($page_url);

    # A failed request would otherwise look exactly like "no more drafts"
    # and silently truncate the list, so report it before stopping.
    if (not $response->is_success) {
        warn "Failed to fetch $page_url: ", $response->status_line, "\n";
        last;
    }

    # decoded_content() may return undef when the body cannot be decoded.
    my $content = $response->decoded_content // '';

    # In list context a /g match returns every captured draft ID on the page.
    my @ids = $content =~ m{<td><a href="/draft/(A\d+)">A\d+</a>}g;

    @ids or last;
    push @all_ids, @ids;
    $start += 100;
}

say "Found: ", scalar(@all_ids), " ids";
# Open the HTML report for writing (UTF-8 output layer) and abort when the
# file cannot be created, rather than printing to an unopened handle.
open(my $fh, '>:utf8', 'links.html')
  or die "Cannot open links.html for writing: $!\n";

# Static document header; the per-draft entries are appended after it.
print $fh <<'EOF';
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head>
<style>
tt { font-family: monospace; font-size: 100%; }
p.editing { font-family: monospace; margin: 10px; text-indent: -10px; word-wrap:break-word;}
p { word-wrap: break-word; }
</style>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>OEIS links</title>
</head>
<body bgcolor=#ffffff>
EOF
#~ say $fh "<ul>";
# Strip markup from an HTML fragment and normalise its whitespace.
#
# Takes one string.  Everything between a '<' and the nearest following '>'
# is removed (tags may span several lines), then runs of whitespace are
# collapsed to single spaces and leading/trailing whitespace is dropped.
# Returns the cleaned string.
sub remove_tags {
    my ($str) = @_;

    (my $plain = $str) =~ s/<.*?>//gs;

    # split(' ', ...) skips leading whitespace and splits on whitespace runs.
    my @words = split(' ', $plain);

    return join(' ', @words);
}
# Running counter used to number the generated entries (starts at 1).
my $k = 1;

# For every draft ID: fetch the draft page through the caching agent, pull
# out the sequence name and author, and append one entry to the HTML report.
# Drafts carrying the "more" or "hard" keyword are rendered big and bold.
foreach my $id (@all_ids) {

    my $url      = "https://oeis.org/draft/$id";
    my $response = $lwp->get($url);

    # Report transport/HTTP failures explicitly; the entry is still written
    # (with whatever could be extracted) so the link is not lost.
    if (not $response->is_success) {
        warn "Failed to fetch $url: ", $response->status_line, "\n";
    }

    # decoded_content() may return undef when the body cannot be decoded;
    # fall back to an empty string so the matches below stay warning-free.
    my $content = $response->decoded_content // '';

    # Highlight flag: set when the KEYWORD section contains a "more" or
    # "hard" keyword span.  No /s here, so `.*?` stays on a single line.
    my $more = 0;

    if ($content =~ m{<div class=sectname>KEYWORD</div>\s*<div class=sectbody>\s*.*?<span title=".*?">(?:more|hard)</span>}) {
        $more = 1;
    }

    my $author = '';
    my $name   = '';

    # The leading greedy `.*` (with /s) makes each pattern latch onto the
    # last place in the page where the rest of the pattern can match.
    #<<<
    if (   $content =~ m{.*<div class=sectname>NAME</div>\s*<div class=sectbody>\s*<p class="diffs"><tt><span style="color: #\d+;">(.*?)</span>}s
        or $content =~ m{.*<div class=sectname>NAME</div>\s*<div class=sectbody>\s*<p class="diffs"><tt><del>.*?</del></tt></p>\s*<p class="diffs"><tt>(.*?)</tt></p>}s
        or $content =~ m{.*<div class=sectname>NAME</div>\s*<div class=sectbody>\s*<p class="diffs"><tt><span style="color: #\d+;">(.*?)</span></tt></p>}s
        or $content =~ m{.*<div class=sectname>NAME</div>\s*<div class=sectbody>\s*<p class="diffs"><tt>(.*?)</tt></p>}s) {
        $name = remove_tags($1);
    }
    else {
        warn "Failed to extract name for ID: $id\n";
    }
    #>>>

    #<<<
    if (   $content =~ m{.*<div class=sectname>AUTHOR</div>\s*<div class=sectbody>\s*<p class="diffs"><tt><span style="color: #\d+;"><a href="/wiki/User:.*?">(.*?)</a>}s
        or $content =~ m{.*<div class=sectname>AUTHOR</div>\s*<div class=sectbody>\s*<p class="diffs"><tt><ins><a href="/wiki/User:.*?">(.*?)</a>}s
        or $content =~ m{.*<div class=sectname>AUTHOR</div>\s*<div class=sectbody>\s*<p class="diffs"><tt><span style="color: #\d+;">(.*?)</span></tt></p>}s
        or $content =~ m{.*<div class=sectname>AUTHOR</div>\s*<div class=sectbody>\s*<p class="diffs"><tt>(.*?)</tt></p>}s
       ) {
        $author = remove_tags($1);
    }
    else {
        warn "Failed to extract author for ID: $id\n";
    }
    #>>>

    # Title as shown in the report; emphasised when the draft is flagged.
    my $tname = $name;

    if ($more) {
        $tname = "<big><b>$tname</b></big>";
    }

    say $fh "<pre>" . $tname . " -- $author</pre>";
    say $fh "<ul>";
    say $fh "<li> [$k] <a href=$url>$url</a> </li>";
    say $fh "</ul>";

    #say $fh "<li>[$k] <a href=$url>$url</a><br> -- $name -- $author</li>";

    ++$k;
}
#~ say $fh "</ul>";
# Finish the HTML document.
say $fh "</body></html>";

# Close explicitly: buffered write errors (e.g. a full disk) only surface
# when the handle is closed, so check the result.
close($fh) or die "Cannot close links.html: $!\n";