Skip to content

Commit

Permalink
add HTML scraper for the Ubuntu pastebin
Browse files Browse the repository at this point in the history
Ughr.
  • Loading branch information
grawity committed Dec 23, 2020
1 parent 78f6022 commit 4eeee11
Showing 1 changed file with 21 additions and 0 deletions.
21 changes: 21 additions & 0 deletions getpaste
Original file line number Diff line number Diff line change
Expand Up @@ -1105,6 +1105,22 @@ sub dl_sour {
return $data;
}

# Scrape the paste text out of an Ubuntu pastebin HTML page.
#
# Arguments:
#   $url  - address of the paste page to fetch.
#   $frag - accepted so the signature matches the other dl_* parsers;
#           not used by this scraper.
#
# Returns the paste text with the highlighter's <span> markup removed and
# HTML entities decoded. Returns nothing (bare return) when the page could
# not be fetched or the paste container was not found in it.
sub dl_ubuntu {
    my ($url, $frag) = @_;

    # NOTE(review): get() is presumably LWP::Simple-style and yields undef
    # on failure -- confirm. Guard so we do not pattern-match against undef.
    my $html = get($url);
    return if !defined $html;

    # '.*?' rather than '.+?' so that an empty paste is still recognized
    # as a successful scrape instead of falling through to the failure path.
    my ($paste) = $html =~ m{<div class="paste"><pre>(.*?)</pre></div>}s;
    return if !defined $paste;

    # Unwrap <span> and <span class="..."> elements whose content holds no
    # further tags. Repeat until nothing changes, so that an outer span is
    # unwrapped once its inner spans have been removed.
    1 while $paste =~ s{<span(?: class="\w+")?>([^<>]*?)</span>}{$1}g;

    # Literal angle brackets in the paste are still entity-encoded at this
    # point, so any raw '<' or '>' left over is markup we failed to strip.
    if ($paste =~ /[<>]/) {
        _warn("unmatched HTML tags remain!");
    }

    return decode_entities($paste);
}

sub dl_zerobin {
my ($url, $frag) = @_;

Expand Down Expand Up @@ -1383,6 +1399,11 @@ sub dl_zerobin {
path => qr!^/\d+!,
to_query => "tx=on",
},
{
host => "pastebin.ubuntu.com",
path => qr!^/p/\w+!,
parser => \&dl_ubuntu,
},
{
host => "paste.xinu.at",
path => qr!^/\w+!,
Expand Down

0 comments on commit 4eeee11

Please sign in to comment.