Skip to content
This repository has been archived by the owner on May 11, 2019. It is now read-only.

Commit

Permalink
parse fixes, access log smarts
Browse files Browse the repository at this point in the history
git-svn-id: svn://occams/govtrack/gather/us@1227 6c5aeb89-041d-0410-8b88-fd54b8401f9f
  • Loading branch information
JoshData committed Feb 11, 2010
1 parent 90dcaa1 commit 2c5800a
Show file tree
Hide file tree
Showing 8 changed files with 137 additions and 54 deletions.
100 changes: 100 additions & 0 deletions access_log_smarts.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
use Parse::AccessLogEntry;
use URI;
use URI::QueryParam;
use XML::LibXML;

require "util.pl";

# Shared access-log parser object; its parse() method is called on every
# line read from STDIN further down in this script.
$p = Parse::AccessLogEntry->new();

# Words that are too generic to be useful as a search phrase for a bill:
# chamber names, generic legislative nouns, bill-type prefixes and a few
# filler words.  The keys of %stopwords are looked up when the search
# queries found in Google referrers are scored.
my @stop_list = qw(
    house senate congress act bill resolution legislation
    s. s.res. s.j.res. s.con.res.
    h.r. hr h.res. h.con.res. h.j.res.
    vote
    of what sponsor status govtrack
);
@stopwords{@stop_list} = (1) x @stop_list;

# Scan the access log on STDIN and tally bill page views that were referred
# by a Google search.
#
# Reads:   $p (log line parser) and %stopwords.
# Writes:  $first_date            - the date field of the first log line seen
#          $F{$bill}              - number of Google-referred hits per bill id
#          $Q{$bill}{$phrase}     - accumulated score per search phrase, where
#                                   each occurrence adds sqrt(length($phrase))
#
# A "phrase" is any contiguous run of words from the search query that does
# not include a stop word or something that looks like a number or a bill
# designation.  Scanning stops after 5000 Google-referred hits.
my $google_hits = 0;

LINE: while (defined(my $line = <STDIN>)) {
    my $log = $p->parse($line);

    if (!$first_date) { $first_date = $log->{date}; }

    # Requests without a referrer cannot tell us anything about searches.
    my $referrer = $log->{refer};
    next LINE if !defined($referrer) || $referrer eq '-';

    # Only bill pages are of interest; the bill id is the captured group.
    my $file = $log->{file};
    next LINE if !defined($file);
    next LINE if $file !~ m%^/congress/bill\.xpd\?bill=((h|hj|hc|hr|s|sj|sc|sr)\d+-\d+)$%;
    my $bill = $1;

    # Parsing the referrer may die (e.g. a URI scheme with no host), in
    # which case the eval yields undef and the line is skipped.  The eval
    # is kept small so that loop control never has to jump out of it.
    my $query;
    my $from_google = eval {
        my $uri = URI->new($referrer);
        return 0 if $uri->host !~ /google/;
        $query = $uri->query_param('q');
        1;
    };
    next LINE if !$from_google;

    $F{$bill}++;

    $query = lc(defined($query) ? $query : '');

    # Strip quoting punctuation before tokenizing so that a quoted stop
    # word such as "house" is still recognized as a stop word.
    $query =~ s/["()]//g;

    # split ' ' (unlike split /\s+/) ignores leading whitespace, so no
    # empty word - and hence no empty phrase - is ever produced.
    my @words = split(' ', $query);

    # A word is "stopped" if it is a stop word, a bare number (optionally
    # followed by a colon), or looks like a bill designation (e.g. hr3200).
    my @is_stop = map {
        my $stopped = $stopwords{$_} || /^[\d.]+:?$/ || /^[hs][a-z\.]*[- ]?\d+$/i;
        $stopped ? 1 : 0;
    } @words;

    for my $start (0 .. $#words) {
        for my $end ($start .. $#words) {
            # Phrases grow to the right from $start; once a stopped word
            # is reached every longer phrase would contain it as well.
            last if $is_stop[$end];

            my $phrase = join(" ", @words[$start .. $end]);
            next if $stopwords{$phrase};

            $phrase =~ s/-/ /g;
            next if $phrase !~ /\S/;    # nothing left but hyphens

            # Longer phrases are worth more, but only sub-linearly.
            $Q{$bill}{$phrase} += sqrt(length($phrase));
        }
    }

    $google_hits++;
    last LINE if $google_hits == 5000;
}

# Write the popularity report to ../data/misc/popularbills.xml.
#
# Reads %F (hits per bill), %Q (search phrase scores per bill) and
# $first_date, and uses $XMLPARSER and Now(), which are not defined in this
# script (presumably they come from util.pl - confirm).
#
# Output shape:
#   <popular-bills last-updated=".." log-start="..">
#     <bill id=".." hits="..">
#       <search-string score="..">phrase</search-string> ...
#     </bill> ...
#   </popular-bills>
#
# At most 100 bills are written, most-viewed first, each with at most 20
# search phrases, highest score first.
my $max_bills          = 100;
my $max_terms_per_bill = 20;
my $outfile            = "../data/misc/popularbills.xml";

$doc = $XMLPARSER->parse_string('<popular-bills/>');
my $root = $doc->documentElement;
$root->setAttribute('last-updated', Now());
$root->setAttribute('log-start', $first_date);

my $bills_written = 0;

# Ties are broken by bill id so that the output does not depend on hash order.
for my $bill (sort { $F{$b} <=> $F{$a} || $a cmp $b } keys %F) {
    last if $bills_written >= $max_bills;
    $bills_written++;

    my $bill_node = $doc->createElement('bill');
    $bill_node->setAttribute('id', $bill);
    $bill_node->setAttribute('hits', $F{$bill});
    $root->appendChild($bill_node);

    # Empty phrases are dropped: they carry no information, and because
    # index($x, "") is always >= 0 one would suppress every other phrase.
    my $score_of = $Q{$bill} || {};
    my @terms = sort { $score_of->{$b} <=> $score_of->{$a} || $a cmp $b }
                grep { length } keys %$score_of;
    next if !@terms;

    # Phrases scoring below a tenth of the best phrase are noise.
    my $cutoff = $score_of->{$terms[0]} / 10;

    my $terms_written = 0;
    TERM: for my $i (0 .. $#terms) {
        last TERM if $terms_written >= $max_terms_per_bill;

        my $term = $terms[$i];
        last TERM if $score_of->{$term} < $cutoff;

        # If this term contains or is contained in a higher-ranked term,
        # skip it (whether or not that earlier term was itself written).
        for my $earlier (@terms[0 .. $i - 1]) {
            next TERM if index($term, $earlier) >= 0 || index($earlier, $term) >= 0;
        }

        my $term_node = $doc->createElement('search-string');
        $term_node->setAttribute('score', $score_of->{$term});
        $term_node->appendText($term);
        $bill_node->appendChild($term_node);

        $terms_written++;
    }
}

# toFile returns a status value, not the document; check it rather than
# printing it to stdout.
$doc->toFile($outfile, 1) or die "Could not write $outfile\n";

8 changes: 6 additions & 2 deletions cronjob
Original file line number Diff line number Diff line change
Expand Up @@ -81,18 +81,20 @@ sub DoIndex {
print "Indexing CR\n";
require "indexcr.pl";
MakeCRIndex($session);
print "Done Indexing CR\n";

if ($Weekly) {
#if ($Weekly) {
# REP STATS
print "RepStats\n";
require "repstat.pl";
DoRepStats($session);
print "Done RepStats\n";

# SUMMARY
print "StatsSummary\n";
require "statsummary.pl";
DoStatsSummary($session);
}
#}

# LUCENE

Expand All @@ -107,6 +109,8 @@ sub DoIndex {

system('mysql -u root govtrack -Be "SELECT DISTINCT value FROM billindex WHERE idx = \"crs\"" > /home/govtrack/data/us/crs_terms');

system('tail -10000000 ~/logs/access_log |perl access_log_smarts.pl');

print `date`;
}

Expand Down
8 changes: 4 additions & 4 deletions database.tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ CREATE TABLE `people` (
KEY `lastnameenc` (`lastnameenc`(15)),
KEY `lastnamealt` (`lastnamealt`(15)),
KEY `lismemberid` (`lismemberid`)
) ENGINE=MyISAM AUTO_INCREMENT=412384 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
) ENGINE=MyISAM AUTO_INCREMENT=412385 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
/*!40101 SET character_set_client = @saved_cs_client */;

--
Expand All @@ -74,7 +74,7 @@ CREATE TABLE `people_roles` (
PRIMARY KEY (`personroleid`),
KEY `personid` (`personid`),
KEY `state` (`state`,`enddate`)
) ENGINE=MyISAM AUTO_INCREMENT=42503 DEFAULT CHARSET=latin1;
) ENGINE=MyISAM AUTO_INCREMENT=42504 DEFAULT CHARSET=latin1;
/*!40101 SET character_set_client = @saved_cs_client */;

--
Expand All @@ -97,7 +97,7 @@ CREATE TABLE `people_videos` (
KEY `personid` (`personid`,`date`),
KEY `date` (`date`),
KEY `link` (`link`(127))
) ENGINE=MyISAM AUTO_INCREMENT=3975156 DEFAULT CHARSET=utf8;
) ENGINE=MyISAM AUTO_INCREMENT=4471271 DEFAULT CHARSET=utf8;
/*!40101 SET character_set_client = @saved_cs_client */;

--
Expand Down Expand Up @@ -169,4 +169,4 @@ CREATE TABLE `committees` (
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;

-- Dump completed on 2010-01-03 18:03:39
-- Dump completed on 2010-02-07 10:34:17
6 changes: 4 additions & 2 deletions database.tables2.sql
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ CREATE TABLE `billstatus` (
`fulltitle` text COLLATE utf8_unicode_ci NOT NULL,
`statusdate` datetime NOT NULL DEFAULT '0000-00-00 00:00:00',
`statusxml` text COLLATE utf8_unicode_ci NOT NULL,
`status` tinytext COLLATE utf8_unicode_ci,
PRIMARY KEY (`session`,`type`,`number`),
KEY `fulltitle` (`fulltitle`(100)),
KEY `statusdate` (`statusdate`)
Expand Down Expand Up @@ -205,10 +206,11 @@ CREATE TABLE `questions` (
`text` text NOT NULL,
`topic` text NOT NULL,
`moderator` varchar(12) NOT NULL,
`ipaddr` varchar(16) DEFAULT NULL,
PRIMARY KEY (`id`),
KEY `question` (`question`),
KEY `topic` (`topic`(16))
) ENGINE=MyISAM AUTO_INCREMENT=26498 DEFAULT CHARSET=utf8;
) ENGINE=MyISAM AUTO_INCREMENT=27606 DEFAULT CHARSET=utf8;
/*!40101 SET character_set_client = @saved_cs_client */;

--
Expand Down Expand Up @@ -243,4 +245,4 @@ CREATE TABLE `votes` (
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;

-- Dump completed on 2010-01-03 18:03:39
-- Dump completed on 2010-02-07 10:34:17
46 changes: 11 additions & 35 deletions fetchbilltext.pl
Original file line number Diff line number Diff line change
Expand Up @@ -354,42 +354,14 @@ sub FetchBillTextHTML {
my $type2 = $type;
if ($type2 eq "hr") { $type2 = "hres"; }

my $URL = "http://thomas.loc.gov/cgi-bin/query/z?c$session:$type2$number:";
sleep(1);
my $response = $UA->get($URL);
if (!$response->is_success) {
warn "Could not fetch bill text at $URL: " .
$response->code . " " .
$response->message;
return;
}
my $indexhtml = $response->content;
$HTTP_BYTES_FETCHED += length($indexhtml);

# First, when there's only one status available, this page isn't
# an index but the text itself...

if ($indexhtml =~ /Printer Friendly Display/) {
FetchBillTextHTML2($session, $type, $number, $get_statuses[0], $indexhtml);
return;
}

# And if the page says that the text hasn't been received from GPO,
# that's ok. We'll just move on.
if ($indexhtml =~ /has not yet been received from GPO/) {
return;
}

foreach my $status (@get_statuses) {
my $file = "../data/us/bills.text/$session/$type/$type$number$status.html";
if (-e $file) { next; }

if ($indexhtml !~ /<a href="(\/cgi-bin\/query\/D\?[^"]+)">\[[HRESCONJ\.0-9]+\.$status\]<\/a>/i) {
warn "Could not find link to status text for $status at $URL";
next;
}
# THOMAS started generating pages w/o the temp link if you don't specify Mozilla in the UA
my $UA = LWP::UserAgent->new(keep_alive => 2, timeout => 30, agent => "Mozilla/4.0 (GovTrack.us scraper)", from => "operations@govtrack.us");

my $URL2 = "http://thomas.loc.gov" . $1;
my $URL2 = "http://thomas.loc.gov/cgi-bin/query/z?c$session:$type2$number.$status:";
sleep(1);
my $response = $UA->get($URL2);
if (!$response->is_success) {
Expand All @@ -411,8 +383,9 @@ sub FetchBillTextHTML2 {
mkdir "../data/us/bills.text/$session/$type";

# move to printer friendly page
if ($htmlpage !~ /<a href="(\/cgi-bin\/query\/C\?[^"]+)"[^>]*>(<em>)?Printer Friendly Display/) {
die "Could not find the link to the printer friendly display in $session/$type$number/$status";
if ($htmlpage !~ /<a href="(\/cgi-bin\/query\/C\?[^"]+)"[^>]*>(<em>)?Printer Friendly/i) {
warn "Could not find the link to the printer friendly display in $session/$type$number/$status";
return;
}

my $URL = "http://thomas.loc.gov" . $1;
Expand All @@ -433,7 +406,7 @@ sub FetchBillTextHTML2 {
# chop off everything before the status line
# sometimes IH appears here as RIH
# sometimes the wrong status code shows up (EH instead of ENR)
if ($htmlpage !~ s/^[\w\W]*?<p>(<em>.<\/em>)?\s*([HRESCONJ\.]+ *$number R?($status|[A-Z]{2,3})(\dS)?[\n\r])/$2/i
if ($htmlpage !~ s/^[\w\W]*?<p>(<em>.<\/em>)?\s*([HRESCONJ\.]+ *$number R?($status|[A-Z]{2,3})(\dS)?(\/PP)?[\n\r])/$2/i
&& $htmlpage !~ s/^[\w\W]*?\n\s*([HRESCONJ\.]+ *$number ?R?($status|[A-Z]{2,3}|IHIS|)(\dS)?<p>[\n\r])/$1/i
&& ($status ne 'enr' || $htmlpage !~ s/^[\w\W]*?<p>\s*([HRESCONJ\.]+ ?$number[\n\r])/$1/i)
&& $htmlpage !~ s/^[\w\W]*?<p>(<h3><b>Suspend the Rules and Pass the Bill)/$1/i
Expand Down Expand Up @@ -563,7 +536,10 @@ sub CreateGeneratedBillTexts {
my $textdir = "../data/us/bills.text/$session";
my $cmpdir = "../data/us/bills.text.cmp/$session";

system("mkdir -p {$textdir,$cmpdir}/{h,s,hr,sr,hj,sj,hc,sc}");
# This isn't working and creates weird directories probably because
# it's executed with sh and not bash, so the braces are treated
# literally.
#system("mkdir -p {$textdir,$cmpdir}/{h,s,hr,sr,hj,sj,hc,sc}");

opendir BILLS, "$billdir";
foreach my $bill (sort(readdir(BILLS))) {
Expand Down
15 changes: 7 additions & 8 deletions parse_status.pl
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ sub UpdateBills {
my $lastseq;

my $offset = 0;
my ($seq, $bt, $bn, $rec);

while (defined($offset)) {
my $url = "http://thomas.loc.gov/cgi-bin/bdquery/d?d$SESSION:$offset:./list/bss/d$SESSION$tbt.lst:\[\[o\]\]";
Expand All @@ -57,8 +58,6 @@ sub UpdateBills {
my ($content, $mtime) = Download($url);
if (!$content) { warn; return; }

my ($seq, $bt, $bn, $rec);

my @lines = split(/[\n\r]/, $content);
for my $line (@lines) {
if ($line =~ /(.*)<hr>/) {
Expand Down Expand Up @@ -102,10 +101,10 @@ sub UpdateBills {
if (!defined($lastseq)) {
warn "No $tbt bills."
}
}

if (defined($rec)) {
UpdateBills2($SESSION, $bt, $bn, $rec, \%changehash);
if (defined($rec)) {
UpdateBills2($SESSION, $bt, $bn, $rec, \%changehash);
}
}

open CHANGES, ">$changefile";
Expand Down Expand Up @@ -358,7 +357,7 @@ sub GovGetBill {
my $cline = shift(@content);

if ($titlesmode == 1) {
if ($cline =~ /<\/ul>/) { $titlesmode = 0; next; }
if ($cline =~ /<\/ul>/i) { $titlesmode = 0; next; }
$titles .= $cline;
next;
}
Expand Down Expand Up @@ -788,14 +787,14 @@ sub GovGetBill {
}
$titles =~ s/[\n\r]//g;
$titles =~ s/<\/?i>//gi;
while ($titles =~ m/<li>([\w\W]*?)( as [\w ]*)?:<br>([\w\W]+?)(<p>|$)/gi) {
while ($titles =~ m/<li>([\w\W]*?)( as [\w ]*)?:<br\/?>([\w\W]+?)(<p>|<\/ul>|$)/gi) {
my $type = $1;
my $when = $2;
my $ts = $3;
$type =~ s/ title(\(s\))?//i;
$when =~ s/^ as //i;

foreach my $t (split(/<BR>/i, $ts)) {
foreach my $t (split(/<BR\/?>/i, $ts)) {
$t =~ s/<\/?[^>]+>//g;
$t =~ s/&nbsp;/ /g;
$t =~ s/\s*\(identified by CRS\)//gi;
Expand Down
3 changes: 2 additions & 1 deletion repstat.pl
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ sub GetPeopleList {
my $session = shift;

# WRITE OUT PERSON DATABASE GENERAL INFO
my @reps = DBSelect(HASH, people, [id, firstname, middlename, lastnameenc, namemod, nickname, birthday, gender, religion, osid, bioguideid, metavidid, youtubeid], []);
my @reps = DBSelect(HASH, people, [id, firstname, middlename, lastnameenc, namemod, nickname, birthday, gender, religion, pvsid, osid, bioguideid, metavidid, youtubeid], []);

open PEOPLE_ALL, ">../data/us/people.xml";
binmode(PEOPLE_ALL, ":utf8");
Expand Down Expand Up @@ -96,6 +96,7 @@ sub GetPeopleList {
print $PEOPLE " birthday='$rep{birthday}'" if $rep{birthday} ne "";
print $PEOPLE " gender='$rep{gender}'" if $rep{gender} ne "";
print $PEOPLE " religion='$rep{religion}'" if $rep{religion} ne "";
print $PEOPLE " pvsid='$rep{pvsid}'" if $rep{pvsid} ne "";
print $PEOPLE " osid='$rep{osid}'" if $rep{osid} ne "";
print $PEOPLE " bioguideid='$rep{bioguideid}'" if $rep{bioguideid} ne "";
print $PEOPLE " metavidid='$rep{metavidid}'" if $rep{metavidid} ne "";
Expand Down
5 changes: 3 additions & 2 deletions util.pl
Original file line number Diff line number Diff line change
Expand Up @@ -412,9 +412,10 @@ sub ParseDateTime {
$year = $1; $month = $2; $date = $3;
$hour = $4; $minute = $5; $second = $6;
if ($7 ne "") { warn "GMT timezome ignored"; }
} elsif ($when =~ /^([a-zA-Z]+) (\d+), (\d\d\d\d)(,?\s+(\d+):(\d+)\s*(am|pm))?$/i) {
} elsif ($when =~ /^([a-zA-Z]+) (\d+), (\d\d\d\d)(,?\s+(\d+)(:(\d+))?\s*(am|pm|a\.m\.|p\.m\.))?$/i) {
$year = $3; $month = $Months{uc($1)}; $date = $2;
$hour = $5; $minute = $6; $ampm = $7;
$hour = $5; $minute = $7; $ampm = $8;
if (!$minute) { $minute = '00'; }
if ($ampm =~ /p/i && $hour != 12) { $hour += 12; }
if ($ampm =~ /a/i && $hour == 12) { $hour -= 12; }
} elsif ($when =~ /^(\d+)-([A-Z]+)-(\d\d\d\d)( (\d+):(\d+)\s+(AM|PM))?$/i) {
Expand Down

0 comments on commit 2c5800a

Please sign in to comment.