forked from bdheath/Big-Cases
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbigcases_scrape_docs.py
184 lines (150 loc) · 5.44 KB
/
bigcases_scrape_docs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import re
import dbconnect
from documentcloud import DocumentCloud
import time
from urlparse import urljoin
import glob
import shutil
import os
from xvfbwrapper import Xvfb
from bigcases_settings import settings
# Seconds to wait for a PACER document download to finish; grows each
# time a multi-part "View All" download is requested (bigger files).
waittime = 10

# Configure the headless browser so that it forces downloads.
br = None  # selenium webdriver; created in __main__ before first use
chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-print-preview")
profile = {
    'download.default_directory': settings.file_temp_path,
    # FIX: the real Chrome preference key is 'download.prompt_for_download';
    # the old dotted spelling 'download.prompt.for.download' is unknown to
    # Chrome and was silently ignored, so downloads could still prompt.
    'download.prompt_for_download': False,
    'download.directory_upgrade': True,
    'plugins.plugins_disabled': ['Chrome PDF Viewer'],
}
chrome_options.add_experimental_option('prefs', profile)

# PACER login entry point, redirecting to the case-locator search page.
START = 'https://pacer.login.uscourts.gov/csologin/login.jsf?appurl=pcl.uscourts.gov/search'
# DocumentCloud access level applied to uploaded documents.
ACCESS = 'public'

# First double-quoted http(s) URL inside a docket-entry description.
URL = re.compile(r'"(http.*?)"', re.IGNORECASE)
# Patterns extracting the billing cost from the PACER receipt page,
# tried in order because PACER emits several HTML variants.
COSTS = [
    re.compile(r'Cost: </FONT></TH><TD ALIGN=LEFT><FONT SIZE=-1 COLOR=DARKBLUE>(.*?)<', re.IGNORECASE),
    re.compile(r'Cost: </font></th><td align="LEFT"><font size="-1" color="DARKBLUE">(.*?)<', re.IGNORECASE),
    re.compile(r'Cost: .{50,80}>([\s\d\.]{3,7})<', re.IGNORECASE),
]

dc = DocumentCloud(settings.dc_user, settings.dc_pass)
db = dbconnect.db(host=settings.db_host, user=settings.db_user,
                  pwd=settings.db_pass, port=settings.db_port)
def handleLogin():
print "# Renewing login"
global br
br.get(START)
br.find_element_by_name('login:loginName').send_keys(settings.pacer_user)
br.find_element_by_name('login:password').send_keys(settings.pacer_pass)
br.find_element_by_name('login:clientCode').send_keys(settings.pacer_client)
br.find_element_by_name('login:clientCode').send_keys(Keys.RETURN)
time.sleep(3)
def getDocument(pid, url):
global waittime
global br
br.get(url)
time.sleep(2)
html = br.page_source
if re.search('<input\s+type=password', html, re.IGNORECASE):
# logged out somehow
print '# Oops, have to login again'
handleLogin()
br.get(url)
time.sleep(2)
html = br.page_source
if re.search('View All', html):
# Multiple documents on this entry. Try to get them all as a single document if you can
br.find_element_by_xpath("//input[@value='View All']").click()
time.sleep(4)
waittime += 10
html = br.page_source
MULTIPATTERN = re.compile('<td colspan=\"?2\"?><b>Document Number:</b> <a.*?>(\d+)<', re.IGNORECASE)
if MULTIPATTERN.search( html):
# Document with multiple attachments, but cannot view all
# Get the main document only
xp = MULTIPATTERN.search(html).group(1)
xpath = "//a[text()='" + xp + "']"
br.find_element_by_xpath(xpath).click()
time.sleep(4)
html = br.page_source
price = None
for COST in COSTS:
if COST.search(html):
price = float(COST.search(html).group(1))
break
print ' - price: ' + str(price)
# Now fetch the document if it's price is less than the max cost
if price <= settings.pacer_max_price and price is not None:
print ' - extract'
#br.find_element_by_xpath("//input[@type='submit']").click()
br.find_element_by_xpath("//input[@value='View Document']").click()
time.sleep(waittime)
# Now fish out the most recent file and stick it where you want it
files = glob.glob(settings.file_temp_path + '/*')
files.sort(key=os.path.getmtime, reverse=True)
if len(files) >= 2:
newfn = settings.file_archive_path + '/document-' + str(pid) + '.pdf'
print ' - now: ' + newfn
shutil.move(files[0], newfn)
dcid = None
dcdoc = dc.documents.upload(newfn, source='U.S. District Court via big_cases bot', project = settings.dc_project_id, access = ACCESS)
print ' - DocumentCloud: ' + str(dcdoc.id)
# Flag bigcase = 3
# This means the document is processing - do not post it
# Flag reverts to 1 after document upload is complete
db.run(""" UPDATE court.pacer_raw
SET modified = modified,
scraped = 1,
scraped_time = NOW(),
dcid = %s,
document_location = %s,
price = %s,
bigcase = 3
WHERE pid = %s """,
(dcdoc.id, str(dcdoc.published_url), price, pid, ))
# Wait until the document is public
obj = dc.documents.get(dcdoc.id)
while obj.access != 'public':
print ' - Pausing for document to become public (%s) ' % obj.access
time.sleep(5)
obj = dc.documents.get(dcdoc.id)
# Re-flag the docket entry in the database as ready to post
db.run(""" UPDATE court.pacer_raw
SET bigcase = 1
WHERE pid = %s """, (pid, ))
return
if __name__ == '__main__':
print '# PACER DOCUMENT DOWNLOADER '
print '# Querying new records'
# Now get the latest relevant documents
cases = db.getDict(""" SELECT *
FROM court.pacer_raw
WHERE bigcase = 1
AND description LIKE %s
AND scraped = 0
ORDER BY pid DESC
LIMIT %s """,
('%http%', settings.max_files_to_scrape, ))
if len(cases) > 0:
try:
print '# Starting Chrome'
display = Xvfb()
display.start()
br = webdriver.Chrome( chrome_options = chrome_options)
br.set_page_load_timeout(settings.http_timeout)
handleLogin()
for case in cases:
if URL.search(case['description']):
print ' - ' + case['title'] + ' (' + case['court'] + ')'
getDocument(case['pid'], URL.search(case['description']).group(1))
br.quit()
display.stop()
except Exception, e:
with open('bigcases_scrape.log','a') as logfile:
logfile.write('ERROR: %s\n' % str(e) )
br.quit()
display.stop()