donor_scraper_soup.py
#Made by Sangpil Kim
#May 2016
#Py 2_7
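"""Scrape individual-donor search results from opensecrets.org/indivs.

A Selenium-driven Firefox window submits the donor-name query; the paged
result table is then re-fetched with requests, parsed with BeautifulSoup,
and written row by row to ./save/<query>.csv.
"""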
import sys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup as bs4
import requests
import csv
import os
def initDriver():
    driver = webdriver.Firefox()
    driver.wait = WebDriverWait(driver, 1)
    return driver
def lookup(driver, query):
    driver.get("http://www.opensecrets.org/indivs/")
    button = driver.wait.until(EC.element_to_be_clickable(
        (By.ID, "name")))  # Find the donor search box
    button.click()  # Click the input box
    _input = driver.wait.until(EC.element_to_be_clickable(
        (By.ID, "name")))  # Input box to type into
    _input.send_keys(query)  # Send the query text, e.g. "Yoon"
    _id = driver.wait.until(EC.element_to_be_clickable(
        (By.NAME, "submit")))  # Find the search button
    _id.click()  # Click the search button
def getSoup(driver):
    r = requests.get(driver.current_url)
    html = r.text
    soup = bs4(html, 'html.parser')
    return soup
def scrap(soup, count):
    data = []
    table = soup.find('table', attrs={'id': 'top'})
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # Get rid of empty values
    count = count + 1  # Advance the page counter
    return count, data
def updateDriver(driver, count):
    isEnd = True
    soup = getSoup(driver)
    # End-of-results check: a 'Next' link in the page controls means more pages remain
    mydivs = soup.find_all("div", {"class": "pageCtrl"})
    for my in mydivs[0]:
        if my.string:
            if my.string.strip() == 'Next':
                isEnd = False
    print 'Is End? %s' % isEnd
    # Collect paging URLs
    link_container = []
    for link in soup.find_all('a'):
        link_str = link.get('href')
        if link_str and "page" in link_str:  # Skip anchors without an href
            link_container.append(link_str)
    if len(link_container) > 0:
        # Strip the digits from the first paging link, then append the next page number
        num = []
        for s in link_container[0]:
            if s.isdigit():
                num.append(s)
        link_root = link_container[0]
        for nu in num:
            link_root = link_root.replace(nu, '')
        up_url = link_root + str(count)
        print 'Next page: %s' % up_url
        if not isEnd:
            driver.get('http://www.opensecrets.org/indivs/' + up_url)
    return isEnd
def iter_scrap(driver):
    container = []
    count = 1
    endPage = False
    while not endPage:
        print 'Current Page : %s' % driver.current_url
        soup = getSoup(driver)
        count, data = scrap(soup, count)
        container.append(data)
        endPage = updateDriver(driver, count)
        if endPage:
            driver.quit()
        print 'Is end page? %s' % endPage
    return container
def save_file(name, data):
    save_root = "./save"
    if not os.path.exists(save_root):
        os.makedirs(save_root)
    name = name + ".csv"
    csvfile = os.path.join(save_root, name)
    with open(csvfile, "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        for infos in data:
            for info in infos:
                info[0] = info[0].replace('\n', ' ').encode('utf-8')
            writer.writerows(infos)
    print 'Saved as ' + name
if __name__ == "__main__":
    driver = initDriver()
    name = sys.argv[1]
    lookup(driver, name)
    container = iter_scrap(driver)
    save_file(name, container)
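# Example run (a sketch; assumes a local Firefox that Selenium's Firefox driver
# can launch, and that the OpenSecrets results page still uses the table id
# 'top' and the 'pageCtrl' paging div):
#   python donor_scraper_soup.py Yoon
# would search individual donors named "Yoon" and write ./save/Yoon.csv.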