-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathfoxnews.py
161 lines (126 loc) · 5.1 KB
/
foxnews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time
import numpy as np
import psycopg2
from dateutil import parser
import ipdb
from datetime import datetime as dt
def getHref(webelements):
    '''
    NAME
        getHref
    SYNOPSIS
        Collect the href of the first anchor inside each element
    DESCRIPTION
        For every item in webelements, looks up its first <a> descendant with
        find_element_by_tag_name('a') and reads that anchor's 'href'
        attribute. Returns the values as a list, in input order.
    '''
    hrefs = []
    for container in webelements:
        anchor = container.find_element_by_tag_name('a')
        hrefs.append(anchor.get_attribute('href'))
    return hrefs
def showmore():
    '''
    NAME
        showmore
    SYNOPSIS
        Print the link behind every "more-btn" element on the current page
    DESCRIPTION
        Uses the module-level `browser`. Finds all elements with class
        "more-btn" and prints the href of the first <a> inside each one.
        It only prints; it does not click anything or load another page.
    '''
    for element in browser.find_elements_by_class_name("more-btn"):
        # print(...) with a single argument produces the same output on
        # Python 2 and Python 3; the print statement form is a SyntaxError
        # on Python 3.
        print(element.find_element_by_tag_name('a').get_attribute('href'))
def connect_to_db():
    '''
    NAME
        connect_to_db
    SYNOPSIS
        Connect to postgres
    DESCRIPTION
        Opens a psycopg2 connection to the 'newscontent' database as user
        'ethancheung'. The password is the first line of 'password.txt',
        resolved relative to the current working directory.

        Returns the open connection; closing it is the caller's job.
        Errors from reading password.txt or from psycopg2.connect propagate.
    '''
    DBNAME = 'newscontent'
    DBUSER = 'ethancheung'
    # The context manager closes the file handle, which a bare open() leaks.
    with open('password.txt') as pf:
        # readline() keeps the line terminator; strip it so a trailing
        # newline is not sent to the server as part of the password.
        PASSWRD = pf.readline().rstrip('\r\n')
    return psycopg2.connect(database=DBNAME, user=DBUSER, password=PASSWRD)  # , host='/tmp')
def getContent(setOfURL):
    '''
    NAME
        getContent
    SYNOPSIS
        Scrape and store every article URL that is not yet in the database
    DESCRIPTION
        setOfURL must be a set of article URLs. URLs already present in
        stocknews_newscontent.url are skipped. Each remaining URL is loaded
        in the module-level `browser`; the title is taken from the article's
        <h1>, the publication date from the 'datetime' attribute of its
        <time> tag, and the body from the text of the <p> tags in the story
        container. When several <h1> or <time> tags exist the last one wins.

        A page is stored (via storeContent) only when exactly one story
        container is found; other pages are skipped.

        Always returns True. Database and WebDriver errors propagate.
    '''
    # Check for duplicates: fetch the URLs that are already stored.
    conn = connect_to_db()
    try:
        cur = conn.cursor()
        cur.execute("SELECT url from stocknews_newscontent")
        existingURL = set(row[0] for row in cur.fetchall())
    finally:
        # Only the URL list is needed from this connection. Close it now
        # rather than holding it open (and never closing it) for the whole
        # scrape; storeContent opens its own connection per article.
        conn.close()

    for oneURL in setOfURL.difference(existingURL):
        browser.get(oneURL)
        strTitle = ''
        datePub = ''
        articleHeader = browser.find_elements_by_xpath('//*[@id="content"]/div/div[1]/div[2]/div/div[3]/article')
        if articleHeader:
            for eHeader in articleHeader[0].find_elements_by_tag_name('h1'):
                strTitle = eHeader.text
            for dHeader in articleHeader[0].find_elements_by_tag_name('time'):
                datePub = dHeader.get_attribute('datetime')

        # Retrieve the content via p tags.
        mainStory = browser.find_elements_by_xpath('//*[@id="content"]/div/div[1]/div[2]/div/div[3]/article/div/div[3]')
        if len(mainStory) != 1:
            # No unambiguous story container on this page: nothing to store.
            continue
        # Paragraph texts are concatenated with no separator, as before.
        content = ''.join(
            eContent.text
            for eContent in mainStory[0].find_elements_by_tag_name('p'))
        storeContent(strTitle, datePub, content, oneURL)
    return True
def storeContent(strTitle, datePub, content, iurl):
    '''
    NAME
        storeContent
    SYNOPSIS
        Stores raw string data to Postgres using 'psql newscontent' generated with Django
    DESCRIPTION
        Stored variables:
        id      | integer | not null default nextval('stocknews_newscontent_id_seq'::regclass)
        url     | character varying(1000)
        title   | character varying(4000)
        content | character varying(50000)
        date    | date
        table: stocknews_newscontent

        datePub is parsed with dateutil; when it is empty or cannot be
        parsed, the current local date is stored instead.

        Values are passed to the driver as query parameters, so titles and
        content are stored verbatim (apostrophes are no longer stripped).

        Returns a confirmation message containing the title. Database errors
        propagate after the connection has been closed.
    '''
    # Resolve the publication date, falling back to "now" when it is missing
    # or unparseable.
    pubDate = None
    rawDate = (datePub or '').strip()
    if rawDate:
        try:
            # tzinfos must be a mapping of zone name -> UTC offset in
            # seconds; the previous set literal {'EST', -18000} was not.
            pubDate = parser.parse(rawDate, tzinfos={'EST': -18000})
        except (ValueError, OverflowError):
            pubDate = None
    if pubDate is None:
        pubDate = dt.now()

    # Parameterized statement: the driver handles quoting, so scraped text
    # cannot break or inject into the SQL.
    sql = ("INSERT into stocknews_newscontent (url, title, content, date) "
           "values (%s, %s, %s, %s);")
    conn = connect_to_db()
    try:
        cur = conn.cursor()
        cur.execute(sql, (iurl, strTitle, content, pubDate.date()))
        conn.commit()
    finally:
        # Closing without a commit discards the failed transaction.
        conn.close()
    return "The item : %s was successfully saved to the databse" % strTitle
def main():
    '''
    NAME: main
    SYNOPSIS:
        Macro economic news scraper for Fox News
    DESCRIPTION:
        'Show More' button displays an extra 10 links
        Scraper clicks the 'Show More' button a configured number of times
        (advanceClicks), pausing one second after each click, and stops
        clicking early if the button can no longer be found.
        Then collects the link inside the H3 tag of every headline list item
        and hands the set of URLs to getContent, which opens each link
        individually and scrapes the content.
        Uses the module-level `browser`.
    '''
    url = 'http://www.foxnews.com/us/economy/index.html#'
    browser.get(url)

    advanceClicks = 400
    for _ in range(advanceClicks):
        # Find the Show More button and click it.
        try:
            browser.find_element_by_class_name("btn-smll").click()
        except NoSuchElementException:
            # Button is gone: keep what is loaded instead of aborting the
            # whole run.
            break
        time.sleep(1)

    # All news headlines are li elements under one ul.
    setURL = set()
    for element in browser.find_elements_by_xpath('//*[@id="section-content"]/div[1]/div[4]/div/div/div[6]/div/div/div/ul/li'):
        try:
            h3Element = element.find_element_by_tag_name('h3')
            setURL.add(h3Element.find_element_by_tag_name('a').get_attribute('href'))
        except NoSuchElementException:
            # List item without an h3/a headline link: skip it.
            continue

    getContent(setURL)
# Module-level WebDriver shared by showmore(), getContent() and main().
browser = webdriver.Safari() #Chrome()
if __name__ == '__main__':
    try:
        main()
    finally:
        # Always end the WebDriver session, even when scraping raised.
        browser.quit()