-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_news_urls.py
54 lines (46 loc) · 2.11 KB
/
get_news_urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# -*- coding: utf-8 -*-
import json
import requests
import re
import time
import multiprocessing
__author__ = u'马浩翔'
class UrlProcess(multiprocessing.Process):
"""
获取URL进程类
"""
def __init__(self, media_queue, public_news_set):
multiprocessing.Process.__init__(self)
self.media_queue = media_queue # 媒体id队列
self.public_news_set = public_news_set # 新闻url集合
self.basic_url = 'http://www.toutiao.com/pgc/ma/?media_id= &page_type=1&max_behot_time= &count=10&version=2&platform=pc&as=479BB4B7254C150&cp=7E0AC8874BB0985'
# 重写run方法,取出media id,构造url
def run(self):
while not self.media_queue.empty():
media_id = self.media_queue.get(block=False)
current_time = int(time.time())
media_url = re.sub('media_id=.*?&','media_id=%s&'%media_id,self.basic_url)
media_url = re.sub('max_behot_time=.*?&','max_behot_time=%d&' % current_time, media_url)
self.get_news_urls(media_url)
# 访问媒体页面,获取返回的数据中的新闻url列表
def get_news_urls(self, media_url):
news_urls_list = []
page = 10 # 向后抓取的页面数
for i in range(page):
response = requests.get(media_url)
response_json = json.loads(response.content)
news_data = response_json['data']
news_urls = [news['source_url'] for news in news_data]
news_titles = [news['title'] for news in news_data]
news_urls_list.extend(news_urls)
max_behot_time = response_json['next']['max_behot_time'] # 获取max_behot_time,用来构造新的url
if max_behot_time == '0':
break
media_url = re.sub('max_behot_time=([\d]+?)&','max_behot_time=%s&'%max_behot_time, media_url)
print media_url
print "\n".join(news_titles)
time.sleep(1) # 休眠,防止反爬虫机制
for each in news_urls_list: # 将新闻url插入公共的新闻url集合中
self.public_news_set.append(each)
if __name__ == '__main__':
pass