-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
92 lines (70 loc) · 2.41 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from bs4 import BeautifulSoup
import requests
import json,datetime,os
""" 爬虫测试代码 """
# Month+day stamp (MMDD) used as the output-folder prefix. Taken from a single
# clock reading so the month and day can never come from different instants.
today = datetime.datetime.now().strftime('%m%d')
# Request headers passed to every requests.get call in this script.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/'
                         '537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
news_num = 0  # running 1-based count of processed news items
mulu = ''  # accumulated index text, one "N:title" line per news item
print('本程序自动爬取网易新闻(国内/国际/要闻版)首页的70条新闻评论。生成文件并保存。\n\n制作:云云\n完工:2021-6-3')
# Menu choice -> channel slug that is spliced into the news-list URL.
duiying = {'1': 'guonei', '2': 'guoji', '3': 'yaowen20200213'}
prompt = '\n输入1-国内新闻,2-国际新闻,3-要闻:'
key0 = input(prompt).strip()
# Ask again on an unknown choice instead of dying with a bare KeyError.
while key0 not in duiying:
    key0 = input(prompt).strip()
key = duiying[key0]
# Output folder name: MMDD followed by the channel slug, e.g. "0603guonei/".
foldername = today + key + '/'
# exist_ok tolerates a rerun on the same day, while real failures (for example
# a permission error) still surface instead of being silently swallowed.
os.makedirs(foldername, exist_ok=True)
def getcomments(idd, timeout=10):
    """Fetch the hot-comment list of one news thread as numbered text.

    Args:
        idd: Thread identifier spliced into the comment-API URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A string holding one "<n>:<content>" line per comment (each line
        newline-terminated, numbered from 1). The string is empty when the
        response carries an empty ``comments`` collection. The literal
        ``'no comments'`` is returned when the response body cannot be
        unwrapped/parsed or lacks the expected ``comments``/``content``
        fields.

    Raises:
        requests.RequestException: on a network failure or timeout.
    """
    # Hot-comment endpoint for the given thread.
    url = ('https://comment.api.163.com/api/v1/products/'
           'a2869674571f77b5a0867c3d71db5856/threads/'
           + idd + '/comments/hotList')
    params = {'ibc': 'newspc',
              'limit': 40,
              'showLevelThreshold': 72,
              'headLimit': 1,
              'tailLimit': 2,
              'offset': 0,
              'callback': 'jsonp_1622619924685',
              '_': 1622619924686}
    body = requests.get(url, params=params, headers=headers,
                        timeout=timeout).text
    try:
        # The body is wrapped as callback(<json>); take what lies between the
        # first '(' and the last ')' rather than relying on fixed offsets, so
        # a trailing ';' or newline does not break parsing.
        payload = body[body.index('(') + 1:body.rindex(')')]
        data = json.loads(payload)
    except ValueError:
        # Covers both a missing wrapper (str.index) and invalid JSON.
        return 'no comments'
    try:
        comments = data["comments"]
        lines = []
        # Iterating yields the keys of `comments`; each key maps to a record
        # whose 'content' field is the comment text.
        for number, comment_id in enumerate(comments, 1):
            lines.append(str(number) + ':'
                         + comments[comment_id]['content'] + '\n')
    except (KeyError, IndexError, TypeError):
        # Any structural surprise discards partial output, as before.
        return 'no comments'
    return ''.join(lines)
# News-list endpoint; the selected channel slug goes between the two parts.
urla = 'https://temp.163.com/special/00804KVA/cm_'
urlb = '.js?callback=data_callback'
# Resulting URLs for the three channels:
# https://temp.163.com/special/00804KVA/cm_guonei.js?callback=data_callback
# https://temp.163.com/special/00804KVA/cm_guoji.js?callback=data_callback
# https://temp.163.com/special/00804KVA/cm_yaowen20200213.js?callback=data_callback
url0 = urla + key + urlb
res0 = requests.get(url0, headers=headers, timeout=10).text
# The body is wrapped as data_callback(<json>); cut between the first '(' and
# the last ')' instead of using fixed offsets, so trailing characters after the
# closing parenthesis do not corrupt the payload.
items = json.loads(res0[res0.index('(') + 1:res0.rindex(')')])
# Index file path is fixed for the whole run, so compute it once up front;
# this also keeps it defined when the news list is empty.
muluname = foldername + '0-目录.txt'
try:
    for news_num, item in enumerate(items, start=1):
        title = item['title']
        print(item['commenturl'])
        # The 16 characters that precede the last 5 characters of the comment
        # URL are used as the thread id (presumably the part before ".html" --
        # TODO confirm against a live response).
        idd = item['commenturl'][-21:-5]
        txtname = foldername + str(news_num) + '.txt'
        content = getcomments(idd)
        mulu += str(news_num) + ':' + title + '\n'
        print(str(news_num) + ':' + title)
        # One file per news item: title on the first line, comments below.
        with open(txtname, 'w', encoding='utf-8') as f:
            f.write(title + '\n' + content)
finally:
    # Write the index even if the loop aborts part-way, so the files already
    # saved remain listed.
    with open(muluname, 'w', encoding='utf-8') as f2:
        f2.write(mulu)