
Commit a5ad852

update readme.md
1 parent 5360835 commit a5ad852

4 files changed: +136 -0 lines changed

Crawer/README.MD (+2)

@@ -0,0 +1,2 @@
# For a detailed explanation of the code, see the articles

Crawer/meizitu.py (+77)

@@ -0,0 +1,77 @@
import requests
import os
import time
import threading
from bs4 import BeautifulSoup


def download_page(url):
    '''
    Download a page and return its HTML text.
    '''
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    r = requests.get(url, headers=headers)
    r.encoding = 'gb2312'  # the site serves GB2312-encoded pages
    return r.text


def get_pic_list(html):
    '''
    Get the list of photo sets on an index page, then call get_pic on each one.
    '''
    soup = BeautifulSoup(html, 'html.parser')
    pic_list = soup.find_all('li', class_='wp-item')
    for i in pic_list:
        a_tag = i.find('h3', class_='tit').find('a')
        link = a_tag.get('href')
        text = a_tag.get_text()
        get_pic(link, text)


def get_pic(link, text):
    '''
    Download every image on the current set's page and save it to disk.
    '''
    html = download_page(link)  # download the set's page
    soup = BeautifulSoup(html, 'html.parser')
    pic_list = soup.find('div', id="picture").find_all('img')  # find all images on the page
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    create_dir('pic/{}'.format(text))
    for i in pic_list:
        pic_link = i.get('src')  # the image's own URL
        r = requests.get(pic_link, headers=headers)  # download the image, then save it
        # name the file after the image URL (pic_link), not the page URL (link);
        # otherwise every image in the set is written over the same file
        with open('pic/{}/{}'.format(text, pic_link.split('/')[-1]), 'wb') as f:
            f.write(r.content)
        time.sleep(1)  # pause briefly so we don't overload the site and get banned


def create_dir(name):
    if not os.path.exists(name):
        os.makedirs(name)


def execute(url):
    page_html = download_page(url)
    get_pic_list(page_html)


def main():
    create_dir('pic')
    queue = [i for i in range(1, 72)]  # page numbers used to build the index-page URLs
    threads = []
    while len(queue) > 0:
        threads = [t for t in threads if t.is_alive()]  # rebuild the list; removing while iterating skips entries
        while len(threads) < 5 and len(queue) > 0:  # cap the pool at 5 threads
            cur_page = queue.pop(0)
            url = 'http://meizitu.com/a/more_{}.html'.format(cur_page)
            thread = threading.Thread(target=execute, args=(url,))
            thread.daemon = True  # setDaemon() is deprecated; assign the attribute instead
            thread.start()
            print('{} is downloading page {}'.format(thread.name, cur_page))  # report the worker's name, not the main thread's
            threads.append(thread)
        time.sleep(0.5)  # avoid busy-spinning while the pool is full
    for thread in threads:
        thread.join()  # wait for the last workers; daemon threads are killed when the main thread exits


if __name__ == '__main__':
    main()
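
The hand-rolled pool in main() works, but the standard library's concurrent.futures.ThreadPoolExecutor does the same bookkeeping (capping the worker count, reaping finished threads, waiting before exit) for us. A minimal sketch of the same fan-out, reusing the execute and create_dir functions above; main_with_executor is an illustrative name, not part of the original script:

from concurrent.futures import ThreadPoolExecutor

def main_with_executor():
    # Same work as main(): 71 index pages, at most 5 downloads in flight.
    # Leaving the with-block waits for every worker to finish.
    create_dir('pic')
    urls = ['http://meizitu.com/a/more_{}.html'.format(page)
            for page in range(1, 72)]
    with ThreadPoolExecutor(max_workers=5) as pool:
        pool.map(execute, urls)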

Crawer/qiubai_crawer.py (+54)

@@ -0,0 +1,54 @@
import requests
from bs4 import BeautifulSoup


def download_page(url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    r = requests.get(url, headers=headers)
    return r.text


def get_content(html, page):
    # output template: page, author, gender, age, votes, comments, content
    output = """第{}页 作者:{} 性别:{} 年龄:{} 点赞:{} 评论:{}\n{}\n------------\n"""
    soup = BeautifulSoup(html, 'html.parser')
    con = soup.find(id='content-left')
    con_list = con.find_all('div', class_="article")
    for i in con_list:
        author = i.find('h2').string  # author name
        content = i.find('div', class_='content').find('span').get_text()  # post text
        stats = i.find('div', class_='stats')
        vote = stats.find('span', class_='stats-vote').find('i', class_='number').string
        comment = stats.find('span', class_='stats-comments').find('i', class_='number').string
        author_info = i.find('div', class_='articleGender')  # author age and gender
        if author_info is not None:  # registered user
            class_list = author_info['class']
            if "womenIcon" in class_list:
                gender = '女'
            elif "manIcon" in class_list:
                gender = '男'
            else:
                gender = ''
            age = author_info.string  # age
        else:  # anonymous user
            gender = ''
            age = ''

        save_txt(output.format(page, author, gender, age, vote, comment, content))


def save_txt(*args):
    for i in args:
        with open('qiubai.txt', 'a', encoding='utf-8') as f:
            f.write(i)


def main():
    # Clicking through to the site, the footer shows 13 pages in total, so we
    # can build the URLs directly. A more robust approach is to use Beautiful
    # Soup to read the page count from the footer (see the sketch below).
    for i in range(1, 14):
        url = 'https://qiushibaike.com/text/page/{}'.format(i)
        html = download_page(url)
        get_content(html, i)


if __name__ == '__main__':
    main()
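
As a sketch of the more robust approach mentioned in main()'s comment, one could read the page count from the pagination bar instead of hard-coding 13. The ul.pagination selector below is an assumption, not confirmed from the live site; the real class name has to be checked against the page's markup before relying on it:

def get_page_count(html):
    # Hypothetical helper: take the largest number shown in the pagination
    # bar ('ul.pagination' is a guessed selector).
    soup = BeautifulSoup(html, 'html.parser')
    numbers = [int(a.get_text(strip=True))
               for a in soup.select('ul.pagination a')
               if a.get_text(strip=True).isdigit()]
    return max(numbers) if numbers else 1

main() could then fetch the first page once and loop over range(1, get_page_count(html) + 1) instead of range(1, 14).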

爬虫集合/README.MD (+3)

@@ -1,2 +1,5 @@
# For a detailed explanation of the code, see the articles

[Getting Started with Python Crawlers (1): Scraping Qiushibaike](https://mp.weixin.qq.com/s/ApnEy6NWS2f-DqIIrhHzGw)
[Getting Started with Python Crawlers (2): Scraping Meizitu](https://mp.weixin.qq.com/s/4TZHgoE_yqeDha17f3Tbew)