
Commit e8500a9

Add the proxy-scraping script ip_pachong.py
1 parent 6db6806 commit e8500a9

File tree

1 file changed: +106 -0


ip_pachong.py

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-

'''
python 3.7.0
'''

# Imports
import time
import re
import random

import requests


def ip_test(ip, url_for_test='https://www.baidu.com', set_timeout=10):
    '''
    Check whether a scraped proxy works: return True if it does, False
    otherwise. By default the proxy is tested against Baidu.
    :param ip: (host, port) tuple of the proxy to test
    :param url_for_test: URL fetched through the proxy
    :param set_timeout: request timeout in seconds
    :return: True if the request returned HTTP 200, else False
    '''
    try:
        r = requests.get(url_for_test, headers=headers,
                         proxies={'http': 'http://' + ip[0] + ':' + ip[1]},
                         timeout=set_timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False


def scrawl_ip(url, num, url_for_test='https://www.baidu.com'):
    '''
    Scrape proxy IP addresses; the source is the Xici proxy site.
    :param url: base URL of the proxy list
    :param num: number of pages to scrape
    :param url_for_test: URL used to verify each proxy
    :return: list of working proxies as 'host:port' strings
    '''
    ip_list = []
    for num_page in range(1, num + 1):
        # Build each page URL from the base URL instead of mutating it.
        page_url = url + str(num_page)

        response = requests.get(page_url, headers=headers)
        response.encoding = 'utf-8'
        content = response.text

        pattern = re.compile('<td class="country">.*?alt="Cn" />.*?</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>', re.S)
        items = re.findall(pattern, content)
        for ip in items:
            # Pass the whole (host, port) pair; keep the proxy if the test passes.
            if ip_test(ip, url_for_test):
                print('Test passed, proxy address is ' + str(ip[0]) + ':' + str(ip[1]))
                ip_list.append(ip[0] + ':' + ip[1])

        time.sleep(5)  # wait 5 seconds before scraping the next page
    return ip_list


def get_random_ip():  # pick one proxy at random from the scraped list
    ind = random.randint(0, len(total_ip) - 1)
    return total_ip[ind]


# URL of the proxy list to scrape; Xici proxy is used here
url_ip = "http://www.xicidaili.com/nt/"

# Request timeout in seconds
set_timeout = 10

# Number of pages to scrape; 2 means scrape 2 pages of proxies
num = 2

# Number of times each proxy may be used
count_time = 5

# Build the headers
UserAgent_List = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

headers = {'User-Agent': random.choice(UserAgent_List),
           'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
           'Accept-Encoding': 'gzip',
           }


# Scrape the proxy IPs
total_ip = scrawl_ip(url_ip, num)
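For reference, a minimal usage sketch (not part of this commit; 'http://example.com' is a placeholder target URL): once total_ip has been populated by the scrape, get_random_ip() can supply a proxy for an ordinary requests call.

if total_ip:  # guard: the scrape may yield no working proxies
    proxy = get_random_ip()  # e.g. '1.2.3.4:8080'
    r = requests.get('http://example.com',  # placeholder target URL
                     headers=headers,
                     proxies={'http': 'http://' + proxy},
                     timeout=set_timeout)
    print(r.status_code)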
