|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | + |
| 3 | +''' |
| 4 | +python 3.7.0 |
| 5 | +''' |
| 6 | + |
| 7 | +# 导入模块 |
| 8 | +import time |
| 9 | +import requests, re, random, os |
| 10 | +from bs4 import BeautifulSoup |
| 11 | + |
def ip_test(ip, url_for_test='https://www.baidu.com', set_timeout=10):
    '''
    Check whether a scraped proxy can be used to fetch a test page.

    :param ip: (host, port) pair of strings describing the proxy
    :param url_for_test: URL requested through the proxy (Baidu by default)
    :param set_timeout: timeout handed to requests.get
    :return: True if the request answers with HTTP 200, otherwise False
    '''
    try:
        # requests expects proxy URLs to carry a scheme.
        proxy_url = 'http://' + ip[0] + ':' + ip[1]
        # Register the proxy for both schemes: requests picks the proxy by the
        # scheme of the target URL, so an 'http'-only mapping would leave the
        # default https:// test URL going out directly, not via the proxy.
        proxies = {'http': proxy_url, 'https': proxy_url}
        r = requests.get(url_for_test, headers=headers, proxies=proxies,
                         timeout=set_timeout)
    except (requests.RequestException, IndexError, TypeError):
        # Network/proxy failure or a malformed ip argument: the proxy is
        # unusable. KeyboardInterrupt/SystemExit are deliberately not caught.
        return False
    return r.status_code == 200
| 28 | + |
def scrawl_ip(url, num, url_for_test='https://www.baidu.com'):
    '''
    Scrape proxy addresses from a paginated listing and keep the working ones.

    :param url: base listing URL; the 1-based page number is appended to it
    :param num: number of pages to fetch
    :param url_for_test: URL handed to ip_test to validate each proxy
    :return: list of 'host:port' strings for the proxies that passed ip_test
    '''
    # Compiled once: the pattern is the same for every page. The two capture
    # groups are the contents of the two <td> cells following the country cell.
    pattern = re.compile('<td class="country">.*?alt="Cn" />.*?</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>', re.S)

    ip_list = []
    for num_page in range(1, num + 1):
        if num_page > 1:
            time.sleep(5)  # wait 5 seconds before fetching the next page

        # Build the page URL from the unchanged base each time; reassigning
        # `url` would accumulate page numbers (".../1", then ".../12").
        page_url = url + str(num_page)

        response = requests.get(page_url, headers=headers)
        response.encoding = 'utf-8'
        content = response.text

        for ip in pattern.findall(content):
            # ip_test indexes its argument as (host, port), so pass the whole
            # match tuple rather than just the port string.
            if ip_test(ip, url_for_test):
                print('测试通过,IP地址为' + str(ip[0]) + ':' + str(ip[1]))
                ip_list.append(ip[0] + ':' + ip[1])

    # Return only after every requested page has been processed.
    return ip_list
| 54 | + |
def get_random_ip():
    '''
    Pick one entry of the module-level total_ip list at random.

    :return: a randomly selected element of total_ip
    '''
    return total_ip[random.randrange(len(total_ip))]
| 58 | + |
| 59 | + |
# Base URL of the proxy listing to crawl (the Xici proxy site); scrawl_ip
# appends the page number to it.
url_ip = "http://www.xicidaili.com/nt/"

# Wait/timeout setting. NOTE(review): not referenced by the visible code;
# ip_test has its own set_timeout parameter with the same default.
set_timeout = 10

# Number of listing pages to crawl; 2 means two pages of addresses.
num = 2

# Number of times a proxy is used. NOTE(review): not referenced by the
# visible code.
count_time = 5

# Pool of browser User-Agent strings used to build the request headers.
UserAgent_List = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

# Request headers shared by ip_test and scrawl_ip. The User-Agent is drawn
# once, when this statement runs, and reused for every request afterwards.
headers = {}
headers['User-Agent'] = random.choice(UserAgent_List)
headers['Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
headers['Accept-Encoding'] = 'gzip'


# Crawl the proxy listing at module load; get_random_ip reads the result.
total_ip = scrawl_ip(url_ip, num)
| 102 | + |
| 103 | + |
| 104 | + |
| 105 | + |
| 106 | + |
0 commit comments