import requests
from bs4 import BeautifulSoup
import re
import time
from lxml import etree

# Wall-clock timestamp captured when the script starts running.
time1 = time.time()

# XPath for the <a> elements nested under the element with id "mw-subcategories".
category_xpath_link = (
    '//*[@id="mw-subcategories"]'
    '/div/ul/li/div/div/a'
)

# XPath for the <a> elements nested under the element with id "mw-pages".
page_xpath_link = (
    '//*[@id="mw-pages"]'
    '/div/div/div/ul/li/a'
)

def _write_link_texts(path, nodes):
    """Write the text of every element in ``nodes`` to ``path``, one per line.

    The file is opened in ``'w'`` mode, so any previous content is replaced.
    """
    with open(path, 'w', encoding='utf-8') as f:
        # An element whose first child is another tag has ``.text`` set to
        # None; write an empty line for it instead of raising TypeError.
        f.writelines((node.text or '') + '\n' for node in nodes)


def scrappy(url):
    """Download a category page and dump its link texts to text files.

    The page at ``url`` is fetched with a desktop-browser user agent and
    parsed with lxml. The texts of the elements matched by
    ``category_xpath_link`` are written to ``category_pages/category.txt``
    and those matched by ``page_xpath_link`` to ``category_pages/pages.txt``,
    one text per line. Both files are overwritten on every call.

    Args:
        url: Address of the page to scrape.

    Returns:
        None. If the download fails, the error is printed and nothing is
        written to the ``category_pages`` directory.
    """
    import os

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36',
    }

    # NOTE(review): the original had an ``except Exception:`` here whose
    # ``try`` line and handler body were not visible. The fetch is treated
    # as the guarded step; confirm against the original file.
    try:
        r = requests.get(url, headers=headers)
        html = r.text
        # NOTE(review): the original opened title.txt for writing at this
        # point, but what it wrote is not visible. Only the create/truncate
        # side effect is kept; restore the real write if it is known.
        with open('title.txt', 'w', encoding='utf-8'):
            pass
    except Exception as exc:
        # Without a page body there is nothing to parse, so stop here rather
        # than fail later on an undefined ``html``.
        print('Failed to fetch', url, '-', exc)
        return

    s = etree.HTML(html)
    if s is None:
        # Guard against a parse that produced no tree (e.g. an empty body).
        return

    # The output files live in a sub-directory that may not exist yet.
    os.makedirs('category_pages', exist_ok=True)

    # The original opened each file in 'w' and then again in 'a', which is
    # equivalent to a single truncating write.
    categorys = s.xpath(category_xpath_link)
    _write_link_texts('category_pages/category.txt', categorys)

    pages = s.xpath(page_xpath_link)
    _write_link_texts('category_pages/pages.txt', pages)

# Report the wall-clock seconds elapsed since ``time1`` was recorded.
time2 = time.time()
print('Total time', time2 - time1)

# filename += '-->' + str(url.split(':')[-1])
# print(filename)

# links = BeautifulSoup(html, 'lxml').select('body > div > div > div > div > div > div > div > div > ul > li > a')
# for tag2 in links:
# print(str(tag2).split('< a href = '))

# soup = BeautifulSoup(html, 'lxml')
# subcategories_url = soup.find('div', id='mw-subcategories').find('div', class_='CategoryTreeItem').find('a')
# text1 = soup.find_all('a', class_='CategoryTreeLabel')
# links1 = s.xpath('//*[@id="mw-pages"]/div/ul/li/a/@ *')
# write_pages(page=('\n' * 3 + filename + '\n'))
# for tag2 in links1:
# write_pages(page=(str(tag2) + '\n'))
# subcategories_url = s.xpath('//*[@id="mw-subcategories"]/div/div/div/ul/li/div/div/a/@href')
# if subcategories_url:
# for tag in subcategories_url:
# # scrappy(url='' + tag)
# print('' + tag)

# for tag in text1:
# pattern1 = re.compile(r'^<a class="CategoryTreeLabel CategoryTreeLabelNs14 CategoryTreeLabelCategory" href=')
# sub1 = pattern1.sub('', str(tag))
# pattern2 = re.compile(r'</a>')
# sub2 = pattern2.sub('', sub1).split('>')
# link_category_list = sub2[0]
# filename = sub2[-1]
# if link_category_list:
# print(filename)
# scrappy(url='' + str(link_category_list).lstrip('"').rstrip('"'), filename=filename)

# def write_pages(page):
# with open('./category_pages.txt', 'a', encoding='utf-8') as f:
# f.write(page)

# --- Diff-view residue: a new file begins here (hunk header: @@ -0,0 +1,25 @@) ---
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class BaidubaikeItem(scrapy.Item):
    """Scrapy item declaring every field a scraped entry may carry.

    Each attribute is a plain ``scrapy.Field()`` with no metadata. The
    attribute names are the item's keys, so their mixed naming styles
    (``Chinese_name``, ``basicInfo``) are kept exactly as declared.
    """

    # Name and text fields.
    name = scrapy.Field()
    text = scrapy.Field()
    Chinese_name = scrapy.Field()
    foreign_name = scrapy.Field()
    abbreviation = scrapy.Field()

    # Proposal and source fields.
    proposal_time = scrapy.Field()
    proposal_location = scrapy.Field()
    source = scrapy.Field()

    # Definition, basic-info and detail fields.
    definition = scrapy.Field()
    basicInfo = scrapy.Field()
    detail = scrapy.Field()
# --- Diff-view residue: a new file begins here (hunk header: @@ -0,0 +1,130 @@) ---
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import base64
import random

from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

from BaiduBaike.useragent import agents

class BaidubaikeSpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook forwards its input unchanged. Not all methods need to be
    defined: if a method is missing, Scrapy acts as if the spider middleware
    does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to ``spider_opened``.

        Scrapy calls this on the class itself, so it must be a classmethod.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Called for each response that goes into the spider.

        Should return None or raise an exception; this one always returns
        None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Called with the results the spider returned for ``response``.

        Must return an iterable of Request, dict or Item objects; every
        element of ``result`` is yielded unchanged.
        """
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Called when a spider or ``process_spider_input()`` raises.

        Should return either None or an iterable of Response, dict or Item
        objects. Returning None leaves the exception to other handlers.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Called with the spider's start requests (no response attached).

        Must return only requests (not items); every request is yielded
        unchanged.
        """
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        spider.logger.info('Spider opened: %s' % spider.name)

class BaidubaikeDownloaderMiddleware(object):
    """Pass-through downloader middleware.

    Every hook forwards its input unchanged. Not all methods need to be
    defined: if a method is missing, Scrapy acts as if the downloader
    middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to ``spider_opened``.

        Scrapy calls this on the class itself, so it must be a classmethod.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Called for each request that goes through the downloader middleware.

        Must either:
        - return None: continue processing this request
        - or return a Response object
        - or return a Request object
        - or raise IgnoreRequest: process_exception() methods of
          installed downloader middleware will be called

        This implementation always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Called with the response returned from the downloader.

        Must either return a Response object, return a Request object, or
        raise IgnoreRequest. The response is returned unchanged.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Called when a download handler or ``process_request()`` raises.

        Must either:
        - return None: continue processing this exception
        - return a Response object: stops process_exception() chain
        - return a Request object: stops process_exception() chain

        This implementation returns None, so the chain continues.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        spider.logger.info('Spider opened: %s' % spider.name)

class customUserAgentmiddleware(UserAgentMiddleware):
    """User-agent middleware that picks a fresh agent string per request."""

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent header with a random entry of ``agents``."""
        request.headers["User-Agent"] = random.choice(agents)

class RandomProxy(object):
    """Downloader middleware that sends each request through a random proxy."""

    # Candidate proxies; ``user_passwd`` is "user:password" for HTTP Basic
    # proxy authentication.
    # NOTE(review): every ``ip_port`` is empty in the visible source, so
    # real "host:port" values must be filled in before this is usable.
    PROXIES = [
        {'ip_port': '', 'user_passwd': 'user1:pass1'},
        {'ip_port': '', 'user_passwd': 'user2:pass2'},
        {'ip_port': '', 'user_passwd': 'user3:pass3'},
        {'ip_port': '', 'user_passwd': 'user4:pass4'},
    ]

    def process_request(self, request, spider):
        """Attach a randomly chosen proxy (and its credentials) to ``request``.

        Sets ``request.meta['proxy']`` and, when the chosen proxy has
        credentials, the ``Proxy-Authorization`` header.

        Returns:
            None, so the request continues through the middleware chain.
        """
        # Pick one proxy at random.
        proxy = random.choice(self.PROXIES)

        # Base64-encode the account and password. b64encode takes and
        # returns bytes, so encode first and decode the result before
        # joining it to the "Basic " prefix.
        user_passwd = proxy.get('user_passwd')
        if user_passwd:
            base64_userpasswd = base64.b64encode(
                user_passwd.encode('utf-8')
            ).decode('ascii')
            # Format expected by the proxy server's authentication header.
            request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd

        request.meta['proxy'] = "http://" + proxy['ip_port']
# --- Diff-view residue: a new file begins here (hunk header: @@ -0,0 +1,24 @@) ---
# -*- coding: utf-8 -*-

# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo

class BaidubaikePipeline(object):
    """Item pipeline that holds a MongoDB connection for the crawl.

    Opens a client to the local MongoDB server and selects the ``buchong``
    collection of the ``YJ_DB`` database.
    """

    def __init__(self):
        """Connect to MongoDB on localhost:27017 and select the target collection."""
        # self.client = pymongo.MongoClient(host="", port=27017)
        self.client = pymongo.MongoClient(host="localhost", port=27017)
        self.db = self.client['YJ_DB']
        # self.db = self.client['Baike']
        self.collection = self.db['buchong']

    def process_item(self, item, spider):
        """Return ``item`` unchanged so later pipelines still receive it.

        NOTE(review): nothing in the visible code writes ``item`` to
        ``self.collection``; confirm whether an insert is meant to happen here.
        """
        # print(item)
        return item

    def __del__(self):
        """Close the MongoDB client when the pipeline object is destroyed."""
        # ``client`` is absent if __init__ failed before assigning it, so
        # look it up defensively rather than raising inside a finalizer.
        client = getattr(self, 'client', None)
        if client is not None:
            client.close()

