Commit
Showing 45 changed files with 12,961 additions and 0 deletions.
@@ -0,0 +1,133 @@
import requests
from bs4 import BeautifulSoup  # used only by the commented-out experiments below
import re                      # used only by the commented-out experiments below
import time
from lxml import etree

time1 = time.time()

# XPath selectors for the subcategory and page links on a MediaWiki category page
category_xpath_link = '//*[@id="mw-subcategories"]/div/ul/li/div/div/a'
page_xpath_link = '//*[@id="mw-pages"]/div/div/div/ul/li/a'


def scrappy(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36'
    }
    try:
        r = requests.get(url, headers=headers)
        html = r.text
        print(html)
        with open('title.txt', 'w', encoding='utf-8') as f:
            f.write(html)
    except Exception as e:
        # The original swallowed the exception and then used the undefined
        # variable `html`; returning early avoids the NameError
        print('Request failed:', e)
        return

    s = etree.HTML(html)

    # Opening in 'w' mode already truncates the file, so the original's
    # separate truncate-then-append steps are folded into a single write
    categorys = s.xpath(category_xpath_link)
    with open('category_pages/category.txt', 'w', encoding='utf-8') as f:
        print('File cleared...')
        for text1 in categorys:
            f.write(text1.text + '\n')

    pages = s.xpath(page_xpath_link)
    with open('category_pages/pages.txt', 'w', encoding='utf-8') as f:
        print('File cleared...')
        for text2 in pages:
            f.write(text2.text + '\n')


scrappy('https://www.slideshare.net/NVIDIA/top-5-deep-learning-and-ai-stories-october-6-2017-80543540')
time2 = time.time()
print('Total time', time2 - time1)


# Earlier experiments, kept commented out in the original file:

# filename += '-->' + str(url.split(':')[-1])
# print(filename)

# links = BeautifulSoup(html, 'lxml').select('body > div > div > div > div > div > div > div > div > ul > li > a')
# for tag2 in links:
#     print(str(tag2).split('< a href = '))

# soup = BeautifulSoup(html, 'lxml')
# subcategories_url = soup.find('div', id='mw-subcategories').find('div', class_='CategoryTreeItem').find('a')
# text1 = soup.find_all('a', class_='CategoryTreeLabel')
# links1 = s.xpath('//*[@id="mw-pages"]/div/ul/li/a/@ *')
# write_pages(page=('\n' * 3 + filename + '\n'))
# for tag2 in links1:
#     write_pages(page=(str(tag2) + '\n'))
# subcategories_url = s.xpath('//*[@id="mw-subcategories"]/div/div/div/ul/li/div/div/a/@href')
# if subcategories_url:
#     for tag in subcategories_url:
#         # scrappy(url='https://zh.wikipedia.org' + tag)
#         print('https://zh.wikipedia.org' + tag)

# for tag in text1:
#     pattern1 = re.compile(r'^<a class="CategoryTreeLabel CategoryTreeLabelNs14 CategoryTreeLabelCategory" href=')
#     sub1 = pattern1.sub('', str(tag))
#     pattern2 = re.compile(r'</a>')
#     sub2 = pattern2.sub('', sub1).split('>')
#     link_category_list = sub2[0]
#     filename = sub2[-1]
#     if link_category_list:
#         print(filename)
#         scrappy(url='https://zh.wikipedia.org' + str(link_category_list).lstrip('"').rstrip('"'), filename=filename)

# def write_pages(page):
#     with open('./category_pages.txt', 'a', encoding='utf-8') as f:
#         f.write(page)
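One mismatch worth flagging: both XPath selectors target the mw-subcategories and mw-pages blocks of a MediaWiki category page, while the hard-coded URL is a SlideShare page, so they will match nothing there; the commented-out experiments suggest zh.wikipedia.org category pages were the intended input. A minimal usage sketch under that assumption (the category name and the directory setup are illustrative, not from the commit):

import os

# The scraper writes into category_pages/, which must already exist
os.makedirs('category_pages', exist_ok=True)

# Hypothetical target: a zh.wikipedia.org category page, the kind of page
# the mw-subcategories / mw-pages selectors are written against
scrappy('https://zh.wikipedia.org/wiki/Category:深度学习')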
BIN +137 Bytes  ..._for_wiki_and_baidu_baike/baidu/BaiduBaike/BaiduBaike/__pycache__/__init__.cpython-35.pyc (binary file added, not shown)
BIN +627 Bytes  ...apy_for_wiki_and_baidu_baike/baidu/BaiduBaike/BaiduBaike/__pycache__/items.cpython-35.pyc (binary file added, not shown)
BIN +3.87 KB  ...r_wiki_and_baidu_baike/baidu/BaiduBaike/BaiduBaike/__pycache__/middlewares.cpython-35.pyc (binary file added, not shown)
BIN +956 Bytes  ...for_wiki_and_baidu_baike/baidu/BaiduBaike/BaiduBaike/__pycache__/pipelines.cpython-35.pyc (binary file added, not shown)
BIN +548 Bytes  ..._for_wiki_and_baidu_baike/baidu/BaiduBaike/BaiduBaike/__pycache__/settings.cpython-35.pyc (binary file added, not shown)
BIN +6.55 KB  ...for_wiki_and_baidu_baike/baidu/BaiduBaike/BaiduBaike/__pycache__/useragent.cpython-35.pyc (binary file added, not shown)
知识图谱/Scrapy_for_wiki_and_baidu_baike/baidu/BaiduBaike/BaiduBaike/items.py (25 additions, 0 deletions)
@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BaidubaikeItem(scrapy.Item):
    # Fields for a scraped Baidu Baike entry
    name = scrapy.Field()
    text = scrapy.Field()
    Chinese_name = scrapy.Field()
    foreign_name = scrapy.Field()
    abbreviation = scrapy.Field()
    proposal_time = scrapy.Field()
    proposal_location = scrapy.Field()
    source = scrapy.Field()
    definition = scrapy.Field()
    basicInfo = scrapy.Field()
    detail = scrapy.Field()
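For context, a minimal sketch of how a spider callback might populate this item; the spider name, start URL, and selectors below are illustrative assumptions, not the spider committed elsewhere in this change:

import scrapy
from BaiduBaike.items import BaidubaikeItem


class BaikeSketchSpider(scrapy.Spider):
    name = 'baike_sketch'
    # Hypothetical entry page; any Baidu Baike item URL has this shape
    start_urls = ['https://baike.baidu.com/item/知识图谱']

    def parse(self, response):
        item = BaidubaikeItem()
        # Illustrative selectors -- the committed spider's are likely different
        item['name'] = response.xpath('//h1/text()').extract_first()
        item['text'] = ' '.join(response.xpath('//div[@class="lemma-summary"]//text()').extract())
        yield item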
知识图谱/Scrapy_for_wiki_and_baidu_baike/baidu/BaiduBaike/BaiduBaike/middlewares.py (130 additions, 0 deletions)
@@ -0,0 +1,130 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import base64
import random

from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

from BaiduBaike.useragent import agents


class BaidubaikeSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class BaidubaikeDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class customUserAgentmiddleware(UserAgentMiddleware):
    def process_request(self, request, spider):
        # Rotate the User-Agent header: pick one at random from the pool
        agent = random.choice(agents)
        request.headers["User-Agent"] = agent


class RandomProxy(object):
    def process_request(self, request, spider):
        PROXIES = [
            {'ip_port': '111.8.60.9:8123', 'user_passwd': 'user1:pass1'},
            {'ip_port': '101.71.27.120:80', 'user_passwd': 'user2:pass2'},
            {'ip_port': '122.96.59.104:80', 'user_passwd': 'user3:pass3'},
            {'ip_port': '122.224.249.122:8088', 'user_passwd': 'user4:pass4'},
        ]
        # Pick one proxy at random
        proxy = random.choice(PROXIES)
        # Base64-encode the user:password pair; b64encode needs bytes in
        # Python 3 and the header value must be text again afterwards
        # (the original passed a str and then concatenated bytes, a TypeError)
        base64_userpasswd = base64.b64encode(proxy['user_passwd'].encode('utf-8')).decode('ascii')
        # Attach the credentials in the proxy's expected Basic-auth format
        request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
        request.meta['proxy'] = "http://" + proxy['ip_port']
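Neither middleware runs until it is registered. A minimal sketch of the DOWNLOADER_MIDDLEWARES entry in the project's settings.py, assuming the BaiduBaike.middlewares module path used above; the priority numbers are conventional choices, not values taken from this commit:

# settings.py (sketch): enable the custom user-agent and proxy middlewares
DOWNLOADER_MIDDLEWARES = {
    # Disable the stock user-agent middleware so the random one takes over
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'BaiduBaike.middlewares.customUserAgentmiddleware': 400,
    'BaiduBaike.middlewares.RandomProxy': 410,
}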
知识图谱/Scrapy_for_wiki_and_baidu_baike/baidu/BaiduBaike/BaiduBaike/pipelines.py (24 additions, 0 deletions)
@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo


class BaidubaikePipeline(object):
    def __init__(self):
        # self.client = pymongo.MongoClient(host="47.75.70.164", port=27017)
        self.client = pymongo.MongoClient(host="localhost", port=27017)
        self.db = self.client['YJ_DB']
        # self.db = self.client['Baike']
        self.collection = self.db['buchong']

    def process_item(self, item, spider):
        # Collection.insert() is deprecated in pymongo 3; insert_one()
        # is the supported call for a single document
        self.collection.insert_one(dict(item))
        return item

    def __del__(self):
        self.client.close()
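As with the middlewares, the pipeline takes effect only once registered in settings.py; a minimal sketch, assuming the default BaiduBaike.pipelines module path (the priority 300 is the conventional template default, not confirmed by this commit):

# settings.py (sketch): route scraped items into the MongoDB pipeline
ITEM_PIPELINES = {
    'BaiduBaike.pipelines.BaidubaikePipeline': 300,
}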