docs404.py (forked from linode/docs)
# -*- coding: utf-8 -*-
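"""Check the local docs site for broken internal links.

The spider starts at http://localhost:1313/docs, follows links under /docs/
(skipping /docs/contribute), and records the URL, HTTP status, and referring
page of every response that comes back as a 404.
"""
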
import logging

import scrapy
from scrapy import Item, Field
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess


class Docs404Item(Item):
    # One row per broken link: the linking page, the HTTP status, and the URL.
    referer = Field()
    status = Field()
    url = Field()


class Docs404Spider(CrawlSpider):

    def __init__(self, *args, **kwargs):
        # Quiet Scrapy's noisier loggers so 404 reports stand out in the output.
        loggers = ['scrapy.core.engine',
                   'scrapy.downloadermiddlewares.redirect',
                   'scrapy.spidermiddlewares.offsite',
                   'scrapy.middleware']
        for l in loggers:
            logger = logging.getLogger(l)
            logger.setLevel(logging.WARNING)
        super().__init__(*args, **kwargs)

    # Delay if the server is returning lots of 500s
    # DOWNLOAD_DELAY = 0.1
    name = 'docs404'
    allowed_domains = ['localhost']
    start_urls = ['http://localhost:1313/docs']
    # Let 404 responses reach the parse callbacks instead of being filtered out.
    handle_httpstatus_list = [404]

    rules = (
        Rule(LinkExtractor(allow=r'/docs/', deny=r'/docs/contribute'),
             callback='parse_item', follow=True),
    )

    def parse_start_url(self, response):
        # Run the start URL through the same check as every crawled page.
        return self.parse_item(response)

    def parse_item(self, response):
        item = Docs404Item()
        if response.status == 404:
            ref = response.request.headers.get('Referer')
            item['referer'] = ref if ref else 'orphaned link'
            item['status'] = response.status
            item['url'] = response.url
            return item


if __name__ == "__main__":
    import os
    import sys
    import requests
    from blueberry import BASE_URL  # repo-local module; BASE_URL should point at the local Hugo site

    process = CrawlerProcess({'USER_AGENT': 'docs404',
                              'FEED_URI': 'temp.csv',
                              'FEED_FORMAT': 'csv'})
    process.crawl(Docs404Spider)
    process.start()

    # Read the results, then clean up the temporary feed file.
    f = open('temp.csv')
    os.remove('temp.csv')

    # An empty feed could also mean the Hugo server was never reachable,
    # so confirm it is up before interpreting the results.
    try:
        requests.get(BASE_URL)
    except requests.exceptions.ConnectionError:
        print('\n\nHugo server not running on port 1313')
        sys.exit(1)

    if sum(1 for line in f) != 0:
        print('404 response in HTML - see logs')
        sys.exit(1)
    else:
        print('\n\nScraper did not find any 404 links')
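
# Usage: start the docs' Hugo server on localhost:1313, then run
#
#     python docs404.py
#
# The script exits non-zero if the server is unreachable or if any 404 links
# are found. Running the spider on its own with
# `scrapy runspider docs404.py -o links.csv` should also work, skipping the
# reporting logic in the __main__ block above.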