# -*- coding: utf-8 -*-
import scrapy
- import requests
import urlparse
import re
from lib.data import spider_conf
+ from lib.common import gen_urls


class FileSensorSpider(scrapy.Spider):
    name = 'filesensor'
+     handle_httpstatus_list = [301, 302, 204, 206, 403, 500]

-     def __init__(self, **kw):
-         super(FileSensorSpider, self).__init__(**kw)
+     def __init__(self):
+         super(FileSensorSpider, self).__init__()
        self.url = spider_conf.start_urls
-         print(self.url)
+         print('[START] ' + self.url)
        if not self.url.startswith('http://') and not self.url.startswith('https://'):
            self.url = 'http://%s/' % self.url
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse.urlparse(self.url).hostname)]
@@ -24,28 +25,22 @@ def start_requests(self):

    def parse(self, response):
        print('[%s]%s' % (response.status, response.url))
-         self.check(response.url)
+
+         # generate new urls with /dict/suffix.txt
+         for new_url in gen_urls(response.url):
+             # avoid recursive loop
+             yield scrapy.Request(new_url, callback=self.parse_end)

        extracted_url = []
-         extracted_url.extend(response.xpath('//*/@href | //*/@src | //form/@action').extract())
+         try:
+             # TODO handle this <a href="/.htaccess">
+             extracted_url.extend(response.xpath('//*/@href | //*/@src | //form/@action').extract())
+         except:
+             return

        for url in extracted_url:
            next_url = response.urljoin(url)
            yield scrapy.Request(next_url, callback=self.parse)

-     def check(self, url):
-         url = url.split('?')[0]
-         url_piece = url.split('/')
-         if '.' not in url_piece[-1]:
-             return
-         if not urlparse.urlparse(url).path:
-             return
-
-         new_urls = []
-         new_urls.append(url + '~')  # index.php~
-         new_urls.append('/'.join(url_piece[:-1]) + '/.' + url_piece[-1] + '.swp')  # .index.php.swp
-
-         for url in new_urls:
-             r = requests.get(url)
-             if r.status_code != 404:
-                 print('[%s]%s' % (r.status_code, url))
+     def parse_end(self, response):
+         print('[Found!][%s]%s' % (response.status, response.url))
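
The new parse() relies on gen_urls() from lib.common, which this commit does not include. Below is a minimal sketch of what such a helper might look like, assuming dict/suffix.txt lists one suffix per line; the file path, the DICT_PATH constant, and the exact candidate patterns are guesses based on the in-code comment and the removed check() logic, not the project's actual implementation.

# lib/common.py -- hypothetical sketch; the real gen_urls() is not shown in this diff.
# Assumption: dict/suffix.txt holds one suffix per line, e.g. "~", ".bak", ".swp".
import os

DICT_PATH = os.path.join(os.path.dirname(__file__), '..', 'dict', 'suffix.txt')


def gen_urls(url):
    """Yield backup/temp-file candidates for a crawled URL (e.g. index.php~, .index.php.swp)."""
    base = url.split('?')[0]          # drop any query string
    pieces = base.split('/')
    filename = pieces[-1]
    if '.' not in filename:           # only plain files such as index.php are worth probing
        return

    with open(DICT_PATH) as f:
        suffixes = [line.strip() for line in f if line.strip()]

    for suffix in suffixes:
        yield base + suffix                                  # editor/backup leftovers: index.php~, index.php.bak
    yield '/'.join(pieces[:-1]) + '/.' + filename + '.swp'   # vim swap file: .index.php.swp

Each candidate is then requested by scrapy and routed to parse_end(), where any non-filtered status (per handle_httpstatus_list) is reported as a hit.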