
Commit 8feb5b4

load suffix from dicts, remove requests
1 parent 5028625 commit 8feb5b4


5 files changed (+58, -24 lines)


dic/suffix.txt

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+~
+-
+_
+__
+_bak
+_old
+_new
+.bak
+.zip
+.rar
+.tar.gz
+.tar.xz
+.7z
+.old
+.new

lib/common.py

Lines changed: 25 additions & 1 deletion
@@ -1,7 +1,8 @@
 # !/usr/bin/env python
 # -*- coding: utf-8 -*-
 from cmdparse import get_arguments
-from data import spider_conf
+from data import spider_conf, dict_data
+import urlparse
 
 
 def init_options():
@@ -12,3 +13,26 @@ def init_options():
     spider_conf.start_urls = args.get('-u')
     if args.get('-f'):
         pass # TODO
+    load_dict_suffix()
+
+
+def load_dict_suffix():
+    with open('dic/suffix.txt') as f:  # TODO path!
+        dict_data.url_suffix = f.read().split('\n')
+
+
+def gen_urls(base_url):
+    url = base_url.split('?')[0].rstrip('/')
+    if not urlparse.urlparse(url).path:
+        return []
+
+    final_urls = []
+
+    # index.php -> .index.php.swp
+    url_piece = url.split('/')
+    final_urls.append('/'.join(url_piece[:-1]) + '/.' + url_piece[-1].strip('.') + '.swp')
+
+    for each in dict_data.url_suffix:
+        final_urls.append(url + each)
+
+    return final_urls
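
For reference, gen_urls is the helper that replaces the spider's old check() method: it expands a crawled URL into the backup/temporary-file variants to probe. A minimal standalone sketch of what it produces (Python 2, with an abbreviated suffix list and a hypothetical example URL; not code from this commit):

import urlparse

url_suffix = ['~', '.bak', '.old']  # abbreviated stand-in for dic/suffix.txt

def gen_urls(base_url):
    url = base_url.split('?')[0].rstrip('/')
    if not urlparse.urlparse(url).path:
        return []
    final_urls = []
    # index.php -> .index.php.swp (editor swap file)
    url_piece = url.split('/')
    final_urls.append('/'.join(url_piece[:-1]) + '/.' + url_piece[-1].strip('.') + '.swp')
    # index.php -> index.php~, index.php.bak, index.php.old, ...
    for each in url_suffix:
        final_urls.append(url + each)
    return final_urls

print(gen_urls('http://example.com/index.php?id=1'))
# ['http://example.com/.index.php.swp', 'http://example.com/index.php~',
#  'http://example.com/index.php.bak', 'http://example.com/index.php.old']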

lib/data.py

Lines changed: 1 addition & 0 deletions
@@ -4,3 +4,4 @@
 from datatype import AttribDict
 
 spider_conf = AttribDict()
+dict_data = AttribDict()

scrapy_project/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -1,3 +1,2 @@
 # !/usr/bin/env python
 # -*- coding: utf-8 -*-
-__author__ = 'xy'

scrapy_project/spiders/filesensor.py

Lines changed: 17 additions & 22 deletions
@@ -2,19 +2,20 @@
 # -*- coding: utf-8 -*-
 
 import scrapy
-import requests
 import urlparse
 import re
 from lib.data import spider_conf
+from lib.common import gen_urls
 
 
 class FileSensorSpider(scrapy.Spider):
     name = 'filesensor'
+    handle_httpstatus_list = [301, 302, 204, 206, 403, 500]
 
-    def __init__(self, **kw):
-        super(FileSensorSpider, self).__init__(**kw)
+    def __init__(self):
+        super(FileSensorSpider, self).__init__()
         self.url = spider_conf.start_urls
-        print(self.url)
+        print('[START] ' + self.url)
         if not self.url.startswith('http://') and not self.url.startswith('https://'):
             self.url = 'http://%s/' % self.url
         self.allowed_domains = [re.sub(r'^www\.', '', urlparse.urlparse(self.url).hostname)]
@@ -24,28 +25,22 @@ def start_requests(self):
 
     def parse(self, response):
         print('[%s]%s' % (response.status, response.url))
-        self.check(response.url)
+
+        # generate new urls with /dict/suffix.txt
+        for new_url in gen_urls(response.url):
+            # avoid recursive loop
+            yield scrapy.Request(new_url, callback=self.parse_end)
 
         extracted_url = []
-        extracted_url.extend(response.xpath('//*/@href | //*/@src | //form/@action').extract())
+        try:
+            # TODO handle this <a href="/.htaccess">
+            extracted_url.extend(response.xpath('//*/@href | //*/@src | //form/@action').extract())
+        except:
+            return
 
         for url in extracted_url:
            next_url = response.urljoin(url)
            yield scrapy.Request(next_url, callback=self.parse)
 
-    def check(self, url):
-        url = url.split('?')[0]
-        url_piece = url.split('/')
-        if '.' not in url_piece[-1]:
-            return
-        if not urlparse.urlparse(url).path:
-            return
-
-        new_urls = []
-        new_urls.append(url + '~') # index.php~
-        new_urls.append('/'.join(url_piece[:-1]) + '/.' + url_piece[-1] + '.swp') # .index.php.swp
-
-        for url in new_urls:
-            r = requests.get(url)
-            if r.status_code != 404:
-                print('[%s]%s' % (r.status_code, url))
+    def parse_end(self, response):
+        print('[Found!][%s]%s' % (response.status, response.url))
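
handle_httpstatus_list is the standard Scrapy spider attribute read by HttpErrorMiddleware: the listed non-2xx responses are passed through to the spider callbacks instead of being filtered out. Since 404 is not in the list, probe requests from gen_urls for files that do not exist never reach parse_end, which is why the old requests-based status check could be removed. A minimal, self-contained sketch of the pattern (hypothetical spider name and URL, not code from this repository):

import scrapy

class ProbeSpider(scrapy.Spider):
    # hypothetical spider, for illustration only
    name = 'probe'
    start_urls = ['http://example.com/']
    # without this list, HttpErrorMiddleware drops every non-2xx response
    # before it reaches the callback; 404 is deliberately left out
    handle_httpstatus_list = [301, 302, 204, 206, 403, 500]

    def parse(self, response):
        # fires for 200 and for every whitelisted status code
        print('[%s]%s' % (response.status, response.url))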
