# -*- coding: utf-8 -*-
import scrapy
- import requests
import urlparse
import re
from lib.data import spider_conf
+ from lib.common import gen_urls


class FileSensorSpider(scrapy.Spider):
    name = 'filesensor'
+     handle_httpstatus_list = [301, 302, 204, 206, 403, 500]

-     def __init__(self, **kw):
-         super(FileSensorSpider, self).__init__(**kw)
+     def __init__(self):
+         super(FileSensorSpider, self).__init__()
        self.url = spider_conf.start_urls
-         print(self.url)
+         print('[START] ' + self.url)
        if not self.url.startswith('http://') and not self.url.startswith('https://'):
            self.url = 'http://%s/' % self.url
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse.urlparse(self.url).hostname)]
@@ -24,28 +25,22 @@ def start_requests(self):

    def parse(self, response):
        print('[%s]%s' % (response.status, response.url))
-         self.check(response.url)
+
+         # generate new urls with /dict/suffix.txt
+         for new_url in gen_urls(response.url):
+             # avoid recursive loop
+             yield scrapy.Request(new_url, callback=self.parse_end)

        extracted_url = []
-         extracted_url.extend(response.xpath('//*/@href | //*/@src | //form/@action').extract())
+         try:
+             # TODO handle this <a href="/.htaccess">
+             extracted_url.extend(response.xpath('//*/@href | //*/@src | //form/@action').extract())
+         except:
+             return

        for url in extracted_url:
            next_url = response.urljoin(url)
            yield scrapy.Request(next_url, callback=self.parse)

-     def check(self, url):
-         url = url.split('?')[0]
-         url_piece = url.split('/')
-         if '.' not in url_piece[-1]:
-             return
-         if not urlparse.urlparse(url).path:
-             return
-
-         new_urls = []
-         new_urls.append(url + '~')  # index.php~
-         new_urls.append('/'.join(url_piece[:-1]) + '/.' + url_piece[-1] + '.swp')  # .index.php.swp
-
-         for url in new_urls:
-             r = requests.get(url)
-             if r.status_code != 404:
-                 print('[%s]%s' % (r.status_code, url))
+     def parse_end(self, response):
+         print('[Found!][%s]%s' % (response.status, response.url))
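
The new parse() relies on gen_urls() from lib.common, which this commit does not include. Below is a minimal sketch of what such a helper might look like, assuming dict/suffix.txt lists one suffix per line; the file path, the DICT_PATH constant, and the exact candidate patterns are guesses based on the in-code comment and the removed check() logic, not the project's actual implementation.

# lib/common.py -- hypothetical sketch; the real gen_urls() is not shown in this diff.
# Assumption: dict/suffix.txt holds one suffix per line, e.g. "~", ".bak", ".swp".
import os

DICT_PATH = os.path.join(os.path.dirname(__file__), '..', 'dict', 'suffix.txt')


def gen_urls(url):
    """Yield backup/temp-file candidates for a crawled URL (e.g. index.php~, .index.php.swp)."""
    base = url.split('?')[0]          # drop any query string
    pieces = base.split('/')
    filename = pieces[-1]
    if '.' not in filename:           # only plain files such as index.php are worth probing
        return

    with open(DICT_PATH) as f:
        suffixes = [line.strip() for line in f if line.strip()]

    for suffix in suffixes:
        yield base + suffix                                  # editor/backup leftovers: index.php~, index.php.bak
    yield '/'.join(pieces[:-1]) + '/.' + filename + '.swp'   # vim swap file: .index.php.swp

Each candidate is then requested by scrapy and routed to parse_end(), where any non-filtered status (per handle_httpstatus_list) is reported as a hit.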