 import config
 import logging
+from urllib.parse import urljoin
 
 import re
+from urllib.parse import urlparse
 from urllib.request import urlopen, Request
 from urllib.robotparser import RobotFileParser
-from urllib.parse import urlparse
 from datetime import datetime
 
 import os
 
 class Crawler():
-
+
     # Variables
     parserobots = False
     output = None
@@ -22,7 +23,7 @@ class Crawler():
     exclude = []
     skipext = []
     drop = []
-
+
     debug = False
 
     tocrawl = set([])
@@ -39,12 +40,13 @@ class Crawler():
     nb_url = 1  # Number of URLs
     nb_rp = 0  # Number of URLs blocked by the robots.txt
     nb_exclude = 0  # Number of URLs excluded by extension or word
-
+
     output_file = None
 
     target_domain = ""
 
-    def __init__(self, parserobots=False, output=None, report=False, domain="", exclude=[], skipext=[], drop=[], debug=False):
+    def __init__(self, parserobots=False, output=None, report=False, domain="",
+                 exclude=[], skipext=[], drop=[], debug=False, verbose=False):
         self.parserobots = parserobots
         self.output = output
         self.report = report
@@ -53,34 +55,44 @@ def __init__(self, parserobots=False, output=None, report=False, domain="", excl
         self.skipext = skipext
         self.drop = drop
         self.debug = debug
+        self.verbose = verbose
 
         if self.debug:
-            logging.basicConfig(level=logging.DEBUG)
+            log_level = logging.DEBUG
+        elif self.verbose:
+            log_level = logging.INFO
+        else:
+            log_level = logging.ERROR
+
+        logging.basicConfig(level=log_level)
 
         self.tocrawl = set([domain])
 
         try:
             self.target_domain = urlparse(domain)[1]
         except:
+            logging.error("Invalid domain")
             raise ValueError("Invalid domain")
 
-
         if self.output:
             try:
                 self.output_file = open(self.output, 'w')
             except:
-                logging.debug("Output file not available.")
+                logging.error("Output file not available.")
                 exit(255)
 
     def run(self):
-        print(config.xml_header, file=self.output_file)
+        print(config.xml_header, file=self.output_file)
 
-        logging.debug("Start the crawling process")
+        if self.parserobots:
+            self.check_robots()
+
+        logging.info("Start the crawling process")
 
         while len(self.tocrawl) != 0:
             self.__crawling()
 
-        logging.debug("Crawling as reach the end of all found link ")
+        logging.info("Crawling has reached the end of all found links")
 
         print(config.xml_footer, file=self.output_file)
 
@@ -90,8 +102,9 @@ def __crawling(self):
 
         url = urlparse(crawling)
         self.crawled.add(crawling)
+        logging.info("Crawling #{}: {}".format(len(self.crawled), url.geturl()))
         request = Request(crawling, headers={"User-Agent": config.crawler_user_agent})
-
+
         try:
             response = urlopen(request)
         except Exception as e:
@@ -142,14 +155,14 @@ def __crawling(self):
         links = self.linkregex.findall(msg)
         for link in links:
             link = link.decode("utf-8")
-            # logging.debug("Found : {0}".format(link))
+            logging.debug("Found : {0}".format(link))
             if link.startswith('/'):
                 link = 'http://' + url[1] + link
             elif link.startswith('#'):
                 link = 'http://' + url[1] + url[2] + link
             elif not link.startswith('http'):
                 link = 'http://' + url[1] + '/' + link
-
+
             # Remove the anchor part if needed
             if "#" in link:
                 link = link[:link.index('#')]
@@ -173,7 +186,7 @@ def __crawling(self):
                 continue
             if ("javascript" in link):
                 continue
-
+
             # Count one more URL
             self.nb_url += 1
 
@@ -196,7 +209,7 @@ def __crawling(self):
                 continue
 
             self.tocrawl.add(link)
-
+
         return None
 
     def __continue_crawling(self):
@@ -207,12 +220,10 @@ def exclude_link(self,link):
         if link not in self.excluded:
             self.excluded.add(link)
 
-    def checkRobots(self):
-        if self.domain[len(self.domain)-1] != "/":
-            self.domain += "/"
-        request = Request(self.domain + "robots.txt", headers={"User-Agent": config.crawler_user_agent})
+    def check_robots(self):
+        robots_url = urljoin(self.domain, "robots.txt")
         self.rp = RobotFileParser()
-        self.rp.set_url(self.domain + "robots.txt")
+        self.rp.set_url(robots_url)
         self.rp.read()
 
     def can_fetch(self, link):
@@ -254,4 +265,3 @@ def make_report(self):
         print("Link with status {0}:".format(code))
         for uri in self.marked[code]:
             print("\t- {0}".format(uri))
-
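
For reference, a minimal usage sketch of the revised interface follows. The file and argument values are hypothetical examples: it assumes the class lives in crawler.py next to a config module that defines crawler_user_agent, xml_header, and xml_footer, as the imports above imply.

# Hypothetical invocation of the updated Crawler; values are examples only.
from crawler import Crawler

crawler = Crawler(
    parserobots=True,        # honour robots.txt via the new check_robots()
    output="sitemap.xml",    # written between config.xml_header and config.xml_footer
    domain="http://example.com/",
    exclude=["action=edit"],
    skipext=["pdf", "zip"],
    verbose=True,            # new flag: INFO-level logging without full DEBUG output
)
crawler.run()

One caveat on the urljoin change: urljoin(self.domain, "robots.txt") resolves relative to the last path segment, so it matches the old trailing-slash concatenation only when domain is a site root such as http://example.com/; for a deeper starting URL it would point at that directory's robots.txt rather than the site root's.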