 import config
 import logging
+from urllib.parse import urljoin
 
 import re
+from urllib.parse import urlparse
 from urllib.request import urlopen, Request
 from urllib.robotparser import RobotFileParser
-from urllib.parse import urlparse
+from datetime import datetime
 
 import os
 
 class Crawler():
-	
+
 	# Variables
 	parserobots = False
 	output = None
@@ -21,26 +23,30 @@ class Crawler():
 	exclude = []
 	skipext = []
 	drop = []
-	
+
 	debug = False
 
 	tocrawl = set([])
 	crawled = set([])
 	excluded = set([])
+
+	marked = {}
+
 	# TODO also search for window.location={.*?}
-	linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')
+	linkregex = re.compile(b'<a [^>]*href=[\'|"](.*?)[\'"].*?>')
 
 	rp = None
 	response_code = {}
 	nb_url = 1 # Number of url.
 	nb_rp = 0 # Number of url blocked by the robots.txt
 	nb_exclude = 0 # Number of url excluded by extension or word
-	
+
 	output_file = None
 
 	target_domain = ""
 
-	def __init__(self, parserobots=False, output=None, report=False ,domain="", exclude=[], skipext=[], drop=[], debug=False):
+	def __init__(self, parserobots=False, output=None, report=False ,domain="",
+			exclude=[], skipext=[], drop=[], debug=False, verbose=False):
 		self.parserobots = parserobots
 		self.output = output
 		self.report = report
@@ -49,31 +55,44 @@ def __init__(self, parserobots=False, output=None, report=False ,domain="", excl
 		self.skipext = skipext
 		self.drop = drop
 		self.debug = debug
+		self.verbose = verbose
 
 		if self.debug:
-			logging.basicConfig(level=logging.DEBUG)
+			log_level = logging.DEBUG
+		elif self.verbose:
+			log_level = logging.INFO
+		else:
+			log_level = logging.ERROR
+
+		logging.basicConfig(level=log_level)
 
 		self.tocrawl = set([domain])
 
 		try:
 			self.target_domain = urlparse(domain)[1]
 		except:
+			logging.error("Invalid domain")
 			raise ("Invalid domain")
 
-
 		if self.output:
 			try:
 				self.output_file = open(self.output, 'w')
 			except:
-				logging.debug("Output file not available.")
+				logging.error("Output file not available.")
 				exit(255)
 
 	def run(self):
-		print(config.xml_header, file=self.output_file)
+		print(config.xml_header, file=self.output_file)
+
+		if self.parserobots:
+			self.check_robots()
+
+		logging.info("Start the crawling process")
+
+		while len(self.tocrawl) != 0:
+			self.__crawling()
 
-		logging.debug("Start the crawling process")
-		self.__crawling()
-		logging.debug("Crawling as reach the end of all found link")
+		logging.info("Crawling has reached the end of all found links")
 
 		print(config.xml_footer, file=self.output_file)
 
@@ -83,18 +102,26 @@ def __crawling(self):
 
 		url = urlparse(crawling)
 		self.crawled.add(crawling)
-
+		logging.info("Crawling #{}: {}".format(len(self.crawled), url.geturl()))
+		request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
+
 		try:
-			request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
 			response = urlopen(request)
 		except Exception as e:
 			if hasattr(e,'code'):
 				if e.code in self.response_code:
 					self.response_code[e.code]+=1
 				else:
 					self.response_code[e.code]=1
+
+				# Track marked URLs for reporting
+				if self.report:
+					if e.code in self.marked:
+						self.marked[e.code].append(crawling)
+					else:
+						self.marked[e.code] = [crawling]
+
 			logging.debug("{1} ==> {0}".format(e, crawling))
-			response.close()
 			return self.__continue_crawling()
 
 		# Read the response
@@ -104,28 +131,38 @@ def __crawling(self):
 				self.response_code[response.getcode()]+=1
 			else:
 				self.response_code[response.getcode()]=1
+
 			response.close()
+
+			# Get the last modified date
+			if 'last-modified' in response.headers:
+				date = response.headers['Last-Modified']
+			else:
+				date = response.headers['Date']
+
+			date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')
+
 		except Exception as e:
 			logging.debug("{1} ===> {0}".format(e, crawling))
-			return self.__continue_crawling()
+			return None
 
 
-		print("<url><loc>"+url.geturl()+"</loc></url>", file=self.output_file)
+		print("<url><loc>"+url.geturl()+"</loc><lastmod>"+date.strftime('%Y-%m-%dT%H:%M:%S+00:00')+"</lastmod></url>", file=self.output_file)
 		if self.output_file:
 			self.output_file.flush()
 
 		# Found links
 		links = self.linkregex.findall(msg)
 		for link in links:
 			link = link.decode("utf-8")
-			# logging.debug("Found : {0}".format(link))
+			logging.debug("Found : {0}".format(link))
 			if link.startswith('/'):
 				link = 'http://' + url[1] + link
 			elif link.startswith('#'):
 				link = 'http://' + url[1] + url[2] + link
 			elif not link.startswith('http'):
 				link = 'http://' + url[1] + '/' + link
-			
+
 			# Remove the anchor part if needed
 			if "#" in link:
 				link = link[:link.index('#')]
@@ -149,7 +186,7 @@ def __crawling(self):
 				continue
 			if ("javascript" in link):
 				continue
-			
+
 			# Count one more URL
 			self.nb_url+=1
 
@@ -173,7 +210,7 @@ def __crawling(self):
 
 			self.tocrawl.add(link)
 
-		return self.__continue_crawling()
+		return None
 
 	def __continue_crawling(self):
 		if self.tocrawl:
@@ -183,12 +220,10 @@ def exclude_link(self,link):
 		if link not in self.excluded:
 			self.excluded.add(link)
 
-	def checkRobots(self):
-		if self.domain[len(self.domain)-1] != "/":
-			self.domain += "/"
-		request = Request(self.domain + "robots.txt", headers={"User-Agent":config.crawler_user_agent})
+	def check_robots(self):
+		robots_url = urljoin(self.domain, "robots.txt")
 		self.rp = RobotFileParser()
-		self.rp.set_url(self.domain + "robots.txt")
+		self.rp.set_url(robots_url)
 		self.rp.read()
 
 	def can_fetch(self, link):
@@ -224,4 +259,9 @@ def make_report(self):
 		print("Number of link exclude : {0}".format(self.nb_exclude))
 
 		for code in self.response_code:
-			print("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))
+			print("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))
+
+		for code in self.marked:
+			print("Link with status {0}:".format(code))
+			for uri in self.marked[code]:
+				print("\t- {0}".format(uri))
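
For context, a minimal sketch of how the updated class might be driven after this change. The module name `crawler`, the example domain, and the output path are illustrative assumptions, not part of the commit:

```python
# Minimal usage sketch (assumed module name "crawler", example domain and output path)
from crawler import Crawler

c = Crawler(
	domain="http://example.com/",  # start URL; also used to derive target_domain
	output="sitemap.xml",          # sitemap written between config.xml_header and config.xml_footer
	parserobots=True,              # fetch robots.txt via check_robots() before crawling
	report=True,                   # collect failing URLs per HTTP status in self.marked
	verbose=True,                  # INFO-level logging (new in this change)
)
c.run()          # crawls until tocrawl is empty and writes the sitemap
c.make_report()  # prints URL counts and, with report=True, the marked URLs
```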