Merged
4 changes: 4 additions & 0 deletions README.md
@@ -20,6 +20,10 @@ Enable debug :

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --debug

Enable report to print a summary of the crawl (sample output below):

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --report

Skip url (by extension) (skip pdf AND xml url):

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --skipext pdf --skipext xml
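
For reference, a crawl run with --report ends with a summary along these lines (counts here are illustrative; the robots.txt and exclusion lines only appear when --parserobots or --skipext/--exclude are used):

    Number of URLs found : 124
    Number of links crawled : 118
    Number of links blocked by robots.txt : 3
    Number of links excluded : 12
    Nb Code HTTP 200 : 115
    Nb Code HTTP 404 : 3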
14 changes: 12 additions & 2 deletions main.py
@@ -48,6 +48,7 @@ def exclude_url(exclude, link):
parser.add_argument('--output', action="store", default=None, help="Output file")
parser.add_argument('--exclude', action="append", default=[], required=False, help="Exclude Url if contain")
parser.add_argument('--drop', action="append", default=[], required=False, help="Drop a string from the url")
parser.add_argument('--report', action="store_true", default=False, required=False, help="Display a report")

group = parser.add_mutually_exclusive_group()
group.add_argument('--config', action="store", default=None, help="Configuration file in json format")
@@ -132,6 +133,7 @@ def exclude_url(exclude, link):
response_code={}
nb_url=1 # Number of url.
nb_rp=0 # Number of url blocked by the robots.txt
nb_exclude=0 # Number of url excluded by extension or word
print (header, file=output_file)
while tocrawl:
crawling = tocrawl.pop()
@@ -225,12 +227,14 @@ def exclude_url(exclude, link):
if (target_extension in arg.skipext):
if link not in excluded:
excluded.add(link)
nb_exclude+=1
continue

# Check if the current url doesn't contain an excluded word
if (not exclude_url(arg.exclude, link)):
if link not in excluded:
excluded.add(link)
nb_exclude+=1
continue

tocrawl.add(link)
@@ -239,11 +243,17 @@ def exclude_url(exclude, link):
if arg.debug:
logging.debug ("Number of found URL : {0}".format(nb_url))
logging.debug ("Number of link crawled : {0}".format(len(crawled)))

if arg.report:
print ("Number of found URL : {0}".format(nb_url))
print ("Number of link crawled : {0}".format(len(crawled)))
if arg.parserobots:
logging.debug ("Number of link block by robots.txt : {0}".format(nb_rp))
print ("Number of link block by robots.txt : {0}".format(nb_rp))
if arg.skipext or arg.exclude:
print ("Number of link exclude : {0}".format(nb_exclude))

for code in response_code:
logging.debug ("Nb Code HTTP {0} : {1}".format(code, response_code[code]))
print ("Nb Code HTTP {0} : {1}".format(code, response_code[code]))

if output_file:
output_file.close()
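
Taken together, the change follows a simple pattern: register a boolean --report flag with argparse, increment counters (nb_url, nb_rp, nb_exclude, response_code) while the crawl loop runs, and print the totals once it finishes. A minimal, self-contained sketch of that pattern, with illustrative names and dummy data rather than the project's actual crawler state:

```python
import argparse

def print_report(nb_url, crawled, nb_rp, nb_exclude, response_code,
                 parserobots=False, has_filters=False):
    # Mirrors the summary printed by main.py when --report is set.
    print("Number of URLs found : {0}".format(nb_url))
    print("Number of links crawled : {0}".format(len(crawled)))
    if parserobots:
        print("Number of links blocked by robots.txt : {0}".format(nb_rp))
    if has_filters:
        print("Number of links excluded : {0}".format(nb_exclude))
    for code, count in response_code.items():
        print("Nb Code HTTP {0} : {1}".format(code, count))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--report', action="store_true", default=False,
                        required=False, help="Display a report")
    arg = parser.parse_args()

    # Dummy crawl state standing in for the real crawler's results.
    crawled = {"http://blog.lesite.us/", "http://blog.lesite.us/post-1"}
    if arg.report:
        print_report(nb_url=3, crawled=crawled, nb_rp=0, nb_exclude=1,
                     response_code={200: 2}, has_filters=True)
```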