Merged
4 changes: 4 additions & 0 deletions README.md
@@ -20,6 +20,10 @@ Enable debug :

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --debug

Enable report to print a summary of the crawl (sample output below):

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --report

Skip url (by extension) (skip pdf AND xml url):

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --skipext pdf --skipext xml
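
For reference, a crawl run with --report ends with a summary along these lines (counts here are illustrative; the robots.txt and exclusion lines only appear when --parserobots or --skipext/--exclude are used):

    Number of URLs found : 124
    Number of links crawled : 118
    Number of links blocked by robots.txt : 3
    Number of links excluded : 12
    Nb Code HTTP 200 : 115
    Nb Code HTTP 404 : 3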
14 changes: 12 additions & 2 deletions main.py
@@ -48,6 +48,7 @@ def exclude_url(exclude, link):
parser.add_argument('--output', action="store", default=None, help="Output file")
parser.add_argument('--exclude', action="append", default=[], required=False, help="Exclude Url if contain")
parser.add_argument('--drop', action="append", default=[], required=False, help="Drop a string from the url")
parser.add_argument('--report', action="store_true", default=False, required=False, help="Display a report")

group = parser.add_mutually_exclusive_group()
group.add_argument('--config', action="store", default=None, help="Configuration file in json format")
@@ -132,6 +133,7 @@ def exclude_url(exclude, link):
response_code={}
nb_url=1 # Number of url.
nb_rp=0 # Number of url blocked by the robots.txt
nb_exclude=0 # Number of url excluded by extension or word
print (header, file=output_file)
while tocrawl:
crawling = tocrawl.pop()
@@ -225,12 +227,14 @@ def exclude_url(exclude, link):
if (target_extension in arg.skipext):
if link not in excluded:
excluded.add(link)
nb_exclude+=1
continue

# Check if the current url doesn't contain an excluded word
if (not exclude_url(arg.exclude, link)):
if link not in excluded:
excluded.add(link)
nb_exclude+=1
continue

tocrawl.add(link)
@@ -239,11 +243,17 @@ def exclude_url(exclude, link):
if arg.debug:
logging.debug ("Number of found URL : {0}".format(nb_url))
logging.debug ("Number of link crawled : {0}".format(len(crawled)))

if arg.report:
print ("Number of found URL : {0}".format(nb_url))
print ("Number of link crawled : {0}".format(len(crawled)))
if arg.parserobots:
logging.debug ("Number of link block by robots.txt : {0}".format(nb_rp))
print ("Number of link block by robots.txt : {0}".format(nb_rp))
if arg.skipext or arg.exclude:
print ("Number of link exclude : {0}".format(nb_exclude))

for code in response_code:
logging.debug ("Nb Code HTTP {0} : {1}".format(code, response_code[code]))
print ("Nb Code HTTP {0} : {1}".format(code, response_code[code]))

if output_file:
output_file.close()
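
Taken together, the change follows a simple pattern: register a boolean --report flag with argparse, increment counters (nb_url, nb_rp, nb_exclude, response_code) while the crawl loop runs, and print the totals once it finishes. A minimal, self-contained sketch of that pattern, with illustrative names and dummy data rather than the project's actual crawler state:

```python
import argparse

def print_report(nb_url, crawled, nb_rp, nb_exclude, response_code,
                 parserobots=False, has_filters=False):
    # Mirrors the summary printed by main.py when --report is set.
    print("Number of URLs found : {0}".format(nb_url))
    print("Number of links crawled : {0}".format(len(crawled)))
    if parserobots:
        print("Number of links blocked by robots.txt : {0}".format(nb_rp))
    if has_filters:
        print("Number of links excluded : {0}".format(nb_exclude))
    for code, count in response_code.items():
        print("Nb Code HTTP {0} : {1}".format(code, count))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--report', action="store_true", default=False,
                        required=False, help="Display a report")
    arg = parser.parse_args()

    # Dummy crawl state standing in for the real crawler's results.
    crawled = {"http://blog.lesite.us/", "http://blog.lesite.us/post-1"}
    if arg.report:
        print_report(nb_url=3, crawled=crawled, nb_rp=0, nb_exclude=1,
                     response_code={200: 2}, has_filters=True)
```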