README.md: 4 changes (3 additions, 1 deletion)
@@ -24,9 +24,11 @@ Skip url (by extension) (skip pdf AND xml url):
 
 >>> python main.py --domain http://blog.lesite.us --output sitemap.xml --skipext pdf --skipext xml
 
-Drop attribute from url (regexp) :
+Drop url via regexp :
 
 >>> python main.py --domain http://blog.lesite.us --output sitemap.xml --drop "id=[0-9]{5}"
+or (remove the index.html from the sitemap)
+>>> python main.py --domain http://blog.lesite.us --drop "index.[a-z]{4}"
 
 Exclude url by filter a part of it :
 
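The second example reads as if --drop strips the matched part of the URL from the sitemap entry rather than skipping the whole URL. A minimal sketch of that behaviour, assuming re.sub-style substitution (apply_drop_patterns is a hypothetical helper, not code from this repository):

import re

def apply_drop_patterns(url, drop_patterns):
    # Hypothetical helper: strip every part of the URL matched by a --drop regexp.
    for pattern in drop_patterns:
        url = re.sub(pattern, "", url)
    return url

# With --drop "index.[a-z]{4}", a crawled page such as
# http://blog.lesite.us/folder/index.html would be written to the sitemap as
# http://blog.lesite.us/folder/
print(apply_drop_patterns("http://blog.lesite.us/folder/index.html", [r"index.[a-z]{4}"]))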
main.py: 41 changes (23 additions, 18 deletions)
@@ -129,9 +129,9 @@ def exclude_url(exclude, link):
 	rp.set_url(arg.domain+"robots.txt")
 	rp.read()
 
-responseCode={}
-nbUrl=1
-nbRp=0
+response_code={}
+nb_url=1  # Number of urls found.
+nb_rp=0  # Number of urls blocked by robots.txt.
 print (header, file=output_file)
 while tocrawl:
 	crawling = tocrawl.pop()
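The renamed response_code dict is still filled with a manual membership check before each increment (see the next two hunks); collections.defaultdict expresses the same bookkeeping in one line per hit. A sketch of that alternative, not part of this PR:

from collections import defaultdict

response_code = defaultdict(int)  # missing status codes start at 0
nb_url = 1  # number of urls found
nb_rp = 0   # number of urls blocked by robots.txt

# Each observed HTTP status becomes a single increment, no membership test needed:
response_code[200] += 1
response_code[404] += 1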
@@ -146,12 +146,12 @@ def exclude_url(exclude, link):
 		response = urlopen(request)
 	except Exception as e:
 		if hasattr(e,'code'):
-			if e.code in responseCode:
-				responseCode[e.code]+=1
+			if e.code in response_code:
+				response_code[e.code]+=1
 			else:
-				responseCode[e.code]=1
+				response_code[e.code]=1
 			#else:
-			#	responseCode['erreur']+=1
+			#	response_code['erreur']+=1
 		if arg.debug:
 			logging.debug ("{1} ==> {0}".format(e, crawling))
 		response.close()
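This hunk keeps the broad "except Exception" plus a hasattr(e, 'code') probe; catching urllib.error.HTTPError and URLError separately states the same intent explicitly. A self-contained sketch of that variant (fetch and its signature are illustrative, not the script's API):

import logging
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

def fetch(url, response_code):
    # Illustrative variant of the crawler's fetch step with explicit error types.
    try:
        response = urlopen(Request(url))
    except HTTPError as e:    # the server answered, but with an error status
        response_code[e.code] = response_code.get(e.code, 0) + 1
        return None
    except URLError as e:     # network-level failure, no status code to count
        logging.debug("%s ==> %s", url, e.reason)
        return None
    response_code[response.getcode()] = response_code.get(response.getcode(), 0) + 1
    return response.read()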
@@ -160,10 +160,10 @@ def exclude_url(exclude, link):
 	# Read the response
 	try:
 		msg = response.read()
-		if response.getcode() in responseCode:
-			responseCode[response.getcode()]+=1
+		if response.getcode() in response_code:
+			response_code[response.getcode()]+=1
 		else:
-			responseCode[response.getcode()]=1
+			response_code[response.getcode()]=1
 		response.close()
 	except Exception as e:
 		if arg.debug:
@@ -212,18 +212,23 @@ def exclude_url(exclude, link):
 				continue
 
 			# Count one more URL
-			nbUrl+=1
+			nb_url+=1
 
-			if (can_fetch(arg.parserobots, rp, link, arg.debug) == False):
+			# Check if the navigation is allowed by the robots.txt
+			if (not can_fetch(arg.parserobots, rp, link, arg.debug)):
 				if link not in excluded:
 					excluded.add(link)
-					nbRp+=1
+					nb_rp+=1
 				continue
+
+			# Check if the current file extension is allowed or not.
 			if (target_extension in arg.skipext):
 				if link not in excluded:
 					excluded.add(link)
 				continue
-			if (exclude_url(arg.exclude, link)==False):
+
+			# Check if the current url doesn't contain an excluded word
+			if (not exclude_url(arg.exclude, link)):
 				if link not in excluded:
 					excluded.add(link)
 				continue
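The three commented guards above reduce to the standard-library robots parser plus two membership tests. A condensed sketch of the same gating, assuming urllib.robotparser and a generic "*" user agent (is_wanted and its extension heuristic are illustrative, not the script's can_fetch/exclude_url helpers):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url("http://blog.lesite.us/robots.txt")
rp.read()  # fetches and parses the live robots.txt

def is_wanted(link, skipext, exclude_words):
    # Illustrative gate mirroring the three checks in the crawl loop.
    if not rp.can_fetch("*", link):                      # blocked by robots.txt
        return False
    if link.rsplit(".", 1)[-1].lower() in skipext:       # extension on the --skipext list
        return False
    if any(word in link for word in exclude_words):      # contains an --exclude word
        return False
    return True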
@@ -232,13 +237,13 @@ def exclude_url(exclude, link):
 print (footer, file=output_file)
 
 if arg.debug:
-	logging.debug ("Number of found URL : {0}".format(nbUrl))
+	logging.debug ("Number of found URL : {0}".format(nb_url))
 	logging.debug ("Number of link crawled : {0}".format(len(crawled)))
 	if arg.parserobots:
-		logging.debug ("Number of link block by robots.txt : {0}".format(nbRp))
+		logging.debug ("Number of link block by robots.txt : {0}".format(nb_rp))
 
-	for code in responseCode:
-		logging.debug ("Nb Code HTTP {0} : {1}".format(code, responseCode[code]))
+	for code in response_code:
+		logging.debug ("Nb Code HTTP {0} : {1}".format(code, response_code[code]))
 
 if output_file:
 	output_file.close()