
Commit aab7e47

Merge pull request #7 from c4software/master

Update from origin c4software

2 parents ab49738 + dda18bc, commit aab7e47

File tree

8 files changed: +789 -34 lines changed

.gitignore

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+*.py[cod]
+
+# C extensions
+*.so
+
+# Packages
+*.egg
+*.egg-info
+dist
+build
+eggs
+parts
+bin
+var
+sdist
+develop-eggs
+.installed.cfg
+lib
+lib64
+
+# Installer logs
+pip-log.txt
+
+# Unit test / coverage reports
+.coverage
+.tox
+nosetests.xml
+
+# Translations
+*.mo
+
+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject

LICENSE.txt

Lines changed: 675 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 5 additions & 1 deletion
@@ -16,10 +16,14 @@ Read a config file to set parameters:
 
 >>> python main.py --config config.json
 
-Enable debug :
+Enable debug:
 
 >>> python main.py --domain http://blog.lesite.us --output sitemap.xml --debug
 
+Enable verbose output:
+
+>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --verbose
+
 Enable report for print summary of the crawl:
 
 >>> python main.py --domain http://blog.lesite.us --output sitemap.xml --report

__pycache__/config.cpython-32.pyc

-519 Bytes
Binary file not shown.

__pycache__/crawler.cpython-32.pyc

-7.08 KB
Binary file not shown.

config.json

Lines changed: 3 additions & 3 deletions
@@ -4,10 +4,10 @@
 	"pdf",
 	"xml"
 	],
-	"parserobots":true,
+	"parserobots":false,
 	"debug":true,
-	"output":false,
+	"output":"sitemap.xml",
 	"exclude": [
 		"action=edit"
 	]
-}
+}
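With these new defaults, config.json already names an output file and leaves robots.txt parsing off. A quick, hypothetical sanity check of the committed values (plain json, not the project's own config loader):

    import json

    # Hypothetical check of the new defaults; not the project's loader.
    with open("config.json") as fh:
        conf = json.load(fh)
    print(conf["parserobots"], conf["output"])  # False sitemap.xml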

crawler.py

Lines changed: 68 additions & 28 deletions
@@ -1,15 +1,17 @@
 import config
 import logging
+from urllib.parse import urljoin
 
 import re
+from urllib.parse import urlparse
 from urllib.request import urlopen, Request
 from urllib.robotparser import RobotFileParser
-from urllib.parse import urlparse
+from datetime import datetime
 
 import os
 
 class Crawler():
-
+
 	# Variables
 	parserobots = False
 	output = None
@@ -21,26 +23,30 @@ class Crawler():
 	exclude = []
 	skipext = []
 	drop = []
-
+
 	debug = False
 
 	tocrawl = set([])
 	crawled = set([])
 	excluded = set([])
+
+	marked = {}
+
 	# TODO also search for window.location={.*?}
-	linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')
+	linkregex = re.compile(b'<a [^>]*href=[\'|"](.*?)[\'"].*?>')
 
 	rp = None
 	response_code={}
 	nb_url=1 # Number of url.
 	nb_rp=0 # Number of url blocked by the robots.txt
 	nb_exclude=0 # Number of url excluded by extension or word
-
+
 	output_file = None
 
 	target_domain = ""
 
-	def __init__(self, parserobots=False, output=None, report=False ,domain="", exclude=[], skipext=[], drop=[], debug=False):
+	def __init__(self, parserobots=False, output=None, report=False ,domain="",
+			exclude=[], skipext=[], drop=[], debug=False, verbose=False):
 		self.parserobots = parserobots
 		self.output = output
 		self.report = report
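The widened link regex is the functional change in this hunk: the old pattern only matched anchors whose first attribute was href. A small standalone comparison of the two patterns (the sample HTML is made up for illustration):

    import re

    old = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')
    new = re.compile(b'<a [^>]*href=[\'|"](.*?)[\'"].*?>')

    # Anchors with attributes before href were silently skipped by the old pattern.
    html = b'<a class="nav" href="/about">About</a> <a href="/home">Home</a>'
    print(old.findall(html))  # [b'/home']
    print(new.findall(html))  # [b'/about', b'/home']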
@@ -49,31 +55,44 @@ def __init__(self, parserobots=False, output=None, report=False ,domain="", excl
 		self.skipext = skipext
 		self.drop = drop
 		self.debug = debug
+		self.verbose = verbose
 
 		if self.debug:
-			logging.basicConfig(level=logging.DEBUG)
+			log_level = logging.DEBUG
+		elif self.verbose:
+			log_level = logging.INFO
+		else:
+			log_level = logging.ERROR
+
+		logging.basicConfig(level=log_level)
 
 		self.tocrawl = set([domain])
 
 		try:
 			self.target_domain = urlparse(domain)[1]
 		except:
+			logging.error("Invalide domain")
 			raise ("Invalid domain")
 
-
 		if self.output:
 			try:
 				self.output_file = open(self.output, 'w')
 			except:
-				logging.debug ("Output file not available.")
+				logging.error ("Output file not available.")
 				exit(255)
 
 	def run(self):
-		print (config.xml_header, file=self.output_file)
+		print(config.xml_header, file=self.output_file)
+
+		if self.parserobots:
+			self.check_robots()
+
+		logging.info("Start the crawling process")
+
+		while len(self.tocrawl) != 0:
+			self.__crawling()
 
-		logging.debug("Start the crawling process")
-		self.__crawling()
-		logging.debug("Crawling as reach the end of all found link")
+		logging.info("Crawling has reached end of all found links")
 
 		print (config.xml_footer, file=self.output_file)
 
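The constructor now derives a single log level from the two flags, with --debug taking precedence over --verbose and everything else limited to errors. A minimal standalone sketch of that precedence (flag values hard-coded here for illustration):

    import logging

    debug, verbose = False, True      # stand-ins for the parsed CLI flags
    if debug:
        log_level = logging.DEBUG
    elif verbose:
        log_level = logging.INFO
    else:
        log_level = logging.ERROR
    logging.basicConfig(level=log_level)

    logging.info("shown with --verbose or --debug")
    logging.debug("shown only with --debug")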
@@ -83,18 +102,26 @@ def __crawling(self):
 
 		url = urlparse(crawling)
 		self.crawled.add(crawling)
-
+		logging.info("Crawling #{}: {}".format(len(self.crawled), url.geturl()))
+		request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
+
 		try:
-			request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
 			response = urlopen(request)
 		except Exception as e:
 			if hasattr(e,'code'):
 				if e.code in self.response_code:
 					self.response_code[e.code]+=1
 				else:
 					self.response_code[e.code]=1
+
+				# Gestion des urls marked pour le reporting
+				if self.report:
+					if e.code in self.marked:
+						self.marked[e.code].append(crawling)
+					else:
+						self.marked[e.code] = [crawling]
+
 			logging.debug ("{1} ==> {0}".format(e, crawling))
-			response.close()
 			return self.__continue_crawling()
 
 		# Read the response
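The new self.marked bookkeeping is a plain dict mapping an HTTP status code to the list of URLs that returned it, filled only when --report is set. The same accumulation could also be written with collections.defaultdict; a short equivalent sketch (URLs invented for illustration):

    from collections import defaultdict

    marked = defaultdict(list)
    marked[404].append("http://blog.lesite.us/old-post")
    marked[404].append("http://blog.lesite.us/missing-image")
    print(dict(marked))
    # {404: ['http://blog.lesite.us/old-post', 'http://blog.lesite.us/missing-image']}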
@@ -104,28 +131,38 @@ def __crawling(self):
 				self.response_code[response.getcode()]+=1
 			else:
 				self.response_code[response.getcode()]=1
+
 			response.close()
+
+			# Get the last modify date
+			if 'last-modified' in response.headers:
+				date = response.headers['Last-Modified']
+			else:
+				date = response.headers['Date']
+
+			date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')
+
 		except Exception as e:
 			logging.debug ("{1} ===> {0}".format(e, crawling))
-			return self.__continue_crawling()
+			return None
 
 
-		print ("<url><loc>"+url.geturl()+"</loc></url>", file=self.output_file)
+		print ("<url><loc>"+url.geturl()+"</loc><lastmod>"+date.strftime('%Y-%m-%dT%H:%M:%S+00:00')+"</lastmod></url>", file=self.output_file)
 		if self.output_file:
 			self.output_file.flush()
 
 		# Found links
 		links = self.linkregex.findall(msg)
 		for link in links:
 			link = link.decode("utf-8")
-			#logging.debug("Found : {0}".format(link))
+			logging.debug("Found : {0}".format(link))
 			if link.startswith('/'):
 				link = 'http://' + url[1] + link
 			elif link.startswith('#'):
 				link = 'http://' + url[1] + url[2] + link
 			elif not link.startswith('http'):
 				link = 'http://' + url[1] + '/' + link
-
+
 			# Remove the anchor part if needed
 			if "#" in link:
 				link = link[:link.index('#')]
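The <lastmod> value is taken straight from the response's Last-Modified (or Date) header and re-formatted for the sitemap. The two format strings from the diff can be checked in isolation (the header value below is a made-up example):

    from datetime import datetime

    http_date = "Tue, 01 Oct 2013 14:30:00 GMT"  # example Last-Modified header
    date = datetime.strptime(http_date, '%a, %d %b %Y %H:%M:%S %Z')
    print(date.strftime('%Y-%m-%dT%H:%M:%S+00:00'))
    # 2013-10-01T14:30:00+00:00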
@@ -149,7 +186,7 @@ def __crawling(self):
 				continue
 			if ("javascript" in link):
 				continue
-
+
 			# Count one more URL
 			self.nb_url+=1
 
@@ -173,7 +210,7 @@ def __crawling(self):
 
 			self.tocrawl.add(link)
 
-		return self.__continue_crawling()
+		return None
 
 	def __continue_crawling(self):
 		if self.tocrawl:
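Together with the new while loop in run(), returning None here replaces the old pattern of each page recursively calling __continue_crawling(); on a large site that recursion could add one stack frame per crawled URL and eventually hit Python's recursion limit. A quick way to see how low that ceiling is:

    import sys

    # Default recursion limit; the old one-frame-per-page recursion could reach
    # this after roughly a thousand crawled URLs.
    print(sys.getrecursionlimit())  # typically 1000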
@@ -183,12 +220,10 @@ def exclude_link(self,link):
 		if link not in self.excluded:
 			self.excluded.add(link)
 
-	def checkRobots(self):
-		if self.domain[len(self.domain)-1] != "/":
-			self.domain += "/"
-		request = Request(self.domain+"robots.txt", headers={"User-Agent":config.crawler_user_agent})
+	def check_robots(self):
+		robots_url = urljoin(self.domain, "robots.txt")
 		self.rp = RobotFileParser()
-		self.rp.set_url(self.domain+"robots.txt")
+		self.rp.set_url(robots_url)
 		self.rp.read()
 
 	def can_fetch(self, link):
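Renamed to check_robots and simplified: urljoin takes care of the trailing slash the old code added by hand. Two calls showing that behaviour (the domain values are illustrative):

    from urllib.parse import urljoin

    print(urljoin("http://blog.lesite.us", "robots.txt"))   # http://blog.lesite.us/robots.txt
    print(urljoin("http://blog.lesite.us/", "robots.txt"))  # http://blog.lesite.us/robots.txt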
@@ -224,4 +259,9 @@ def make_report(self):
 		print ("Number of link exclude : {0}".format(self.nb_exclude))
 
 		for code in self.response_code:
-			print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))
+			print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))
+
+		for code in self.marked:
+			print ("Link with status {0}:".format(code))
+			for uri in self.marked[code]:
+				print ("\t- {0}".format(uri))

main.py

Lines changed: 3 additions & 2 deletions
@@ -6,11 +6,12 @@
 import crawler
 
 # Gestion des parametres
-parser = argparse.ArgumentParser(version="0.1",description='Crawler pour la creation de site map')
+parser = argparse.ArgumentParser(description='Crawler pour la creation de site map')
 
 parser.add_argument('--skipext', action="append", default=[], required=False, help="File extension to skip")
 parser.add_argument('--parserobots', action="store_true", default=False, required=False, help="Ignore file defined in robots.txt")
 parser.add_argument('--debug', action="store_true", default=False, help="Enable debug mode")
+parser.add_argument('-v', '--verbose', action="store_true", help="Enable verbose output")
 parser.add_argument('--output', action="store", default=None, help="Output file")
 parser.add_argument('--exclude', action="append", default=[], required=False, help="Exclude Url if contain")
 parser.add_argument('--drop', action="append", default=[], required=False, help="Drop a string from the url")
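Dropping version= from the ArgumentParser call matches Python 3's argparse, where that constructor keyword was deprecated and later removed. If the version string is still wanted, the documented replacement is a dedicated argument (the "0.1" shown is the value removed from this file):

    import argparse

    parser = argparse.ArgumentParser(description='Crawler pour la creation de site map')
    # Replacement for the removed version= constructor keyword.
    parser.add_argument('--version', action='version', version='0.1')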
@@ -51,4 +52,4 @@
 	crawl.run()
 
 if arg.report:
-	crawl.make_report()
+	crawl.make_report()
