
Commit f24e79b

Merge pull request theanti9#5 from sjparsons/status

Status Codes and URL limited

2 parents: 82ccb6b + ba0ecb9

2 files changed (+34 -10 lines)


PyCrawler.py

Lines changed: 30 additions & 8 deletions
@@ -1,3 +1,4 @@
+#!/usr/bin/python
 import sys
 import re
 import urllib2
@@ -13,7 +14,8 @@
 1) database file name
 2) start url
 3) crawl depth
-4) verbose (optional)
+4) domains to limit to, regex (optional)
+5) verbose (optional)
 Start out by checking to see if the args are there and
 set them to their variables
 """
@@ -23,12 +25,15 @@
 dbname = sys.argv[1]
 starturl = sys.argv[2]
 crawldepth = int(sys.argv[3])
-if len(sys.argv) == 5:
-    if (sys.argv[4].upper() == "TRUE"):
+if len(sys.argv) >= 5:
+    domains = sys.argv[4]
+if len(sys.argv) == 6:
+    if (sys.argv[5].upper() == "TRUE"):
         verbose = True
     else:
         verbose = False
 else:
+    domains = False
     verbose = False
 # urlparse the start url
 surlparsed = urlparse.urlparse(starturl)
@@ -37,7 +42,7 @@
 connection = sqlite.connect(dbname)
 cursor = connection.cursor()
 # crawl_index: holds all the information of the urls that have been crawled
-cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (crawlid INTEGER, parentid INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )')
+cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (crawlid INTEGER, parentid INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256), status INTEGER )')
 # queue: this should be obvious
 cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256))')
 # status: Contains a record of when crawling was started and stopped.
@@ -48,6 +53,10 @@
 # Compile keyword and link regex expressions
 keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
 linkregex = re.compile('<a.*\shref=[\'"](.*?)[\'"].*?>')
+if domains:
+    domainregex = re.compile(domains)
+else:
+    domainregex = False
 crawled = []

 # set crawling status and stick starting url into the queue
@@ -99,7 +108,7 @@ def run(self):
     Args:
         crawling: this should be a url

-    crawl() opens the page at the "crawling" url, parses it and puts it into the databes.
+    crawl() opens the page at the "crawling" url, parses it and puts it into the database.
     It looks for the page title, keywords, and links.
     """
     def crawl(self, crawling):
@@ -111,6 +120,8 @@ def crawl(self, crawling):
         curdepth = crawling[2]
         # crawling urL
         curl = crawling[3]
+        if domainregex and not domainregex.search(curl):
+            return
         # Split the link into its sections
         url = urlparse.urlparse(curl)

@@ -140,10 +151,21 @@ def crawl(self, crawling):
             request.add_header("User-Agent", "PyCrawler")
             # Build the url opener, open the link and read it into msg
             opener = urllib2.build_opener()
-            msg = opener.open(request).read()
+            f = opener.open(request)
+            msg = f.read()
+            # put meta data in info
+            info = f.info()
+

-        except:
+        except urllib2.URLError, e:
             # If it doesn't load, skip this url
+            #print e.code
+            try:
+                cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?), (?) )", (cid, pid, curl, '', '', e.code))
+                connection.commit
+            except:
+                pass
+
             return

         # Find what's between the title tags
@@ -169,7 +191,7 @@ def crawl(self, crawling):

         try:
             # Put now crawled link into the db
-            cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?) )", (cid, pid, curl, title, keywordlist))
+            cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?), (?) )", (cid, pid, curl, title, keywordlist, 200))
             connection.commit()
         except:
             pass
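
With this change, every row that crawl() writes to crawl_index carries an HTTP status: 200 for pages that were fetched and parsed, or the error code from urllib2 when the fetch fails (errors without a code fall through the inner except and are simply skipped). Note that the error branch references connection.commit without calling it, so error rows only become durable once connection.commit() runs later on the same connection. As a minimal, hypothetical sketch (not part of the commit) of how the resulting database could be inspected afterwards, assuming a database file named crawl.db and using the standard library sqlite3 module (PyCrawler's own sqlite import may differ):

import sqlite3

# Open the crawl database produced by PyCrawler (the file name is an assumption).
connection = sqlite3.connect("crawl.db")
cursor = connection.cursor()

# crawl_index now has a sixth column, "status": 200 for successful fetches,
# otherwise the HTTP error code recorded in the except branch above.
cursor.execute("SELECT url, status FROM crawl_index WHERE status != 200")
for url, status in cursor.fetchall():
    print "%s returned %s" % (url, status)

connection.close()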

README

Lines changed: 4 additions & 2 deletions
@@ -1,4 +1,4 @@
-PyCrawler is very simple to use. It takes 4 arguments:
+PyCrawler is very simple to use. It takes 5 arguments:

 1) database file name: The file that that will be used to store information as a sqlite database. If the filename given does not exist, it will be created.

@@ -7,4 +7,6 @@ PyCrawler is very simple to use. It takes 5 arguments:

 3) crawl depth: This should be the number of pages deep the crawler should follow from the starting url before backing out.

-4) verbose (optional): If you want PyCrawler to spit out the urls it is looking at, this should be "true" if it is missing, or has any other value, it will be ignored and considered false.
+4) url regex (optional): A regex to filter the URLs. If not set then all URLs will be be logged.
+
+5) verbose (optional): If you want PyCrawler to spit out the urls it is looking at, this should be "true" if it is missing, or has any other value, it will be ignored and considered false.
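
For illustration, a possible invocation using all five arguments described in the updated README (the database name, start URL, and regex below are placeholder values, not part of the commit):

python PyCrawler.py crawl.db http://www.example.com 3 "example\.com" true

URLs that do not match the fourth argument's regex are skipped by crawl() before they are fetched, and passing "true" as the fifth argument enables verbose output.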
