32
32
# Connect to the db and create the tables if they don't already exist
33
33
connection = sqlite .connect (dbname )
34
34
cursor = connection .cursor ()
35
- cursor .execute ('CREATE TABLE IF NOT EXISTS crawl_index (id INTEGER, parent INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )' )
35
+ cursor .execute ('CREATE TABLE IF NOT EXISTS crawl_index (crawlid INTEGER, parentid INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )' )
36
36
cursor .execute ('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256))' )
37
37
cursor .execute ('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )' )
38
38
connection .commit ()
@@ -69,7 +69,7 @@ def run(self):
69
69
70
70
# if there's nothing in the queue, then set the status to done and exit
71
71
if crawling == None :
72
- cursor .execute ("INSERT INTO status VALUES ((?), (? ))" , (0 , "datetime('now')" ))
72
+ cursor .execute ("INSERT INTO status VALUES ((?), datetime('now' ))" , (0 ,))
73
73
connection .commit ()
74
74
sys .exit ("Done!" )
75
75
# Crawl the link
@@ -95,7 +95,11 @@ def crawl(self, crawling):
95
95
del crawled [:]
96
96
try :
97
97
# Load the link
98
- response = urllib2 .urlopen (curl )
98
+ request = urllib2 .Request (curl )
99
+ request .add_header ("User-Agent" , "PyCrawler" )
100
+ opener = urllib2 .build_opener ()
101
+ response = opener .open (request ).read ()
102
+
99
103
except :
100
104
# If it doesn't load, skip this url
101
105
return
0 commit comments