Skip to content

Commit 848ce4a

Browse files
committed
Restructured the way the request is made very slightly to add a User-Agent
1 parent 7f8095b commit 848ce4a

File tree

1 file changed

+7
-3
lines changed

1 file changed

+7
-3
lines changed

PyCrawler.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
# Connect to the db and create the tables if they don't already exist
3333
connection = sqlite.connect(dbname)
3434
cursor = connection.cursor()
35-
cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (id INTEGER, parent INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )')
35+
cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (crawlid INTEGER, parentid INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )')
3636
cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256))')
3737
cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
3838
connection.commit()
@@ -69,7 +69,7 @@ def run(self):
6969

7070
# if theres nothing in the que, then set the status to done and exit
7171
if crawling == None:
72-
cursor.execute("INSERT INTO status VALUES ((?), (?))", (0, "datetime('now')"))
72+
cursor.execute("INSERT INTO status VALUES ((?), datetime('now'))", (0,))
7373
connection.commit()
7474
sys.exit("Done!")
7575
# Crawl the link
@@ -95,7 +95,11 @@ def crawl(self, crawling):
9595
del crawled[:]
9696
try:
9797
# Load the link
98-
response = urllib2.urlopen(curl)
98+
request = urllib2.Request(curl)
99+
request.add_header("User-Agent", "PyCrawler")
100+
opener = urllib2.build_opener()
101+
response = opener.open(request).read()
102+
99103
except:
100104
# If it doesn't load, skip this url
101105
return

0 commit comments

Comments
 (0)