Skip to content

Commit 2865af4

Browse files
committed
Merge pull request #10 from sibiryakov/sqlalchemy-decomposition
Page/Request instance creation decomposition in backends
2 parents 676a553 + 4818c91 commit 2865af4

File tree

2 files changed

+31
-15
lines changed

2 files changed

+31
-15
lines changed

crawlfrontier/contrib/backends/memory/__init__.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -48,9 +48,7 @@ def request_error(self, request, error):
4848
def _get_or_create_request(self, request):
4949
fingerprint = request.meta['fingerprint']
5050
if fingerprint not in self.requests:
51-
new_request = request.copy()
52-
new_request.meta['created_at'] = datetime.datetime.utcnow()
53-
new_request.meta['depth'] = 0
51+
new_request = self._create_request(request)
5452
self.requests[fingerprint] = new_request
5553
self.manager.logger.backend.debug('Creating request %s' % new_request)
5654
return new_request, True
@@ -59,6 +57,12 @@ def _get_or_create_request(self, request):
5957
self.manager.logger.backend.debug('Request exists %s' % request)
6058
return page, False
6159

60+
def _create_request(self, request):
61+
new_request = request.copy()
62+
new_request.meta['created_at'] = datetime.datetime.utcnow()
63+
new_request.meta['depth'] = 0
64+
return new_request
65+
6266
def _compare_pages(self, first, second):
6367
raise NotImplementedError
6468

crawlfrontier/contrib/backends/sqlalchemy/__init__.py

Lines changed: 24 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -122,7 +122,8 @@ def frontier_stop(self):
122122

123123
# Registers each seed through _get_or_create_db_page, keyed by the fingerprint
# read from seed.meta['fingerprint'], and commits the session.
# NOTE(review): indentation is lost in this view, so whether the commit runs
# once after the loop or once per seed cannot be told from here - confirm.
def add_seeds(self, seeds):
124124
for seed in seeds:
125-
db_page, _ = self._get_or_create_db_page(url=seed.url, fingerprint=seed.meta['fingerprint'])
125+
# Replacement call: the seed object itself is also handed over as request_or_response.
db_page, _ = self._get_or_create_db_page(url=seed.url, fingerprint=seed.meta['fingerprint'],
126+
request_or_response=seed)
126127
self.session.commit()
127128

128129
def get_next_requests(self, max_next_requests):
@@ -134,35 +135,34 @@ def get_next_requests(self, max_next_requests):
134135
next_pages = []
135136
for db_page in query:
136137
db_page.state = Page.State.QUEUED
137-
request = self.manager.request_model(url=db_page.url)
138+
request = self._create_request(db_page) # FIXME: we lose all the Request metadata here: methods, meta...
138139
next_pages.append(request)
139140
self.session.commit()
140141
return next_pages
141142

142143
# Records a crawled response: the page looked up (or created) for the response
# is set to state CRAWLED with the response's status code, every extracted link
# is looked up or created as a page, and the session is committed.
# NOTE(review): indentation is lost in this view; nesting of the statements
# below is inferred from their order only.
def page_crawled(self, response, links):
143-
db_page, _ = self._get_or_create_db_page(url=response.url, fingerprint=response.meta['fingerprint'])
144+
# Replacement call: the response itself is also handed over as request_or_response.
db_page, _ = self._get_or_create_db_page(url=response.url, fingerprint=response.meta['fingerprint'],
145+
request_or_response=response)
144146
db_page.state = Page.State.CRAWLED
145147
db_page.status_code = response.status_code
148+
# TODO: performance bottleneck on big volumes; the per-link operations below should be batched
146149
for link in links:
147-
db_page_from_link, created = self._get_or_create_db_page(url=link.url, fingerprint=link.meta['fingerprint'])
150+
# Replacement call: the link itself is also handed over as request_or_response.
db_page_from_link, created = self._get_or_create_db_page(url=link.url, fingerprint=link.meta['fingerprint'],
151+
request_or_response=link)
148152
# Only pages created by this call get a depth: one level below the crawled page.
if created:
149153
db_page_from_link.depth = db_page.depth+1
150154
self.session.commit()
151155

152156
# Records a failed request: the page looked up (or created) for the request is
# set to state ERROR, the given error is stored on it, and the session is committed.
def request_error(self, request, error):
153-
db_page, _ = self._get_or_create_db_page(url=request.url, fingerprint=request.meta['fingerprint'])
157+
# Replacement call: the request itself is also handed over as request_or_response.
db_page, _ = self._get_or_create_db_page(url=request.url, fingerprint=request.meta['fingerprint'],
158+
request_or_response=request)
154159
db_page.state = Page.State.ERROR
155160
db_page.error = error
156161
self.session.commit()
157162

158-
def _get_or_create_db_page(self, url, fingerprint):
163+
def _get_or_create_db_page(self, url, fingerprint, request_or_response):
159164
if not self._request_exists(fingerprint):
160-
db_request = self.page_model()
161-
db_request.fingerprint = fingerprint
162-
db_request.state = Page.State.NOT_CRAWLED
163-
db_request.url = url
164-
db_request.depth = 0
165-
db_request.created_at = datetime.datetime.utcnow()
165+
db_request = self._create_page(url, fingerprint, request_or_response)
166166
self.session.add(db_request)
167167
self.manager.logger.backend.debug('Creating request %s' % db_request)
168168
return db_request, True
@@ -171,6 +171,18 @@ def _get_or_create_db_page(self, url, fingerprint):
171171
self.manager.logger.backend.debug('Request exists %s' % db_request)
172172
return db_request, False
173173

174+
def _create_page(self, url, fingerprint, request_or_response):
    """Build a new, not-yet-persisted page record with its initial field values.

    The record is an instance of ``self.page_model`` with ``fingerprint`` and
    ``url`` taken from the arguments, ``state`` set to
    ``Page.State.NOT_CRAWLED``, ``depth`` set to ``0`` and ``created_at`` set
    to the current naive UTC time. The record is returned without being added
    to any session.

    ``request_or_response`` is not used by this implementation.
    NOTE(review): presumably it is passed so that overriding backends can copy
    extra fields from the originating object - confirm.
    """
    record = self.page_model()
    initial_values = (
        ('fingerprint', fingerprint),
        ('state', Page.State.NOT_CRAWLED),
        ('url', url),
        ('depth', 0),
        ('created_at', datetime.datetime.utcnow()),
    )
    for attribute, value in initial_values:
        setattr(record, attribute, value)
    return record
182+
183+
def _create_request(self, db_page):
184+
return self.manager.request_model(url=db_page.url)
185+
174186
def _request_exists(self, fingerprint):
175187
q = self.page_model.query(self.session).filter_by(fingerprint=fingerprint)
176188
return self.session.query(q.exists()).scalar()

0 commit comments

Comments
 (0)