Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions crawlfrontier/contrib/backends/memory/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,7 @@ def request_error(self, request, error):
def _get_or_create_request(self, request):
fingerprint = request.meta['fingerprint']
if fingerprint not in self.requests:
new_request = request.copy()
new_request.meta['created_at'] = datetime.datetime.utcnow()
new_request.meta['depth'] = 0
new_request = self._create_request(request)
self.requests[fingerprint] = new_request
self.manager.logger.backend.debug('Creating request %s' % new_request)
return new_request, True
Expand All @@ -59,6 +57,12 @@ def _get_or_create_request(self, request):
self.manager.logger.backend.debug('Request exists %s' % request)
return page, False

def _create_request(self, request):
    """Build the record kept for a request that is being stored for the first time.

    The work is done on ``request.copy()``, not on the object passed in.
    The copy's ``meta`` mapping receives a ``created_at`` timestamp taken
    from ``datetime.datetime.utcnow()`` and a ``depth`` of 0.

    :param request: object exposing ``copy()``; the copy must expose a
        ``meta`` mapping that supports item assignment.
    :return: the stamped copy.
    """
    stored = request.copy()
    initial_meta = (
        ('created_at', datetime.datetime.utcnow()),
        ('depth', 0),
    )
    for key, value in initial_meta:
        stored.meta[key] = value
    return stored

def _compare_pages(self, first, second):
    """Placeholder with no implementation here; every call raises.

    :param first: not read by this implementation.
    :param second: not read by this implementation.
    :raises NotImplementedError: always.
    """
    raise NotImplementedError()

Expand Down
36 changes: 24 additions & 12 deletions crawlfrontier/contrib/backends/sqlalchemy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ def frontier_stop(self):

def add_seeds(self, seeds):
for seed in seeds:
db_page, _ = self._get_or_create_db_page(url=seed.url, fingerprint=seed.meta['fingerprint'])
db_page, _ = self._get_or_create_db_page(url=seed.url, fingerprint=seed.meta['fingerprint'],
request_or_response=seed)
self.session.commit()

def get_next_requests(self, max_next_requests):
Expand All @@ -134,35 +135,34 @@ def get_next_requests(self, max_next_requests):
next_pages = []
for db_page in query:
db_page.state = Page.State.QUEUED
request = self.manager.request_model(url=db_page.url)
request = self._create_request(db_page) # FIXME: we lose all the Request metadata here: methods, meta...
next_pages.append(request)
self.session.commit()
return next_pages

def page_crawled(self, response, links):
db_page, _ = self._get_or_create_db_page(url=response.url, fingerprint=response.meta['fingerprint'])
db_page, _ = self._get_or_create_db_page(url=response.url, fingerprint=response.meta['fingerprint'],
request_or_response=response)
db_page.state = Page.State.CRAWLED
db_page.status_code = response.status_code
# TODO: a performance bottleneck on big volumes; operations should be batched here
for link in links:
db_page_from_link, created = self._get_or_create_db_page(url=link.url, fingerprint=link.meta['fingerprint'])
db_page_from_link, created = self._get_or_create_db_page(url=link.url, fingerprint=link.meta['fingerprint'],
request_or_response=link)
if created:
db_page_from_link.depth = db_page.depth+1
self.session.commit()

def request_error(self, request, error):
db_page, _ = self._get_or_create_db_page(url=request.url, fingerprint=request.meta['fingerprint'])
db_page, _ = self._get_or_create_db_page(url=request.url, fingerprint=request.meta['fingerprint'],
request_or_response=request)
db_page.state = Page.State.ERROR
db_page.error = error
self.session.commit()

def _get_or_create_db_page(self, url, fingerprint):
def _get_or_create_db_page(self, url, fingerprint, request_or_response):
if not self._request_exists(fingerprint):
db_request = self.page_model()
db_request.fingerprint = fingerprint
db_request.state = Page.State.NOT_CRAWLED
db_request.url = url
db_request.depth = 0
db_request.created_at = datetime.datetime.utcnow()
db_request = self._create_page(url, fingerprint, request_or_response)
self.session.add(db_request)
self.manager.logger.backend.debug('Creating request %s' % db_request)
return db_request, True
Expand All @@ -171,6 +171,18 @@ def _get_or_create_db_page(self, url, fingerprint):
self.manager.logger.backend.debug('Request exists %s' % db_request)
return db_request, False

def _create_page(self, url, fingerprint, request_or_response):
    """Instantiate and initialise a new page row without persisting it.

    The row comes from ``self.page_model()`` and is filled with the given
    URL and fingerprint, the ``NOT_CRAWLED`` state, a depth of 0 and a
    ``created_at`` value from ``datetime.datetime.utcnow()``. Nothing is
    added to the session in this method.

    :param url: value stored in the row's ``url`` attribute.
    :param fingerprint: value stored in the row's ``fingerprint`` attribute.
    :param request_or_response: accepted but not read by this implementation.
    :return: the initialised page row.
    """
    new_page = self.page_model()
    new_page.url = url
    new_page.fingerprint = fingerprint
    new_page.depth = 0
    new_page.state = Page.State.NOT_CRAWLED
    new_page.created_at = datetime.datetime.utcnow()
    return new_page

def _create_request(self, db_page):
    """Turn a stored page row into a request object.

    Only ``db_page.url`` is passed to ``self.manager.request_model``; no
    other attribute of the row is forwarded.

    :param db_page: object exposing a ``url`` attribute.
    :return: whatever ``self.manager.request_model(url=...)`` returns.
    """
    request_factory = self.manager.request_model
    return request_factory(url=db_page.url)

def _request_exists(self, fingerprint):
    """Report whether a page row with this fingerprint is already stored.

    Builds the page-model query filtered by ``fingerprint``, wraps its
    ``exists()`` clause in a second session query and returns that query's
    scalar result.

    :param fingerprint: value matched against the ``fingerprint`` field.
    :return: the scalar result of the EXISTS query.
    """
    lookup = self.page_model.query(self.session).filter_by(fingerprint=fingerprint)
    outer = self.session.query(lookup.exists())
    return outer.scalar()
Expand Down