-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Simplify website crawler using the generic HTTPCase opener
- Loading branch information
1 parent
f9e24e1
commit 78e044b
Showing
7 changed files
with
123 additions
and
197 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
# -*- coding: utf-8 -*- | ||
import test_converter | ||
import test_requests | ||
import test_crawl | ||
import test_ui | ||
import test_views |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
# -*- coding: utf-8 -*- | ||
import logging | ||
import urlparse | ||
import unittest2 | ||
import urllib2 | ||
import time | ||
import werkzeug.urls | ||
|
||
import lxml.html | ||
|
||
import openerp | ||
from openerp import tools | ||
|
||
_logger = logging.getLogger(__name__) | ||
|
||
class Crawler(openerp.tests.HttpCase):
    """ Test suite crawling an openerp CMS instance and checking that all
    internal links lead to a successful (2xx) response.

    Each test starts at ``/`` and follows every internal ``<a href>`` found
    in the HTML pages it fetches. If a login and a password are provided,
    the user is authenticated before the crawl starts.
    """

    at_install = False
    post_install = True

    def crawl(self, url, seen=None, msg=''):
        """ Fetch ``url``, assert that it answers with a 2xx status and
        recursively crawl the internal links of the returned HTML page.

        :param url: URL to fetch; passed as-is to ``self.url_open``
        :param seen: set of URLs already visited, shared (and mutated)
                     across the recursive calls; a new set is created when
                     omitted
        :param msg: label prepended to log lines and assertion messages
        :returns: the set of every URL visited so far
        :raises AssertionError: (through ``assertIn``) when a fetched URL
                                returns a status outside of 200-299
        """
        # identity test: None is a singleton, and an empty set passed by the
        # caller must be kept (and filled), not replaced
        if seen is None:
            seen = set()
        if url in seen:
            return seen
        seen.add(url)

        _logger.info("%s %s", msg, url)
        r = self.url_open(url)
        code = r.getcode()
        self.assertIn(code, xrange(200, 300), "%s Fetching %s returned error response (%d)" % (msg, url, code))

        # only HTML documents are parsed for further links
        if r.info().gettype() == 'text/html':
            doc = lxml.html.fromstring(r.read())
            for link in doc.xpath('//a[@href]'):
                href = link.get('href')

                parts = urlparse.urlsplit(href)
                # href with any fragment removed
                href = urlparse.urlunsplit((
                    parts.scheme,
                    parts.netloc,
                    parts.path,
                    parts.query,
                    ''
                ))

                # Skip anything which is not an absolute path on the current
                # host: links with a netloc, relative paths, non-http(s)
                # schemes, as well as the /web and /en_US/ subtrees.
                # FIXME: handle relative link (not parts.path.startswith /)
                if parts.netloc \
                        or not parts.path.startswith('/') \
                        or parts.path == '/web' \
                        or parts.path.startswith(('/web/', '/en_US/')) \
                        or (parts.scheme and parts.scheme not in ('http', 'https')):
                    continue

                self.crawl(href, seen, msg)
        return seen

    def _crawl_as(self, label, msg, login=None, password=None):
        """ Crawl the site from ``/`` and log how many URLs were visited and
        how long it took (authentication time included).

        :param label: prefix of the summary log line
        :param msg: label forwarded to :meth:`crawl`
        :param login: when given, ``self.authenticate(login, password)`` is
                      called before crawling; no authentication otherwise
        :param password: password used along with ``login``
        """
        t0 = time.time()
        if login is not None:
            self.authenticate(login, password)
        seen = self.crawl('/', msg=msg)
        # 25 sits between logging.INFO (20) and logging.WARNING (30)
        _logger.log(25, "%s crawled %s urls in %.2fs", label, len(seen), time.time() - t0)

    def test_10_crawl_public(self):
        """ Crawl the site without authenticating. """
        self._crawl_as('public', 'Anonymous Coward')

    def test_20_crawl_demo(self):
        """ Crawl the site authenticated as the ``demo`` user. """
        self._crawl_as('demo', 'demo', login='demo', password='demo')

    def test_30_crawl_admin(self):
        """ Crawl the site authenticated as the ``admin`` user. """
        self._crawl_as('admin', 'admin', login='admin', password='admin')
|
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters