Skip to content

Commit

Permalink
add phantomjs to run.py
Browse files Browse the repository at this point in the history
add logs for phantomjs_fetcher
update README
fix timeout error message for tornado fetcher proxy to phantomjs fetcher
run test exit with right exit code when test is failed
  • Loading branch information
binux committed Oct 31, 2014
1 parent 6715bf2 commit 7a82e3d
Show file tree
Hide file tree
Showing 7 changed files with 81 additions and 27 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ MAINTAINER binux <roy@binux.me>
# install python
RUN apt-get update && \
apt-get install -y python python-dev python-distribute python-pip && \
apt-get install -y libcurl4-openssl-dev libxml2-dev libxslt1-dev
apt-get install -y libcurl4-openssl-dev libxml2-dev libxslt1-dev python-lxml

# install requirements
ADD requirements.txt /opt/pyspider/requirements.txt
Expand Down
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
pyspider [![Build Status](https://travis-ci.org/binux/pyspider.png?branch=master)](https://travis-ci.org/binux/pyspider)
========

a spider in python! [Try It Now!](http://demo.pyspider.org/)
A spider system in Python. [Try It Now!](http://demo.pyspider.org/)

- Write scripts in Python
- Web script editor, debugger, task monitor, project manager and result viewer
- Distributed architecture
- MySQL, MongoDB and SQLite as database backend
- Full control of crawl process with powerful API
- JavaScript-rendered pages are supported (with the phantomjs fetcher)


![debug demo](http://f.binux.me/debug_demo.png)
demo code: [gist:9424801](https://gist.github.com/binux/9424801)
Expand Down
38 changes: 23 additions & 15 deletions fetcher/phantomjs_fetcher.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
// Created on 2014-10-29 22:12:14

var port, server, service,
wait_before_end = 300,
system = require('system'),
webpage = require('webpage');

Expand All @@ -26,7 +27,7 @@ if (system.args.length !== 2) {
}

var fetch = JSON.parse(request.postRaw);
console.log(JSON.stringify(fetch, null, 2));
console.debug(JSON.stringify(fetch, null, 2));

// create and set page
var page = webpage.create();
Expand Down Expand Up @@ -55,31 +56,37 @@ if (system.args.length !== 2) {
};
page.onLoadFinished = function(status) {
page_loaded = true;
if (status !== "success") {
return;
}
if (fetch.js_script && fetch.js_run_at !== "document-start") {
page.evaluateJavaScript(fetch.js_script);
}
end_time = Date.now() + 300;
setTimeout(make_result, 310, page);
console.debug("waiting "+wait_before_end+"ms before finished.");
end_time = Date.now() + wait_before_end;
setTimeout(make_result, wait_before_end+10, page);
};
page.onResourceRequested = function() {
page.onResourceRequested = function(request) {
console.debug("Starting request: #"+request.id+" ["+request.method+"]"+request.url);
end_time = null;
};
page.onResourceReceived = function(response) {
console.debug("Request finished: #"+response.id+" ["+response.statusText+"]"+response.url+" "+response.time+"ms");
if (first_response === null) {
first_response = response;
}
if (page_loaded) {
end_time = Date.now() + 300;
setTimeout(make_result, 310, page);
console.debug("waiting "+wait_before_end+"ms before finished.");
end_time = Date.now() + wait_before_end;
setTimeout(make_result, wait_before_end+10, page);
}
}
page.onResourceError=page.onResourceTimeout=function() {
page.onResourceError=page.onResourceTimeout=function(response) {
console.info("Request error: #"+response.id+" ["+response.errorCode+"="+response.errorString+"]"+response.url);
if (first_response === null) {
first_response = response;
}
if (page_loaded) {
end_time = Date.now() + 300;
setTimeout(make_result, 310, page);
console.debug("waiting "+wait_before_end+"ms before finished.");
end_time = Date.now() + wait_before_end;
setTimeout(make_result, wait_before_end+10, page);
}
}

Expand All @@ -106,14 +113,15 @@ if (system.args.length !== 2) {

var result = {
orig_url: fetch.url,
content: page.content,
headers: first_response.headers,
status_code: first_response.status,
content: first_response.errorString || page.content,
headers: first_response.headers || {},
status_code: first_response.status || 599,
url: page.url,
cookies: cookies,
time: (end_time - start_time) / 1000,
save: fetch.save
}
console.log("["+result.status_code+"] "+result.orig_url+" "+result.time)

var body = JSON.stringify(result, null, 2);
response.statusCode = 200;
Expand Down
28 changes: 19 additions & 9 deletions fetcher/tornado_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def fetch(self, task, callback=None):
callback = self.send_result
if url.startswith('data:'):
return self.data_fetch(url, task, callback)
elif task.get('fetch', {}).get('fetch_type') == 'phantomjs':
elif task.get('fetch', {}).get('fetch_type') in ('js', 'phantomjs'):
return self.phantomjs_fetch(url, task, callback)
else:
return self.http_fetch(url, task, callback)
Expand Down Expand Up @@ -283,20 +283,30 @@ def phantomjs_fetch(self, url, task, callback):

start_time = time.time()
def handle_response(response):
try:
return task, json.loads(response.body)
except Exception as e:
if not response:
result = {
'status_code': 599,
'content': "%r" % e,
'content': "timeout error",
'time': time.time() - start_time,
'orig_url': url,
'url': url,
}
logger.exception("[599] %s, %r %.2fs", url, e, result['time'])
callback('phantomjs', task, result)
self.on_result('phantomjs', task, result)
return task, result
else:
try:
return task, json.loads(response.body)
except Exception as e:
result = {
'status_code': 599,
'content': "%r" % e,
'time': time.time() - start_time,
'orig_url': url,
'url': url,
}
logger.exception("[599] %s, %r %.2fs",
url, result['content'], result['time'])
callback('phantomjs', task, result)
self.on_result('phantomjs', task, result)
return task, result

try:
request = tornado.httpclient.HTTPRequest(
Expand Down
5 changes: 5 additions & 0 deletions libs/base_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,11 @@ def crawl(self, url, **kwargs):
etag
last_modifed
fetch_type
js_run_at
js_script
load_images
priority
retries
exetime
Expand Down
20 changes: 20 additions & 0 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def __get__(self, instance, owner):
class g(object):
scheduler_xmlrpc_port = int(os.environ.get('SCHEDULER_XMLRPC_PORT', 23333))
fetcher_xmlrpc_port = int(os.environ.get('FETCHER_XMLRPC_PORT', 24444))
phantomjs_proxy_port = int(os.environ.get('PHANTOMJS_PROXY_PORT', 25555))
webui_host = os.environ.get('WEBUI_HOST', '0.0.0.0')
webui_port = int(os.environ.get('WEBUI_PORT', 5000))
debug = bool(os.environ.get('DEBUG'))
Expand Down Expand Up @@ -99,6 +100,15 @@ class g(object):
else:
scheduler_rpc = None

# phantomjs_proxy
if os.environ.get('PHANTOMJS_NAME'):
phantomjs_proxy = "%s:%s" % (
os.environ['PHANTOMJS_PORT_%d_TCP_ADDR' % phantomjs_proxy_port],
os.environ['PHANTOMJS_PORT_%d_TCP_PORT' % phantomjs_proxy_port]
)
else:
phantomjs_proxy = None

# run commands ------------------------------------------
def run_scheduler(g=g):
from scheduler import Scheduler
Expand All @@ -114,6 +124,7 @@ def run_scheduler(g=g):
def run_fetcher(g=g):
from fetcher.tornado_fetcher import Fetcher
fetcher = Fetcher(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor)
fetcher.phantomjs_proxy = g.phantomjs_proxy

run_in_thread(fetcher.xmlrpc_run, port=g.fetcher_xmlrpc_port, bind=g.webui_host)
fetcher.run()
Expand All @@ -135,10 +146,15 @@ def run_result_worker(g=g):
def run_webui(g=g):
import cPickle as pickle

from fetcher.tornado_fetcher import Fetcher
fetcher = Fetcher(inqueue=None, outqueue=None, async=False)
fetcher.phantomjs_proxy = g.phantomjs_proxy

from webui.app import app
app.config['taskdb'] = g.taskdb
app.config['projectdb'] = g.projectdb
app.config['resultdb'] = g.resultdb
app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
app.config['scheduler_rpc'] = g.scheduler_rpc
#app.config['cdn'] = '//cdnjs.cloudflare.com/ajax/libs/'
if g.demo_mode:
Expand Down Expand Up @@ -174,6 +190,10 @@ def all_in_one():
each.join()

if __name__ == '__main__':
print "running with config:"
for key in dir(g):
print "%s=%r" % (key, getattr(g, key))

if len(sys.argv) < 2:
all_in_one()
else:
Expand Down
5 changes: 4 additions & 1 deletion runtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,7 @@
glob = sys.argv[1]

suite = unittest.TestLoader().discover('test', glob)
unittest.TextTestRunner(verbosity=1).run(suite)
result = unittest.TextTestRunner(verbosity=1).run(suite)
if result.errors or result.failures:
sys.exit(1)
sys.exit(0)

0 comments on commit 7a82e3d

Please sign in to comment.