Skip to content

Commit

Permalink
Merge branch 'master' into patch-1
Browse files Browse the repository at this point in the history
  • Loading branch information
binux authored Mar 9, 2019
2 parents e0b07ef + 8178a97 commit 4d49e4d
Show file tree
Hide file tree
Showing 8 changed files with 439 additions and 27 deletions.
28 changes: 13 additions & 15 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,25 @@ sudo: required
language: python
cache: pip
python:
- "2.7"
#- "3.3" travis-ci uses lxml-4.3.1, which doesn't support python 3.3
- "3.4"
- "3.5"
- "3.6"
#- "3.7" not supported by travis-ci
- 3.3
- 3.4
- 3.5
- 3.6
matrix:
allow_failures:
- python: 2.7
- python: 3.7
dist: xenial
services:
- docker
- mongodb
- rabbitmq
- redis-server
- mysql
#- elasticsearch
- postgresql
addons:
postgresql: "9.4"
apt:
packages:
- mysql-server-5.6
- mysql-client-core-5.6
- mysql-client-5.6
before_install:
- sudo apt-get update -qq
- sudo apt-get install -y beanstalkd
Expand All @@ -36,11 +35,10 @@ before_script:
- psql -c "CREATE DATABASE pyspider_test_resultdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
- sleep 10
install:
- pip install mysql-connector-python
- pip install https://github.com/marcus67/easywebdav/archive/master.zip

- if [[ $TRAVIS_PYTHON_VERSION != '3.5' ]]; then pip install --no-use-wheel lxml; else pip install lxml; fi
- if [[ $TRAVIS_PYTHON_VERSION != '3.5' ]]; then pip install --allow-all-external -e .[all,test]; else pip install -e .[all,test]; fi
- if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then sudo apt-get install libc6; fi
- if [[ $TRAVIS_PYTHON_VERSION == '3.7' ]]; then sudo apt-get install libgnutls28-dev; fi
- pip install -e .[all,test]
- pip install coveralls
script:
- coverage run setup.py test
Expand Down
15 changes: 13 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,18 @@ RUN mkdir -p /opt/phantomjs \
&& ln -s /opt/phantomjs/bin/phantomjs /usr/local/bin/phantomjs \
&& rm phantomjs.tar.bz2

# install nodejs
ENV NODEJS_VERSION=8.15.0 \
PATH=$PATH:/opt/node/bin

WORKDIR "/opt/node"

RUN apt-get -qq update && apt-get -qq install -y curl ca-certificates libx11-xcb1 libxtst6 libnss3 libasound2 libatk-bridge2.0-0 libgtk-3-0 --no-install-recommends && \
curl -sL https://nodejs.org/dist/v${NODEJS_VERSION}/node-v${NODEJS_VERSION}-linux-x64.tar.gz | tar xz --strip-components=1 && \
rm -rf /var/lib/apt/lists/*

# install requirements
RUN pip install --egg 'https://dev.mysql.com/get/Downloads/Connector-Python/mysql-connector-python-2.1.5.zip#md5=ce4a24cb1746c1c8f6189a97087f21c1'
RUN pip install 'https://dev.mysql.com/get/Downloads/Connector-Python/mysql-connector-python-2.1.5.zip#md5=ce4a24cb1746c1c8f6189a97087f21c1'
COPY requirements.txt /opt/pyspider/requirements.txt
RUN pip install -r /opt/pyspider/requirements.txt

Expand All @@ -22,7 +31,9 @@ ADD ./ /opt/pyspider
WORKDIR /opt/pyspider
RUN pip install -e .[all]

RUN npm i puppeteer express

VOLUME ["/opt/pyspider"]
ENTRYPOINT ["pyspider"]

EXPOSE 5000 23333 24444 25555
EXPOSE 5000 23333 24444 25555 22222
223 changes: 223 additions & 0 deletions pyspider/fetcher/puppeteer_fetcher.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
const express = require("express");
const puppeteer = require('puppeteer');
const bodyParser = require('body-parser');

// Module state used by the browser-bootstrap middleware below:
// `init_browser` is true until a browser has been launched, and
// `browser_settings` collects the options handed to puppeteer.launch().
let browser_settings = {};
let init_browser = true;

// HTTP application. Request bodies are parsed as JSON first, then as
// URL-encoded form data.
const app = express();

const json_body_parser = bodyParser.json();
const form_body_parser = bodyParser.urlencoded({ extended: false });

app.use(json_body_parser);
app.use(form_body_parser);

// Launch currently in flight (if any). Every request that arrives before the
// browser is ready waits on this one promise, so concurrent first requests
// never start more than one browser.
let browser_launching = null;

/**
 * Fill `browser_settings` from the options of the request that triggers the
 * launch.
 *
 * @param {Object} options - parsed request body
 * @param {string} [options.proxy] - proxy address; "http://" is assumed when
 *     no scheme is given
 * @param {string|boolean} [options.headless] - "false" (any case) or boolean
 *     false starts a visible browser; anything else runs headless
 * @returns {Object} the shared `browser_settings` object
 */
function make_browser_settings(options) {
    const args = ['--no-sandbox', "--disable-setuid-sandbox"];
    if (options.proxy) {
        const proxy = String(options.proxy);
        const has_scheme = proxy.indexOf("://") !== -1;
        args.push("--proxy-server=" + (has_scheme ? proxy : "http://" + proxy));
    }
    browser_settings["args"] = args;
    // JSON bodies carry real booleans, form bodies carry strings; accept both.
    browser_settings["headless"] = String(options.headless).toLowerCase() !== "false";
    return browser_settings;
}

/**
 * Launch the browser and publish it for the rest of the module.
 *
 * @param {Object} options - parsed body of the triggering request
 * @returns {Promise<void>} settles once the browser is usable
 */
async function launch_browser(options) {
    // fetch() reads `browser` as a global, so publish it explicitly instead
    // of relying on an undeclared assignment.
    global.browser = await puppeteer.launch(make_browser_settings(options));
    init_browser = false;
    console.log("init browser success!");
}

// Lazily start the browser on the first request; later requests pass through.
app.use(async (req, res, next) => {
    if (!init_browser) {
        next();
        return;
    }

    if (browser_launching === null) {
        browser_launching = launch_browser(req.body || {});
    }
    const launching = browser_launching;

    try {
        await launching;
    } catch (error) {
        // Forget the failed attempt (unless a newer one has already replaced
        // it) so that a later request can retry the launch.
        if (browser_launching === launching) {
            browser_launching = null;
        }
        console.log("init browser failed ", error);
        // A rejection inside an async middleware is not reported to the
        // client on its own; hand it to the error handler explicitly.
        next(error);
        return;
    }
    next();
});


/**
 * Render one task in a fresh browser page and build its result.
 *
 * Stamps `options.start_time`, runs `_fetch`, and converts the outcome
 * (success or error) into a result via `make_result`. The page is closed on
 * every path, including when building the error result itself fails.
 *
 * @param {Object} options - task description taken from the request body
 * @returns {Promise<Object>} the object produced by `make_result`
 */
async function fetch(options) {
    const page = await browser.newPage();
    options.start_time = Date.now();
    try {
        await _fetch(page, options);
        return await make_result(page, options);
    } catch (error) {
        console.log('catch error ', error);
        return await make_result(page, options, error);
    } finally {
        // Closing is cleanup only: a failure here must neither leak through
        // as the task's outcome nor discard a result that was already built.
        try {
            await page.close();
        } catch (close_error) {
            console.log('close page error ', close_error);
        }
    }
}

/**
 * Drive `page` through one task: configure it, navigate, optionally run a
 * script and take a screenshot.
 *
 * Outputs are written back onto `options`:
 *   - `options.response`      - value returned by `page.goto`
 *   - `options.script_result` - value returned by `page.evaluate`, only when
 *                               `options.js_script` is set
 *
 * @param {Object} page - puppeteer page to drive
 * @param {Object} options - task description
 * @param {string} options.url - address to open
 * @param {number} [options.js_viewport_width=1024]
 * @param {number} [options.js_viewport_height=2304]
 * @param {Object} [options.headers] - extra HTTP headers; a "User-Agent"
 *     entry is also applied as the page's user agent
 * @param {string} [options.method] - "post" (any case) turns the first
 *     intercepted request into a POST carrying `options.data`
 * @param {string|boolean} [options.load_images] - "false" (any case) or
 *     boolean false aborts image requests
 * @param {number} [options.timeout=20] - navigation timeout in seconds
 * @param {string} [options.js_script] - passed to `page.evaluate` after load
 * @param {string} [options.screenshot_path] - where to save a screenshot
 * @throws whatever the page reported through its "error" event during
 *     navigation, or anything thrown by the puppeteer calls
 */
async function _fetch(page, options) {
    const width = options.js_viewport_width || 1024;
    const height = options.js_viewport_height || 768 * 3;
    await page.setViewport({
        "width": width,
        "height": height
    });

    if (options.headers) {
        await page.setExtraHTTPHeaders(options.headers);
        if (options.headers["User-Agent"]) {
            // Awaited so the user agent is in place before navigation starts.
            await page.setUserAgent(options.headers["User-Agent"]);
        }
    }

    page.on("console", msg => {
        console.log('console: ' + msg.args());
    });

    // With interception enabled every request must be continued or aborted
    // exactly once; a single listener decides which.
    await page.setRequestInterception(true);

    const is_post = Boolean(options.method) &&
        String(options.method).toLowerCase() === "post";
    // JSON bodies carry real booleans, form bodies carry strings; accept both.
    const block_images = String(options.load_images).toLowerCase() === "false";
    let first_request = true;

    page.on("request", request => {
        // Http post method: only the first request is rewritten.
        if (is_post && first_request) {
            first_request = false;
            const data = {
                "method": "POST",
                "postData": options.data
            };
            console.log(data);
            request.continue(data);
            return;
        }

        // load images or not
        if (block_images && request.resourceType() === 'image') {
            request.abort();
        } else {
            request.continue();
        }
    });

    let error_message = null;
    page.on("error", e => {
        error_message = e;
    });

    const page_timeout = options.timeout ? options.timeout * 1000 : 20 * 1000;
    const page_settings = {
        "timeout": page_timeout,
        "waitUntil": ["domcontentloaded", "networkidle0"]
    };

    console.log('goto ', options.url);
    const response = await page.goto(options.url, page_settings);

    if (error_message) {
        throw error_message;
    }

    if (options.js_script) {
        console.log('running document-end script.');
        const script_result = await page.evaluate(options.js_script);
        console.log("end script_result is: ", script_result);
        options.script_result = script_result;
    }

    if (options.screenshot_path) {
        await page.screenshot({path: options.screenshot_path});
    }

    options.response = response;
}

/**
 * Build the result object for a finished task.
 *
 * @param {Object} page - puppeteer page the task ran in
 * @param {Object} options - task description; reads `url`, `response`,
 *     `start_time`, `script_result` and `save`
 * @param {*} [error] - failure raised while fetching; when present, status,
 *     headers and content are not read from the page
 * @returns {Promise<Object>} result with `orig_url`, `status_code` (599 when
 *     no status is available), `error`, `content`, `headers`, `url`,
 *     `cookies` (name -> value), `time` (seconds since `options.start_time`),
 *     `js_script_result` and `save`
 */
async function make_result(page, options, error) {
    const cookies = {};
    const tmp_cookies = await page.cookies();
    tmp_cookies.forEach(function (e) {
        cookies[e.name] = e.value;
    });

    let status_code = null;
    let headers = null;
    let page_content = null;

    if (!error) {
        // A navigation can succeed without yielding a response object; in
        // that case there is no status or headers, but the rendered content
        // is still worth returning.
        const response = options.response;
        if (response) {
            status_code = response.status();
            headers = response.headers();
        }
        page_content = await page.content();
    }

    return {
        orig_url: options.url,
        status_code: status_code || 599,
        // NOTE(review): an Error instance serializes to `{}` as JSON, so the
        // message is probably lost on the wire - confirm what the consumer
        // expects before changing the shape.
        error: error,
        content: page_content,
        headers: headers,
        url: page.url(),
        cookies: cookies,
        time: (Date.now() - options.start_time) / 1000,
        js_script_result: options.script_result,
        save: options.save
    };
}

// GET is not part of the fetcher protocol: answer with a fixed refusal.
app.get("/", function (request, response) {
    // Declared locally; an undeclared `body` would be shared with every
    // other handler that uses the same name.
    const body = "method not allowed!";
    response.status(403);
    response.set({
        "cache": "no-cache",
        "Content-Length": Buffer.byteLength(body)
    });
    response.send(body);
});



// Upper bound on tasks rendered at the same time, and the current count.
let max_open_pages = 5;
let opened_page_nums = 0;

// Render one task per POST; refuse new work while the browser is saturated.
app.post("/", async (request, response) => {
    console.log("opened pages: " + opened_page_nums);

    if (opened_page_nums >= max_open_pages) {
        const body = "browser pages is too many, open new browser process!";
        response.status(403);
        response.set({
            "cache": "no-cache",
            "Content-Length": Buffer.byteLength(body)
        });
        response.send(body);
        return;
    }

    opened_page_nums += 1;
    let result = null;
    let failure = null;
    try {
        result = await fetch(request.body);
    } catch (error) {
        failure = error;
    } finally {
        // Always give the slot back; otherwise a handful of failed fetches
        // would leave the counter stuck at the limit forever.
        opened_page_nums -= 1;
    }

    if (failure !== null) {
        // Without an explicit answer the client would wait until its own
        // timeout, because nothing else responds for a rejected handler.
        console.log("fetch failed ", failure);
        response.status(500).send(String(failure));
        return;
    }
    response.send(result);
});


// Default port; a single command-line argument overrides it.
let port = 22222;

if (process.argv.length === 3) {
    // Explicit radix so the argument is always read as a decimal number.
    port = parseInt(process.argv[2], 10);
    // Reject NaN and out-of-range values here, with the offending argument
    // in the message, instead of failing later inside listen().
    if (!(port >= 0 && port < 65536)) {
        throw new Error("invalid port: " + process.argv[2]);
    }
}

app.listen(port, function () {
    console.log("puppeteer fetcher running on port " + port);
});
Loading

0 comments on commit 4d49e4d

Please sign in to comment.