Skip to content

Commit

Permalink
update dep & docker switch to ubuntu
Browse files Browse the repository at this point in the history
  • Loading branch information
zhuyingda committed Nov 9, 2023
1 parent c35f628 commit ecba130
Show file tree
Hide file tree
Showing 8 changed files with 252 additions and 61 deletions.
56 changes: 38 additions & 18 deletions Dockerfile.ci
Original file line number Diff line number Diff line change
@@ -1,28 +1,48 @@
FROM centos:7.4.1708
FROM --platform=amd64 ubuntu:20.04
LABEL maintainer="Sugar yingdazhu@icloud.com"
ARG nodever="10.23.0"
RUN yum install pango.x86_64 libXcomposite.x86_64 libXcursor.x86_64 \
libXdamage.x86_64 libXext.x86_64 libXi.x86_64 libXtst.x86_64 \
cups-libs.x86_64 libXScrnSaver.x86_64 libXrandr.x86_64 GConf2.x86_64 \
alsa-lib.x86_64 atk.x86_64 gtk3.x86_64 -y
RUN yum install ipa-gothic-fonts xorg-x11-fonts-100dpi \
xorg-x11-fonts-75dpi xorg-x11-utils xorg-x11-fonts-cyrillic \
xorg-x11-fonts-Type1 xorg-x11-fonts-misc -y
RUN yum install wget -y
RUN rm -rf /var/cache/yum
ARG nodever="16.20.2"
RUN apt-get update && apt-get install -y \
curl \
gpg \
wget \
ca-certificates \
libx11-xcb1 \
libxcb1 \
libxcb-dri3-0 \
libxcomposite1 \
libxdamage1 \
libxi6 \
libxtst6 \
libnss3 \
libcups2 \
libxss1 \
libxrandr2 \
libasound2 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libpangocairo-1.0-0 \
libgtk-3-0 \
libgbm1 \
libxshmfence1 \
libgles2-mesa \
xvfb \
&& rm -rf /var/lib/apt/lists/*
RUN adduser work
RUN wget https://nodejs.org/dist/v${nodever}/node-v${nodever}-linux-x64.tar.xz \
-O /home/work/node-v${nodever}-linux-x64.tar.xz
RUN xz -d /home/work/node-v${nodever}-linux-x64.tar.xz
RUN tar -xvf /home/work/node-v${nodever}-linux-x64.tar -C /home/work/
RUN ln -s /home/work/node-v${nodever}-linux-x64/bin/node /usr/local/bin/node
RUN ln -s /home/work/node-v${nodever}-linux-x64/bin/npm /usr/local/bin/npm
RUN rm /home/work/node-v${nodever}-linux-x64.tar
RUN yum install git -y
-O /home/work/node-v${nodever}-linux-x64.tar.xz \
&& xz -d /home/work/node-v${nodever}-linux-x64.tar.xz \
&& tar -xvf /home/work/node-v${nodever}-linux-x64.tar -C /home/work/ \
&& mv /home/work/node-v${nodever}-linux-x64 /home/work/nodejs \
&& ln -s /home/work/nodejs/bin/node /usr/local/bin/node \
&& ln -s /home/work/nodejs/bin/npm /usr/local/bin/npm \
&& ln -s /home/work/nodejs/bin/npx /usr/local/bin/npx \
&& rm /home/work/node-v${nodever}-linux-x64.tar
RUN apt-get update && apt-get install -y git
USER work
RUN mkdir /home/work/test_env
WORKDIR /home/work/test_env
RUN git clone https://github.com/zhuyingda/webster.git
WORKDIR /home/work/test_env/webster
RUN npm install
RUN npx playwright install
RUN npm run test
56 changes: 38 additions & 18 deletions Dockerfile.demo
Original file line number Diff line number Diff line change
@@ -1,28 +1,48 @@
FROM centos:7.4.1708
FROM ubuntu:20.04
LABEL maintainer="Sugar yingdazhu@icloud.com"
ARG nodever="14.15.1"
RUN yum install pango.x86_64 libXcomposite.x86_64 libXcursor.x86_64 \
libXdamage.x86_64 libXext.x86_64 libXi.x86_64 libXtst.x86_64 \
cups-libs.x86_64 libXScrnSaver.x86_64 libXrandr.x86_64 GConf2.x86_64 \
alsa-lib.x86_64 atk.x86_64 gtk3.x86_64 -y
RUN yum install ipa-gothic-fonts xorg-x11-fonts-100dpi \
xorg-x11-fonts-75dpi xorg-x11-utils xorg-x11-fonts-cyrillic \
xorg-x11-fonts-Type1 xorg-x11-fonts-misc -y
RUN yum install wget -y
RUN rm -rf /var/cache/yum
ARG nodever="16.20.2"
RUN apt-get update && apt-get install -y \
curl \
gpg \
wget \
ca-certificates \
libx11-xcb1 \
libxcb1 \
libxcb-dri3-0 \
libxcomposite1 \
libxdamage1 \
libxi6 \
libxtst6 \
libnss3 \
libcups2 \
libxss1 \
libxrandr2 \
libasound2 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libpangocairo-1.0-0 \
libgtk-3-0 \
libgbm1 \
libxshmfence1 \
libgles2-mesa \
xvfb \
&& rm -rf /var/lib/apt/lists/*
RUN adduser work
RUN wget https://nodejs.org/dist/v${nodever}/node-v${nodever}-linux-x64.tar.xz \
-O /home/work/node-v${nodever}-linux-x64.tar.xz
RUN xz -d /home/work/node-v${nodever}-linux-x64.tar.xz
RUN tar -xvf /home/work/node-v${nodever}-linux-x64.tar -C /home/work/
RUN ln -s /home/work/node-v${nodever}-linux-x64/bin/node /usr/local/bin/node
RUN ln -s /home/work/node-v${nodever}-linux-x64/bin/npm /usr/local/bin/npm
RUN rm /home/work/node-v${nodever}-linux-x64.tar
-O /home/work/node-v${nodever}-linux-x64.tar.xz \
&& xz -d /home/work/node-v${nodever}-linux-x64.tar.xz \
&& tar -xvf /home/work/node-v${nodever}-linux-x64.tar -C /home/work/ \
&& mv /home/work/node-v${nodever}-linux-x64 /home/work/nodejs \
&& ln -s /home/work/nodejs/bin/node /usr/local/bin/node \
&& ln -s /home/work/nodejs/bin/npm /usr/local/bin/npm \
&& ln -s /home/work/nodejs/bin/npx /usr/local/bin/npx \
&& rm /home/work/node-v${nodever}-linux-x64.tar
USER work
RUN mkdir /home/work/webster_startup
WORKDIR /home/work/webster_startup
RUN npm init -y
RUN npm i --save webster@latest
RUN npx playwright install
COPY example/demo_consumer.js /home/work/webster_startup/
COPY example/demo_producer.js /home/work/webster_startup/
# CMD /home/work/node-v14.15.1-linux-x64/bin/node /home/work/webster_startup/demo_consumer.js
# CMD /home/work/nodejs/bin/node /home/work/webster_startup/demo_consumer.js
58 changes: 39 additions & 19 deletions Dockerfile.runtime
Original file line number Diff line number Diff line change
@@ -1,28 +1,48 @@
FROM centos:7.4.1708
FROM ubuntu:20.04
LABEL maintainer="Sugar yingdazhu@icloud.com"
ARG nodever="14.15.1"
RUN yum install pango.x86_64 libXcomposite.x86_64 libXcursor.x86_64 \
libXdamage.x86_64 libXext.x86_64 libXi.x86_64 libXtst.x86_64 \
cups-libs.x86_64 libXScrnSaver.x86_64 libXrandr.x86_64 GConf2.x86_64 \
alsa-lib.x86_64 atk.x86_64 gtk3.x86_64 -y
RUN yum install ipa-gothic-fonts xorg-x11-fonts-100dpi \
xorg-x11-fonts-75dpi xorg-x11-utils xorg-x11-fonts-cyrillic \
xorg-x11-fonts-Type1 xorg-x11-fonts-misc -y
RUN yum install wget -y
RUN rm -rf /var/cache/yum
ARG nodever="16.20.2"
RUN apt-get update && apt-get install -y \
curl \
gpg \
wget \
ca-certificates \
libx11-xcb1 \
libxcb1 \
libxcb-dri3-0 \
libxcomposite1 \
libxdamage1 \
libxi6 \
libxtst6 \
libnss3 \
libcups2 \
libxss1 \
libxrandr2 \
libasound2 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libpangocairo-1.0-0 \
libgtk-3-0 \
libgbm1 \
libxshmfence1 \
libgles2-mesa \
xvfb \
&& rm -rf /var/lib/apt/lists/*
RUN adduser work
RUN wget https://nodejs.org/dist/v${nodever}/node-v${nodever}-linux-x64.tar.xz \
-O /home/work/node-v${nodever}-linux-x64.tar.xz
RUN xz -d /home/work/node-v${nodever}-linux-x64.tar.xz
RUN tar -xvf /home/work/node-v${nodever}-linux-x64.tar -C /home/work/
RUN ln -s /home/work/node-v${nodever}-linux-x64/bin/node /usr/local/bin/node
RUN ln -s /home/work/node-v${nodever}-linux-x64/bin/npm /usr/local/bin/npm
RUN rm /home/work/node-v${nodever}-linux-x64.tar
-O /home/work/node-v${nodever}-linux-x64.tar.xz \
&& xz -d /home/work/node-v${nodever}-linux-x64.tar.xz \
&& tar -xvf /home/work/node-v${nodever}-linux-x64.tar -C /home/work/ \
&& mv /home/work/node-v${nodever}-linux-x64 /home/work/nodejs \
&& ln -s /home/work/nodejs/bin/node /usr/local/bin/node \
&& ln -s /home/work/nodejs/bin/npm /usr/local/bin/npm \
&& ln -s /home/work/nodejs/bin/npx /usr/local/bin/npx \
&& rm /home/work/node-v${nodever}-linux-x64.tar
USER work
RUN mkdir /home/work/webster_runtime
WORKDIR /home/work/webster_runtime
RUN mkdir /home/work/webster_startup
WORKDIR /home/work/webster_startup
RUN npm init -y
RUN npm i --save webster@latest
RUN npx playwright install

# put your own crawler code into image here
# COPY xxx /home/work/webster_runtime/
47 changes: 47 additions & 0 deletions Dockerfile.simple
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Playground image: Ubuntu 20.04 with the shared libraries a headless browser
# needs, a Node.js binary release, the `webster` package with Playwright's
# browsers, and the example crawler script.

# The Node.js archive downloaded below is the linux-x64 build, so the base
# image is pinned to amd64 on purpose.
FROM --platform=amd64 ubuntu:20.04
LABEL maintainer="ShadowWeaver"

# Node.js release fetched from nodejs.org; override with --build-arg nodever=<x.y.z>.
ARG nodever="16.20.2"

# System packages, one per line and sorted for easy diffing.
# - DEBIAN_FRONTEND is set inline (not via ENV) so package configuration can
#   never block the build on a prompt and the setting does not leak into the
#   runtime environment.
# - xz-utils is listed explicitly because the Node.js archive is .tar.xz.
# - The apt lists are removed in the same layer that created them.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    ca-certificates \
    curl \
    gpg \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libcups2 \
    libgbm1 \
    libgles2-mesa \
    libgtk-3-0 \
    libnss3 \
    libpangocairo-1.0-0 \
    libx11-xcb1 \
    libxcb-dri3-0 \
    libxcb1 \
    libxcomposite1 \
    libxdamage1 \
    libxi6 \
    libxrandr2 \
    libxshmfence1 \
    libxss1 \
    libxtst6 \
    wget \
    xvfb \
    xz-utils \
    && rm -rf /var/lib/apt/lists/*

# Unprivileged account that owns and runs the crawler. On Debian/Ubuntu
# `adduser` is an interactive front-end; these flags skip the password and
# GECOS questions so the step is safe in an unattended build.
RUN adduser --disabled-password --gecos "" work

# Download the Node.js release, unpack it to the version-independent path
# /home/work/nodejs, expose node/npm/npx on PATH, and delete the archive in the
# same layer so it never ends up in the image.
RUN wget -nv https://nodejs.org/dist/v${nodever}/node-v${nodever}-linux-x64.tar.xz \
        -O /home/work/node-v${nodever}-linux-x64.tar.xz \
    && tar -xJf /home/work/node-v${nodever}-linux-x64.tar.xz -C /home/work/ \
    && mv /home/work/node-v${nodever}-linux-x64 /home/work/nodejs \
    && ln -s /home/work/nodejs/bin/node /usr/local/bin/node \
    && ln -s /home/work/nodejs/bin/npm /usr/local/bin/npm \
    && ln -s /home/work/nodejs/bin/npx /usr/local/bin/npx \
    && rm /home/work/node-v${nodever}-linux-x64.tar.xz

# Everything below runs without root privileges.
USER work

# The project directory is created by `work` itself so that npm can write to it.
RUN mkdir /home/work/webster_startup
WORKDIR /home/work/webster_startup

# Create a throwaway package, add webster, and download Playwright's browsers.
RUN npm init -y
RUN npm i --save webster@latest
RUN npx playwright install

# COPY ignores USER and would leave the file owned by root; hand it to `work`.
COPY --chown=work:work example/crawler.js /home/work/webster_startup/

# No default command is set; the README starts the script with `node crawler.js`.
# CMD ["/home/work/nodejs/bin/node", "/home/work/webster_startup/crawler.js"]
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,23 @@ Webster is a reliable web crawling and scraping framework written with Node.js,

What sets Webster apart from other crawling frameworks is that it can scrape content rendered in the browser by client-side JavaScript and AJAX requests.

## Quick Start
Let's send a simple crawl request to the Google website:
```
docker pull zhuyingda/webster-playground
docker run --tty -e URL="https://www.google.com/robots.txt" zhuyingda/webster-playground node crawler.js
# add cookie with sign-in session
docker run --tty -e MOD=debug -e URL="https://www.google.com/robots.txt" -e Cookie="foo=1234; bar=abcd" zhuyingda/webster-playground node crawler.js
# set user-agent
docker run --tty -e URL="https://www.google.com/robots.txt" -e Cookie="foo=1234; bar=abcd" -e UA="Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36" zhuyingda/webster-playground node crawler.js
# see crawling log
docker run --tty -e MOD=debug -e URL="https://www.google.com/robots.txt" -e Cookie="foo=1234; bar=abcd" -e UA="Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36" zhuyingda/webster-playground node crawler.js
```

## Requirements
- Node.js 16.x+
- Works on Linux, Mac OSX
Expand Down
66 changes: 66 additions & 0 deletions example/crawler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
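/**
* Example crawler: fetches one page with the Playwright engine and prints its HTML.
*
* Environment variables read by this script: URL, Cookie, UA and debug
* (set debug to any non-empty value to print progress messages).
*
* Run from the host, through docker:
* docker run --tty -e URL="https://www.tiktok.com/" -e Cookie="foo=1234; bar=abcd" -e UA="Mozilla/115.0 AppleWebKit/537.36 Chrome/116" imageId node crawler.js
* Run from a shell inside the container:
* env URL="https://www.tiktok.com/" Cookie="foo=1234; bar=abcd" UA="Mozilla/115.0 AppleWebKit/537.36 Chrome/116" node crawler.js
*/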
const { spider } = require('webster');

/**
 * Spider subclass used by this example.
 *
 * It overrides two getters and the `parseHtml` hook of webster's `spider`
 * base class. How the base class consumes them is not visible in this file;
 * the names suggest "default user agent" and "default device type".
 */
class MySpider extends spider {
    /** Device type reported by this spider. */
    get defDeviceType() {
        return 'pc';
    }

    /** User-agent string reported by this spider (desktop Chrome 116 on macOS). */
    get defUserAgent() {
        return 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36';
    }

    /**
     * Hook that receives the page HTML; this example ignores it.
     *
     * @param {string} html - markup handed over by the framework (unused).
     * @returns {Promise<boolean>} always resolves to true.
     */
    async parseHtml(html) {
        return true;
    }
}

/**
 * Entry point: crawl a single URL with the Playwright engine and print the
 * captured HTML to stdout.
 *
 * Environment variables:
 *   URL    - page to crawl (defaults to https://www.tiktok.com/)
 *   Cookie - value sent as the Cookie header (defaults to an empty string)
 *   UA     - value sent as the User-Agent header (defaults to a desktop Chrome UA)
 *   debug  - any non-empty value enables this script's progress messages
 *
 * Exit status: 0 once the HTML has been printed; 1 when the crawl produced no
 * result entry or threw.
 */
(async () => {
    const url = process.env.URL || `https://www.tiktok.com/`;
    const cookie = process.env.Cookie || ``;
    const ua = process.env.UA || `Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36`;
    const verbose = Boolean(process.env.debug);

    if (verbose) {
        console.log('crawl input:', url, cookie, ua);
    }

    try {
        // Named `crawler` so it does not shadow the imported `spider` class.
        const crawler = new MySpider({
            type: 'browser',
            engine: 'playwright',
            actions: [
                {
                    type: 'waitAfterPageLoading',
                    value: 200
                }
            ],
            // Capture the whole document under the `result` field.
            targets: [
                {
                    selector: 'html',
                    type: 'html',
                    field: 'result'
                }
            ],
            customHeaders: {
                'Cookie': cookie,
                'User-Agent': ua,
            }
        });

        if (verbose) {
            console.log('crawling start');
        }
        const crawlResult = await crawler.startRequest(url);
        if (verbose) {
            console.log('crawling end');
        }

        // `result` may be missing or empty; only index it once an entry exists,
        // so an empty crawl is reported below instead of raising a TypeError.
        const entries = crawlResult && crawlResult.result;
        const firstEntry = entries ? entries[0] : undefined;
        if (firstEntry) {
            console.log(firstEntry.text);
            process.exit(0);
        }
        else {
            console.error('crawling error:', crawlResult);
            process.exit(1);
        }
    }
    catch (err) {
        // Without this, a failed crawl would surface as an unhandled promise
        // rejection rather than through the script's own error path.
        console.error('crawling error:', err);
        process.exit(1);
    }
})();
3 changes: 2 additions & 1 deletion lib/browser/puppeteer.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ module.exports.request = async function (option) {
logger.debug('[puppeteer] headless chromium will launch');
const launchOptions = {
args: ['--no-sandbox', '--disable-setuid-sandbox'],
headless: process.env.MOD !== 'browser',
// https://developer.chrome.com/articles/new-headless/
headless: process.env.MOD === 'browser' ? false : 'new',
ignoreHTTPSErrors: true
};
if (process.env.EXE_PATH) {
Expand Down
10 changes: 5 additions & 5 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
{
"name": "webster",
"version": "1.8.5",
"version": "1.9.0",
"description": "a reliable web crawling & scraping framework for Node.js.",
"main": "index.js",
"scripts": {
"test": "./node_modules/mocha/bin/mocha test/test.js",
"postinstall": "opencollective-postinstall || true",
"postinstall": "npx playwright install && opencollective-postinstall || true",
"lint": "./node_modules/.bin/eslint ./lib/"
},
"repository": {
Expand All @@ -24,15 +24,15 @@
"url": "https://github.com/zhuyingda/webster/issues"
},
"engines": {
"node": ">=10.0.0"
"node": ">=16.0.0"
},
"homepage": "https://github.com/zhuyingda/webster#readme",
"dependencies": {
"jsdom": "^16.4.0",
"log4js": "^6.3.0",
"opencollective-postinstall": "^2.0.2",
"playwright": "^1.30.0",
"puppeteer": "^19.6.3",
"playwright": "^1.39.0",
"puppeteer": "^21.5.0",
"redis": "^3.1.2",
"request": "^2.88.0",
"uuid": "^3.1.0"
Expand Down

0 comments on commit ecba130

Please sign in to comment.