
Commit 030c7c9

modify README.md
1 parent 283bf0d commit 030c7c9

File tree

4 files changed (+163, -9 lines)


README.md

Lines changed: 19 additions & 6 deletions
@@ -9,15 +9,17 @@ WebCollector is an open source web crawler framework based on Java.It provides
 
 ## Installation
 
+<!--
 ### Using Maven
 
 ```xml
 <dependency>
     <groupId>cn.edu.hfut.dmic.webcollector</groupId>
     <artifactId>WebCollector</artifactId>
-    <version>2.72-beta</version>
+    <version>2.73-alpha</version>
 </dependency>
 ```
+-->
 
 ### Without Maven
 WebCollector jars are available on the [HomePage](https://github.com/CrawlScript/WebCollector).
@@ -28,22 +30,33 @@ WebCollector jars are available on the [HomePage](https://github.com/CrawlScript
 
 ## Example Index
 
+Annotation versions are named with `DemoAnnotatedxxxxxx.java`.
+
 ### Basic
 
-+ [DemoAutoNewsCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAutoNewsCrawler.java) ([Annotation Version](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedAutoNewsCrawler.java))
++ [DemoAutoNewsCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAutoNewsCrawler.java) | [DemoAnnotatedAutoNewsCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedAutoNewsCrawler.java)
 + [DemoManualNewsCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoManualNewsCrawler.java)
 
 
-### MetaData
-
-+ [DemoBingCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoBingCrawler.java)([Annotation Version](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedBingCrawler.java))
+### CrawlDatum and MetaData
 
++ [DemoMetaCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoMetaCrawler.java)
++ [DemoAnnotatedMatchTypeCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedMatchTypeCrawler.java)
++ [DemoAnnotatedDepthCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedDepthCrawler.java)
++ [DemoBingCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoBingCrawler.java) | [DemoAnnotatedBingCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedBingCrawler.java)
++ [DemoAnnotatedDepthCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedDepthCrawler.java)
 
 ### Http Request
 
-+ [DemoBingCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoBingCrawler.java)([Annotation Version](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedBingCrawler.java))
++ [DemoRedirectCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoRedirectCrawler.java) | [DemoAnnotatedRedirectCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedRedirectCrawler.java)
++ [DemoPostCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoPostCrawler)
 + [AbuyunDynamicProxyRequester.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/AbuyunDynamicProxyRequester.java)
 
+### NextFilter
+
++ [DemoNextFilter.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoNextFilter.java)
++ [DemoHashSetNextFilter.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoHashSetNextFilter.java)
+
 
 
 
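The demos listed in the index above are ordinary classes with their own `main` methods. As a quick orientation, here is a minimal sketch of driving one of them; it mirrors the `main` method of DemoRedirectCrawler added later in this commit, and the wrapper class name `RunRedirectDemo` is purely illustrative.

```java
package cn.edu.hfut.dmic.webcollector.example;

// Hypothetical wrapper; it only reuses the constructor and start() call
// shown in DemoRedirectCrawler's own main() further down in this commit.
public class RunRedirectDemo {
    public static void main(String[] args) throws Exception {
        // The constructor seeds one Bing search URL per result page (keyword, max page number).
        DemoRedirectCrawler crawler = new DemoRedirectCrawler("网络爬虫", 3);
        // start() kicks off the crawl over the seeded URLs.
        crawler.start();
    }
}
```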
src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoDepthCrawler.java renamed to src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedDepthCrawler.java

Lines changed: 3 additions & 3 deletions
@@ -29,9 +29,9 @@
  *
  * @author hu
  */
-public class DemoDepthCrawler extends BreadthCrawler {
+public class DemoAnnotatedDepthCrawler extends BreadthCrawler {
 
-    public DemoDepthCrawler(String crawlPath, boolean autoParse) {
+    public DemoAnnotatedDepthCrawler(String crawlPath, boolean autoParse) {
         super(crawlPath, autoParse);
 
         for (int i = 1; i <= 5; i++) {
@@ -71,7 +71,7 @@ public void afterParse(Page page, CrawlDatums next) {
 
 
     public static void main(String[] args) throws Exception {
-        DemoDepthCrawler crawler = new DemoDepthCrawler("crawl", true);
+        DemoAnnotatedDepthCrawler crawler = new DemoAnnotatedDepthCrawler("crawl", true);
         crawler.getConf().setTopN(5);
         crawler.start(3);
     }
src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedRedirectCrawler.java

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2015 hu
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+package cn.edu.hfut.dmic.webcollector.example;
+
+import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
+import cn.edu.hfut.dmic.webcollector.model.Page;
+import cn.edu.hfut.dmic.webcollector.plugin.ram.RamCrawler;
+
+import java.net.URLEncoder;
+
+/**
+ * Handle 301/302 redirection
+ * @author hu
+ */
+public class DemoAnnotatedRedirectCrawler extends RamCrawler {
+
+    public DemoAnnotatedRedirectCrawler(String keyword, int maxPageNum) throws Exception {
+        for (int pageNum = 1; pageNum <= maxPageNum; pageNum++) {
+            String url = createBingUrl(keyword, pageNum);
+            addSeedAndReturn(url);
+        }
+    }
+
+    // On a 301 or 302 response, follow the redirect manually (add the target to next)
+    // and copy the original task's meta
+    @MatchCode(codes = {301, 302})
+    public void visitRedirect(Page page, CrawlDatums next){
+        next.addAndReturn(page.location()).meta(page.meta());
+    }
+
+
+    @Override
+    public void visit(Page page, CrawlDatums next) {
+        System.out.println("this page is not redirected: " + page.url());
+    }
+
+    public static void main(String[] args) throws Exception {
+        DemoAnnotatedRedirectCrawler crawler = new DemoAnnotatedRedirectCrawler("网络爬虫", 3);
+        crawler.start();
+    }
+
+    /**
+     * Build the Bing search URL for the given keyword and page number
+     * @param keyword the search keyword
+     * @param pageNum the page number
+     * @return the corresponding URL
+     * @throws Exception on encoding failure
+     */
+    public static String createBingUrl(String keyword, int pageNum) throws Exception {
+        int first = pageNum * 10 - 9;
+        keyword = URLEncoder.encode(keyword, "utf-8");
+        return String.format("http://cn.bing.com/search?q=%s&first=%s", keyword, first);
+    }
+
+}
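For reference, a quick check of the paging arithmetic in `createBingUrl` above. The wrapper class `CreateBingUrlCheck` is hypothetical; the expected output follows from the format string and UTF-8 percent-encoding in the method itself.

```java
package cn.edu.hfut.dmic.webcollector.example;

// Illustrative check: pageNum = 3 gives first = 3 * 10 - 9 = 21.
public class CreateBingUrlCheck {
    public static void main(String[] args) throws Exception {
        // URLEncoder encodes "网络爬虫" as %E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB, so this prints
        // http://cn.bing.com/search?q=%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB&first=21
        System.out.println(DemoAnnotatedRedirectCrawler.createBingUrl("网络爬虫", 3));
    }
}
```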
src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoRedirectCrawler.java

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2015 hu
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+package cn.edu.hfut.dmic.webcollector.example;
+
+import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
+import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
+import cn.edu.hfut.dmic.webcollector.model.Page;
+import cn.edu.hfut.dmic.webcollector.plugin.ram.RamCrawler;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.net.URLEncoder;
+
+/**
+ * Handle 301/302 redirection
+ * @author hu
+ */
+public class DemoRedirectCrawler extends RamCrawler {
+
+    public DemoRedirectCrawler(String keyword, int maxPageNum) throws Exception {
+        for (int pageNum = 1; pageNum <= maxPageNum; pageNum++) {
+            String url = createBingUrl(keyword, pageNum);
+            addSeedAndReturn(url);
+        }
+    }
+
+    @Override
+    public void visit(Page page, CrawlDatums next) {
+        // On a 301 or 302 response, follow the redirect manually (add the target to next)
+        // and copy the original task's meta
+        if(page.code() == 301 || page.code() == 302){
+            next.addAndReturn(page.location()).meta(page.meta());
+            return;
+        }
+        System.out.println("this page is not redirected: " + page.url());
+    }
+
+    public static void main(String[] args) throws Exception {
+        DemoRedirectCrawler crawler = new DemoRedirectCrawler("网络爬虫", 3);
+        crawler.start();
+    }
+
+    /**
+     * Build the Bing search URL for the given keyword and page number
+     * @param keyword the search keyword
+     * @param pageNum the page number
+     * @return the corresponding URL
+     * @throws Exception on encoding failure
+     */
+    public static String createBingUrl(String keyword, int pageNum) throws Exception {
+        int first = pageNum * 10 - 9;
+        keyword = URLEncoder.encode(keyword, "utf-8");
+        return String.format("http://cn.bing.com/search?q=%s&first=%s", keyword, first);
+    }
+
+}
