
Commit c73ac78

init 2.73-alpha
1 parent 21c0f42 commit c73ac78

13 files changed: +912, -180 lines


WebCollector/pom.xml

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 
     <groupId>cn.edu.hfut.dmic.webcollector</groupId>
     <artifactId>WebCollector</artifactId>
-    <version>2.72-beta</version>
+    <version>2.73-alpha</version>
     <description>A java crawler for information collection</description>
     <url>https://github.com/CrawlScript/WebCollector</url>
     <packaging>jar</packaging>

WebCollector/src/main/java/cn/edu/hfut/dmic/webcollector/crawler/AutoParseCrawler.java

Lines changed: 21 additions & 27 deletions
@@ -19,6 +19,7 @@
 
 import cn.edu.hfut.dmic.webcollector.fetcher.Executor;
 import cn.edu.hfut.dmic.webcollector.fetcher.Visitor;
+import cn.edu.hfut.dmic.webcollector.fetcher.VisitorMethodDispatcher;
 import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
 import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
 import cn.edu.hfut.dmic.webcollector.model.Links;
@@ -46,14 +47,23 @@ public abstract class AutoParseCrawler extends Crawler implements Executor, Visi
     protected Visitor visitor;
     protected Requester requester;
 
+    protected VisitorMethodDispatcher visitorMethodDispatcher;
+
     public AutoParseCrawler(boolean autoParse) {
         this.autoParse = autoParse;
         this.requester = new OkHttpRequester();
         this.visitor = this;
         this.executor = this;
     }
 
-    // @Override
+    @Override
+    public void start(int depth) throws Exception {
+        this.visitorMethodDispatcher = new VisitorMethodDispatcher(visitor, autoParse, regexRule);
+        ConfigurationUtils.setTo(this, this.visitorMethodDispatcher);
+        super.start(depth);
+    }
+
+    // @Override
     // public Page getResponse(CrawlDatum crawlDatum) throws Exception {
     //     HttpRequest request = new HttpRequest(crawlDatum);
     //     return request.responsePage();
@@ -75,28 +85,12 @@ protected void registerOtherConfigurations() {
     @Override
     public void execute(CrawlDatum datum, CrawlDatums next) throws Exception {
         Page page = requester.getResponse(datum);
-        visitor.visit(page, next);
-        if (autoParse && !regexRule.isEmpty()) {
-            parseLink(page, next);
-        }
-        afterParse(page, next);
-    }
-
-    protected void afterParse(Page page, CrawlDatums next) {
+        // visitor.visit(page, next);
+        visitorMethodDispatcher.dispatch(page, next);
 
     }
 
-    protected void parseLink(Page page, CrawlDatums next) {
-        String conteType = page.contentType();
-        if (conteType != null && conteType.contains("text/html")) {
-            Document doc = page.doc();
-            if (doc != null) {
-                Links links = new Links().addByRegex(doc, regexRule, getConf().getAutoDetectImg());
-                next.add(links);
-            }
-        }
 
-    }
 
     /**
      * Add a URL regex constraint
@@ -151,14 +145,14 @@ public Visitor getVisitor() {
         return visitor;
     }
 
-    /**
-     * Set the Visitor
-     *
-     * @param visitor the Visitor
-     */
-    public void setVisitor(Visitor visitor) {
-        this.visitor = visitor;
-    }
+    // /**
+    //  * Set the Visitor
+    //  *
+    //  * @param visitor the Visitor
+    //  */
+    // public void setVisitor(Visitor visitor) {
+    //     this.visitor = visitor;
+    // }
 
     public Requester getRequester() {
         return requester;

WebCollector/src/main/java/cn/edu/hfut/dmic/webcollector/crawler/Crawler.java

Lines changed: 8 additions & 8 deletions
@@ -397,14 +397,14 @@ public Executor getExecutor() {
         return executor;
     }
 
-    /**
-     * Set the executor
-     *
-     * @param executor the executor
-     */
-    public void setExecutor(Executor executor) {
-        this.executor = executor;
-    }
+    // /**
+    //  * Set the executor
+    //  *
+    //  * @param executor the executor
+    //  */
+    // public void setExecutor(Executor executor) {
+    //     this.executor = executor;
+    // }
 
     /**
      * Returns the upper limit on the number of pages fetched in each iteration
WebCollector/src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedBingCrawler.java

Lines changed: 148 additions & 0 deletions

@@ -0,0 +1,148 @@
/*
 * Copyright (C) 2015 hu
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
package cn.edu.hfut.dmic.webcollector.example;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.ram.RamCrawler;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.URLEncoder;

/**
 * This tutorial demonstrates new features introduced in WebCollector 2.20:
 *
 * 1) MetaData:
 * MetaData is extra information attached to each crawl task. Used well, it greatly
 * simplifies crawler design. For example, POST requests usually carry parameters, and
 * the traditional approach of encoding parameters only in the URL does not fit complex
 * POST requests. Some crawl tasks also need the depth of a page in the traversal tree,
 * which is easy to obtain with MetaData (see the DemoDepthCrawler tutorial).
 *
 * 2) RamCrawler:
 * RamCrawler does not depend on the file system or a database and is suited to one-off
 * crawl tasks. For long-running tasks, use BreadthCrawler instead.
 *
 * This tutorial implements a crawler that fetches the first n pages of Bing search
 * results for a keyword and writes the results to standard output.
 * To collect results into a data structure such as an ArrayList instead, define an
 * ArrayList field in the class and insert results into it. Note that the crawler is
 * multi-threaded and ArrayList is not thread-safe, so wrap each insert in
 * synchronized(this){ // insert } to keep it safe.
 *
 * The parsing rules for Bing search used here may break when Bing changes its layout.
 *
 * @author hu
 */
public class DemoAnnotatedBingCrawler extends RamCrawler {

    public DemoAnnotatedBingCrawler(String keyword, int maxPageNum) throws Exception {
        for (int pageNum = 1; pageNum <= maxPageNum; pageNum++) {
            String url = createBingUrl(keyword, pageNum);
            addSeedAndReturn(url)
                    .type("searchEngine")
                    .meta("keyword", keyword)
                    .meta("pageNum", pageNum)
                    .meta("depth", 1);
        }
    }

    // On a 301 or 302 response, follow the redirect manually (add the target to next)
    // and copy the task's meta
    @MatchCode(codes = {301, 302})
    public void visitRedirect(Page page, CrawlDatums next) {
        next.addAndReturn(page.location()).meta(page.meta());
    }

    @MatchType(types = "searchEngine")
    public void visitSearchEngine(Page page, CrawlDatums next) {
        String keyword = page.meta("keyword");
        int pageNum = page.metaAsInt("pageNum");
        System.out.println("Successfully fetched page " + pageNum + " of search results for keyword " + keyword);
        Elements results = page.select("li.b_algo>h2>a");

        for (int rank = 0; rank < results.size(); rank++) {
            Element result = results.get(rank);
            /*
            We also want to crawl the page each search result points to; these pages are
            called outlinks here. When visiting an outlink we still want to know which
            result page and which position it came from, so the page number and rank are
            stored in the follow-up CrawlDatum. To distinguish outlinks from search-engine
            result pages, type is set to "outlink"; this value is entirely user-defined
            and can be anything.
            */
            String href = result.attr("abs:href");
            next.addAndReturn(href)
                    .type("outlink")
                    .meta("keyword", keyword)
                    .meta("pageNum", pageNum)
                    .meta("rank", rank);
        }
    }

    @MatchType(types = "outlink")
    public void visitOutlink(Page page, CrawlDatums next) {
        int depth = page.metaAsInt("depth");
        int pageNum = page.metaAsInt("pageNum");
        int rank = page.metaAsInt("rank");
        String referer = page.meta("referer");

        String line = String.format("page %s, result %s: %s (%s bytes)\tdepth=%s\treferer=%s",
                pageNum, rank + 1, page.doc().title(), page.content().length, depth, referer);
        System.out.println(line);
    }

    @Override
    public void visit(Page page, CrawlDatums next) {

    }

    /*
    In a classic crawler, every page has a referer value: the page its link was found on.
    For example, if we first visit the Sina homepage and extract news links from it, the
    referer of those news pages is the Sina homepage. WebCollector does not store referer
    directly, but the same effect can be achieved by keeping the referer in metaData as
    shown below. Anchor-text storage in classic crawlers can be handled the same way.

    Some tasks need the depth of the current page in the traversal tree; this is also
    easy with metaData: when adding a CrawlDatum to next, set its depth to the current
    page's depth + 1.
    */
    @AfterParse
    public void afterParse(Page page, CrawlDatums next) {
        int depth = page.metaAsInt("depth");
        next.meta("depth", depth + 1).meta("referer", page.url());
    }


    public static void main(String[] args) throws Exception {
        // the search keyword "网络爬虫" means "web crawler"
        DemoAnnotatedBingCrawler crawler = new DemoAnnotatedBingCrawler("网络爬虫", 3);
        crawler.start();
    }

    /**
     * Build the Bing search URL for a keyword and page number
     * @param keyword the search keyword
     * @param pageNum the page number
     * @return the corresponding URL
     * @throws Exception on encoding errors
     */
    public static String createBingUrl(String keyword, int pageNum) throws Exception {
        int first = pageNum * 10 - 9;
        keyword = URLEncoder.encode(keyword, "utf-8");
        return String.format("http://cn.bing.com/search?q=%s&first=%s", keyword, first);
    }

}
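The class comment in DemoAnnotatedBingCrawler above suggests collecting results into an ArrayList field and guarding inserts with synchronized(this), since visit methods run on multiple worker threads. A minimal sketch of that pattern follows; it is not part of this commit, and the ResultCollectingCrawler class, its seed URL, and its field names are illustrative assumptions.

// Hypothetical example, not part of this commit: a RamCrawler that stores
// "url<TAB>title" lines in a shared list. visit() runs on multiple threads and
// ArrayList is not thread-safe, so every insert is wrapped in synchronized(this).
package cn.edu.hfut.dmic.webcollector.example;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.ram.RamCrawler;

import java.util.ArrayList;

public class ResultCollectingCrawler extends RamCrawler {

    private final ArrayList<String> results = new ArrayList<>();

    public ResultCollectingCrawler(String seedUrl) {
        addSeed(seedUrl);
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        // lock before mutating the shared, non-thread-safe list
        synchronized (this) {
            results.add(page.url() + "\t" + page.doc().title());
        }
    }

    public ArrayList<String> getResults() {
        // return a snapshot so callers never iterate a list that is still being mutated
        synchronized (this) {
            return new ArrayList<>(results);
        }
    }

    public static void main(String[] args) throws Exception {
        ResultCollectingCrawler crawler = new ResultCollectingCrawler("https://blog.csdn.net/");
        crawler.start(1);
        crawler.getResults().forEach(System.out::println);
    }
}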
WebCollector/src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedCrawler.java

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
package cn.edu.hfut.dmic.webcollector.example;


import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;

public class DemoAnnotatedCrawler extends BreadthCrawler {

    /**
     * Construct a RocksDB-backed crawler.
     * The RocksDB directory is crawlPath; it holds the crawl history (visited URLs etc.).
     * Do not use the same crawlPath for different tasks:
     * two crawlers running in parallel on the same crawlPath will produce errors.
     *
     * @param crawlPath the directory used by RocksDB
     * @param autoParse whether to detect new URLs automatically using the configured regexes
     */
    public DemoAnnotatedCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        addSeed("https://blog.csdn.net/", "seed");
        addRegex("https://blog.csdn.net/.*");
    }



    @MatchUrlRegexRule(urlRegexRule = {
            "https://blog.csdn.net/.*"
    })
    @MatchNullType()
    public void visitMain(Page page, CrawlDatums next) {
        System.out.println("this is regex seed");
    }

    // @MatchUrl(urlRegex = "https://blog.csdn.net/.*")
    // public void visitOther(Page page, CrawlDatums next) {
    //     System.out.println("this is other");
    // }


    // @MatchUrl(urlRegex = "https://blog.csdn.net/.*")
    // public void visitOther(Page page, CrawlDatums next) {
    //     System.out.println("this is other");
    // }


    // @MatchType(types = "seed")
    // public void visitSeed(Page page, CrawlDatums next) {
    //     System.out.println("this is type seed");
    // }


    @Override
    public void visit(Page page, CrawlDatums next) {
        // System.out.println("this is default");
    }

    public static void main(String[] args) throws Exception {
        DemoAnnotatedCrawler crawler = new DemoAnnotatedCrawler("crawl", true);
        crawler.start(2);
    }
}

0 commit comments