Skip to content

Commit 14e373c

Browse files
modify README.md
1 parent 030c7c9 commit 14e373c

File tree

4 files changed

+73
-73
lines changed

4 files changed

+73
-73
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ Annotation versions are named with `DemoAnnotatedxxxxxx.java`.
4848

4949
### Http Request
5050

51+
+ [DemoCookieCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoCookieCrawler.java)
5152
+ [DemoRedirectCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoRedirectCrawler.java) | [DemoAnnotatedRedirectCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedRedirectCrawler.java)
5253
+ [DemoPostCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoPostCrawler.java)
5354
+ [AbuyunDynamicProxyRequester.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/AbuyunDynamicProxyRequester.java)

src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedCrawler.java

Lines changed: 0 additions & 62 deletions
This file was deleted.
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
package cn.edu.hfut.dmic.webcollector.example;
2+
3+
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
4+
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
5+
import cn.edu.hfut.dmic.webcollector.model.Page;
6+
import cn.edu.hfut.dmic.webcollector.plugin.net.OkHttpRequester;
7+
import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;
8+
import okhttp3.Request;
9+
10+
/**
11+
* 教程:使用WebCollector自定义Http请求
12+
* 可以自定义User-Agent和Cookie
13+
*
14+
* @author hu
15+
*/
16+
public class DemoCookieCrawler extends BreadthCrawler {
17+
18+
// 自定义的请求插件
19+
// 可以自定义User-Agent和Cookie
20+
public static class MyRequester extends OkHttpRequester {
21+
22+
String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36";
23+
String cookie = "name=abcdef";
24+
25+
// 每次发送请求前都会执行这个方法来构建请求
26+
@Override
27+
public Request.Builder createRequestBuilder(CrawlDatum crawlDatum) {
28+
// 这里使用的是OkHttp中的Request.Builder
29+
// 可以参考OkHttp的文档来修改请求头
30+
System.out.println("request with cookie: " + cookie);
31+
return super.createRequestBuilder(crawlDatum)
32+
.addHeader("User-Agent", userAgent)
33+
.addHeader("Cookie", cookie);
34+
}
35+
36+
}
37+
38+
public DemoCookieCrawler(String crawlPath) {
39+
super(crawlPath, true);
40+
41+
// 设置请求插件
42+
setRequester(new MyRequester());
43+
44+
// 爬取github about下面的网页
45+
addSeed("https://github.com/about");
46+
addRegex("https://github.com/about/.*");
47+
48+
}
49+
50+
public void visit(Page page, CrawlDatums crawlDatums) {
51+
System.out.println(page.doc().title());
52+
}
53+
54+
public static void main(String[] args) throws Exception {
55+
DemoCookieCrawler crawler = new DemoCookieCrawler("crawl");
56+
crawler.start(2);
57+
}
58+
}

src/main/java/cn/edu/hfut/dmic/webcollector/fetcher/VisitorMethodDispatcher.java

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,20 @@ public void setAutoParse(boolean autoParse) {
237237
this.autoParse = autoParse;
238238
}
239239

240+
protected void parseLink(Page page, CrawlDatums next) {
241+
String conteType = page.contentType();
242+
if (conteType != null && conteType.contains("text/html")) {
243+
Document doc = page.doc();
244+
if (doc != null) {
245+
Links links = new Links().addByRegex(doc, regexRule, getConf().getAutoDetectImg());
246+
next.add(links);
247+
}
248+
}
249+
250+
}
251+
252+
253+
240254
// public static void main(String[] args) throws Exception {
241255
// Visitor visitor = new Visitor() {
242256
//
@@ -258,16 +272,5 @@ public void setAutoParse(boolean autoParse) {
258272
// }
259273
//
260274
//
261-
protected void parseLink(Page page, CrawlDatums next) {
262-
String conteType = page.contentType();
263-
if (conteType != null && conteType.contains("text/html")) {
264-
Document doc = page.doc();
265-
if (doc != null) {
266-
Links links = new Links().addByRegex(doc, regexRule, getConf().getAutoDetectImg());
267-
next.add(links);
268-
}
269-
}
270-
271-
}
272275

273276
}

0 commit comments

Comments
 (0)