Skip to content

Commit ae816c2

Browse files
DemoExceptionCrawler
1 parent 6e82886 commit ae816c2

File tree

3 files changed

+16
-41
lines changed

3 files changed

+16
-41
lines changed

README.md

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ Annotation versions are named with `DemoAnnotatedxxxxxx.java`.
3535
### Basic
3636

3737
+ [DemoAutoNewsCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAutoNewsCrawler.java) | [DemoAnnotatedAutoNewsCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedAutoNewsCrawler.java)
38-
+ [DemoManualNewsCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoManualNewsCrawler.java)
39-
38+
+ [DemoManualNewsCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoManualNewsCrawler.java) | [DemoAnnotatedManualNewsCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedManualNewsCrawler.java)
39+
+ [DemoExceptionCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoExceptionCrawler.java)
4040

4141
### CrawlDatum and MetaData
4242

@@ -46,27 +46,23 @@ Annotation versions are named with `DemoAnnotatedxxxxxx.java`.
4646
+ [DemoBingCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoBingCrawler.java) | [DemoAnnotatedBingCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedBingCrawler.java)
4747
+ [DemoAnnotatedDepthCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedDepthCrawler.java)
4848

49-
### Http Request
49+
### Http Request and Javascript
5050

5151
+ [DemoCookieCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoCookieCrawler.java)
5252
+ [DemoRedirectCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoRedirectCrawler.java) | [DemoAnnotatedRedirectCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedRedirectCrawler.java)
5353
+ [DemoPostCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoPostCrawler.java)
5454
+ [DemoRandomProxyCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoRandomProxyCrawler.java)
5555
+ [AbuyunDynamicProxyRequester.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/AbuyunDynamicProxyRequester.java)
56+
+ [DemoSeleniumCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoSeleniumCrawler.java)
5657

5758
### NextFilter
5859

5960
+ [DemoNextFilter.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoNextFilter.java)
6061
+ [DemoHashSetNextFilter.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoHashSetNextFilter.java)
6162

6263

63-
### Exception
6464

65-
+ [DemoExceptionCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoExceptionCrawler.java)
66-
67-
### Javascript
6865

69-
+ [DemoSeleniumCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoSeleniumCrawler.java)
7066

7167

7268
## Quickstart

src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoExceptionCrawler.java

Lines changed: 12 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
44
import cn.edu.hfut.dmic.webcollector.model.Page;
55
import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;
6+
import cn.edu.hfut.dmic.webcollector.util.ExceptionUtils;
67

78
/**
89
* Demonstrates exception handling in visit(), using the GitHub blog as the seed page
@@ -20,44 +21,22 @@ public DemoExceptionCrawler(String crawlPath, boolean autoParse) {
2021
super(crawlPath, autoParse);
2122
/*start pages*/
2223
this.addSeed("https://blog.github.com/");
23-
for(int pageIndex = 2; pageIndex <= 5; pageIndex++) {
24-
String seedUrl = String.format("https://blog.github.com/page/%d/", pageIndex);
25-
this.addSeed(seedUrl);
26-
}
27-
28-
/*fetch url like "https://blog.github.com/2018-07-13-graphql-for-octokit/" */
29-
this.addRegex("https://blog.github.com/[0-9]{4}-[0-9]{2}-[0-9]{2}-[^/]+/");
30-
/*do not fetch jpg|png|gif*/
31-
//this.addRegex("-.*\\.(jpg|png|gif).*");
32-
/*do not fetch url contains #*/
33-
//this.addRegex("-.*#.*");
34-
35-
setThreads(50);
36-
getConf().setTopN(100);
24+
}
3725

38-
//enable resumable mode
39-
//setResumable(true);
26+
public void myMethod() throws Exception{
27+
throw new Exception("this is an exception");
4028
}
4129

4230
@Override
4331
public void visit(Page page, CrawlDatums next) {
44-
String url = page.url();
45-
/*if page is news page*/
46-
if (page.matchUrl("https://blog.github.com/[0-9]{4}-[0-9]{2}-[0-9]{2}[^/]+/")) {
47-
48-
/*extract title and content of news by css selector*/
49-
String title = page.select("h1[class=lh-condensed]").first().text();
50-
String content = page.selectText("div.content.markdown-body");
51-
52-
System.out.println("URL:\n" + url);
53-
System.out.println("title:\n" + title);
54-
System.out.println("content:\n" + content);
55-
56-
/*If you want to add urls to crawl,add them to nextLink*/
57-
/*WebCollector automatically filters links that have been fetched before*/
58-
/*If autoParse is true and the link you add to nextLinks does not match the
59-
regex rules,the link will also been filtered.*/
60-
//next.add("http://xxxxxx.com");
32+
try {
33+
this.myMethod();
34+
} catch (Exception e) {
35+
// 当捕捉到异常时,且认为这个网页需要重新爬取时
36+
// 应该使用ExceptionUtils.fail(e)
37+
// 无视或者throw异常在编译时会报错,因为visit方法没有throws异常
38+
// 该方法会抛出RuntimeException,不会强制要求visit方法加上throws
39+
ExceptionUtils.fail(e);
6140
}
6241
}
6342

webcollector-2.73-alpha-bin.zip

-578 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)