3
3
import cn .edu .hfut .dmic .webcollector .model .CrawlDatums ;
4
4
import cn .edu .hfut .dmic .webcollector .model .Page ;
5
5
import cn .edu .hfut .dmic .webcollector .plugin .rocks .BreadthCrawler ;
6
+ import cn .edu .hfut .dmic .webcollector .util .ExceptionUtils ;
6
7
7
8
/**
 * Crawling news from the GitHub blog
/**
 * Creates the demo crawler and registers its seed URL.
 *
 * @param crawlPath directory where the crawler persists its state
 * @param autoParse whether links are extracted automatically from fetched pages
 */
public DemoExceptionCrawler(String crawlPath, boolean autoParse) {
    super(crawlPath, autoParse);
    /* start pages */
    this.addSeed("https://blog.github.com/");
}
37
25
38
- //enable resumable mode
39
- //setResumable(true );
26
+ public void myMethod () throws Exception {
27
+ throw new Exception ( "this is an exception" );
40
28
}
41
29
42
30
@ Override
43
31
public void visit (Page page , CrawlDatums next ) {
44
- String url = page .url ();
45
- /*if page is news page*/
46
- if (page .matchUrl ("https://blog.github.com/[0-9]{4}-[0-9]{2}-[0-9]{2}[^/]+/" )) {
47
-
48
- /*extract title and content of news by css selector*/
49
- String title = page .select ("h1[class=lh-condensed]" ).first ().text ();
50
- String content = page .selectText ("div.content.markdown-body" );
51
-
52
- System .out .println ("URL:\n " + url );
53
- System .out .println ("title:\n " + title );
54
- System .out .println ("content:\n " + content );
55
-
56
- /*If you want to add urls to crawl,add them to nextLink*/
57
- /*WebCollector automatically filters links that have been fetched before*/
58
- /*If autoParse is true and the link you add to nextLinks does not match the
59
- regex rules,the link will also been filtered.*/
60
- //next.add("http://xxxxxx.com");
32
+ try {
33
+ this .myMethod ();
34
+ } catch (Exception e ) {
35
+ // 当捕捉到异常时,且认为这个网页需要重新爬取时
36
+ // 应该使用ExceptionUtils.fail(e)
37
+ // 无视或者throw异常在编译时会报错,因为visit方法没有throws异常
38
+ // 该方法会抛出RuntimeException,不会强制要求visit方法加上throws
39
+ ExceptionUtils .fail (e );
61
40
}
62
41
}
63
42
0 commit comments