@@ -3,17 +3,9 @@ WebCollector is an open source web crawler framework based on Java. It provides
some simple interfaces for crawling the Web, so you can set up a
multi-threaded web crawler in less than 5 minutes.

-
-
-
## HomePage
[https://github.com/CrawlScript/WebCollector](https://github.com/CrawlScript/WebCollector)

- <!--
- ## Document
- [WebCollector-GitDoc](https://github.com/CrawlScript/WebCollector-GitDoc)
- -->
-

## Installation
@@ -34,13 +26,19 @@ WebCollector jars are available on the [HomePage](https://github.com/CrawlScript



+ ## Example Index
+
+ + [AbuyunDynamicProxyRequester.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/AbuyunDynamicProxyRequester.java)
+
+
+

## Quickstart
Let's crawl some news from GitHub news. This demo prints out the titles and contents extracted from GitHub news pages.

### Automatically Detecting URLs

- [AutoNewsCrawler.java](https://github.com/CrawlScript/WebCollector/blob/master/AutoNewsCrawler.java):
+ [DemoAutoNewsCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAutoNewsCrawler.java):

```java
@@ -53,14 +51,14 @@ import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;
 *
 * @author hu
 */
-public class AutoNewsCrawler extends BreadthCrawler {
+public class DemoAutoNewsCrawler extends BreadthCrawler {
    /**
     * @param crawlPath crawlPath is the path of the directory which maintains
     * information of this crawler
     * @param autoParse if autoParse is true, BreadthCrawler will auto extract
     * links which match regex rules from page
     */
-    public AutoNewsCrawler(String crawlPath, boolean autoParse) {
+    public DemoAutoNewsCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        /* start pages */
        this.addSeed("https://blog.github.com/");
@@ -106,7 +104,7 @@ public class AutoNewsCrawler extends BreadthCrawler {
    }

    public static void main(String[] args) throws Exception {
-        AutoNewsCrawler crawler = new AutoNewsCrawler("crawl", true);
+        DemoAutoNewsCrawler crawler = new DemoAutoNewsCrawler("crawl", true);
        /* start crawl with depth of 4 */
        crawler.start(4);
    }
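The two hunks above rename `AutoNewsCrawler` to `DemoAutoNewsCrawler` and point the link at the in-repo example path. For orientation, here is a minimal sketch of how the complete demo fits together after the rename, assuming the `addRegex`, `visit`, and `start` usage from the README's full listing; the regexes and CSS selectors are illustrative, not the exact file contents:

```java
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;

public class DemoAutoNewsCrawler extends BreadthCrawler {
    public DemoAutoNewsCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        /* seed page, as in the hunk above */
        this.addSeed("https://blog.github.com/");
        /* assumed: with autoParse == true, links matching this regex are followed */
        this.addRegex("https://blog.github.com/[0-9]{4}-[0-9]{2}-[0-9]{2}[^/]+/");
        /* assumed: a leading '-' excludes matching URLs, e.g. images */
        this.addRegex("-.*\\.(jpg|png|gif).*");
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        /* called once per fetched page; only news pages are extracted */
        if (page.matchUrl("https://blog.github.com/[0-9]{4}-[0-9]{2}-[0-9]{2}[^/]+/")) {
            String title = page.select("h1[class=lh-condensed]").first().text();
            String content = page.selectText("div.content.markdown-body");
            System.out.println("title:\n" + title);
            System.out.println("content:\n" + content);
        }
    }

    public static void main(String[] args) throws Exception {
        DemoAutoNewsCrawler crawler = new DemoAutoNewsCrawler("crawl", true);
        crawler.start(4); // crawl with depth of 4
    }
}
```

With `autoParse` set to true, `BreadthCrawler` follows any link that matches a positive regex rule, so `visit` only needs to handle extraction.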
@@ -119,7 +117,7 @@ public class AutoNewsCrawler extends BreadthCrawler {
### Manually Detecting URLs

- [ManualNewsCrawler.java](https://github.com/CrawlScript/WebCollector/blob/master/ManualNewsCrawler.java):
+ [DemoManualNewsCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoManualNewsCrawler.java):

```java
@@ -132,14 +130,14 @@ import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;
 *
 * @author hu
 */
-public class ManualNewsCrawler extends BreadthCrawler {
+public class DemoManualNewsCrawler extends BreadthCrawler {
    /**
     * @param crawlPath crawlPath is the path of the directory which maintains
     * information of this crawler
     * @param autoParse if autoParse is true, BreadthCrawler will auto extract
     * links which match regex rules from page
     */
-    public ManualNewsCrawler(String crawlPath, boolean autoParse) {
+    public DemoManualNewsCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        // add 5 start pages and set their type to "list"
        // "list" is not a reserved word, you can use other string instead
@@ -182,7 +180,7 @@ public class ManualNewsCrawler extends BreadthCrawler {
    }

    public static void main(String[] args) throws Exception {
-        ManualNewsCrawler crawler = new ManualNewsCrawler("crawl", false);
+        DemoManualNewsCrawler crawler = new DemoManualNewsCrawler("crawl", false);

        crawler.getConf().setExecuteInterval(5000);
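`setExecuteInterval(5000)` pauses 5000 ms between fetch rounds, which keeps the demo polite to the target site. A short usage sketch around it; `setTopN` is assumed to come from the same `Configuration` object:

```java
public static void main(String[] args) throws Exception {
    DemoManualNewsCrawler crawler = new DemoManualNewsCrawler("crawl", false);
    // wait 5000 ms between fetch rounds, as in the hunk above
    crawler.getConf().setExecuteInterval(5000);
    // assumed: cap the number of links taken from each page
    crawler.getConf().setTopN(100);
    crawler.start(4); // crawl with depth of 4, as in the demos
}
```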