
Commit c73ac78

init 2.73-alpha
1 parent 21c0f42 commit c73ac78

13 files changed: +912, -180 lines


WebCollector/pom.xml

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 
     <groupId>cn.edu.hfut.dmic.webcollector</groupId>
     <artifactId>WebCollector</artifactId>
-    <version>2.72-beta</version>
+    <version>2.73-alpha</version>
     <description>A java crawler for information collection</description>
     <url>https://github.com/CrawlScript/WebCollector</url>
     <packaging>jar</packaging>

WebCollector/src/main/java/cn/edu/hfut/dmic/webcollector/crawler/AutoParseCrawler.java

Lines changed: 21 additions & 27 deletions
@@ -19,6 +19,7 @@
 
 import cn.edu.hfut.dmic.webcollector.fetcher.Executor;
 import cn.edu.hfut.dmic.webcollector.fetcher.Visitor;
+import cn.edu.hfut.dmic.webcollector.fetcher.VisitorMethodDispatcher;
 import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
 import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
 import cn.edu.hfut.dmic.webcollector.model.Links;
@@ -46,14 +47,23 @@ public abstract class AutoParseCrawler extends Crawler implements Executor, Visi
     protected Visitor visitor;
     protected Requester requester;
 
+    protected VisitorMethodDispatcher visitorMethodDispatcher;
+
     public AutoParseCrawler(boolean autoParse) {
         this.autoParse = autoParse;
         this.requester = new OkHttpRequester();
         this.visitor = this;
         this.executor = this;
     }
 
-    // @Override
+    @Override
+    public void start(int depth) throws Exception {
+        this.visitorMethodDispatcher = new VisitorMethodDispatcher(visitor, autoParse, regexRule);
+        ConfigurationUtils.setTo(this, this.visitorMethodDispatcher);
+        super.start(depth);
+    }
+
+    // @Override
     // public Page getResponse(CrawlDatum crawlDatum) throws Exception {
     //     HttpRequest request = new HttpRequest(crawlDatum);
     //     return request.responsePage();
@@ -75,28 +85,12 @@ protected void registerOtherConfigurations() {
     @Override
     public void execute(CrawlDatum datum, CrawlDatums next) throws Exception {
         Page page = requester.getResponse(datum);
-        visitor.visit(page, next);
-        if (autoParse && !regexRule.isEmpty()) {
-            parseLink(page, next);
-        }
-        afterParse(page, next);
-    }
-
-    protected void afterParse(Page page, CrawlDatums next) {
+        // visitor.visit(page, next);
+        visitorMethodDispatcher.dispatch(page, next);
 
     }
 
-    protected void parseLink(Page page, CrawlDatums next) {
-        String conteType = page.contentType();
-        if (conteType != null && conteType.contains("text/html")) {
-            Document doc = page.doc();
-            if (doc != null) {
-                Links links = new Links().addByRegex(doc, regexRule, getConf().getAutoDetectImg());
-                next.add(links);
-            }
-        }
 
-    }
 
     /**
      * Add a URL regex constraint
@@ -151,14 +145,14 @@ public Visitor getVisitor() {
         return visitor;
     }
 
-    /**
-     * Set the Visitor
-     *
-     * @param visitor the Visitor
-     */
-    public void setVisitor(Visitor visitor) {
-        this.visitor = visitor;
-    }
+    // /**
+    //  * Set the Visitor
+    //  *
+    //  * @param visitor the Visitor
+    //  */
+    // public void setVisitor(Visitor visitor) {
+    //     this.visitor = visitor;
+    // }
 
     public Requester getRequester() {
         return requester;

WebCollector/src/main/java/cn/edu/hfut/dmic/webcollector/crawler/Crawler.java

Lines changed: 8 additions & 8 deletions
@@ -397,14 +397,14 @@ public Executor getExecutor() {
         return executor;
     }
 
-    /**
-     * Set the executor
-     *
-     * @param executor the executor
-     */
-    public void setExecutor(Executor executor) {
-        this.executor = executor;
-    }
+    // /**
+    //  * Set the executor
+    //  *
+    //  * @param executor the executor
+    //  */
+    // public void setExecutor(Executor executor) {
+    //     this.executor = executor;
+    // }
 
     /**
      * Returns the upper limit on the number of pages fetched in each iteration
WebCollector/src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedBingCrawler.java

Lines changed: 148 additions & 0 deletions

@@ -0,0 +1,148 @@
/*
 * Copyright (C) 2015 hu
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
package cn.edu.hfut.dmic.webcollector.example;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.ram.RamCrawler;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.URLEncoder;

/**
 * This tutorial demonstrates new features introduced in WebCollector 2.20:
 *
 * 1) MetaData:
 * MetaData is extra information attached to each crawl task. Used well, it greatly
 * simplifies crawler design. For example, POST requests usually carry parameters, and
 * the traditional approach of encoding parameters only in the URL does not fit complex
 * POST requests. Some crawl tasks also need the depth of a page in the traversal tree,
 * which is easy to obtain with MetaData (see the DemoDepthCrawler tutorial).
 *
 * 2) RamCrawler:
 * RamCrawler does not depend on the file system or a database and is suited to one-off
 * crawl tasks. For long-running tasks, use BreadthCrawler instead.
 *
 * This tutorial implements a crawler that fetches the first n pages of Bing search
 * results for a keyword and writes the results to standard output.
 * To collect results into a data structure such as an ArrayList instead, define an
 * ArrayList field in the class and insert results into it. Note that the crawler is
 * multi-threaded and ArrayList is not thread-safe, so wrap each insert in
 * synchronized(this){ // insert } to keep it safe.
 *
 * The parsing rules for Bing search used here may break when Bing changes its layout.
 *
 * @author hu
 */
public class DemoAnnotatedBingCrawler extends RamCrawler {

    public DemoAnnotatedBingCrawler(String keyword, int maxPageNum) throws Exception {
        for (int pageNum = 1; pageNum <= maxPageNum; pageNum++) {
            String url = createBingUrl(keyword, pageNum);
            addSeedAndReturn(url)
                    .type("searchEngine")
                    .meta("keyword", keyword)
                    .meta("pageNum", pageNum)
                    .meta("depth", 1);
        }
    }

    // On a 301 or 302 response, follow the redirect manually (add the target to next)
    // and copy the task's meta
    @MatchCode(codes = {301, 302})
    public void visitRedirect(Page page, CrawlDatums next) {
        next.addAndReturn(page.location()).meta(page.meta());
    }

    @MatchType(types = "searchEngine")
    public void visitSearchEngine(Page page, CrawlDatums next) {
        String keyword = page.meta("keyword");
        int pageNum = page.metaAsInt("pageNum");
        System.out.println("Successfully fetched page " + pageNum + " of search results for keyword " + keyword);
        Elements results = page.select("li.b_algo>h2>a");

        for (int rank = 0; rank < results.size(); rank++) {
            Element result = results.get(rank);
            /*
            We also want to crawl the page each search result points to; these pages are
            called outlinks here. When visiting an outlink we still want to know which
            result page and which position it came from, so the page number and rank are
            stored in the follow-up CrawlDatum. To distinguish outlinks from search-engine
            result pages, type is set to "outlink"; this value is entirely user-defined
            and can be anything.
            */
            String href = result.attr("abs:href");
            next.addAndReturn(href)
                    .type("outlink")
                    .meta("keyword", keyword)
                    .meta("pageNum", pageNum)
                    .meta("rank", rank);
        }
    }

    @MatchType(types = "outlink")
    public void visitOutlink(Page page, CrawlDatums next) {
        int depth = page.metaAsInt("depth");
        int pageNum = page.metaAsInt("pageNum");
        int rank = page.metaAsInt("rank");
        String referer = page.meta("referer");

        String line = String.format("page %s, result %s: %s (%s bytes)\tdepth=%s\treferer=%s",
                pageNum, rank + 1, page.doc().title(), page.content().length, depth, referer);
        System.out.println(line);
    }

    @Override
    public void visit(Page page, CrawlDatums next) {

    }

    /*
    In a classic crawler, every page has a referer value: the page its link was found on.
    For example, if we first visit the Sina homepage and extract news links from it, the
    referer of those news pages is the Sina homepage. WebCollector does not store referer
    directly, but the same effect can be achieved by keeping the referer in metaData as
    shown below. Anchor-text storage in classic crawlers can be handled the same way.

    Some tasks need the depth of the current page in the traversal tree; this is also
    easy with metaData: when adding a CrawlDatum to next, set its depth to the current
    page's depth + 1.
    */
    @AfterParse
    public void afterParse(Page page, CrawlDatums next) {
        int depth = page.metaAsInt("depth");
        next.meta("depth", depth + 1).meta("referer", page.url());
    }


    public static void main(String[] args) throws Exception {
        // the search keyword "网络爬虫" means "web crawler"
        DemoAnnotatedBingCrawler crawler = new DemoAnnotatedBingCrawler("网络爬虫", 3);
        crawler.start();
    }

    /**
     * Build the Bing search URL for a keyword and page number
     * @param keyword the search keyword
     * @param pageNum the page number
     * @return the corresponding URL
     * @throws Exception on encoding errors
     */
    public static String createBingUrl(String keyword, int pageNum) throws Exception {
        int first = pageNum * 10 - 9;
        keyword = URLEncoder.encode(keyword, "utf-8");
        return String.format("http://cn.bing.com/search?q=%s&first=%s", keyword, first);
    }

}
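The class comment in DemoAnnotatedBingCrawler above suggests collecting results into an ArrayList field and guarding inserts with synchronized(this), since visit methods run on multiple worker threads. A minimal sketch of that pattern follows; it is not part of this commit, and the ResultCollectingCrawler class, its seed URL, and its field names are illustrative assumptions.

// Hypothetical example, not part of this commit: a RamCrawler that stores
// "url<TAB>title" lines in a shared list. visit() runs on multiple threads and
// ArrayList is not thread-safe, so every insert is wrapped in synchronized(this).
package cn.edu.hfut.dmic.webcollector.example;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.ram.RamCrawler;

import java.util.ArrayList;

public class ResultCollectingCrawler extends RamCrawler {

    private final ArrayList<String> results = new ArrayList<>();

    public ResultCollectingCrawler(String seedUrl) {
        addSeed(seedUrl);
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        // lock before mutating the shared, non-thread-safe list
        synchronized (this) {
            results.add(page.url() + "\t" + page.doc().title());
        }
    }

    public ArrayList<String> getResults() {
        // return a snapshot so callers never iterate a list that is still being mutated
        synchronized (this) {
            return new ArrayList<>(results);
        }
    }

    public static void main(String[] args) throws Exception {
        ResultCollectingCrawler crawler = new ResultCollectingCrawler("https://blog.csdn.net/");
        crawler.start(1);
        crawler.getResults().forEach(System.out::println);
    }
}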
WebCollector/src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoAnnotatedCrawler.java

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
package cn.edu.hfut.dmic.webcollector.example;


import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;

public class DemoAnnotatedCrawler extends BreadthCrawler {

    /**
     * Construct a RocksDB-backed crawler.
     * The RocksDB directory is crawlPath; it holds the crawl history (visited URLs etc.).
     * Do not use the same crawlPath for different tasks:
     * two crawlers running in parallel on the same crawlPath will produce errors.
     *
     * @param crawlPath the directory used by RocksDB
     * @param autoParse whether to detect new URLs automatically using the configured regexes
     */
    public DemoAnnotatedCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        addSeed("https://blog.csdn.net/", "seed");
        addRegex("https://blog.csdn.net/.*");
    }



    @MatchUrlRegexRule(urlRegexRule = {
            "https://blog.csdn.net/.*"
    })
    @MatchNullType()
    public void visitMain(Page page, CrawlDatums next) {
        System.out.println("this is regex seed");
    }

    // @MatchUrl(urlRegex = "https://blog.csdn.net/.*")
    // public void visitOther(Page page, CrawlDatums next) {
    //     System.out.println("this is other");
    // }


    // @MatchUrl(urlRegex = "https://blog.csdn.net/.*")
    // public void visitOther(Page page, CrawlDatums next) {
    //     System.out.println("this is other");
    // }


    // @MatchType(types = "seed")
    // public void visitSeed(Page page, CrawlDatums next) {
    //     System.out.println("this is type seed");
    // }


    @Override
    public void visit(Page page, CrawlDatums next) {
        // System.out.println("this is default");
    }

    public static void main(String[] args) throws Exception {
        DemoAnnotatedCrawler crawler = new DemoAnnotatedCrawler("crawl", true);
        crawler.start(2);
    }
}

0 commit comments