diff --git a/src/main/java/com/myapp/crawler4j/Controller.java b/src/main/java/com/myapp/crawler4j/Controller.java index 07a5e37..d131a5b 100644 --- a/src/main/java/com/myapp/crawler4j/Controller.java +++ b/src/main/java/com/myapp/crawler4j/Controller.java @@ -23,19 +23,21 @@ public static void fetchProxyIp() { List crawlList = new ArrayList(size); for (int i = 0 ; i < list.size() ; i++) { try { + Crawl c = new Crawl(); CrawlConfig config = new CrawlConfig(); config.setMaxDepthOfCrawling(0); config.setPolitenessDelay(0); config.setCrawlStorageFolder(crawlStorageFolder + "/Controller"+i); - crawlList.get(i).setCrawlConfig(config); + c.setCrawlConfig(config); PageFetcher pageFetcher = new PageFetcher(config); - crawlList.get(i).setPageFetcherer(pageFetcher); + c.setPageFetcherer(pageFetcher); RobotstxtConfig robotstxtConfig = new RobotstxtConfig(); RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher); CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer); controller.addSeed(list.get(i)); - crawlList.get(i).setCrawlController(controller); - crawlList.get(i).setCrawlName("Controller"+i); + c.setCrawlController(controller); + c.setCrawlName("Controller"+i); + crawlList.add(c); } catch (Exception e) { e.printStackTrace(); } diff --git a/src/main/java/com/myapp/crawler4j/MyCraler.java b/src/main/java/com/myapp/crawler4j/MyCraler.java index fce6f67..18c637a 100644 --- a/src/main/java/com/myapp/crawler4j/MyCraler.java +++ b/src/main/java/com/myapp/crawler4j/MyCraler.java @@ -21,6 +21,10 @@ public class MyCraler extends WebCrawler { private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg" + "|png|mp3|mp3|zip|gz))$"); + private static String page1 = "http://www.kxdaili.com/"; + private static String page2 = "http://www.ip181.com/"; + + /** * This method receives two parameters. The first parameter is the page * in which we have discovered this new url and the second parameter is @@ -56,9 +60,7 @@ public void visit(Page page) { try { Document doc = Jsoup.parse(html); - - //doc = new Document(text); - if ("http://www.kxdaili.com/".equals(url)) { + if (page1.equals(url)) { for (int i = 1; i < 10; i++) { //System.out.println(doc.toString()); Elements trs = doc.select("table").get(1).select("tr"); @@ -71,7 +73,7 @@ public void visit(Page page) { Client.proxyPool.add(ip, port); redisOnMessageUtil.Push(area, ip, port); } - } else if ("http://www.ip181.com/".equals(url)) { + } else if (page2.equals(url)) { for (int i = 1; i < 50; i++) { Elements trs = doc.select("table").select("tr"); Elements tds = trs.get(i).select("td"); diff --git a/src/main/java/com/myapp/main/main.java b/src/main/java/com/myapp/main/main.java index 1dbe464..4e6d70d 100644 --- a/src/main/java/com/myapp/main/main.java +++ b/src/main/java/com/myapp/main/main.java @@ -65,7 +65,6 @@ public A(int j, int z,CountDownLatch latch) { public void run() { System.out.println("#####多线程分片跑区间:" + (j * z + 1) + "-" + ((j + 1) * z)); for (int i = j * z + 1; i < (j + 1) * z; i++) { - HttpProxy httpProxy = proxyPool.borrow(); HttpStatus code = ProxyIpCheck.Check(httpProxy.getProxy()); System.err.println("name:" + Thread.currentThread().getName() + httpProxy.getProxy() + ":" + code);