Skip to content

Commit

Permalink
Fix:crawler4j.MyCraler
Browse files (browse the repository at this point in the history)
  • Loading branch information
gaorui authored and gaorui committed Nov 7, 2017
1 parent c7adf91 commit 7d54158
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 9 deletions.
10 changes: 6 additions & 4 deletions src/main/java/com/myapp/crawler4j/Controller.java
Original file line number | Diff line number | Diff line change
Expand Up @@ -23,19 +23,21 @@ public static void fetchProxyIp() {
List<Crawl> crawlList = new ArrayList(size);
for (int i = 0 ; i < list.size() ; i++) {
try {
Crawl c = new Crawl();
CrawlConfig config = new CrawlConfig();
config.setMaxDepthOfCrawling(0);
config.setPolitenessDelay(0);
config.setCrawlStorageFolder(crawlStorageFolder + "/Controller"+i);
crawlList.get(i).setCrawlConfig(config);
c.setCrawlConfig(config);
PageFetcher pageFetcher = new PageFetcher(config);
crawlList.get(i).setPageFetcherer(pageFetcher);
c.setPageFetcherer(pageFetcher);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
controller.addSeed(list.get(i));
crawlList.get(i).setCrawlController(controller);
crawlList.get(i).setCrawlName("Controller"+i);
c.setCrawlController(controller);
c.setCrawlName("Controller"+i);
crawlList.add(c);
} catch (Exception e) {
e.printStackTrace();
}
Expand Down
10 changes: 6 additions & 4 deletions src/main/java/com/myapp/crawler4j/MyCraler.java
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,6 +21,10 @@ public class MyCraler extends WebCrawler {
private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"
+ "|png|mp3|mp3|zip|gz))$");

private static String page1 = "http://www.kxdaili.com/";
private static String page2 = "http://www.ip181.com/";


/**
* This method receives two parameters. The first parameter is the page
* in which we have discovered this new url and the second parameter is
Expand Down Expand Up @@ -56,9 +60,7 @@ public void visit(Page page) {

try {
Document doc = Jsoup.parse(html);

//doc = new Document(text);
if ("http://www.kxdaili.com/".equals(url)) {
if (page1.equals(url)) {
for (int i = 1; i < 10; i++) {
//System.out.println(doc.toString());
Elements trs = doc.select("table").get(1).select("tr");
Expand All @@ -71,7 +73,7 @@ public void visit(Page page) {
Client.proxyPool.add(ip, port);
redisOnMessageUtil.Push(area, ip, port);
}
} else if ("http://www.ip181.com/".equals(url)) {
} else if (page2.equals(url)) {
for (int i = 1; i < 50; i++) {
Elements trs = doc.select("table").select("tr");
Elements tds = trs.get(i).select("td");
Expand Down
1 change: 0 additions & 1 deletion src/main/java/com/myapp/main/main.java
Original file line number | Diff line number | Diff line change
Expand Up @@ -65,7 +65,6 @@ public A(int j, int z,CountDownLatch latch) {
public void run() {
System.out.println("#####多线程分片跑区间:" + (j * z + 1) + "-" + ((j + 1) * z));
for (int i = j * z + 1; i < (j + 1) * z; i++) {

HttpProxy httpProxy = proxyPool.borrow();
HttpStatus code = ProxyIpCheck.Check(httpProxy.getProxy());
System.err.println("name:" + Thread.currentThread().getName() + httpProxy.getProxy() + ":" + code);
Expand Down

0 comments on commit 7d54158

Please sign in to comment.