webClient.options.isJavaScriptEnabled = false //2-8 RULE,提高爬虫抓取性能，我们不启用JS解释器，默认为true

jack · jack · commit 68515a8fa806 · 2018-05-31T12:01:16.000+08:00
diff --git a/src/main/kotlin/com/light/saber/controller/KnowledgeCrawController.kt b/src/main/kotlin/com/light/saber/controller/KnowledgeCrawController.kt
@@ -1,6 +1,6 @@
 package com.light.saber.controller
 
-import com.light.saber.crawler.KnowledgeCrawlerOfSpring4All
+import com.light.saber.crawler.*
 import com.light.saber.service.CrawKnowledgeService
 import org.springframework.beans.factory.annotation.Autowired
 import org.springframework.web.bind.annotation.GetMapping
@@ -17,6 +17,10 @@ class KnowledgeCrawController {
     lateinit var KnowledgeCrawlerOfSpring4All: KnowledgeCrawlerOfSpring4All
 
 
+    @Autowired
+    lateinit var KnowledgeCrawlerOfConcurrentProgramming: KnowledgeCrawlerOfConcurrentProgramming
+
+
     @GetMapping("/knowledge/doCrawJianShu")
     fun doCrawJianShu(): String {
         Thread {
@@ -100,4 +104,14 @@ class KnowledgeCrawController {
         return "DONE"
     }
 
+
+    /**
+     * HTTP trigger for the crawler bean held in the `KnowledgeCrawlerOfConcurrentProgramming` property.
+     *
+     * The crawl runs on a newly started thread, so the request returns right away.
+     *
+     * @return the literal `"DONE"`; it confirms the crawl was started, not that it finished
+     */
+    @GetMapping("/knowledge/KnowledgeCrawlerOfConcurrentProgramming")
+    fun KnowledgeCrawlerOfConcurrentProgramming(): String {
+        val crawlThread = Thread(Runnable { KnowledgeCrawlerOfConcurrentProgramming.doCraw() })
+        crawlThread.start()
+        return "DONE"
+    }
+
 }
diff --git a/src/main/kotlin/com/light/saber/crawler/KnowledgeCrawlerOfConcurrentProgramming.kt b/src/main/kotlin/com/light/saber/crawler/KnowledgeCrawlerOfConcurrentProgramming.kt
@@ -0,0 +1,57 @@
+package com.light.saber.crawler
+
+import com.alibaba.microtek.crawler.KnowledgeWebCrawler
+import com.light.saber.service.KnowledgeService
+import com.light.saber.webclient.CrawlerWebClient
+import org.jsoup.Jsoup
+import org.jsoup.nodes.Element
+import org.jsoup.select.Elements
+import org.springframework.beans.factory.annotation.Autowired
+import org.springframework.beans.factory.annotation.Qualifier
+import org.springframework.scheduling.annotation.Scheduled
+import org.springframework.stereotype.Service
+
+/**
+ * Crawler for the article listing served under http://ifeve.com/page/N.
+ *
+ * Walks the numbered list pages, downloads every linked article and passes its
+ * URL, title and body HTML to [KnowledgeService] for storage.
+ */
+@Service
+class KnowledgeCrawlerOfConcurrentProgramming : KnowledgeWebCrawler {
+    @Autowired
+    @Qualifier("knowledgeService")
+    lateinit var KnowledgeService: KnowledgeService
+    @Autowired
+    lateinit var CrawlerWebClient: CrawlerWebClient
+
+    /** Returns the inner HTML of the elements with class `post_content` found under [e]. */
+    override fun getArticleBody(e: Element): String = e.getElementsByClass("post_content").html()
+
+    /** Builds the URL of list page number [page]. */
+    override fun pageUrls(page: Int): String = "http://ifeve.com/page/${page}"
+
+    /**
+     * Downloads [url] and returns every element carrying the CSS class [className].
+     *
+     * @return the matching elements, or an empty [Elements] when the page could not
+     * be downloaded, so that one unreachable list page does not abort the crawl
+     */
+    override fun getArticleListDocument(url: String, className: String): Elements {
+        // getPageHtmlText yields null on failure, and Jsoup.parse must not be handed null.
+        val listPageHtml = CrawlerWebClient.getPageHtmlText(url) ?: return Elements()
+        return Jsoup.parse(listPageHtml).getElementsByClass(className)
+    }
+
+    /** Returns the `href` attribute of the first child of list entry [e]. */
+    override fun getArticleUrl(e: Element): String = e.child(0).attr("href")
+
+    /** Returns the inner HTML of the first child of list entry [e], used as the title. */
+    override fun getArticleTitle(e: Element): String = e.child(0).html()
+
+    /**
+     * Crawls list pages [FIRST_PAGE] through [LAST_PAGE] and stores every article found.
+     *
+     * Also run by Spring's scheduler: the cron expression fires at 01:30 every day.
+     * A list page that cannot be fetched contributes no entries, and a failure on a
+     * single article is printed and skipped, so the loop always visits every page.
+     */
+    @Scheduled(cron = "0 30 1 1/1 * ?")
+    override fun doCraw() {
+        for (p in FIRST_PAGE..LAST_PAGE) {
+            val pageList = getArticleListDocument(pageUrls(p), "title")
+            pageList.forEach { crawlArticle(it) }
+        }
+    }
+
+    /** Downloads the article linked from list entry [entry] and saves it; any error is printed, not rethrown. */
+    private fun crawlArticle(entry: Element) {
+        try {
+            val articleUrl = getArticleUrl(entry)
+            val articleTitle = getArticleTitle(entry)
+            // A null result means the download failed; there is nothing to parse or save.
+            val articleHTML = CrawlerWebClient.getPageHtmlText(articleUrl) ?: return
+            val articleBody = getArticleBody(Jsoup.parse(articleHTML))
+            println(articleTitle)
+            println(articleUrl)
+            KnowledgeService.doSaveKnowledge(articleUrl, articleTitle, articleBody)
+        } catch (e: Exception) {
+            e.printStackTrace()
+        }
+    }
+
+    companion object {
+        /** First list page to crawl. */
+        private const val FIRST_PAGE = 1
+
+        /** Last list page to crawl (inclusive). */
+        private const val LAST_PAGE = 103
+    }
+}
diff --git a/src/main/kotlin/com/light/saber/crawler/KnowledgeWebCrawler.kt b/src/main/kotlin/com/light/saber/crawler/KnowledgeWebCrawler.kt
@@ -1,13 +1,13 @@
-package com.light.saber.crawler
+package com.alibaba.microtek.crawler
 
 import org.jsoup.nodes.Element
 import org.jsoup.select.Elements
 
 interface KnowledgeWebCrawler {
     fun pageUrls(page: Int): String
     fun getArticleListDocument(url: String, className: String): Elements
-    fun getArticleUrl(e: Element, className: String): String
-    fun getArticleTitle(e: Element, className: String): String
+    fun getArticleUrl(e: Element): String
+    fun getArticleTitle(e: Element): String
     fun getArticleBody(e: Element): String
 
     fun doCraw()
diff --git a/src/main/kotlin/com/light/saber/webclient/CrawlerWebClient.kt b/src/main/kotlin/com/light/saber/webclient/CrawlerWebClient.kt
@@ -2,31 +2,63 @@ package com.light.saber.webclient
 
 
 import com.gargoylesoftware.htmlunit.WebClient
+import com.gargoylesoftware.htmlunit.WebRequest
+import com.gargoylesoftware.htmlunit.WebResponse
 import com.gargoylesoftware.htmlunit.html.HtmlPage
+import com.gargoylesoftware.htmlunit.util.WebConnectionWrapper
 import org.springframework.stereotype.Service
+import java.io.IOException
+
 
 @Service
 class CrawlerWebClient {
 
+    /**
+     * Creates a [WebClient] set up for plain HTML fetching: JavaScript and CSS
+     * processing are switched off and insecure SSL is enabled.
+     *
+     * @param javaScriptTimeout value assigned to the client's `javaScriptTimeout`
+     * @return the configured client; the caller is responsible for closing it
+     */
     private fun instanceWebClient(javaScriptTimeout: Long): WebClient {
         val webClient = WebClient()
-        if (javaScriptTimeout > 0) {
-            webClient.javaScriptTimeout = javaScriptTimeout
-        }
-        webClient.options.isJavaScriptEnabled = true //启用JS解释器，默认为true
+        webClient.javaScriptTimeout = javaScriptTimeout
+        webClient.options.isJavaScriptEnabled = false // 80/20 rule: keep the JS interpreter off (it defaults to on) to speed up crawling
         webClient.options.isCssEnabled = false
-        webClient.options.isThrowExceptionOnScriptError = false //js运行错误时，是否抛出异常
+        webClient.options.isThrowExceptionOnScriptError = false // do not throw when a script fails at runtime
         webClient.options.isUseInsecureSSL = true
-        return webClient as WebClient
+        webClient.options.isDoNotTrackEnabled = false
+        // Debug aid: print the URL and the full body of every response that passes through the client.
+        webClient.webConnection = object : WebConnectionWrapper(webClient) {
+            @Throws(IOException::class)
+            override fun getResponse(request: WebRequest): WebResponse {
+                val response = super.getResponse(request)
+                val requestUrl = request.getUrl()
+                println("requestUrl=$requestUrl")
+                println("response=${response.contentAsString}")
+                return response
+            }
+        }
+        return webClient
     }
 
+    /**
+     * Downloads [url] with a freshly created client and returns the page serialised by `asXml()`.
+     *
+     * @param url address of the page to fetch
+     * @return the page markup, or `null` when fetching or parsing threw an [Exception]
+     * (its stack trace is printed)
+     */
     fun getPageHtmlText(url: String): String? {
-        val webClient = instanceWebClient(3000)
+        val webClient = instanceWebClient(7000)
         return try {
             webClient.getPage<HtmlPage>(url).asXml()
         } catch (e: Exception) {
+            e.printStackTrace()
             null
+        } finally {
+            // Runs on every exit path, including Errors that the catch above does not cover,
+            // so the client is never left open.
+            webClient.close()
         }
     }
 
 }
+
+
+/**
+可以通过page.executeJavaScript来执行js
+
+例如：
+
+HtmlPage page = wc.getPage("http://xxx.com/");
+wc.waitForBackgroundJavaScript(30 * 1000); /* will wait JavaScript to execute up to 30s */
+ScriptResult result = page.executeJavaScript("document.getElementById('pk_1248827').onmouseover(window.event)");
+HtmlPage jspage = (HtmlPage) result.getNewPage();
+
+ */
diff --git a/src/main/resources/templates/common/head.ftl b/src/main/resources/templates/common/head.ftl
@@ -50,6 +50,7 @@
                     <dd><a href="/knowledge/doCrawInfoQKnowledge" target="_blank">抓取InfoQ</a></dd>
                     <dd><a href="/knowledge/doCrawBlockChainKnowledge" target="_blank">BlockChain</a></dd>
                     <dd><a href="/knowledge/KnowledgeCrawlerOfSpring4All" target="_blank">Spring4All</a></dd>
+                    <dd><a href="/knowledge/KnowledgeCrawlerOfConcurrentProgramming" target="_blank">并发编程网</a></dd>
                     <dd><a href="">超链接</a></dd>
                 </dl>
             </li>