Skip to content

Commit 68515a8

Browse files
author
jack
committed
webClient.options.isJavaScriptEnabled = false //2-8 RULE,提高爬虫抓取性能,我们不启用JS解释器,默认为true
1 parent 68c3003 commit 68515a8

File tree

5 files changed

+117
-13
lines changed

5 files changed

+117
-13
lines changed

src/main/kotlin/com/light/saber/controller/KnowledgeCrawController.kt

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
package com.light.saber.controller
22

3-
import com.light.saber.crawler.KnowledgeCrawlerOfSpring4All
3+
import com.light.saber.crawler.*
44
import com.light.saber.service.CrawKnowledgeService
55
import org.springframework.beans.factory.annotation.Autowired
66
import org.springframework.web.bind.annotation.GetMapping
@@ -17,6 +17,10 @@ class KnowledgeCrawController {
1717
lateinit var KnowledgeCrawlerOfSpring4All: KnowledgeCrawlerOfSpring4All
1818

1919

20+
@Autowired
21+
lateinit var KnowledgeCrawlerOfConcurrentProgramming: KnowledgeCrawlerOfConcurrentProgramming
22+
23+
2024
@GetMapping("/knowledge/doCrawJianShu")
2125
fun doCrawJianShu(): String {
2226
Thread {
@@ -100,4 +104,14 @@ class KnowledgeCrawController {
100104
return "DONE"
101105
}
102106

107+
108+
/**
 * Starts the concurrent-programming crawler on a freshly created thread and answers immediately.
 *
 * @return the literal "DONE", sent right after the thread has been started; it does not
 *         indicate that the crawl has finished or succeeded.
 */
@GetMapping("/knowledge/KnowledgeCrawlerOfConcurrentProgramming")
fun KnowledgeCrawlerOfConcurrentProgramming(): String {
    // The crawl runs off the request thread, so the HTTP response does not wait for it.
    val crawlJob = Runnable { KnowledgeCrawlerOfConcurrentProgramming.doCraw() }
    Thread(crawlJob).start()
    return "DONE"
}
116+
103117
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
package com.light.saber.crawler
2+
3+
import com.alibaba.microtek.crawler.KnowledgeWebCrawler
4+
import com.light.saber.service.KnowledgeService
5+
import com.light.saber.webclient.CrawlerWebClient
6+
import org.jsoup.Jsoup
7+
import org.jsoup.nodes.Element
8+
import org.jsoup.select.Elements
9+
import org.springframework.beans.factory.annotation.Autowired
10+
import org.springframework.beans.factory.annotation.Qualifier
11+
import org.springframework.scheduling.annotation.Scheduled
12+
import org.springframework.stereotype.Service
13+
14+
/**
 * Crawler for the paginated article listing at `http://ifeve.com/page/<n>`.
 *
 * For every listing page it collects the elements carrying the title CSS class, follows the
 * link found in each element's first child, extracts the `post_content` markup of the linked
 * page and hands URL, title and body to [KnowledgeService].
 */
@Service
class KnowledgeCrawlerOfConcurrentProgramming : KnowledgeWebCrawler {
    @Autowired
    @Qualifier("knowledgeService")
    lateinit var KnowledgeService: KnowledgeService

    @Autowired
    lateinit var CrawlerWebClient: CrawlerWebClient

    /** Returns the inner HTML of the elements with class `post_content` found under [e]. */
    override fun getArticleBody(e: Element): String = e.getElementsByClass("post_content").html()

    /** Builds the URL of listing page number [page]. */
    override fun pageUrls(page: Int): String = "http://ifeve.com/page/${page}"

    /**
     * Downloads [url] and returns every element whose class is [className].
     *
     * @return the matching elements, or an empty [Elements] when the page could not be fetched.
     */
    override fun getArticleListDocument(url: String, className: String): Elements {
        // getPageHtmlText returns null on failure; parsing null would throw and, because this
        // is called outside doCraw's per-article try/catch, abort the entire crawl.
        val listPageHtml = CrawlerWebClient.getPageHtmlText(url) ?: return Elements()
        return Jsoup.parse(listPageHtml).getElementsByClass(className)
    }

    /** Returns the `href` attribute of the first child of [e]; throws if [e] has no children. */
    override fun getArticleUrl(e: Element): String = e.child(0).attr("href")

    /** Returns the inner HTML of the first child of [e]; throws if [e] has no children. */
    override fun getArticleTitle(e: Element): String = e.child(0).html()

    /**
     * Crawls listing pages [FIRST_PAGE]..[LAST_PAGE] and saves every article found.
     *
     * Scheduled with the Spring cron expression `0 30 1 1/1 * ?` (second 0, minute 30, hour 1,
     * every day of the month), and also invoked directly by the controller.
     * A failure on one article or one listing page never stops the remaining work.
     */
    @Scheduled(cron = "0 30 1 1/1 * ?")
    override fun doCraw() {
        for (page in FIRST_PAGE..LAST_PAGE) {
            val titleElements = getArticleListDocument(pageUrls(page), TITLE_CLASS)
            titleElements.forEach { saveArticle(it) }
        }
    }

    /** Fetches the article linked from [titleElement] and persists it; errors are reported and swallowed. */
    private fun saveArticle(titleElement: Element) {
        try {
            val articleUrl = getArticleUrl(titleElement)
            val articleTitle = getArticleTitle(titleElement)
            // A null page means the download failed (already reported by the web client);
            // skip the article instead of letting the parser fail on null.
            val articleHtml = CrawlerWebClient.getPageHtmlText(articleUrl) ?: return
            val articleBody = getArticleBody(Jsoup.parse(articleHtml))
            println(articleTitle)
            println(articleUrl)
            KnowledgeService.doSaveKnowledge(articleUrl, articleTitle, articleBody)
        } catch (e: Exception) {
            // Best-effort crawl: report and continue with the next article.
            e.printStackTrace()
        }
    }

    private companion object {
        /** First listing page to crawl. */
        private const val FIRST_PAGE = 1

        /** Last listing page to crawl (hard-coded in the original implementation). */
        private const val LAST_PAGE = 103

        /** CSS class of the elements that wrap each article link on a listing page. */
        private const val TITLE_CLASS = "title"
    }
}

src/main/kotlin/com/light/saber/crawler/KnowledgeWebCrawler.kt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
1-
package com.light.saber.crawler
1+
package com.alibaba.microtek.crawler
22

33
import org.jsoup.nodes.Element
44
import org.jsoup.select.Elements
55

66
interface KnowledgeWebCrawler {
77
fun pageUrls(page: Int): String
88
fun getArticleListDocument(url: String, className: String): Elements
9-
fun getArticleUrl(e: Element, className: String): String
10-
fun getArticleTitle(e: Element, className: String): String
9+
fun getArticleUrl(e: Element): String
10+
fun getArticleTitle(e: Element): String
1111
fun getArticleBody(e: Element): String
1212

1313
fun doCraw()

src/main/kotlin/com/light/saber/webclient/CrawlerWebClient.kt

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,31 +2,63 @@ package com.light.saber.webclient
22

33

44
import com.gargoylesoftware.htmlunit.WebClient
5+
import com.gargoylesoftware.htmlunit.WebRequest
6+
import com.gargoylesoftware.htmlunit.WebResponse
57
import com.gargoylesoftware.htmlunit.html.HtmlPage
8+
import com.gargoylesoftware.htmlunit.util.WebConnectionWrapper
69
import org.springframework.stereotype.Service
10+
import java.io.IOException
11+
712

813
/**
 * Thin wrapper around HtmlUnit's [WebClient] that downloads a page and returns it as XML text.
 *
 * A new [WebClient] is created for every request and closed when the request is done.
 */
@Service
class CrawlerWebClient {

    /**
     * Creates a [WebClient] configured for plain HTML scraping.
     *
     * @param javaScriptTimeout value assigned to the client's JavaScript timeout; JavaScript is
     *        disabled below, so it presumably has no effect unless scripting is re-enabled.
     */
    private fun instanceWebClient(javaScriptTimeout: Long): WebClient {
        val webClient = WebClient()
        webClient.javaScriptTimeout = javaScriptTimeout
        // 80/20 rule: the JavaScript interpreter is switched off to speed up crawling
        // (the original comment states it is enabled by default).
        webClient.options.isJavaScriptEnabled = false
        webClient.options.isCssEnabled = false
        // Whether to throw when a script fails at runtime.
        webClient.options.isThrowExceptionOnScriptError = false
        webClient.options.isUseInsecureSSL = true
        webClient.options.isDoNotTrackEnabled = false
        // Wrap the connection so every request URL and response body is echoed to stdout.
        // NOTE(review): printing the whole body of every response is expensive on a crawl of
        // this size — consider reducing it to the status code once debugging is finished.
        webClient.webConnection = object : WebConnectionWrapper(webClient) {
            @Throws(IOException::class)
            override fun getResponse(request: WebRequest): WebResponse {
                val response = super.getResponse(request)
                val requestUrl = request.getUrl()
                println("requestUrl=$requestUrl")
                println("response=${response.contentAsString}")
                return response
            }
        }
        return webClient
    }

    /**
     * Downloads [url] and returns the page serialised as XML.
     *
     * @return the page text, or `null` when loading or serialising the page threw an exception
     *         (the stack trace is printed).
     */
    fun getPageHtmlText(url: String): String? {
        val webClient = instanceWebClient(7000)
        return try {
            webClient.getPage<HtmlPage>(url).asXml()
        } catch (e: Exception) {
            e.printStackTrace()
            null
        } finally {
            // Always release the client, even when a non-Exception Throwable escapes the
            // catch above; previously close() was skipped in that case.
            webClient.close()
        }
    }

}
52+
53+
54+
/**
55+
可以通过page.executeJavaScript来执行js
56+
57+
例如:
58+
59+
HtmlPage page = wc.getPage("http://xxx.com/");
60+
wc.waitForBackgroundJavaScript(30 * 1000); /* will wait JavaScript to execute up to 30s */
61+
ScriptResult result = page.executeJavaScript("document.getElementById('pk_1248827').onmouseover(window.event)");
62+
HtmlPage jspage = (HtmlPage) result.getNewPage();
63+
64+
*/

src/main/resources/templates/common/head.ftl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
<dd><a href="/knowledge/doCrawInfoQKnowledge" target="_blank">抓取InfoQ</a></dd>
5151
<dd><a href="/knowledge/doCrawBlockChainKnowledge" target="_blank">BlockChain</a></dd>
5252
<dd><a href="/knowledge/KnowledgeCrawlerOfSpring4All" target="_blank">Spring4All</a></dd>
53+
<dd><a href="/knowledge/KnowledgeCrawlerOfConcurrentProgramming" target="_blank">并发编程网</a></dd>
5354
<dd><a href="">超链接</a></dd>
5455
</dl>
5556
</li>

0 commit comments

Comments
 (0)