Scraping MDN Web API documentation pages in parallel using Futures
Run it with Ammonite:
amm --class-based TestScrapingDocs.sc
Upstream Example: 11.2 - ScrapingDocs
Diff against the upstream example:
diff --git a/11.2 - ScrapingDocs/ScrapingDocs.sc b/13.6 - ParallelScrapingDocs/ScrapingDocs.sc
index 2fb7417..1549bb4 100644
--- a/11.2 - ScrapingDocs/ScrapingDocs.sc
+++ b/13.6 - ParallelScrapingDocs/ScrapingDocs.sc
@@ -1,9 +1,12 @@
import $ivy.`org.jsoup:jsoup:1.13.1`, org.jsoup._
import collection.JavaConverters._
+import scala.concurrent._, duration.Duration.Inf, java.util.concurrent.Executors
+implicit val ec = ExecutionContext.fromExecutorService(Executors.newFixedThreadPool(8))
+
val indexDoc = Jsoup.connect("https://developer.mozilla.org/en-US/docs/Web/API").get()
val links = indexDoc.select("h2#Interfaces").nextAll.select("div.index a").asScala
val linkData = links.map(link => (link.attr("href"), link.attr("title"), link.text))
-val articles = for ((url, tooltip, name) <- linkData) yield {
+val articlesFutures = for ((url, tooltip, name) <- linkData) yield Future{
println("Scraping " + name)
val doc = Jsoup.connect("https://developer.mozilla.org" + url).get()
val summary = doc.select("article#wikiArticle > p").asScala.headOption match {
@@ -15,3 +18,5 @@ val articles = for ((url, tooltip, name) <- linkData) yield {
.map(el => (el.text, el.nextElementSibling match {case null => ""; case x => x.text}))
(url, tooltip, name, summary, methodsAndProperties)
}
+
+val articles = articlesFutures.map(Await.result(_, Inf))