Skip to content

Latest commit

 

History

History

13.6 - ParallelScrapingDocs

Folders and files

Name
Last commit message
Last commit date

parent directory

..
 
 
 
 
 
 

Example 13.6 - ParallelScrapingDocs

Scraping MDN Web API documentation pages in parallel using Futures

amm --class-based TestScrapingDocs.sc

Upstream Example: 11.2 - ScrapingDocs:

Diff:

diff --git a/11.2 - ScrapingDocs/ScrapingDocs.sc b/13.6 - ParallelScrapingDocs/ScrapingDocs.sc
index 2fb7417..1549bb4 100644
--- a/11.2 - ScrapingDocs/ScrapingDocs.sc	
+++ b/13.6 - ParallelScrapingDocs/ScrapingDocs.sc	
@@ -1,9 +1,12 @@
 import $ivy.`org.jsoup:jsoup:1.13.1`, org.jsoup._
 import collection.JavaConverters._
+import scala.concurrent._, duration.Duration.Inf, java.util.concurrent.Executors
+implicit val ec = ExecutionContext.fromExecutorService(Executors.newFixedThreadPool(8))
+
 val indexDoc = Jsoup.connect("https://developer.mozilla.org/en-US/docs/Web/API").get()
 val links = indexDoc.select("h2#Interfaces").nextAll.select("div.index a").asScala
 val linkData = links.map(link => (link.attr("href"), link.attr("title"), link.text))
-val articles = for ((url, tooltip, name) <- linkData) yield {
+val articlesFutures = for ((url, tooltip, name) <- linkData) yield Future{
   println("Scraping " + name)
   val doc = Jsoup.connect("https://developer.mozilla.org" + url).get()
   val summary = doc.select("article#wikiArticle > p").asScala.headOption match {
@@ -15,3 +18,5 @@ val articles = for ((url, tooltip, name) <- linkData) yield {
     .map(el => (el.text, el.nextElementSibling match {case null => ""; case x => x.text}))
   (url, tooltip, name, summary, methodsAndProperties)
 }
+
+val articles = articlesFutures.map(Await.result(_, Inf))

Downstream Examples