Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
## Overview

# funfunnet-crawler
뻔뻔넷 크롤러입니다.


Expand Down
16 changes: 15 additions & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,25 @@ Seq(unmanagedResourceDirectories in Compile += baseDirectory.value / "conf")

libraryDependencies ++= Seq(
"com.typesafe.akka" %% "akka-actor" % Versions.akka,
"com.typesafe.akka" %% "akka-testkit" % Versions.akka,
"com.typesafe.akka" %% "akka-testkit" % Versions.akka % Test,
"com.typesafe.akka" %% "akka-http" % Versions.akka_http,
// "com.typesafe.akka" %% "akka-http-testkit" % Versions.akka_http % Test,
"org.jsoup" % "jsoup" % "1.10.3",

"com.enragedginger" %% "akka-quartz-scheduler" % "1.6.1-akka-2.5.x",
"com.typesafe" % "config" % "1.2.1",

"com.typesafe.scala-logging" %% "scala-logging" % "3.7.2",
"ch.qos.logback" % "logback-classic" % "1.2.3",

"org.json4s" %% "json4s-native" % "3.5.3",

"org.scalatest" %% "scalatest" % "3.0.1" % "test"
)
dependencyOverrides ++= Set(
// akka-http bug : akka-stream 2.4.9 version을 가져옴
"com.typesafe.akka" %% "akka-stream" % Versions.akka
)

// scalastyle configurations
// test task 수행시, scalastyle에 실패하면 에러를 발생시킴
Expand Down
7 changes: 3 additions & 4 deletions conf/application.conf
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
akka {
quartz {
schedules {
Every30Seconds {
description = "A cron job that fires off every 30 seconds"
expression = "*/30 * * ? * *"
// calendar = "OnlyBusinessHours"
Crawling {
description = "A cron job that fires off every 1 min"
expression = "0 * * ? * *"
}
}
}
Expand Down
3 changes: 2 additions & 1 deletion project/Versions.scala
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
object Versions {
lazy val scala = "2.12.2"
lazy val scala = "2.12.3"
lazy val akka = "2.5.3"
lazy val akka_http = "10.0.9"
}
15 changes: 3 additions & 12 deletions src/main/scala/net/funfunnet/crawler/Application.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,13 @@ package net.funfunnet.crawler

import akka.actor.{ActorSystem, Props}
import com.typesafe.akka.extension.quartz.QuartzSchedulerExtension
import scala.concurrent.duration._
import net.funfunnet.crawler.actor.{Start, Supervisor}

object Application extends App {

val system = ActorSystem("funfunnet-crawler")

import system.dispatcher

val crawlingBySchedulerActor = system.actorOf(Props[CrawlingActor], name = "crawlingBySchedulerActor")
val crawlingByQuartzActor = system.actorOf(Props[CrawlingActor], name = "crawlingByQuartzActor")


//call by scheduler
val cancellable =
system.scheduler.schedule(5 seconds, 10 seconds, crawlingBySchedulerActor, "scheduler")
val supervisor = system.actorOf(Props[Supervisor], name = "supervisor")

//call by quartz
QuartzSchedulerExtension(system).schedule("Every30Seconds", crawlingByQuartzActor, "quartz")
QuartzSchedulerExtension(system).schedule("Crawling", supervisor, Start)
}
7 changes: 7 additions & 0 deletions src/main/scala/net/funfunnet/crawler/actor/Messages.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package net.funfunnet.crawler.actor

import net.funfunnet.crawler.model.{Article, SiteSource}

/**
 * Trigger message asking the supervisor to begin a crawling round.
 *
 * NOTE(review): the scheduler registration and the supervisor's `receive` both
 * use the bare identifier `Start`, i.e. the companion object of this case class,
 * not an instance created with `Start()`. The two are different values.
 */
case class Start()

/** Request to crawl a single configured source. */
case class Crawl(siteSource: SiteSource)

/** Carries one crawled article back to the actor that requested the crawl. */
case class Result(article: Article)
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package net.funfunnet.crawler
package net.funfunnet.crawler.actor

import akka.actor.{Actor, ActorLogging}

class CrawlingActor extends Actor with ActorLogging{
class Processor extends Actor with ActorLogging{

override def receive: Receive = {
case msg => log.info(s"start crawling by $msg")
Expand Down
30 changes: 30 additions & 0 deletions src/main/scala/net/funfunnet/crawler/actor/Supervisor.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package net.funfunnet.crawler.actor

import akka.actor.{Actor, ActorLogging, Props}
import net.funfunnet.crawler.actor.medium.MediumArticleListCrawler
import net.funfunnet.crawler.model.{Site, SiteSource}

/**
 * Coordinates a crawling round.
 *
 * On `Start` it enqueues one `Crawl` message per configured [[SiteSource]] to
 * itself, forwards each source to the crawler registered for its site, and
 * receives the crawled articles back as `Result` messages.
 */
class Supervisor extends Actor with ActorLogging {

  // Crawlers are created as children of this actor (context.actorOf) rather than
  // as top-level actors (context.system.actorOf). Children are supervised by this
  // actor and are stopped before it restarts, so re-running this constructor does
  // not collide with an already registered actor name.
  val actors = List(
    context.actorOf(Props[MediumArticleListCrawler], name = Site.Medium.toString)
  )

  // Lookup table: actor name (the site name) -> crawler reference.
  val crawlers = actors.map(x => x.path.name -> x).toMap

  override def receive: Receive = {
    // `Start` alone matches the companion object (what the scheduler sends);
    // `Start()` additionally accepts an actual instance of the message class.
    case Start | Start() =>
      log.info("start")
      SiteSource.findAll().foreach(source => self ! Crawl(source))

    case Crawl(siteSource) =>
      log.info(s"crawl to ${siteSource.name}")
      crawlers.get(siteSource.site.toString) match {
        // Sent with this actor as the implicit sender, so results come back here.
        case Some(ref) => ref ! siteSource
        case _ => log.error(s"could not found crawler actor : ${siteSource.site.toString}")
      }

    case Result(article) =>
      // TODO: persist the article
      log.info(s"result : title:${article.title}, url:${article.url}")

    case x =>
      log.warning(s"unknown message type : $x")
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package net.funfunnet.crawler.actor.medium

import java.net.URLEncoder
import java.time.{LocalDateTime, ZonedDateTime}
import java.time.format.DateTimeFormatter

import akka.actor.{Actor, ActorLogging}
import akka.http.scaladsl.Http
import akka.http.scaladsl.model.{HttpRequest, HttpResponse, StatusCodes, Uri}
import akka.pattern.pipe
import akka.stream.{ActorMaterializer, ActorMaterializerSettings}
import akka.util.ByteString
import net.funfunnet.crawler.actor.Result
import net.funfunnet.crawler.common.TimeUtils
import net.funfunnet.crawler.model.Article
import org.jsoup.Jsoup

class MediumArticleCrawler extends Actor with ActorLogging {

import context.dispatcher

private val DATE_REGEX = "\"datePublished\":\"(.{1,30})\",\"dateModified".r

private val http = Http(context.system)

final implicit val materializer: ActorMaterializer =
ActorMaterializer(ActorMaterializerSettings(context.system))

override def receive: Receive = {
case url: String =>
http.singleRequest(HttpRequest(uri = encodeUrl(url))).pipeTo(self)(sender())

case HttpResponse(StatusCodes.OK, headers, entity, _) =>
val sd = sender()
entity.dataBytes.runFold(ByteString(""))(_ ++ _).foreach { body =>
sd ! Result(findArticle(body.utf8String))
}
case resp@HttpResponse(code, _, _, _) =>
log.info("Request failed, response code: " + code)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

error로 로깅해야하지 않나요?

resp.discardEntityBytes()
}

def encodeUrl(url: String) : String = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

뭔가 이미 만들어진 라이브러리가 있을 법한 함수네요.
이미 만들어진 게 없다면 공통 유틸리티로 빼주시면 다른 모듈에서도 사용할 수 있을 것 같네요.

val prefix = url.substring(0, url.lastIndexOf("/") + 1)
val params = url.substring(url.lastIndexOf("/") + 1)
prefix + URLEncoder.encode(params, "UTF-8")
}

/**
 * Builds an [[Article]] from the HTML of an article page.
 *
 * Title, description, URL and image are read from the page's Open Graph
 * `meta` tags; a missing tag yields an empty string. The creation time is
 * taken from the first `datePublished` value matched by `DATE_REGEX`.
 *
 * @param html raw HTML of the article page
 * @return the extracted article
 * @throws NoSuchElementException if no `datePublished` value is found
 */
def findArticle(html: String): Article = {
  val doc = Jsoup.parse(html)

  // Reads the `content` attribute of the Open Graph tag with the given property.
  def openGraph(property: String): String =
    doc.select(s"meta[property=$property]").attr("content")

  val title = openGraph("og:title")
  val desc = openGraph("og:description")
  val url = openGraph("og:url")
  val image = openGraph("og:image")

  // Fail with an explicit message instead of the opaque "None.get"; the exception
  // type is the same one a bare Option.get would raise.
  val dateStr = DATE_REGEX.findFirstMatchIn(html).map(_.group(1)).getOrElse(
    throw new NoSuchElementException("datePublished not found in article html")
  )
  val createdAt = TimeUtils.parseIsoTime(dateStr)

  Article(title, desc, image, url, createdAt)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package net.funfunnet.crawler.actor.medium

import akka.actor.{Actor, ActorLogging, Props}
import akka.http.scaladsl.Http
import akka.http.scaladsl.model.{HttpRequest, HttpResponse, StatusCodes}
import akka.pattern.pipe
import akka.stream.{ActorMaterializer, ActorMaterializerSettings}
import akka.util.ByteString
import net.funfunnet.crawler.model.SiteSource

class MediumArticleListCrawler extends Actor with ActorLogging {

import context.dispatcher

private val MEDIUMID_REGEX = "https:\\/\\/medium.com\\/(.{1,20})\\/latest".r
private val UNIQUESLUG_REGEX = "\"uniqueSlug\":\"(.{1,100})\",\"previewContent\"".r
private val http = Http(context.system)
private val articleCrawler =
context.system.actorOf(Props[MediumArticleCrawler], name = "mediumArticleCrawler")

final implicit val materializer: ActorMaterializer =
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

딱히 사용하는 곳은 없는 것 같은데, 어떤 역할을 하는 건가요??

ActorMaterializer(ActorMaterializerSettings(context.system))

override def receive: Receive = {
case source: SiteSource =>
http.singleRequest(HttpRequest(uri = source.url)).pipeTo(self)(sender())

case HttpResponse(StatusCodes.OK, headers, entity, _) =>
log.info("response ok")
val sd = sender()
entity.dataBytes.runFold(ByteString(""))(_ ++ _).foreach { body =>
log.info(s"Got response, body length: ${body.length}")
findArticleUrls(body.utf8String).foreach(x => {
articleCrawler.tell(x, sd)
})
}
log.info("end of response ok")
case resp@HttpResponse(code, _, _, _) =>
log.info("Request failed, response code: " + code)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

이것도 error 아닐까욤??

resp.discardEntityBytes()
}

/**
 * Derives the base URL of the publication from the first match of
 * `MEDIUMID_REGEX` in the page.
 *
 * @param html raw HTML of the listing page
 * @return `https://medium.com/` followed by the captured id, or `None` when the
 *         pattern does not occur
 */
def findPrefixUrl(html: String): Option[String] =
  for (found <- MEDIUMID_REGEX.findFirstMatchIn(html)) yield {
    val mediumId = found.group(1)
    s"https://medium.com/$mediumId"
  }

/**
 * Collects, in document order, the first capture group of every
 * `UNIQUESLUG_REGEX` match in the page.
 *
 * @param html raw HTML of the listing page
 * @return the captured slugs; empty when nothing matches
 */
def findUniqueSlugs(html: String): List[String] = {
  val slugs = for (found <- UNIQUESLUG_REGEX.findAllMatchIn(html)) yield found.group(1)
  slugs.toList
}

/**
 * Builds the article URLs found in a listing page by joining the publication
 * base URL with each slug.
 *
 * @param html raw HTML of the listing page
 * @return one `prefix/slug` URL per slug, or `Nil` when no base URL is found
 */
def findArticleUrls(html: String): List[String] =
  findPrefixUrl(html) match {
    case Some(prefix) => findUniqueSlugs(html).map(slug => s"$prefix/$slug")
    case None => Nil
  }
}
9 changes: 9 additions & 0 deletions src/main/scala/net/funfunnet/crawler/common/TimeUtils.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package net.funfunnet.crawler.common

import java.time.format.DateTimeFormatter
import java.time.{LocalDateTime, ZonedDateTime}

/** Date/time parsing helpers. */
object TimeUtils {

  /**
   * Parses an ISO-8601 date-time that carries an offset or zone.
   *
   * The offset/zone is dropped without any conversion: the returned value holds
   * the date and time fields exactly as written in `text`.
   *
   * @param text ISO date-time text such as `2017-09-15T06:09:11.760Z`
   * @return the local date-time part of the parsed value
   */
  def parseIsoTime(text: String): LocalDateTime = {
    val zoned = ZonedDateTime.parse(text, DateTimeFormatter.ISO_DATE_TIME)
    zoned.toLocalDateTime
  }
}
11 changes: 11 additions & 0 deletions src/main/scala/net/funfunnet/crawler/model/Article.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package net.funfunnet.crawler.model

import java.time.LocalDateTime

/**
 * A crawled article.
 *
 * @param title     article title
 * @param desc      article description
 * @param image     image URL
 * @param url       article URL
 * @param createdAt creation time as a zone-less local date-time
 */
case class Article(
title: String,
desc: String,
image: String,
url: String,
createdAt: LocalDateTime
)
7 changes: 7 additions & 0 deletions src/main/scala/net/funfunnet/crawler/model/Site.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package net.funfunnet.crawler.model

/** Enumeration of the sites that can be crawled. */
object Site extends Enumeration {
// Alias so the value type can be referred to as `Site` after importing `Site.Site`.
type Site = Value

// Medium (medium.com) publications.
val Medium = Value
}
20 changes: 20 additions & 0 deletions src/main/scala/net/funfunnet/crawler/model/SiteSource.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package net.funfunnet.crawler.model

import net.funfunnet.crawler.model.Site.Site

/**
 * One crawl target.
 *
 * @param id   numeric identifier of the source
 * @param name display name of the source
 * @param site site the source belongs to
 * @param url  URL of the page listing the source's articles
 */
case class SiteSource(id: Int, name: String, site: Site, url: String)

object SiteSource {

/**
 * Returns every configured source. The list is currently hard-coded.
 */
def findAll(): List[SiteSource] = {
// TODO: load the sources from the database instead
List(
SiteSource(id = 1, name = "Rainist Engineering", site = Site.Medium,
url = "https://medium.com/rainist-engineering/latest"),
SiteSource(id = 2, name = "디지털 세상을 만드는 아날로거", site = Site.Medium,
url = "https://medium.com/@goinhacker/latest"),
SiteSource(id = 3, name = "Lazysoul", site = Site.Medium,
url = "https://medium.com/@lazysoul/latest")
)
}
}
240 changes: 240 additions & 0 deletions src/test/resources/crawler/medium/article.html

Large diffs are not rendered by default.

242 changes: 242 additions & 0 deletions src/test/resources/crawler/medium/latest.html

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions src/test/scala/net/funfunnet/crawler/actor/ActorTest.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package net.funfunnet.crawler.actor

// Empty placeholder: no test cases are defined in this class yet.
class ActorTest {

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package net.funfunnet.crawler.actor.common

import net.funfunnet.crawler.common.TimeUtils
import org.scalatest.FunSuite

/** Checks that `TimeUtils.parseIsoTime` keeps the date and time fields of a UTC timestamp. */
class TimeUtilsTest extends FunSuite {
  test("parseIsoTime") {
    val parsed = TimeUtils.parseIsoTime("2017-09-15T06:09:11.760Z")

    // (expected, actual) pairs, checked in order: date fields first, then time fields.
    val checks = Seq(
      2017 -> parsed.getYear,
      9 -> parsed.getMonthValue,
      15 -> parsed.getDayOfMonth,
      6 -> parsed.getHour,
      9 -> parsed.getMinute,
      11 -> parsed.getSecond
    )

    checks.foreach { case (expected, actual) =>
      assertResult(expected)(actual)
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package net.funfunnet.crawler.actor.medium

import akka.actor.ActorSystem
import akka.testkit.{ImplicitSender, TestActorRef, TestKit}
import com.typesafe.scalalogging.LazyLogging
import org.scalatest._

import scala.io.Source

/**
 * Unit tests for the pure helper methods of `MediumArticleCrawler`
 * (`encodeUrl` and `findArticle`), run against a stored HTML fixture.
 */
class MediumArticleCrawlerTest extends TestKit(ActorSystem("MediumArticleCrawlerTest"))
  with ImplicitSender
  with FunSuiteLike
  with Matchers
  with BeforeAndAfterAll
  with LazyLogging {

  // Fixture page. It is decoded explicitly as UTF-8 because the assertions below
  // compare Korean text, and the source is closed once it has been read.
  // Lines are concatenated without separators.
  lazy val html: String = {
    val source = Source.fromURL(
      getClass.getClassLoader.getResource("crawler/medium/article.html"), "UTF-8")
    try source.getLines().mkString finally source.close()
  }

  // Underlying actor instance, so its methods can be called directly.
  lazy val crawler = TestActorRef(new MediumArticleCrawler()).underlyingActor

  override def afterAll(): Unit = {
    TestKit.shutdownActorSystem(system)
  }

  test("encodeUrl") {
    val url = "https://medium.com/rainist-engineering/레이니스트의-기술-블로그를-시작하며-2d757ea69844"
    val encoded = "https://medium.com/rainist-engineering/" +
      "%EB%A0%88%EC%9D%B4%EB%8B%88%EC%8A%A4%ED%8A%B8%EC%9D%98-%EA%B8%B0%EC%88%A0-" +
      "%EB%B8%94%EB%A1%9C%EA%B7%B8%EB%A5%BC-%EC%8B%9C%EC%9E%91%ED%95%98%EB%A9%B0-2d757ea69844"
    assertResult(encoded)(crawler.encodeUrl(url))
  }

  test("findArticle") {
    val article = crawler.findArticle(html)
    assertResult("Kotlin, AWS 그리고 레이니스트와 함께라면 육군훈련소에서도 외롭지 않아 – Rainist Engineering – Medium") {
      article.title
    }
    assertResult("Kotlin과 AWS를 활용해 간단한 Slack 봇을 만든 경험을 통해 뱅크샐러드 Android 앱의 Architecture를 간단히 공유합니다.") {
      article.desc
    }
    assertResult("https://cdn-images-1.medium.com/max/1200/1*KIa4eZARMVelBWNRMd3D9Q.png") {
      article.image
    }
    assertResult("https://medium.com/rainist-engineering/writing-aws-lambda-function-in-kotlin-b3faf3f55777") {
      article.url
    }

    assertResult(6)(article.createdAt.getHour)
    assertResult(9)(article.createdAt.getMinute)
    assertResult(11)(article.createdAt.getSecond)
  }

}
Loading