-
Notifications
You must be signed in to change notification settings - Fork 0
Medium Crawler 개발 완료 #4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| ## Overview | ||
|
|
||
| # funfunnet-crawler | ||
| 뻔뻔넷 크롤러 입니다. | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
/** Dependency and language versions kept in one place. */
object Versions {
  // The rendered diff listed both the previous ("2.12.2") and the upgraded ("2.12.3")
  // definition of `scala`. A member cannot be defined twice, so only the upgraded value
  // is kept.
  lazy val scala = "2.12.3"
  lazy val akka = "2.5.3"
  lazy val akka_http = "10.0.9"
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| package net.funfunnet.crawler.actor | ||
|
|
||
| import net.funfunnet.crawler.model.{Article, SiteSource} | ||
|
|
||
/**
 * Asks the receiving actor to start a crawl run.
 *
 * NOTE: this is a case class with an empty parameter list, so an instance is written
 * `Start()`. A receive clause written `case Start =>` matches the companion object
 * instead of an instance; receivers should match `Start()` (or both forms).
 */
final case class Start()

/** Requests a crawl of the given site source. */
final case class Crawl(siteSource: SiteSource)

/** Carries one crawled article. */
final case class Result(article: Article)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,30 @@ | ||
| package net.funfunnet.crawler.actor | ||
|
|
||
| import akka.actor.{Actor, ActorLogging, Props} | ||
| import net.funfunnet.crawler.actor.medium.MediumArticleListCrawler | ||
| import net.funfunnet.crawler.model.{Site, SiteSource} | ||
|
|
||
/**
 * Dispatches crawl requests to the crawler actor registered for each site.
 *
 * Handled messages:
 *  - `Start` / `Start()`: sends one `Crawl` to itself per entry of `SiteSource.findAll()`.
 *  - `Crawl(siteSource)`: forwards the site source to the crawler whose actor name equals
 *    `siteSource.site.toString`, or logs an error when no such crawler exists.
 *  - `Result(article)`: logs the article title and url (persistence is still a TODO).
 *  - anything else: logged as a warning.
 */
class Supervisor extends Actor with ActorLogging {
  // Created through `context.actorOf` (not `context.system.actorOf`) so the crawlers are
  // children of this actor: they stop together with it, and their names can only clash
  // within this actor rather than across all top-level actors.
  val actors = List(
    context.actorOf(Props[MediumArticleListCrawler], name = Site.Medium.toString)
  )

  // Lookup table from actor name (the site name) to the crawler reference.
  val crawlers = actors.map(x => x.path.name -> x).toMap

  override def receive: Receive = {
    // `Start` alone matches only the companion object of the case class; `Start()` matches
    // an actual instance. Accept both so neither spelling ends up as "unknown message".
    case Start | Start() =>
      log.info("start")
      SiteSource.findAll().foreach(self ! Crawl(_))
    case Crawl(siteSource) =>
      log.info(s"crawl to ${siteSource.name}")
      crawlers.get(siteSource.site.toString) match {
        case Some(ref) => ref ! siteSource
        case _ => log.error(s"could not found crawler actor : ${siteSource.site.toString}")
      }
    case Result(article) =>
      // TODO: handle persisting the article
      log.info(s"result : title:${article.title}, url:${article.url}")
    case x =>
      log.warning(s"unknown message type : $x")
  }

}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| package net.funfunnet.crawler.actor.medium | ||
|
|
||
| import java.net.URLEncoder | ||
| import java.time.{LocalDateTime, ZonedDateTime} | ||
| import java.time.format.DateTimeFormatter | ||
|
|
||
| import akka.actor.{Actor, ActorLogging} | ||
| import akka.http.scaladsl.Http | ||
| import akka.http.scaladsl.model.{HttpRequest, HttpResponse, StatusCodes, Uri} | ||
| import akka.pattern.pipe | ||
| import akka.stream.{ActorMaterializer, ActorMaterializerSettings} | ||
| import akka.util.ByteString | ||
| import net.funfunnet.crawler.actor.Result | ||
| import net.funfunnet.crawler.common.TimeUtils | ||
| import net.funfunnet.crawler.model.Article | ||
| import org.jsoup.Jsoup | ||
|
|
||
/**
 * Downloads one Medium article page and sends the extracted article, wrapped in a
 * `Result`, to the actor that requested the crawl.
 *
 * Handled messages:
 *  - `String`: the article url; it is requested over HTTP and the response is piped back
 *    to this actor, keeping the original requester as the sender.
 *  - `HttpResponse` with status 200: the body is collected and parsed; the `Result` goes
 *    to the original requester.
 *  - any other `HttpResponse`: logged as an error and its entity is discarded.
 *  - `Status.Failure`: a failed HTTP request or a failure while reading/parsing the body;
 *    logged as an error.
 */
class MediumArticleCrawler extends Actor with ActorLogging {

  import context.dispatcher

  private val DATE_REGEX = "\"datePublished\":\"(.{1,30})\",\"dateModified".r

  private val http = Http(context.system)

  // Never referenced by name, but required implicitly by `runFold` and
  // `discardEntityBytes` below.
  final implicit val materializer: ActorMaterializer =
    ActorMaterializer(ActorMaterializerSettings(context.system))

  override def receive: Receive = {
    case url: String =>
      http.singleRequest(HttpRequest(uri = encodeUrl(url))).pipeTo(self)(sender())

    case HttpResponse(StatusCodes.OK, _, entity, _) =>
      // Capture the sender now: `sender()` must not be called from a Future callback.
      val sd = sender()
      entity.dataBytes.runFold(ByteString(""))(_ ++ _)
        .map(body => Result(findArticle(body.utf8String)))
        .onComplete {
          case scala.util.Success(result) => sd ! result
          // Route failures back into the actor so they are logged instead of being
          // silently dropped inside the Future.
          case scala.util.Failure(cause) => self ! akka.actor.Status.Failure(cause)
        }

    case resp@HttpResponse(code, _, _, _) =>
      log.error("Request failed, response code: " + code)
      // The entity has to be consumed or discarded, otherwise the connection stays blocked.
      resp.discardEntityBytes()

    case akka.actor.Status.Failure(cause) =>
      log.error(cause, "Crawling article failed")
  }

  /**
   * Percent-encodes the part of `url` after its last `/` as UTF-8 and leaves everything
   * up to and including that `/` untouched. When `url` contains no `/`, the whole string
   * is encoded.
   *
   * @param url article url whose last path segment may contain non-ASCII characters
   * @return the url with its last segment percent-encoded
   */
  def encodeUrl(url: String): String = {
    val (prefix, lastSegment) = url.splitAt(url.lastIndexOf("/") + 1)
    // URLEncoder implements form encoding and writes a space as "+". Inside a URL path a
    // space has to be "%20"; a literal "+" in the input is already emitted as "%2B", so
    // every remaining "+" stands for a space.
    prefix + URLEncoder.encode(lastSegment, "UTF-8").replace("+", "%20")
  }

  /**
   * Builds an `Article` from the Open Graph meta tags (`og:title`, `og:description`,
   * `og:image`, `og:url`) and the `datePublished` value embedded in the page.
   *
   * @param html full html of the article page
   * @return the extracted article
   * @throws NoSuchElementException when the page contains no `datePublished` value
   */
  def findArticle(html: String): Article = {
    val doc = Jsoup.parse(html)

    def metaContent(property: String): String =
      doc.select(s"meta[property=$property]").attr("content")

    val dateStr = DATE_REGEX.findFirstMatchIn(html).map(_.group(1)).getOrElse(
      throw new NoSuchElementException("datePublished not found in article html"))

    Article(
      title = metaContent("og:title"),
      desc = metaContent("og:description"),
      image = metaContent("og:image"),
      url = metaContent("og:url"),
      createdAt = TimeUtils.parseIsoTime(dateStr)
    )
  }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| package net.funfunnet.crawler.actor.medium | ||
|
|
||
| import akka.actor.{Actor, ActorLogging, Props} | ||
| import akka.http.scaladsl.Http | ||
| import akka.http.scaladsl.model.{HttpRequest, HttpResponse, StatusCodes} | ||
| import akka.pattern.pipe | ||
| import akka.stream.{ActorMaterializer, ActorMaterializerSettings} | ||
| import akka.util.ByteString | ||
| import net.funfunnet.crawler.model.SiteSource | ||
|
|
||
/**
 * Downloads the "latest" page of a Medium publication, extracts the urls of the listed
 * articles and hands each url to a `MediumArticleCrawler`, passing the original requester
 * along as the sender.
 *
 * Handled messages:
 *  - `SiteSource`: its url is requested over HTTP and the response is piped back to this
 *    actor, keeping the original requester as the sender.
 *  - `HttpResponse` with status 200: the body is collected and every article url found in
 *    it is sent to the article crawler.
 *  - any other `HttpResponse`: logged as an error and its entity is discarded.
 *  - `Status.Failure`: a failed HTTP request or a failure while reading the body; logged
 *    as an error.
 */
class MediumArticleListCrawler extends Actor with ActorLogging {

  import context.dispatcher

  // Captures the publication id; the pattern accepts ids of at most 20 characters.
  private val MEDIUMID_REGEX = "https:\\/\\/medium.com\\/(.{1,20})\\/latest".r
  private val UNIQUESLUG_REGEX = "\"uniqueSlug\":\"(.{1,100})\",\"previewContent\"".r
  private val http = Http(context.system)

  // Created through `context.actorOf` (not `context.system.actorOf`) so the article
  // crawler is a child of this actor; a fixed top-level name would clash as soon as a
  // second list crawler is started.
  private val articleCrawler =
    context.actorOf(Props[MediumArticleCrawler], name = "mediumArticleCrawler")

  // Never referenced by name, but required implicitly by `runFold` and
  // `discardEntityBytes` below.
  final implicit val materializer: ActorMaterializer =
    ActorMaterializer(ActorMaterializerSettings(context.system))

  override def receive: Receive = {
    case source: SiteSource =>
      http.singleRequest(HttpRequest(uri = source.url)).pipeTo(self)(sender())

    case HttpResponse(StatusCodes.OK, _, entity, _) =>
      log.info("response ok")
      // Capture the sender now: `sender()` must not be called from a Future callback.
      val sd = sender()
      entity.dataBytes.runFold(ByteString(""))(_ ++ _).onComplete {
        case scala.util.Success(body) =>
          log.info(s"Got response, body length: ${body.length}")
          findArticleUrls(body.utf8String).foreach(articleCrawler.tell(_, sd))
        // Route failures back into the actor so they are logged instead of being
        // silently dropped inside the Future.
        case scala.util.Failure(cause) =>
          self ! akka.actor.Status.Failure(cause)
      }
      log.info("end of response ok")

    case resp@HttpResponse(code, _, _, _) =>
      log.error("Request failed, response code: " + code)
      // The entity has to be consumed or discarded, otherwise the connection stays blocked.
      resp.discardEntityBytes()

    case akka.actor.Status.Failure(cause) =>
      log.error(cause, "Crawling article list failed")
  }

  /**
   * Finds the first `https://medium.com/<id>/latest` link in the page.
   *
   * @return `https://medium.com/<id>`, or `None` when the page has no such link
   */
  def findPrefixUrl(html: String): Option[String] =
    MEDIUMID_REGEX.findFirstMatchIn(html).map(found => s"https://medium.com/${found.group(1)}")

  /** Returns every `uniqueSlug` value embedded in the page, in order of appearance. */
  def findUniqueSlugs(html: String): List[String] =
    UNIQUESLUG_REGEX.findAllMatchIn(html).map(_.group(1)).toList

  /**
   * Combines the publication prefix with every slug found in the page.
   *
   * @return one url per slug, or an empty list when no publication prefix is found
   */
  def findArticleUrls(html: String): List[String] =
    findPrefixUrl(html).fold(List.empty[String]) { prefix =>
      findUniqueSlugs(html).map(slug => s"$prefix/$slug")
    }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| package net.funfunnet.crawler.common | ||
|
|
||
| import java.time.format.DateTimeFormatter | ||
| import java.time.{LocalDateTime, ZonedDateTime} | ||
|
|
||
/** Date/time parsing helpers. */
object TimeUtils {

  /**
   * Parses an ISO-8601 date-time that carries an offset or zone (for example
   * `2017-09-15T06:09:11.760Z`) and returns the corresponding UTC wall-clock time.
   *
   * The value is normalised to UTC before the offset is dropped. Without that step,
   * `...T15:09:11+09:00` and `...T06:09:11Z` (the same instant) would yield different
   * `LocalDateTime`s, so results from different offsets could not be compared.
   * Inputs that already use `Z` are unaffected.
   *
   * @param text ISO-8601 date-time with an offset or zone
   * @return the UTC date-time without zone information
   * @throws java.time.DateTimeException when `text` cannot be parsed as a zoned date-time
   */
  def parseIsoTime(text: String): LocalDateTime =
    ZonedDateTime.parse(text, DateTimeFormatter.ISO_DATE_TIME)
      .withZoneSameInstant(java.time.ZoneOffset.UTC)
      .toLocalDateTime
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| package net.funfunnet.crawler.model | ||
|
|
||
| import java.time.LocalDateTime | ||
|
|
||
/**
 * A crawled article.
 *
 * @param title     article title
 * @param desc      article description
 * @param image     url of the article image
 * @param url       url of the article
 * @param createdAt publication time of the article
 */
case class Article(
    title: String, desc: String, image: String, url: String,
    createdAt: LocalDateTime)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| package net.funfunnet.crawler.model | ||
|
|
||
/** The sites a crawler can be registered for. */
object Site extends Enumeration {
  /** Alias so that a value can be typed as `Site` after importing `Site.Site`. */
  type Site = Value

  /** medium.com; its name (`toString`) is "Medium". */
  val Medium: Value = Value
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| package net.funfunnet.crawler.model | ||
|
|
||
| import net.funfunnet.crawler.model.Site.Site | ||
|
|
||
/**
 * One crawlable source.
 *
 * @param id   numeric identifier of the source
 * @param name display name of the source
 * @param site site the source belongs to
 * @param url  url of the page to crawl
 */
case class SiteSource(id: Int, name: String, site: Site, url: String)

object SiteSource {

  /**
   * Returns all known sources. The list is hard-coded for now; ids are assigned
   * sequentially starting at 1.
   */
  def findAll(): List[SiteSource] = {
    // TODO: load these from the database
    val mediumSources = List(
      "Rainist Engineering" -> "https://medium.com/rainist-engineering/latest",
      "디지털 세상을 만드는 아날로거" -> "https://medium.com/@goinhacker/latest",
      "Lazysoul" -> "https://medium.com/@lazysoul/latest"
    )
    mediumSources.zipWithIndex.map { case ((sourceName, sourceUrl), index) =>
      SiteSource(id = index + 1, name = sourceName, site = Site.Medium, url = sourceUrl)
    }
  }
}
Large diffs are not rendered by default.
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| package net.funfunnet.crawler.actor | ||
|
|
||
/** Placeholder for actor tests; it does not define any test yet. */
class ActorTest {}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| package net.funfunnet.crawler.actor.common | ||
|
|
||
| import net.funfunnet.crawler.common.TimeUtils | ||
| import org.scalatest.FunSuite | ||
|
|
||
/** Checks that `TimeUtils.parseIsoTime` yields the expected date and time fields. */
class TimeUtilsTest extends FunSuite {
  test("parseIsoTime") {
    val parsed = TimeUtils.parseIsoTime("2017-09-15T06:09:11.760Z")

    // Date part: year, month, day of month.
    assertResult((2017, 9, 15)) {
      (parsed.getYear, parsed.getMonthValue, parsed.getDayOfMonth)
    }

    // Time part: hour, minute, second.
    assertResult((6, 9, 11)) {
      (parsed.getHour, parsed.getMinute, parsed.getSecond)
    }
  }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| package net.funfunnet.crawler.actor.medium | ||
|
|
||
| import akka.actor.ActorSystem | ||
| import akka.testkit.{ImplicitSender, TestActorRef, TestKit} | ||
| import com.typesafe.scalalogging.LazyLogging | ||
| import org.scalatest._ | ||
|
|
||
| import scala.io.Source | ||
|
|
||
/**
 * Tests the pure helpers of `MediumArticleCrawler` (`encodeUrl`, `findArticle`) against
 * the html fixture `crawler/medium/article.html` from the test resources.
 */
// The actor system is named after this test class (it previously carried the name of the
// list-crawler test).
class MediumArticleCrawlerTest extends TestKit(ActorSystem("MediumArticleCrawlerTest"))
  with ImplicitSender
  with FunSuiteLike
  with Matchers
  with BeforeAndAfterAll
  with LazyLogging {

  // The fixture contains Korean text, so it is read explicitly as UTF-8 instead of the
  // platform default charset, and the source is closed once it has been read.
  lazy val html: String = {
    val source = Source.fromURL(
      getClass.getClassLoader.getResource("crawler/medium/article.html"), "UTF-8")
    try source.getLines().mkString finally source.close()
  }

  // The underlying actor instance gives direct access to the helper methods.
  lazy val crawler = TestActorRef(new MediumArticleCrawler()).underlyingActor

  override def afterAll(): Unit = {
    TestKit.shutdownActorSystem(system)
  }

  test("encodeUrl") {
    val url = "https://medium.com/rainist-engineering/레이니스트의-기술-블로그를-시작하며-2d757ea69844"
    val encoded = "https://medium.com/rainist-engineering/" +
      "%EB%A0%88%EC%9D%B4%EB%8B%88%EC%8A%A4%ED%8A%B8%EC%9D%98-%EA%B8%B0%EC%88%A0-" +
      "%EB%B8%94%EB%A1%9C%EA%B7%B8%EB%A5%BC-%EC%8B%9C%EC%9E%91%ED%95%98%EB%A9%B0-2d757ea69844"
    assertResult(encoded)(crawler.encodeUrl(url))
  }

  test("findArticle") {
    val article = crawler.findArticle(html)
    assertResult("Kotlin, AWS 그리고 레이니스트와 함께라면 육군훈련소에서도 외롭지 않아 – Rainist Engineering – Medium") {
      article.title
    }
    assertResult("Kotlin과 AWS를 활용해 간단한 Slack 봇을 만든 경험을 통해 뱅크샐러드 Android 앱의 Architecture를 간단히 공유합니다.") {
      article.desc
    }
    assertResult("https://cdn-images-1.medium.com/max/1200/1*KIa4eZARMVelBWNRMd3D9Q.png") {
      article.image
    }
    assertResult("https://medium.com/rainist-engineering/writing-aws-lambda-function-in-kotlin-b3faf3f55777") {
      article.url
    }

    assertResult(6)(article.createdAt.getHour)
    assertResult(9)(article.createdAt.getMinute)
    assertResult(11)(article.createdAt.getSecond)
  }

}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
error로 로깅해야하지 않나요?