Skip to content

Commit

Permalink
Replace scala-uri library from ExtractDomain. (#524)
Browse files Browse the repository at this point in the history
- Pull down public_suffix_list.dat, and parse it to ExtractDomain
  instead of using scala-uri
- Remove scala-uri
- Remove cats shading
- Resolves #521
  • Loading branch information
ruebot authored Nov 1, 2021
1 parent aea0d14 commit f2b4379
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 29 deletions.
13 changes: 0 additions & 13 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -108,14 +108,6 @@
</transformer>
</transformers>

<!-- Shade cats per: https://github.com/lemonlabsuk/scala-uri/issues/341#issuecomment-918529726-->
<relocations>
<relocation>
<pattern>cats.</pattern>
<shadedPattern>cats.shaded.</shadedPattern>
</relocation>
</relocations>

<!-- This fixes the issue "Invalid signature file digest for Manifest main attributes"
cf. http://zhentao-li.blogspot.com/2012/06/maven-shade-plugin-invalid-signature.html -->
<filters>
Expand Down Expand Up @@ -584,11 +576,6 @@
<artifactId>hadoop-aws</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>io.lemonlabs</groupId>
<artifactId>scala-uri_${scala.binary.version}</artifactId>
<version>3.5.0</version>
</dependency>
</dependencies>

<developers>
Expand Down
59 changes: 43 additions & 16 deletions src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,38 +15,65 @@
*/
package io.archivesunleashed.matchbox

import io.lemonlabs.uri.Url
import io.lemonlabs.uri.config.UriConfig
import io.lemonlabs.uri.decoding.PercentDecoder
import java.net.URL

/** Extracts the host domain name from a full url string. */
object ExtractDomain {

// Implicit scala-uri configuration whose percent decoder is created with
// ignoreInvalidPercentEncoding = true.
// NOTE(review): presumably so malformed percent escapes do not abort URL
// parsing -- confirm against the scala-uri documentation.
implicit val c: UriConfig = UriConfig(
decoder = PercentDecoder(ignoreInvalidPercentEncoding = true)
)
/** Public suffix rules, downloaded from publicsuffix.org on first access.
  *
  * Every line of the list is trimmed; blank lines and `//` comment lines are
  * dropped and the remaining lines are kept as-is, so wildcard (`*.`) and
  * exception (`!`) rules retain their prefixes. If the list cannot be
  * downloaded or read, the result is an empty set.
  */
lazy val Suffixes: Set[String] = {
  try {
    // Source.fromURL opens the stream immediately, so it has to live inside
    // the guarded region; otherwise a failed download escapes the fallback
    // below and surfaces as an exception on every access.
    val source = scala.io.Source
      .fromURL(
        "https://publicsuffix.org/list/public_suffix_list.dat",
        "utf-8"
      )
    try {
      source
        .getLines()
        .map(_.trim)
        .filter(line => line.nonEmpty && !line.startsWith("//"))
        .toSet
    } finally {
      // Only reached once the source was opened, so close() is always safe.
      source.close()
    }
  } catch {
    // Best-effort: any non-fatal failure degrades to "no known suffixes".
    case scala.util.control.NonFatal(_) =>
      Set.empty[String]
  }
}

/** Extract source domains from a full url string.
*
* @param url a url as a string
* @return the domain for the url's host, or an empty string when no URL could be built from the input.
*/
def apply(url: String): String = {
  // checkUrl returns an Option[URL]; when it is empty there is no host to
  // resolve, so the result is the empty string.
  checkUrl(url) match {
    case Some(parsed) =>
      // getHost is already a String; guard against a null host so resolve
      // never receives null.
      resolve(Option(parsed.getHost).getOrElse(""))
    case None =>
      ""
  }
}

/** Resolve a host name against the shared `Suffixes` rule set.
  *
  * @param host host name to reduce
  * @return the result of the two-argument `resolve` using `Suffixes`
  */
def resolve(host: String): String = {
  val knownSuffixes = Suffixes
  resolve(host, knownSuffixes)
}

/** Reduce a host name to a domain using public-suffix style rules.
  *
  * The host is split on `.` and its trailing label sequences are tried from
  * longest to shortest (only those with at least two labels). The first
  * candidate that qualifies is returned, joined with `.`. A candidate
  * qualifies when:
  *  - it is itself named by an exception rule (`!candidate`), or
  *  - everything after its first label is listed verbatim in `suffixes`, or
  *  - everything after its first label is covered by a wildcard rule
  *    (`*.rest`) and is not carved out by an exception rule.
  *
  * If no candidate qualifies, the host's labels are re-joined and returned.
  *
  * @param host     host name, e.g. `www.example.co.uk`
  * @param suffixes rule strings such as `com`, `co.uk`, `*.ck`, `!www.ck`
  * @return the first qualifying candidate, or the re-joined host
  */
def resolve(host: String, suffixes: Set[String]): String = {
  val labels = host.split('.')
  labels.tails
    .filter(_.length > 1)
    .find { candidate =>
      val parent = candidate.tail
      val parentName = parent.mkString(".")
      // An exception rule names a domain that is registrable even though a
      // wildcard rule would otherwise make it a public suffix.
      val isExceptionDomain =
        suffixes.contains("!" + candidate.mkString("."))
      // A wildcard only turns `parent` into a suffix when no exception rule
      // excludes that exact name.
      val parentIsWildcardSuffix =
        parent.length > 1 &&
          suffixes.contains("*." + parent.tail.mkString(".")) &&
          !suffixes.contains("!" + parentName)
      isExceptionDomain || suffixes.contains(parentName) || parentIsWildcardSuffix
    }
    .getOrElse(labels)
    .mkString(".")
}

def checkUrl(url: String): Option[URL] = {
try {
Some(new URL(url.replace("\\", "/")))
Expand Down

0 comments on commit f2b4379

Please sign in to comment.