Skip to content

Commit

Permalink
Fix relative links extraction (#504)
Browse files Browse the repository at this point in the history
Set baseUri to be `src` instead of `base` when extracting links, and deleted `base` parameter.

The issue occurred because relative links cannot be extracted by ` link.attr("abs:href")` when baseUri is not set.
As I look through the code, param `base` is never provided anywhere when `ExtractLinks` is called, so default value `""` is always used, and baseUri is never set. However, `baseUri` is required to be able to extract relative links. 

* resolves #501 
* update tests
* remove unnecessary test results comment

Co-authored-by: Kai Zhong <kaizhchn@hotmail.com>
Co-authored-by: nruest <ruestn@gmail.com>
  • Loading branch information
3 people committed Jan 18, 2021
1 parent 363a8e1 commit 074810b
Show file tree
Hide file tree
Showing 8 changed files with 46 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,11 @@ object ExtractLinks {
*
* @param src the src link
* @param html the content from which links are to be extracted
* @param base an optional base URI
* @return a sequence of (source, target, anchortext).
*/
def apply(
src: String,
html: String,
base: String = ""
html: String
): Seq[(String, String, String)] = {
val srcMaybe: Option[String] = Option(src)
val htmlMaybe: Option[String] = Option(html)
Expand All @@ -48,7 +46,7 @@ object ExtractLinks {
val it = links.iterator()
while (it.hasNext) {
val link = it.next()
if (base.nonEmpty) link.setBaseUri(base)
link.setBaseUri(src)
val target = link.attr("abs:href")
if (target.nonEmpty) {
output += ((valid_src, target, link.text))
Expand Down
3 changes: 1 addition & 2 deletions src/test/scala/io/archivesunleashed/ArcTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,7 @@ class ArcTest extends FunSuite with BeforeAndAfter {
val links = RecordLoader
.loadArchives(arcPath, sc)
.map(r => ExtractLinks(r.getUrl, r.getContentString))
.reduce((a, b) => a ++ b)
assert(links.size == 664)
assert(links.count == 300L)
}

test("Detect language RDD") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,7 @@ class DomainGraphExtractorDfTest extends FunSuite with BeforeAndAfter {
}

test("Domain graph extractor DF") {
val TESTLENGTH = 9
val TESTRESULT = 280
val TESTLENGTH = 10
val df = RecordLoader.loadArchives(arcPath, sc).webgraph()
val dfResult = DomainGraphExtractor(df).collect()

Expand All @@ -49,7 +48,12 @@ class DomainGraphExtractorDfTest extends FunSuite with BeforeAndAfter {
assert(dfResult(0).get(0) == "20080430")
assert(dfResult(0).get(1) == "archive.org")
assert(dfResult(0).get(2) == "archive.org")
assert(dfResult(0).get(3) == TESTRESULT)
assert(dfResult(0).get(3) == 37477)

assert(dfResult(1).get(0) == "20080430")
assert(dfResult(1).get(1) == "archive.org")
assert(dfResult(1).get(2) == "wiki.etree.org")
assert(dfResult(1).get(3) == 21)
}

after {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class WebGraphExtractorTest extends FunSuite with BeforeAndAfter {
test("Web graph extractor DF") {
val df = RecordLoader.loadArchives(arcPath, sc).webgraph()
val dfResults = WebGraphExtractor(df).collect()
val RESULTSLENGTH = 622
val RESULTSLENGTH = 37825

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20080430")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
assert(r_1.getAs[String](mime_type) == "text/html")

val r_2 = hyperlinks.select("Dest", "Anchor").take(3)(2)
assert(r_2(0) == "http://web.archive.org/collections/web/advanced.html")
assert(r_2(1) == "Advanced Search")
assert(r_2(0) == "http://www.archive.org/web/web.php")
assert(r_2(1) == "Web")

val r_3 = imagegraph.take(100)(99)
assert(r_3.get(0) == "20080430")
Expand Down
30 changes: 15 additions & 15 deletions src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -81,21 +81,21 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
assert(results(0).get(2) == "2008")
assert(results(0).get(3) == "http://www.archive.org/create/")
assert(results(0).get(3) == "http://www.archive.org/")

assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "2008")
assert(
results(1).get(
3
) == "http://web.archive.org/collections/web/advanced.html"
) == "http://www.archive.org/web/web.php"
)

assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
assert(results(2).get(2) == "2008")
assert(results(2).get(3) == "http://www.sloan.org")
assert(results(2).get(3) == "http://www.archive.org/details/movies")
}

test("Extract dates YYYYMM DF") {
Expand Down Expand Up @@ -135,21 +135,21 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
assert(results(0).get(2) == "200804")
assert(results(0).get(3) == "http://www.archive.org/create/")
assert(results(0).get(3) == "http://www.archive.org/")

assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "200804")
assert(
results(1).get(
3
) == "http://web.archive.org/collections/web/advanced.html"
) == "http://www.archive.org/web/web.php"
)

assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
assert(results(2).get(2) == "200804")
assert(results(2).get(3) == "http://www.sloan.org")
assert(results(2).get(3) == "http://www.archive.org/details/movies")
}

test("Extract dates MM DF") {
Expand Down Expand Up @@ -189,21 +189,21 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
assert(results(0).get(2) == "04")
assert(results(0).get(3) == "http://www.archive.org/create/")
assert(results(0).get(3) == "http://www.archive.org/")

assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "04")
assert(
results(1).get(
3
) == "http://web.archive.org/collections/web/advanced.html"
) == "http://www.archive.org/web/web.php"
)

assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
assert(results(2).get(2) == "04")
assert(results(2).get(3) == "http://www.sloan.org")
assert(results(2).get(3) == "http://www.archive.org/details/movies")
}

test("Extract dates DD DF") {
Expand Down Expand Up @@ -243,21 +243,21 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
assert(results(0).get(2) == "30")
assert(results(0).get(3) == "http://www.archive.org/create/")
assert(results(0).get(3) == "http://www.archive.org/")

assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "30")
assert(
results(1).get(
3
) == "http://web.archive.org/collections/web/advanced.html"
) == "http://www.archive.org/web/web.php"
)

assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
assert(results(2).get(2) == "30")
assert(results(2).get(3) == "http://www.sloan.org")
assert(results(2).get(3) == "http://www.archive.org/details/movies")
}

test("Extract dates YYYYMMDD DF") {
Expand Down Expand Up @@ -297,21 +297,21 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
assert(results(0).get(2) == "20080430")
assert(results(0).get(3) == "http://www.archive.org/create/")
assert(results(0).get(3) == "http://www.archive.org/")

assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "20080430")
assert(
results(1).get(
3
) == "http://web.archive.org/collections/web/advanced.html"
) == "http://www.archive.org/web/web.php"
)

assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
assert(results(2).get(2) == "20080430")
assert(results(2).get(3) == "http://www.sloan.org")
assert(results(2).get(3) == "http://www.archive.org/details/movies")
}

after {
Expand Down
15 changes: 3 additions & 12 deletions src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -72,33 +72,24 @@ class ExtractHyperlinksTest extends FunSuite with BeforeAndAfter {
)
.head(3)

// Results should be:
// +--------------------------------+-----------+----------+----------------------------------------------------+
// |url |Domain |crawl_date|destination_page |
// +--------------------------------+-----------+----------+----------------------------------------------------+
// |http://www.archive.org/index.php|archive.org|20080430 |http://www.archive.org/create/ |
// |http://www.archive.org/index.php|archive.org|20080430 |http://web.archive.org/collections/web/advanced.html|
// |http://www.archive.org/index.php|archive.org|20080430 |http://www.sloan.org |
// +--------------------------------+-----------+----------+----------------------------------------------------+

assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
assert(results(0).get(2) == "20080430")
assert(results(0).get(3) == "http://www.archive.org/create/")
assert(results(0).get(3) == "http://www.archive.org/")

assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "20080430")
assert(
results(1).get(
3
) == "http://web.archive.org/collections/web/advanced.html"
) == "http://www.archive.org/web/web.php"
)

assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
assert(results(2).get(2) == "20080430")
assert(results(2).get(3) == "http://www.sloan.org")
assert(results(2).get(3) == "http://www.archive.org/details/movies")
}

after {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,23 @@ class ExtractLinksTest extends FunSuite {
assert("Twitter" == extracted.last._3)
}

test("Extract relative links RDD") {
test("Extract relative links with no src RDD") {
val fragmentLocal: String = "Here is <a href=\"http://www.google.com\">" +
"a search engine</a>.\nHere is a <a href=\"page.html\">a relative URL</a>.\n"
val fooFragmentLocal = "http://www.foobar.org/page.html"
val extracted: Seq[(String, String, String)] =
ExtractLinks("", fragmentLocal, fooFragment)
ExtractLinks("", fragmentLocal)
assert(extracted.size == 1)
assert(url == extracted.head._2)
assert(head == extracted.head._3)
}

test("Extract relative links with src RDD") {
val fragmentLocal: String = "Here is <a href=\"http://www.google.com\">" +
"a search engine</a>.\nHere is a <a href=\"page.html\">a relative URL</a>.\n"
val fooFragmentLocal = "http://www.foobar.org/page.html"
val extracted: Seq[(String, String, String)] =
ExtractLinks(fooFragment, fragmentLocal)
assert(extracted.size == 2)
assert(url == extracted.head._2)
assert(head == extracted.head._3)
Expand All @@ -64,12 +75,12 @@ class ExtractLinksTest extends FunSuite {
"Here is a fake url <a href=\"http://www.google.com\"> bogus search engine</a>."
// scalastyle:off null
assert(
ExtractLinks(null, fragment, fooFragment) == mutable
ExtractLinks(null, fragment) == mutable
.MutableList[(String, String, String)]()
)
// scalastyle:on null
assert(
ExtractLinks("", "", fooFragment) == mutable
ExtractLinks("", "") == mutable
.MutableList[(String, String, String)]()
)
}
Expand Down

0 comments on commit 074810b

Please sign in to comment.