Skip to content

Commit

Permalink
Change crawl_date format to YYYYMMDDHHMMSS, update hasDate filter. (#526)
Browse files Browse the repository at this point in the history

- Update the hasDate filter to match regular-expression patterns; previously
  it matched only literal values
- Resolves #525
- Update tests as required
  • Loading branch information
ruebot authored Jan 20, 2022
1 parent 5fa2d5a commit 73354e8
Show file tree
Hide file tree
Showing 22 changed files with 77 additions and 53 deletions.
4 changes: 2 additions & 2 deletions src/main/scala/io/archivesunleashed/ArchiveRecord.scala
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable])
if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
ExtractDate(
r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getDate,
ExtractDate.DateComponent.YYYYMMDD
ExtractDate.DateComponent.YYYYMMDDHHMMSS
)
} else {
ExtractDate(
Expand All @@ -106,7 +106,7 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable])
r.t.getRecord.asInstanceOf[WARCRecord].getHeader.getDate
)
),
ExtractDate.DateComponent.YYYYMMDD
ExtractDate.DateComponent.YYYYMMDDHHMMSS
)
}
}
Expand Down
36 changes: 22 additions & 14 deletions src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,30 +21,34 @@ object ExtractDate {

/** An enum specifying years, months, days or a combination. */
type DateComponent = Value
val YYYY, MM, DD, YYYYMM, YYYYMMDD = Value
val YYYY, MM, DD, YYYYMM, YYYYMMDD, YYYYMMDDHHMMSS = Value
}

import DateComponent.{DateComponent, DD, MM, YYYY, YYYYMM}
import DateComponent.{DateComponent, DD, MM, YYYY, YYYYMM, YYYYMMDD}

/** Extracts the wanted date component from a date.
*
* @param fullDate date returned by `WARecord.getCrawlDate`, formatted as YYYYMMDD
* @param fullDate date returned by `WARecord.getCrawlDate`, formatted as YYYYMMDDHHMMSS
* @param dateFormat an enum describing the portion of the date wanted
*/
def apply(fullDate: String, dateFormat: DateComponent): String = {
val startSS = 0
val yearSS = 4
val monthSS = 6
val daySS = 8
val hourSS = 10
val minuteSS = 12
val secondSS = 14
val maybeFullDate: Option[String] = Option(fullDate)
maybeFullDate match {
case Some(fulldate) =>
dateFormat match {
case YYYY => fullDate.substring(startSS, yearSS)
case MM => fullDate.substring(yearSS, monthSS)
case DD => fullDate.substring(monthSS, daySS)
case YYYYMM => fullDate.substring(startSS, monthSS)
case _ => fullDate.substring(startSS, daySS)
case YYYY => fullDate.substring(startSS, yearSS)
case MM => fullDate.substring(yearSS, monthSS)
case DD => fullDate.substring(monthSS, daySS)
case YYYYMM => fullDate.substring(startSS, monthSS)
case YYYYMMDD => fullDate.substring(startSS, daySS)
case _ => fullDate.substring(startSS, secondSS)
}
case None =>
""
Expand All @@ -53,23 +57,27 @@ object ExtractDate {

/** Extracts a provided date component from a date (for DataFrames).
*
* @param fullDate date returned by `WARecord.getCrawlDate`, formatted as YYYYMMDD
* @param fullDate date returned by `WARecord.getCrawlDate`, formatted as YYYYMMDDHHMMSS
* @param dateFormat in String format
*/
def apply(fullDate: String, dateFormat: String): String = {
val startSS = 0
val yearSS = 4
val monthSS = 6
val daySS = 8
val hourSS = 10
val minuteSS = 12
val secondSS = 14
val maybeFullDate: Option[String] = Option(fullDate)
maybeFullDate match {
case Some(fulldate) =>
dateFormat match {
case "YYYY" => fullDate.substring(startSS, yearSS)
case "MM" => fullDate.substring(yearSS, monthSS)
case "DD" => fullDate.substring(monthSS, daySS)
case "YYYYMM" => fullDate.substring(startSS, monthSS)
case _ => fullDate.substring(startSS, daySS)
case "YYYY" => fullDate.substring(startSS, yearSS)
case "MM" => fullDate.substring(yearSS, monthSS)
case "DD" => fullDate.substring(monthSS, daySS)
case "YYYYMM" => fullDate.substring(startSS, monthSS)
case "YYYYMMDD" => fullDate.substring(startSS, daySS)
case _ => fullDate.substring(startSS, secondSS)
}
case None =>
""
Expand Down
11 changes: 10 additions & 1 deletion src/main/scala/io/archivesunleashed/udfs/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,16 @@ package object udfs extends Serializable {
.exists(identity)
})
def hasDate: UserDefinedFunction =
udf((date_ : String, date: Seq[String]) => date.contains(date_))
udf((date: String, dates: Seq[String]) => {
dates
.map(re =>
date match {
case re.r() => true
case _ => false
}
)
.exists(identity)
})
def hasDomains: UserDefinedFunction =
udf((domain: String, domains: Seq[String]) => domains.contains(domain))
def hasHTTPStatus: UserDefinedFunction =
Expand Down
4 changes: 2 additions & 2 deletions src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
private var sc: SparkContext = _
private val exampleArc = "example.arc.gz"
private val exampleWarc = "example.warc.gz"
private val exampleDate = "20080430"
private val exampleDate = "20080430204825"
private val exampleUrl = "archive.org"
private val exampleStatusCode1 = "000"
private val exampleStatusCode2 = "200"
Expand Down Expand Up @@ -79,7 +79,7 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
textSampleArc.deep == Array(exampleDate, exampleDate, exampleDate).deep
)
assert(
textSampleWarc.deep == Array(exampleDate, exampleDate, exampleDate).deep
textSampleWarc.deep == Array(exampleDate, exampleDate, "20080430204826").deep
)
}

Expand Down
8 changes: 4 additions & 4 deletions src/test/scala/io/archivesunleashed/RecordDFTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -258,15 +258,15 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
import spark.implicits._
// scalastyle:on

val expected = Array("20080430")
val date = Array("2008.*")
val base = RecordLoader
.loadArchives(arcPath, sc)
.all()
.select($"crawl_date")
.filter(hasDate($"crawl_date", lit(expected)))
.take(1)(0)(0)
.filter(hasDate($"crawl_date", lit(date)))
.count()

assert(base.toString == "20080430")
assert(base == 261)
}

after {
Expand Down
2 changes: 1 addition & 1 deletion src/test/scala/io/archivesunleashed/RecordRDDTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
test("Discard date RDD") {
val base = RecordLoader.loadArchives(arcPath, sc)
val date = "20080430"
val r = base.filter(x => x.getCrawlDate != date).collect()
val r = base.filter(x => !(x.getCrawlDate.contains(date))).collect()
val r2 = base.discardDate(date).take(3)
assert(r.deep == Array().deep)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class AudioInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 1

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20190817")
assert(dfResults(0).get(0) == "20190817230242")
assert(dfResults(0).get(1) == "https://ruebot.net/files/feniz.mp3")
assert(dfResults(0).get(2) == "feniz.mp3")
assert(dfResults(0).get(3) == "mp3")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,21 @@ class DomainGraphExtractorDfTest extends FunSuite with BeforeAndAfter {
}

test("Domain graph extractor DF") {
val TESTLENGTH = 10
val TESTLENGTH = 82
val df = RecordLoader.loadArchives(arcPath, sc).webgraph()
val dfResult = DomainGraphExtractor(df).collect()

assert(dfResult.length == TESTLENGTH)

assert(dfResult(0).get(0) == "20080430")
assert(dfResult(0).get(0) == "20080430205151")
assert(dfResult(0).get(1) == "archive.org")
assert(dfResult(0).get(2) == "archive.org")
assert(dfResult(0).get(3) == 37511)
assert(dfResult(0).get(3) == 10566)

assert(dfResult(1).get(0) == "20080430")
assert(dfResult(1).get(0) == "20080430204948")
assert(dfResult(1).get(1) == "archive.org")
assert(dfResult(1).get(2) == "etree.org")
assert(dfResult(1).get(3) == 31)
assert(dfResult(1).get(2) == "archive.org")
assert(dfResult(1).get(3) == 7143)
}

after {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class ImageGraphExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 788

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20080430")
assert(dfResults(0).get(0) == "20080430204826")
assert(dfResults(0).get(1) == "http://www.archive.org/")
assert(dfResults(0).get(2) == "http://www.archive.org/images/logoc.jpg")
assert(dfResults(0).get(3) == "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class ImageInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 55

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20080430")
assert(dfResults(0).get(0) == "20080430204829")
assert(dfResults(0).get(1) == "http://www.archive.org/images/logoc.jpg")
assert(dfResults(0).get(2) == "logoc.jpg")
assert(dfResults(0).get(3) == "jpg")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class PDFInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 2

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20190812")
assert(dfResults(0).get(0) == "20190812222529")
assert(
dfResults(0).get(
1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class PresentationProgramInformationExtractorTest
val RESULTSLENGTH = 2

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20190815")
assert(dfResults(0).get(0) == "20190815004338")
assert(
dfResults(0).get(
1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class SpreadsheetInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 4

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20190815")
assert(dfResults(0).get(0) == "20190815004345")
assert(
dfResults(0).get(
1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class VideoInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 1

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20190817")
assert(dfResults(0).get(0) == "20190817230310")
assert(
dfResults(0).get(1) == "https://ruebot.net/2018-11-12%2016.14.11.mp4"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class WebGraphExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 37826

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20080430")
assert(dfResults(0).get(0) == "20080430204826")
assert(dfResults(0).get(1) == "http://www.archive.org/")
assert(dfResults(0).get(2) == "http://www.archive.org")
assert(dfResults(0).get(3) == "http://www.archive.org")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class WebPagesExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 94

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20080430")
assert(dfResults(0).get(0) == "20080430204826")
assert(dfResults(0).get(1) == "archive.org")
assert(dfResults(0).get(2) == "http://www.archive.org/")
assert(dfResults(0).get(3) == "text/html")
Expand Down
4 changes: 2 additions & 2 deletions src/test/scala/io/archivesunleashed/app/WgetWarcTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ class WgetWarcTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 2

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20210511")
assert(dfResults(0).get(0) == "20210511181400")
assert(dfResults(0).get(1) == "http://www.archiveteam.org/")
assert(dfResults(1).get(0) == "20210511")
assert(dfResults(1).get(0) == "20210511181401")
assert(dfResults(1).get(1) == "https://wiki.archiveteam.org/")
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class WordProcessorInformationExtractorTest
val RESULTSLENGTH = 3

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20190815")
assert(dfResults(0).get(0) == "20190815004423")
assert(
dfResults(0).get(
1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
assert(r_2(1) == "Web")

val r_3 = imagegraph.take(100)(99)
assert(r_3.get(0) == "20080430")
assert(r_3.get(0) == "20080430204841")
assert(
r_3.get(1) == "http://www.archive.org/details/secretarmiesb00spivrich"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
.select(
removePrefixWWW(extractDomain($"url")).as("Domain"),
$"url".as("url"),
extractDate($"crawl_date", lit("YYYYMMDD")).as("crawl_date"),
extractDate($"crawl_date", lit("YYYYMMDDHHMMSS")).as("crawl_date"),
explode_outer(extractLinks($"url", $"content")).as("link")
)
.filter(
Expand All @@ -296,12 +296,12 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {

assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
assert(results(0).get(2) == "20080430")
assert(results(0).get(2) == "20080430204826")
assert(results(0).get(3) == "http://www.archive.org/")

assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "20080430")
assert(results(1).get(2) == "20080430204826")
assert(
results(1).get(
3
Expand All @@ -310,7 +310,7 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {

assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
assert(results(2).get(2) == "20080430")
assert(results(2).get(2) == "20080430204826")
assert(results(2).get(3) == "http://www.archive.org/details/movies")
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,12 @@ class ExtractHyperlinksTest extends FunSuite with BeforeAndAfter {

assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
assert(results(0).get(2) == "20080430")
assert(results(0).get(2) == "20080430204826")
assert(results(0).get(3) == "http://www.archive.org/")

assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "20080430")
assert(results(1).get(2) == "20080430204826")
assert(
results(1).get(
3
Expand All @@ -88,7 +88,7 @@ class ExtractHyperlinksTest extends FunSuite with BeforeAndAfter {

assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
assert(results(2).get(2) == "20080430")
assert(results(2).get(2) == "20080430204826")
assert(results(2).get(3) == "http://www.archive.org/details/movies")
}

Expand Down
13 changes: 10 additions & 3 deletions src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ import io.archivesunleashed.matchbox.ExtractDate.DateComponent.{
MM,
YYYY,
YYYYMM,
YYYYMMDD
YYYYMMDD,
YYYYMMDDHHMMSS
}
import org.junit.runner.RunWith
import org.scalatest.FunSuite
Expand All @@ -31,18 +32,24 @@ import org.scalatest.junit.JUnitRunner
class ExtractDateTest extends FunSuite {

test("Date extraction RDD") {
val date = "20151204"
val date = "20151204235402"
val startSS = 0
val yearSS = 4
val monthSS = 6
val daySS = 8
val hourSS = 10
val minuteSS = 12
val secondSS = 14
assert(ExtractDate(date, YYYY) == date.substring(startSS, yearSS))
assert(ExtractDate(date, MM) == date.substring(yearSS, monthSS))
assert(ExtractDate(date, DD) == date.substring(monthSS, daySS))
assert(ExtractDate(date, YYYYMM) == date.substring(startSS, monthSS))
assert(ExtractDate(date, YYYYMMDD) == date.substring(startSS, daySS))
assert(
ExtractDate(date, YYYYMMDDHHMMSS) == date.substring(startSS, secondSS)
)
// scalastyle:off null
assert(ExtractDate(null, YYYYMMDD) == "")
assert(ExtractDate(null, YYYYMMDDHHMMSS) == "")
// scalastyle:on null
}
}

0 comments on commit 73354e8

Please sign in to comment.