Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change crawl_date format to YYYYMMDDHHMMSS, update hasDate filter. #526

Merged
merged 1 commit into from
Jan 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/main/scala/io/archivesunleashed/ArchiveRecord.scala
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable])
if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
ExtractDate(
r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getDate,
ExtractDate.DateComponent.YYYYMMDD
ExtractDate.DateComponent.YYYYMMDDHHMMSS
)
} else {
ExtractDate(
Expand All @@ -106,7 +106,7 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable])
r.t.getRecord.asInstanceOf[WARCRecord].getHeader.getDate
)
),
ExtractDate.DateComponent.YYYYMMDD
ExtractDate.DateComponent.YYYYMMDDHHMMSS
)
}
}
Expand Down
36 changes: 22 additions & 14 deletions src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,30 +21,34 @@ object ExtractDate {

/** An enum specifying which part of a date to extract: year, month, day, or a combination, up to the full YYYYMMDDHHMMSS timestamp. */
type DateComponent = Value
val YYYY, MM, DD, YYYYMM, YYYYMMDD = Value
val YYYY, MM, DD, YYYYMM, YYYYMMDD, YYYYMMDDHHMMSS = Value
}

import DateComponent.{DateComponent, DD, MM, YYYY, YYYYMM}
import DateComponent.{DateComponent, DD, MM, YYYY, YYYYMM, YYYYMMDD}

/** Extracts the wanted date component from a date.
*
* @param fullDate date returned by `WARecord.getCrawlDate`, formatted as YYYYMMDD
* @param fullDate date returned by `WARecord.getCrawlDate`, formatted as YYYYMMDDHHMMSS
* @param dateFormat an enum describing the portion of the date wanted
*/
def apply(fullDate: String, dateFormat: DateComponent): String = {
val startSS = 0
val yearSS = 4
val monthSS = 6
val daySS = 8
val hourSS = 10
val minuteSS = 12
val secondSS = 14
val maybeFullDate: Option[String] = Option(fullDate)
maybeFullDate match {
case Some(fulldate) =>
dateFormat match {
case YYYY => fullDate.substring(startSS, yearSS)
case MM => fullDate.substring(yearSS, monthSS)
case DD => fullDate.substring(monthSS, daySS)
case YYYYMM => fullDate.substring(startSS, monthSS)
case _ => fullDate.substring(startSS, daySS)
case YYYY => fullDate.substring(startSS, yearSS)
case MM => fullDate.substring(yearSS, monthSS)
case DD => fullDate.substring(monthSS, daySS)
case YYYYMM => fullDate.substring(startSS, monthSS)
case YYYYMMDD => fullDate.substring(startSS, daySS)
case _ => fullDate.substring(startSS, secondSS)
}
case None =>
""
Expand All @@ -53,23 +57,27 @@ object ExtractDate {

/** Extracts a provided date component from a date (for DataFrames).
*
* @param fullDate date returned by `WARecord.getCrawlDate`, formatted as YYYYMMDD
* @param fullDate date returned by `WARecord.getCrawlDate`, formatted as YYYYMMDDHHMMSS
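* @param dateFormat the portion of the date wanted, given as a string: "YYYY", "MM", "DD", "YYYYMM" or "YYYYMMDD"; any other value selects the full YYYYMMDDHHMMSS date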
*/
def apply(fullDate: String, dateFormat: String): String = {
val startSS = 0
val yearSS = 4
val monthSS = 6
val daySS = 8
val hourSS = 10
val minuteSS = 12
val secondSS = 14
val maybeFullDate: Option[String] = Option(fullDate)
maybeFullDate match {
case Some(fulldate) =>
dateFormat match {
case "YYYY" => fullDate.substring(startSS, yearSS)
case "MM" => fullDate.substring(yearSS, monthSS)
case "DD" => fullDate.substring(monthSS, daySS)
case "YYYYMM" => fullDate.substring(startSS, monthSS)
case _ => fullDate.substring(startSS, daySS)
case "YYYY" => fullDate.substring(startSS, yearSS)
case "MM" => fullDate.substring(yearSS, monthSS)
case "DD" => fullDate.substring(monthSS, daySS)
case "YYYYMM" => fullDate.substring(startSS, monthSS)
case "YYYYMMDD" => fullDate.substring(startSS, daySS)
case _ => fullDate.substring(startSS, secondSS)
}
case None =>
""
Expand Down
11 changes: 10 additions & 1 deletion src/main/scala/io/archivesunleashed/udfs/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,16 @@ package object udfs extends Serializable {
.exists(identity)
})
/** UDF that tests a date string against a list of candidate values.
  *
  * NOTE(review): two variants of the UDF body appear below. The first checks
  * plain list membership with `contains`. The second treats every entry of
  * `dates` as a regular expression and yields true when at least one of them
  * matches `date`.
  *
  * NOTE(review): the match uses the zero-argument extractor form `re.r()`, so
  * a pattern presumably has to match the whole date string and must not
  * contain capturing groups to succeed -- confirm against the
  * `scala.util.matching.Regex` documentation.
  */
def hasDate: UserDefinedFunction =
udf((date_ : String, date: Seq[String]) => date.contains(date_))
udf((date: String, dates: Seq[String]) => {
dates
// Compile each entry as a regex and record whether it matched `date`.
.map(re =>
date match {
case re.r() => true
case _ => false
}
)
// True as soon as any entry matched.
.exists(identity)
})
def hasDomains: UserDefinedFunction =
udf((domain: String, domains: Seq[String]) => domains.contains(domain))
def hasHTTPStatus: UserDefinedFunction =
Expand Down
4 changes: 2 additions & 2 deletions src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
private var sc: SparkContext = _
private val exampleArc = "example.arc.gz"
private val exampleWarc = "example.warc.gz"
private val exampleDate = "20080430"
private val exampleDate = "20080430204825"
private val exampleUrl = "archive.org"
private val exampleStatusCode1 = "000"
private val exampleStatusCode2 = "200"
Expand Down Expand Up @@ -79,7 +79,7 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
textSampleArc.deep == Array(exampleDate, exampleDate, exampleDate).deep
)
assert(
textSampleWarc.deep == Array(exampleDate, exampleDate, exampleDate).deep
textSampleWarc.deep == Array(exampleDate, exampleDate, "20080430204826").deep
)
}

Expand Down
8 changes: 4 additions & 4 deletions src/test/scala/io/archivesunleashed/RecordDFTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -258,15 +258,15 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
import spark.implicits._
// scalastyle:on

val expected = Array("20080430")
val date = Array("2008.*")
val base = RecordLoader
.loadArchives(arcPath, sc)
.all()
.select($"crawl_date")
.filter(hasDate($"crawl_date", lit(expected)))
.take(1)(0)(0)
.filter(hasDate($"crawl_date", lit(date)))
.count()

assert(base.toString == "20080430")
assert(base == 261)
}

after {
Expand Down
2 changes: 1 addition & 1 deletion src/test/scala/io/archivesunleashed/RecordRDDTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
test("Discard date RDD") {
val base = RecordLoader.loadArchives(arcPath, sc)
val date = "20080430"
val r = base.filter(x => x.getCrawlDate != date).collect()
val r = base.filter(x => !(x.getCrawlDate.contains(date))).collect()
val r2 = base.discardDate(date).take(3)
assert(r.deep == Array().deep)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class AudioInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 1

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20190817")
assert(dfResults(0).get(0) == "20190817230242")
assert(dfResults(0).get(1) == "https://ruebot.net/files/feniz.mp3")
assert(dfResults(0).get(2) == "feniz.mp3")
assert(dfResults(0).get(3) == "mp3")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,21 @@ class DomainGraphExtractorDfTest extends FunSuite with BeforeAndAfter {
}

test("Domain graph extractor DF") {
val TESTLENGTH = 10
val TESTLENGTH = 82
val df = RecordLoader.loadArchives(arcPath, sc).webgraph()
val dfResult = DomainGraphExtractor(df).collect()

assert(dfResult.length == TESTLENGTH)

assert(dfResult(0).get(0) == "20080430")
assert(dfResult(0).get(0) == "20080430205151")
assert(dfResult(0).get(1) == "archive.org")
assert(dfResult(0).get(2) == "archive.org")
assert(dfResult(0).get(3) == 37511)
assert(dfResult(0).get(3) == 10566)

assert(dfResult(1).get(0) == "20080430")
assert(dfResult(1).get(0) == "20080430204948")
assert(dfResult(1).get(1) == "archive.org")
assert(dfResult(1).get(2) == "etree.org")
assert(dfResult(1).get(3) == 31)
assert(dfResult(1).get(2) == "archive.org")
assert(dfResult(1).get(3) == 7143)
}

after {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class ImageGraphExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 788

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20080430")
assert(dfResults(0).get(0) == "20080430204826")
assert(dfResults(0).get(1) == "http://www.archive.org/")
assert(dfResults(0).get(2) == "http://www.archive.org/images/logoc.jpg")
assert(dfResults(0).get(3) == "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class ImageInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 55

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20080430")
assert(dfResults(0).get(0) == "20080430204829")
assert(dfResults(0).get(1) == "http://www.archive.org/images/logoc.jpg")
assert(dfResults(0).get(2) == "logoc.jpg")
assert(dfResults(0).get(3) == "jpg")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class PDFInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 2

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20190812")
assert(dfResults(0).get(0) == "20190812222529")
assert(
dfResults(0).get(
1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class PresentationProgramInformationExtractorTest
val RESULTSLENGTH = 2

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20190815")
assert(dfResults(0).get(0) == "20190815004338")
assert(
dfResults(0).get(
1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class SpreadsheetInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 4

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20190815")
assert(dfResults(0).get(0) == "20190815004345")
assert(
dfResults(0).get(
1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class VideoInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 1

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20190817")
assert(dfResults(0).get(0) == "20190817230310")
assert(
dfResults(0).get(1) == "https://ruebot.net/2018-11-12%2016.14.11.mp4"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class WebGraphExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 37826

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20080430")
assert(dfResults(0).get(0) == "20080430204826")
assert(dfResults(0).get(1) == "http://www.archive.org/")
assert(dfResults(0).get(2) == "http://www.archive.org")
assert(dfResults(0).get(3) == "http://www.archive.org")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class WebPagesExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 94

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20080430")
assert(dfResults(0).get(0) == "20080430204826")
assert(dfResults(0).get(1) == "archive.org")
assert(dfResults(0).get(2) == "http://www.archive.org/")
assert(dfResults(0).get(3) == "text/html")
Expand Down
4 changes: 2 additions & 2 deletions src/test/scala/io/archivesunleashed/app/WgetWarcTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ class WgetWarcTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 2

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20210511")
assert(dfResults(0).get(0) == "20210511181400")
assert(dfResults(0).get(1) == "http://www.archiveteam.org/")
assert(dfResults(1).get(0) == "20210511")
assert(dfResults(1).get(0) == "20210511181401")
assert(dfResults(1).get(1) == "https://wiki.archiveteam.org/")
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class WordProcessorInformationExtractorTest
val RESULTSLENGTH = 3

assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "20190815")
assert(dfResults(0).get(0) == "20190815004423")
assert(
dfResults(0).get(
1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
assert(r_2(1) == "Web")

val r_3 = imagegraph.take(100)(99)
assert(r_3.get(0) == "20080430")
assert(r_3.get(0) == "20080430204841")
assert(
r_3.get(1) == "http://www.archive.org/details/secretarmiesb00spivrich"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
.select(
removePrefixWWW(extractDomain($"url")).as("Domain"),
$"url".as("url"),
extractDate($"crawl_date", lit("YYYYMMDD")).as("crawl_date"),
extractDate($"crawl_date", lit("YYYYMMDDHHMMSS")).as("crawl_date"),
explode_outer(extractLinks($"url", $"content")).as("link")
)
.filter(
Expand All @@ -296,12 +296,12 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {

assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
assert(results(0).get(2) == "20080430")
assert(results(0).get(2) == "20080430204826")
assert(results(0).get(3) == "http://www.archive.org/")

assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "20080430")
assert(results(1).get(2) == "20080430204826")
assert(
results(1).get(
3
Expand All @@ -310,7 +310,7 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {

assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
assert(results(2).get(2) == "20080430")
assert(results(2).get(2) == "20080430204826")
assert(results(2).get(3) == "http://www.archive.org/details/movies")
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,12 @@ class ExtractHyperlinksTest extends FunSuite with BeforeAndAfter {

assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
assert(results(0).get(2) == "20080430")
assert(results(0).get(2) == "20080430204826")
assert(results(0).get(3) == "http://www.archive.org/")

assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "20080430")
assert(results(1).get(2) == "20080430204826")
assert(
results(1).get(
3
Expand All @@ -88,7 +88,7 @@ class ExtractHyperlinksTest extends FunSuite with BeforeAndAfter {

assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
assert(results(2).get(2) == "20080430")
assert(results(2).get(2) == "20080430204826")
assert(results(2).get(3) == "http://www.archive.org/details/movies")
}

Expand Down
13 changes: 10 additions & 3 deletions src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ import io.archivesunleashed.matchbox.ExtractDate.DateComponent.{
MM,
YYYY,
YYYYMM,
YYYYMMDD
YYYYMMDD,
YYYYMMDDHHMMSS
}
import org.junit.runner.RunWith
import org.scalatest.FunSuite
Expand All @@ -31,18 +32,24 @@ import org.scalatest.junit.JUnitRunner
class ExtractDateTest extends FunSuite {

// Checks that ExtractDate, called with each DateComponent value, returns the
// matching substring of a sample date, and that a null date yields "".
test("Date extraction RDD") {
val date = "20151204"
val date = "20151204235402"
// Offsets into the date string, used below as substring boundaries.
val startSS = 0
val yearSS = 4
val monthSS = 6
val daySS = 8
// NOTE(review): hourSS and minuteSS are not referenced by any assertion in this test.
val hourSS = 10
val minuteSS = 12
val secondSS = 14
assert(ExtractDate(date, YYYY) == date.substring(startSS, yearSS))
assert(ExtractDate(date, MM) == date.substring(yearSS, monthSS))
assert(ExtractDate(date, DD) == date.substring(monthSS, daySS))
assert(ExtractDate(date, YYYYMM) == date.substring(startSS, monthSS))
assert(ExtractDate(date, YYYYMMDD) == date.substring(startSS, daySS))
assert(
ExtractDate(date, YYYYMMDDHHMMSS) == date.substring(startSS, secondSS)
)
// A null date is expected to produce an empty string rather than throw.
// scalastyle:off null
assert(ExtractDate(null, YYYYMMDD) == "")
assert(ExtractDate(null, YYYYMMDDHHMMSS) == "")
// scalastyle:on null
}
}