Skip to content

Commit

Permalink
Implements extracting last_modified_date of a resource where availa…
Browse files Browse the repository at this point in the history
…ble. (#547)

* Adds `getLastModified` for `SparklingArchiveRecord`
* Adds `CovertLastModifiedDate` to convert RFC 1123 dates to `yyyyMMddHHmmss`
  * See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified
* Implement `last_modified_date` column for
  * `.all()`
  * `.webpages()`
  * `.images()`
  * `.pdfs()`
  * `.audio()`
  * `.videos()`
  * `.spreadsheets()`
  * `.presentationProgramFiles()`
  * `.wordProcessorFiles()`
  * `.css()`
  * `.html()`
  * `.js()`
  * `.json()`
  * `.plainText()`
  * `.xml()`
* Update tests
* Resolves #546
  • Loading branch information
ruebot committed Nov 7, 2022
1 parent eeaa464 commit cdf8e76
Show file tree
Hide file tree
Showing 26 changed files with 259 additions and 105 deletions.
3 changes: 3 additions & 0 deletions src/main/scala/io/archivesunleashed/ArchiveRecord.scala
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,7 @@ trait ArchiveRecord extends Serializable {

/** Returns payload digest (SHA1). */
def getPayloadDigest: String

/** Returns the Last-Modified date from the HTTP headers as a string. */
def getLastModified: String
}
11 changes: 10 additions & 1 deletion src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@ import io.archivesunleashed.matchbox.ExtractDomain
import org.apache.tika.io.BoundedInputStream
import org.archive.webservices.sparkling.http.HttpMessage
import org.archive.webservices.sparkling.io.IOUtil
import org.archive.webservices.sparkling.util.{ManagedVal, ValueSupplier}
import org.archive.webservices.sparkling.util.{
ManagedVal,
RegexUtil,
ValueSupplier
}
import org.archive.webservices.sparkling.warc.{WarcHeaders, WarcRecord}
import scala.util.Try

Expand Down Expand Up @@ -60,6 +64,11 @@ class SparklingArchiveRecord(
new SparklingArchiveRecord(filename, meta, payload, maxBodyLength)
}

/** Returns the value stored under the `last-modified` key of the record's
  * HTTP header map, or an empty string when there is no HTTP message or the
  * header is absent.
  */
override def getLastModified: String = {
  val lastModified = for {
    message <- http(warc)
    value <- message.headerMap.get("last-modified")
  } yield value
  lastModified.getOrElse("")
}

/** Returns `filename` unchanged as the archive filename. */
override def getArchiveFilename: String = filename

override def getCrawlDate: String =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ object AudioInformationExtractor {
// scalastyle:on
d.select(
$"crawl_date",
$"last_modified_date",
$"url",
$"filename",
$"extension",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ object ImageInformationExtractor {
// scalastyle:on
d.select(
$"crawl_date",
$"last_modified_date",
$"url",
$"filename",
$"extension",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ object PDFInformationExtractor {
// scalastyle:on
d.select(
$"crawl_date",
$"last_modified_date",
$"url",
$"filename",
$"extension",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ object PresentationProgramInformationExtractor {
// scalastyle:on
d.select(
$"crawl_date",
$"last_modified_date",
$"url",
$"filename",
$"extension",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ object SpreadsheetInformationExtractor {
// scalastyle:on
d.select(
$"crawl_date",
$"last_modified_date",
$"url",
$"filename",
$"extension",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ object VideoInformationExtractor {
// scalastyle:on
d.select(
$"crawl_date",
$"last_modified_date",
$"url",
$"filename",
$"extension",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ object WordProcessorInformationExtractor {
// scalastyle:on
d.select(
$"crawl_date",
$"last_modified_date",
$"url",
$"filename",
$"extension",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.matchbox

/** Converts RFC 1123 dates (e.g. `Wed, 21 Oct 2015 07:28:00 GMT`) to
  * `yyyyMMddHHmmss`.
  *
  * The conversion is a lenient, token-based extraction rather than a strict
  * parse: anything that cannot be resolved to a month, a four-digit year, a
  * two-digit day and a six-digit time yields an empty string.
  */
object CovertLastModifiedDate {

  /** Lower-case month abbreviations paired with their zero-padded,
    * two-digit month number (`"jan" -> "01"` ... `"dec" -> "12"`).
    */
  val months: Seq[(String, String)] = Seq(
    "jan",
    "feb",
    "mar",
    "apr",
    "may",
    "jun",
    "jul",
    "aug",
    "sep",
    "oct",
    "nov",
    "dec"
  ).zipWithIndex.map { case (s, d) => (s, ("0" + (d + 1)).takeRight(2)) }

  /** Tells whether a token consists solely of ASCII digits. */
  private def isAllDigits(token: String): Boolean =
    token.nonEmpty && token.forall(c => c >= '0' && c <= '9')

  /** Converts last_modified_date to yyyyMMddHHmmss.
    *
    * @param lastModifiedDate date returned by `getLastModified`, formatted as RFC 1123;
    *                         `null` is treated like an empty string
    * @return last_modified_date as yyyyMMddHHmmss, or an empty string when the
    *         input is empty or the date components cannot be extracted.
    */
  def apply(lastModifiedDate: String): String = {
    // Credit: Helge Holzmann (@helgeho)
    // Adapted from https://github.com/archivesunleashed/aut/pull/547#issuecomment-1302094573
    val lc = Option(lastModifiedDate).getOrElse("").toLowerCase
    val converted =
      months.find(m => lc.contains(m._1)).map(_._2).flatMap { month =>
        // Date parts are recognised by their length once the colons are
        // removed from the time: 4 = year, 2 = day, 6 = HHmmss. Only purely
        // numeric tokens are considered, so that a two-letter zone such as
        // "UT" cannot be mistaken for the day of the month.
        val digitsByLength = lc
          .replace(":", "")
          .split(' ')
          .drop(1) // skip the leading day-of-week token
          .filter(isAllDigits)
          .map(token => (token.length, token))
          .toMap
        for {
          year <- digitsByLength.get(4)
          day <- digitsByLength.get(2)
          time <- digitsByLength.get(6)
        } yield year + month + day + time
      }
    converted.getOrElse("")
  }
}
Loading

0 comments on commit cdf8e76

Please sign in to comment.