Skip to content

Commit

Permalink
Add ComputeSHA1 method; resolves #363.
Browse files Browse the repository at this point in the history
- Update tests where needed
- Add SHA1 method to ExtractImageDetails
- Add SHA1 to DataFrames binary extraction and analysis
  • Loading branch information
ruebot committed Oct 8, 2019
1 parent 9b3e025 commit 21f9434
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 25 deletions.
32 changes: 32 additions & 0 deletions src/main/scala/io/archivesunleashed/matchbox/ComputeSHA1.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.matchbox

import java.security.MessageDigest

/** Compute SHA1 checksum. */
// scalastyle:off object.name
object ComputeSHA1 {
  // scalastyle:on object.name
  /** Computes the SHA1 checksum of a byte array (e.g. an image).
   *
   * @param bytes the raw bytes to digest
   * @return SHA1 checksum rendered as a lowercase hexadecimal string
   *         (40 characters; an empty array yields the SHA1 of zero bytes).
   */
  def apply(bytes: Array[Byte]): String = {
    // Each byte is zero-padded to two hex digits so the digest is fixed-width.
    MessageDigest.getInstance("SHA1").digest(bytes).map("%02x".format(_)).mkString
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ class ImageDetails(imageUrl: String, imageType: String, bytes: Array[Byte]) {
val height = dimensions._2
val url: String = imageUrl
val mimeType: String = imageType
val hash: String = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash: String = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash: String = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val body: String = Base64.getEncoder.encodeToString(bytes)
}

Expand Down
61 changes: 38 additions & 23 deletions src/main/scala/io/archivesunleashed/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,9 @@ package object archivesunleashed {
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMime(url.getPath(), mimeTypeTika)
(r.getUrl, filename, extension, r.getMimeType, mimeTypeTika,
image.width, image.height, image.hash, image.body)
image.width, image.height, image.md5Hash, image.sha1Hash, image.body)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10))

val schema = new StructType()
.add(StructField("url", StringType, true))
Expand All @@ -175,6 +175,7 @@ package object archivesunleashed {
.add(StructField("width", IntegerType, true))
.add(StructField("height", IntegerType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
Expand All @@ -190,15 +191,16 @@ package object archivesunleashed {
.filter(r => r._2 == "application/pdf")
.map(r => {
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMime(url.getPath(), r._2)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))

val schema = new StructType()
.add(StructField("url", StringType, true))
Expand All @@ -207,6 +209,7 @@ package object archivesunleashed {
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
Expand All @@ -222,15 +225,16 @@ package object archivesunleashed {
.filter(r => r._2.startsWith("audio/"))
.map(r => {
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMime(url.getPath(), r._2)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))

val schema = new StructType()
.add(StructField("url", StringType, true))
Expand All @@ -239,6 +243,7 @@ package object archivesunleashed {
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
Expand All @@ -254,15 +259,16 @@ package object archivesunleashed {
.filter(r => r._2.startsWith("video/"))
.map(r => {
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMime(url.getPath(), r._2)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))

val schema = new StructType()
.add(StructField("url", StringType, true))
Expand All @@ -271,6 +277,7 @@ package object archivesunleashed {
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
Expand Down Expand Up @@ -311,7 +318,8 @@ package object archivesunleashed {
&& r._2 == "text/plain"))
.map(r => {
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
Expand All @@ -325,9 +333,9 @@ package object archivesunleashed {
}
val extension = GetExtensionMime(url.getPath(), mimeType)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))

val schema = new StructType()
.add(StructField("url", StringType, true))
Expand All @@ -336,6 +344,7 @@ package object archivesunleashed {
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
Expand Down Expand Up @@ -363,15 +372,16 @@ package object archivesunleashed {
|| r._2 == "application/vnd.ms-powerpoint.template.macroEnabled.12")
.map(r => {
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMime(url.getPath(), r._2)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))

val schema = new StructType()
.add(StructField("url", StringType, true))
Expand All @@ -380,6 +390,7 @@ package object archivesunleashed {
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
Expand Down Expand Up @@ -412,15 +423,16 @@ package object archivesunleashed {
|| r._2 == "application/rtf")
.map(r => {
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMime(url.getPath(), r._2)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))

val schema = new StructType()
.add(StructField("url", StringType, true))
Expand All @@ -429,6 +441,7 @@ package object archivesunleashed {
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
Expand All @@ -447,15 +460,16 @@ package object archivesunleashed {
|| !r.getUrl.toLowerCase.endsWith(".html"))
.map(r => {
val bytes = r.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = FilenameUtils.getExtension(url.getPath())
(r.getUrl, filename, extension, r.getMimeType,
DetectMimeTypeTika(r.getBinaryBytes), hash, encodedBytes)
DetectMimeTypeTika(r.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))

val schema = new StructType()
.add(StructField("url", StringType, true))
Expand All @@ -464,6 +478,7 @@ package object archivesunleashed {
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class ExtractImageDetailsTest extends FunSuite with BeforeAndAfter {
.extractImageDetailsDF()

val extracted = df.select("url", "mime_type_web_server", "mime_type_tika",
"width", "height", "md5")
"width", "height", "md5", "sha1")
.orderBy(desc("md5")).head(2).toList
assert(extracted.size == 2)
assert("http://www.archive.org/images/mediatype_movies.gif" == extracted(0)(0))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,9 @@ class StringUtilsTest extends FunSuite {
val s: String = "unesco.org";
assert(ComputeMD5(s.getBytes) == "8e8decc8e8107bcf9d3896f3222b77d8");
}
// Verify ComputeSHA1 against a digest precomputed with a reference tool.
test("sha1 hash") {
  val s: String = "unesco.org"
  assert(ComputeSHA1(s.getBytes) == "2d0e5377157172045d87befe46e157cda42c4f6e")
}

}

0 comments on commit 21f9434

Please sign in to comment.