copy over scala code to new directory for pyspark

archivesunleashed · MapleOx · Sep 28, 2017 · Oct 3, 2017 · Oct 18, 2017 · Oct 18, 2017
commit 4b46c1f82d31a3158de78f5bcfd80da7945e8403
diff --git a/src/main/pyspark/archive/io/ArcRecord.scala b/src/main/pyspark/archive/io/ArcRecord.scala
@@ -0,0 +1,48 @@
+/*
+ * Archives Unleashed Toolkit (AUT):
+ * An open-source platform for analyzing web archives.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.archivesunleashed.pyspark.archive.io
+
+import org.apache.spark.SerializableWritable
+import io.archivesunleashed.data.ArcRecordUtils
+import io.archivesunleashed.io.ArcRecordWritable
+import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent
+import io.archivesunleashed.spark.matchbox.{RemoveHttpHeader, ExtractDate, ExtractDomain}
+
+/**
+  * [[ArchiveRecord]] backed by a single ARC record.
+  *
+  * All members are eager `val`s, so every value is read from the wrapped record while the
+  * instance is being constructed.
+  *
+  * @param r serializable wrapper holding the ARC record writable to read from
+  */
+class ArcRecord(r: SerializableWritable[ArcRecordWritable]) extends ArchiveRecord {
+  /** Record metadata date reduced by `ExtractDate` with `DateComponent.YYYYMMDD`. */
+  val getCrawlDate: String = ExtractDate(r.t.getRecord.getMetaData.getDate, DateComponent.YYYYMMDD)
+
+  /** Record metadata date reduced by `ExtractDate` with `DateComponent.YYYYMM`. */
+  val getCrawlMonth: String = ExtractDate(r.t.getRecord.getMetaData.getDate, DateComponent.YYYYMM)
+
+  /** MIME type as stored in the ARC record metadata. */
+  val getMimeType: String = r.t.getRecord.getMetaData.getMimetype
+
+  /** URL as stored in the ARC record metadata. */
+  val getUrl: String = r.t.getRecord.getMetaData.getUrl
+
+  /** Result of `ExtractDomain` applied to [[getUrl]]. */
+  val getDomain: String = ExtractDomain(getUrl)
+
+  /** Raw record body as returned by `ArcRecordUtils.getBodyContent`. */
+  val getContentBytes: Array[Byte] = ArcRecordUtils.getBodyContent(r.t.getRecord)
+
+  /** [[getContentBytes]] decoded with the platform default charset. */
+  val getContentString: String = new String(getContentBytes)
+
+  /**
+    * Content bytes with the leading HTTP header block removed.
+    *
+    * The header terminator is located in the raw bytes rather than in [[getContentString]]:
+    * a character index of the decoded string is not a valid byte offset once the platform
+    * charset decodes several bytes into one character. Content that does not start with
+    * "HTTP/", or that contains no header terminator, is returned unchanged.
+    */
+  val getImageBytes: Array[Byte] = {
+    if (getContentString.startsWith("HTTP/")) {
+      // ISO-8859-1 maps every char to exactly one byte.
+      // NOTE(review): presumably headerEnd is plain ASCII (CRLF CRLF) -- confirm in RemoveHttpHeader.
+      val terminator = RemoveHttpHeader.headerEnd.getBytes(java.nio.charset.StandardCharsets.ISO_8859_1)
+      val headerEndAt = getContentBytes.indexOfSlice(terminator.toSeq)
+      if (headerEndAt >= 0) getContentBytes.drop(headerEndAt + terminator.length) else getContentBytes
+    } else {
+      getContentBytes
+    }
+  }
+}
diff --git a/src/main/pyspark/archive/io/ArchiveRecord.scala b/src/main/pyspark/archive/io/ArchiveRecord.scala
@@ -0,0 +1,35 @@
+/*
+ * Archives Unleashed Toolkit (AUT):
+ * An open-source platform for analyzing web archives.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.archivesunleashed.pyspark.archive.io
+
+/**
+  * Common read-only view over one web archive record.
+  *
+  * The trait only declares the members; the descriptions below summarise how the
+  * implementations in this package (`ArcRecord`, `WarcRecord`, `GenericArchiveRecord`)
+  * populate them.
+  */
+trait ArchiveRecord extends Serializable {
+  /** Crawl date, produced by `ExtractDate` with `DateComponent.YYYYMMDD`. */
+  val getCrawlDate: String
+
+  /** Crawl month, produced by `ExtractDate` with `DateComponent.YYYYMM`. */
+  val getCrawlMonth: String
+
+  /** URL taken from the record's metadata (ARC) or header (WARC). */
+  val getUrl: String
+
+  /** Result of applying `ExtractDomain` to the record URL. */
+  val getDomain: String
+
+  /** MIME type: read from ARC metadata, or derived from the content bytes for WARC. */
+  val getMimeType: String
+
+  /** The content bytes decoded into a `String` (implementations use the platform default charset). */
+  val getContentString: String
+
+  /** Raw content bytes of the record. */
+  val getContentBytes: Array[Byte]
+
+  /**
+    * Content bytes intended for image decoding: when the content starts with "HTTP/" the
+    * implementations cut off everything up to and including the HTTP header terminator,
+    * otherwise this is the same array as [[getContentBytes]].
+    */
+  val getImageBytes: Array[Byte]
+}
diff --git a/src/main/pyspark/archive/io/GenericArchiveRecord.scala b/src/main/pyspark/archive/io/GenericArchiveRecord.scala
@@ -0,0 +1,96 @@
+/*
+ * Archives Unleashed Toolkit (AUT):
+ * An open-source platform for analyzing web archives.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.archivesunleashed.pyspark.archive.io
+
+import java.text.SimpleDateFormat
+
+import org.apache.spark.SerializableWritable
+import org.archive.io.arc.ARCRecord
+import org.archive.io.warc.WARCRecord
+
+import org.archive.util.ArchiveUtils
+import io.archivesunleashed.data.{ArcRecordUtils, WarcRecordUtils}
+import io.archivesunleashed.io.GenericArchiveRecordWritable
+import io.archivesunleashed.io.GenericArchiveRecordWritable.ArchiveFormat
+import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent
+import io.archivesunleashed.spark.matchbox.{RemoveHttpHeader, ExtractDate, ExtractDomain}
+
+/**
+  * [[ArchiveRecord]] backed by a record that may be either ARC or WARC.
+  *
+  * The format reported by the wrapped writable decides which underlying record is used.
+  * All members are eager `val`s, so every value is read while the instance is being
+  * constructed. A format other than ARC is treated as WARC by every accessor, exactly as
+  * before; if it is neither, `warcRecord` is null and construction fails.
+  *
+  * @param r serializable wrapper holding the generic record writable to read from
+  */
+class GenericArchiveRecord(r: SerializableWritable[GenericArchiveRecordWritable]) extends ArchiveRecord {
+  // NOTE(review): these stay public mutable fields for compatibility with existing callers.
+  // They are also instance state of a Serializable class -- confirm ARCRecord/WARCRecord can
+  // actually be serialized, or whether these should be @transient.
+
+  /** The underlying record when the format is ARC, otherwise null. */
+  var arcRecord: ARCRecord =
+    if (r.t.getFormat == ArchiveFormat.ARC) r.t.getRecord.asInstanceOf[ARCRecord] else null
+
+  /** The underlying record when the format is WARC, otherwise null. */
+  var warcRecord: WARCRecord =
+    if (r.t.getFormat == ArchiveFormat.WARC) r.t.getRecord.asInstanceOf[WARCRecord] else null
+
+  /** Parser for the date string found in WARC headers. */
+  val ISO8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX")
+
+  // Derived once and shared by getCrawlDate and getCrawlMonth: the ARC metadata date is used
+  // as-is, the WARC header date is parsed and converted with ArchiveUtils.get14DigitDate.
+  private val crawlTimestamp: String = {
+    if (r.t.getFormat == ArchiveFormat.ARC) {
+      arcRecord.getMetaData.getDate
+    } else {
+      ArchiveUtils.get14DigitDate(ISO8601.parse(warcRecord.getHeader.getDate))
+    }
+  }
+
+  /** Crawl timestamp reduced by `ExtractDate` with `DateComponent.YYYYMMDD`. */
+  val getCrawlDate: String = ExtractDate(crawlTimestamp, DateComponent.YYYYMMDD)
+
+  /** Crawl timestamp reduced by `ExtractDate` with `DateComponent.YYYYMM`. */
+  val getCrawlMonth: String = ExtractDate(crawlTimestamp, DateComponent.YYYYMM)
+
+  /** Raw content: `ArcRecordUtils.getBodyContent` for ARC, `WarcRecordUtils.getContent` otherwise. */
+  val getContentBytes: Array[Byte] = {
+    if (r.t.getFormat == ArchiveFormat.ARC) {
+      ArcRecordUtils.getBodyContent(arcRecord)
+    } else {
+      WarcRecordUtils.getContent(warcRecord)
+    }
+  }
+
+  /** [[getContentBytes]] decoded with the platform default charset. */
+  val getContentString: String = new String(getContentBytes)
+
+  /** MIME type: read from ARC metadata, or derived from the content bytes for WARC. */
+  val getMimeType: String = {
+    if (r.t.getFormat == ArchiveFormat.ARC) {
+      arcRecord.getMetaData.getMimetype
+    } else {
+      WarcRecordUtils.getWarcResponseMimeType(getContentBytes)
+    }
+  }
+
+  /** URL: read from ARC metadata, or from the WARC header. */
+  val getUrl: String = {
+    if (r.t.getFormat == ArchiveFormat.ARC) {
+      arcRecord.getMetaData.getUrl
+    } else {
+      warcRecord.getHeader.getUrl
+    }
+  }
+
+  /** Result of `ExtractDomain` applied to [[getUrl]]. */
+  val getDomain: String = ExtractDomain(getUrl)
+
+  /**
+    * Content bytes with the leading HTTP header block removed.
+    *
+    * The header terminator is located in the raw bytes rather than in [[getContentString]]:
+    * a character index of the decoded string is not a valid byte offset once the platform
+    * charset decodes several bytes into one character. Content that does not start with
+    * "HTTP/", or that contains no header terminator, is returned unchanged.
+    */
+  val getImageBytes: Array[Byte] = {
+    if (getContentString.startsWith("HTTP/")) {
+      // ISO-8859-1 maps every char to exactly one byte.
+      // NOTE(review): presumably headerEnd is plain ASCII (CRLF CRLF) -- confirm in RemoveHttpHeader.
+      val terminator = RemoveHttpHeader.headerEnd.getBytes(java.nio.charset.StandardCharsets.ISO_8859_1)
+      val headerEndAt = getContentBytes.indexOfSlice(terminator.toSeq)
+      if (headerEndAt >= 0) getContentBytes.drop(headerEndAt + terminator.length) else getContentBytes
+    } else {
+      getContentBytes
+    }
+  }
+}
diff --git a/src/main/pyspark/archive/io/WarcRecord.scala b/src/main/pyspark/archive/io/WarcRecord.scala
@@ -0,0 +1,53 @@
+/*
+ * Archives Unleashed Toolkit (AUT):
+ * An open-source platform for analyzing web archives.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.archivesunleashed.pyspark.archive.io
+
+import java.text.SimpleDateFormat
+
+import org.apache.spark.SerializableWritable
+import org.archive.util.ArchiveUtils
+import io.archivesunleashed.data.WarcRecordUtils
+import io.archivesunleashed.io.WarcRecordWritable
+import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent
+import io.archivesunleashed.spark.matchbox.{RemoveHttpHeader, ExtractDate, ExtractDomain}
+
+/**
+  * [[ArchiveRecord]] backed by a single WARC record.
+  *
+  * All members are eager `val`s, so every value is read from the wrapped record while the
+  * instance is being constructed.
+  *
+  * @param r serializable wrapper holding the WARC record writable to read from
+  */
+class WarcRecord(r: SerializableWritable[WarcRecordWritable]) extends ArchiveRecord {
+  /** Parser for the date string found in WARC headers. */
+  val ISO8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX")
+
+  // Header date parsed once and converted with ArchiveUtils.get14DigitDate; shared by
+  // getCrawlDate and getCrawlMonth instead of parsing the same header twice.
+  private val crawlTimestamp: String =
+    ArchiveUtils.get14DigitDate(ISO8601.parse(r.t.getRecord.getHeader.getDate))
+
+  /** Crawl timestamp reduced by `ExtractDate` with `DateComponent.YYYYMMDD`. */
+  val getCrawlDate: String = ExtractDate(crawlTimestamp, DateComponent.YYYYMMDD)
+
+  /** Crawl timestamp reduced by `ExtractDate` with `DateComponent.YYYYMM`. */
+  val getCrawlMonth: String = ExtractDate(crawlTimestamp, DateComponent.YYYYMM)
+
+  /** Raw record content as returned by `WarcRecordUtils.getContent`. */
+  val getContentBytes: Array[Byte] = WarcRecordUtils.getContent(r.t.getRecord)
+
+  /** [[getContentBytes]] decoded with the platform default charset. */
+  val getContentString: String = new String(getContentBytes)
+
+  /** MIME type derived from the content bytes by `WarcRecordUtils.getWarcResponseMimeType`. */
+  val getMimeType: String = WarcRecordUtils.getWarcResponseMimeType(getContentBytes)
+
+  /** URL as stored in the WARC record header. */
+  val getUrl: String = r.t.getRecord.getHeader.getUrl
+
+  /** Result of `ExtractDomain` applied to [[getUrl]]. */
+  val getDomain: String = ExtractDomain(getUrl)
+
+  /**
+    * Content bytes with the leading HTTP header block removed.
+    *
+    * The header terminator is located in the raw bytes rather than in [[getContentString]]:
+    * a character index of the decoded string is not a valid byte offset once the platform
+    * charset decodes several bytes into one character. Content that does not start with
+    * "HTTP/", or that contains no header terminator, is returned unchanged.
+    */
+  val getImageBytes: Array[Byte] = {
+    if (getContentString.startsWith("HTTP/")) {
+      // ISO-8859-1 maps every char to exactly one byte.
+      // NOTE(review): presumably headerEnd is plain ASCII (CRLF CRLF) -- confirm in RemoveHttpHeader.
+      val terminator = RemoveHttpHeader.headerEnd.getBytes(java.nio.charset.StandardCharsets.ISO_8859_1)
+      val headerEndAt = getContentBytes.indexOfSlice(terminator.toSeq)
+      if (headerEndAt >= 0) getContentBytes.drop(headerEndAt + terminator.length) else getContentBytes
+    } else {
+      getContentBytes
+    }
+  }
+}
diff --git a/src/main/pyspark/matchbox/ComputeImageSize.scala b/src/main/pyspark/matchbox/ComputeImageSize.scala
@@ -0,0 +1,41 @@
+/*
+ * Archives Unleashed Toolkit (AUT):
+ * An open-source platform for analyzing web archives.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.archivesunleashed.pyspark.matchbox
+
+import java.io.ByteArrayInputStream
+import javax.imageio.ImageIO
+
+/**
+  * Computes the pixel dimensions of an encoded image.
+  *
+  * Created by youngbinkim on 7/7/16.
+  */
+object ComputeImageSize {
+  /**
+    * Decodes the given bytes with `ImageIO` and reports the image dimensions.
+    *
+    * @param bytes encoded image data
+    * @return `(width, height)` of the decoded image, or `(0, 0)` when `ImageIO.read` yields
+    *         no image or decoding throws (the stack trace is printed in that case)
+    */
+  def apply(bytes: Array[Byte]): (Int, Int) = {
+    val noImage = (0, 0)
+    // Built outside the try block on purpose: a failure here is not swallowed.
+    val stream = new ByteArrayInputStream(bytes)
+
+    try {
+      // ImageIO.read returns null when the data cannot be decoded as an image.
+      Option(ImageIO.read(stream)) match {
+        case Some(decoded) => (decoded.getWidth(), decoded.getHeight())
+        case None => noImage
+      }
+    } catch {
+      case failure: Throwable =>
+        failure.printStackTrace()
+        noImage
+    }
+  }
+}
diff --git a/src/main/pyspark/matchbox/ComputeMD5.scala b/src/main/pyspark/matchbox/ComputeMD5.scala
@@ -0,0 +1,35 @@
+/*
+ * Archives Unleashed Toolkit (AUT):
+ * An open-source platform for analyzing web archives.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.archivesunleashed.pyspark.matchbox
+
+import java.security.MessageDigest
+
+
+/**
+  * Computes the MD5 checksum of a byte array.
+  */
+object ComputeMD5 {
+  /**
+    * Hashes the given bytes with MD5 and renders the digest as text.
+    *
+    * The 16 digest bytes are arbitrary binary data, so they are hex-encoded rather than
+    * decoded as characters; decoding them with the platform charset produced a lossy,
+    * platform-dependent string.
+    *
+    * @param bytes data to hash
+    * @return the digest as 32 lowercase hexadecimal characters
+    */
+  def apply(bytes: Array[Byte]): String = {
+    MessageDigest.getInstance("MD5").digest(bytes).map("%02x".format(_)).mkString
+  }
+}
diff --git a/src/main/pyspark/matchbox/DetectLanguage.scala b/src/main/pyspark/matchbox/DetectLanguage.scala
@@ -0,0 +1,26 @@
+/*
+ * Archives Unleashed Toolkit (AUT):
+ * An open-source platform for analyzing web archives.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.archivesunleashed.pyspark.matchbox
+
+import org.apache.tika.language.LanguageIdentifier
+
+/**
+  * Detects the language of a piece of text with Tika's `LanguageIdentifier`.
+  */
+object DetectLanguage {
+  /**
+    * @param input text to inspect
+    * @return the empty string for empty input, otherwise the language reported by
+    *         `LanguageIdentifier.getLanguage`
+    */
+  def apply(input: String): String = {
+    if (input.length == 0) {
+      ""
+    } else {
+      val identifier = new LanguageIdentifier(input)
+      identifier.getLanguage
+    }
+  }
+}
diff --git a/src/main/pyspark/matchbox/DetectMimeTypeTika.scala b/src/main/pyspark/matchbox/DetectMimeTypeTika.scala
@@ -0,0 +1,39 @@
+/*
+ * Archives Unleashed Toolkit (AUT):
+ * An open-source platform for analyzing web archives.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.archivesunleashed.pyspark.matchbox
+
+import java.io.ByteArrayInputStream
+
+import org.apache.tika.Tika
+import org.apache.tika.detect.DefaultDetector
+import org.apache.tika.parser.AutoDetectParser
+
+/**
+  * A UDF that detects the MIME type of a piece of content with Apache Tika.
+  */
+object DetectMimeTypeTika {
+  /**
+    * @param content content to inspect; its bytes are taken with the platform default charset
+    * @return "N/A" for empty content, otherwise the type reported by `Tika.detect`
+    */
+  def apply(content: String): String = {
+    if (content.nonEmpty) {
+      // The same detector instance backs both the parser and the Tika facade.
+      val detector = new DefaultDetector()
+      val tika = new Tika(detector, new AutoDetectParser(detector))
+      tika.detect(new ByteArrayInputStream(content.getBytes))
+    } else {
+      "N/A"
+    }
+  }
+}