archivesunleashed · MapleOx · Sep 28, 2017 · Oct 3, 2017 · Oct 18, 2017 · Oct 18, 2017
diff --git a/src/main/python/RecordLoader.py b/src/main/python/RecordLoader.py
@@ -20,47 +20,47 @@
 from RecordRDD import RecordRDD
 
 def loadArcAsRDD(path, sc, spark):
-  rlph = sc._jvm.io.archivesunleashed.pyspark.matchbox.RecordLoaderPythonHelper
+  rlph = sc._jvm.io.archivesunleashed.spark.pythonhelpers.RecordLoaderPythonHelper
   df = rlph.loadArc(path, sc._jsc, spark._jsparkSession)
   df.createTempView("df")
   pdf = spark.table("df")
   spark.catalog.dropTempView("df")
   return pdf.rdd
 
 def loadArcAsDF(path, sc, spark):
-  rlph = sc._jvm.io.archivesunleashed.pyspark.matchbox.RecordLoaderPythonHelper
+  rlph = sc._jvm.io.archivesunleashed.spark.pythonhelpers.RecordLoaderPythonHelper
   df = rlph.loadArc(path, sc._jsc, spark._jsparkSession)
   df.createTempView("df")
   pdf = spark.table("df")
   spark.catalog.dropTempView("df")
   return pdf
 
 def loadWarcAsRDD(path, sc, spark):
-  rlph = sc._jvm.io.archivesunleashed.pyspark.matchbox.RecordLoaderPythonHelper
+  rlph = sc._jvm.io.archivesunleashed.spark.pythonhelpers.RecordLoaderPythonHelper
   df = rlph.loadWarc(path, sc._jsc, spark._jsparkSession)
   df.createTempView("df")
   pdf = spark.table("df")
   spark.catalog.dropTempView("df")
   return pdf.rdd
 
 def loadWarcAsDF(path, sc, spark):
-  rlph = sc._jvm.io.archivesunleashed.pyspark.matchbox.RecordLoaderPythonHelper
+  rlph = sc._jvm.io.archivesunleashed.spark.pythonhelpers.RecordLoaderPythonHelper
   df = rlph.loadWarc(path, sc._jsc, spark._jsparkSession)
   df.createTempView("df")
   pdf = spark.table("df")
   spark.catalog.dropTempView("df")
   return pdf
 
 def loadArchivesAsDF(path, sc, spark):
-  rlph = sc._jvm.io.archivesunleashed.pyspark.matchbox.RecordLoaderPythonHelper
+  rlph = sc._jvm.io.archivesunleashed.spark.pythonhelpers.RecordLoaderPythonHelper
   df = rlph.loadArchives(path, sc._jsc, spark._jsparkSession)
   df.createTempView("df")
   pdf = spark.table("df")
   spark.catalog.dropTempView("df")
   return pdf
 
 def loadArchivesAsRDD(path, sc, spark):
-  rlph = sc._jvm.io.archivesunleashed.pyspark.matchbox.RecordLoaderPythonHelper
+  rlph = sc._jvm.io.archivesunleashed.spark.pythonhelpers.RecordLoaderPythonHelper
   df = rlph.loadArchives(path, sc._jsc, spark._jsparkSession)
   df.createTempView("df")
   pdf = spark.table("df")

diff --git a/src/main/python/scripts/extractLinkScript.py b/src/main/python/scripts/extractLinkScript.py
@@ -0,0 +1,37 @@
+# Archives Unleashed Toolkit (AUT):
+# An open-source platform for analyzing web archives.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import RecordLoader
+from DFTransformations import *
+from ExtractDomain import ExtractDomain
+from ExtractLinks import ExtractLinks
+from pyspark.sql import SparkSession
+
+if __name__ == "__main__":
+    # Replace with the path to your own archive file.
+    path = "/Users/Prince/Projects/pyaut/aut/example.arc.gz"
+
+    spark = SparkSession.builder.appName("extractLinks").getOrCreate()
+    sc = spark.sparkContext
+    try:  # guarantee the session is stopped even if a Spark step fails
+        rdd = RecordLoader.loadArchivesAsRDD(path, sc, spark)
+        rdd1 = rdd.flatMap(lambda r: ExtractLinks(r.url, r.contentString))
+        rdd2 = rdd1.map(lambda r: (ExtractDomain(r[0]), ExtractDomain(r[1])))
+        rdd3 = rdd2.filter(lambda r: r[0] is not None and r[0] != "" and r[1] is not None and r[1] != "")
+        rdd4 = countItems(rdd3).filter(lambda r: r[1] > 5)  # keep entries whose second field exceeds 5
+
+        print(rdd4.take(10))
+    finally:
+        spark.stop()
diff --git a/src/main/python/scripts/filterByDateScript.py b/src/main/python/scripts/filterByDateScript.py
@@ -0,0 +1,37 @@
+# Archives Unleashed Toolkit (AUT):
+# An open-source platform for analyzing web archives.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import RecordLoader
+from DFTransformations import *
+from ExtractDomain import ExtractDomain
+from ExtractLinks import ExtractLinks
+from ExtractDate import DateComponent
+from RemoveHTML import RemoveHTML
+from pyspark.sql import SparkSession
+
+if __name__ == "__main__":
+    # Replace with the path to your own archive file.
+    path = "/Users/Prince/Projects/pyaut/aut/example.arc.gz"
+
+    spark = SparkSession.builder.appName("filterByDate").getOrCreate()
+    sc = spark.sparkContext
+    try:  # guarantee the session is stopped even if a Spark step fails
+        df = RecordLoader.loadArchivesAsDF(path, sc, spark)
+        filtered_df = keepDate(df, "2008", DateComponent.YYYY).filter(df['url'].like("%archive%"))
+        rdd = filtered_df.rdd
+        rdd.map(lambda r: (r.crawlDate, r.domain, r.url, RemoveHTML(r.contentString))) \
+           .saveAsTextFile("out/")
+    finally:
+        spark.stop()
diff --git a/src/main/scala/io/archivesunleashed/spark/pythonhelpers/RecordLoaderPythonHelper.scala b/src/main/scala/io/archivesunleashed/spark/pythonhelpers/RecordLoaderPythonHelper.scala
@@ -16,14 +16,6 @@ import io.archivesunleashed.spark.matchbox.RecordLoader
 import io.archivesunleashed.spark.rdd.RecordRDD._
 
 object RecordLoaderPythonHelper {
-//  def loadArc(path: String, jssc: JavaSparkContext): JavaRDD[ArcRecordWritable] = {
-//    val sc = jssc.sc
-//    //val rdd = RecordLoader.loadArc(path, sc)
-//    val rdd = sc.newAPIHadoopFile(path, classOf[WacArcInputFormat], classOf[LongWritable], classOf[ArcRecordWritable])
-//      .map(r => r._2)
-//    val jrdd = new JavaRDD(rdd)
-//    jrdd
-//  }
 
   def loadArc(path: String, jssc: JavaSparkContext, spark: SparkSession): DataFrame = {
     val sc = jssc.sc
@@ -50,7 +42,4 @@ object RecordLoaderPythonHelper {
     sc.textFile(path).filter(line => !line.startsWith("{\"delete\":"))
       .map(line => try { parse(line) } catch { case e: Exception => null }).filter(x => x != null)
 
-  def add(a: Int, b: Int): Int = a + b
-
-  def square(a : List[Int]): List[Int] = a.map(x => x*x)
 }