Skip to content

Commit

Permalink
Add Python implementation of SaveBytes.
Browse files Browse the repository at this point in the history
- Resolves #478
- Tweak formatting in DataFrameLoader
  • Loading branch information
ruebot committed Jun 16, 2020
1 parent d8cca11 commit 9cb3028
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 11 deletions.
3 changes: 2 additions & 1 deletion src/main/python/aut/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from aut.app import ExtractPopularImages, WriteGEXF, WriteGraphML
from aut.app import ExtractPopularImages, SaveBytes, WriteGEXF, WriteGraphML
from aut.common import WebArchive
from aut.udfs import (
compute_image_size,
Expand All @@ -19,6 +19,7 @@

__all__ = [
"ExtractPopularImages",
"SaveBytes",
"WebArchive",
"WriteGEXF",
"WriteGraphML",
Expand Down
17 changes: 17 additions & 0 deletions src/main/python/aut/app.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import base64
import hashlib
import os
from xml.sax.saxutils import escape

from pyspark.sql import DataFrame
Expand All @@ -19,6 +21,21 @@ def ExtractPopularImages(d, limit, min_width, min_height):
)


def SaveBytes(data, bytes_path):
    """Write base64-encoded payloads to individual files under *bytes_path*.

    The output directory is created if it does not already exist. Each row
    of *data* is indexed positionally: ``row[0]`` is used as the file
    extension and ``row[1]`` is base64-decoded to obtain the file content.
    Every file is named ``<md5 hex digest of the decoded content>.<row[0]>``,
    so rows with identical content and extension overwrite the same file.

    Args:
        data: Iterable of indexable rows as described above.
        bytes_path: Directory that receives the decoded files.
    """
    os.makedirs(bytes_path, exist_ok=True)

    for row in data:
        # Decode once and reuse the result for both the hash and the write;
        # payloads can be large binary files.
        content = base64.b64decode(row[1])
        file_name = hashlib.md5(content).hexdigest() + "." + row[0]
        with open(os.path.join(bytes_path, file_name), "wb") as f:
            f.write(content)


def WriteGEXF(data, gexf_path):
output_file = open(gexf_path, "x")
end_attribute = '" />\n'
Expand Down
20 changes: 10 additions & 10 deletions src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import org.apache.spark.sql.DataFrame
/** DataFrame wrapper for PySpark implementation. **/
class DataFrameLoader(sc: SparkContext) {

/** Create a DataFrame with crawl_date, url, mime_type_web_server, content and bytes. */
/** Create a DataFrame with crawl_date, url, mime_type_web_server, mime_type_tika, content, bytes, http_status_code, and archive_filename. */
def all(path: String): DataFrame = {
RecordLoader.loadArchives(path, sc)
.keepValidPages()
Expand All @@ -33,7 +33,7 @@ class DataFrameLoader(sc: SparkContext) {
/** Create a DataFrame with audio url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */
def audio(path: String): DataFrame = {
RecordLoader.loadArchives(path, sc)
.audio
.audio
}

/** Create a DataFrame with crawl date, source page, image url, and alt text. */
Expand All @@ -51,25 +51,25 @@ class DataFrameLoader(sc: SparkContext) {
/** Create a DataFrame with PDF url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */
def pdfs(path: String): DataFrame = {
RecordLoader.loadArchives(path, sc)
.pdfs
.pdfs
}

/** Create a DataFrame with presentation program url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */
/** Create a DataFrame with presentation program file url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */
def presentationProgramFiles(path: String): DataFrame = {
RecordLoader.loadArchives(path, sc)
.presentationProgramFiles
.presentationProgramFiles
}

/** Create a DataFrame with spreadsheet url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */
def spreadsheets(path: String): DataFrame = {
RecordLoader.loadArchives(path, sc)
.spreadsheets
.spreadsheets
}

/** Create a DataFrame with video url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */
def videos(path: String): DataFrame = {
RecordLoader.loadArchives(path, sc)
.videos
.videos
}

/** Create a DataFrame with crawl_date, source, destination, and anchor. */
Expand All @@ -78,15 +78,15 @@ class DataFrameLoader(sc: SparkContext) {
.webgraph()
}

/** Create a DataFrame with crawl_date, url, mime_type_web_server, and content. */
/** Create a DataFrame with crawl_date, url, mime_type_web_server, language, and content. */
def webpages(path: String): DataFrame = {
RecordLoader.loadArchives(path, sc)
.webpages()
}

/** Create a DataFrame with word processor url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */
/** Create a DataFrame with word processor file url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */
def wordProcessorFiles(path: String): DataFrame = {
RecordLoader.loadArchives(path, sc)
.wordProcessorFiles
.wordProcessorFiles
}
}

0 comments on commit 9cb3028

Please sign in to comment.