Skip to content

Commit

Permalink
Merge pull request ampoffcom#1 from despens/master
Browse files Browse the repository at this point in the history
Added support for index files.
  • Loading branch information
steffenfritz committed Dec 30, 2015
2 parents 6a614f3 + 1a86cc8 commit 5b536fc
Showing 1 changed file with 62 additions and 36 deletions.
98 changes: 62 additions & 36 deletions html2warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
html2warc creates warc files from local web resources
"""

__date__ = '2015/09/17'
__version__ = '0.6'
__date__ = '2015/12/30'
__version__ = '0.7'
__status__ = 'Testing'
__license__ = 'The MIT License (MIT)'
__copyright__ = 'Copyright (c) 2014 Steffen Fritz'
Expand All @@ -18,7 +18,14 @@
import uuid
import datetime
import mimetypes
import re

# Compiled, case-insensitive patterns that recognise a directory's index
# document. The list is ordered by preference: earlier entries win.
index_filenames = [
    # most common: index.html, index.htm, index.asp, index.php
    re.compile(r'(index)\.(html?|asp|php)', re.IGNORECASE),
    # default document name used by AOL server
    re.compile(r'welcome\.html?', re.IGNORECASE),
]

def source_to_warc(source_dir, targetwarc, createdate, rooturl):
    """
    Append a WARC response record for every file below source_dir.

    Each file is stored under the URI built from rooturl, the file's
    directory relative to source_dir, and its file name. The preferred
    index file of a directory (chosen via index_filenames) is stored a
    second time under the bare directory URI, i.e. with an empty file
    name, so that a request for the directory itself can be answered.

    One line per written record (URI, MIME type, size) is printed to
    stdout.

    :param source_dir: directory that is walked recursively
    :param targetwarc: path of the WARC file; records are appended to it
    :param createdate: value written to the WARC-Date and HTTP DATE headers
    :param rooturl: URL prefix of the archived site; a trailing slash is
        optional
    """
    # Normalise the prefix once so that top-level files and files in
    # subdirectories are joined to it in the same way.
    base_url = rooturl.rstrip('/')

    for rootdir, _, files in os.walk(source_dir):
        # Path of the current directory relative to the walked root,
        # expressed as URI segments (empty for the root itself).
        relative_dir = os.path.relpath(rootdir, source_dir)
        if relative_dir == os.curdir:
            uri_segments = []
        else:
            uri_segments = relative_dir.split(os.sep)

        # only one index per directory!
        index_file = _pick_index_file(files)

        for file_ in files:
            source_file_ = os.path.join(rootdir, file_)
            # guess_type() yields None for unknown extensions; fall back
            # to a generic type instead of writing the literal "None".
            mime_type_ = (mimetypes.guess_type(source_file_)[0]
                          or 'application/octet-stream')
            file_size_ = os.path.getsize(source_file_)

            possible_filenames_ = [file_]
            if file_ == index_file:
                possible_filenames_.append('')

            for possible_filename_ in possible_filenames_:
                source_file_uri = '/'.join(
                    [base_url] + uri_segments + [possible_filename_])

                print("{}\t[{}]\t{}b".format(source_file_uri, mime_type_, file_size_))

                _append_response_record(targetwarc, source_file_,
                                        source_file_uri, createdate,
                                        mime_type_, file_size_)


def _pick_index_file(files):
    """Return the preferred index file name among files, or None."""
    # Patterns are ordered by preference, so the first pattern that
    # matches any file decides; sorting makes ties deterministic.
    for pattern in index_filenames:
        for name in sorted(files):
            if pattern.fullmatch(name):
                return name
    return None


def _append_response_record(targetwarc, source_file, target_uri, createdate,
                            mime_type, file_size):
    """Append one complete WARC response record for source_file."""
    # network protocol information
    http_header = "\r\n".join([
        "HTTP/1.1 200 OK",
        "DATE: " + str(createdate),
        "Accept-Ranges: bytes",
        "Connection: close",
        "Content-Type: " + mime_type,
        "Content-Length: " + str(file_size),
        "",
        "",
    ]).encode("utf-8")

    # The WARC Content-Length covers the whole record block, i.e. the
    # HTTP header plus the payload; measure the header instead of
    # relying on a hard-coded estimate.
    block_length = len(http_header) + file_size

    warc_header = "\r\n".join([
        "WARC/1.0",
        "WARC-Type: response",
        "WARC-Target-URI: " + target_uri,
        "WARC-Record-ID: <urn:uuid:" + str(uuid.uuid4()) + ">",
        "WARC-Date: " + str(createdate),
        "Content-Type: application/http;msgtype=response",
        "WARC-Identified-Payload-Type: " + mime_type,
        "Content-Length: " + str(block_length),
        "",
        "",
    ]).encode("utf-8")

    # Binary mode keeps the CRLF line endings exact on every platform,
    # and a single handle avoids reopening the target for each chunk.
    with open(targetwarc, "ab") as fw:
        fw.write(warc_header)
        fw.write(http_header)
        with open(source_file, "rb") as fd:
            for chunk in iter(lambda: fd.read(65536), b""):
                fw.write(chunk)
        # a record is terminated by two CRLF pairs
        fw.write(b"\r\n\r\n")


def write_init_record(targetwarc, createdate):
Expand Down

0 comments on commit 5b536fc

Please sign in to comment.