diff --git a/html2warc.py b/html2warc.py index 905c33d..3c3fca7 100755 --- a/html2warc.py +++ b/html2warc.py @@ -3,8 +3,8 @@ html2warc creates warc files from local web resources """ -__date__ = '2015/09/17' -__version__ = '0.6' +__date__ = '2015/12/30' +__version__ = '0.7' __status__ = 'Testing' __license__ = 'The MIT License (MIT)' __copyright__ = 'Copyright (c) 2014 Steffen Fritz' @@ -18,7 +18,14 @@ import uuid import datetime import mimetypes +import re +# possible names for an index file, listed as compiled regexp, +# in order of preference +index_filenames = [ + re.compile(r'(index)\.(html?|asp|php)', re.I), # most common + re.compile(r'welcome\.html?', re.I) # AOL server +] def source_to_warc(source_dir, targetwarc, createdate, rooturl): """ @@ -30,45 +37,64 @@ def source_to_warc(source_dir, targetwarc, createdate, rooturl): """ for rootdir, _, files in os.walk(source_dir): for file_ in files: + + possible_filenames_ = [file_] + + for index_filename_ in index_filenames: + if index_filename_.fullmatch(file_): + possible_filenames_.append('') + continue # only one index per directory! + source_file_ = os.path.join(rootdir, file_) mime_type_ = mimetypes.guess_type(source_file_) file_size_ = os.path.getsize(source_file_) block_length = 110 # init with len of network header - with open(targetwarc, "a", newline="\r\n") as fw: - fw.write("WARC/1.0\n") - fw.write("WARC-Type: response\n") - - if rootdir == source_dir: - fw.write("WARC-Target-URI: " + rooturl + file_ + "\n") - else: - source_file_uri = source_file_.split("/", 1)[1] - fw.write("WARC-Target-URI: " + rooturl + source_file_uri + "\n") - - fw.write("WARC-Record-ID: \n") - fw.write("WARC-Date: " + str(createdate) + "\n") - fw.write("Content-Type: " + "application/http;msgtype=response" + "\n") - fw.write("WARC-Identified-Payload-Type: " + str(mime_type_[0]) + "\n") - - block_length = block_length + file_size_ + len(str(mime_type_[0])) + len(str(createdate)) - fw.write("Content-Length: " + str(block_length) + "\n") - fw.write("\n") - - # network protocol information - fw.write("HTTP/1.1 200 OK\n") - fw.write("DATE: " + str(createdate) + "\n") - fw.write("Accept-Ranges: bytes" + "\n") - fw.write("Connection: close" + "\n") - fw.write("Content-Type: " + str(mime_type_[0]) + "\n") - fw.write("Content-Length: " + str(file_size_) + "\n") - fw.write("\n") - - with open(source_file_, "rb") as fd: - for line_ in fd: - with open(targetwarc, "ab") as fw: - fw.write(line_) - fw = open(targetwarc, "a") - fw.write("\r\n\r\n") + rootdir_parts = rootdir.split('/') + source_dir_parts = source_dir.split('/') + source_file_uri_parts = [] + path_step = 0 + for segment in rootdir_parts: + if len(source_dir_parts) < path_step+1 or segment != source_dir_parts[path_step]: + source_file_uri_parts.append(segment) + path_step = path_step + 1 + + for possible_filename_ in possible_filenames_: + + source_file_uri = rooturl + '/'.join(source_file_uri_parts)+ '/' + possible_filename_ + + print("{}\t[{}]\t{}b".format(source_file_uri, mime_type_[0], file_size_)) + + with open(targetwarc, "a", newline="\r\n") as fw: + fw.write("WARC/1.0\n") + fw.write("WARC-Type: response\n") + + fw.write("WARC-Target-URI: " + source_file_uri + "\n") + + fw.write("WARC-Record-ID: \n") + fw.write("WARC-Date: " + str(createdate) + "\n") + fw.write("Content-Type: " + "application/http;msgtype=response" + "\n") + fw.write("WARC-Identified-Payload-Type: " + str(mime_type_[0]) + "\n") + + block_length = block_length + file_size_ + len(str(mime_type_[0])) + len(str(createdate)) + fw.write("Content-Length: " + str(block_length) + "\n") + fw.write("\n") + + # network protocol information + fw.write("HTTP/1.1 200 OK\n") + fw.write("DATE: " + str(createdate) + "\n") + fw.write("Accept-Ranges: bytes" + "\n") + fw.write("Connection: close" + "\n") + fw.write("Content-Type: " + str(mime_type_[0]) + "\n") + fw.write("Content-Length: " + str(file_size_) + "\n") + fw.write("\n") + + with open(source_file_, "rb") as fd: + for line_ in fd: + with open(targetwarc, "ab") as fw: + fw.write(line_) + fw = open(targetwarc, "a") + fw.write("\r\n\r\n") def write_init_record(targetwarc, createdate):