diff --git a/html2warc.py b/html2warc.py
index 905c33d..3c3fca7 100755
--- a/html2warc.py
+++ b/html2warc.py
@@ -3,8 +3,8 @@
html2warc creates warc files from local web resources
"""
-__date__ = '2015/09/17'
-__version__ = '0.6'
+__date__ = '2015/12/30'
+__version__ = '0.7'
__status__ = 'Testing'
__license__ = 'The MIT License (MIT)'
__copyright__ = 'Copyright (c) 2014 Steffen Fritz'
@@ -18,7 +18,14 @@
import uuid
import datetime
import mimetypes
+import re
+# Compiled patterns recognising a directory index file name, listed in
+# order of preference; both ignore case (re.I).
+index_filenames = [
+ re.compile(r'(index)\.(html?|asp|php)', re.I), # most common
+ re.compile(r'welcome\.html?', re.I) # AOL server
+]
def source_to_warc(source_dir, targetwarc, createdate, rooturl):
"""
@@ -30,45 +37,64 @@ def source_to_warc(source_dir, targetwarc, createdate, rooturl):
"""
for rootdir, _, files in os.walk(source_dir):
for file_ in files:
+            # Names to archive this file under: always its own name, plus ''
+            # (the bare directory URI) when it is a directory index file.
+            possible_filenames_ = [file_]
+            for index_filename_ in index_filenames:
+                if index_filename_.fullmatch(file_):
+                    possible_filenames_.append('')
+                    break  # first match wins; never add the alias twice
+
source_file_ = os.path.join(rootdir, file_)
mime_type_ = mimetypes.guess_type(source_file_)
file_size_ = os.path.getsize(source_file_)
block_length = 110 # init with len of network header
- with open(targetwarc, "a", newline="\r\n") as fw:
- fw.write("WARC/1.0\n")
- fw.write("WARC-Type: response\n")
-
- if rootdir == source_dir:
- fw.write("WARC-Target-URI: " + rooturl + file_ + "\n")
- else:
- source_file_uri = source_file_.split("/", 1)[1]
- fw.write("WARC-Target-URI: " + rooturl + source_file_uri + "\n")
-
- fw.write("WARC-Record-ID: \n")
- fw.write("WARC-Date: " + str(createdate) + "\n")
- fw.write("Content-Type: " + "application/http;msgtype=response" + "\n")
- fw.write("WARC-Identified-Payload-Type: " + str(mime_type_[0]) + "\n")
-
- block_length = block_length + file_size_ + len(str(mime_type_[0])) + len(str(createdate))
- fw.write("Content-Length: " + str(block_length) + "\n")
- fw.write("\n")
-
- # network protocol information
- fw.write("HTTP/1.1 200 OK\n")
- fw.write("DATE: " + str(createdate) + "\n")
- fw.write("Accept-Ranges: bytes" + "\n")
- fw.write("Connection: close" + "\n")
- fw.write("Content-Type: " + str(mime_type_[0]) + "\n")
- fw.write("Content-Length: " + str(file_size_) + "\n")
- fw.write("\n")
-
- with open(source_file_, "rb") as fd:
- for line_ in fd:
- with open(targetwarc, "ab") as fw:
- fw.write(line_)
- fw = open(targetwarc, "a")
- fw.write("\r\n\r\n")
+            # Directory path relative to source_dir: keep only the segments of
+            # rootdir that do not mirror source_dir at the same position.
+            rootdir_parts = rootdir.split('/')
+            source_dir_parts = source_dir.split('/')
+            source_file_uri_parts = []
+            for path_step, segment in enumerate(rootdir_parts):
+                if path_step >= len(source_dir_parts) or segment != source_dir_parts[path_step]:
+                    source_file_uri_parts.append(segment)
+
+            for possible_filename_ in possible_filenames_:
+                # Join segments and name with single slashes; prepending '/' to an
+                # empty directory part would yield rooturl + '/' + name at top level.
+                source_file_uri = rooturl + '/'.join(source_file_uri_parts + [possible_filename_])
+                print("{}\t[{}]\t{}b".format(source_file_uri, mime_type_[0], file_size_))
+
+                with open(targetwarc, "a", newline="\r\n") as fw:
+                    fw.write("WARC/1.0\n")
+                    fw.write("WARC-Type: response\n")
+                    fw.write("WARC-Target-URI: " + source_file_uri + "\n")
+                    fw.write("WARC-Record-ID: \n")
+                    fw.write("WARC-Date: " + str(createdate) + "\n")
+                    fw.write("Content-Type: " + "application/http;msgtype=response" + "\n")
+                    fw.write("WARC-Identified-Payload-Type: " + str(mime_type_[0]) + "\n")
+
+                    # Computed afresh per record: accumulating into block_length
+                    # would inflate Content-Length of the index alias record.
+                    record_length = block_length + file_size_ + len(str(mime_type_[0])) + len(str(createdate))
+                    fw.write("Content-Length: " + str(record_length) + "\n")
+                    fw.write("\n")
+
+                    # network protocol information
+                    fw.write("HTTP/1.1 200 OK\n")
+                    fw.write("DATE: " + str(createdate) + "\n")
+                    fw.write("Accept-Ranges: bytes" + "\n")
+                    fw.write("Connection: close" + "\n")
+                    fw.write("Content-Type: " + str(mime_type_[0]) + "\n")
+                    fw.write("Content-Length: " + str(file_size_) + "\n")
+                    fw.write("\n")
+
+                # Append the payload and the record terminator through a single
+                # binary handle that is closed on exit; CRLFs are written verbatim.
+                with open(source_file_, "rb") as fd, open(targetwarc, "ab") as fw:
+                    for line_ in fd:
+                        fw.write(line_)
+                    fw.write(b"\r\n\r\n")
def write_init_record(targetwarc, createdate):