Skip to content

Commit 432ca1d

Browse files
committed
Added a new --accept-regex option so you don't have to download the entire site when looking for a specific path.
1 parent ed7948a commit 432ca1d

File tree

3 files changed

+41
-4
lines changed

3 files changed

+41
-4
lines changed

bin/wayback_machine_downloader

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
require_relative '../lib/wayback_machine_downloader'
44
require 'optparse'
5+
require 'pp'
56

67
options = {}
78
option_parser = OptionParser.new do |opts|
@@ -11,19 +12,26 @@ option_parser = OptionParser.new do |opts|
1112
opts.separator "Download any website from the Wayback Machine."
1213

1314
opts.separator ""
14-
opts.separator "Optional option:"
15+
opts.separator "Optional options:"
1516

1617
opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20150806225358)") do |t|
1718
options[:timestamp] = t
1819
end
1920

21+
opts.on("--accept-regex [ACCEPT_REGEX]", String,"Specify a regular expression to download. If a path doesn't meet this regex, it won't get downloaded.") do |accept_regex|
22+
options[:accept_regex] = accept_regex
23+
end
24+
2025
opts.on("-v", "--version", "Display version") do |t|
2126
options[:version] = t
2227
end
2328
end.parse!
2429

25-
if base_url = ARGV[0]
26-
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
30+
# This index used to be 0; we now read the /last/ remaining argument as the base URL.
31+
#
32+
# TODO: handle this argument more robustly; the argument handling is somewhat messy.
33+
if base_url = ARGV[-1]
34+
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], accept_regex: options[:accept_regex]
2735
wayback_machine_downloader.download_files
2836
elsif options[:version]
2937
puts WaybackMachineDownloader::VERSION

lib/wayback_machine_downloader.rb

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ class WaybackMachineDownloader
1313
# Builds a downloader from an options hash.
#
# params - Hash; the keys read here are:
#          :base_url     - stored unchanged in @base_url.
#          :timestamp    - coerced with #to_i (so nil becomes 0).
#          :accept_regex - optional pattern source. When it is absent,
#                          @accept_regex stays nil so that a later
#                          `@accept_regex.nil?` check means "no filter".
def initialize params
  @base_url = params[:base_url]
  @timestamp = params[:timestamp].to_i
  # Interpolating nil would produce the empty regex //, which is never nil;
  # only compile a pattern when one was actually supplied.
  pattern = params[:accept_regex]
  @accept_regex = pattern.nil? ? nil : /#{pattern}/
end
1718

1819
def backup_name
@@ -48,7 +49,21 @@ def get_file_list_curated
4849
end
4950
end
5051
end
51-
file_list_curated
52+
53+
# if accept_regex not defined, just return the file_list_curated
54+
if @accept_regex.nil?
55+
return file_list_curated
56+
end
57+
58+
59+
# accept_regex defined. now we need to create a filtered list.
60+
filtered_file_list_curated = Hash.new
61+
file_list_curated.each do |file_id, fileinfo|
62+
if fileinfo[:file_url].match @accept_regex
63+
filtered_file_list_curated[file_id] = fileinfo
64+
end
65+
end
66+
return filtered_file_list_curated
5267
end
5368

5469
def get_file_list_by_timestamp
@@ -64,6 +79,10 @@ def download_files
6479
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
6580
puts
6681
file_list_by_timestamp = get_file_list_by_timestamp
82+
if file_list_by_timestamp.count == 0
83+
puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@accept_regex.to_s}\")\n\t* Site is not in wayback machine."
84+
return
85+
end
6786
count = 0
6887
file_list_by_timestamp.each do |file_remote_info|
6988
count += 1

test/test_wayback_machine_downloader.rb

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,16 @@ def test_file_list_by_timestamp
3030
assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-1]
3131
end
3232

33+
# Expects the curated file list to be empty when filtering with 'abc123'.
def test_file_list_notthere_regex
  downloader = WaybackMachineDownloader.new(
    base_url: 'http://www.onlyfreegames.net',
    accept_regex: 'abc123'
  )
  curated = downloader.get_file_list_curated
  assert_equal 0, curated.length
end
37+
38+
# Expects exactly one curated entry when filtering with 'menu.html$'.
def test_file_list_singleresult_regex
  downloader = WaybackMachineDownloader.new(
    base_url: 'http://www.onlyfreegames.net',
    accept_regex: 'menu.html$'
  )
  curated = downloader.get_file_list_curated
  assert_equal 1, curated.length
end
42+
3343
def test_file_download
3444
@wayback_machine_downloader.download_files
3545
linux_page = open 'websites/www.onlyfreegames.net/linux.htm'

0 commit comments

Comments
 (0)