Skip to content

Commit 363c019

Browse files
committed
Back off when actions fail
1 parent 6917998 commit 363c019

File tree

3 files changed

+186
-104
lines changed

3 files changed

+186
-104
lines changed

lib/jecloud.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
require 'jecloud/application'
1313
require 'jecloud/cli'
14+
require 'jecloud/session'
1415

1516
module JeCloud
1617
VERSION = File.read(File.join(File.dirname(__FILE__), '..', 'VERSION'))

lib/jecloud/application.rb

Lines changed: 130 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
module JeCloud
22
class Application
33

4+
MAX_REPEAT_DELAY = 10
5+
6+
# Raised by an action when the awaited condition simply has not happened yet
# (for example, no IP address assigned so far) and the action should be retried later.
#
# Derives from StandardError (not Exception directly) so that it follows the
# usual Ruby convention for application errors; handlers that rescue
# Exception still catch it.
class ExpectedDelay < StandardError
end
8+
9+
# Raised by an action when something outside this program misbehaves
# (for example, sudo not working on the server, or a failed build/installation).
#
# Derives from StandardError (not Exception directly) so that it follows the
# usual Ruby convention for application errors; handlers that rescue
# Exception still catch it.
class UnexpectedExternalProblem < StandardError
end
11+
412
attr_reader :app_name
513

614
def initialize config_dir
@@ -89,134 +97,146 @@ def deploy! git_ref
8997
server = @config.servers.first || add_server!
9098
server.deployment = { 'version' => rev }
9199

92-
roll_forward
100+
roll_forward!
93101
end
94102

95103
# Drives the roll-forward process to completion, backing off when actions fail.
#
# Each pass runs roll_forward_step! (inside update_config, with a fresh
# Session built from @config.failures) until a step reports that there is
# nothing more to do right now. If the last session recorded a time for the
# next attempt, the method either:
# * sleeps until that time and starts another pass, when the wait is at most
#   MAX_REPEAT_DELAY seconds, or
# * logs the delay and returns, when the wait would be longer.
# With no pending attempt it simply returns.
#
# An explicit loop is used for the "start over" behaviour: `retry` is only
# valid inside a rescue clause on Ruby 1.9+, so it cannot be used to restart
# a block.
def roll_forward!
  loop do
    next_attempt = nil
    keep_going = true

    while keep_going
      update_config do
        session = Session.new(@config.failures)
        keep_going = roll_forward_step!(session)
        next_attempt = session.next_attempt
      end
    end

    # Nothing is waiting on a back-off timer: we are done.
    return unless next_attempt

    delay = [0, next_attempt - Time.now.to_i].max
    if delay > MAX_REPEAT_DELAY
      $log.info "Next attempt delay is #{delay} sec (which is > #{MAX_REPEAT_DELAY} sec limit), quitting"
      return
    end

    $log.info "Sleeping for #{delay} sec"
    sleep delay
  end
end
102126

103-
def roll_forward_step!
127+
def roll_forward_step! session
104128
$log.debug "Roll forward running"
105129
@config.servers.each do |server|
106-
unless server.instance_id?
107-
instance_type = @cloud_config.ec2_instance_type
108-
die!("ec2_instance_type not specified") if instance_type.nil?
130+
catch :failed do
131+
session.action "#{server.uuid}-initial-setup", :unless => server.instance_id? do
132+
instance_type = @cloud_config.ec2_instance_type
133+
die!("ec2_instance_type not specified") if instance_type.nil?
109134

110-
ami = @cloud_config.ec2_ami
111-
die!("ec2_ami not specified") if ami.nil?
135+
ami = @cloud_config.ec2_ami
136+
die!("ec2_ami not specified") if ami.nil?
112137

113-
make_key!
138+
make_key!
114139

115-
$log.debug "Starting an instance of type #{instance_type} with AMI #{ami}"
116-
r = @ec2.run_instances :image_id => ami, :key_name => @ec2_ssh_key_name, :instance_type => instance_type
117-
puts r.to_yaml
118-
instance_res = ((r.instancesSet || {}).item || [])[0]
140+
$log.debug "Starting an instance of type #{instance_type} with AMI #{ami}"
141+
r = @ec2.run_instances :image_id => ami, :key_name => @ec2_ssh_key_name, :instance_type => instance_type
142+
puts r.to_yaml
143+
instance_res = ((r.instancesSet || {}).item || [])[0]
119144

120-
server.instance_id = instance_res.instanceId
121-
server.public_ip = instance_res.ipAddress
122-
$log.info "Successfully started an instance with ID #{server.instance_id}"
123-
return true
124-
end
125-
unless server.public_ip?
126-
r = @ec2.describe_instances
127-
puts r.to_yaml
128-
instance = (r.reservationSet.item.collect { |i| i.instancesSet.item } || []).flatten.find { |i| i.instanceId == server.instance_id }
129-
unless (instance.ipAddress || '').empty?
130-
$log.info "Instance with ID #{server.instance_id} has been assigned IP #{server.public_ip}"
131-
server.public_ip = instance.ipAddress
132-
return true
145+
server.instance_id = instance_res.instanceId
146+
server.public_ip = instance_res.ipAddress
147+
$log.info "Successfully started an instance with ID #{server.instance_id}"
133148
end
134-
next
135-
end
136-
137-
jecloud_installed = false
138-
$log.debug "Connecting via SSH to ec2-user@#{server.public_ip}"
139-
begin
140-
Net::SSH.start(server.public_ip, 'ec2-user', :keys => [@ec2_ssh_key_file]) do |ssh|
141-
$log.debug "SSH connected ok!"
142-
143-
# ch.request_pty do |ch, success|
144-
# raise "could not start a pseudo-tty" unless success
145-
#
146-
# # full EC2 environment
147-
# ###ch.env 'key', 'value'
148-
# ###...
149-
#
150-
# ch.exec 'sudo echo Hello 1337' do |ch, success|
151-
# raise "could not exec against a pseudo-tty" unless success
152-
# end
153-
# end
154-
155-
x = ssh.sudo!("echo ok").strip
156-
if x == 'ok'
157-
$log.debug "sudo test ok"
158-
else
159-
$log.error "sudo does not work on #{server.public_ip}"
160-
next
149+
session.action "#{server.uuid}-obtain-ip", :unless => server.public_ip? do
150+
r = @ec2.describe_instances
151+
puts r.to_yaml
152+
instance = (r.reservationSet.item.collect { |i| i.instancesSet.item } || []).flatten.find { |i| i.instanceId == server.instance_id }
153+
unless (instance.ipAddress || '').empty?
154+
$log.info "Instance with ID #{server.instance_id} has been assigned IP #{server.public_ip}"
155+
server.public_ip = instance.ipAddress
161156
end
157+
raise ExpectedDelay, "No IP address assigned yet"
158+
end
162159

163-
jecloud_version = ssh.exec!("jecloud print-version || echo 'NONE'").strip
164-
$log.debug "JeCloud version on the server: #{jecloud_version}"
165-
if jecloud_version =~ /^\d+\.\d+(?:\.\d+(?:\.\d+)?)?$/
166-
if jecloud_version.pad_numbers >= JeCloud::VERSION.pad_numbers
167-
$log.debug "JeCloud installed on the server is good enough."
168-
jecloud_installed = true
160+
jecloud_installed = false
161+
$log.debug "Connecting via SSH to ec2-user@#{server.public_ip}"
162+
session.action "#{server.uuid}-ssh" do
163+
Net::SSH.start(server.public_ip, 'ec2-user', :keys => [@ec2_ssh_key_file]) do |ssh|
164+
$log.debug "SSH connected ok!"
165+
166+
# ch.request_pty do |ch, success|
167+
# raise "could not start a pseudo-tty" unless success
168+
#
169+
# # full EC2 environment
170+
# ###ch.env 'key', 'value'
171+
# ###...
172+
#
173+
# ch.exec 'sudo echo Hello 1337' do |ch, success|
174+
# raise "could not exec against a pseudo-tty" unless success
175+
# end
176+
# end
177+
178+
session.action "#{server.uuid}-sudo-test" do
179+
x = ssh.sudo!("echo ok").strip
180+
if x == 'ok'
181+
$log.debug "sudo test ok"
182+
else
183+
raise UnexpectedExternalProblem, "sudo does not work on #{server.public_ip}"
184+
end
169185
end
170-
end
171186

172-
unless jecloud_installed
173-
yum_packages = %w/gcc gcc-c++ openssl openssl-devel ruby-devel rubygems git/
187+
jecloud_version = ssh.exec!("jecloud print-version || echo 'NONE'").strip
188+
$log.debug "JeCloud version on the server: #{jecloud_version}"
189+
if jecloud_version =~ /^\d+\.\d+(?:\.\d+(?:\.\d+)?)?$/
190+
if jecloud_version.pad_numbers >= JeCloud::VERSION.pad_numbers
191+
$log.debug "JeCloud installed on the server is good enough."
192+
jecloud_installed = true
193+
end
194+
end
174195

175-
$log.debug "Installing yum packages: #{yum_packages.join(' ')}"
176-
ssh.sudo_print!("yum install -y #{yum_packages.join(' ')}")
177-
$log.info "Installed yum packages: #{yum_packages.join(' ')}"
196+
session.action "#{server.uuid}-install-jecloud", :unless => jecloud_installed do
197+
yum_packages = %w/gcc gcc-c++ openssl openssl-devel ruby-devel rubygems git/
178198

179-
sftp = Net::SFTP::Session.new(ssh)
180-
sftp.loop { sftp.opening? }
199+
$log.debug "Installing yum packages: #{yum_packages.join(' ')}"
200+
ssh.sudo_print!("yum install -y #{yum_packages.join(' ')}")
201+
$log.info "Installed yum packages: #{yum_packages.join(' ')}"
181202

182-
$log.debug "Rebuilding JeCloud locally"
183-
puts `rake build`
203+
sftp = Net::SFTP::Session.new(ssh)
204+
sftp.loop { sftp.opening? }
184205

185-
remote_path = "/tmp/#{File.basename(GEM_FILE)}"
186-
$log.debug "Uploading JeCloud gem into #{server.public_ip}:#{remote_path}"
187-
sftp.file.open(remote_path, 'w') do |of|
188-
of.write(File.read(GEM_FILE))
189-
end
190-
sftp.loop
206+
$log.debug "Rebuilding JeCloud locally"
207+
puts `rake build`
208+
raise UnexpectedExternalProblem, "JeCloud build failed" unless $?.success?
191209

192-
$log.debug "Uninstalling old JeCloud version if any"
193-
ssh.sudo_print!("gem uninstall --executables jecloud")
210+
remote_path = "/tmp/#{File.basename(GEM_FILE)}"
211+
$log.debug "Uploading JeCloud gem into #{server.public_ip}:#{remote_path}"
212+
sftp.file.open(remote_path, 'w') do |of|
213+
of.write(File.read(GEM_FILE))
214+
end
215+
sftp.loop
194216

195-
$log.debug "Installing JeCloud gem"
196-
ssh.sudo_print!("gem install --no-rdoc --no-ri #{remote_path}")
217+
$log.debug "Uninstalling old JeCloud version if any"
218+
ssh.sudo_print!("gem uninstall --executables jecloud")
197219

198-
jecloud_version = ssh.exec!("jecloud print-version || echo 'NONE'").strip
199-
if jecloud_version == JeCloud::VERSION
200-
$log.info "Installed JeCloud on #{server.public_ip}"
201-
return true
202-
else
203-
puts jecloud_version
204-
$log.error "Installation of JeCloud failed on #{server.public_ip}"
205-
# TODO: back off next time
206-
next
220+
$log.debug "Installing JeCloud gem"
221+
ssh.sudo_print!("gem install --no-rdoc --no-ri #{remote_path}")
222+
223+
jecloud_version = ssh.exec!("jecloud print-version || echo 'NONE'").strip
224+
if jecloud_version == JeCloud::VERSION
225+
$log.info "Installed JeCloud on #{server.public_ip}"
226+
else
227+
puts jecloud_version
228+
raise UnexpectedExternalProblem, "Installation of JeCloud failed on #{server.public_ip}"
229+
end
207230
end
208-
end
209231

210-
# deployment requested?
211-
if server.deployment?
212-
# pretend that it succeeded
213-
server.deployment = nil
214-
return true
232+
# deployment requested?
233+
if server.deployment?
234+
# pretend that it succeeded
235+
server.deployment = nil
236+
return true
237+
end
215238
end
216239
end
217-
rescue Errno::ECONNREFUSED => e
218-
puts "Server not ready yet."
219-
next
220240
end
221241
end
222242
return false
@@ -233,17 +253,23 @@ def add_server!
233253
end
234254

235255
# Loads the persisted state ('state.json' in the bucket named by
# @config_bucket_name) as a Hashie::Mash and normalises it.
#
# When the object does not exist yet (AWS::S3::NoSuchKey) an empty Mash is
# used instead. In both cases the result is guaranteed to have a `servers`
# list, every server gets a `uuid` (generated with the external `uuidgen`
# command when missing), and `failures!` is invoked on the config
# (presumably Mash's bang accessor, which initialises the `failures` key
# when absent - confirm against the Hashie version in use).
#
# Returns the normalised Hashie::Mash.
def read_config
  config =
    begin
      Hashie::Mash.new(YAML.load(AWS::S3::S3Object.value('state.json', @config_bucket_name)))
    rescue AWS::S3::NoSuchKey
      Hashie::Mash.new
    end

  config.servers ||= []
  config.servers.each { |server| server.uuid ||= `uuidgen`.strip }
  config.failures!
  config
end
242268

243269
# Runs the given block, then persists the current @config to 'state.json' in
# the bucket named by @config_bucket_name and prints the saved state as YAML.
#
# Returns whatever the block returned. If the block raises, nothing is stored.
def update_config
  outcome = yield
  serialized_state = YAML.dump(@config.to_hash)
  AWS::S3::S3Object.store('state.json', serialized_state, @config_bucket_name)
  puts @config.to_hash.to_yaml
  outcome
end
249275

lib/jecloud/session.rb

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
module JeCloud
  # Runs named actions and applies a growing back-off to the ones that fail.
  #
  # A Session wraps a hash-like `failures` store (action name => failure
  # record). A failure record uses the string keys 'first', 'last', 'count',
  # 'message' and 'delay'; timestamps are integer seconds since the epoch.
  #
  # Callers are expected to invoke #action inside `catch :failed do ... end`:
  # a skipped-because-backing-off or failed action aborts the enclosing
  # sequence with `throw :failed`.
  class Session

    # Earliest time (integer seconds since the epoch) at which any action seen
    # by this session may be attempted again, or nil if nothing is pending.
    attr_reader :next_attempt

    # Successive back-off delays, in seconds; the last one is reused once reached.
    BACKOFF_DELAYS = [1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025].freeze

    # failures - hash-like store of failure records, mutated in place.
    def initialize failures
      @failures = failures
      @next_attempt = nil
    end

    # Runs the given block as the action called `name`, unless it is not
    # needed or is still backing off after an earlier failure.
    #
    # name    - String key identifying the action in the failures store.
    # options - :if (default true) and :unless (default false); the action is
    #           only needed when :if is truthy and :unless is falsy.
    #
    # Returns nil when the action is not needed; otherwise returns normally
    # only if the block succeeded (its failure record, if any, is then gone).
    # Throws :failed when the action is still backing off or when the block
    # raised. SystemExit, signals/interrupts and NoMemoryError are not treated
    # as action failures and are re-raised.
    def action name, options={}
      options = { :if => true, :unless => false }.merge(options)
      if !options[:if] || options[:unless]
        $log.debug "Not needed: #{name}"
        return
      end

      # String keys are used throughout: method-style access on a Mash would
      # resolve names such as `first` or `count` to Enumerable methods.
      failure = @failures.delete(name) || {}
      now = Time.now.to_i
      retry_at = failure['last'] && failure['last'] + failure['delay']
      if retry_at && retry_at > now
        $log.debug "Skipping action #{name} for #{retry_at - now} more seconds"
        @failures[name] = failure
        schedule retry_at
        throw :failed
      end

      $log.debug "Starting: #{name}"
      begin
        yield
        $log.info "Succeeded: #{name}"
      rescue SystemExit, SignalException, NoMemoryError
        # Process-level conditions are not action failures: keep any earlier
        # failure history intact and let them propagate.
        @failures[name] = failure unless failure.empty?
        raise
      rescue Exception => e
        # Deliberately broad: the application's own error classes may derive
        # from Exception directly.
        message = "#{e.class.name}: #{e.message}"
        failure['first'] ||= now
        failure['last'] = now
        failure['count'] = (failure['count'] || 0) + 1
        failure['message'] = message
        previous_delay = failure['delay'] || 0
        failure['delay'] = BACKOFF_DELAYS.find { |delay| delay > previous_delay } || BACKOFF_DELAYS.last
        @failures[name] = failure

        schedule failure['last'] + failure['delay']

        $log.error "Action #{name} failed with #{message}, will retry in #{failure['delay']} seconds"
        $stderr.puts e.backtrace
        throw :failed
      end
    end

    private

    # Remembers `time` as the next attempt if it is earlier than the one
    # already recorded (or if none is recorded yet).
    def schedule time
      @next_attempt = [@next_attempt || time, time].min
    end

  end
end

0 commit comments

Comments
 (0)