зеркало из https://github.com/mozilla/labs-vcap.git
[hm] [cc] [dea] clean up after a crashed droplet if it is flapping.
In order to enable continuous attempts to restart an application, DEAs need to clean up after crashes. As before, a crashed droplet's data is not cleaned up until the droplet is declared flapping. This patch ensures that flapping droplets are cleaned up in the DEA. Additionally, it quiets logging for flapping droplets that are no longer being restarted. Change-Id: I919927bd54f7a64021d8cbd82b33361606910653
This commit is contained in:
Родитель
d966809f61
Коммит
e22a72545f
|
@ -174,6 +174,8 @@ class AppManager
|
|||
end
|
||||
CloudController.logger.debug("[HealthManager] Starting #{indices.length} missing instances for app: #{app.id}")
|
||||
# FIXME - Check capacity
|
||||
|
||||
message[:flapping] = true if payload[:flapping]
|
||||
indices.each { |i| start_instance(message, i) }
|
||||
when /STOP/i
|
||||
# If HM detects older versions, let's clean up here versus suppressing
|
||||
|
|
|
@ -547,6 +547,7 @@ module DEA
|
|||
framework = message_json['framework']
|
||||
debug = message_json['debug']
|
||||
console = message_json['console']
|
||||
flapping = message_json['flapping']
|
||||
|
||||
# Limits processing
|
||||
mem = DEFAULT_APP_MEM
|
||||
|
@ -601,6 +602,7 @@ module DEA
|
|||
:start => Time.now,
|
||||
:state_timestamp => Time.now.to_i,
|
||||
:log_id => "(name=%s app_id=%s instance=%s index=%s)" % [name, droplet_id, instance_id, instance_index],
|
||||
:flapping => flapping ? true : false
|
||||
}
|
||||
|
||||
instances = @droplets[droplet_id] || {}
|
||||
|
@ -1295,15 +1297,16 @@ module DEA
|
|||
# Drop usage and resource tracking regardless of state
|
||||
remove_instance_resources(instance)
|
||||
@usage.delete(instance[:pid]) if instance[:pid]
|
||||
# clean up the in memory instance and directory only if the instance didn't crash
|
||||
if instance[:state] != :CRASHED
|
||||
# clean up the in memory instance and directory only if
|
||||
# the instance didn't crash or when it was marked as flapping
|
||||
if instance[:state] != :CRASHED || instance[:flapping]
|
||||
if droplet = @droplets[instance[:droplet_id]]
|
||||
droplet.delete(instance[:instance_id])
|
||||
@droplets.delete(instance[:droplet_id]) if droplet.empty?
|
||||
schedule_snapshot
|
||||
end
|
||||
unless @disable_dir_cleanup
|
||||
@logger.debug("#{instance[:name]}: Cleaning up dir #{instance[:dir]}")
|
||||
@logger.debug("#{instance[:name]}: Cleaning up dir #{instance[:dir]}#{instance[:flapping]?' (flapping)':''}")
|
||||
EM.system("rm -rf #{instance[:dir]}")
|
||||
end
|
||||
# Rechown crashed application directory using uid and gid of DEA
|
||||
|
|
|
@ -318,7 +318,7 @@ class HealthManager
|
|||
end
|
||||
|
||||
if index_entry[:state] == FLAPPING && !restart_pending?(app_id, index) && now - index_entry[:last_action] > @restart_timeout
|
||||
delay_or_giveup_restart_of_flapping_instance(app_id, index, index_entry)
|
||||
delay_or_giveup_restart_of_flapping_instance(app_id, index, index_entry, true)
|
||||
end
|
||||
|
||||
if index_entry[:state] == DOWN && now - index_entry[:last_action] > @restart_timeout
|
||||
|
@ -527,12 +527,13 @@ class HealthManager
|
|||
droplet_entry # return the droplet that we changed. This allows the spec tests to ensure the behaviour is correct.
|
||||
end
|
||||
|
||||
def delay_or_giveup_restart_of_flapping_instance(droplet_id, index, index_entry)
|
||||
def delay_or_giveup_restart_of_flapping_instance(droplet_id, index, index_entry, giveup_quietly = false)
|
||||
|
||||
index_entry[:last_action] = now #regardless of whether real action is omitted or delayed, a decision timestamp is needed
|
||||
|
||||
if @giveup_crash_number > 0 && index_entry[:crashes] > @giveup_crash_number
|
||||
@logger.info("giving up on flapping instance (app_id=#{droplet_id}, index=#{index}). Number of crashes: #{index_entry[:crashes]}.")
|
||||
@logger.info("given up on flapping instance (app_id=#{droplet_id}, index=#{index}). " +
|
||||
"Number of crashes: #{index_entry[:crashes]}.") unless giveup_quietly
|
||||
else
|
||||
@pending_restart[droplet_id] ||= {}
|
||||
@pending_restart[droplet_id][index] = true
|
||||
|
@ -542,7 +543,7 @@ class HealthManager
|
|||
@logger.info("delayed-restarting flapping instance (app_id=#{droplet_id}, index=#{index}). Delay: #{restart_delay}. Number of crashes: #{index_entry[:crashes]}.")
|
||||
EM.add_timer(restart_delay) do
|
||||
index_entry[:last_action] = now
|
||||
start_instances(droplet_id, [index], false)
|
||||
start_instances(droplet_id, [index], false, true)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -744,7 +745,7 @@ class HealthManager
|
|||
entry_updated
|
||||
end
|
||||
|
||||
def start_instances(droplet_id, indices, high_priority = false)
|
||||
def start_instances(droplet_id, indices, high_priority = false, flapping = false)
|
||||
droplet_entry = @droplets[droplet_id]
|
||||
|
||||
if droplet_entry.nil?
|
||||
|
@ -760,6 +761,8 @@ class HealthManager
|
|||
:indices => indices
|
||||
}
|
||||
|
||||
start_message[:flapping] = true if flapping
|
||||
|
||||
if queue_requests?
|
||||
queue_request(start_message, high_priority)
|
||||
else
|
||||
|
|
|
@ -134,14 +134,14 @@ describe HealthManager do
|
|||
}
|
||||
end
|
||||
|
||||
def make_restart_message
|
||||
{
|
||||
def make_restart_message(options = {})
|
||||
m = {
|
||||
'droplet' => @app.id,
|
||||
'op' => 'START',
|
||||
'last_updated' => @app.last_updated.to_i,
|
||||
'version' => "#{@app.staged_package_hash}-#{@app.run_count}",
|
||||
'indices' => [0]
|
||||
}
|
||||
}.merge(options)
|
||||
end
|
||||
|
||||
def get_live_index(droplet_entry,index)
|
||||
|
@ -239,7 +239,7 @@ describe HealthManager do
|
|||
def ensure_flapping_delayed_restart(delay)
|
||||
in_em_with_fiber do |f|
|
||||
|
||||
should_publish_to_nats "cloudcontrollers.hm.requests", make_restart_message
|
||||
should_publish_to_nats "cloudcontrollers.hm.requests", make_restart_message('flapping' => true)
|
||||
|
||||
@hm.process_heartbeat_message(make_heartbeat_message([0], "RUNNING").to_json)
|
||||
droplet_entry = @hm.process_exited_message(make_crashed_message.to_json)
|
||||
|
|
Загрузка…
Ссылка в новой задаче