[hm] [cc] [dea] clean up after a crashed droplet if it is flapping.

In order to enable continuous attempts to restart an application, DEAs
need to clean up after crashes.

Until the droplet is declared flapping, its data is left in place after a
crash, exactly as it was before this patch.

This patch ensures cleanup of flapping droplets in the DEA.

Additionally, quiet the "given up" log message for flapping droplets that are no longer restarted.

Change-Id: I919927bd54f7a64021d8cbd82b33361606910653
This commit is contained in:
Bob Nugmanov 2012-04-18 20:52:23 -07:00
Родитель d966809f61
Коммит e22a72545f
4 изменённых файлов: 20 добавлений и 12 удалений

Просмотреть файл

@ -174,6 +174,8 @@ class AppManager
end
CloudController.logger.debug("[HealthManager] Starting #{indices.length} missing instances for app: #{app.id}")
# FIXME - Check capacity
message[:flapping] = true if payload[:flapping]
indices.each { |i| start_instance(message, i) }
when /STOP/i
# If HM detects older versions, let's clean up here versus suppressing

Просмотреть файл

@ -547,6 +547,7 @@ module DEA
framework = message_json['framework']
debug = message_json['debug']
console = message_json['console']
flapping = message_json['flapping']
# Limits processing
mem = DEFAULT_APP_MEM
@ -601,6 +602,7 @@ module DEA
:start => Time.now,
:state_timestamp => Time.now.to_i,
:log_id => "(name=%s app_id=%s instance=%s index=%s)" % [name, droplet_id, instance_id, instance_index],
:flapping => flapping ? true : false
}
instances = @droplets[droplet_id] || {}
@ -1295,15 +1297,16 @@ module DEA
# Drop usage and resource tracking regardless of state
remove_instance_resources(instance)
@usage.delete(instance[:pid]) if instance[:pid]
# clean up the in memory instance and directory only if the instance didn't crash
if instance[:state] != :CRASHED
# clean up the in memory instance and directory only if
# the instance didn't crash or when it was marked as flapping
if instance[:state] != :CRASHED || instance[:flapping]
if droplet = @droplets[instance[:droplet_id]]
droplet.delete(instance[:instance_id])
@droplets.delete(instance[:droplet_id]) if droplet.empty?
schedule_snapshot
end
unless @disable_dir_cleanup
@logger.debug("#{instance[:name]}: Cleaning up dir #{instance[:dir]}")
@logger.debug("#{instance[:name]}: Cleaning up dir #{instance[:dir]}#{instance[:flapping]?' (flapping)':''}")
EM.system("rm -rf #{instance[:dir]}")
end
# Rechown crashed application directory using uid and gid of DEA

Просмотреть файл

@ -318,7 +318,7 @@ class HealthManager
end
if index_entry[:state] == FLAPPING && !restart_pending?(app_id, index) && now - index_entry[:last_action] > @restart_timeout
delay_or_giveup_restart_of_flapping_instance(app_id, index, index_entry)
delay_or_giveup_restart_of_flapping_instance(app_id, index, index_entry, true)
end
if index_entry[:state] == DOWN && now - index_entry[:last_action] > @restart_timeout
@ -527,12 +527,13 @@ class HealthManager
droplet_entry # return the droplet that we changed. This allows the spec tests to ensure the behaviour is correct.
end
def delay_or_giveup_restart_of_flapping_instance(droplet_id, index, index_entry)
def delay_or_giveup_restart_of_flapping_instance(droplet_id, index, index_entry, giveup_quietly = false)
index_entry[:last_action] = now #regardless of whether real action is omitted or delayed, a decision timestamp is needed
if @giveup_crash_number > 0 && index_entry[:crashes] > @giveup_crash_number
@logger.info("giving up on flapping instance (app_id=#{droplet_id}, index=#{index}). Number of crashes: #{index_entry[:crashes]}.")
@logger.info("given up on flapping instance (app_id=#{droplet_id}, index=#{index}). " +
"Number of crashes: #{index_entry[:crashes]}.") unless giveup_quietly
else
@pending_restart[droplet_id] ||= {}
@pending_restart[droplet_id][index] = true
@ -542,7 +543,7 @@ class HealthManager
@logger.info("delayed-restarting flapping instance (app_id=#{droplet_id}, index=#{index}). Delay: #{restart_delay}. Number of crashes: #{index_entry[:crashes]}.")
EM.add_timer(restart_delay) do
index_entry[:last_action] = now
start_instances(droplet_id, [index], false)
start_instances(droplet_id, [index], false, true)
end
end
end
@ -744,7 +745,7 @@ class HealthManager
entry_updated
end
def start_instances(droplet_id, indices, high_priority = false)
def start_instances(droplet_id, indices, high_priority = false, flapping = false)
droplet_entry = @droplets[droplet_id]
if droplet_entry.nil?
@ -760,6 +761,8 @@ class HealthManager
:indices => indices
}
start_message[:flapping] = true if flapping
if queue_requests?
queue_request(start_message, high_priority)
else

Просмотреть файл

@ -134,14 +134,14 @@ describe HealthManager do
}
end
def make_restart_message
{
def make_restart_message(options = {})
m = {
'droplet' => @app.id,
'op' => 'START',
'last_updated' => @app.last_updated.to_i,
'version' => "#{@app.staged_package_hash}-#{@app.run_count}",
'indices' => [0]
}
}.merge(options)
end
def get_live_index(droplet_entry,index)
@ -239,7 +239,7 @@ describe HealthManager do
def ensure_flapping_delayed_restart(delay)
in_em_with_fiber do |f|
should_publish_to_nats "cloudcontrollers.hm.requests", make_restart_message
should_publish_to_nats "cloudcontrollers.hm.requests", make_restart_message('flapping' => true)
@hm.process_heartbeat_message(make_heartbeat_message([0], "RUNNING").to_json)
droplet_entry = @hm.process_exited_message(make_crashed_message.to_json)