[hm] [cc] [dea] clean up after a crashed droplet if it is flapping.

To enable continuous attempts to restart an application, DEAs need to clean up after crashes. As before, a crashed droplet's data is not cleaned up until the droplet is declared flapping. This patch ensures that the DEA cleans up after flapping droplets. Additionally, it quiets logging for flapping droplets that are no longer being restarted. Change-Id: I919927bd54f7a64021d8cbd82b33361606910653
2012-04-18 20:52:23 -07:00 · 2012-04-18 20:52:23 -07:00 · e22a72545f
--- a/cloud_controller/app/models/app_manager.rb
+++ b/cloud_controller/app/models/app_manager.rb
@ -174,6 +174,8 @@ class AppManager
      end
      CloudController.logger.debug("[HealthManager] Starting #{indices.length} missing instances for app: #{app.id}")
      # FIXME - Check capacity
+
+      message[:flapping] = true if payload[:flapping]
      indices.each { |i| start_instance(message, i) }
    when /STOP/i
      # If HM detects older versions, let's clean up here versus suppressing
--- a/dea/lib/dea/agent.rb
+++ b/dea/lib/dea/agent.rb
@ -547,6 +547,7 @@ module DEA
      framework = message_json['framework']
      debug = message_json['debug']
      console = message_json['console']
+      flapping = message_json['flapping']

      # Limits processing
      mem     = DEFAULT_APP_MEM
@ -601,6 +602,7 @@ module DEA
        :start => Time.now,
        :state_timestamp => Time.now.to_i,
        :log_id => "(name=%s app_id=%s instance=%s index=%s)" % [name, droplet_id, instance_id, instance_index],
+        :flapping => flapping ? true : false
      }

      instances = @droplets[droplet_id] || {}
@ -1295,15 +1297,16 @@ module DEA
      # Drop usage and resource tracking regardless of state
      remove_instance_resources(instance)
      @usage.delete(instance[:pid]) if instance[:pid]
-      # clean up the in memory instance and directory only if the instance didn't crash
-      if instance[:state] != :CRASHED
+      # clean up the in memory instance and directory only if
+      # the instance didn't crash or when it was marked as flapping
+      if instance[:state] != :CRASHED || instance[:flapping]
        if droplet = @droplets[instance[:droplet_id]]
          droplet.delete(instance[:instance_id])
          @droplets.delete(instance[:droplet_id]) if droplet.empty?
          schedule_snapshot
        end
        unless @disable_dir_cleanup
-          @logger.debug("#{instance[:name]}: Cleaning up dir #{instance[:dir]}")
+          @logger.debug("#{instance[:name]}: Cleaning up dir #{instance[:dir]}#{instance[:flapping]?' (flapping)':''}")
          EM.system("rm -rf #{instance[:dir]}")
        end
      # Rechown crashed application directory using uid and gid of DEA
--- a/health_manager/lib/health_manager.rb
+++ b/health_manager/lib/health_manager.rb
@ -318,7 +318,7 @@ class HealthManager
          end

          if index_entry[:state] == FLAPPING && !restart_pending?(app_id, index) && now - index_entry[:last_action] > @restart_timeout
-            delay_or_giveup_restart_of_flapping_instance(app_id, index, index_entry)
+            delay_or_giveup_restart_of_flapping_instance(app_id, index, index_entry, true)
          end

          if index_entry[:state] == DOWN && now - index_entry[:last_action] > @restart_timeout
@ -527,12 +527,13 @@ class HealthManager
    droplet_entry # return the droplet that we changed. This allows the spec tests to ensure the behaviour is correct.
  end

-  def delay_or_giveup_restart_of_flapping_instance(droplet_id, index, index_entry)
+  def delay_or_giveup_restart_of_flapping_instance(droplet_id, index, index_entry, giveup_quietly = false)

    index_entry[:last_action] = now #regardless of whether real action is omitted or delayed, a decision timestamp is needed

    if @giveup_crash_number > 0 && index_entry[:crashes] > @giveup_crash_number
-      @logger.info("giving up on flapping instance (app_id=#{droplet_id}, index=#{index}). Number of crashes: #{index_entry[:crashes]}.")
+      @logger.info("given up on flapping instance (app_id=#{droplet_id}, index=#{index}). " +
+                   "Number of crashes: #{index_entry[:crashes]}.") unless giveup_quietly
    else
      @pending_restart[droplet_id] ||= {}
      @pending_restart[droplet_id][index] = true
@ -542,7 +543,7 @@ class HealthManager
      @logger.info("delayed-restarting flapping instance (app_id=#{droplet_id}, index=#{index}). Delay: #{restart_delay}. Number of crashes: #{index_entry[:crashes]}.")
      EM.add_timer(restart_delay) do
        index_entry[:last_action] = now
-        start_instances(droplet_id, [index], false)
+        start_instances(droplet_id, [index], false, true)
      end
    end
  end
@ -744,7 +745,7 @@ class HealthManager
    entry_updated
  end

-  def start_instances(droplet_id, indices, high_priority = false)
+  def start_instances(droplet_id, indices, high_priority = false, flapping = false)
    droplet_entry = @droplets[droplet_id]

    if droplet_entry.nil?
@ -760,6 +761,8 @@ class HealthManager
      :indices => indices
    }

+    start_message[:flapping] = true if flapping
+
    if queue_requests?
      queue_request(start_message, high_priority)
    else
--- a/health_manager/spec/health_manager_spec.rb
+++ b/health_manager/spec/health_manager_spec.rb
@ -134,14 +134,14 @@ describe HealthManager do
    }
  end

-  def make_restart_message
-    {
+  def make_restart_message(options = {})
+    m = {
      'droplet' => @app.id,
      'op' => 'START',
      'last_updated' => @app.last_updated.to_i,
      'version' => "#{@app.staged_package_hash}-#{@app.run_count}",
      'indices' => [0]
-    }
+    }.merge(options)
  end

  def get_live_index(droplet_entry,index)
@ -239,7 +239,7 @@ describe HealthManager do
  def ensure_flapping_delayed_restart(delay)
    in_em_with_fiber do |f|

-      should_publish_to_nats "cloudcontrollers.hm.requests", make_restart_message
+      should_publish_to_nats "cloudcontrollers.hm.requests", make_restart_message('flapping' => true)

      @hm.process_heartbeat_message(make_heartbeat_message([0], "RUNNING").to_json)
      droplet_entry = @hm.process_exited_message(make_crashed_message.to_json)