Android: improves test sharding reliability.

Originally, test sharding for Android was written with performance in mind (i.e., to scale up the test speed per device). Now that we're on the main waterfall, we need to improve reliability, as devices may randomly drop offline during tests. This patch catches exceptions in key places and retries if there are enough devices available. BUG=153718 TEST=run android tests, randomly unplugging devices Review URL: https://chromiumcodereview.appspot.com/11275078 git-svn-id: http://src.chromium.org/svn/trunk/src/build@165918 4ff67af0-8c30-449e-8e8b-ad334ec8d88c
2012-11-05 11:49:15 +00:00 · 2012-11-05 11:49:15 +00:00 · 13689003bf
--- a/android/pylib/base_test_sharder.py
+++ b/android/pylib/base_test_sharder.py
@ -7,6 +7,7 @@ import android_commands
 import logging
 import multiprocessing

+from android_commands import errors
 from test_result import TestResults


@ -43,7 +44,9 @@ class BaseTestSharder(object):

  def __init__(self, attached_devices):
    self.attached_devices = attached_devices
-    self.retries = 1
+    # Worst case scenario: a device will drop offline per run, so we need
+    # to retry until we're out of devices.
+    self.retries = len(self.attached_devices)
    self.tests = []

  def CreateShardedTestRunner(self, device, index):
@ -83,12 +86,20 @@ class BaseTestSharder(object):
      logging.warning('Try %d of %d', retry + 1, self.retries)
      self.SetupSharding(self.tests)
      test_runners = []
-      for index, device in enumerate(self.attached_devices):
-        logging.warning('*' * 80)
-        logging.warning('Creating shard %d for %s', index, device)
-        logging.warning('*' * 80)
-        test_runner = self.CreateShardedTestRunner(device, index)
-        test_runners += [test_runner]
+
+      # Try to create N shards, retrying on failure.
+      try:
+        for index, device in enumerate(self.attached_devices):
+          logging.warning('*' * 80)
+          logging.warning('Creating shard %d for %s', index, device)
+          logging.warning('*' * 80)
+          test_runner = self.CreateShardedTestRunner(device, index)
+          test_runners += [test_runner]
+      except errors.DeviceUnresponsiveError as e:
+        logging.critical('****Failed to create a shard: [%s]', e)
+        self.attached_devices.remove(device)
+        continue
+
      logging.warning('Starting...')
      pool = multiprocessing.Pool(len(self.attached_devices),
                                  SetTestsContainer,
@ -96,8 +107,12 @@ class BaseTestSharder(object):
      # map can't handle KeyboardInterrupt exception. It's a python bug.
      # So use map_async instead.
      async_results = pool.map_async(_ShardedTestRunnable, test_runners)
-      results_lists = async_results.get(999999)
-
+      try:
+        results_lists = async_results.get(999999)
+      except errors.DeviceUnresponsiveError as e:
+        logging.critical('****Failed to run test: [%s]', e)
+        self.attached_devices = android_commands.GetAttachedDevices()
+        continue
      test_results = TestResults.FromTestResults(results_lists)
      # Re-check the attached devices for some devices may
      # become offline
@ -119,5 +134,9 @@ class BaseTestSharder(object):
          self.tests += [t.name]
        if not self.tests:
          break
+    else:
+      # We ran out of retries, possibly out of healthy devices.
+      # There's no recovery at this point.
+      raise Exception('Unrecoverable error while retrying test runs.')
    self.OnTestsCompleted(test_runners, final_results)
    return final_results
--- a/android/run_tests.py
+++ b/android/run_tests.py
@ -209,25 +209,47 @@ class TestSharder(BaseTestSharder):
    self.log_dump_name = log_dump_name
    self.fast_and_loose = fast_and_loose
    self.build_type = build_type
-    test = SingleTestRunner(self.attached_devices[0], test_suite, gtest_filter,
-                            test_arguments, timeout, rebaseline,
-                            performance_test, cleanup_test_files, tool, 0,
-                            not not self.log_dump_name, fast_and_loose,
-                            build_type)
    self.tests = []
    if not self.gtest_filter:
      # No filter has been specified, let's add all tests then.
-      # The executable/apk needs to be copied before we can call GetAllTests.
-      test.test_package.StripAndCopyExecutable()
-      all_tests = test.test_package.GetAllTests()
-      if not rebaseline:
-        disabled_list = test.GetDisabledTests()
-        # Only includes tests that do not have any match in the disabled list.
-        all_tests = filter(lambda t:
-                           not any([fnmatch.fnmatch(t, disabled_pattern)
-                                    for disabled_pattern in disabled_list]),
-                           all_tests)
-      self.tests = all_tests
+      self.tests, self.attached_devices = self._GetTests()
+
+  def _GetTests(self):
+    """Returns a tuple of (all_tests, available_devices).
+
+    Tries to obtain the list of available tests.
+    Raises Exception if all devices failed.
+    """
+    available_devices = list(self.attached_devices)
+    while available_devices:
+      try:
+        logging.info('Obtaining tests from %s', available_devices[-1])
+        all_tests = self._GetTestsFromDevice(available_devices[-1])
+        return all_tests, available_devices
+      except Exception as e:
+        logging.info('Failed obtaining tests from %s %s',
+                     available_devices[-1], e)
+        available_devices.pop()
+    raise Exception('No device available to get the list of tests.')
+
+  def _GetTestsFromDevice(self, device):
+    test = SingleTestRunner(device, self.test_suite, self.gtest_filter,
+                            self.test_arguments, self.timeout, self.rebaseline,
+                            self.performance_test, self.cleanup_test_files,
+                            self.tool, 0,
+                            not not self.log_dump_name, self.fast_and_loose,
+                            self.build_type)
+    # The executable/apk needs to be copied before we can call GetAllTests.
+    test.test_package.StripAndCopyExecutable()
+    all_tests = test.test_package.GetAllTests()
+    if not self.rebaseline:
+      disabled_list = test.GetDisabledTests()
+      # Only includes tests that do not have any match in the disabled list.
+      all_tests = filter(lambda t:
+                         not any([fnmatch.fnmatch(t, disabled_pattern)
+                                  for disabled_pattern in disabled_list]),
+                         all_tests)
+    return all_tests

  def CreateShardedTestRunner(self, device, index):
    """Creates a suite-specific test runner.