diff --git a/android/pylib/base_test_sharder.py b/android/pylib/base_test_sharder.py index 530676915..b8d03c830 100644 --- a/android/pylib/base_test_sharder.py +++ b/android/pylib/base_test_sharder.py @@ -7,6 +7,7 @@ import android_commands import logging import multiprocessing +from android_commands import errors from test_result import TestResults @@ -43,7 +44,9 @@ class BaseTestSharder(object): def __init__(self, attached_devices): self.attached_devices = attached_devices - self.retries = 1 + # Worst case scenario: a device will drop offline per run, so we need + # to retry until we're out of devices. + self.retries = len(self.attached_devices) self.tests = [] def CreateShardedTestRunner(self, device, index): @@ -83,12 +86,20 @@ class BaseTestSharder(object): logging.warning('Try %d of %d', retry + 1, self.retries) self.SetupSharding(self.tests) test_runners = [] - for index, device in enumerate(self.attached_devices): - logging.warning('*' * 80) - logging.warning('Creating shard %d for %s', index, device) - logging.warning('*' * 80) - test_runner = self.CreateShardedTestRunner(device, index) - test_runners += [test_runner] + + # Try to create N shards, and retrying on failure. + try: + for index, device in enumerate(self.attached_devices): + logging.warning('*' * 80) + logging.warning('Creating shard %d for %s', index, device) + logging.warning('*' * 80) + test_runner = self.CreateShardedTestRunner(device, index) + test_runners += [test_runner] + except errors.DeviceUnresponsiveError as e: + logging.critical('****Failed to create a shard: [%s]', e) + self.attached_devices.remove(device) + continue + logging.warning('Starting...') pool = multiprocessing.Pool(len(self.attached_devices), SetTestsContainer, @@ -96,8 +107,12 @@ class BaseTestSharder(object): # map can't handle KeyboardInterrupt exception. It's a python bug. # So use map_async instead. async_results = pool.map_async(_ShardedTestRunnable, test_runners) - results_lists = async_results.get(999999) - + try: + results_lists = async_results.get(999999) + except errors.DeviceUnresponsiveError as e: + logging.critical('****Failed to run test: [%s]', e) + self.attached_devices = android_commands.GetAttachedDevices() + continue test_results = TestResults.FromTestResults(results_lists) # Re-check the attached devices for some devices may # become offline @@ -119,5 +134,9 @@ class BaseTestSharder(object): self.tests += [t.name] if not self.tests: break + else: + # We ran out retries, possibly out of healthy devices. + # There's no recovery at this point. + raise Exception('Unrecoverable error while retrying test runs.') self.OnTestsCompleted(test_runners, final_results) return final_results diff --git a/android/run_tests.py b/android/run_tests.py index 1d58daff9..6b0ca7eda 100755 --- a/android/run_tests.py +++ b/android/run_tests.py @@ -209,25 +209,47 @@ class TestSharder(BaseTestSharder): self.log_dump_name = log_dump_name self.fast_and_loose = fast_and_loose self.build_type = build_type - test = SingleTestRunner(self.attached_devices[0], test_suite, gtest_filter, - test_arguments, timeout, rebaseline, - performance_test, cleanup_test_files, tool, 0, - not not self.log_dump_name, fast_and_loose, - build_type) self.tests = [] if not self.gtest_filter: # No filter has been specified, let's add all tests then. - # The executable/apk needs to be copied before we can call GetAllTests. - test.test_package.StripAndCopyExecutable() - all_tests = test.test_package.GetAllTests() - if not rebaseline: - disabled_list = test.GetDisabledTests() - # Only includes tests that do not have any match in the disabled list. - all_tests = filter(lambda t: - not any([fnmatch.fnmatch(t, disabled_pattern) - for disabled_pattern in disabled_list]), - all_tests) - self.tests = all_tests + self.tests, self.attached_devices = self._GetTests() + + def _GetTests(self): + """Returns a tuple of (all_tests, available_devices). + + Tries to obtain the list of available tests. + Raises Exception if all devices failed. + """ + available_devices = list(self.attached_devices) + while available_devices: + try: + logging.info('Obtaining tests from %s', available_devices[-1]) + all_tests = self._GetTestsFromDevice(available_devices[-1]) + return all_tests, available_devices + except Exception as e: + logging.info('Failed obtaining tests from %s %s', + available_devices[-1], e) + available_devices.pop() + raise Exception('No device available to get the list of tests.') + + def _GetTestsFromDevice(self, device): + test = SingleTestRunner(device, self.test_suite, self.gtest_filter, + self.test_arguments, self.timeout, self.rebaseline, + self.performance_test, self.cleanup_test_files, + self.tool, 0, + not not self.log_dump_name, self.fast_and_loose, + self.build_type) + # The executable/apk needs to be copied before we can call GetAllTests. + test.test_package.StripAndCopyExecutable() + all_tests = test.test_package.GetAllTests() + if not self.rebaseline: + disabled_list = test.GetDisabledTests() + # Only includes tests that do not have any match in the disabled list. + all_tests = filter(lambda t: + not any([fnmatch.fnmatch(t, disabled_pattern) + for disabled_pattern in disabled_list]), + all_tests) + return all_tests def CreateShardedTestRunner(self, device, index): """Creates a suite-specific test runner.