Android: improves test sharding reliability.
Originally, test sharding for Android was written with performance in mind (i.e., to scale up the test speed per device). Now that we're on the main waterfall, we need to improve reliability, as devices may randomly drop offline during tests. This patch captures exceptions in key places and retries if there are enough devices available.

BUG=153718
TEST=run android tests, randomly unplugging devices
Review URL: https://chromiumcodereview.appspot.com/11275078

git-svn-id: http://src.chromium.org/svn/trunk/src/build@165918 4ff67af0-8c30-449e-8e8b-ad334ec8d88c
This commit is contained in:
Родитель
39ab8bb1cc
Коммит
13689003bf
|
@ -7,6 +7,7 @@ import android_commands
|
|||
import logging
|
||||
import multiprocessing
|
||||
|
||||
from android_commands import errors
|
||||
from test_result import TestResults
|
||||
|
||||
|
||||
|
@ -43,7 +44,9 @@ class BaseTestSharder(object):
|
|||
|
||||
def __init__(self, attached_devices):
|
||||
self.attached_devices = attached_devices
|
||||
self.retries = 1
|
||||
# Worst case scenario: a device will drop offline per run, so we need
|
||||
# to retry until we're out of devices.
|
||||
self.retries = len(self.attached_devices)
|
||||
self.tests = []
|
||||
|
||||
def CreateShardedTestRunner(self, device, index):
|
||||
|
@ -83,12 +86,20 @@ class BaseTestSharder(object):
|
|||
logging.warning('Try %d of %d', retry + 1, self.retries)
|
||||
self.SetupSharding(self.tests)
|
||||
test_runners = []
|
||||
for index, device in enumerate(self.attached_devices):
|
||||
logging.warning('*' * 80)
|
||||
logging.warning('Creating shard %d for %s', index, device)
|
||||
logging.warning('*' * 80)
|
||||
test_runner = self.CreateShardedTestRunner(device, index)
|
||||
test_runners += [test_runner]
|
||||
|
||||
# Try to create N shards, retrying on failure.
|
||||
try:
|
||||
for index, device in enumerate(self.attached_devices):
|
||||
logging.warning('*' * 80)
|
||||
logging.warning('Creating shard %d for %s', index, device)
|
||||
logging.warning('*' * 80)
|
||||
test_runner = self.CreateShardedTestRunner(device, index)
|
||||
test_runners += [test_runner]
|
||||
except errors.DeviceUnresponsiveError as e:
|
||||
logging.critical('****Failed to create a shard: [%s]', e)
|
||||
self.attached_devices.remove(device)
|
||||
continue
|
||||
|
||||
logging.warning('Starting...')
|
||||
pool = multiprocessing.Pool(len(self.attached_devices),
|
||||
SetTestsContainer,
|
||||
|
@ -96,8 +107,12 @@ class BaseTestSharder(object):
|
|||
# map can't handle KeyboardInterrupt exception. It's a python bug.
|
||||
# So use map_async instead.
|
||||
async_results = pool.map_async(_ShardedTestRunnable, test_runners)
|
||||
results_lists = async_results.get(999999)
|
||||
|
||||
try:
|
||||
results_lists = async_results.get(999999)
|
||||
except errors.DeviceUnresponsiveError as e:
|
||||
logging.critical('****Failed to run test: [%s]', e)
|
||||
self.attached_devices = android_commands.GetAttachedDevices()
|
||||
continue
|
||||
test_results = TestResults.FromTestResults(results_lists)
|
||||
# Re-check the attached devices for some devices may
|
||||
# become offline
|
||||
|
@ -119,5 +134,9 @@ class BaseTestSharder(object):
|
|||
self.tests += [t.name]
|
||||
if not self.tests:
|
||||
break
|
||||
else:
|
||||
# We ran out of retries, possibly out of healthy devices.
|
||||
# There's no recovery at this point.
|
||||
raise Exception('Unrecoverable error while retrying test runs.')
|
||||
self.OnTestsCompleted(test_runners, final_results)
|
||||
return final_results
|
||||
|
|
|
@ -209,25 +209,47 @@ class TestSharder(BaseTestSharder):
|
|||
self.log_dump_name = log_dump_name
|
||||
self.fast_and_loose = fast_and_loose
|
||||
self.build_type = build_type
|
||||
test = SingleTestRunner(self.attached_devices[0], test_suite, gtest_filter,
|
||||
test_arguments, timeout, rebaseline,
|
||||
performance_test, cleanup_test_files, tool, 0,
|
||||
not not self.log_dump_name, fast_and_loose,
|
||||
build_type)
|
||||
self.tests = []
|
||||
if not self.gtest_filter:
|
||||
# No filter has been specified, let's add all tests then.
|
||||
# The executable/apk needs to be copied before we can call GetAllTests.
|
||||
test.test_package.StripAndCopyExecutable()
|
||||
all_tests = test.test_package.GetAllTests()
|
||||
if not rebaseline:
|
||||
disabled_list = test.GetDisabledTests()
|
||||
# Only includes tests that do not have any match in the disabled list.
|
||||
all_tests = filter(lambda t:
|
||||
not any([fnmatch.fnmatch(t, disabled_pattern)
|
||||
for disabled_pattern in disabled_list]),
|
||||
all_tests)
|
||||
self.tests = all_tests
|
||||
self.tests, self.attached_devices = self._GetTests()
|
||||
|
||||
def _GetTests(self):
|
||||
"""Returns a tuple of (all_tests, available_devices).
|
||||
|
||||
Tries to obtain the list of available tests.
|
||||
Raises Exception if all devices failed.
|
||||
"""
|
||||
available_devices = list(self.attached_devices)
|
||||
while available_devices:
|
||||
try:
|
||||
logging.info('Obtaining tests from %s', available_devices[-1])
|
||||
all_tests = self._GetTestsFromDevice(available_devices[-1])
|
||||
return all_tests, available_devices
|
||||
except Exception as e:
|
||||
logging.info('Failed obtaining tests from %s %s',
|
||||
available_devices[-1], e)
|
||||
available_devices.pop()
|
||||
raise Exception('No device available to get the list of tests.')
|
||||
|
||||
def _GetTestsFromDevice(self, device):
|
||||
test = SingleTestRunner(device, self.test_suite, self.gtest_filter,
|
||||
self.test_arguments, self.timeout, self.rebaseline,
|
||||
self.performance_test, self.cleanup_test_files,
|
||||
self.tool, 0,
|
||||
not not self.log_dump_name, self.fast_and_loose,
|
||||
self.build_type)
|
||||
# The executable/apk needs to be copied before we can call GetAllTests.
|
||||
test.test_package.StripAndCopyExecutable()
|
||||
all_tests = test.test_package.GetAllTests()
|
||||
if not self.rebaseline:
|
||||
disabled_list = test.GetDisabledTests()
|
||||
# Only includes tests that do not have any match in the disabled list.
|
||||
all_tests = filter(lambda t:
|
||||
not any([fnmatch.fnmatch(t, disabled_pattern)
|
||||
for disabled_pattern in disabled_list]),
|
||||
all_tests)
|
||||
return all_tests
|
||||
|
||||
def CreateShardedTestRunner(self, device, index):
|
||||
"""Creates a suite-specific test runner.
|
||||
|
|
Загрузка…
Ссылка в новой задаче