Android: improves test sharding reliability.

Originally, test sharding for Android was written with performance in mind
(i.e., to scale up the test speed per device).
Now that we're on the main waterfall, we need to improve reliability as
devices may randomly drop offline during tests.
This patch captures exceptions in key places and retries if there are enough
devices available.

BUG=153718
TEST=run android tests, randomly unplugging devices


Review URL: https://chromiumcodereview.appspot.com/11275078

git-svn-id: http://src.chromium.org/svn/trunk/src/build@165918 4ff67af0-8c30-449e-8e8b-ad334ec8d88c
This commit is contained in:
bulach@chromium.org 2012-11-05 11:49:15 +00:00
Родитель 39ab8bb1cc
Коммит 13689003bf
2 изменённых файлов: 66 добавлений и 25 удалений

Просмотреть файл

@ -7,6 +7,7 @@ import android_commands
import logging
import multiprocessing
from android_commands import errors
from test_result import TestResults
@ -43,7 +44,9 @@ class BaseTestSharder(object):
def __init__(self, attached_devices):
self.attached_devices = attached_devices
self.retries = 1
# Worst case scenario: a device will drop offline per run, so we need
# to retry until we're out of devices.
self.retries = len(self.attached_devices)
self.tests = []
def CreateShardedTestRunner(self, device, index):
@ -83,12 +86,20 @@ class BaseTestSharder(object):
logging.warning('Try %d of %d', retry + 1, self.retries)
self.SetupSharding(self.tests)
test_runners = []
for index, device in enumerate(self.attached_devices):
logging.warning('*' * 80)
logging.warning('Creating shard %d for %s', index, device)
logging.warning('*' * 80)
test_runner = self.CreateShardedTestRunner(device, index)
test_runners += [test_runner]
# Try to create N shards, retrying on failure.
try:
for index, device in enumerate(self.attached_devices):
logging.warning('*' * 80)
logging.warning('Creating shard %d for %s', index, device)
logging.warning('*' * 80)
test_runner = self.CreateShardedTestRunner(device, index)
test_runners += [test_runner]
except errors.DeviceUnresponsiveError as e:
logging.critical('****Failed to create a shard: [%s]', e)
self.attached_devices.remove(device)
continue
logging.warning('Starting...')
pool = multiprocessing.Pool(len(self.attached_devices),
SetTestsContainer,
@ -96,8 +107,12 @@ class BaseTestSharder(object):
# map can't handle KeyboardInterrupt exception. It's a python bug.
# So use map_async instead.
async_results = pool.map_async(_ShardedTestRunnable, test_runners)
results_lists = async_results.get(999999)
try:
results_lists = async_results.get(999999)
except errors.DeviceUnresponsiveError as e:
logging.critical('****Failed to run test: [%s]', e)
self.attached_devices = android_commands.GetAttachedDevices()
continue
test_results = TestResults.FromTestResults(results_lists)
# Re-check the attached devices for some devices may
# become offline
@ -119,5 +134,9 @@ class BaseTestSharder(object):
self.tests += [t.name]
if not self.tests:
break
else:
# We ran out of retries, possibly out of healthy devices.
# There's no recovery at this point.
raise Exception('Unrecoverable error while retrying test runs.')
self.OnTestsCompleted(test_runners, final_results)
return final_results

Просмотреть файл

@ -209,25 +209,47 @@ class TestSharder(BaseTestSharder):
self.log_dump_name = log_dump_name
self.fast_and_loose = fast_and_loose
self.build_type = build_type
test = SingleTestRunner(self.attached_devices[0], test_suite, gtest_filter,
test_arguments, timeout, rebaseline,
performance_test, cleanup_test_files, tool, 0,
not not self.log_dump_name, fast_and_loose,
build_type)
self.tests = []
if not self.gtest_filter:
# No filter has been specified, let's add all tests then.
# The executable/apk needs to be copied before we can call GetAllTests.
test.test_package.StripAndCopyExecutable()
all_tests = test.test_package.GetAllTests()
if not rebaseline:
disabled_list = test.GetDisabledTests()
# Only includes tests that do not have any match in the disabled list.
all_tests = filter(lambda t:
not any([fnmatch.fnmatch(t, disabled_pattern)
for disabled_pattern in disabled_list]),
all_tests)
self.tests = all_tests
self.tests, self.attached_devices = self._GetTests()
def _GetTests(self):
  """Returns a tuple of (all_tests, available_devices).

  Tries to obtain the list of available tests, asking one device at a time
  and starting from the last entry of self.attached_devices. A device that
  fails the query is dropped from the returned device list and the next one
  is tried. self.attached_devices itself is not modified here; a copy is
  used.

  Returns:
    A tuple (all_tests, available_devices): the value returned by
    _GetTestsFromDevice for the first device that answered, and the devices
    that were not dropped (the answering device is the last entry).

  Raises:
    Exception: if all devices failed, or there were no devices to begin
      with. The message includes the most recent failure when there is one.
  """
  available_devices = list(self.attached_devices)
  last_error = None
  while available_devices:
    device = available_devices[-1]
    logging.info('Obtaining tests from %s', device)
    try:
      all_tests = self._GetTestsFromDevice(device)
    except Exception as e:
      # Deliberately broad: whatever went wrong, the recovery is the same
      # (drop this device and try the next one). Logged as a warning so
      # the failure is visible at the default log level.
      logging.warning('Failed obtaining tests from %s %s', device, e)
      last_error = e
      available_devices.pop()
    else:
      return all_tests, available_devices
  message = 'No device available to get the list of tests.'
  if last_error is not None:
    # Keep the underlying cause so the failure can be diagnosed.
    message += ' Last error: %s' % last_error
  raise Exception(message)
def _GetTestsFromDevice(self, device):
  """Returns the tests to run, as reported by the given device.

  Builds a SingleTestRunner for the device, copies the test executable/apk
  to it, and asks the test package for all of its tests. Unless
  rebaselining, tests matching any pattern returned by GetDisabledTests()
  are left out.

  Args:
    device: The device to query; passed through to SingleTestRunner.

  Returns:
    The result of GetAllTests() when rebaselining; otherwise a new list
    holding only the tests that match no disabled pattern.
  """
  test = SingleTestRunner(device, self.test_suite, self.gtest_filter,
                          self.test_arguments, self.timeout, self.rebaseline,
                          self.performance_test, self.cleanup_test_files,
                          self.tool,
                          0,  # NOTE(review): presumably the shard index - confirm.
                          bool(self.log_dump_name), self.fast_and_loose,
                          self.build_type)
  # The executable/apk needs to be copied before we can call GetAllTests.
  test.test_package.StripAndCopyExecutable()
  all_tests = test.test_package.GetAllTests()
  if not self.rebaseline:
    disabled_list = test.GetDisabledTests()
    # Only includes tests that do not have any match in the disabled list.
    # Built as a real list (not filter()) so the result supports len() and
    # truthiness checks regardless of the Python version.
    all_tests = [t for t in all_tests
                 if not any(fnmatch.fnmatch(t, disabled_pattern)
                            for disabled_pattern in disabled_list)]
  return all_tests
def CreateShardedTestRunner(self, device, index):
"""Creates a suite-specific test runner.