Bug 1479290 - Reduce memory usage of wpt-update, r=ato

Summary:
The memory usage of wpt-update has always been high, and accidentially
regressed in Bug 1476053 to really problematic levels when doing a
full metadata update for all test results. This patch implements a
number of changes that reduce the peak memory usage to ~1Gb and the
heap after reading all results data to ~600Mb.

THere are several changes in this patch:

* Change from a dict {test: [(subtest, prop, run_info, result)]} to a
  nested dictionary {test: {subtest: [(prop, run_info,
  result)]}}. This fixes the silliness that caused the previous
  regression.

* Convert all unicode data to bytestrings and then intern() the
  bytestring. This allows reusing string allocations for repeated
  strings, of which there are many.

* Process the test manifests upfront and then allow the manifest data
  to be gc'd before starting to process the results files, so that we
  are not holding on to data that is no longer required.

* Stop storing status-type results as (status, default_expected); just
  look up the default_expected for a specific test if we have to
  update the expectations for that test.

* Add __slots__ to all frequently-allocated classes to avoid the
  overhead of constructing an __dict__ for each instance, which
  typically has size overhead (see
  e.g. http://book.pythontips.com/en/latest/__slots__magic.html )

* Move to using a custom compact representation for the list of
  per-subtest results i.e. [(prop, run_info, results)]. This uses an
  array.array of 2-byte ints to store the entries and effectively
  interns the values, bit packing the representations of property to
  be updated and result (if it's a test status) into 4 bits each in
  the first byte, and the run info into the second byte. Properties
  with a complex data representation (e.g. LSAN failures) are stored
  directly, with a 0 value of the status bits indicating that the
  complex value should be read.

Reviewers: ato

Tags: #secure-revision

Bug #: 1479290

Differential Revision: https://phabricator.services.mozilla.com/D2496

MozReview-Commit-ID: 7zGpdzdaqRj
This commit is contained in:
James Graham 2018-07-29 16:13:15 +01:00
Родитель b5603c9dbf
Коммит 23304678f9
1 изменённых файлов: 214 добавлений и 104 удалений

Просмотреть файл

@ -1,3 +1,4 @@
import array
import os
import shutil
import tempfile
@ -17,19 +18,12 @@ manifestitem = None
logger = structuredlog.StructuredLogger("web-platform-tests")
try:
import ujson as json
except ImportError:
import json
def load_test_manifests(serve_root, test_paths):
do_delayed_imports(serve_root)
manifest_loader = testloader.ManifestLoader(test_paths, False)
return manifest_loader.load()
def update_expected(test_paths, serve_root, log_file_names,
rev_old=None, rev_new="HEAD", ignore_existing=False,
sync_root=None, property_order=None, boolean_properties=None,
@ -39,10 +33,11 @@ def update_expected(test_paths, serve_root, log_file_names,
If stability is not None, assume log_file_names refers to logs from repeated
test jobs, disable tests that don't behave as expected on all runs"""
do_delayed_imports(serve_root)
manifests = load_test_manifests(serve_root, test_paths)
id_test_map = load_test_data(test_paths)
for metadata_path, updated_ini in update_from_logs(manifests,
for metadata_path, updated_ini in update_from_logs(id_test_map,
*log_file_names,
ignore_existing=ignore_existing,
property_order=property_order,
@ -122,33 +117,104 @@ def unexpected_changes(manifests, change_data, files_changed):
# Check if all the RHS values are the same; if so collapse the conditionals
def update_from_logs(manifests, *log_filenames, **kwargs):
class InternedData(object):
"""Class for interning data of any (hashable) type.
This class is intended for building a mapping of int <=> value, such
that the integer may be stored as a proxy for the real value, and then
the real value obtained later from the proxy value.
In order to support the use case of packing the integer value as binary,
it is possible to specify a maximum bitsize of the data; adding more items
than this allowed will result in a ValueError exception.
The zero value is reserved to use as a sentinal."""
type_conv = None
rev_type_conv = None
def __init__(self, max_bits=8):
self.max_idx = 2**max_bits - 2
# Reserve 0 as a sentinal
self._data = [None], {}
def store(self, obj):
if self.type_conv is not None:
obj = self.type_conv(obj)
objs, obj_to_idx = self._data
if obj not in obj_to_idx:
value = len(objs)
objs.append(obj)
obj_to_idx[obj] = value
if value > self.max_idx:
raise ValueError
else:
value = obj_to_idx[obj]
return value
def get(self, idx):
obj = self._data[0][idx]
if self.rev_type_conv is not None:
obj = self.rev_type_conv(obj)
return obj
class RunInfoInterned(InternedData):
def type_conv(self, value):
return tuple(value.items())
def rev_type_conv(self, value):
return dict(value)
prop_intern = InternedData(4)
run_info_intern = RunInfoInterned()
status_intern = InternedData(4)
def load_test_data(test_paths):
manifest_loader = testloader.ManifestLoader(test_paths, False)
manifests = manifest_loader.load()
id_test_map = {}
for test_manifest, paths in manifests.iteritems():
id_test_map.update(create_test_tree(paths["metadata_path"],
test_manifest))
return id_test_map
def update_from_logs(id_test_map, *log_filenames, **kwargs):
ignore_existing = kwargs.get("ignore_existing", False)
property_order = kwargs.get("property_order")
boolean_properties = kwargs.get("boolean_properties")
stability = kwargs.get("stability")
id_test_map = {}
for test_manifest, paths in manifests.iteritems():
id_test_map.update(create_test_tree(
paths["metadata_path"],
test_manifest))
updater = ExpectedUpdater(manifests,
id_test_map,
updater = ExpectedUpdater(id_test_map,
ignore_existing=ignore_existing)
for log_filename in log_filenames:
for i, log_filename in enumerate(log_filenames):
print("Processing log %d/%d" % (i + 1, len(log_filenames)))
with open(log_filename) as f:
updater.update_from_log(f)
for item in update_results(id_test_map, property_order, boolean_properties, stability):
yield item
def update_results(id_test_map, property_order, boolean_properties, stability):
test_file_items = set(id_test_map.itervalues())
default_expected_by_type = {}
for test_type, test_cls in wpttest.manifest_test_cls.iteritems():
if test_cls.result_cls:
default_expected_by_type[(test_type, False)] = test_cls.result_cls.default_expected
if test_cls.subtest_result_cls:
default_expected_by_type[(test_type, True)] = test_cls.subtest_result_cls.default_expected
for test_file in test_file_items:
updated_expected = test_file.update(property_order, boolean_properties, stability)
updated_expected = test_file.update(property_order, boolean_properties, stability,
default_expected_by_type)
if updated_expected is not None and updated_expected.modified:
yield test_file.metadata_path, updated_expected
@ -214,7 +280,7 @@ def write_new_expected(metadata_path, expected):
class ExpectedUpdater(object):
def __init__(self, test_manifests, id_test_map, ignore_existing=False):
def __init__(self, id_test_map, ignore_existing=False):
self.id_test_map = id_test_map
self.ignore_existing = ignore_existing
self.run_info = None
@ -226,13 +292,6 @@ class ExpectedUpdater(object):
"lsan_leak": self.lsan_leak}
self.tests_visited = {}
self.types_by_path = {}
for manifest in test_manifests.iterkeys():
for test_type, path, _ in manifest:
if test_type in wpttest.manifest_test_cls:
self.types_by_path[path] = wpttest.manifest_test_cls[test_type]
self.run_infos = []
def update_from_log(self, log_file):
self.run_info = None
try:
@ -282,10 +341,10 @@ class ExpectedUpdater(object):
action_map["lsan_leak"](item)
def suite_start(self, data):
self.run_info = data["run_info"]
self.run_info = run_info_intern.store(data["run_info"])
def test_start(self, data):
test_id = data["test"]
test_id = intern(data["test"].encode("utf8"))
try:
test_data = self.id_test_map[test_id]
except KeyError:
@ -298,20 +357,15 @@ class ExpectedUpdater(object):
self.tests_visited[test_id] = set()
def test_status(self, data):
test_id = data["test"]
subtest = data["subtest"]
test_id = intern(data["test"].encode("utf8"))
subtest = intern(data["subtest"].encode("utf8"))
test_data = self.id_test_map.get(test_id)
if test_data is None:
return
test_cls = self.types_by_path[test_data.test_path]
self.tests_visited[test_id].add(subtest)
result = test_cls.subtest_result_cls(
subtest,
data["status"],
None)
result = status_intern.store(data["status"])
test_data.set(test_id, subtest, "status", self.run_info, result)
if data.get("expected") and data["expected"] != data["status"]:
@ -321,23 +375,20 @@ class ExpectedUpdater(object):
if data["status"] == "SKIP":
return
test_id = data["test"]
test_id = intern(data["test"].encode("utf8"))
test_data = self.id_test_map.get(test_id)
if test_data is None:
return
test_cls = self.types_by_path[test_data.test_path]
result = status_intern.store(data["status"])
result = test_cls.result_cls(
data["status"],
None)
test_data.set(test_id, None, "status", self.run_info, result)
if data.get("expected") and data["status"] != data["expected"]:
test_data.set_requires_update()
del self.tests_visited[test_id]
def assertion_count(self, data):
test_id = data["test"]
test_id = intern(data["test"].encode("utf8"))
test_data = self.id_test_map.get(test_id)
if test_data is None:
return
@ -348,20 +399,18 @@ class ExpectedUpdater(object):
def lsan_leak(self, data):
dir_path = data.get("scope", "/")
dir_id = os.path.join(dir_path, "__dir__").replace(os.path.sep, "/")
dir_id = intern(os.path.join(dir_path, "__dir__").replace(os.path.sep, "/").encode("utf8"))
if dir_id.startswith("/"):
dir_id = dir_id[1:]
test_data = self.id_test_map[dir_id]
test_data.set(dir_id, None, "lsan", self.run_info, (data["frames"], data.get("allowed_match")))
test_data.set(dir_id, None, "lsan",
self.run_info, (data["frames"], data.get("allowed_match")))
if not data.get("allowed_match"):
test_data.set_requires_update()
def create_test_tree(metadata_path, test_manifest):
"""Create a map of expectation manifests for all tests in test_manifest,
reading existing manifests under manifest_path
:returns: A map of test_id to (manifest, test, expectation_data)
"""Create a map of test_id to TestFileData for that test.
"""
id_test_map = {}
exclude_types = frozenset(["stub", "helper", "manual", "support", "conformancechecker"])
@ -370,13 +419,14 @@ def create_test_tree(metadata_path, test_manifest):
issubclass(item, manifestitem.ManifestItem) and
item.item_type is not None]
include_types = set(all_types) - exclude_types
for _, test_path, tests in test_manifest.itertypes(*include_types):
test_file_data = TestFileData(test_manifest,
for item_type, test_path, tests in test_manifest.itertypes(*include_types):
test_file_data = TestFileData(intern(test_manifest.url_base.encode("utf8")),
intern(item_type.encode("utf8")),
metadata_path,
test_path,
tests)
for test in tests:
id_test_map[test.id] = test_file_data
id_test_map[intern(test.id.encode("utf8"))] = test_file_data
dir_path = os.path.split(test_path)[0].replace(os.path.sep, "/")
while True:
@ -384,9 +434,10 @@ def create_test_tree(metadata_path, test_manifest):
dir_id = dir_path + "/__dir__"
else:
dir_id = "__dir__"
dir_id = (test_manifest.url_base + dir_id).lstrip("/")
dir_id = intern((test_manifest.url_base + dir_id).lstrip("/").encode("utf8"))
if dir_id not in id_test_map:
test_file_data = TestFileData(test_manifest,
test_file_data = TestFileData(intern(test_manifest.url_base.encode("utf8")),
None,
metadata_path,
dir_id,
[])
@ -398,70 +449,126 @@ def create_test_tree(metadata_path, test_manifest):
return id_test_map
class PackedResultList(object):
"""Class for storing test results.
Results are stored as an array of 2-byte integers for compactness.
The first 4 bits represent the property name, the second 4 bits
represent the test status (if it's a result with a status code), and
the final 8 bits represent the run_info. If the result doesn't have a
simple status code but instead a richer type, we place that richer type
in a dictionary and set the status part of the result type to 0.
This class depends on the global prop_intern, run_info_intern and
status_intern InteredData objects to convert between the bit values
and corresponding Python objects."""
def __init__(self):
self.data = array.array("H")
__slots__ = ("data", "raw_data")
def append(self, prop, run_info, value):
out_val = (prop << 12) + run_info
if prop == prop_intern.store("status"):
out_val += value << 8
else:
if not hasattr(self, "raw_data"):
self.raw_data = {}
self.raw_data[len(self.data)] = value
self.data.append(out_val)
def unpack(self, idx, packed):
prop = prop_intern.get((packed & 0xF000) >> 12)
value_idx = (packed & 0x0F00) >> 8
if value_idx is 0:
value = self.raw_data[idx]
else:
value = status_intern.get(value_idx)
run_info = run_info_intern.get((packed & 0x00FF))
return prop, run_info, value
def __iter__(self):
for i, item in enumerate(self.data):
yield self.unpack(i, item)
class TestFileData(object):
def __init__(self, test_manifest, metadata_path, test_path, tests):
self.test_manifest = test_manifest
__slots__ = ("url_base", "item_type", "test_path", "metadata_path", "tests",
"_requires_update", "clear", "data")
def __init__(self, url_base, item_type, metadata_path, test_path, tests):
self.url_base = url_base
self.item_type = item_type
self.test_path = test_path
self.metadata_path = metadata_path
self.tests = tests
self._expected = None
self.tests = {intern(item.id.encode("utf8")) for item in tests}
self._requires_update = False
self.clear = set()
self.data = []
self.data = defaultdict(lambda: defaultdict(PackedResultList))
def set_requires_update(self):
self._requires_update = True
def set(self, test_id, subtest_id, prop, run_info, value):
self.data.append((test_id, subtest_id, prop, run_info, value))
self.data[test_id][subtest_id].append(prop_intern.store(prop),
run_info,
value)
def expected(self, property_order, boolean_properties):
if self._expected is None:
expected_data = load_expected(self.test_manifest,
self.metadata_path,
self.test_path,
self.tests,
property_order,
boolean_properties)
if expected_data is None:
expected_data = create_expected(self.test_manifest,
self.test_path,
property_order,
boolean_properties)
self._expected = expected_data
return self._expected
expected_data = load_expected(self.url_base,
self.metadata_path,
self.test_path,
self.tests,
property_order,
boolean_properties)
if expected_data is None:
expected_data = create_expected(self.url_base,
self.test_path,
property_order,
boolean_properties)
return expected_data
def update(self, property_order, boolean_properties, stability):
def update(self, property_order, boolean_properties, stability,
default_expected_by_type):
if not self._requires_update:
return
expected = self.expected(property_order, boolean_properties)
expected_by_test = {}
for test in self.tests:
if not expected.has_test(test.id):
expected.append(manifestupdate.TestNode.create(test.id))
test_expected = expected.get_test(test.id)
expected_by_test[test.id] = test_expected
for test_id in self.tests:
if not expected.has_test(test_id):
expected.append(manifestupdate.TestNode.create(test_id))
test_expected = expected.get_test(test_id)
expected_by_test[test_id] = test_expected
for prop in self.clear:
test_expected.clear(prop)
for (test_id, subtest_id, prop, run_info, value) in self.data:
# Special case directory metadata
if subtest_id is None and test_id.endswith("__dir__"):
if prop == "lsan":
expected.set_lsan(run_info, value)
continue
for test_id, test_data in self.data.iteritems():
for subtest_id, results_list in test_data.iteritems():
for prop, run_info, value in results_list:
# Special case directory metadata
if subtest_id is None and test_id.endswith("__dir__"):
if prop == "lsan":
expected.set_lsan(run_info, value)
continue
test_expected = expected_by_test[test_id]
if subtest_id is None:
item_expected = test_expected
else:
item_expected = test_expected.get_subtest(subtest_id)
if prop == "status":
item_expected.set_result(run_info, value)
elif prop == "asserts":
item_expected.set_asserts(run_info, value)
if prop == "status":
value = Result(value, default_expected_by_type[self.item_type,
subtest_id is not None])
test_expected = expected_by_test[test_id]
if subtest_id is None:
item_expected = test_expected
else:
item_expected = test_expected.get_subtest(subtest_id)
if prop == "status":
item_expected.set_result(run_info, value)
elif prop == "asserts":
item_expected.set_asserts(run_info, value)
expected.coalesce_properties(stability=stability)
for test in expected.iterchildren():
@ -472,29 +579,32 @@ class TestFileData(object):
return expected
def create_expected(test_manifest, test_path, property_order=None,
Result = namedtuple("Result", ["status", "default_expected"])
def create_expected(url_base, test_path, property_order=None,
boolean_properties=None):
expected = manifestupdate.ExpectedManifest(None, test_path, test_manifest.url_base,
expected = manifestupdate.ExpectedManifest(None,
test_path,
url_base,
property_order=property_order,
boolean_properties=boolean_properties)
return expected
def load_expected(test_manifest, metadata_path, test_path, tests, property_order=None,
def load_expected(url_base, metadata_path, test_path, tests, property_order=None,
boolean_properties=None):
expected_manifest = manifestupdate.get_manifest(metadata_path,
test_path,
test_manifest.url_base,
url_base,
property_order=property_order,
boolean_properties=boolean_properties)
if expected_manifest is None:
return
tests_by_id = {item.id for item in tests}
# Remove expected data for tests that no longer exist
for test in expected_manifest.iterchildren():
if test.id not in tests_by_id:
if test.id not in tests:
test.remove()
return expected_manifest