Bug 875562 - Part 7: Implement limits for stored crash data; r=Yoric

This patch institutes a limit on how much crash data can be stored,
expressed as a maximum number of crash events per day. If more events
than the limit arrive, we record that the high water mark has been
reached and silently discard the payloads of subsequent events.
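
The core of the mechanism, as a minimal sketch (names mirror the
constants and the _countsByDay structure added to CrashStore below):

    // Sketch only: per-day, per-type counting with a high water mark.
    const HIGH_WATER_DAILY_THRESHOLD = 100;
    const TYPE_MAIN_CRASH = "main-crash";

    let countsByDay = new Map(); // day (int) -> Map(type -> count)

    function shouldStore(day, type) {
      if (!countsByDay.has(day)) {
        countsByDay.set(day, new Map());
      }
      let counts = countsByDay.get(day);
      let count = (counts.get(type) || 0) + 1;
      counts.set(type, count); // the true count is always preserved
      // Payloads past the threshold are dropped, except main process
      // crashes, which are always stored.
      return count <= HIGH_WATER_DAILY_THRESHOLD || type == TYPE_MAIN_CRASH;
    }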

We chose not to increment the version of the store payload because no
clients are actively saving this data yet, so a version bump isn't
warranted.

--HG--
extra : rebase_source : 3d5d92668b38e8b9c3c45e1a61a3a172b576e0a5
extra : amend_source : 71fffa0724450c5fe13563d2539813e9a831b3de
Gregory Szorc 2014-01-28 17:33:38 -08:00
Parent d1c9b4eb93
Commit 738b579962
4 changed files with 208 additions and 20 deletions

View file

@@ -30,6 +30,13 @@ const AGGREGATE_STARTUP_DELAY_MS = 57000;
const MILLISECONDS_IN_DAY = 24 * 60 * 60 * 1000;
// Converts Date to days since UNIX epoch.
// This was copied from /services/metrics.storage.jsm. The implementation
// does not account for leap seconds.
function dateToDays(date) {
return Math.floor(date.getTime() / MILLISECONDS_IN_DAY);
}
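// For example, midnight UTC on 2014-01-01 falls 16071 days after the
// epoch: Math.floor(Date.UTC(2014, 0, 1) / MILLISECONDS_IN_DAY) == 16071.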
/**
* A gateway to crash-related data.
@@ -520,6 +527,17 @@ let gCrashManager;
* When metadata is updated, the caller must explicitly persist the changes
* to disk. This prevents excessive I/O during updates.
*
* The store has a mechanism for ensuring it doesn't grow too large. A ceiling
* is placed on the number of daily events stored for event types that can
* occur with relatively high frequency, notably plugin crashes and hangs
* (plugins can enter cycles where they repeatedly crash). If we've reached
* the high water mark and new data arrives, it's silently dropped.
* However, the count of actual events is always preserved. This allows
* us to report on the severity of problems beyond the storage threshold.
*
* Main process crashes are excluded from these limits because they are
* both important and rare.
*
* @param storeDir (string)
* Directory the store should be located in.
* @param telemetrySizeKey (string)
@@ -534,25 +552,41 @@ function CrashStore(storeDir, telemetrySizeKey) {
// Holds the read data from disk.
this._data = null;
// Maps days since UNIX epoch to a Map of event types to counts.
// This data structure is populated when the JSON file is loaded
// and is also updated when new events are added.
this._countsByDay = new Map();
}
CrashStore.prototype = Object.freeze({
// A crash that occurred in the main process.
TYPE_MAIN_CRASH: "main-crash",
// A crash in a plugin process.
TYPE_PLUGIN_CRASH: "plugin-crash",
// A hang in a plugin process.
TYPE_PLUGIN_HANG: "plugin-hang",
// Maximum number of events to store per type per day. This establishes
// a ceiling on the per-type/per-day records that will be stored.
HIGH_WATER_DAILY_THRESHOLD: 100,
/**
* Load data from disk.
*
-   * @return Promise<null>
+   * @return Promise
*/
load: function () {
return Task.spawn(function* () {
// Loading replaces any existing data, so reset the data structures.
this._data = {
v: 1,
crashes: new Map(),
corruptDate: null,
};
this._countsByDay = new Map();
try {
let decoder = new TextDecoder();
@@ -563,10 +597,42 @@ CrashStore.prototype = Object.freeze({
this._data.corruptDate = new Date(data.corruptDate);
}
// actualCounts is used to validate that the counts by day stored in
// the payload match the actual crash data.
let actualCounts = new Map();
for (let id in data.crashes) {
let crash = data.crashes[id];
let denormalized = this._denormalize(crash);
this._data.crashes.set(id, denormalized);
let key = dateToDays(denormalized.crashDate) + "-" + denormalized.type;
actualCounts.set(key, (actualCounts.get(key) || 0) + 1);
}
// The validation in this loop is arguably not necessary. We perform
// it as a defense against unknown bugs.
for (let dayKey in data.countsByDay) {
let day = parseInt(dayKey, 10);
for (let type in data.countsByDay[day]) {
this._ensureCountsForDay(day);
let count = data.countsByDay[day][type];
let key = day + "-" + type;
// If the payload says we have data for a given day but we
// don't, the payload is wrong. Ignore it.
if (!actualCounts.has(key)) {
continue;
}
// Use the larger of the two counts: the payload's count can
// legitimately exceed the stored records (payloads beyond the
// threshold are dropped), and if more records exist than the
// payload claims, the actual count wins.
count = Math.max(count, actualCounts.get(key));
this._countsByDay.get(day).set(type, count);
}
}
} catch (ex if ex instanceof OS.File.Error && ex.becauseNoSuchFile) {
// Missing files (first use) are allowed.
@@ -594,8 +660,22 @@ CrashStore.prototype = Object.freeze({
}
let normalized = {
// The version should be incremented whenever the format
// changes.
v: 1,
// Maps crash IDs to objects defining the crash.
crashes: {},
// Maps days since UNIX epoch to objects mapping event types to
// counts. This is a mirror of this._countsByDay. e.g.
// {
// 15000: {
// "main-crash": 2,
// "plugin-crash": 1
// }
// }
countsByDay: {},
// When the store was last corrupted.
corruptDate: null,
};
@@ -608,6 +688,13 @@ CrashStore.prototype = Object.freeze({
normalized.crashes[id] = c;
}
for (let [day, m] of this._countsByDay) {
normalized.countsByDay[day] = {};
for (let [type, count] of m) {
normalized.countsByDay[day][type] = count;
}
}
let encoder = new TextEncoder();
let data = encoder.encode(JSON.stringify(normalized));
let size = yield OS.File.writeAtomic(this._storePath, data, {
@@ -729,16 +816,51 @@ CrashStore.prototype = Object.freeze({
return null;
},
-  _ensureCrashRecord: function (id) {
+  _ensureCountsForDay: function (day) {
+    if (!this._countsByDay.has(day)) {
+      this._countsByDay.set(day, new Map());
+    }
+  },
/**
* Ensure the crash record is present in storage.
*
* Returns the crash record if we're allowed to store it or null
* if we've hit the high water mark.
*
* @param id
* (string) The crash ID.
* @param type
* (string) One of the this.TYPE_* constants describing the crash type.
* @param date
* (Date) When this crash occurred.
*
* @return null | object crash record
*/
_ensureCrashRecord: function (id, type, date) {
let day = dateToDays(date);
this._ensureCountsForDay(day);
let count = (this._countsByDay.get(day).get(type) || 0) + 1;
this._countsByDay.get(day).set(type, count);
if (count > this.HIGH_WATER_DAILY_THRESHOLD && type != this.TYPE_MAIN_CRASH) {
return null;
}
if (!this._data.crashes.has(id)) {
this._data.crashes.set(id, {
id: id,
-        type: null,
-        crashDate: null,
+        type: type,
+        crashDate: date,
});
}
-    return this._data.crashes.get(id);
+    let crash = this._data.crashes.get(id);
+    crash.type = type;
+    crash.crashDate = date;
+    return crash;
},
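// For illustration (hypothetical IDs, fresh store, threshold of 100):
//   store.addPluginCrash("pc1", date);     // stored; count -> 1
//   ...                                    // 99 more stored
//   store.addPluginCrash("pc101", date);   // dropped; count -> 101
//   store.addMainProcessCrash("m1", date); // always stored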
/**
@@ -748,9 +870,7 @@ CrashStore.prototype = Object.freeze({
* @param date (Date) When the crash occurred.
*/
addMainProcessCrash: function (id, date) {
-    let r = this._ensureCrashRecord(id);
-    r.type = this.TYPE_MAIN_CRASH;
-    r.crashDate = date;
+    this._ensureCrashRecord(id, this.TYPE_MAIN_CRASH, date);
},
/**
@@ -760,9 +880,7 @@ CrashStore.prototype = Object.freeze({
* @param date (Date) When the crash occurred.
*/
addPluginCrash: function (id, date) {
-    let r = this._ensureCrashRecord(id);
-    r.type = this.TYPE_PLUGIN_CRASH;
-    r.crashDate = date;
+    this._ensureCrashRecord(id, this.TYPE_PLUGIN_CRASH, date);
},
/**
@@ -772,9 +890,7 @@ CrashStore.prototype = Object.freeze({
* @param date (Date) When the hang was reported.
*/
addPluginHang: function (id, date) {
-    let r = this._ensureCrashRecord(id);
-    r.type = this.TYPE_PLUGIN_HANG;
-    r.crashDate = date;
+    this._ensureCrashRecord(id, this.TYPE_PLUGIN_HANG, date);
},
get mainProcessCrashes() {
@@ -847,6 +963,10 @@ CrashRecord.prototype = Object.freeze({
return this._o.crashDate;
},
get oldestDate() {
return this._o.crashDate;
},
get type() {
return this._o.type;
},

View file

@@ -155,9 +155,8 @@ crash data grows. As new data is accumulated, we need to read and write
an entire file to make small updates. LZ4 compression helps reduce I/O.
But, there is a potential for unbounded file growth. We establish a
limit for the max age of records. Anything older than that limit is
-pruned. Future patches will also limit the maximum number of records. This
-will establish a hard limit on the size of the file, at least in terms of
-crashes.
-
-Care must be taken when new crash data is recorded, as this will increase
-the size of the file and make I/O a larger concern.
+pruned. We also establish a daily limit on the number of crashes we will
+store. All crashes beyond the first N in a day have no payload and are
+only recorded by the presence of a count. This count ensures we can
+distinguish between ``N`` and ``100 * N``, which are very different
+values!
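
A sketch of what this buys a consumer (reportSeverity is a hypothetical
reader of the store; field names follow the CrashStore code above):

    // Stored records cap at the threshold; countsByDay keeps true totals.
    function reportSeverity(store, day, type) {
      let stored = 0;
      for (let crash of store._data.crashes.values()) {
        if (crash.type == type && dateToDays(crash.crashDate) == day) {
          stored++;
        }
      }
      let counts = store._countsByDay.get(day);
      let actual = counts ? (counts.get(type) || 0) : 0;
      return { stored: stored, actual: actual }; // e.g. 100 vs. 10000
    }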

View file

@@ -258,3 +258,24 @@ add_task(function* test_plugin_hang_event_file() {
count = yield m.aggregateEventsFiles();
Assert.equal(count, 0);
});
// Excessive numbers of files should be processed properly.
add_task(function* test_high_water_mark() {
let m = yield getManager();
let store = yield m._getStore();
for (let i = 0; i < store.HIGH_WATER_DAILY_THRESHOLD + 1; i++) {
yield m.createEventsFile("m" + i, "crash.main.1", DUMMY_DATE, "m" + i);
yield m.createEventsFile("pc" + i, "crash.plugin.1", DUMMY_DATE, "pc" + i);
yield m.createEventsFile("ph" + i, "hang.plugin.1", DUMMY_DATE, "ph" + i);
}
let count = yield m.aggregateEventsFiles();
Assert.equal(count, 3 * bsp.CrashStore.prototype.HIGH_WATER_DAILY_THRESHOLD + 3);
// Need to fetch again in case the first one was garbage collected.
store = yield m._getStore();
// +1 is for preserved main process crash.
Assert.equal(store.crashesCount, 3 * store.HIGH_WATER_DAILY_THRESHOLD + 1);
});

View file

@@ -181,3 +181,51 @@ add_task(function* test_add_mixed_types() {
Assert.equal(s.pluginCrashes.length, 1);
Assert.equal(s.pluginHangs.length, 1);
});
// Crashes added beyond the high water mark behave properly.
add_task(function* test_high_water() {
let s = yield getStore();
let d1 = new Date(2014, 0, 1, 0, 0, 0);
let d2 = new Date(2014, 0, 2, 0, 0, 0);
for (let i = 0; i < s.HIGH_WATER_DAILY_THRESHOLD + 1; i++) {
s.addMainProcessCrash("m1" + i, d1);
s.addMainProcessCrash("m2" + i, d2);
s.addPluginCrash("pc1" + i, d1);
s.addPluginCrash("pc2" + i, d2);
s.addPluginHang("ph1" + i, d1);
s.addPluginHang("ph2" + i, d2);
}
// We preserve main process crashes. Plugin crashes and hangs beyond the
// threshold should be discarded.
Assert.equal(s.crashesCount, 6 * s.HIGH_WATER_DAILY_THRESHOLD + 2);
Assert.equal(s.mainProcessCrashes.length, 2 * s.HIGH_WATER_DAILY_THRESHOLD + 2);
Assert.equal(s.pluginCrashes.length, 2 * s.HIGH_WATER_DAILY_THRESHOLD);
Assert.equal(s.pluginHangs.length, 2 * s.HIGH_WATER_DAILY_THRESHOLD);
// But raw counts should be preserved.
let day1 = bsp.dateToDays(d1);
let day2 = bsp.dateToDays(d2);
Assert.ok(s._countsByDay.has(day1));
Assert.ok(s._countsByDay.has(day2));
Assert.equal(s._countsByDay.get(day1).get(s.TYPE_MAIN_CRASH),
s.HIGH_WATER_DAILY_THRESHOLD + 1);
Assert.equal(s._countsByDay.get(day1).get(s.TYPE_PLUGIN_CRASH),
s.HIGH_WATER_DAILY_THRESHOLD + 1);
Assert.equal(s._countsByDay.get(day1).get(s.TYPE_PLUGIN_HANG),
s.HIGH_WATER_DAILY_THRESHOLD + 1);
yield s.save();
yield s.load();
Assert.ok(s._countsByDay.has(day1));
Assert.ok(s._countsByDay.has(day2));
Assert.equal(s._countsByDay.get(day1).get(s.TYPE_MAIN_CRASH),
s.HIGH_WATER_DAILY_THRESHOLD + 1);
Assert.equal(s._countsByDay.get(day1).get(s.TYPE_PLUGIN_CRASH),
s.HIGH_WATER_DAILY_THRESHOLD + 1);
Assert.equal(s._countsByDay.get(day1).get(s.TYPE_PLUGIN_HANG),
s.HIGH_WATER_DAILY_THRESHOLD + 1);
});