Merge pull request #148 from mozilla/issues/117/1

Fixes #117 - Use the Trexa list instead of the Tranco list
This commit is contained in:
Mike Taylor 2020-05-28 16:17:59 -05:00 коммит произвёл GitHub
Родитель bbd25a71ce 27bc26a138
Коммит 33f145cb0e
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
6 изменённых файлов: 69 добавлений и 139 удалений

Просмотреть файл

@ -16,7 +16,7 @@ You can omit any keys where the defaults would suffice. Here is what a commented
would look like:
```
{
// The size of the Tranco list to download, up to 1 million sites.
// The size of the Trexa list to download, up to ~150k sites.
"listSize": 500,
// The directory that will be used to store the downloaded list.
"listDir": "data/",
@ -78,6 +78,12 @@ A `--resume` option also exists, to continue collecting weekly results until the
npm start 2019-05-23 -- --resume
```
A `--exact` option exists to allow testing of single dates that don't fall at the end of the week.
```
npm start 2020-05-28 -- --exact
```
Code of Conduct
===============

Просмотреть файл

@ -187,9 +187,10 @@ function getEOW(date) {
/**
* Return the list of query dates for a given inputDate
* @param {Date} inputDate the date to start with.
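* @param {Object} [options] optional flags; when `options.exact` is truthy, a
*   single input date is used as-is instead of being moved to the end of its week.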
* @returns an Array with all dates to gather bugs for
*/
function getQueryDates(inputDate) {
function getQueryDates(inputDate, options) {
const queryDates = [];
if (inputDate) {
// We want to consider open bugs only until the end of the given week.
@ -205,7 +206,7 @@ function getQueryDates(inputDate) {
queryDates.push(getEOW(parsed));
parsed.setDate(parsed.getDate() + 7);
if (getEOW(parsed) > today) {
// Stop if we get into future dates (the Tranco list won't
// Stop if we get into future dates (the Trexa list won't
// have anything useful for us).
break;
}
@ -222,6 +223,9 @@ function getQueryDates(inputDate) {
break;
}
}
// TODO: simplify this check to `options?.exact` once optional chaining is
// available in the Node.js version this project supports.
} else if (options && options.exact) {
queryDates.push(parsed);
} else {
// A single date is specified.
queryDates.push(getEOW(parsed));

Просмотреть файл

@ -3,7 +3,7 @@ const bugs = require("./bugs");
const fs = require("fs");
const helpers = require("./helpers");
const spreadsheet = require("./spreadsheet");
const tranco = require("./tranco");
const trexa = require("./trexa");
const argv = process.argv.slice(2);
@ -38,12 +38,14 @@ const main = async () => {
const inputDate = argv[0] || maxDate;
if (argv.includes("--resume")) {
queryDates = helpers.resumeQueryDates(inputDate);
} else if (argv.includes("--exact")) {
queryDates = helpers.getQueryDates(inputDate, { exact: true });
} else {
queryDates = helpers.getQueryDates(inputDate);
}
for (const date of queryDates) {
const LIST_FILE = await tranco.fetchList(LIST_SIZE, LIST_DIR, date);
const LIST_FILE = await trexa.fetchList(LIST_SIZE, LIST_DIR, date);
const bugTable = await bugs.fetchBugs(
LIST_FILE,
bugzillaKey,

113
package-lock.json сгенерированный
Просмотреть файл

@ -229,9 +229,9 @@
},
"dependencies": {
"minimist": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.0.tgz",
"integrity": "sha1-o1AIsg9BOD7sH7kU9M1d95omQoQ=",
"version": "1.2.5",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.5.tgz",
"integrity": "sha512-FM9nNUYrRBAELZQT3xeZQ7fmMOBg6nWNmJKTcgsJeaLstP/UODVpGsr5OhXhhXg6f+qtJ8uiZ+PUxkDWcgIXLw==",
"dev": true
}
}
@ -3735,18 +3735,6 @@
"mime": "^2.2.0"
}
},
"handlebars": {
"version": "4.5.3",
"resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.5.3.tgz",
"integrity": "sha512-3yPecJoJHK/4c6aZhSvxOyG4vJKDshV36VHp0iVCDVh7o9w2vwi3NSnL2MMPj3YdduqaBcu7cGbggJQM0br9xA==",
"dev": true,
"requires": {
"neo-async": "^2.6.0",
"optimist": "^0.6.1",
"source-map": "^0.6.1",
"uglify-js": "^3.1.4"
}
},
"har-schema": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/har-schema/-/har-schema-2.0.0.tgz",
@ -3875,6 +3863,12 @@
"whatwg-encoding": "^1.0.1"
}
},
"html-escaper": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",
"integrity": "sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==",
"dev": true
},
"http-signature": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.2.0.tgz",
@ -4605,12 +4599,12 @@
}
},
"istanbul-reports": {
"version": "2.2.6",
"resolved": "https://registry.npmjs.org/istanbul-reports/-/istanbul-reports-2.2.6.tgz",
"integrity": "sha512-SKi4rnMyLBKe0Jy2uUdx28h8oG7ph2PPuQPvIAh31d+Ci+lSiEu4C+h3oBPuJ9+mPKhOyW0M8gY4U5NM1WLeXA==",
"version": "2.2.7",
"resolved": "https://registry.npmjs.org/istanbul-reports/-/istanbul-reports-2.2.7.tgz",
"integrity": "sha512-uu1F/L1o5Y6LzPVSVZXNOoD/KXpJue9aeLRd0sM9uMXfZvzomB0WxVamWb5ue8kA2vVWEmW7EG+A5n3f1kqHKg==",
"dev": true,
"requires": {
"handlebars": "^4.1.2"
"html-escaper": "^2.0.0"
}
},
"jest": {
@ -5683,14 +5677,6 @@
"dev": true,
"requires": {
"minimist": "^1.2.0"
},
"dependencies": {
"minimist": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.0.tgz",
"integrity": "sha1-o1AIsg9BOD7sH7kU9M1d95omQoQ=",
"dev": true
}
}
},
"jsonify": {
@ -5731,9 +5717,9 @@
}
},
"kind-of": {
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz",
"integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==",
"version": "6.0.3",
"resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.3.tgz",
"integrity": "sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw==",
"dev": true
},
"kleur": {
@ -6293,9 +6279,9 @@
}
},
"minimist": {
"version": "0.0.8",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz",
"integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=",
"version": "1.2.5",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.5.tgz",
"integrity": "sha512-FM9nNUYrRBAELZQT3xeZQ7fmMOBg6nWNmJKTcgsJeaLstP/UODVpGsr5OhXhhXg6f+qtJ8uiZ+PUxkDWcgIXLw==",
"dev": true
},
"mixin-deep": {
@ -6335,12 +6321,20 @@
}
},
"mkdirp": {
"version": "0.5.1",
"resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz",
"integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=",
"version": "0.5.5",
"resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.5.tgz",
"integrity": "sha512-NKmAlESf6jMGym1++R0Ra7wvhV+wFW63FaSOFPwRahvea0gMUcGUhVeAg/0BC0wiv9ih5NYPB1Wn1UEI1/L+xQ==",
"dev": true,
"requires": {
"minimist": "0.0.8"
"minimist": "^1.2.5"
},
"dependencies": {
"minimist": {
"version": "1.2.5",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.5.tgz",
"integrity": "sha512-FM9nNUYrRBAELZQT3xeZQ7fmMOBg6nWNmJKTcgsJeaLstP/UODVpGsr5OhXhhXg6f+qtJ8uiZ+PUxkDWcgIXLw==",
"dev": true
}
}
},
"ms": {
@ -6386,12 +6380,6 @@
"integrity": "sha1-Sr6/7tdUHywnrPspvbvRXI1bpPc=",
"dev": true
},
"neo-async": {
"version": "2.6.1",
"resolved": "https://registry.npmjs.org/neo-async/-/neo-async-2.6.1.tgz",
"integrity": "sha512-iyam8fBuCUpWeKPGpaNMetEocMt364qkCsfL9JuhjXX6dRnguRVOfk2GZaDpPjcOKiiXCPINZC1GczQ7iTq3Zw==",
"dev": true
},
"nice-try": {
"version": "1.0.5",
"resolved": "https://registry.npmjs.org/nice-try/-/nice-try-1.0.5.tgz",
@ -6658,16 +6646,6 @@
"integrity": "sha512-pVOEP16TrAO2/fjej1IdOyupJY8KDUM1CvsaScRbw6oddvpQoOfGk4ywha0HKKVAD6RkW4x6Q+tNBwhf3Bgpuw==",
"dev": true
},
"optimist": {
"version": "0.6.1",
"resolved": "https://registry.npmjs.org/optimist/-/optimist-0.6.1.tgz",
"integrity": "sha1-2j6nRob6IaGaERwybpDrFaAZZoY=",
"dev": true,
"requires": {
"minimist": "~0.0.1",
"wordwrap": "~0.0.2"
}
},
"optionator": {
"version": "0.8.3",
"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.8.3.tgz",
@ -7401,9 +7379,9 @@
}
},
"minimist": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.0.tgz",
"integrity": "sha1-o1AIsg9BOD7sH7kU9M1d95omQoQ=",
"version": "1.2.5",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.5.tgz",
"integrity": "sha512-FM9nNUYrRBAELZQT3xeZQ7fmMOBg6nWNmJKTcgsJeaLstP/UODVpGsr5OhXhhXg6f+qtJ8uiZ+PUxkDWcgIXLw==",
"dev": true
},
"to-regex-range": {
@ -8196,17 +8174,6 @@
"integrity": "sha512-q+MB8nYR1KDLrgr4G5yemftpMC7/QLqVndBmEEdqzmNj5dcFOO4Oo8qlwZE3ULT3+Zim1F8Kq4cBnikNhlCMlg==",
"dev": true
},
"uglify-js": {
"version": "3.7.0",
"resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.7.0.tgz",
"integrity": "sha512-PC/ee458NEMITe1OufAjal65i6lB58R1HWMRcxwvdz1UopW0DYqlRL3xdu3IcTvTXsB02CRHykidkTRL+A3hQA==",
"dev": true,
"optional": true,
"requires": {
"commander": "~2.20.3",
"source-map": "~0.6.1"
}
},
"union-value": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/union-value/-/union-value-1.0.1.tgz",
@ -8418,12 +8385,6 @@
"integrity": "sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==",
"dev": true
},
"wordwrap": {
"version": "0.0.3",
"resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.3.tgz",
"integrity": "sha1-o9XabNXAvAAI03I0u68b7WMFkQc=",
"dev": true
},
"wrap-ansi": {
"version": "5.1.0",
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-5.1.0.tgz",
@ -8591,9 +8552,9 @@
}
},
"yargs-parser": {
"version": "13.1.1",
"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-13.1.1.tgz",
"integrity": "sha512-oVAVsHz6uFrg3XQheFII8ESO2ssAf9luWuAd6Wexsu4F3OtIW0o8IribPXYrD4WC24LWtPrJlGy87y5udK+dxQ==",
"version": "13.1.2",
"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-13.1.2.tgz",
"integrity": "sha512-3lbsNRf/j+A4QuSZfDRA7HRSfWrzO0YjqTJd5kjAq37Zep1CEgaYmrH9Q3GwPiB9cHyd1Y1UwggGhJGoxipbzg==",
"requires": {
"camelcase": "^5.0.0",
"decamelize": "^1.2.0"

Просмотреть файл

@ -1,5 +1,5 @@
const fs = require("fs");
const tranco = require("../tranco.js");
const trexa = require("../trexa.js");
const args = {
listFile: "./tests/fixtures/copy.csv",
@ -21,7 +21,7 @@ afterAll(async () => {
});
test("ignoredDomains get removed", async () => {
await tranco.removeIgnoredDomains(args).then(async returnedArgs => {
await trexa.removeIgnoredDomains(args).then(async returnedArgs => {
const data = await fs.promises.readFile(returnedArgs.listFile, "utf8");
const lines = data.split(/^/m);
@ -32,7 +32,7 @@ test("ignoredDomains get removed", async () => {
describe("clampListSize tests", () => {
test("clampListSize current size > config.listSize", async () => {
await tranco.clampListSize(args).then(async csvPath => {
await trexa.clampListSize(args).then(async csvPath => {
const data = await fs.promises.readFile(csvPath, "utf8");
const lines = data.split(/\r?\n/);
@ -43,7 +43,7 @@ describe("clampListSize tests", () => {
test("clampListSize current size < config.listSize", async () => {
args.config.listSize = 15;
await tranco.clampListSize(args).then(async csvPath => {
await trexa.clampListSize(args).then(async csvPath => {
const data = await fs.promises.readFile(csvPath, "utf8");
const lines = data.split(/\r?\n/);

Просмотреть файл

@ -80,44 +80,6 @@ const removeIgnoredDomains = function(args) {
});
};
/**
 * Returns the list ID for the specified date or, if the service answers with
 * 503 for that date, retries with neighbouring dates that converge towards
 * the present until a list ID is found.
 *
 * NOTE(review): the previous documentation stated that omitting `date` returns
 * the latest available list; that depends on how `parseDate` (defined
 * elsewhere) treats `undefined` — confirm before relying on it.
 *
 * @param {Date} date the date of the requested list ID
 * @param {Set<string>} [triedDates] dates (as formatted by `parseDate`) that
 *   were already requested during this lookup. Callers normally omit it; it
 *   is only passed along on internal retries to detect retry cycles.
 * @returns {Promise<{listID: string, listDate: Date}>} the list ID and the
 *   date it was actually found for
 * @throws {Error} if the service returns an unexpected response, or if the
 *   retry logic would request a date that has already been tried (which
 *   previously resulted in an endless retry loop while the service kept
 *   answering 503).
 */
const fetchListID = async (date, triedDates = new Set()) => {
  const requestedDay = parseDate(date);
  const ID_URL = `https://tranco-list.eu/daily_list_id?date=${requestedDay}`;

  // Around "today" the retry steps below can bounce back and forth
  // (yesterday -> today -> two days ago -> yesterday -> ...). Bail out as soon
  // as a date comes up a second time instead of hammering the service forever.
  if (triedDates.has(requestedDay)) {
    throw new Error(
      `Request for ${ID_URL} was already tried; no list ID is available!`
    );
  }
  triedDates.add(requestedDay);

  const res = await fetch(ID_URL);
  if (
    res.ok &&
    res.headers.get("content-type") === "text/plain; charset=utf-8"
  ) {
    return { listID: await res.text(), listDate: date };
  }
  if (res.status !== 503) {
    throw new Error(`Request for ${ID_URL} returned status ${res.status}!`);
  }

  const newDate = new Date(date);
  const now = new Date();
  // Future dates are unlikely to be available yet, but also ones
  // from long ago may have never been available. Try to converge
  // towards the present.
  if (date > now) {
    newDate.setDate(newDate.getDate() - 1);
  } else if (parseDate(newDate) === parseDate(now)) {
    // If we end up at "today", we need to request the list from
    // the day before -- the daily list is actually a day old.
    newDate.setDate(newDate.getDate() - 2);
  } else {
    newDate.setDate(newDate.getDate() + 1);
  }
  console.warn(`Retrying with date ${newDate}`);
  return fetchListID(newDate, triedDates);
};
const fetchList = async (
size = 500,
directory = "data/",
@ -138,9 +100,7 @@ const fetchList = async (
date = new Date();
}
// Fetch the list ID for the requested date.
const { listID, listDate } = await fetchListID(date);
const file = `${directory}list-${parseDate(listDate)}.csv`;
const file = `${directory}list-${parseDate(date)}.csv`;
// Check for an already downloaded list.
const listIsCached = await fs.promises
@ -148,28 +108,25 @@ const fetchList = async (
.then(() => true)
.catch(() => false);
if (listIsCached) {
console.log("Found cached Tranco list");
console.log("Found cached Trexa list");
return file;
}
// Fetch the list.
const LIST_URL = `https://tranco-list.eu/download/${listID}/${listSize}`;
return fetch(LIST_URL).then(res => {
if (
!res.ok ||
res.headers.get("content-type") !== "text/csv; charset=utf-8"
) {
throw new Error(`List ${listID} not found!`);
const LIST_URL = `https://trexa.webcompat.com/api/lists/${parseDate(
date
)}?count=${listSize}`;
return fetch(LIST_URL, {
headers: { "User-Agent": "mozilla-tsci/1.0" },
}).then(res => {
if (!res.ok || !res.headers.get("content-type").includes("text/csv")) {
throw new Error(`List trexa-${parseDate(date)}.csv not found!`);
}
return new Promise((resolve, reject) => {
const dest = fs.createWriteStream(file);
res.body.pipe(dest);
dest.on("finish", () => {
console.log(
`Downloaded Tranco list with ID ${listID} for date ${parseDate(
listDate
)}`
);
console.log(`Downloaded Trexa list for date ${parseDate(date)}`);
removeIgnoredDomains({ listFile: file, config })
.then(clampListSize)
.then(