2019-11-23 04:49:38 +03:00
|
|
|
const config = require("./config.json");
|
|
|
|
const escapeStringRegexp = require("escape-string-regexp");
|
|
|
|
const fs = require("fs");
|
|
|
|
const fetch = require("node-fetch");
|
|
|
|
const replace = require("replace-in-file");
|
2019-06-13 04:13:15 +03:00
|
|
|
|
|
|
|
// Domains to strip from the downloaded list, taken from config.ignoredDomains.
// Falls back to an empty array when that setting is missing or falsy.
const IGNORED_DOMAINS = config.ignoredDomains || [];
|
2019-06-14 19:20:07 +03:00
|
|
|
// Regular expressions derived from IGNORED_DOMAINS. Starts empty and is filled
// in by removeIgnoredDomains the first time it has domains to remove.
let DOMAINS_REGEXP_CACHE = [];
|
2019-06-13 04:13:15 +03:00
|
|
|
|
2019-11-01 23:27:44 +03:00
|
|
|
/**
 * Clamp the CSV to config.listSize. We do this because we end up fetching
 * more than config.listSize by config.ignoredDomains.length. If all those
 * domains are removed, this will become a no-op.
 *
 * The file is split on LF or CRLF line endings; when it has to be shortened
 * it is rewritten in place with the kept lines joined by CRLF.
 *
 * @param {String} listFile path to the CSV file to clamp
 * @returns {Promise<String>} resolves with listFile once the file holds at
 *   most config.listSize lines; rejects with the underlying fs error if the
 *   file cannot be read or written
 */
const clampListSize = async listFile => {
  // Let read/write failures propagate as rejections. Nothing below may run
  // after a failed read, otherwise we would be splitting `undefined`.
  const data = await fs.promises.readFile(listFile, "utf8");
  const lines = data.split(/\r?\n/);
  const desiredLength = config.listSize;

  // It's unclear why the list would be smaller, but if it is
  // just return it.
  if (lines.length <= desiredLength) {
    return listFile;
  }

  const clampedFile = lines.slice(0, desiredLength).join("\r\n");
  await fs.promises.writeFile(listFile, clampedFile);
  return listFile;
};
|
|
|
|
|
2019-06-13 04:13:15 +03:00
|
|
|
/**
 * Remove the domains specified in config.ignoredDomains from the list, editing
 * the CSV file in place.
 *
 * A warning is logged when ignored domains are configured but the file did not
 * change.
 *
 * @param {String} listFile path to the CSV file
 * @returns {Promise<String>} resolves with the same path to the CSV file;
 *   rejects if the in-place replacement fails
 */
const removeIgnoredDomains = async function(listFile) {
  // Nothing to strip: leave the file untouched.
  if (!IGNORED_DOMAINS.length) {
    return listFile;
  }

  if (!DOMAINS_REGEXP_CACHE.length) {
    // Create an escaped regexp out of each domain we want to ignore.
    // The CSV format will look like one of the following:
    //   1,example.com\r\n
    //   1,example.com\n
    // The pattern is anchored to the start of a row and accepts a rank of any
    // length; otherwise a rank such as 1000 would only be partially consumed
    // and its leading digit would be glued onto the following row. A final
    // row without a line terminator is matched as well.
    // IGNORED_DOMAINS itself is left as plain strings.
    DOMAINS_REGEXP_CACHE = IGNORED_DOMAINS.map(
      domain =>
        new RegExp(`^\\d+,${escapeStringRegexp(domain)}(?:\\r?\\n|$)`, "m")
    );
  }

  console.log(`Skipping domains per config.ignoredDomains`);
  const results = await replace({
    countMatches: true,
    files: listFile,
    from: DOMAINS_REGEXP_CACHE,
    to: "",
  });

  // `some` also covers an empty result set, which is reported as "not modified".
  if (!results.some(result => result.hasChanged)) {
    console.warn(
      "Warning: config.ignoredDomains set, but the list was not modified."
    );
  }
  return listFile;
};
|
2019-05-18 02:25:45 +03:00
|
|
|
|
2019-06-25 02:51:30 +03:00
|
|
|
/**
 * Returns the list ID for the specified date or, if that cannot be found, the
 * most recent one available. If a date is not specified, the current date is
 * used as the starting point.
 *
 * A 503 response is treated as "no list for that day" and triggers a retry
 * with a neighbouring day. Every day that has been requested is remembered so
 * that the search stops with an error instead of cycling forever.
 *
 * @param {Date} date the date of the requested list ID (defaults to now)
 * @param {Set<String>} attemptedDates yyyy-mm-dd days already requested; used
 *   internally by the retry logic, callers should not pass it
 * @returns {Promise<{listID: String, listDate: Date}>} the list ID and the
 *   date it was found for
 * @throws {Error} if the server answers with anything other than a plain-text
 *   success or a 503, or if every candidate day returned 503
 */
const fetchListID = async (date = new Date(), attemptedDates = new Set()) => {
  const requestedDay = parseDate(date);
  const ID_URL = `https://tranco-list.eu/daily_list_id?date=${requestedDay}`;
  attemptedDates.add(requestedDay);

  const res = await fetch(ID_URL);
  if (
    res.ok &&
    res.headers.get("content-type") === "text/plain; charset=utf-8"
  ) {
    return { listID: await res.text(), listDate: date };
  }
  if (res.status !== 503) {
    throw new Error(`Request for ${ID_URL} returned status ${res.status}!`);
  }

  const newDate = new Date(date);
  const now = new Date();
  // Future dates are unlikely to be available yet, but also ones
  // from long ago may have never been available. Try to converge
  // towards the present.
  if (date > now) {
    newDate.setDate(newDate.getDate() - 1);
    // If we end up at "today", we need to request the list from
    // the day before -- the daily list is actually a day old.
  } else if (parseDate(newDate) === parseDate(now)) {
    newDate.setDate(newDate.getDate() - 2);
  } else {
    newDate.setDate(newDate.getDate() + 1);
  }

  // Stepping back from "today" and forward again would otherwise loop
  // today -> today-2 -> today-1 -> today for as long as the server says 503.
  if (attemptedDates.has(parseDate(newDate))) {
    throw new Error(
      `No list ID available: every date tried up to ${requestedDay} returned status 503!`
    );
  }

  console.warn(`Retrying with date ${newDate}`);
  return fetchListID(newDate, attemptedDates);
};
|
|
|
|
|
2019-05-25 02:26:48 +03:00
|
|
|
/**
 * Download the Tranco list for the given date into `directory`, strip the
 * ignored domains from it and clamp it to the configured size. If a readable
 * and writable file for the resolved list date already exists, it is returned
 * as-is without downloading.
 *
 * @param {Number} size number of entries wanted; the request asks for this
 *   many plus one per ignored domain
 * @param {String} directory directory the CSV is stored in (created if
 *   missing); a trailing separator is optional
 * @param {Date} date date of the requested list, defaults to now
 * @returns {Promise<String>} path to the CSV file
 * @throws {Error} if the list cannot be found, or if downloading or
 *   post-processing it fails
 */
const fetchList = async (size = 500, directory = "data/", date) => {
  const listSize = size + IGNORED_DOMAINS.length;

  // Create the data directory.
  await fs.promises.mkdir(directory, { recursive: true });

  // Fetch the list ID for the requested date.
  const { listID, listDate } = await fetchListID(date || new Date());

  // Accept directories given without a trailing separator ("data" as well as
  // "data/"); otherwise the file would land next to the directory.
  const needsSeparator = directory !== "" && !/[\\/]$/.test(directory);
  const baseDirectory = needsSeparator ? `${directory}/` : directory;
  const file = `${baseDirectory}list-${parseDate(listDate)}.csv`;

  // Check for an already downloaded list.
  const listIsCached = await fs.promises
    .access(file, fs.constants.R_OK | fs.constants.W_OK)
    .then(() => true)
    .catch(() => false);
  if (listIsCached) {
    console.log("Found cached Tranco list");
    return file;
  }

  // Fetch the list.
  const LIST_URL = `https://tranco-list.eu/download/${listID}/${listSize}`;
  const res = await fetch(LIST_URL);
  if (
    !res.ok ||
    res.headers.get("content-type") !== "text/csv; charset=utf-8"
  ) {
    throw new Error(`List ${listID} not found!`);
  }

  try {
    await new Promise((resolve, reject) => {
      const dest = fs.createWriteStream(file);
      let failure = null;
      dest.on("error", err => {
        failure = failure || err;
      });
      // "close" fires after a completed write as well as after a destroy, so
      // the promise settles only once the file handle has been released.
      dest.on("close", () => (failure ? reject(failure) : resolve()));
      // pipe() does not forward source errors; tear the destination down so
      // that a broken download surfaces instead of hanging forever.
      res.body.on("error", err => dest.destroy(err));
      res.body.pipe(dest);
    });
  } catch (err) {
    // Best effort: a partial file must not be mistaken for a cached list on
    // the next run. The download error is what the caller needs to see, so a
    // failure to clean up is deliberately ignored.
    await fs.promises.unlink(file).catch(() => {});
    throw err;
  }

  console.log(
    `Downloaded Tranco list with ID ${listID} for date ${parseDate(listDate)}`
  );
  const cleanedFile = await removeIgnoredDomains(file);
  return clampListSize(cleanedFile);
};
|
2019-05-25 02:26:48 +03:00
|
|
|
|
|
|
|
/**
 * Return the specified date formatted as yyyy-mm-dd in UTC.
 *
 * Besides Date objects, anything the Date constructor understands (a
 * millisecond timestamp or a date string) is accepted.
 *
 * @param {Date|Number|String} date the date to format
 * @returns {String} a representation of the date as yyyy-mm-dd
 * @throws {TypeError} if date is null or undefined
 * @throws {RangeError} if date does not describe a valid point in time
 */
function parseDate(date) {
  // `new Date(null)` would silently turn into 1970-01-01, so refuse missing
  // input explicitly rather than formatting a bogus day.
  if (date === null || date === undefined) {
    throw new TypeError("parseDate requires a Date, timestamp or date string");
  }
  // toISOString() always reports UTC; everything before the "T" is the day.
  const [day] = new Date(date).toISOString().split("T");
  return day;
}
|
|
|
|
|
|
|
|
// Public API of this module: only the list downloader is exposed.
module.exports = { fetchList: fetchList };
|