Issue #69 - clamp the list to config.listSize after we remove sites

This commit is contained in:
Mike Taylor 2019-11-01 15:27:44 -05:00
Родитель 76abf81212
Коммит 5e6989e1f7
1 изменённых файлов: 29 добавлений и 1 удалений

Просмотреть файл

@ -7,6 +7,32 @@ const replace = require('replace-in-file');
const IGNORED_DOMAINS = config.ignoredDomains || [];
let DOMAINS_REGEXP_CACHE = [];
/**
* Clamp the CSV to config.listSize. We do this because we end up fetching
* more than config.listSize by config.ignoredDomains.length. If all those
* domains are removed, this will become a no-op.
*/
const clampListSize = (listFile) => {
return new Promise(async (resolve, reject) => {
const data = await fs.promises.readFile(listFile, 'utf8')
.catch(err => reject(err));
const lines = data.split(/\r?\n/);
const desiredLength = config.listSize;
const currentLength = lines.length;
// It's unclear why the list would be smaller, but if it is
// just return it.
if (currentLength <= desiredLength) {
resolve(listFile);
} else {
lines.splice(desiredLength, currentLength - desiredLength);
const clampedFile = lines.join("\r\n");
await fs.promises.writeFile(listFile, clampedFile)
.catch(err => reject(err));
resolve(listFile);
}
});
};
/**
* Return the list without the domains specific in config.ignoredDomains
* @param {String} listFile
@ -120,7 +146,9 @@ const fetchList = async (size = 500, directory = "data/", date) => {
res.body.pipe(dest);
dest.on('finish', () => {
console.log(`Downloaded Tranco list with ID ${LIST_ID} for date ${parseDate(date)}`);
removeIgnoredDomains(file).then((newFile) => resolve(newFile), error => reject(error));
removeIgnoredDomains(file)
.then(clampListSize)
.then((newFile) => resolve(newFile), error => reject(error));
});
});
});