зеркало из https://github.com/github/lightcrawler.git
Add initial CLI and queue
This commit is contained in:
Родитель
a07f24ea09
Коммит
fce93aaab5
|
@ -0,0 +1,11 @@
|
|||
#!/usr/bin/env node

// Command-line entry point for lightcrawler: parse the flags and hand
// the resulting options object straight to the crawler module.
const yargs = require('yargs')
const lightcrawler = require('.')

// -u/--url is the page to start crawling from; -h/--help prints usage.
const options = yargs
  .alias('u', 'url').describe('url', 'URL to crawl')
  .alias('h', 'help').help('h')
  .argv

lightcrawler(options)
|
|
@ -1,6 +1,8 @@
|
|||
{
  "extends": "lighthouse:default",
  "settings": {
    "onlyAudits": [
      "external-anchors-use-rel-noopener"
    ]
  }
}
|
72
index.js
72
index.js
|
@ -1,60 +1,76 @@
|
|||
const cheerio = require('cheerio')
|
||||
const Crawler = require('simplecrawler')
|
||||
const ChildProcess = require('child_process')
|
||||
const Crawler = require('simplecrawler')
|
||||
const path = require('path')
|
||||
const queue = require('async/queue')
|
||||
|
||||
const crawler = new Crawler('https://electron.atom.io')
|
||||
crawler.respectRobotsTxt = false
|
||||
crawler.parseHTMLComments = false
|
||||
crawler.parseScriptTags = false
|
||||
crawler.maxDepth = 1
|
||||
module.exports = (options) => {
|
||||
const crawler = new Crawler(options.url)
|
||||
crawler.respectRobotsTxt = false
|
||||
crawler.parseHTMLComments = false
|
||||
crawler.parseScriptTags = false
|
||||
crawler.maxDepth = 1
|
||||
|
||||
crawler.discoverResources = (buffer, item) => {
|
||||
const page = cheerio.load(buffer.toString('utf8'))
|
||||
const links = page('a[href]').map(function () {
|
||||
return page(this).attr('href')
|
||||
}).get()
|
||||
crawler.discoverResources = (buffer, item) => {
|
||||
const page = cheerio.load(buffer.toString('utf8'))
|
||||
const links = page('a[href]').map(function () {
|
||||
return page(this).attr('href')
|
||||
}).get()
|
||||
|
||||
return links
|
||||
return links
|
||||
}
|
||||
|
||||
const lighthouseQueue = queue(runLighthouse, 5);
|
||||
|
||||
crawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
|
||||
lighthouseQueue.push(queueItem.url)
|
||||
})
|
||||
|
||||
crawler.start()
|
||||
}
|
||||
|
||||
crawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
|
||||
console.log('running lighthouse on ', queueItem.url)
|
||||
runLighthouse(queueItem.url)
|
||||
})
|
||||
|
||||
function runLighthouse (url) {
|
||||
const lighthouse = ChildProcess.spawn(path.join(__dirname, 'node_modules', '.bin', 'lighthouse'), [
|
||||
function runLighthouse (url, callback) {
|
||||
const args = [
|
||||
url,
|
||||
'--output=json',
|
||||
'--output-path=stdout',
|
||||
'--disable-device-emulation',
|
||||
'--disable-cpu-throttling',
|
||||
'--disable-network-throttling',
|
||||
'--chrome-flags="--headless --disable-gpu"',
|
||||
'--chrome-flags=--headless --disable-gpu',
|
||||
`--config-path=${path.join(__dirname, 'config.json')}`
|
||||
])
|
||||
]
|
||||
const lighthouse = ChildProcess.spawn(path.join(__dirname, 'node_modules', '.bin', 'lighthouse'), args)
|
||||
|
||||
let output = ''
|
||||
lighthouse.stdout.on('data', (data) => {
|
||||
output += data
|
||||
})
|
||||
lighthouse.once('close', () => {
|
||||
callback()
|
||||
|
||||
const report = JSON.parse(output)
|
||||
|
||||
report.reportCategories.forEach((category) => {
|
||||
category.audits.forEach((audit) => {
|
||||
if (audit.score !== 100) {
|
||||
console.log(`${url} failed ${audit.id}`)
|
||||
audit.result.extendedInfo.value.nodes.forEach((result) => {
|
||||
console.log(result.failureSummary)
|
||||
console.log(result.path)
|
||||
console.log(result.html)
|
||||
})
|
||||
|
||||
const {value} = audit.result.extendedInfo
|
||||
if (Array.isArray(value)) {
|
||||
value.forEach((result) => {
|
||||
console.log(` ${result.url}`)
|
||||
})
|
||||
} else if (Array.isArray(value.nodes)) {
|
||||
value.nodes.forEach((result) => {
|
||||
let message = result.failureSummary
|
||||
message = message.replace(/^Fix any of the following:/g, '').trim()
|
||||
console.log(` ${message}`)
|
||||
console.log(` ${result.html}`)
|
||||
})
|
||||
}
|
||||
}
|
||||
})
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
crawler.start()
|
||||
|
|
|
@ -6,12 +6,17 @@
|
|||
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "bin": {
    "lightcrawler": "./cli.js"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "async": "^2.4.1",
    "cheerio": "^1.0.0-rc.1",
    "lighthouse": "^2.1.0",
    "simplecrawler": "^1.1.3",
    "yargs": "^8.0.2"
  }
}
|
Загрузка…
Ссылка в новой задаче