new_audit(robots-txt): /robots.txt validation (#4845)

Konrad Dzwinel 2018-03-27 23:15:26 +02:00 committed by Brendan Kenny
Parent f7efaa5468
Commit 42d47ba36d
5 changed files: 474 additions and 0 deletions

View file

@@ -66,6 +66,10 @@ module.exports = [
'canonical': {
score: 1,
},
'robots-txt': {
rawValue: true,
notApplicable: true,
},
},
},
{

View file

@@ -0,0 +1,225 @@
/**
* @license Copyright 2018 Google Inc. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/
'use strict';
/**
* @fileoverview Validates a robots.txt file according to the official standard and the various
* extensions respected by popular web crawlers.
* Validator rules, and the resources backing these rules, can be found here:
* https://github.com/GoogleChrome/lighthouse/issues/4356#issuecomment-375489925
*/
const Audit = require('../audit');
const URL = require('../../lib/url-shim');
const HTTP_CLIENT_ERROR_CODE_LOW = 400;
const HTTP_SERVER_ERROR_CODE_LOW = 500;
const DIRECTIVE_SITEMAP = 'sitemap';
const DIRECTIVE_USER_AGENT = 'user-agent';
const DIRECTIVE_ALLOW = 'allow';
const DIRECTIVE_DISALLOW = 'disallow';
const DIRECTIVES_GROUP_MEMBERS = new Set([DIRECTIVE_ALLOW, DIRECTIVE_DISALLOW]);
const DIRECTIVE_SAFELIST = new Set([
DIRECTIVE_USER_AGENT, DIRECTIVE_DISALLOW, // standard
DIRECTIVE_ALLOW, DIRECTIVE_SITEMAP, // universally supported
'crawl-delay', // yahoo, bing, yandex
'clean-param', 'host', // yandex
'request-rate', 'visit-time', 'noindex', // not officially supported, but used in the wild
]);
const SITEMAP_VALID_PROTOCOLS = new Set(['https:', 'http:', 'ftp:']);
/**
* @param {string} directiveName
* @param {string} directiveValue
* @throws will throw an exception if the given directive is invalid
*/
function verifyDirective(directiveName, directiveValue) {
if (!DIRECTIVE_SAFELIST.has(directiveName)) {
throw new Error('Unknown directive');
}
if (directiveName === DIRECTIVE_SITEMAP) {
let sitemapUrl;
try {
sitemapUrl = new URL(directiveValue);
} catch (e) {
throw new Error('Invalid sitemap URL');
}
if (!SITEMAP_VALID_PROTOCOLS.has(sitemapUrl.protocol)) {
throw new Error('Invalid sitemap URL protocol');
}
}
if (directiveName === DIRECTIVE_USER_AGENT && !directiveValue) {
throw new Error('No user-agent specified');
}
if (directiveName === DIRECTIVE_ALLOW || directiveName === DIRECTIVE_DISALLOW) {
if (directiveValue !== '' && directiveValue[0] !== '/' && directiveValue[0] !== '*') {
throw new Error('Pattern should either be empty, start with "/" or "*"');
}
const dollarIndex = directiveValue.indexOf('$');
if (dollarIndex !== -1 && dollarIndex !== directiveValue.length - 1) {
throw new Error('"$" should only be used at the end of the pattern');
}
}
}
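// Illustrative examples, derived from the checks above:
//   verifyDirective('allow', '/public')          -> passes
//   verifyDirective('allow', 'no/leading/slash') -> throws 'Pattern should either be empty, start with "/" or "*"'
//   verifyDirective('sitemap', '/relative.xml')  -> throws 'Invalid sitemap URL' (the URL must be absolute)
//   verifyDirective('foo', 'bar')                -> throws 'Unknown directive'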
/**
* @param {string} line single line from a robots.txt file
* @throws will throw an exception if the given line has errors
* @returns {{directive: string, value: string}|null}
*/
function parseLine(line) {
const hashIndex = line.indexOf('#');
if (hashIndex !== -1) {
line = line.substr(0, hashIndex);
}
line = line.trim();
if (line.length === 0) {
return null;
}
const colonIndex = line.indexOf(':');
if (colonIndex === -1) {
throw new Error('Syntax not understood');
}
const directiveName = line.slice(0, colonIndex).trim().toLowerCase();
const directiveValue = line.slice(colonIndex + 1).trim();
verifyDirective(directiveName, directiveValue);
return {
directive: directiveName,
value: directiveValue,
};
}
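// Illustrative examples, derived from the parsing rules above:
//   parseLine('Disallow: /search # staff only') -> {directive: 'disallow', value: '/search'}
//   parseLine('# just a comment')               -> null
//   parseLine('no colon here')                  -> throws 'Syntax not understood'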
/**
* @param {string} content
* @returns {Array<{index: string, line: string, message: string}>}
*/
function validateRobots(content) {
/**
* @type {Array<{index: string, line: string, message: string}>}
*/
const errors = [];
let inGroup = false;
content
.split(/\r\n|\r|\n/)
.forEach((line, index) => {
let parsedLine;
try {
parsedLine = parseLine(line);
} catch (e) {
errors.push({
index: (index + 1).toString(),
line: line,
message: e.message.toString(),
});
}
if (!parsedLine) {
return;
}
// group-member records (allow, disallow) have to be preceded by a start-of-group record (user-agent)
// see: https://developers.google.com/search/reference/robots_txt#grouping-of-records
if (parsedLine.directive === DIRECTIVE_USER_AGENT) {
inGroup = true;
} else if (!inGroup && DIRECTIVES_GROUP_MEMBERS.has(parsedLine.directive)) {
errors.push({
index: (index + 1).toString(),
line: line,
message: 'No user-agent specified',
});
}
});
return errors;
}
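// Illustrative example, derived from the rules above: the two-line content
// 'disallow: /private\nfoo bar' produces two errors - index '1' with
// 'No user-agent specified' (a group-member record with no preceding user-agent record)
// and index '2' with 'Syntax not understood' (no colon on the line).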
class RobotsTxt extends Audit {
/**
* @return {LH.Audit.Meta}
*/
static get meta() {
return {
name: 'robots-txt',
description: 'robots.txt is valid',
failureDescription: 'robots.txt is not valid',
helpText: 'If your robots.txt file is malformed, crawlers may not be able to understand ' +
'how you want your website to be crawled or indexed.',
requiredArtifacts: ['RobotsTxt'],
};
}
/**
* @param {{RobotsTxt: {status: number, content: string}}} artifacts
* @return {LH.Audit.Product}
*/
static audit(artifacts) {
const {
status,
content,
} = artifacts.RobotsTxt;
if (!status) {
return {
rawValue: false,
debugString: 'Lighthouse was unable to download your robots.txt file',
};
}
if (status >= HTTP_SERVER_ERROR_CODE_LOW) {
return {
rawValue: false,
displayValue: `request for robots.txt returned HTTP ${status}`,
};
} else if (status >= HTTP_CLIENT_ERROR_CODE_LOW || content === '') {
return {
rawValue: true,
notApplicable: true,
};
}
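// At this point the response had a usable status (2xx/3xx) and a non-empty body, so the
// content itself is validated. Summary of the branches above: missing status -> failure with
// a debugString, 5xx -> failure, 4xx or empty body -> notApplicable, anything else -> validate.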
const validationErrors = validateRobots(content);
const headings = [
{key: 'index', itemType: 'text', text: 'Line #'},
{key: 'line', itemType: 'code', text: 'Content'},
{key: 'message', itemType: 'code', text: 'Error'},
];
const details = Audit.makeTableDetails(headings, validationErrors, {});
let displayValue;
if (validationErrors.length) {
displayValue = validationErrors.length > 1 ?
`${validationErrors.length} errors found` : '1 error found';
}
return {
rawValue: validationErrors.length === 0,
details,
displayValue,
};
}
}
module.exports = RobotsTxt;

View file

@@ -180,6 +180,7 @@ module.exports = {
'seo/font-size',
'seo/link-text',
'seo/is-crawlable',
'seo/robots-txt',
'seo/hreflang',
'seo/plugins',
'seo/canonical',
@@ -400,6 +401,7 @@ module.exports = {
{id: 'http-status-code', weight: 1, group: 'seo-crawl'},
{id: 'link-text', weight: 1, group: 'seo-content'},
{id: 'is-crawlable', weight: 1, group: 'seo-crawl'},
{id: 'robots-txt', weight: 1, group: 'seo-crawl'},
{id: 'hreflang', weight: 1, group: 'seo-content'},
{id: 'canonical', weight: 1, group: 'seo-content'},
{id: 'font-size', weight: 1, group: 'seo-mobile'},
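
Registering the audit in the default config (above) runs it as part of the SEO category. For iterating on just this audit during development, a minimal custom-config sketch is shown below; it assumes the standard `extends: 'lighthouse:default'` mechanism and the `settings.onlyAudits` option, neither of which is part of this change:

// custom-config.js (sketch, not part of this commit)
module.exports = {
  extends: 'lighthouse:default',
  settings: {
    onlyAudits: ['robots-txt'],
  },
};

It could then be used with something like `lighthouse https://example.com --config-path=custom-config.js`.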

View file

@@ -0,0 +1,242 @@
/**
* @license Copyright 2018 Google Inc. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/
'use strict';
const RobotsTxtAudit = require('../../../audits/seo/robots-txt.js');
const assert = require('assert');
/* eslint-env mocha */
describe('SEO: robots.txt audit', () => {
it('fails and reports error when no robots.txt was provided', () => {
const artifacts = {
RobotsTxt: {
status: null,
content: null,
},
};
const auditResult = RobotsTxtAudit.audit(artifacts);
assert.equal(auditResult.rawValue, false);
assert.ok(auditResult.debugString);
});
it('fails when request for /robots.txt returns an HTTP 500+ error', () => {
const testData = [
{
status: 500,
content: null,
},
{
status: 503,
content: 'There is some content',
},
{
status: 599,
content: null,
},
];
testData.forEach(RobotsTxt => {
const artifacts = {
RobotsTxt,
};
const auditResult = RobotsTxtAudit.audit(artifacts);
assert.equal(auditResult.rawValue, false);
});
});
it('fails when robots.txt file contains errors', () => {
const testData = [
{
RobotsTxt: {
status: 200,
content: 'Allow: /',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 201,
content: 'syntax error',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 301,
content: 'unknown: directive',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 200,
content: 'unknown: directive',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 200,
content: 'sitemap: /cant/be/relative.xml',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 200,
content: 'sitemap:#can\'t be empty',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 200,
content: 'user-agent: *\nallow: https://cant.be/absolute',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 399,
content: 'user-agent: *\nallow: must/start/with/a/slash',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 200,
content: 'user-agent: *\nallow: /dollar/sign$in/the/middle',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 200,
content: 'user-agent: *\nallow: must/start/with/a/slash',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 200,
content: `user-agent: *
allow: /
disallow: /test
user agent: wrong
alow: /wrong
disallow /wrong
`,
},
expectedErrors: 3,
},
{
RobotsTxt: {
status: 200,
content: `every
single
line
is
wrong
`,
},
expectedErrors: 5,
},
];
testData.forEach(({RobotsTxt, expectedErrors}) => {
const artifacts = {
RobotsTxt,
};
const auditResult = RobotsTxtAudit.audit(artifacts);
assert.equal(auditResult.rawValue, false);
assert.equal(auditResult.details.items.length, expectedErrors);
assert.ok(auditResult.displayValue);
});
});
it('not applicable when there is no robots.txt or it\'s empty', () => {
const testData = [
{
status: 404,
content: 'invalid content',
},
{
status: 401,
content: 'invalid content',
},
{
status: 200,
content: '',
},
];
testData.forEach(RobotsTxt => {
const artifacts = {
RobotsTxt,
};
const auditResult = RobotsTxtAudit.audit(artifacts);
assert.equal(auditResult.rawValue, true);
assert.equal(auditResult.notApplicable, true);
});
});
it('passes when robots.txt is valid', () => {
const testData = [
{
status: 200,
content: '#just a comment',
},
{
status: 201,
content: 'user-agent:*\ndisallow:',
},
{
status: 200,
content: 'USER-AGENT: *\nALLOW: / \nDISALLOW:#comment',
},
{
status: 204,
content: `User-agent: Twitterbot
Disallow:
User-agent: BadBot
Disallow: / # go away!
Sitemap: https://example.com/sitemap.xml
User-agent: Yandex
Host: https://brainly.com
clean-param: bla
User-agent: Bing
Disallow: /*.swf$
crawl-delay: 10
User-agent: NotOfficial
noindex: /bla
Visit-time: 0600-0845
Request-rate: 1/30m
`,
},
];
testData.forEach(RobotsTxt => {
const artifacts = {
RobotsTxt,
};
const auditResult = RobotsTxtAudit.audit(artifacts);
assert.equal(auditResult.rawValue, true);
});
});
});
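
The suite follows the existing mocha conventions (note the `/* eslint-env mocha */` pragma above). As a sketch, assuming the file sits next to the other SEO audit tests under lighthouse-core/test/audits/seo/ (the exact path and filename are not shown in this view), it can be run in isolation with something like `yarn mocha lighthouse-core/test/audits/seo/robots-txt-test.js`.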

View file

@@ -21,6 +21,7 @@
"lighthouse-core/gather/connections/**/*.js",
"lighthouse-core/gather/gatherers/gatherer.js",
"lighthouse-core/scripts/*.js",
"lighthouse-core/audits/seo/robots-txt.js",
"./typings/*.d.ts"
],
"exclude": [