new_audit(robots-txt): /robots.txt validation (#4845)

Konrad Dzwinel 2018-03-27 23:15:26 +02:00 committed by Brendan Kenny
Parent f7efaa5468
Commit 42d47ba36d
5 changed files: 474 additions and 0 deletions

View file

@@ -66,6 +66,10 @@ module.exports = [
'canonical': {
score: 1,
},
'robots-txt': {
rawValue: true,
notApplicable: true,
},
},
},
{

View file

@@ -0,0 +1,225 @@
/**
* @license Copyright 2018 Google Inc. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/
'use strict';
/**
* @fileoverview Validates a robots.txt file according to the official standard and the various
* extensions respected by popular web crawlers.
* Validator rules, and the resources backing these rules, can be found here:
* https://github.com/GoogleChrome/lighthouse/issues/4356#issuecomment-375489925
*/
const Audit = require('../audit');
const URL = require('../../lib/url-shim');
const HTTP_CLIENT_ERROR_CODE_LOW = 400;
const HTTP_SERVER_ERROR_CODE_LOW = 500;
const DIRECTIVE_SITEMAP = 'sitemap';
const DIRECTIVE_USER_AGENT = 'user-agent';
const DIRECTIVE_ALLOW = 'allow';
const DIRECTIVE_DISALLOW = 'disallow';
const DIRECTIVES_GROUP_MEMBERS = new Set([DIRECTIVE_ALLOW, DIRECTIVE_DISALLOW]);
const DIRECTIVE_SAFELIST = new Set([
DIRECTIVE_USER_AGENT, DIRECTIVE_DISALLOW, // standard
DIRECTIVE_ALLOW, DIRECTIVE_SITEMAP, // universally supported
'crawl-delay', // yahoo, bing, yandex
'clean-param', 'host', // yandex
'request-rate', 'visit-time', 'noindex', // not officially supported, but used in the wild
]);
const SITEMAP_VALID_PROTOCOLS = new Set(['https:', 'http:', 'ftp:']);
/**
* @param {string} directiveName
* @param {string} directiveValue
* @throws will throw an exception if the given directive is invalid
*/
function verifyDirective(directiveName, directiveValue) {
if (!DIRECTIVE_SAFELIST.has(directiveName)) {
throw new Error('Unknown directive');
}
if (directiveName === DIRECTIVE_SITEMAP) {
let sitemapUrl;
try {
sitemapUrl = new URL(directiveValue);
} catch (e) {
throw new Error('Invalid sitemap URL');
}
if (!SITEMAP_VALID_PROTOCOLS.has(sitemapUrl.protocol)) {
throw new Error('Invalid sitemap URL protocol');
}
}
if (directiveName === DIRECTIVE_USER_AGENT && !directiveValue) {
throw new Error('No user-agent specified');
}
if (directiveName === DIRECTIVE_ALLOW || directiveName === DIRECTIVE_DISALLOW) {
if (directiveValue !== '' && directiveValue[0] !== '/' && directiveValue[0] !== '*') {
throw new Error('Pattern should either be empty, start with "/" or "*"');
}
const dollarIndex = directiveValue.indexOf('$');
if (dollarIndex !== -1 && dollarIndex !== directiveValue.length - 1) {
throw new Error('"$" should only be used at the end of the pattern');
}
}
}
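// Illustrative examples, derived from the checks above:
//   verifyDirective('allow', '/public')          -> passes
//   verifyDirective('allow', 'no/leading/slash') -> throws 'Pattern should either be empty, start with "/" or "*"'
//   verifyDirective('sitemap', '/relative.xml')  -> throws 'Invalid sitemap URL' (the URL must be absolute)
//   verifyDirective('foo', 'bar')                -> throws 'Unknown directive'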
/**
* @param {string} line single line from a robots.txt file
* @throws will throw an exception if the given line has errors
* @returns {{directive: string, value: string}|null}
*/
function parseLine(line) {
const hashIndex = line.indexOf('#');
if (hashIndex !== -1) {
line = line.substr(0, hashIndex);
}
line = line.trim();
if (line.length === 0) {
return null;
}
const colonIndex = line.indexOf(':');
if (colonIndex === -1) {
throw new Error('Syntax not understood');
}
const directiveName = line.slice(0, colonIndex).trim().toLowerCase();
const directiveValue = line.slice(colonIndex + 1).trim();
verifyDirective(directiveName, directiveValue);
return {
directive: directiveName,
value: directiveValue,
};
}
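// Illustrative examples, derived from the parsing rules above:
//   parseLine('Disallow: /search # staff only') -> {directive: 'disallow', value: '/search'}
//   parseLine('# just a comment')               -> null
//   parseLine('no colon here')                  -> throws 'Syntax not understood'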
/**
* @param {string} content
* @returns {Array<{index: string, line: string, message: string}>}
*/
function validateRobots(content) {
/**
* @type {Array<{index: string, line: string, message: string}>}
*/
const errors = [];
let inGroup = false;
content
.split(/\r\n|\r|\n/)
.forEach((line, index) => {
let parsedLine;
try {
parsedLine = parseLine(line);
} catch (e) {
errors.push({
index: (index + 1).toString(),
line: line,
message: e.message.toString(),
});
}
if (!parsedLine) {
return;
}
// group-member records (allow, disallow) have to be preceded by a start-of-group record (user-agent)
// see: https://developers.google.com/search/reference/robots_txt#grouping-of-records
if (parsedLine.directive === DIRECTIVE_USER_AGENT) {
inGroup = true;
} else if (!inGroup && DIRECTIVES_GROUP_MEMBERS.has(parsedLine.directive)) {
errors.push({
index: (index + 1).toString(),
line: line,
message: 'No user-agent specified',
});
}
});
return errors;
}
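// Illustrative example, derived from the rules above: the two-line content
// 'disallow: /private\nfoo bar' produces two errors - index '1' with
// 'No user-agent specified' (a group-member record with no preceding user-agent record)
// and index '2' with 'Syntax not understood' (no colon on the line).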
class RobotsTxt extends Audit {
/**
* @return {LH.Audit.Meta}
*/
static get meta() {
return {
name: 'robots-txt',
description: 'robots.txt is valid',
failureDescription: 'robots.txt is not valid',
helpText: 'If your robots.txt file is malformed, crawlers may not be able to understand ' +
'how you want your website to be crawled or indexed.',
requiredArtifacts: ['RobotsTxt'],
};
}
/**
* @param {{RobotsTxt: {status: number, content: string}}} artifacts
* @return {LH.Audit.Product}
*/
static audit(artifacts) {
const {
status,
content,
} = artifacts.RobotsTxt;
if (!status) {
return {
rawValue: false,
debugString: 'Lighthouse was unable to download your robots.txt file',
};
}
if (status >= HTTP_SERVER_ERROR_CODE_LOW) {
return {
rawValue: false,
displayValue: `request for robots.txt returned HTTP ${status}`,
};
} else if (status >= HTTP_CLIENT_ERROR_CODE_LOW || content === '') {
return {
rawValue: true,
notApplicable: true,
};
}
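// At this point the response had a usable status (2xx/3xx) and a non-empty body, so the
// content itself is validated. Summary of the branches above: missing status -> failure with
// a debugString, 5xx -> failure, 4xx or empty body -> notApplicable, anything else -> validate.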
const validationErrors = validateRobots(content);
const headings = [
{key: 'index', itemType: 'text', text: 'Line #'},
{key: 'line', itemType: 'code', text: 'Content'},
{key: 'message', itemType: 'code', text: 'Error'},
];
const details = Audit.makeTableDetails(headings, validationErrors, {});
let displayValue;
if (validationErrors.length) {
displayValue = validationErrors.length > 1 ?
`${validationErrors.length} errors found` : '1 error found';
}
return {
rawValue: validationErrors.length === 0,
details,
displayValue,
};
}
}
module.exports = RobotsTxt;

View file

@@ -180,6 +180,7 @@ module.exports = {
'seo/font-size',
'seo/link-text',
'seo/is-crawlable',
'seo/robots-txt',
'seo/hreflang',
'seo/plugins',
'seo/canonical',
@@ -400,6 +401,7 @@ module.exports = {
{id: 'http-status-code', weight: 1, group: 'seo-crawl'},
{id: 'link-text', weight: 1, group: 'seo-content'},
{id: 'is-crawlable', weight: 1, group: 'seo-crawl'},
{id: 'robots-txt', weight: 1, group: 'seo-crawl'},
{id: 'hreflang', weight: 1, group: 'seo-content'},
{id: 'canonical', weight: 1, group: 'seo-content'},
{id: 'font-size', weight: 1, group: 'seo-mobile'},
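
Registering the audit in the default config (above) runs it as part of the SEO category. For iterating on just this audit during development, a minimal custom-config sketch is shown below; it assumes the standard `extends: 'lighthouse:default'` mechanism and the `settings.onlyAudits` option, neither of which is part of this change:

// custom-config.js (sketch, not part of this commit)
module.exports = {
  extends: 'lighthouse:default',
  settings: {
    onlyAudits: ['robots-txt'],
  },
};

It could then be used with something like `lighthouse https://example.com --config-path=custom-config.js`.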

View file

@@ -0,0 +1,242 @@
/**
* @license Copyright 2018 Google Inc. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/
'use strict';
const RobotsTxtAudit = require('../../../audits/seo/robots-txt.js');
const assert = require('assert');
/* eslint-env mocha */
describe('SEO: robots.txt audit', () => {
it('fails and reports error when no robots.txt was provided', () => {
const artifacts = {
RobotsTxt: {
status: null,
content: null,
},
};
const auditResult = RobotsTxtAudit.audit(artifacts);
assert.equal(auditResult.rawValue, false);
assert.ok(auditResult.debugString);
});
it('fails when request for /robots.txt returns an HTTP 500+ error', () => {
const testData = [
{
status: 500,
content: null,
},
{
status: 503,
content: 'There is some content',
},
{
status: 599,
content: null,
},
];
testData.forEach(RobotsTxt => {
const artifacts = {
RobotsTxt,
};
const auditResult = RobotsTxtAudit.audit(artifacts);
assert.equal(auditResult.rawValue, false);
});
});
it('fails when robots.txt file contains errors', () => {
const testData = [
{
RobotsTxt: {
status: 200,
content: 'Allow: /',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 201,
content: 'syntax error',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 301,
content: 'unknown: directive',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 200,
content: 'unknown: directive',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 200,
content: 'sitemap: /cant/be/relative.xml',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 200,
content: 'sitemap:#can\'t be empty',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 200,
content: 'user-agent: *\nallow: https://cant.be/absolute',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 399,
content: 'user-agent: *\nallow: must/start/with/a/slash',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 200,
content: 'user-agent: *\nallow: /dollar/sign$in/the/middle',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 200,
content: 'user-agent: *\nallow: must/start/with/a/slash',
},
expectedErrors: 1,
},
{
RobotsTxt: {
status: 200,
content: `user-agent: *
allow: /
disallow: /test
user agent: wrong
alow: /wrong
disallow /wrong
`,
},
expectedErrors: 3,
},
{
RobotsTxt: {
status: 200,
content: `every
single
line
is
wrong
`,
},
expectedErrors: 5,
},
];
testData.forEach(({RobotsTxt, expectedErrors}) => {
const artifacts = {
RobotsTxt,
};
const auditResult = RobotsTxtAudit.audit(artifacts);
assert.equal(auditResult.rawValue, false);
assert.equal(auditResult.details.items.length, expectedErrors);
assert.ok(auditResult.displayValue);
});
});
it('not applicable when there is no robots.txt or it\'s empty', () => {
const testData = [
{
status: 404,
content: 'invalid content',
},
{
status: 401,
content: 'invalid content',
},
{
status: 200,
content: '',
},
];
testData.forEach(RobotsTxt => {
const artifacts = {
RobotsTxt,
};
const auditResult = RobotsTxtAudit.audit(artifacts);
assert.equal(auditResult.rawValue, true);
assert.equal(auditResult.notApplicable, true);
});
});
it('passes when robots.txt is valid', () => {
const testData = [
{
status: 200,
content: '#just a comment',
},
{
status: 201,
content: 'user-agent:*\ndisallow:',
},
{
status: 200,
content: 'USER-AGENT: *\nALLOW: / \nDISALLOW:#comment',
},
{
status: 204,
content: `User-agent: Twitterbot
Disallow:
User-agent: BadBot
Disallow: / # go away!
Sitemap: https://example.com/sitemap.xml
User-agent: Yandex
Host: https://brainly.com
clean-param: bla
User-agent: Bing
Disallow: /*.swf$
crawl-delay: 10
User-agent: NotOfficial
noindex: /bla
Visit-time: 0600-0845
Request-rate: 1/30m
`,
},
];
testData.forEach(RobotsTxt => {
const artifacts = {
RobotsTxt,
};
const auditResult = RobotsTxtAudit.audit(artifacts);
assert.equal(auditResult.rawValue, true);
});
});
});
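
The suite follows the existing mocha conventions (note the `/* eslint-env mocha */` pragma above). As a sketch, assuming the file sits next to the other SEO audit tests under lighthouse-core/test/audits/seo/ (the exact path and filename are not shown in this view), it can be run in isolation with something like `yarn mocha lighthouse-core/test/audits/seo/robots-txt-test.js`.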

View file

@@ -21,6 +21,7 @@
"lighthouse-core/gather/connections/**/*.js",
"lighthouse-core/gather/gatherers/gatherer.js",
"lighthouse-core/scripts/*.js",
"lighthouse-core/audits/seo/robots-txt.js",
"./typings/*.d.ts"
],
"exclude": [