gecko-dev/toolkit/actors/ContentMetaChild.sys.mjs

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// Debounce time in milliseconds - this should be long enough to account for
// sync script tags that could appear between desired meta tags
const TIMEOUT_DELAY = 1000;

const ACCEPTED_PROTOCOLS = ["http:", "https:"];

// Possible description tags, listed in order from least favourable to most favourable
const DESCRIPTION_RULES = [
  "twitter:description",
  "description",
  "og:description",
];

// Possible image tags, listed in order from least favourable to most favourable
const PREVIEW_IMAGE_RULES = [
  "thumbnail",
  "twitter:image",
  "og:image",
  "og:image:url",
  "og:image:secure_url",
];

/*
 * Checks if the incoming meta tag has a greater score than the current best
 * score by checking the index of the meta tag in the list of rules provided.
 *
 * @param {Array} aRules
 *          The list of rules for a given type of meta tag
 * @param {String} aTag
 *          The name or property of the incoming meta tag
 * @param {String} aEntry
 *          The current best entry for the given meta tag
 *
 * @returns {Boolean} true if the incoming meta tag is better than the current
 *                    best meta tag of that same kind, false otherwise
 */
function shouldExtractMetadata(aRules, aTag, aEntry) {
  return aRules.indexOf(aTag) > aEntry.currMaxScore;
}

/*
 * Ensure that the preview image URL is safe and valid before storing
 *
 * @param {URL} aURL
 *          A URL object that needs to be checked for valid principal and protocol
 *
 * @returns {Boolean} true if the preview URL is safe and can be stored, false otherwise
 */
function checkLoadURIStr(aURL) {
  if (!ACCEPTED_PROTOCOLS.includes(aURL.protocol)) {
    return false;
  }
  try {
    let ssm = Services.scriptSecurityManager;
    let principal = ssm.createNullPrincipal({});
    ssm.checkLoadURIStrWithPrincipal(
      principal,
      aURL.href,
      ssm.DISALLOW_INHERIT_PRINCIPAL
    );
  } catch (e) {
    return false;
  }
  return true;
}

/*
 * This listens to DOMMetaAdded events and collects relevant metadata about the
 * meta tag received. Then, it sends the metadata gathered from the meta tags
 * and the url of the page as it's payload to be inserted into moz_places.
 */
export class ContentMetaChild extends JSWindowActorChild {
  constructor() {
    super();

    // Store a mapping of the best description and preview
    // image collected so far for a given URL.
    this.metaTags = new Map();
  }

  didDestroy() {
    for (let entry of this.metaTags.values()) {
      entry.timeout.cancel();
    }
  }

  handleEvent(event) {
    switch (event.type) {
      case "DOMContentLoaded":
        const metaTags = this.contentWindow.document.querySelectorAll("meta");
        for (let metaTag of metaTags) {
          this.onMetaTag(metaTag);
        }
        break;
      case "DOMMetaAdded":
        this.onMetaTag(event.originalTarget);
        break;
      default:
    }
  }

  onMetaTag(metaTag) {
    const window = metaTag.ownerGlobal;

    // If there's no meta tag, ignore this. Also verify that the window
    // matches just to be safe.
    if (!metaTag || !metaTag.ownerDocument || window != this.contentWindow) {
      return;
    }

    const url = metaTag.ownerDocument.documentURI;

    let name = metaTag.name;
    let prop = metaTag.getAttributeNS(null, "property");
    if (!name && !prop) {
      return;
    }

    let tag = name || prop;

    const entry = this.metaTags.get(url) || {
      description: { value: null, currMaxScore: -1 },
      image: { value: null, currMaxScore: -1 },
      timeout: null,
    };

    // Malformed meta tag - do not store it
    const content = metaTag.getAttributeNS(null, "content");
    if (!content) {
      return;
    }

    if (shouldExtractMetadata(DESCRIPTION_RULES, tag, entry.description)) {
      // Extract the description
      entry.description.value = content;
      entry.description.currMaxScore = DESCRIPTION_RULES.indexOf(tag);
    } else if (shouldExtractMetadata(PREVIEW_IMAGE_RULES, tag, entry.image)) {
      // Extract the preview image
      let value;
      try {
        value = new URL(content, url);
      } catch (e) {
        return;
      }
      if (value && checkLoadURIStr(value)) {
        entry.image.value = value.href;
        entry.image.currMaxScore = PREVIEW_IMAGE_RULES.indexOf(tag);
      }
    } else {
      // We don't care about other meta tags
      return;
    }

    if (!this.metaTags.has(url)) {
      this.metaTags.set(url, entry);
    }

    if (entry.timeout) {
      entry.timeout.delay = TIMEOUT_DELAY;
    } else {
      // We want to debounce incoming meta tags until we're certain we have the
      // best one for description and preview image, and only store that one
      entry.timeout = Cc["@mozilla.org/timer;1"].createInstance(Ci.nsITimer);
      entry.timeout.initWithCallback(
        () => {
          entry.timeout = null;
          this.metaTags.delete(url);
          // We try to cancel the timers when we get destroyed, but if
          // there's a race, catch it:
          if (!this.manager || this.manager.isClosed) {
            return;
          }

          // Save description and preview image to moz_places
          this.sendAsyncMessage("Meta:SetPageInfo", {
            url,
            description: entry.description.value,
            previewImageURL: entry.image.value,
          });

          // Telemetry for recording the size of page metadata
          let metadataSize = entry.description.value
            ? entry.description.value.length
            : 0;
          metadataSize += entry.image.value ? entry.image.value.length : 0;
          Services.telemetry
            .getHistogramById("PAGE_METADATA_SIZE")
            .add(metadataSize);
        },
        TIMEOUT_DELAY,
        Ci.nsITimer.TYPE_ONE_SHOT
      );
    }
  }
}