Bug 1724320 - Collect Open Graph page data. r=mossop

Differential Revision: https://phabricator.services.mozilla.com/D121927
2021-08-09 17:16:29 +00:00 · 2021-08-09 17:16:29 +00:00 · 56b979ba71
--- a/browser/components/pagedata/OpenGraphPageData.jsm
+++ b/browser/components/pagedata/OpenGraphPageData.jsm
@ -0,0 +1,106 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+"use strict";
+
+var EXPORTED_SYMBOLS = ["OpenGraphPageData"];
+
+const { PageDataCollector } = ChromeUtils.import(
+  "resource:///modules/pagedata/PageDataCollector.jsm"
+);
+
+/**
+ * @typedef {object} GeneralPageData
+ *   General data about a page, as described by its Open Graph meta tags.
+ * @property {string | undefined} title
+ *   The title describing the page.
+ * @property {string | undefined} site_name
+ *   The name of the site the page is on.
+ * @property {string | undefined} type
+ *   The type of the object being described by Open Graph. See
+ *   https://ogp.me/#types for a list of possible types.
+ * @property {string | undefined} image
+ *   A URL pointing to an image that describes the page.
+ * @property {string | undefined} url
+ *   The permalink to the page.
+ */
+
+// Open Graph property names, without the "og:" prefix, that are copied into
+// the collected page data. All other Open Graph properties are ignored.
+const RELEVANT_TAGS = ["title", "site_name", "image", "type", "url"];
+
+/**
+ * Collects Open Graph related data from a page.
+ *
+ * TODO: Respond to DOM mutations to trigger recollection.
+ */
+class OpenGraphPageData extends PageDataCollector {
+  /**
+   * @see PageDataCollector.init
+   */
+  async init() {
+    return this.#collect();
+  }
+
+  /**
+   * Collects data from the meta tags on the page.
+   * See https://ogp.me/ for the parsing spec.
+   *
+   * @param {NodeList} tags
+   *  A NodeList of Open Graph meta tags.
+   * @returns {GeneralPageData}
+   *   Data describing the webpage.
+   */
+  #collectOpenGraphTags(tags) {
+    // Ensure all tags are present in the returned object, even if their values
+    // are undefined.
+    let pageData = Object.fromEntries(
+      RELEVANT_TAGS.map(tag => [tag, undefined])
+    );
+
+    for (let tag of tags) {
+      // Stripping "og:" from the property name.
+      let propertyName = tag.getAttribute("property").substring(3);
+      if (RELEVANT_TAGS.includes(propertyName)) {
+        pageData[propertyName] = tag.getAttribute("content");
+      }
+    }
+
+    return pageData;
+  }
+
+  /**
+   * Collects the existing data from the page.
+   *
+   * @returns {Data[]}
+   */
+  #collect() {
+    /**
+     * A map from item type to an array of the items found in the page.
+     */
+    let items = new Map();
+    let insert = (type, item) => {
+      let data = items.get(type);
+      if (!data) {
+        data = [];
+        items.set(type, data);
+      }
+      data.push(item);
+    };
+
+    // Sites can technically define an Open Graph prefix other than `og:`.
+    // However, `og:` is one of the default RDFa prefixes and it's likely
+    // uncommon that sites use a custom prefix. If we find that metadata is
+    // missing for common sites due to this issue, we could consider adding a
+    // basic RDFa parser.
+    let openGraphTags = this.document.querySelectorAll("meta[property^='og:'");
+    if (!openGraphTags.length) {
+      return [];
+    }
+    insert(
+      PageDataCollector.DATA_TYPE.GENERAL,
+      this.#collectOpenGraphTags(openGraphTags)
+    );
+
+    return Array.from(items, ([type, data]) => ({ type, data }));
+  }
+}
--- a/browser/components/pagedata/PageDataChild.jsm
+++ b/browser/components/pagedata/PageDataChild.jsm
@ -11,6 +11,7 @@ const { XPCOMUtils } = ChromeUtils.import(
 );

 XPCOMUtils.defineLazyModuleGetters(this, {
+  OpenGraphPageData: "resource:///modules/pagedata/OpenGraphPageData.jsm",
  PrivateBrowsingUtils: "resource://gre/modules/PrivateBrowsingUtils.jsm",
  SchemaOrgPageData: "resource:///modules/pagedata/SchemaOrgPageData.jsm",
  Services: "resource://gre/modules/Services.jsm",
@ -42,7 +43,7 @@ XPCOMUtils.defineLazyPreferenceGetter(
 * @returns {PageDataCollector[]}
 */
 function getCollectors(document) {
-  return [new SchemaOrgPageData(document)];
+  return [new SchemaOrgPageData(document), new OpenGraphPageData(document)];
 }

 /**
--- a/browser/components/pagedata/PageDataCollector.jsm
+++ b/browser/components/pagedata/PageDataCollector.jsm
@ -24,6 +24,7 @@ class PageDataCollector extends EventEmitter {
  static get DATA_TYPE() {
    return {
      PRODUCT: 1,
+      GENERAL: 2,
    };
  }

--- a/browser/components/pagedata/moz.build
+++ b/browser/components/pagedata/moz.build
@ -12,6 +12,7 @@ BROWSER_CHROME_MANIFESTS += [
 ]

 EXTRA_JS_MODULES.pagedata += [
+    "OpenGraphPageData.jsm",
    "PageDataCollector.jsm",
    "PageDataService.jsm",
    "SchemaOrgPageData.jsm",
--- a/browser/components/pagedata/tests/browser/browser.ini
+++ b/browser/components/pagedata/tests/browser/browser.ini
@ -8,8 +8,13 @@ prefs =
  browser.pagedata.enabled=true
 support-files =
  head.js
-  product1.html
-  product2.html

 [browser_pagedata_basic.js]
+[browser_pagedata_opengraph.js]
+support-files =
+  opengraph1.html
+  opengraph2.html
 [browser_pagedata_product.js]
+support-files =
+  product1.html
+  product2.html
--- a/browser/components/pagedata/tests/browser/browser_pagedata_opengraph.js
+++ b/browser/components/pagedata/tests/browser/browser_pagedata_opengraph.js
@ -0,0 +1,72 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Tests that the page data service can parse Open Graph metadata.
+ */
+
+// Directory of this test with the chrome://mochitests/content prefix replaced
+// by https://example.com; the test page file names are appended to it.
+const BASE_URL = getRootDirectory(gTestPath).replace(
+  "chrome://mochitests/content",
+  "https://example.com"
+);
+
+add_task(async function test_type_website() {
+  let promise = PageDataService.once("page-data");
+
+  const TEST_URL = BASE_URL + "opengraph1.html";
+
+  await BrowserTestUtils.withNewTab(TEST_URL, async browser => {
+    let pageData = await promise;
+    Assert.equal(pageData.url, TEST_URL, "Should have returned the loaded URL");
+    Assert.equal(pageData.data.length, 1, "Should have only one data item");
+    Assert.deepEqual(
+      pageData.data,
+      [
+        {
+          type: PageDataCollector.DATA_TYPE.GENERAL,
+          data: [
+            {
+              type: "website",
+              site_name: "Mozilla",
+              url: "https://www.mozilla.org/",
+              image: "https://example.com/preview-image",
+              title: "Internet for people, not profit",
+            },
+          ],
+        },
+      ],
+      "Should have returned the expected data"
+    );
+  });
+});
+
+add_task(async function test_type_movie() {
+  let promise = PageDataService.once("page-data");
+
+  const TEST_URL = BASE_URL + "opengraph2.html";
+
+  await BrowserTestUtils.withNewTab(TEST_URL, async browser => {
+    let pageData = await promise;
+    Assert.equal(pageData.url, TEST_URL, "Should have returned the loaded URL");
+    Assert.equal(pageData.data.length, 1, "Should have only one data item");
+    Assert.deepEqual(
+      pageData.data,
+      [
+        {
+          type: PageDataCollector.DATA_TYPE.GENERAL,
+          data: [
+            {
+              type: "video.movie",
+              site_name: undefined,
+              url: "https://www.imdb.com/title/tt0499004/",
+              image: "https://example.com/preview-code-rush",
+              title: "Code Rush (TV Movie 2000) - IMDb",
+            },
+          ],
+        },
+      ],
+      "Should have returned the expected data"
+    );
+  });
+});
--- a/browser/components/pagedata/tests/browser/opengraph1.html
+++ b/browser/components/pagedata/tests/browser/opengraph1.html
@ -0,0 +1,17 @@
+<html>
+<head>
+  <title>Internet for people, not profit — Mozilla</title>
+  <meta http-equiv="Content-Type" content="text/html;charset=utf-8">
+  <meta property="og:type" content="website">
+  <meta property="og:site_name" content="Mozilla">
+  <meta property="og:url" content="https://www.mozilla.org/">
+  <meta property="og:image" content="https://example.com/preview-image">
+  <meta property="og:title" content="Internet for people, not profit">
+  <!-- The collector is expected to ignore Open Graph tags it does not collect. -->
+  <meta property="og:locale" content="en_CA">
+  <meta property="og:description" content="Mozilla is the not-for-profit behind the lightning fast Firefox browser. We put people over profit to give everyone more power online.">
+</head>
+<body>
+  <p>Test page</p>
+</body>
+</html>
--- a/browser/components/pagedata/tests/browser/opengraph2.html
+++ b/browser/components/pagedata/tests/browser/opengraph2.html
@ -0,0 +1,16 @@
+<html>
+<head>
+  <title>Code Rush (TV Movie 2000)</title>
+  <meta property="og:url" content="https://www.imdb.com/title/tt0499004/"/>
+  <!-- Omitting og:site_name to test that the parser doesn't break on missing tags. -->
+  <meta property="og:title" content="Code Rush (TV Movie 2000) - IMDb"/>
+  <meta property="og:description" content="This is the description of the movie."/>
+  <meta property="og:type" content="video.movie"/>
+  <meta property="og:image" content="https://example.com/preview-code-rush"/>
+  <meta property="og:image:height" content="750"/>
+  <meta property="og:image:width" content="1000"/>
+</head>
+<body>
+  <p>Test page</p>
+</body>
+</html>