Bug 1724320 - Collect Open Graph page data. r=mossop

Differential Revision: https://phabricator.services.mozilla.com/D121927
This commit is contained in:
Harry Twyford 2021-08-09 17:16:29 +00:00
Родитель f4c00a8872
Коммит 56b979ba71
8 изменённых файлов: 222 добавлений и 3 удалений

Просмотреть файл

@ -0,0 +1,106 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
"use strict";
var EXPORTED_SYMBOLS = ["OpenGraphPageData"];
const { PageDataCollector } = ChromeUtils.import(
"resource:///modules/pagedata/PageDataCollector.jsm"
);
/**
* @typedef {object} GeneralPageData
* General data about a page, collected from its Open Graph meta tags.
* @property {string | undefined} title
* The title describing the page.
* @property {string | undefined} site_name
* The name of the site the page is on.
* @property {string | undefined} type
* The type of the object being described by Open Graph. See
* https://ogp.me/#types for a list of possible types.
* @property {string | undefined} image
* A URL pointing to an image that describes the page.
* @property {string | undefined} url
* The permalink to the page.
*/
// Open Graph property names, with the "og:" prefix removed, that are copied
// into the collected page data. Any other property is ignored.
const RELEVANT_TAGS = [
  "title",
  "site_name",
  "image",
  "type",
  "url",
];
/**
 * Collects Open Graph related data from a page.
 *
 * Only `<meta>` elements whose `property` attribute starts with `og:` are
 * considered, and only the properties listed in RELEVANT_TAGS are kept.
 *
 * TODO: Respond to DOM mutations to trigger recollection.
 */
class OpenGraphPageData extends PageDataCollector {
  /**
   * @see PageDataCollector.init
   */
  async init() {
    return this.#collect();
  }

  /**
   * Collects data from the meta tags on the page.
   * See https://ogp.me/ for the parsing spec.
   *
   * @param {NodeList} tags
   *   A NodeList of Open Graph meta tags. Every element is expected to carry
   *   a `property` attribute beginning with `og:`.
   * @returns {GeneralPageData}
   *   Data describing the webpage. Every key in RELEVANT_TAGS is present;
   *   keys the page does not provide are left undefined.
   */
  #collectOpenGraphTags(tags) {
    // Ensure all tags are present in the returned object, even if their values
    // are undefined.
    let pageData = Object.fromEntries(
      RELEVANT_TAGS.map(tag => [tag, undefined])
    );

    for (let tag of tags) {
      // Stripping "og:" from the property name.
      let propertyName = tag.getAttribute("property").substring(3);
      if (!RELEVANT_TAGS.includes(propertyName)) {
        continue;
      }

      // A property may be repeated (e.g. several og:image tags). The Open
      // Graph spec gives the first tag preference, so never overwrite a value
      // that has already been collected.
      if (pageData[propertyName] !== undefined) {
        continue;
      }

      // getAttribute returns null when the attribute is absent; leave the
      // entry undefined in that case so the result matches GeneralPageData.
      let content = tag.getAttribute("content");
      if (content !== null) {
        pageData[propertyName] = content;
      }
    }

    return pageData;
  }

  /**
   * Collects the existing data from the page.
   *
   * @returns {Data[]}
   *   An empty array when the page has no Open Graph meta tags, otherwise one
   *   entry per data type, each holding the items found for that type.
   */
  #collect() {
    /**
     * A map from item type to an array of the items found in the page.
     */
    let items = new Map();
    let insert = (type, item) => {
      let data = items.get(type);
      if (!data) {
        data = [];
        items.set(type, data);
      }
      data.push(item);
    };

    // Sites can technically define an Open Graph prefix other than `og:`.
    // However, `og:` is one of the default RDFa prefixes and it's likely
    // uncommon that sites use a custom prefix. If we find that metadata is
    // missing for common sites due to this issue, we could consider adding a
    // basic RDFa parser.
    let openGraphTags = this.document.querySelectorAll(
      "meta[property^='og:']"
    );
    if (!openGraphTags.length) {
      return [];
    }

    insert(
      PageDataCollector.DATA_TYPE.GENERAL,
      this.#collectOpenGraphTags(openGraphTags)
    );

    return Array.from(items, ([type, data]) => ({ type, data }));
  }
}

Просмотреть файл

@ -11,6 +11,7 @@ const { XPCOMUtils } = ChromeUtils.import(
);
XPCOMUtils.defineLazyModuleGetters(this, {
OpenGraphPageData: "resource:///modules/pagedata/OpenGraphPageData.jsm",
PrivateBrowsingUtils: "resource://gre/modules/PrivateBrowsingUtils.jsm",
SchemaOrgPageData: "resource:///modules/pagedata/SchemaOrgPageData.jsm",
Services: "resource://gre/modules/Services.jsm",
@ -42,7 +43,7 @@ XPCOMUtils.defineLazyPreferenceGetter(
* @returns {PageDataCollector[]}
*/
function getCollectors(document) {
  // One collector per supported metadata format, each constructed for the
  // given document.
  return [new SchemaOrgPageData(document), new OpenGraphPageData(document)];
}
/**

Просмотреть файл

@ -24,6 +24,7 @@ class PageDataCollector extends EventEmitter {
static get DATA_TYPE() {
return {
PRODUCT: 1,
GENERAL: 2,
};
}

Просмотреть файл

@ -12,6 +12,7 @@ BROWSER_CHROME_MANIFESTS += [
]
EXTRA_JS_MODULES.pagedata += [
"OpenGraphPageData.jsm",
"PageDataCollector.jsm",
"PageDataService.jsm",
"SchemaOrgPageData.jsm",

Просмотреть файл

@ -8,8 +8,13 @@ prefs =
browser.pagedata.enabled=true
support-files =
head.js
product1.html
product2.html
[browser_pagedata_basic.js]
[browser_pagedata_opengraph.js]
support-files =
opengraph1.html
opengraph2.html
[browser_pagedata_product.js]
support-files =
product1.html
product2.html

Просмотреть файл

@ -0,0 +1,72 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/**
* Tests that the page data service can parse Open Graph metadata.
*/
// Rewrite the chrome:// location of this test's directory into the matching
// https://example.com URL, so the fixture pages are loaded over https.
const testDirectory = getRootDirectory(gTestPath);
const BASE_URL = testDirectory.replace(
  "chrome://mochitests/content",
  "https://example.com"
);
/**
 * Loads a page whose Open Graph type is "website" and checks that all five
 * relevant properties are reported, while unrecognised ones are left out.
 */
add_task(async function test_type_website() {
  // Start listening before the tab is opened; the listener is awaited only
  // once the tab exists.
  const pageDataReceived = PageDataService.once("page-data");
  const TEST_URL = `${BASE_URL}opengraph1.html`;

  const expectedData = [
    {
      type: PageDataCollector.DATA_TYPE.GENERAL,
      data: [
        {
          type: "website",
          site_name: "Mozilla",
          url: "https://www.mozilla.org/",
          image: "https://example.com/preview-image",
          title: "Internet for people, not profit",
        },
      ],
    },
  ];

  await BrowserTestUtils.withNewTab(TEST_URL, async browser => {
    const received = await pageDataReceived;

    Assert.equal(received.url, TEST_URL, "Should have returned the loaded URL");
    Assert.equal(received.data.length, 1, "Should have only one data item");
    Assert.deepEqual(
      received.data,
      expectedData,
      "Should have returned the expected data"
    );
  });
});
/**
 * Loads a page whose Open Graph type is "video.movie" and which omits
 * og:site_name, and checks that the missing property is reported as
 * undefined rather than breaking collection.
 */
add_task(async function test_type_movie() {
  // Start listening before the tab is opened; the listener is awaited only
  // once the tab exists.
  const pageDataReceived = PageDataService.once("page-data");
  const TEST_URL = `${BASE_URL}opengraph2.html`;

  const expectedData = [
    {
      type: PageDataCollector.DATA_TYPE.GENERAL,
      data: [
        {
          type: "video.movie",
          site_name: undefined,
          url: "https://www.imdb.com/title/tt0499004/",
          image: "https://example.com/preview-code-rush",
          title: "Code Rush (TV Movie 2000) - IMDb",
        },
      ],
    },
  ];

  await BrowserTestUtils.withNewTab(TEST_URL, async browser => {
    const received = await pageDataReceived;

    Assert.equal(received.url, TEST_URL, "Should have returned the loaded URL");
    Assert.equal(received.data.length, 1, "Should have only one data item");
    Assert.deepEqual(
      received.data,
      expectedData,
      "Should have returned the expected data"
    );
  });
});

Просмотреть файл

@ -0,0 +1,17 @@
<html>
<head>
<title>Internet for people, not profit — Mozilla</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<meta property="og:type" content="website">
<meta property="og:site_name" content="Mozilla">
<meta property="og:url" content="https://www.mozilla.org/">
<meta property="og:image" content="https://example.com/preview-image">
<meta property="og:title" content="Internet for people, not profit">
<!-- We expect the test will ignore tags the parser does not recognize. -->
<meta property="og:locale" content="en_CA">
<meta property="og:description" content="Mozilla is the not-for-profit behind the lightning fast Firefox browser. We put people over profit to give everyone more power online.">
</head>
<body>
<p>Test page</p>
</body>
</html>

Просмотреть файл

@ -0,0 +1,16 @@
<html>
<head>
<title>Code Rush (TV Movie 2000)</title>
<meta property="og:url" content="https://www.imdb.com/title/tt0499004/"/>
<!-- Omitting og:site_name to test that the parser doesn't break on missing tags. -->
<meta property="og:title" content="Code Rush (TV Movie 2000) - IMDb"/>
<meta property="og:description" content="This is the description of the movie."/>
<meta property="og:type" content="video.movie"/>
<meta property="og:image" content="https://example.com/preview-code-rush"/>
<meta property="og:image:height" content="750"/>
<meta property="og:image:width" content="1000"/>
</head>
<body>
<p>Test page</p>
</body>
</html>