mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-08 19:04:45 +00:00
Bug 1301715 - Extract website metadata and make it available. r=ahunt
This patch introduces WebsiteMetadata.jsm which imports fathom and page-metadata-parser. The code has been slightly modified to not depend on more node libraries. On DOMContentLoaded the module will extract the metadata asynchronously and send it with a 'Website:Metadata' event. MozReview-Commit-ID: LxhYOTvvdsF --HG-- extra : rebase_source : e31286bd7268ad62d55f1a5318cde79442e9acba
This commit is contained in:
parent
e3bfff82b5
commit
63a89783cc
@ -116,6 +116,8 @@ XPCOMUtils.defineLazyModuleGetter(this, "Snackbars", "resource://gre/modules/Sna
|
||||
|
||||
XPCOMUtils.defineLazyModuleGetter(this, "RuntimePermissions", "resource://gre/modules/RuntimePermissions.jsm");
|
||||
|
||||
XPCOMUtils.defineLazyModuleGetter(this, "WebsiteMetadata", "resource://gre/modules/WebsiteMetadata.jsm");
|
||||
|
||||
XPCOMUtils.defineLazyServiceGetter(this, "FontEnumerator",
|
||||
"@mozilla.org/gfx/fontenumerator;1",
|
||||
"nsIFontEnumerator");
|
||||
@ -3930,6 +3932,9 @@ Tab.prototype = {
|
||||
|
||||
this.browser.addEventListener("pagehide", listener, true);
|
||||
}
|
||||
|
||||
WebsiteMetadata.parseAsynchronously(this.browser.contentDocument);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
|
468
mobile/android/modules/WebsiteMetadata.jsm
Normal file
468
mobile/android/modules/WebsiteMetadata.jsm
Normal file
@ -0,0 +1,468 @@
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
'use strict';
|
||||
|
||||
const { classes: Cc, interfaces: Ci, utils: Cu } = Components;
|
||||
|
||||
this.EXPORTED_SYMBOLS = ["WebsiteMetadata"];
|
||||
|
||||
Cu.import("resource://gre/modules/XPCOMUtils.jsm");
|
||||
|
||||
XPCOMUtils.defineLazyModuleGetter(this, "Messaging", "resource://gre/modules/Messaging.jsm");
|
||||
XPCOMUtils.defineLazyModuleGetter(this, "Task", "resource://gre/modules/Task.jsm");
|
||||
|
||||
var WebsiteMetadata = {
  /**
   * Parse the document, extract its metadata and send it with a
   * 'Website:Metadata' message.
   *
   * The work is run through Task.spawn(). If extracting the metadata or
   * sending the message throws, the error is reported with Cu.reportError()
   * instead of being dropped.
   *
   * @param {Document} doc
   *        The content document to extract metadata from. Its location.href
   *        is used both as the URL passed to the parser and as the
   *        `location` of the message.
   * @return {Promise}
   *         The promise produced by Task.spawn() with the error reporter
   *         attached. Callers are free to ignore it.
   */
  parseAsynchronously: function(doc) {
    return Task.spawn(function() {
      // Read the URL once so the parser and the message always agree on it.
      const location = doc.location.href;
      const metadata = getMetadata(doc, location);

      Messaging.sendRequest({
        type: 'Website:Metadata',
        location: location,
        metadata: metadata,
      });
    }).catch(function(e) {
      // Metadata is best effort: report the failure, do not propagate it.
      Cu.reportError(e);
    });
  }
};
|
||||
|
||||
// #################################################################################################
|
||||
// # Modified version of makeUrlAbsolute() to not import url parser library (and dependencies)
|
||||
// #################################################################################################
|
||||
|
||||
/**
 * Turn a possibly relative URL into an absolute one.
 *
 * Resolution is delegated to the DOM: the value is assigned to the href of an
 * <a> element created in context.doc and then read back.
 *
 * @param {Object} context
 *        Object whose `doc` property is the document used to create the
 *        helper element.
 * @param {string} relative
 *        The URL to resolve.
 * @return {string|undefined}
 *         The href read back from the helper element, or undefined when
 *         `relative` is null, undefined or the empty string.
 */
function makeUrlAbsolute(context, relative) {
  // A missing attribute arrives here as null. Assigning that to href would be
  // stringified by the DOM and come back as a bogus absolute URL, so bail out
  // instead of inventing a URL the page never declared.
  if (relative === null || relative === undefined || relative === '') {
    return undefined;
  }

  const anchor = context.doc.createElement('a');
  anchor.href = relative;
  return anchor.href;
}
|
||||
|
||||
// #################################################################################################
|
||||
// # page-metadata-parser
|
||||
// # https://github.com/mozilla/page-metadata-parser/
|
||||
// # 61c58cbd0f0bf2153df832a388a79c66b288b98c
|
||||
// #################################################################################################
|
||||
|
||||
/**
 * Compile the rules of one metadata field into an extractor function.
 *
 * Every [selector, handler] pair becomes a dom() rule whose fact carries the
 * handler's result as notes. The rules are reversed before being numbered so
 * that the index used as score grows with the position's priority: the first
 * entry of `rules` receives the highest score and wins when several match.
 *
 * @param {string} name
 *        Flavor under which the facts are filed; also used to look the
 *        winning node up again.
 * @param {Array} rules
 *        [selector, handler] pairs, most preferred first. `handler` receives
 *        the knowledgebase node of a matching element.
 * @param {Array} [processors]
 *        Functions (value, context) => value applied in order to the winning
 *        value.
 * @return {function}
 *         (doc, context) => value. Returns undefined when nothing matched or
 *         the value is empty before or after processing. A value that has a
 *         trim() method is returned trimmed.
 */
function buildRuleset(name, rules, processors) {
  const reversedRules = Array.from(rules).reverse();
  const builtRuleset = ruleset(...reversedRules.map(([query, handler], order) => rule(
    dom(query),
    node => [{
      score: order,
      flavor: name,
      notes: handler(node),
    }]
  )));

  return (doc, context) => {
    const kb = builtRuleset.score(doc);
    const maxNode = kb.max(name);
    if (!maxNode) {
      return undefined;
    }

    let value = maxNode.flavors.get(name);

    // A matching element without the wanted attribute yields null here.
    // Processors expect a real value (they call string methods on it), so an
    // empty result must not reach them.
    if (!value) {
      return undefined;
    }

    if (processors) {
      for (const processor of processors) {
        value = processor(value, context);
      }
    }

    if (!value) {
      return undefined;
    }

    return value.trim ? value.trim() : value;
  };
}
|
||||
|
||||
// Handlers shared by the rules below. Each receives the knowledgebase node
// of a matching element and reads one attribute off that element.
const contentAttribute = node => node.element.getAttribute('content');
const hrefAttribute = node => node.element.getAttribute('href');

/**
 * Default rule sets, keyed by the name of the metadata field they produce.
 *
 * Each entry has a `rules` array of [selector, handler] pairs and an optional
 * `processors` array of functions (value, context) => value that post-process
 * the extracted value. Within a `rules` array earlier entries take precedence
 * over later ones.
 */
const metadataRules = {
  description: {
    rules: [
      ['meta[property="og:description"]', contentAttribute],
      ['meta[name="description"]', contentAttribute],
    ],
  },

  icon_url: {
    rules: [
      ['link[rel="apple-touch-icon"]', hrefAttribute],
      ['link[rel="apple-touch-icon-precomposed"]', hrefAttribute],
      ['link[rel="icon"]', hrefAttribute],
      ['link[rel="fluid-icon"]', hrefAttribute],
      ['link[rel="shortcut icon"]', hrefAttribute],
      ['link[rel="Shortcut Icon"]', hrefAttribute],
      ['link[rel="mask-icon"]', hrefAttribute],
    ],
    processors: [
      (icon_url, context) => makeUrlAbsolute(context, icon_url)
    ]
  },

  image_url: {
    rules: [
      ['meta[property="og:image:secure_url"]', contentAttribute],
      ['meta[property="og:image:url"]', contentAttribute],
      ['meta[property="og:image"]', contentAttribute],
      ['meta[property="twitter:image"]', contentAttribute],
      ['meta[name="thumbnail"]', contentAttribute],
    ],
    processors: [
      (image_url, context) => makeUrlAbsolute(context, image_url)
    ],
  },

  keywords: {
    rules: [
      ['meta[name="keywords"]', contentAttribute],
    ],
    processors: [
      // Split the comma separated list into trimmed keywords. A missing
      // attribute (non-string input) yields undefined rather than throwing,
      // and entries left empty by stray commas are dropped.
      (keywords) => {
        if (typeof keywords !== 'string') {
          return undefined;
        }
        return keywords.split(',')
                       .map((keyword) => keyword.trim())
                       .filter((keyword) => keyword !== '');
      },
    ]
  },

  title: {
    rules: [
      ['meta[property="og:title"]', contentAttribute],
      ['meta[property="twitter:title"]', contentAttribute],
      ['meta[name="hdl"]', contentAttribute],
      ['title', node => node.element.text],
    ],
  },

  type: {
    rules: [
      ['meta[property="og:type"]', contentAttribute],
    ],
  },

  url: {
    rules: [
      ['meta[property="og:url"]', contentAttribute],
      ['link[rel="canonical"]', hrefAttribute],
    ],
  },
};
|
||||
|
||||
/**
 * Extract metadata from a document.
 *
 * @param {Document} doc
 *        The document to inspect.
 * @param {string} url
 *        URL of the document; handed to processors through the context.
 * @param {Object} [rules]
 *        Rule sets keyed by metadata field name; defaults to metadataRules.
 *        An entry whose `rules` property is not an array is treated as a
 *        nested group of rule sets and produces a nested result object.
 * @return {Object}
 *         One property per key of the rule sets, holding the extracted value
 *         (undefined when nothing was found) or a nested object for groups.
 */
function getMetadata(doc, url, rules) {
  const metadata = {};
  const context = {url, doc};
  const ruleSet = rules || metadataRules;

  for (const metadataKey of Object.keys(ruleSet)) {
    const metadataRule = ruleSet[metadataKey];

    if (Array.isArray(metadataRule.rules)) {
      const builtRule = buildRuleset(metadataKey, metadataRule.rules, metadataRule.processors);
      metadata[metadataKey] = builtRule(doc, context);
    } else {
      metadata[metadataKey] = getMetadata(doc, url, metadataRule);
    }
  }

  return metadata;
}
|
||||
|
||||
// #################################################################################################
|
||||
// # Fathom dependencies resolved
|
||||
// #################################################################################################
|
||||
|
||||
// const {forEach} = require('wu');
|
||||
// Call fn once with every value obtained by iterating obj, in iteration
// order. The return values of fn are ignored.
function forEach(fn, obj) {
  for (const value of obj) {
    fn(value);
  }
}
|
||||
|
||||
// Return the item of iterable that wins according to isBetter.
//
// by(item) computes the key of an item. isBetter(key, bestKeySoFar) decides
// whether a key beats the best one seen so far; on ties the earlier item is
// kept as long as isBetter is a strict comparison. Throws an Error when the
// iterable yields no items.
function best(iterable, by, isBetter) {
  let bestSoFar, bestKeySoFar;
  let isFirst = true;

  for (const item of iterable) {
    const key = by(item);
    // Test isFirst before consulting isBetter: the first item always wins,
    // and the comparator must never be handed the still-undefined best key.
    if (isFirst || isBetter(key, bestKeySoFar)) {
      bestSoFar = item;
      bestKeySoFar = key;
      isFirst = false;
    }
  }

  if (isFirst) {
    throw new Error('Tried to call best() on empty iterable');
  }
  return bestSoFar;
}
|
||||
|
||||
// const {max} = require('./utils');
|
||||
// Return the item of iterable with the largest key, where by(item) computes
// the key. Without a `by` function the items themselves are compared.
// Throws (via best()) when the iterable is empty.
function max(iterable, by = item => item) {
  return best(iterable, by, (a, b) => a > b);
}
|
||||
|
||||
// #################################################################################################
|
||||
// # Fathom
|
||||
// # https://github.com/mozilla/fathom
|
||||
// # cac59e470816f17fc1efd4a34437b585e3e451cd
|
||||
// #################################################################################################
|
||||
|
||||
// Look key up in map and return its value. When the key is missing, call
// defaultMaker(), store the result under key and return it.
function getDefault(map, key, defaultMaker) {
  if (!map.has(key)) {
    const freshValue = defaultMaker();
    map.set(key, freshValue);
    return freshValue;
  }
  return map.get(key);
}
|
||||
|
||||
|
||||
// Build a filtration network out of the given rules.
//
// The returned object has one method, score(tree), which runs the rules over
// a DOM tree (or subtree) and returns the resulting knowledgebase: a data
// structure holding scores and annotations for interesting elements. This is
// the "rank" half of the rank-and-yank algorithm.
function ruleset(...rules) {
  // Map{someInputFlavor -> [rule, ...]}
  const rulesByInputFlavor = new Map();

  // File every rule under the flavor it takes as input.
  for (const eachRule of rules) {
    const bucket = getDefault(rulesByInputFlavor, eachRule.source.inputFlavor, () => []);
    bucket.push(eachRule);
  }

  function score(tree) {
    const kb = knowledgebase();

    // Work list of [node, flavor] facts still to be digested. It is seeded
    // with the whole DOM, wrapped in a special node, under the flavor 'dom'.
    const pending = [[{tree}, 'dom']];

    // As long as there are new facts, run the rules that consume their flavor
    // to produce even newer facts. The order in which rules run is not
    // guaranteed to be any particular one.
    while (pending.length) {
      const [inNode, inFlavor] = pending.pop();
      const applicableRules = getDefault(rulesByInputFlavor, inFlavor, () => []);

      for (const applicableRule of applicableRules) {
        for (const fact of resultsOf(applicableRule, inNode, inFlavor, kb)) {
          const outNode = kb.nodeForElement(fact.element);

          // The score is multiplied in whether or not this flavor was emitted
          // for the node before. That lets additional rules refine a node's
          // score without rewiring the path of flavors through the ruleset.
          //
          // There is exactly one score per node. That keeps the data, the
          // rankers, the flavor system and the engine simple; anyone needing
          // more score axes can keep them in notes. Rankers therefore return
          // a score multiplier plus zero or more flavors with optional notes.
          // Rankers can never delete facts from the KB, or ordering would
          // start to matter.
          outNode.score *= fact.score;

          // Notes of a given flavor are immutable once laid down: if the node
          // already carries this flavor, leave the existing note alone.
          // Modifying it would make the execution order of same-flavored
          // rules matter; emit a new flavor with a new note for that. Not
          // re-queueing the fact also avoids running the rules against it a
          // second time, which would be redundant for deterministic rankers.
          if (outNode.flavors.has(fact.flavor)) {
            continue;
          }

          outNode.flavors.set(fact.flavor, fact.notes);
          // TODO: better encapsulation rather than indexing explicitly
          kb.indexNodeByFlavor(outNode, fact.flavor);
          pending.push([outNode, fact.flavor]);
        }
      }
    }

    return kb;
  }

  return {score};
}
|
||||
|
||||
|
||||
// Construct a container for storing and querying facts, where a fact has a
// flavor (used to dispatch further rules upon), a corresponding DOM element, a
// score, and some other arbitrary notes opaque to fathom.
//
// The container keeps one "node" per DOM element:
//
//   {element: <someElement>,
//
//    // Global nodewide score. Add custom ones with notes if you want.
//    score: 8,
//
//    // Map of flavor names to notes; a note may be undefined (empty):
//    flavors: Map{'texty' -> {ownText: 'blah', someCustomScore: 10},
//                 'fluffy' -> undefined}}
function knowledgebase() {
  // Map{'texty' -> [NodeA], 'spiffy' -> [NodeA, NodeB]}
  const nodesByFlavor = new Map();
  // Map{<someElement> -> NodeA}
  const nodesByElement = new Map();

  return {
    // Return the "node" (our own data structure that we control) that
    // corresponds to a given DOM element, creating one with score 1 and no
    // flavors if necessary.
    nodeForElement: function (element) {
      return getDefault(nodesByElement,
                        element,
                        () => ({element,
                                score: 1,
                                flavors: new Map()}));
    },

    // Return the highest-scored node of the given flavor, undefined if
    // there is none.
    max: function (flavor) {
      const nodes = nodesByFlavor.get(flavor);
      // nodesOfFlavor() files an empty array under flavors it is asked about,
      // so "no nodes" can show up as an empty list as well as a missing key.
      // Both mean there is no best node; max() would throw on the empty list.
      if (nodes === undefined || nodes.length === 0) {
        return undefined;
      }
      return max(nodes, node => node.score);
    },

    // Let the KB know that a new flavor has been added to an element.
    indexNodeByFlavor: function (node, flavor) {
      getDefault(nodesByFlavor, flavor, () => []).push(node);
    },

    // Return the array of nodes indexed under the given flavor. An empty
    // array is created and stored when the flavor is not known yet.
    nodesOfFlavor: function (flavor) {
      return getDefault(nodesByFlavor, flavor, () => []);
    }
  };
}
|
||||
|
||||
|
||||
// Run a rule (as built by rule()) against a fact and return the facts it
// produces. Rules whose source is a flavor() condition are dispatched to
// resultsOfFlavorRule(); every other rule is treated as a dom() rule.
function resultsOf(rule, node, flavor, kb) {
  // If more types of rule pop up someday, do fancier dispatching here.
  if (rule.source.flavor === 'flavor') {
    return resultsOfFlavorRule(rule, node, flavor);
  }
  return resultsOfDomRule(rule, node, kb);
}
|
||||
|
||||
|
||||
// Query the DOM tree stored on the special root "dom" fact with the rule's
// selector, run the rule's ranker on the node of every match and yield the
// resulting facts. A fact without an element is attributed to the matched
// element. Throws when a fact has no flavor.
function *resultsOfDomRule(rule, specialDomNode, kb) {
  // The tree lives on the special "tree" property of the starting node.
  const tree = specialDomNode.tree;
  const matches = tree.querySelectorAll(rule.source.selector);

  // matches is a NodeList, which doesn't conform to the iterator protocol,
  // so walk it by index.
  let index = 0;
  while (index < matches.length) {
    const element = matches[index];
    const rankerResult = rule.ranker(kb.nodeForElement(element));

    for (const fact of explicitFacts(rankerResult)) {
      if (fact.element === undefined) {
        fact.element = element;
      }
      if (fact.flavor === undefined) {
        throw new Error('Rankers of dom() rules must return a flavor in each fact. Otherwise, there is no way for that fact to be used later.');
      }
      yield fact;
    }

    index += 1;
  }
}
|
||||
|
||||
|
||||
// Run the rule's ranker on a node of the knowledgebase and yield the facts it
// returns, filling in what the ranker left out: a fact without an element is
// assumed to be about the node passed in, and a fact without a flavor keeps
// the input flavor.
function *resultsOfFlavorRule(rule, node, flavor) {
  const rankerResult = rule.ranker(node);

  for (const fact of explicitFacts(rankerResult)) {
    if (fact.element === undefined) {
      fact.element = node.element;
    }
    if (fact.flavor === undefined) {
      fact.flavor = flavor;
    }
    yield fact;
  }
}
|
||||
|
||||
|
||||
// Normalize the possibly abbreviated output of a ranker function into a
// sequence of facts that each have a defined score.
//
// A ranker may return undefined (meaning "no facts"), a single fact, or an
// array of facts. Facts lacking a score get a score of 1, set on the fact
// object itself.
function *explicitFacts(rankerResult) {
  let facts;
  if (rankerResult === undefined) {
    facts = [];
  } else if (Array.isArray(rankerResult)) {
    facts = rankerResult;
  } else {
    facts = [rankerResult];
  }

  for (const fact of facts) {
    if (fact.score === undefined) {
      fact.score = 1;
    }
    yield fact;
  }
}
|
||||
|
||||
|
||||
// TODO: For the moment, a lot of responsibility is on the rankers to return a
|
||||
// pretty big data structure of up to 4 properties. This is a bit verbose for
|
||||
// an arrow function (as I hope we can use most of the time) and the usual case
|
||||
// will probably be returning just a score multiplier. Make that case more
|
||||
// concise.
|
||||
|
||||
// TODO: It is likely that rankers should receive the notes of their input type
|
||||
// as a 2nd arg, for brevity.
|
||||
|
||||
|
||||
// Build a condition that finds its matches by running a DOM selector against
// the original DOM tree.
//
// For consistency, Nodes will still be delivered to the transformers, but
// they'll have empty flavors and score = 1.
//
// Condition constructors such as dom() and flavor() only build plain,
// introspectable objects for the query engine to read; they never run the
// query themselves. That leaves the query planner free to be smarter than
// them and choose indices based on all conditions together. (For starters
// that will probably be a heap per dimension's score and a hash by flavor
// name.) Someday, fancy things like this may be possible:
// rule(and(tag('p'), klass('snork')), ...)
function dom(selector) {
  const condition = {
    flavor: 'dom',
    inputFlavor: 'dom',
    selector: selector
  };
  return condition;
}
|
||||
|
||||
|
||||
// Build a condition that selects nodes of the knowledgebase by their flavor.
// The result is a plain object naming the flavor the rule takes as input.
function flavor(inputFlavor) {
  const condition = {
    flavor: 'flavor',
    inputFlavor: inputFlavor
  };
  return condition;
}
|
||||
|
||||
|
||||
// Pair a condition (as built by dom() or flavor()) with the ranker function
// to run on whatever the condition selects.
function rule(source, ranker) {
  const built = {
    source: source,
    ranker: ranker
  };
  return built;
}
|
@ -29,4 +29,5 @@ EXTRA_JS_MODULES += [
|
||||
'Snackbars.jsm',
|
||||
'SSLExceptions.jsm',
|
||||
'TabMirror.jsm',
|
||||
'WebsiteMetadata.jsm'
|
||||
]
|
||||
|
Loading…
Reference in New Issue
Block a user