// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
2016-12-01 04:54:48 +03:00
|
|
|
const Policy = require('./traversalPolicy');
|
2016-11-11 10:59:42 +03:00
|
|
|
|
2016-11-24 16:46:39 +03:00
|
|
|
/**
 * Requests describe a resource to capture and process as well as the context for that processing.
 */
|
2016-11-11 10:59:42 +03:00
|
|
|
class Request {
  /**
   * Build a request for a resource.
   *
   * @param {string} type - kind of resource; interpolated into URNs and the string forms of the request
   * @param {string} url - location of the resource
   * @param {object} [context=null] - processing context; any falsy value is replaced by a fresh empty object
   */
  constructor(type, url, context = null) {
    this.type = type;
    this.url = url;
    this.context = context || {};
    this.policy = Policy.default(type);
  }

  /**
   * Turn a plain object (e.g., one that lost its prototype) into a Request in place.
   * A policy given as a spec string is resolved and handed to Policy.adopt; an object
   * without a policy receives the default policy for its own type.
   *
   * @param {object} object - the object to adopt; it is mutated
   * @returns {object} the same object, now backed by Request.prototype
   */
  static adopt(object) {
    if (Object.getPrototypeOf(object) !== Request.prototype) {
      Object.setPrototypeOf(object, Request.prototype);
    }
    if (object.policy) {
      object.policy = Request._getResolvedPolicy(object);
      Policy.adopt(object.policy);
    } else {
      // This is a static method, so the type has to come from the adopted object (not `this`),
      // and the default policy has to actually be stored on it.
      object.policy = Policy.default(object.type);
    }
    return object;
  }

  /**
   * Resolve the policy of the given request without modifying the request.
   *
   * @param {Request} request - request whose `policy` is either a policy object or a spec string
   * @returns {*} the policy as-is when it is not a string, otherwise whatever Policy.getPolicy
   *   returns for the (possibly completed) spec
   */
  static _getResolvedPolicy(request) {
    const policyOrSpec = request.policy;
    if (typeof policyOrSpec !== 'string') {
      return policyOrSpec;
    }
    // if the policy spec does not include a map, default to using the type of the request as the map name
    const spec = policyOrSpec.includes(':') ? policyOrSpec : `${policyOrSpec}:${request.type}`;
    return Policy.getPolicy(spec);
  }

  /**
   * Setup some internal context and open this request for handling: remember the crawler,
   * stamp the start time, record this request in the context history and resolve the policy.
   *
   * @param {object} crawler - the crawler handling this request
   * @returns {Request} this request
   */
  open(crawler) {
    this.crawler = crawler;
    this.start = Date.now();
    this.context = this.context || {};
    this._addHistory();
    this._resolvePolicy();
    return this;
  }

  /**
   * Replace a policy spec string with the resolved policy. A request that has no policy,
   * or whose spec cannot be resolved, is handed to the crawler's dead queue.
   *
   * @returns {*} the result of crawler.queueDead when the request is dead, otherwise undefined
   */
  _resolvePolicy() {
    if (!this.policy) {
      return this.crawler.queueDead(this);
    }
    // Non-string policies come back unchanged from the shared resolver.
    const resolved = Request._getResolvedPolicy(this);
    if (!resolved) {
      return this.crawler.queueDead(this);
    }
    this.policy = resolved;
  }

  /**
   * Append this request's string form to the history kept in the context, creating the history if needed.
   */
  _addHistory() {
    this.context.history = this.context.history || [];
    this.context.history.push(this.toString());
  }

  /**
   * Tell whether the given request already appears in this request's history.
   *
   * @param {Request} request - the request to look for (compared by its toString() form)
   * @returns {boolean} true if the history contains the request
   */
  hasSeen(request) {
    const history = this.context.history || [];
    return history.includes(request.toString());
  }

  /**
   * @returns {Array} the promises tracked so far, or an empty array if none were tracked
   */
  getTrackedPromises() {
    return this.promises || [];
  }

  /**
   * Remember one or more promises on this request.
   *
   * @param {*} promises - a single promise or an array of promises; falsy values are ignored
   * @returns {Request} this request
   */
  track(promises) {
    if (!promises) {
      return this;
    }
    this.promises = this.promises || [];
    if (Array.isArray(promises)) {
      this.promises.push(...promises);
    } else {
      this.promises.push(promises);
    }
    return this;
  }

  /**
   * Merge the given data into this request's metadata. A new meta object is created each time.
   *
   * @param {object} data - properties to add; they win over existing properties of the same name
   * @returns {Request} this request
   */
  addMeta(data) {
    this.meta = Object.assign({}, this.meta, data);
    return this;
  }

  /**
   * Add a 'self' link to the document using the root qualifier.
   */
  addRootSelfLink() {
    this.linkResource('self', this.getRootQualifier());
  }

  /**
   * Add a 'self' link to the document using the child qualifier.
   *
   * @param {string} [key='id'] - name of the document property that identifies the document
   */
  addSelfLink(key = 'id') {
    this.linkResource('self', this.getChildQualifier(key));
  }

  /**
   * @returns {string} a URN of the form `urn:<type>:<document.id>`
   */
  getRootQualifier() {
    return `urn:${this.type}:${this.document.id}`;
  }

  /**
   * Build a URN for the document relative to the qualifier found in the context.
   *
   * @param {string} [key='id'] - name of the document property that identifies the document
   * @returns {string} `<qualifier>:<type>:<document[key]>`
   * @throws {Error} if the context has no non-empty string qualifier
   */
  getChildQualifier(key = 'id') {
    const base = this.context.qualifier;
    if (!base || typeof base !== 'string') {
      throw new Error('Need something on which to base the self link URN');
    }
    const prefix = base.endsWith(':') ? base : `${base}:`;
    return `${prefix}${this.type}:${this.document[key]}`;
  }

  /**
   * Set a 'resource' link in the document's metadata, replacing any link of the same name.
   *
   * @param {string} name - name of the link
   * @param {string|Array} urn - target(s) of the link; stored under `hrefs` for an array, `href` otherwise
   */
  linkResource(name, urn) {
    const links = this.document._metadata.links;
    const key = Array.isArray(urn) ? 'hrefs' : 'href';
    links[name] = { [key]: urn, type: 'resource' };
  }

  /**
   * Set the 'siblings' collection link in the document's metadata.
   *
   * @param {*} href - target of the link
   */
  linkSiblings(href) {
    const links = this.document._metadata.links;
    links.siblings = { href, type: 'collection' };
  }

  /**
   * Set a 'collection' link in the document's metadata.
   *
   * @param {string} name - name of the link
   * @param {*} href - target of the link
   */
  linkCollection(name, href) {
    const links = this.document._metadata.links;
    links[name] = { href, type: 'collection' };
  }

  /**
   * Set a 'relation' link in the document's metadata.
   *
   * @param {string} name - name of the link
   * @param {*} href - target of the link
   */
  linkRelation(name, href) {
    const links = this.document._metadata.links;
    links[name] = { href, type: 'relation' };
  }

  /**
   * Delegate to the current policy's getNextPolicy.
   *
   * @param {string} name - passed through to the policy
   * @returns {*} whatever the policy returns
   */
  getNextPolicy(name) {
    return this.policy.getNextPolicy(name);
  }

  /**
   * Queue the given requests on the crawler, leaving out those already present in this
   * request's history, and track whatever the crawler's queue call returns.
   *
   * @param {Request|Request[]} requests - one request or an array of requests
   * @param {string} [name=null] - passed through to crawler.queue
   */
  queueRequests(requests, name = null) {
    const candidates = Array.isArray(requests) ? requests : [requests];
    const toQueue = candidates.filter(request => !this.hasSeen(request));
    this.track(this.crawler.queue(toQueue, name));
  }

  /**
   * Create and queue a new request derived from this one. Nothing happens if no policy is given.
   *
   * @param {string} type - type of the new request
   * @param {string} url - url of the new request
   * @param {*} policy - policy (or policy spec) for the new request
   * @param {object} [context=null] - extra context layered over a shallow copy of this request's context
   * @param {boolean} [pruneRelation=true] - whether to remove `relation` from the new request's context
   */
  queue(type, url, policy, context = null, pruneRelation = true) {
    if (!policy) {
      return;
    }
    const childContext = Object.assign({}, this.context, context);
    childContext.qualifier = childContext.qualifier || 'urn:';
    const newRequest = new Request(type, url, childContext);
    newRequest.policy = policy;
    // relations are not transitive so ensure any relation is stripped off
    if (pruneRelation) {
      delete newRequest.context.relation;
    }
    this.queueRequests(newRequest);
  }

  /**
   * Hand this request to the crawler's dead queue and mark it as skipped.
   *
   * @param {string} outcome - outcome to record
   * @param {string} message - message to record
   * @returns {Request} this request
   */
  markDead(outcome, message) {
    this.crawler.queueDead(this);
    return this.markSkip(outcome, message);
  }

  /**
   * Mark this request to be skipped.
   *
   * @param {string} outcome - outcome to record
   * @param {string} message - message to record
   * @returns {Request} this request
   */
  markSkip(outcome, message) {
    return this._cutShort(outcome, message, 'skip');
  }

  /**
   * Mark this request to be requeued.
   *
   * @param {string} outcome - outcome to record
   * @param {string} message - message to record
   * @returns {Request} this request
   */
  markRequeue(outcome, message) {
    return this._cutShort(outcome, message, 'requeue');
  }

  /**
   * Record that processing of this request is being cut short.
   *
   * @param {string} outcome - outcome to record
   * @param {string} message - message to record
   * @param {string} reason - value stored in `processControl` ('skip' or 'requeue' from the callers in this class)
   * @returns {Request} this request
   */
  _cutShort(outcome, message, reason) {
    // if we are already skipping/requeuing, keep the original as the official outcome but log this new one so its not missed
    if (this.shouldSkip()) {
      this._log('verbose', `Redundant ${reason}: ${outcome}, ${message}`, this.meta);
      return this;
    }
    this.processControl = reason;
    // overwrite previous outcomes if this is an error and the current is not.
    if (outcome === 'Error' && this.outcome !== 'Error') {
      this.outcome = outcome;
      this.message = message;
    } else {
      this.outcome = this.outcome || outcome;
      this.message = this.message || message;
    }
    return this;
  }

  /**
   * Explicitly request that the document be saved.
   *
   * @returns {Request} this request
   */
  markSave() {
    this.save = true;
    return this;
  }

  /**
   * Explicitly request that the document not be saved.
   *
   * @returns {Request} this request
   */
  markNoSave() {
    this.save = false;
    return this;
  }

  /**
   * Decide whether the document should be saved. There must be a document, and either saving was
   * explicitly requested, or it was not explicitly declined and the content origin is not 'cacheOfOrigin'.
   *
   * @returns {boolean} true if the document should be saved
   */
  shouldSave() {
    // Coerce so callers get a real boolean rather than the document object or undefined.
    return Boolean(this.document) &&
      (this.save === true || (this.save !== false && this.contentOrigin !== 'cacheOfOrigin'));
  }

  /**
   * @returns {boolean} true if this request was marked to be skipped or requeued
   */
  shouldSkip() {
    return this.processControl === 'skip' || this.processControl === 'requeue';
  }

  /**
   * Push the next request time out to the given time. An earlier time never shortens an existing delay.
   *
   * @param {number} time - the time to wait for, comparable with Date.now() values
   */
  delayUntil(time) {
    if (!this.nextRequestTime || this.nextRequestTime < time) {
      this.nextRequestTime = time;
    }
  }

  /**
   * Delay the next request by the given amount of time from now.
   *
   * @param {number} [milliseconds=2000] - how long to delay
   */
  delay(milliseconds = 2000) {
    this.delayUntil(Date.now() + milliseconds);
  }

  /**
   * @returns {boolean} true if this request was marked to be requeued
   */
  shouldRequeue() {
    return this.processControl === 'requeue';
  }

  /**
   * Create a new request data structure that has just the things we should queue:
   * type, url, context (shared, not copied), attempt count, policy and, if present, payload.
   *
   * @returns {Request} the new request
   */
  createRequeuable() {
    const queuable = new Request(this.type, this.url, this.context);
    queuable.attemptCount = this.attemptCount;
    queuable.policy = this.policy;
    if (this.payload) {
      queuable.payload = this.payload;
    }
    return queuable;
  }

  /**
   * @returns {string} `<type>@<url>`
   */
  toString() {
    return `${this.type}@${this.url}`;
  }

  /**
   * String form that also distinguishes requests by policy.
   *
   * @returns {string} `<type>@<url>:<policy short form>`; the short form is 'NN' when there is
   *   no policy or the policy spec cannot be resolved
   */
  toUniqueString() {
    // A spec string may resolve to nothing; treat that like a missing policy instead of throwing.
    const resolved = this.policy ? Request._getResolvedPolicy(this) : null;
    const policyName = resolved ? resolved.getShortForm() : 'NN';
    return `${this.type}@${this.url}:${policyName}`;
  }

  /**
   * Log through the crawler's logger. Nothing is logged if no crawler has been set on this request.
   *
   * @param {string} level - log level
   * @param {string} message - log message
   * @param {object} [meta=null] - additional data passed to the logger
   */
  _log(level, message, meta = null) {
    if (this.crawler) {
      this.crawler.logger.log(level, message, meta);
    }
  }
}
|
|
|
|
|
|
|
|
module.exports = Request;
|