2016-11-11 10:59:42 +03:00
|
|
|
const extend = require('extend');
|
|
|
|
|
2016-11-24 16:46:39 +03:00
|
|
|
/**
 * Requests describe a resource to capture and process as well as the context for that processing.
 *
 * Transitivity
 *  * none - Only process this exact resource
 *  * normal - Process this resource if not previously seen and do normal processing on non-roots and roots
 *  * forceNone - Process this resource and force processing on non-roots and no processing of roots
 *  * forceNormal - Force processing of children plus normal processing of roots
 *  * forceForce - Force processing of children and roots. Decays to forceNormal on roots
 * Basically, once you are forcing, force transitivity for all children, but still allow control over transitivity
 * when traversing to a root. When traversing with forceForce, queued roots end up as forceNormal. Similarly,
 * when traversing with forceNormal, queued roots end up as normal.
 *
 * Fetch behavior
 *  * none - Only use existing content. Skip this resource if we don't already have it
 *  * normal - Use existing content if we have it and it matches. Otherwise, get content from original source
 *  * force - Ignore existing content and get content from original source
 *
 * Several methods rely on properties that are not set by the constructor and must be attached by
 * the caller before use: `crawler` (an object with a `queue(request)` method) for the queue*
 * methods, and `document` (with a `_metadata.links` object) for the link* / self-link methods.
 */
class Request {
  /**
   * @param {string} type - kind of resource this request targets (e.g. 'repo', 'issues')
   * @param {string} url - location of the resource
   * @param {Object} [context=null] - processing context; a fresh empty object is used when falsy
   */
  constructor(type, url, context = null) {
    this.type = type;
    this.url = url;
    this.transitivity = 'normal';
    this.fetch = 'normal';
    this.context = context || {};
    this.promises = [];
  }

  /**
   * Remember one or more promises (or any values) associated with this request.
   *
   * @param {*|Array} promises - a single value or an array of values; falsy input is ignored
   */
  track(promises) {
    if (!promises) {
      return;
    }
    if (Array.isArray(promises)) {
      this.promises.push(...promises);
    } else {
      this.promises.push(promises);
    }
  }

  /**
   * Merge the given data into this request's `meta` object (creating it if needed).
   * A new object is assigned each time; the previous `meta` object is not mutated.
   *
   * @param {Object} data - properties to add to (or override in) the metadata
   * @returns {Request} this request, for chaining
   */
  addMeta(data) {
    this.meta = extend({}, this.meta, data);
    return this;
  }

  /**
   * Add a self link for a root resource, i.e. one whose URN hangs directly off of 'urn:'.
   */
  addRootSelfLink() {
    this.addSelfLink('id', 'urn:');
  }

  /**
   * Add a 'self' link of the form `<qualifier>:<type>:<document[key]>` to the document.
   *
   * @param {string} [key='id'] - name of the document property that identifies the resource
   * @param {string} [base=null] - URN prefix to use; defaults to the context's qualifier
   * @throws {Error} if neither base nor the context qualifier is a non-empty string
   */
  addSelfLink(key = 'id', base = null) {
    let qualifier = base ? base : this.context.qualifier;
    if (!qualifier || (typeof qualifier !== 'string')) {
      throw new Error('Need something on which to base the self link URN');
    }
    // Ensure exactly one ':' separates the qualifier from the type segment
    qualifier = qualifier.endsWith(':') ? qualifier : qualifier + ':';
    this.linkSelf('self', `${qualifier}${this.type}:${this.document[key]}`);
  }

  /**
   * Record a link of type 'self' in the document's metadata links.
   *
   * @param {string} name - name of the link
   * @param {string|string[]} value - stored under 'hrefs' when an array, otherwise under 'href'
   */
  linkSelf(name, value) {
    const links = this.document._metadata.links;
    const key = Array.isArray(value) ? 'hrefs' : 'href';
    links[name] = { type: 'self', [key]: value };
  }

  /**
   * Record a link of type 'siblings' in the document's metadata links.
   *
   * @param {string} name - name of the link
   * @param {string} href - target of the link
   */
  linkSiblings(name, href) {
    const links = this.document._metadata.links;
    links[name] = { type: 'siblings', href: href };
  }

  /**
   * Queue a new request with the given context. The new request gets the default
   * transitivity and this request's fetch behavior.
   *
   * @param {string} type - type of the resource to queue
   * @param {string} url - location of the resource to queue
   * @param {Object} [context] - context for the new request; an empty object is used when omitted
   */
  queue(type, url, context) {
    this._queueRequest(type, url, context);
  }

  /**
   * Queue a request for a single root resource. Nothing is queued if this request's
   * transitivity does not allow traversing to roots.
   *
   * @param {string} type - type of the root resource
   * @param {string} url - location of the root resource
   */
  queueRoot(type, url) {
    const transitivity = this._getRootTransitivity();
    if (!transitivity) {
      return;
    }
    // set the new request's transitivity to the next value
    this._queueRequest(type, url, { qualifier: 'urn:' }, transitivity);
  }

  /**
   * Queue a request for a collection of root resources. Nothing is queued if this request's
   * transitivity does not allow traversing to roots.
   *
   * @param {string} type - type of the collection
   * @param {string} url - location of the collection
   * @param {Object} [context=null] - extra context merged over a copy of this request's context
   */
  queueRoots(type, url, context = null) {
    const transitivity = this._getRootTransitivity();
    if (!transitivity) {
      return;
    }
    const newContext = extend({}, this.context, context);
    newContext.qualifier = this.document._metadata.links.self.href;
    // carry over this request's transitivity as we are queuing a collection
    this._queueRequest(type, url, newContext, this.transitivity);
  }

  /**
   * Queue an element found in a collection, as a root or as a child depending on its type.
   *
   * @param {string} type - type of the element
   * @param {string} url - location of the element
   * @param {string} qualifier - URN qualifier for the element; only used for non-root types
   */
  queueCollectionElement(type, url, qualifier) {
    if (this.isRootType(type)) {
      return this.queueRoot(type, url);
    }
    return this.queueChild(type, url, qualifier);
  }

  /**
   * Queue a request for a single child resource. Nothing is queued if this request's
   * transitivity does not allow traversing to children.
   *
   * @param {string} type - type of the child resource
   * @param {string} url - location of the child resource
   * @param {string} qualifier - URN qualifier to set in the child's context
   */
  queueChild(type, url, qualifier) {
    const transitivity = this._getChildTransitivity();
    if (!transitivity) {
      return;
    }
    // Give the child its own shallow copy of the context. Assigning the qualifier directly on
    // this.context would clobber this request's qualifier and make all children share one object.
    const newContext = Object.assign({}, this.context, { qualifier: qualifier });
    this._queueRequest(type, url, newContext, transitivity);
  }

  /**
   * Queue a request for a collection of child resources. Nothing is queued if this request's
   * transitivity does not allow traversing to children.
   *
   * @param {string} type - type of the collection
   * @param {string} url - location of the collection
   * @param {Object} [context=null] - extra context merged over a copy of this request's context
   */
  queueChildren(type, url, context = null) {
    const transitivity = this._getChildTransitivity();
    if (!transitivity) {
      return;
    }
    const newContext = extend({}, this.context, context);
    newContext.qualifier = this.document._metadata.links.self.href;
    // carry over this request's transitivity as we are queuing a collection
    this._queueRequest(type, url, newContext, this.transitivity);
  }

  /**
   * Build a new request that inherits this request's fetch behavior, hand it to the crawler's
   * queue and track whatever the crawler returns.
   *
   * @param {string} type - type of the new request
   * @param {string} url - location of the new request
   * @param {Object} [context] - context for the new request; an empty object is used when falsy
   * @param {string} [transitivity] - transitivity for the new request; the constructor default is kept when omitted
   */
  _queueRequest(type, url, context, transitivity) {
    const newRequest = new Request(type, url, context);
    if (transitivity) {
      newRequest.transitivity = transitivity;
    }
    newRequest.fetch = this.fetch;
    this.track(this.crawler.queue(newRequest));
  }

  /**
   * @returns {string|undefined} the transitivity to give a root queued from this request, or
   * undefined if this request's transitivity has no entry in the table (roots are then not queued)
   */
  _getRootTransitivity() {
    return { normal: 'normal', forceNormal: 'normal', forceForce: 'forceNormal' }[this.transitivity];
  }

  /**
   * @returns {string|undefined} the transitivity to give a child queued from this request, or
   * undefined if this request's transitivity has no entry in the table (children are then not queued)
   */
  _getChildTransitivity() {
    return { normal: 'normal', forceNone: 'forceNone', forceNormal: 'forceNormal', forceForce: 'forceNormal' }[this.transitivity];
  }

  /**
   * @returns {boolean} true if the fetch behavior is 'none'
   */
  isReprocessing() {
    return this.fetch === 'none';
  }

  /**
   * @returns {boolean} true if the transitivity is one of the 'force*' values
   */
  isForced() {
    return this.transitivity.startsWith('force');
  }

  /**
   * @returns {boolean} true if the fetch behavior is 'force'
   */
  isForcedFetch() {
    return this.fetch === 'force';
  }

  /**
   * Mark this request as one to skip. Does nothing if the request is already marked
   * as skip or requeue. An outcome or message that is already set is kept.
   *
   * @param {string} outcome - outcome to record if none is set yet
   * @param {string} message - message to record if none is set yet
   * @returns {Request} this request, for chaining
   */
  markSkip(outcome, message) {
    if (this.shouldSkip()) {
      return this;
    }
    this.processControl = 'skip';
    this.outcome = this.outcome || outcome;
    this.message = this.message || message;
    return this;
  }

  /**
   * Mark this request as one to requeue. Does nothing if the request is already marked
   * as requeue. An outcome or message that is already set is kept.
   *
   * @param {string} outcome - outcome to record if none is set yet
   * @param {string} message - message to record if none is set yet
   * @returns {Request} this request, for chaining
   */
  markRequeue(outcome, message) {
    if (this.shouldRequeue()) {
      return this;
    }
    this.processControl = 'requeue';
    this.outcome = this.outcome || outcome;
    this.message = this.message || message;
    return this;
  }

  /**
   * @returns {boolean} true if this request is marked as skip or requeue
   */
  shouldSkip() {
    return this.processControl === 'skip' || this.processControl === 'requeue';
  }

  /**
   * Push `nextRequestTime` out to the given time. An existing later time is never shortened.
   *
   * @param {number} time - timestamp to delay until
   */
  delayUntil(time) {
    if (!this.nextRequestTime || this.nextRequestTime < time) {
      this.nextRequestTime = time;
    }
  }

  /**
   * Delay by the given amount relative to the current time (see delayUntil).
   *
   * @param {number} [milliseconds=2000] - how long to delay, measured from Date.now()
   */
  delay(milliseconds = 2000) {
    this.delayUntil(Date.now() + milliseconds);
  }

  /**
   * @returns {boolean} true if this request is marked as requeue
   */
  shouldRequeue() {
    return this.processControl === 'requeue';
  }

  /**
   * Common handling for event documents: add self, actor, repo and org links and queue the
   * actor, repo and org as roots. The repo and org are only linked and queued when the
   * document carries them; the self URN is based on the repo when it has an id and on the
   * org otherwise.
   *
   * @param {*} references - currently unused
   * @returns {*} the document's payload
   */
  eventHelper(references) {
    const document = this.document;
    // TODO understand if the actor is typically the same as the creator or pusher in the payload
    const repo = document.repo ? document.repo.id : null;
    const urn = repo ? `urn:repo:${repo}` : `urn:org:${document.org.id}`;
    this.linkSelf('self', `${urn}:${this.type}:${document.id}`);
    this.linkSelf('actor', `urn:login:${document.actor.id}`);
    // The repo and org are optional (see the urn computation above), so guard their use
    if (document.repo) {
      this.linkSelf('repo', `urn:repo:${document.repo.id}`);
    }
    if (document.org) {
      this.linkSelf('org', `urn:org:${document.org.id}`);
    }
    this.queueRoot('login', document.actor.url);
    if (document.repo) {
      this.queueRoot('repo', document.repo.url);
    }
    if (document.org) {
      this.queueRoot('org', document.org.url);
    }
    return document.payload;
  }

  /**
   * @returns {string|undefined} the element type for this request's type when it is a known
   * collection type, otherwise undefined
   */
  getCollectionType() {
    const collections = {
      orgs: 'org', repos: 'repo', issues: 'issue', issue_comments: 'issue_comment', commits: 'commit', teams: 'team', users: 'user', team_members: 'user', team_repos: 'repo', collaborators: 'user', contributors: 'user', subscribers: 'user'
    };
    return collections[this.type];
  }

  /**
   * @param {string} type - resource type to test
   * @returns {boolean} true if the type is one of the root types (or a collection of them)
   */
  isRootType(type) {
    const roots = new Set(['orgs', 'org', 'repos', 'repo', 'teams', 'team', 'users', 'user']);
    return roots.has(type);
  }
}
|
|
|
|
|
|
|
|
// Expose the Request class as this module's sole (CommonJS) export.
module.exports = Request;
|