ghcrawler/lib/request.js

229 строки
7.2 KiB
JavaScript
Исходник Обычный вид История

2016-11-11 10:59:42 +03:00
const extend = require('extend');
/**
Requests describe a resource to capture and process as well as the context for that processing.
Transitivity
* none - Only process this exact resource
* normal - Process this resource if not previously seen and do normal processing on non-roots and roots
* forceNone - Process this resource and force processing on non-roots and no processing of roots
* forceNormal - Force processing of children plus normal processing of roots
* forceForce - Force processing of children and roots. Decays to forceNormal on roots
Basically, once you are forcing, force transitivity for all children, but still allow control over transitivity
when traversing to a root. When traversing with forceForce, queued roots end up as forceNormal. Similarly,
when traversing with forceNormal, queued roots end up as normal.
Fetch behavior
* none - Only use existing content. Skip this resource if we don't already have it
* normal - Use existing content if we have it and it matches. Otherwise, get content from original source
* force - Ignore existing content and get content from original source
*/
2016-11-11 10:59:42 +03:00
/**
 * A unit of crawl work: the resource to capture (type + url), the context in
 * which to process it, and the transitivity/fetch policies that govern how the
 * resource and the resources it links to are handled.
 *
 * Several members are attached from outside this class and are only read here:
 * `crawler` (must expose `queue(request)`), `document` (the fetched payload,
 * expected to carry `_metadata.links`), and `meta`.
 */
class Request {
  /**
   * @param {string} type - Kind of resource being requested (e.g. 'repo', 'issues').
   * @param {string} url - Location of the resource.
   * @param {Object} [context=null] - Processing context; defaults to an empty object.
   */
  constructor(type, url, context = null) {
    this.type = type;
    this.url = url;
    this.transitivity = 'normal';
    this.fetch = 'normal';
    this.context = context || {};
    this.promises = [];
  }

  /**
   * Remember one or more promises associated with this request.
   *
   * @param {*|Array} promises - A single promise or an array of them. Falsy values are ignored.
   */
  track(promises) {
    if (!promises) {
      return;
    }
    if (Array.isArray(promises)) {
      this.promises.push(...promises);
    } else {
      this.promises.push(promises);
    }
  }

  /**
   * Merge the given data into this request's `meta`, replacing `meta` with a new object.
   *
   * @param {Object} data - Properties to merge in.
   * @returns {Request} this, for chaining.
   */
  addMeta(data) {
    this.meta = extend({}, this.meta, data);
    return this;
  }

  /** Add a 'self' link whose URN is based directly on the 'urn:' prefix. */
  addRootSelfLink() {
    this.addSelfLink('id', 'urn:');
  }

  /**
   * Add a 'self' link of the form `<qualifier>:<type>:<document[key]>`.
   *
   * @param {string} [key='id'] - Document property that supplies the identifier.
   * @param {string} [base=null] - URN prefix; falls back to `context.qualifier` when falsy.
   * @throws {Error} If neither `base` nor `context.qualifier` is a non-empty string.
   */
  addSelfLink(key = 'id', base = null) {
    const candidate = base || this.context.qualifier;
    if (!candidate || (typeof candidate !== 'string')) {
      throw new Error('Need something on which to base the self link URN');
    }
    // Normalize so that exactly one ':' separates the qualifier from the type.
    const qualifier = candidate.endsWith(':') ? candidate : candidate + ':';
    this.linkSelf('self', `${qualifier}${this.type}:${this.document[key]}`);
  }

  /**
   * Record a link of type 'self' in the document's metadata.
   *
   * @param {string} name - Link name.
   * @param {string|string[]} value - Stored under `href`, or `hrefs` when an array.
   */
  linkSelf(name, value) {
    const links = this.document._metadata.links;
    const key = Array.isArray(value) ? 'hrefs' : 'href';
    links[name] = { type: 'self' };
    links[name][key] = value;
  }

  /**
   * Record a link of type 'siblings' in the document's metadata.
   *
   * @param {string} name - Link name.
   * @param {string} href - Link target.
   */
  linkSiblings(name, href) {
    const links = this.document._metadata.links;
    links[name] = { type: 'siblings', href: href };
  }

  /**
   * Queue a new request that inherits this request's fetch policy and keeps the
   * default transitivity.
   *
   * @param {string} type - Type of the new request.
   * @param {string} url - Url of the new request.
   * @param {Object} [context] - Context for the new request; an empty object when omitted.
   */
  queue(type, url, context) {
    this._enqueue(type, url, context);
  }

  /**
   * Queue a request for a root resource, unless the current transitivity does
   * not allow traversing to roots.
   *
   * @param {string} type - Type of the root.
   * @param {string} url - Url of the root.
   */
  queueRoot(type, url) {
    const transitivity = this._getRootTransitivity();
    if (!transitivity) {
      return;
    }
    // The new request's transitivity is the next (decayed) value.
    this._enqueue(type, url, { qualifier: 'urn:' }, transitivity);
  }

  /**
   * Queue a request for a collection of roots, unless the current transitivity
   * does not allow traversing to roots.
   *
   * @param {string} type - Type of the collection.
   * @param {string} url - Url of the collection.
   * @param {Object} [context=null] - Extra context merged over this request's context.
   */
  queueRoots(type, url, context = null) {
    if (!this._getRootTransitivity()) {
      return;
    }
    const newContext = extend({}, this.context, context);
    newContext.qualifier = this.document._metadata.links.self.href;
    // Carry over this request's transitivity as we are queuing a collection.
    this._enqueue(type, url, newContext, this.transitivity);
  }

  /**
   * Queue an element of a collection as a root or as a child depending on its type.
   *
   * @param {string} type - Type of the element.
   * @param {string} url - Url of the element.
   * @param {string} qualifier - Qualifier used when the element is queued as a child.
   */
  queueCollectionElement(type, url, qualifier) {
    if (this.isRootType(type)) {
      return this.queueRoot(type, url);
    }
    return this.queueChild(type, url, qualifier);
  }

  /**
   * Queue a request for a child resource, unless the current transitivity does
   * not allow traversing to children.
   *
   * @param {string} type - Type of the child.
   * @param {string} url - Url of the child.
   * @param {string} qualifier - Qualifier placed in the child's context.
   */
  queueChild(type, url, qualifier) {
    const transitivity = this._getChildTransitivity();
    if (!transitivity) {
      return;
    }
    // Shallow-copy the context so setting the child's qualifier neither mutates
    // this request's context nor is shared between sibling children.
    const newContext = Object.assign({}, this.context, { qualifier: qualifier });
    this._enqueue(type, url, newContext, transitivity);
  }

  /**
   * Queue a request for a collection of children, unless the current
   * transitivity does not allow traversing to children.
   *
   * @param {string} type - Type of the collection.
   * @param {string} url - Url of the collection.
   * @param {Object} [context=null] - Extra context merged over this request's context.
   */
  queueChildren(type, url, context = null) {
    if (!this._getChildTransitivity()) {
      return;
    }
    const newContext = extend({}, this.context, context);
    newContext.qualifier = this.document._metadata.links.self.href;
    // Carry over this request's transitivity as we are queuing a collection.
    this._enqueue(type, url, newContext, this.transitivity);
  }

  /**
   * Build a new request, hand it to the crawler and track the result.
   *
   * @param {string} type - Type of the new request.
   * @param {string} url - Url of the new request.
   * @param {Object} [context] - Context of the new request; an empty object when falsy.
   * @param {string} [transitivity] - Transitivity to set; the default is kept when omitted.
   */
  _enqueue(type, url, context, transitivity) {
    const newRequest = new Request(type, url, context);
    if (transitivity) {
      newRequest.transitivity = transitivity;
    }
    newRequest.fetch = this.fetch;
    this.track(this.crawler.queue(newRequest));
  }

  /**
   * @returns {string|undefined} Transitivity to give a queued root, or undefined
   *   when the current transitivity has no entry (roots are then not queued).
   */
  _getRootTransitivity() {
    return { normal: 'normal', forceNormal: 'normal', forceForce: 'forceNormal' }[this.transitivity];
  }

  /**
   * @returns {string|undefined} Transitivity to give a queued child, or undefined
   *   when the current transitivity has no entry (children are then not queued).
   */
  _getChildTransitivity() {
    return { normal: 'normal', forceNone: 'forceNone', forceNormal: 'forceNormal', forceForce: 'forceNormal' }[this.transitivity];
  }

  /** @returns {boolean} True when the fetch policy is 'none'. */
  isReprocessing() {
    return this.fetch === 'none';
  }

  /** @returns {boolean} True when the transitivity is one of the 'force*' values. */
  isForced() {
    return this.transitivity.startsWith('force');
  }

  /** @returns {boolean} True when the fetch policy is 'force'. */
  isForcedFetch() {
    return this.fetch === 'force';
  }

  /**
   * Mark this request as skipped. Does nothing if it is already marked as skip
   * or requeue; an outcome/message that is already set is kept.
   *
   * @param {string} outcome - Outcome to record.
   * @param {string} message - Message to record.
   * @returns {Request} this, for chaining.
   */
  markSkip(outcome, message) {
    if (this.shouldSkip()) {
      return this;
    }
    this.processControl = 'skip';
    this.outcome = this.outcome || outcome;
    this.message = this.message || message;
    return this;
  }

  /**
   * Mark this request for requeuing. Does nothing if it is already marked as
   * requeue; an outcome/message that is already set is kept.
   *
   * @param {string} outcome - Outcome to record.
   * @param {string} message - Message to record.
   * @returns {Request} this, for chaining.
   */
  markRequeue(outcome, message) {
    if (this.shouldRequeue()) {
      return this;
    }
    this.processControl = 'requeue';
    this.outcome = this.outcome || outcome;
    this.message = this.message || message;
    return this;
  }

  /** @returns {boolean} True when the request is marked as skip or requeue. */
  shouldSkip() {
    return this.processControl === 'skip' || this.processControl === 'requeue';
  }

  /**
   * Push `nextRequestTime` out to the given time; an already later time is kept.
   *
   * @param {number} time - Timestamp compared against `nextRequestTime`.
   */
  delayUntil(time) {
    if (!this.nextRequestTime || this.nextRequestTime < time) {
      this.nextRequestTime = time;
    }
  }

  /**
   * Delay relative to now.
   *
   * @param {number} [milliseconds=2000] - Amount added to `Date.now()`.
   */
  delay(milliseconds = 2000) {
    this.delayUntil(Date.now() + milliseconds);
  }

  /** @returns {boolean} True when the request is marked as requeue. */
  shouldRequeue() {
    return this.processControl === 'requeue';
  }

  /**
   * Common processing for event documents: add self/actor links, add repo and
   * org links when the document carries them, and queue the matching roots.
   *
   * @param {*} references - Currently unused.
   * @returns {*} The document's `payload`.
   */
  eventHelper(references) {
    const document = this.document;
    // TODO understand if the actor is typically the same as the creator or pusher in the payload
    const repoId = document.repo ? document.repo.id : null;
    const urn = repoId ? `urn:repo:${repoId}` : `urn:org:${document.org.id}`;
    this.linkSelf('self', `${urn}:${this.type}:${document.id}`);
    this.linkSelf('actor', `urn:login:${document.actor.id}`);
    // repo and org are each linked and queued only when present on the document.
    if (document.repo) {
      this.linkSelf('repo', `urn:repo:${document.repo.id}`);
    }
    if (document.org) {
      this.linkSelf('org', `urn:org:${document.org.id}`);
    }
    this.queueRoot('login', document.actor.url);
    if (document.repo) {
      this.queueRoot('repo', document.repo.url);
    }
    if (document.org) {
      this.queueRoot('org', document.org.url);
    }
    return document.payload;
  }

  /**
   * @returns {string|undefined} Element type for this request's collection type,
   *   or undefined when this request's type is not a known collection.
   */
  getCollectionType() {
    const collections = {
      orgs: 'org',
      repos: 'repo',
      issues: 'issue',
      issue_comments: 'issue_comment',
      commits: 'commit',
      teams: 'team',
      users: 'user',
      team_members: 'user',
      team_repos: 'repo',
      collaborators: 'user',
      contributors: 'user',
      subscribers: 'user'
    };
    return collections[this.type];
  }

  /**
   * @param {string} type - Type to test.
   * @returns {boolean} True when the type is one of the root types.
   */
  isRootType(type) {
    const roots = new Set(['orgs', 'org', 'repos', 'repo', 'teams', 'team', 'users', 'user']);
    return roots.has(type);
  }
}
// Expose the Request class as this module's export.
module.exports = Request;