This commit is contained in:
Jeff McAffer 2016-11-30 17:54:48 -08:00
Родитель 59c3febe90
Коммит 2dbaa3b6ac
11 изменённых файлов: 557 добавлений и 368 удалений

23
.eslintrc.json Normal file
Просмотреть файл

@ -0,0 +1,23 @@
{
"env": {
"browser": false,
"commonjs": true,
"es6": true,
"node": true
},
"parserOptions": {
"ecmaFeatures": {
"jsx": true
},
"sourceType": "module"
},
"rules": {
"no-const-assign": "warn",
"no-this-before-super": "warn",
"no-undef": "warn",
"no-unreachable": "warn",
"no-unused-vars": "warn",
"constructor-super": "warn",
"valid-typeof": "warn"
}
}

Просмотреть файл

@ -1,6 +1,7 @@
module.exports.crawler = require('./lib/crawler');
module.exports.crawlerService = require('./lib/crawlerService');
module.exports.eventFinder = require('./lib/eventFinder');
module.exports.policy = require('./lib/traversalPolicy');
module.exports.processor = require('./lib/processor');
module.exports.queueSet = require('./lib/queueSet');
module.exports.request = require('./lib/request');

Просмотреть файл

@ -6,6 +6,7 @@ const Request = require('./request');
const URL = require('url');
class Crawler {
constructor(queues, store, locker, requestor, options) {
this.queues = queues;
this.store = store;
@ -17,11 +18,12 @@ class Crawler {
}
run(context) {
let delay = context.delay;
if (delay === -1) {
// We are done call the done handler and return without continuing the loop
if (context.delay === -1) {
// We are done so call the done handler and return without continuing the loop
return context.done ? context.done() : null;
}
const delay = context.currentDelay;
context.currentDelay = 0;
if (delay) {
this.logger.verbose(`Crawler: ${context.name} waiting for ${delay}ms`);
}
@ -184,13 +186,19 @@ class Crawler {
}
request.attemptCount = request.attemptCount || 0;
if (++request.attemptCount > 5) {
this.logger.warn(`Exceeded attempt count for ${request.type} ${request.url}`);
request.track(this._queueDead(request, request));
} else {
request.addMeta({ attempt: request.attemptCount });
this.logger.info(`Requeuing attempt ${request.attemptCount} of request ${request.type} for ${request.url}`);
request.track(this._requeueOrigin(request));
this.logger.warn(`Exceeded attempt count for ${request.type}@${request.url}`);
return request.track(this._queueDead(request, request));
}
request.addMeta({ attempt: request.attemptCount });
this.logger.info(`Requeuing attempt ${request.attemptCount} of request ${request.type}@${request.url}`);
const queuable = this._createQueuable(request);
let promise = null;
if (request.newQueue) {
promise = this.queues.push(queuable, request.newQueue);
} else {
promise = this.queues.repush(request, queuable);
}
return request.track(promise);
}
_filter(request) {
@ -204,24 +212,26 @@ class Crawler {
if (request.shouldSkip()) {
return request;
}
const self = this;
if (request.isReprocessing()) {
return this._fetchFromStore(request);
const initial = request.policy.initialFetch(request);
if (initial === 'storage') {
return this._fetchFromStorage(request);
}
return this._fetchFromGitHub(request);
const checkEtag = request.policy.fetch === 'originStorage';
return this._fetchFromGitHub(request, checkEtag);
}
_fetchFromGitHub(request) {
_fetchFromGitHub(request, checkEtag) {
const self = this;
return this._getEtag(request, request.type).then(etag => {
const etagPromise = checkEtag ? this.store.etag(request.type, request.url) : Q(null);
return etagPromise.then(etag => {
const options = etag ? { headers: { 'If-None-Match': etag } } : {};
const start = Date.now();
return self.requestor.get(request.url, options).then(githubResponse => {
const status = githubResponse.statusCode;
request.addMeta({ status: status, fetch: Date.now() - start });
if (status !== 200 && status !== 304) {
if (status === 409) {
return request.markSkip('Empty repo', `Code: ${status} for: ${request.url}`);
if (status === 409 || status === 204) {
return request.markSkip('Empty resource', `Code ${status} for ${request.url}`);
}
// if GitHub is explicitly throttling us, we missed out on this request, requeue
// and wait a bit before processing more requests
@ -229,17 +239,17 @@ class Crawler {
const delay = self.options.forbiddenDelay || 120000;
request.delay(delay);
request.addMeta({ forbiddenDelay: delay });
return request.markRequeue(`GitHub throttled: ${request.url}`);
return request.markRequeue(`GitHub throttled ${request.url}`);
}
throw new Error(`Code: ${status} for: ${request.url}`);
throw new Error(`Code ${status} for ${request.url}`);
}
request.response = githubResponse;
self._checkGitHubRateLimit(request, githubResponse);
if (status === 304) {
// We have the content for this element. If we are forcing, get the content from the
// We already have the content for this element. If we are forcing, get the content from the
// store and process. Otherwise, skip.
if (request.isForced()) {
if (request.policy.fetchExisting(request)) {
return this.store.get(request.type, request.url).then(document => {
return this._prepareCachedRequest(request, document, false);
});
@ -252,29 +262,25 @@ class Crawler {
});
}
_getEtag(request, fetchType) {
if (request.isForcedFetch()) {
return Q(null);
}
return this.store.etag(fetchType, request.url);
}
_fetchFromStore(request) {
_fetchFromStorage(request) {
return this.store.get(request.type, request.url).then(
document => {
request.response = { headers: {}};
request.response = { headers: {} };
return this._prepareCachedRequest(request, document, true);
},
error => {
// The doc is not in the store. Likely it was not traversed previously. Either way,
// it is not an error.
// TODO allow choice of skipping or requeuing with fetch = normal
return request.markSkip('Unreachable for reprocessing');
// The doc could not be loaded from storage. Either storage has failed somehow or this
// is a new processing path. Rethrow the error, or use the origin store, respectively.
const missing = request.policy.missingFetch(request);
if (!missing) {
return request.markSkip('Unreachable for reprocessing');
}
return this._fetchFromGitHub(request, false);
});
}
// tack on any content we want to carry over from the current document for future processing
_prepareCachedRequest(request, document, storeFlag) {
// tack on any content we want to carry over from the current document for future processing
const metadata = {
fetchedAt: document._metadata.fetchedAt,
version: document._metadata.version
@ -357,19 +363,14 @@ class Crawler {
this.logger.error(error);
} else {
request.addMeta({ time: Date.now() - request.start });
const transitivity = request.getTransivityShortForm();
this.logger.info(`${transitivity} ${outcome} ${request.type} [${request.url}] ${request.message || ''}`, request.meta);
const policy = request.policy.getShortForm();
this.logger.info(`${policy} ${outcome} ${request.type}@${request.url} ${request.message || ''}`, request.meta);
}
return request;
}
// =============== Helpers ============
_requeueOrigin(request) {
const queuable = this._createQueuable(request);
return this.queues.repush(request, queuable);
}
_queueDead(request) {
const queuable = this._createQueuable(request);
return this.queues.pushDead(queuable);
@ -377,7 +378,7 @@ class Crawler {
queue(request, name = 'normal') {
if (!this._shouldInclude(request.type, request.url)) {
this.logger.verbose(`Filtered ${request.type} [${request.url}]`);
this.logger.verbose(`Filtered ${request.type}@${request.url}`);
return [];
}
const queuable = this._createQueuable(request);
@ -388,8 +389,7 @@ class Crawler {
// Create a new request data structure that has just the things we should queue
const queuable = new Request(request.type, request.url, request.context);
queuable.attemptCount = request.attemptCount;
queuable.transitivity = request.transitivity || 'normal';
queuable.fetch = request.fetch || 'normal';
queuable.policy = request.policy;
return queuable;
}
@ -414,7 +414,7 @@ class Crawler {
// If we hit the low water mark for requests, proactively sleep until the next ratelimit reset
// This code is not designed to handle the 403 scenarios. That is handled by the retry logic.
const remaining = parseInt(response.headers['x-ratelimit-remaining']) || 0;
const remaining = parseInt(response.headers['x-ratelimit-remaining'], 10) || 0;
request.addMeta({ remaining: remaining });
const tokenLowerBound = this.options ? (this.options.tokenLowerBound || 50) : 50;
if (remaining < tokenLowerBound) {
@ -455,7 +455,7 @@ class Crawler {
this.logger.silly(`Promise Enter`);
thing.then(
result => { this.logger.silly(`Promise Success: ${result}`); },
error => { this.logger.silly(`Promise Error: ${result}`, error); });
error => { this.logger.silly(`Promise Error: ${error.message}`, error); });
return thing;
}
}

Просмотреть файл

@ -1,11 +1,8 @@
const parse = require('parse-link-header');
const queryString = require('query-string');
const Request = require('./request');
const URL = require('url');
// TODO open questions
// * relationship between login, user and org
// * convention for the pluralization of siblings links. Some places it is "issues", others it is "org" and "user"
// * should we track followers on users, repos... Perhaps if there are more than a certain threshold of people involved?
class Processor {
@ -16,20 +13,15 @@ class Processor {
process(request) {
const handler = this._getHandler(request);
if (!handler) {
request.markSkip('Warning', `No handler found for request type: ${request.type}`);
request.markSkip('Skip', `No handler found for request type: ${request.type}`);
return request.document;
}
if (request.isReprocessing()) {
if (request.document._metadata.version === this.version) {
request.markSkip('Up to date', `Already at version: ${this.version}`);
return request.document;
}
if (request.document._metadata.version > this.version) {
request.markSkip('Superceded', `Current version: ${request.document._metadata.version} > requested version: ${this.version}`);
return request.document;
}
if (!request.policy.shouldProcess(request, this.version)) {
request.markSkip('Excluded', `Traversal policy excluded this resource`);
return request.document;
}
const result = handler.call(this, request);
result._metadata.version = this.version;
return result;
@ -45,9 +37,8 @@ class Processor {
for (let i = 2; i <= links.last.page; i++) {
const url = request.url + `?page=${i}&per_page=100`;
const newRequest = new Request(request.type, url);
// Carry through this request's transitivity, fetch and qualifier for subsequent pages
newRequest.fetch = request.fetch;
newRequest.transitivity = request.transitivity;
// Carry through this request's policy and qualifier for subsequent pages
newRequest.policy = request.policy;
newRequest.context = { qualifier: request.context.qualifier };
requests.push(newRequest);
}
@ -63,18 +54,13 @@ class Processor {
page(page, request) {
const document = request.document;
const qualifier = request.context.qualifier;
request.linkSelf('self', `${qualifier}:${request.type}:page:${page}`);
request.linkResource('self', `${qualifier}:${request.type}:page:${page}`);
// If the context defines a relation, create a link in this page. This can be used to
// track that a page defines a relation between an entity and a set of entities. For example,
// a repo and its teams. The teams are not exclusively "part of" the repo, they are just related.
if (request.context.relation) {
const relation = request.context.relation;
const handler = this[relation];
if (!handler) {
request.markSkip('Warning', `No handler found for relation type: ${relation}`);
return document;
}
handler.call(this, request);
const relation = request.context.relation;
if (relation) {
this._processRelation(request, relation.origin, relation.name, relation.type);
}
const elementType = request.getCollectionType();
document.elements.forEach(item => {
@ -83,26 +69,15 @@ class Processor {
return document;
}
_getHandler(request, type = request.type) {
const parsed = URL.parse(request.url, true);
const page = parsed.query.page;
if (page) {
return this.page.bind(this, page);
}
const collectionType = request.getCollectionType();
if (collectionType) {
return this.collection;
}
return (this[type]);
}
org(request) {
const document = request.document;
request.addRootSelfLink();
request.linkSiblings('repos', `urn:login:${document.id}:repos`);
request.linkSiblings('siblings', 'urn:org');
request.queueRoots('repos', document.repos_url);
request.queueRoots('members', document.members_url.replace('{/member}', ''));
request.linkSiblings('urn:orgs');
this._addRoot(request, 'user', 'user', document.url.replace('/orgs/', '/users/'));
this._addCollection(request, 'repos', "repo", null, `urn:user:${document.id}:repos`);
this._addRelation(request, 'members', "user", document.members_url.replace('{/member}', ''));
return document;
}
@ -112,9 +87,9 @@ class Processor {
// * following
const document = request.document;
request.addRootSelfLink();
request.linkSiblings('repos', `urn:login:${document.id}:repos`);
request.linkSiblings('siblings', 'urn:user');
request.queueRoots('repos', document.repos_url);
request.linkSiblings('urn:users');
this._addCollection(request, 'repos', "repo");
return document;
}
@ -125,40 +100,16 @@ class Processor {
// * labels
const document = request.document;
request.addRootSelfLink();
request.linkSelf('owner', `urn:login:${document.owner.id}`);
request.linkSelf('parent', `urn:login:${document.owner.id}`);
request.linkSiblings('siblings', `urn:login:${document.owner.id}:repos`);
request.queueRoot('login', document.owner.url);
request.queueRoots('teams', document.teams_url, { relation: 'repo_teams_relation' });
request.queueRoots('collaborators', document.collaborators_url.replace('{/collaborator}', ''), { relation: 'repo_collaborators_relation' });
request.queueRoots('contributors', document.contributors_url, { relation: 'repo_contributors_relation' });
request.queueRoots('subscribers', document.subscribers_url, { relation: 'repo_subscribers_relation' });
request.queueChildren('issues', document.issues_url.replace('{/number}', ''), { repo: document.id });
request.queueChildren('commits', document.commits_url.replace('{/sha}', ''), { repo: document.id });
return document;
}
request.linkSiblings(`urn:user:${document.owner.id}:repos`);
repo_teams_relation(request) {
return this._processRelation(request, 'teams', 'repo', 'team');
}
this._addRoot(request, 'owner', 'user');
this._addRelation(request, 'teams', 'team');
this._addRelation(request, 'collaborators', 'user', document.collaborators_url.replace('{/collaborator}', ''));
this._addRelation(request, 'contributors', 'user');
this._addRelation(request, 'subscribers', 'user');
this._addCollection(request, 'issues', 'issue', document.issues_url.replace('{/number}', ''));
this._addCollection(request, 'commits', 'commit', document.commits_url.replace('{/sha}', ''));
repo_collaborators_relation(request) {
return this._processRelation(request, 'collaborators', 'repo', 'user');
}
repo_contributors_relation(request) {
return this._processRelation(request, 'contributors', 'repo', 'user');
}
repo_subscribers_relation(request) {
return this._processRelation(request, 'contributors', 'repo', 'user');
}
_processRelation(request, name, originType, targetType) {
const document = request.document;
request.linkSelf(originType, `${request.context.qualifier}`);
const urns = document.elements.map(element => `urn:${targetType}:${element.id}`);
request.linkSelf(name, urns);
return document;
}
@ -166,19 +117,15 @@ class Processor {
const document = request.document;
const context = request.context;
request.addSelfLink('sha');
request.linkSiblings(`${context.qualifier}:commits`);
this._addRoot(request, 'repo', 'repo', document.url.replace(/\/commits\/.*/, ''), `${context.qualifier}`);
// TODO some commits have author and committer properties, others have email info in a "commit" property
// For the former, this code works. For the latter, consider queuing an email lookup and storing a
// email key here for the author/committer.
this._addRoot(request, 'author', 'user');
this._addRoot(request, 'committer', 'user');
request.linkSelf('repo', `urn:repo:${context.repo}`);
request.linkSiblings('siblings', `urn:repo:${context.repo}:commits`);
// TODO not sure what the following line does
// document._metadata.links.parent = document._metadata.links.parent;
if (document.author) {
request.linkSelf('author', `urn:login:${document.author.id}`);
request.queueRoot('user', document.author.url);
}
if (document.committer) {
request.linkSelf('committer', `urn:login:${document.committer.id}`);
request.queueRoot('user', document.committer.url);
}
if (document.files) {
document.files.forEach(file => {
delete file.patch;
@ -187,22 +134,6 @@ class Processor {
return document;
}
login(request) {
// TODO sort out relationship of login to user and org.
const document = request.document;
request.addRootSelfLink();
request.linkSelf('self', `urn:login:${document.id}`);
// TODO should we do repos here and in the user/org?
request.linkSiblings('repos', `urn:login:${document.id}:repos`);
request.linkSiblings('siblings', 'urn:login');
if (document.type === 'Organization') {
request.queueRoot('org', `https://api.github.com/orgs/${document.login}`);
} else if (document.type === 'User') {
request.queueRoot('user', `https://api.github.com/users/${document.login}`);
}
return document;
}
issue(request) {
// TODO links to consider
// * milestone
@ -213,23 +144,19 @@ class Processor {
const document = request.document;
const context = request.context;
request.addSelfLink();
request.linkSelf('assignees', document.assignees.map(assignee => { return `urn:login:${assignee.id}`; }));
request.linkSelf('repo', `urn:repo:${context.repo}`);
request.linkSelf('parent', `urn:repo:${context.repo}`);
request.linkSelf('user', `urn:login:${document.user.id}`);
request.linkSiblings('siblings', `urn:repo:${context.repo}:issues`);
request.queueRoot('login', document.user.url);
request.queueRoot('repo', document.repository_url);
if (document.assignee) {
request.linkSelf('assignee', `urn:login:${document.assignee.id}`);
request.queueRoot('login', document.assignee.url);
}
if (document.closed_by) {
request.linkSelf('closed_by', `urn:login:${document.closed_by.id}`);
request.queueRoot('login', document.closed_by.url);
request.linkSiblings(`${context.qualifier}:issues`);
const assignees = document.assignees.map(assignee => { return `urn:user:${assignee.id}`; });
if (assignees.length > 0) {
request.linkResource('assignees', assignees);
}
request.queueChildren('issue_comments', document.comments_url, { issue: document.id, repo: context.repo });
this._addRoot(request, 'user', 'user');
this._addRoot(request, 'repo', 'repo', document.repository_url, context.qualifier);
this._addRoot(request, 'assignee', 'user');
this._addRoot(request, 'closed_by', 'user');
this._addCollection(request, 'comments', 'issue_comment');
return document;
}
@ -239,37 +166,28 @@ class Processor {
const document = request.document;
const context = request.context;
request.addSelfLink();
request.linkSelf('user', `urn:login:${document.user.id}`);
request.linkSiblings('siblings', `urn:repo:${context.repo}:issue:${context.issue}:comments`);
request.queueRoot('login', document.user.url);
request.linkSiblings(`${context.qualifier}:comments`);
this._addRoot(request, 'user', 'user');
return document;
}
team(request) {
const document = request.document;
request.addSelfLink();
request.linkSelf('org', `urn:org:${document.organization.id}`);
request.linkSelf('login', `urn:login:${document.organization.id}`);
request.linkSiblings('siblings', `urn:org:${document.organization.id}:teams`);
request.queueRoots('team_members', document.members_url, { relation: 'team_members_relation' });
request.queueRoots('team_repos', document.repositories_url, { relation: 'team_repos_relation' });
request.linkResource('org', `urn:org:${document.organization.id}`);
request.linkSiblings(`urn:org:${document.organization.id}:teams`);
this._addRelation(request, 'members', 'user', document.members_url.replace('{/member}', ''));
this._addRelation(request, 'repos', 'repo', document.repositories_url);
return document;
}
team_members_relation(request) {
return this._processRelation(request, 'members', 'team', 'user');
}
team_repos_relation(request) {
return this._processRelation(request, 'repos', 'team', 'repo');
}
// =============== Event Processors ============
CommitCommentEvent(request) {
const document = request.document;
const context = request.context;
const payload = request.eventHelper(request);
request.linkSelf('comment', `urn:repo:${context.repo}:comment:${payload.comment.id}`);
request.linkResource('comment', `urn:repo:${context.repo}:comment:${payload.comment.id}`);
// TODO siblings?
request.queue('comment', payload.comment.url);
return document;
@ -294,7 +212,7 @@ class Processor {
const document = request.document;
const context = request.context;
const payload = request.eventHelper(document);
request.linkSelf('deployment', `urn:repo:${context.repo}:deployment:${payload.deployment.id}`);
request.linkResource('deployment', `urn:repo:${context.repo}:deployment:${payload.deployment.id}`);
request.queue('deployment', payload.deployment.url);
return document;
}
@ -303,8 +221,8 @@ class Processor {
const document = request.document;
const context = request.context;
const payload = request.eventHelper(document);
request.linkSelf('deployment_status', `urn:repo:${context.repo}:deployment:${payload.deployment.id}:status:${payload.deployment_status.id}`);
request.linkSelf('deployment', `urn:repo:${context.repo}:deployment:${payload.deployment.id}`);
request.linkResource('deployment_status', `urn:repo:${context.repo}:deployment:${payload.deployment.id}:status:${payload.deployment_status.id}`);
request.linkResource('deployment', `urn:repo:${context.repo}:deployment:${payload.deployment.id}`);
request.queue('deployment', payload.deployment.url);
return document;
}
@ -328,8 +246,8 @@ class Processor {
const document = request.document;
const context = request.context;
const payload = request.eventHelper(document);
request.linkSelf('issue', `urn:repo:${context.repo}:issue:${payload.issue.id}`);
request.linkSelf('comment', `urn:repo:${context.repo}:comment:${payload.comment.id}`);
request.linkResource('issue', `urn:repo:${context.repo}:issue:${payload.issue.id}`);
request.linkResource('comment', `urn:repo:${context.repo}:comment:${payload.comment.id}`);
request.queue('comment', payload.comment.url);
request.queue('issue', payload.issue.url);
return document;
@ -339,7 +257,7 @@ class Processor {
const document = request.document;
const context = request.context;
const payload = request.eventHelper(document);
request.linkSelf('issue', `urn:repo:${context.repo}:issue:${payload.issue.id}`);
request.linkResource('issue', `urn:repo:${context.repo}:issue:${payload.issue.id}`);
request.queue('issue', payload.issue.url);
return document;
}
@ -355,8 +273,8 @@ class Processor {
const document = request.document;
const context = request.context;
const payload = request.eventHelper(document);
request.linkSelf('member', `urn:login:${payload.member.id}`);
request.queueRoot('login', payload.member.url);
request.linkResource('member', `urn:user:${payload.member.id}`);
request.queueRoot('user', payload.member.url);
return document;
}
@ -364,11 +282,11 @@ class Processor {
const document = request.document;
const context = request.context;
const payload = request.eventHelper(document);
request.linkSelf('self', `urn:team:${payload.team.id}:membership_event:${document.id}`);
request.linkSelf('member', `urn:login:${payload.member.id}`);
request.linkSelf('team', `urn:team:${payload.team.id}`);
request.linkSelf('org', `urn:org:${payload.organization.id}`);
request.queueRoot('login', payload.member.url);
request.linkResource('self', `urn:team:${payload.team.id}:membership_event:${document.id}`);
request.linkResource('member', `urn:user:${payload.member.id}`);
request.linkResource('team', `urn:team:${payload.team.id}`);
request.linkResource('org', `urn:org:${payload.organization.id}`);
request.queueRoot('user', payload.member.url);
request.queueRoot('org', payload.organization.url);
request.queue('team', payload.team.url);
return document;
@ -378,7 +296,7 @@ class Processor {
const document = request.document;
const context = request.context;
const payload = request.eventHelper(document);
request.linkSelf('milestone', `urn:repo:${context.repo}:milestone:${payload.milestone.id}`);
request.linkResource('milestone', `urn:repo:${context.repo}:milestone:${payload.milestone.id}`);
request.queue('milestone', payload.milestone.url);
return document;
}
@ -387,7 +305,7 @@ class Processor {
const document = request.document;
const context = request.context;
const payload = request.eventHelper(document);
request.linkSelf('page_build', `urn:repo:${context.repo}:page_builds:${payload.id}`);
request.linkResource('page_build', `urn:repo:${context.repo}:page_builds:${payload.id}`);
request.queue('page_build', payload.build.url);
return document;
}
@ -403,7 +321,7 @@ class Processor {
const document = request.document;
const context = request.context;
const payload = request.eventHelper(document);
request.linkSelf('pull', `urn:repo:${context.repo}:pull:${payload.pull_request.id}`);
request.linkResource('pull', `urn:repo:${context.repo}:pull:${payload.pull_request.id}`);
request.queue('pull', payload.pull_request.url);
return document;
}
@ -412,8 +330,8 @@ class Processor {
const document = request.document;
const context = request.context;
const payload = request.eventHelper(document);
request.linkSelf('review', `urn:repo:${context.repo}:pull:${payload.pull_request.id}:review:${payload.review.id}`);
request.linkSelf('pull', `urn:repo:${context.repo}:pull:${payload.pull_request.id}`);
request.linkResource('review', `urn:repo:${context.repo}:pull:${payload.pull_request.id}:review:${payload.review.id}`);
request.linkResource('pull', `urn:repo:${context.repo}:pull:${payload.pull_request.id}`);
request.queue('pull_review', payload.pull_request.review_comment_url.replace('{/number}', `/${payload.review.id}`));
request.queue('pull', payload.pull_request.url);
return document;
@ -423,8 +341,8 @@ class Processor {
const document = request.document;
const context = request.context;
const payload = request.eventHelper(document);
request.linkSelf('comment', `urn:repo:${context.repo}:pull:${payload.pull_request.id}:comment:${payload.comment.id}`);
request.linkSelf('pull', `urn:repo:${context.repo}:pull:${payload.pull_request.id}`);
request.linkResource('comment', `urn:repo:${context.repo}:pull:${payload.pull_request.id}:comment:${payload.comment.id}`);
request.linkResource('pull', `urn:repo:${context.repo}:pull:${payload.pull_request.id}`);
// TODO see if all the various comments can be the same type
request.queue('pull_comment', payload.comment.url);
request.queue('pull', payload.pull_request.url);
@ -438,6 +356,76 @@ class Processor {
// TODO figure out what to do with the commits
return document;
}
_getHandler(request, type = request.type) {
const parsed = URL.parse(request.url, true);
const page = parsed.query.page;
if (page) {
return this.page.bind(this, page);
}
const collectionType = request.getCollectionType();
if (collectionType) {
return this.collection;
}
return (this[type]);
}
_addCollection(request, name, type, url = null, urn = null) {
const qualifier = request.getQualifier();
urn = urn || `${qualifier}:${name}`;
url = url || request.document[`${name}_url`];
request.linkCollection(name, urn);
if (request.isRootType(type)) {
request.queueRoots(name, url);
} else {
request.context.qualifier = qualifier;
request.queueChildren(name, url, request.context);
}
}
_addRoot(request, name, type, url = null, urn = null) {
const element = request.document[name];
if (!element && !(urn && url)) {
return;
}
urn = urn || `urn:${type}:${element.id}`;
url = url || element.url;
request.linkResource(name, urn);
request.queueRoot(type, url);
}
/**
* Relate this document to a collection of other documents of the given type. For example,
* a repo to its collaborators which are users.
*
* This creates a relationship between the current document being processed and the named
* target resource of the given type. This results in a siblings link with the given name
* and urn being added to this document and a relation request queued for the given url.
* The document produced by processing that url will have matching siblings links (called 'siblings')
*/
_addRelation(request, name, type, url = null, urn = null) {
urn = urn || `${request.getQualifier()}:${name}`;
url = url || request.document[`${name}_url`];
request.linkRelation(name, urn + ':pages');
request.queueRoots(name, url, { relation: { origin: request.type, name: name, type: type } });
}
/**
* Process a page resource for a relation. Add links identifying this page as part of a
* relation with the given info and enumerate links for the resources referenced from this page.
* Note that currently relations can only point to root resources.
*/
_processRelation(request, origin, name, type) {
const document = request.document;
request.linkResource('origin', `${request.context.qualifier}`);
request.linkResource(origin, `${request.context.qualifier}`);
request.linkSiblings(`${request.context.qualifier}:${name}:pages`);
const urns = document.elements.map(element => `urn:${type}:${element.id}`);
request.linkResource('resources', urns);
return document;
}
}
module.exports = Processor;

Просмотреть файл

@ -1,3 +1,4 @@
const jsonpatch = require('fast-json-patch');
const Q = require('q');
const qlimit = require('qlimit');
@ -19,11 +20,11 @@ class QueueSet {
}
reconfigure(patches) {
const weightsPatch = patches.find(patch => patch.path === '/weights');
if (weightsPatch) {
const weights = weightsPatch.value;
this.options.weights = weights;
this._startMap = this._createStartMap(weights);
// remember options that need processing, apply and then do any necessary processing
const currentWeights = this.options.weights;
jsonpatch.apply(this.options, patches);
if (currentWeights !== this.options.weights) {
this._startMap = this._createStartMap(this.options.weights);
}
return Q();
}
@ -72,13 +73,13 @@ class QueueSet {
done(request) {
const acked = request.acked;
request.acked = true;
return !acked ? request._originQueue.done(request) : Q();
return !acked && request._originQueue ? request._originQueue.done(request) : Q();
}
abandon(request) {
const acked = request.acked;
request.acked = true;
return !acked ? request._originQueue.abandon(request) : Q();
return !acked && request._originQueue ? request._originQueue.abandon(request) : Q();
}
_findQueue(name) {

Просмотреть файл

@ -1,43 +1,38 @@
const extend = require('extend');
const Policy = require('./traversalPolicy');
/**
Requests describe a resource to capture and process as well as the context for that processing.
Transitivity
* none - Only process this exact resource
* normal - Process this resource if not previously seen and do normal processing on non-roots and roots
* forceNone - Process this resource and force processing on non-roots and no processing of roots
* forceNormal - Force processing of children plus normal processing of roots
* forceForce - Force processing of children and roots. Decays to forceNormal on roots
Basically, once you are forcing, force transitivity for all children, but still allow control over transitivity
when traversing to a root. When traversing with forceForce, queued roots end up as forceNormal. Similarly,
when traversing with forceNormal, queued roots end up as normal.
Fetch behavior
* none - Only use existing content. Skip this resource if we don't already have it
* normal - Use existing content if we have it and it matches. Otherwise, get content from original source
* force - Ignore exiting content and get contenf from original source
*/
* Requests describe a resource to capture and process as well as the context for that processing.
*/
class Request {
constructor(type, url, context = null) {
this.type = type;
this.url = url;
this.transitivity = 'normal';
this.fetch = 'normal';
this.policy = Policy.default();
this.context = context || {};
this.promises = [];
}
static adopt(object) {
if (object.__proto__ !== Request.prototype) {
object.__proto__ = Request.prototype;
}
if (object.policy && object.policy__proto__ !== Policy.prototype) {
object.policy.__proto__ = Policy.prototype;
}
return object;
}
track(promises) {
if (!promises) {
return;
return this;
}
if (Array.isArray(promises)) {
Array.prototype.push.apply(this.promises, promises);
} else {
this.promises.push(promises);
}
return this;
}
addMeta(data) {
@ -46,28 +41,51 @@ class Request {
}
addRootSelfLink() {
this.addSelfLink('id', 'urn:');
this.linkResource('self', this.getRootQualifier());
}
addSelfLink(key = 'id', base = null) {
let qualifier = base ? base : this.context.qualifier;
addSelfLink(key = 'id') {
this.linkResource('self', this.getChildQualifier(key));
}
getQualifier() {
return this.isRootType(this.type) ? this.getRootQualifier() : this.getChildQualifier();
}
getRootQualifier() {
return `urn:${this.type}:${this.document.id}`;
}
getChildQualifier(key = 'id') {
let qualifier = this.context.qualifier;
if (!qualifier || (typeof qualifier !== 'string')) {
throw new Error('Need something on which to base the self link URN');
}
qualifier = qualifier.endsWith(':') ? qualifier : qualifier + ':';
this.linkSelf('self', `${qualifier}${this.type}:${this.document[key]}`);
return `${qualifier}${this.type}:${this.document[key]}`;
}
linkSelf(name, value) {
linkResource(name, value) {
const links = this.document._metadata.links;
const key = Array.isArray(value) ? 'hrefs' : 'href';
links[name] = { type: 'self' };
links[name] = {};
links[name][key] = value;
links[name].type = 'resource';
}
linkSiblings(name, href) {
linkSiblings(href) {
const links = this.document._metadata.links;
links[name] = { type: 'siblings', href: href };
links.siblings = { href: href, type: 'collection'};
}
linkCollection(name, href) {
const links = this.document._metadata.links;
links[name] = { href: href, type: 'collection'};
}
linkRelation(name, href) {
const links = this.document._metadata.links;
links[name] = { href: href, type: 'relation'};
}
queue(type, url, context) {
@ -78,30 +96,28 @@ class Request {
}
queueRoot(type, url) {
const transitivity = this._getRootTransitivity();
if (!transitivity) {
const policy = this.policy.createPolicyForRoot();
if (!policy) {
return;
}
const newRequest = new Request(type, url);
newRequest.context = { qualifier: 'urn:' };
// set the new request's transitivity to the next value
newRequest.transitivity = transitivity;
newRequest.fetch = this.fetch;
newRequest.policy = policy;
this.track(this.crawler.queue(newRequest));
}
queueRoots(type, url, context = null) {
const transitivity = this._getRootTransitivity();
if (!transitivity) {
const policy = this.policy.createPolicyForRoot();
if (!policy) {
return;
}
const newRequest = new Request(type, url);
const newContext = extend({}, this.context, context);
newContext.qualifier = this.document._metadata.links.self.href;
newRequest.context = newContext;
// carry over this requests transitivity as we are queuing a collection
newRequest.transitivity = this.transitivity;
newRequest.fetch = this.fetch;
// We are queuing a collection so carry this request's policy over. A new policy may
// apply to the elements in the collection
newRequest.policy = this.policy;
this.track(this.crawler.queue(newRequest));
}
@ -113,57 +129,32 @@ class Request {
}
queueChild(type, url, qualifier) {
const transitivity = this._getChildTransitivity();
if (!transitivity) {
const policy = this.policy.createPolicyForChild();
if (!policy) {
return;
}
const newRequest = new Request(type, url);
newRequest.context = this.context || {};
newRequest.context.qualifier = qualifier;
newRequest.transitivity = transitivity;
newRequest.fetch = this.fetch;
newRequest.policy = policy;
this.track(this.crawler.queue(newRequest));
}
queueChildren(type, url, context = null) {
const transitivity = this._getChildTransitivity();
if (!transitivity) {
const policy = this.policy.createPolicyForChild();
if (!policy) {
return;
}
const newRequest = new Request(type, url);
const newContext = extend({}, this.context, context);
newContext.qualifier = this.document._metadata.links.self.href;
newRequest.context = newContext;
// carry over this requests transitivity as we are queuing a collection
newRequest.transitivity = this.transitivity;
newRequest.fetch = this.fetch;
// We are queuing a collection so carry this request's policy over. A new policy may
// apply to the elements in the collection
newRequest.policy = this.policy;
this.track(this.crawler.queue(newRequest));
}
_getRootTransitivity() {
return { normal: 'normal', forceNormal: 'normal', forceForce: 'forceNormal' }[this.transitivity];
}
_getChildTransitivity() {
return { normal: 'normal', forceNone: 'forceNone', forceNormal: 'forceNormal', forceForce: 'forceNormal' }[this.transitivity];
}
getTransivityShortForm() {
return { normal: 'NN', forceNone: 'F0', forceNormal: 'FN', forceForce: 'FF' }[this.transitivity];
}
isReprocessing() {
return this.fetch === 'none';
}
isForced() {
return this.transitivity.startsWith('force');
}
isForcedFetch() {
return this.fetch === 'force';
}
markSkip(outcome, message) {
if (this.shouldSkip()) {
return this;
@ -207,11 +198,11 @@ class Request {
// TODO understand if the actor is typically the same as the creator or pusher in the payload
const repo = document.repo ? document.repo.id : null;
const urn = repo ? `urn:repo:${repo}` : `urn:org:${document.org.id}`;
this.linkSelf('self', `${urn}:${this.type}:${document.id}`);
this.linkSelf('actor', `urn:login:${document.actor.id}`);
this.linkSelf('repo', `urn:repo:${document.repo.id}`);
this.linkSelf('org', `urn:org:${document.org.id}`);
this.queueRoot('login', document.actor.url);
this.linkResource('self', `${urn}:${this.type}:${document.id}`);
this.linkResource('actor', `urn:user:${document.actor.id}`);
this.linkResource('repo', `urn:repo:${document.repo.id}`);
this.linkResource('org', `urn:org:${document.org.id}`);
this.queueRoot('user', document.actor.url);
this.queueRoot('repo', document.repo.url);
this.queueRoot('org', document.org.url);
return document.payload;

197
lib/traversalPolicy.js Normal file
Просмотреть файл

@ -0,0 +1,197 @@
/**
Fetch behavior
* storageOnly - Only use stored content. Skip this resource if we don't already have it
* originStorage - Use stored content if it is up to date. Otherwise, get content from original source
* storageOriginIfMissing - Use stored content. If missing, get content from original source
* originOnly - Always get content from original source
Freshness -- How age of the resource, relative what we have seen/done before, factors into whether or not process the resource.
* always - process the resource no matter what
* match - process the resource if origin and stored docs do NOT match
* N - process the resource if newer or if the stored copy is N days old
* version - process the resource if the current stored doc's processing version is behind current
* matchOrVersion - process the resource if stored and origin do not match or the stored processed version is out of date
Processing -- Which processing to do for a given resource.
* documentAndRelated - generate links etc and queue referenced resources for further processing
* documentAndChildren - generate links etc and queue referenced child resources (i.e., not roots) for further processing
* documentOnly - generate links
Transitivity -- How related resources should be queued.
* shallow - Queue all non-roots and roots with the current Freshness policy
* deepShallow - Queue NON-roots as always and roots with the current Freshness policy, Roots will have transitivity set to shallow.
* deepDeep - Queue all related resources as always. Roots will have transitivity set to deepShallow
Basically, once you are doind deep traversal, carry that through for all children, but still allow transivity
control when traversing to a root. A deepDeep traversal to a root will queue that root as deepShallow. Similarly,
when traversing with deepShallow, queued roots end up as shallow. This approach gives you the ability to push deep
for one level.
=============== Scenarios
Initialization -- Traverse a subgraph ensuring everything is fetched. If something has already been processed, great, assume it is up to date
* fetch = originStorage
* freshness = match
* processing = documentAndRelated
* transitivity = [deepShallow | deepDeep]
Update -- Ensure a subgraph up to date. If something has already been processed, get it again
* fetch = originStorage
* freshness = always
* processing = documentAndRelated
* transitivity = [deepShallow | deepDeep]
Events -- Given an event, traverse its subgraph until encountering something previously seen. This ensures the event is recorded and the related resources are present. They may not be completely up to date.
* fetch = originStorage
* freshness = match
* processing = documentAndRelated
* transitivity = shallow
Events and update -- Traverse a subgraph until encountering something previously seen. If that
resource is older than N days, ensure the it is updated
// TODO, what is N's relation to match?
* fetch = originStorage
* freshness = N
* processing = documentAndRelated
* transitivity = shallow
Just Reprocess -- Reprocess just the exact resources we have already fetched
* fetch = storageOnly
* freshness = version
* processing = documentOnly
* transitivity = NA
Reprocess and Rediscover -- Reprocess the resources we have and traverse to new/missing resources discovered during reprocessing. Process those as desired.
* fetch = storageOriginIfMissing
* freshness = version
* processing = documentAndRelated
* transitivity = <any>
Reprocess and Update -- Reprocess anything that is EITHER older version or out of date.
* fetch = originStorage
* freshness = matchOrVersion
* processing = documentAndRelated
* transitivity = <any>
*/
const moment = require('moment');
class TraversalPolicy {
static default() {
return new TraversalPolicy('originStorage', 'match', 'documentAndRelated', 'shallow');
}
static update() {
return new TraversalPolicy('originStorage', 'always', 'documentAndRelated', 'deepDeep');
}
static events() {
return TraversalPolicy.default();
}
static clone(policy) {
return new TraversalPolicy(policy.fetch, policy.freshness, policy.processing, policy.transitivity);
}
constructor(fetch, freshness, processing, transitivity) {
this.fetch = fetch;
this.freshness = freshness;
this.processing = processing;
this.transitivity = transitivity;
}
/**
* Create and return a new policy suitable for use a root object reached from processing
* a resource with this resource.
*/
createPolicyForRoot() {
if (this.processing === 'documentOnly' || this.processing === 'documentAndChildren') {
return null;
}
const transitivity = { shallow: 'shallow', deepShallow: 'shallow', deepDeep: 'deepShallow' }[this.transitivity];
return new TraversalPolicy(this.fetch, this.freshness, this.processing, transitivity);
}
/**
* Create and return a new policy suitable for use by a child object reached from processing
* a resource with this resource.
*/
createPolicyForChild() {
if (this.processing === 'documentOnly') {
return null;
}
const transitivity = { shallow: 'shallow', deepShallow: 'deepShallow', deepDeep: 'deepShallow' }[this.transitivity];
const freshness = { shallow: this.freshness, deepShallow: this.freshness, deepDeep: 'always' }[this.transitivity];
return new TraversalPolicy(this.fetch, freshness, this.processing, transitivity);
}
/**
* Given a request for which the requisite content has been fetched, determine whether or not processing
* should happen.
*/
shouldProcess(request, version) {
// Note: if the freshness is match, we only got here if the content should be processed.
if (this.freshness === 'always' || this.freshness === 'match') {
return true;
}
if (typeof this.freshness === 'number') {
return moment.diff(request.document._metadata.processedAt, 'hours') > this.freshness * 24;
}
if (this.freshness === 'version' || this.freshness === 'matchOrVersion') {
return !request.document._metadata.version || (version > request.document._metadata.version);
}
throw new Error('Invalid freshness in traversal policy');
}
/**
* Given a request for which we have found existing content in our doc store, say whether or not that existing
* content should be fetched. The only case where we do NOT fetch is match. All others require the actual cached
* content to determine whether or not processing is required.
*/
fetchExisting(request) {
return this.freshness !== 'match';
}
/**
* Return the source from which to perform the initial fetch for the given request's resource.
*/
initialFetch(request) {
const result = { storageOnly: 'storage', originStorage: 'origin', storageOriginIfMissing: 'storage', originOnly: 'origin' }[this.fetch];
if (!result) {
throw new Error(`Fetch policy misconfigured ${this.fetch}`);
}
return result;
}
/**
* Return the source from which to fetch if the original fetch did not find any content
*/
missingFetch(request) {
const result = { storageOnly: null, originStorage: 'origin', storageOriginIfMissing: 'origin', originOnly: null }[this.fetch];
if (result === undefined) {
throw new Error(`Fetch policy misconfigured ${this.fetch}`);
}
return result;
}
/**
* Return a symbolic short form to uniquely identify this policy.
*/
getShortForm() {
const fetch = { storageOnly: 'S', originStorage: 'o', storageOriginIfMissing: 'm', originOnly: 'O' }[this.fetch];
let freshness = { always: 'A', match: 'M', version: 'V', matchOrVersion: 'm' }[this.freshness];
if (!freshness) {
if (typeof this.policy.freshness === 'number') {
freshness = 'N';
}
}
const processing = { documentOnly: 'D', documentAndRelated: 'r', documentAndChildren: 'c' }[this.processing];
const transitivity = { shallow: 'S', deepShallow: 'd', deepDeep: 'D' }[this.transitivity];
return fetch + freshness + processing + transitivity;
}
}
module.exports = TraversalPolicy;

Просмотреть файл

@ -27,11 +27,11 @@
"moment": "2.15.2",
"parse-link-header": "^0.4.1",
"q": "1.4.1",
"qlimit": "^0.1.1",
"query-string": "^4.2.3"
"qlimit": "^0.1.1"
},
"devDependencies": {
"chai": "^3.5.0",
"flow-bin": "^0.36.0",
"grunt": "^1.0.1",
"grunt-mocha-test": "^0.13.2",
"istanbul": "^0.4.5",

Просмотреть файл

@ -7,6 +7,7 @@ const Q = require('q');
const QueueSet = require('../lib/queueSet');
const Request = require('../lib/request');
const sinon = require('sinon');
const TraversalPolicy = require('../lib/traversalPolicy');
describe('Crawler get request', () => {
it('should get from the priority queue first', () => {
@ -248,7 +249,7 @@ describe('Crawler fetch', () => {
it('should return cached content and not save and response for 304 with force', () => {
const url = 'http://test';
const request = new Request('repos', url);
request.transitivity = 'forceNormal';
request.policy = TraversalPolicy.update();
let getArgs = null;
const responses = [createResponse(null, 304, 42)];
const requestor = createBaseRequestor({
@ -269,7 +270,7 @@ describe('Crawler fetch', () => {
it('should return cached content and headers for 304 with force', () => {
const url = 'http://test';
const request = new Request('repos', url);
request.transitivity = 'forceNormal';
request.policy = TraversalPolicy.update();
let getArgs = null;
const responses = [createResponse(null, 304, 42)];
const requestor = createBaseRequestor({
@ -300,9 +301,9 @@ describe('Crawler fetch', () => {
});
});
it('should get from requestor even with a 304 when fetch == force', () => {
it('should get from origin with originOnly fetch policy', () => {
const request = new Request('foo', 'http://test');
request.fetch = 'force';
request.policy.fetch = 'originOnly';
const responses = [createResponse('hey there')];
const requestor = createBaseRequestor({ get: () => { return Q(responses.shift()); } });
const crawler = createBaseCrawler({ requestor: requestor });
@ -312,10 +313,9 @@ describe('Crawler fetch', () => {
});
});
it('should pull from store only if fetch == none', () => {
it('should pull from storage only storageOnly fetch policy', () => {
const request = new Request('foo', 'http://test');
request.fetch = 'none';
const responses = [createResponse(null, 304, 42)];
request.policy.fetch = 'storageOnly';
const store = createBaseStore({ get: () => { return Q({ _metadata: {}, id: 'test' }); } });
const crawler = createBaseCrawler({ store: store });
return crawler._fetch(request).then(request => {
@ -365,7 +365,7 @@ describe('Crawler fetch', () => {
it('should throw for store get errors', () => {
const request = new Request('repos', 'http://test');
request.transitivity = 'forceNormal';
request.policy = TraversalPolicy.update();
const responses = [createResponse(null, 304, 42)];
const requestor = createBaseRequestor({ get: () => { return Q(responses.shift()); } });
const store = createBaseStore({ etag: () => { return Q(42); }, get: () => { throw new Error('test'); } });
@ -1308,7 +1308,7 @@ function createErrorResponse(error) {
};
}
function createBaseCrawler({queues = createBaseQueues(), store = createBaseStore(), locker = createBaseLocker, requestor = createBaseRequestor(), options = createBaseOptions().crawler } = {}) {
function createBaseCrawler({queues = createBaseQueues(), store = createBaseStore(), locker = createBaseLocker(), requestor = createBaseRequestor(), options = createBaseOptions().crawler } = {}) {
return new Crawler(queues, store, locker, requestor, options);
}

Просмотреть файл

@ -4,12 +4,13 @@ const expect = require('chai').expect;
const Processor = require('../lib/processor.js');
const Request = require('../lib/request.js');
const sinon = require('sinon');
const TraversalPolicy = require('../lib/traversalPolicy');
describe('Processor reprocessing', () => {
it('will skip if at same version', () => {
const processor = new Processor();
const request = new Request('user', 'http://test.com/users/user1');
request.fetch = 'none';
request.policy.freshness = 'version';
request.document = { _metadata: { version: processor.version } };
sinon.stub(processor, 'user', () => { });
processor.process(request);
@ -20,12 +21,12 @@ describe('Processor reprocessing', () => {
it('will skip and warn if at greater version', () => {
const processor = new Processor();
const request = new Request('user', 'http://test.com/users/user1');
request.fetch = 'none';
request.policy.freshness = 'version';
request.document = { _metadata: { version: processor.version + 1 } };
sinon.stub(processor, 'user', () => { });
processor.process(request);
expect(request.shouldSkip()).to.be.true;
expect(request.outcome).to.be.equal('Superceded');
expect(request.outcome).to.be.equal('Excluded');
expect(processor.user.callCount).to.be.equal(0);
});
@ -43,9 +44,9 @@ describe('Processor reprocessing', () => {
});
describe('Collection processing', () => {
it('should queue forceNormal normal collection pages as forceNormal and elements as forceNormal', () => {
it('should queue collection pages as deepShallow and elements as deepShallow', () => {
const request = new Request('issues', 'http://test.com/issues');
request.transitivity = 'forceNormal';
request.policy.transitivity = 'deepShallow';
request.response = {
headers: { link: createLinkHeader(request.url, null, 2, 2) }
};
@ -55,26 +56,26 @@ describe('Collection processing', () => {
const push = sinon.spy(request.crawler.queues, 'push');
const processor = new Processor();
processor.collection(request);
processor.process(request);
expect(request.crawler.queues.push.callCount).to.be.equal(1);
expect(push.getCall(0).args[1]).to.be.equal('soon');
const newPages = request.crawler.queues.push.getCall(0).args[0];
expect(newPages.length).to.be.equal(1);
expect(newPages[0].transitivity).to.be.equal('forceNormal');
expect(newPages[0].policy.transitivity).to.be.equal('deepShallow');
expect(newPages[0].url).to.be.equal('http://test.com/issues?page=2&per_page=100');
expect(newPages[0].type).to.be.equal('issues');
expect(request.crawler.queue.callCount).to.be.equal(1);
const newRequest = request.crawler.queue.getCall(0).args[0];
expect(newRequest.transitivity).to.be.equal('forceNormal');
expect(newRequest.policy.transitivity).to.be.equal('deepShallow');
expect(newRequest.url).to.be.equal('http://child1');
expect(newRequest.type).to.be.equal('issue');
});
it('should queue forceNormal root collection as forceNormal and elements as normal', () => {
it('should queue deepShallow root collections as deepShallow and elements as shallow', () => {
const request = new Request('orgs', 'http://test.com/orgs');
request.transitivity = 'forceNormal';
request.policy.transitivity = 'deepShallow';
request.response = {
headers: { link: createLinkHeader(request.url, null, 2, 2) }
};
@ -84,27 +85,27 @@ describe('Collection processing', () => {
const push = sinon.spy(request.crawler.queues, 'push');
const processor = new Processor();
processor.collection(request);
processor.process(request);
expect(push.callCount).to.be.equal(1);
expect(push.getCall(0).args[1]).to.be.equal('soon');
const newPages = push.getCall(0).args[0];
expect(newPages.length).to.be.equal(1);
expect(newPages[0].transitivity).to.be.equal('forceNormal');
expect(newPages[0].policy.transitivity).to.be.equal('deepShallow');
expect(newPages[0].url).to.be.equal('http://test.com/orgs?page=2&per_page=100');
expect(newPages[0].type).to.be.equal('orgs');
expect(request.crawler.queue.callCount).to.be.equal(1);
const newRequest = request.crawler.queue.getCall(0).args[0];
expect(newRequest.transitivity).to.be.equal('normal');
expect(newRequest.policy.transitivity).to.be.equal('shallow');
expect(newRequest.url).to.be.equal('http://child1');
expect(newRequest.type).to.be.equal('org');
});
it('should queue forceForce root collection pages as forceForce and elements as forceNormal', () => {
const request = new Request('orgs', 'http://test.com/orgs');
request.transitivity = 'forceForce';
request.policy = TraversalPolicy.update();
request.response = {
headers: { link: createLinkHeader(request.url, null, 2, 2) }
};
@ -114,26 +115,26 @@ describe('Collection processing', () => {
const push = sinon.spy(request.crawler.queues, 'push');
const processor = new Processor();
processor.collection(request);
processor.process(request);
expect(push.callCount).to.be.equal(1);
expect(push.getCall(0).args[1]).to.be.equal('soon');
const newPages = push.getCall(0).args[0];
expect(newPages.length).to.be.equal(1);
expect(newPages[0].transitivity).to.be.equal('forceForce');
expect(newPages[0].policy.transitivity).to.be.equal('deepDeep');
expect(newPages[0].url).to.be.equal('http://test.com/orgs?page=2&per_page=100');
expect(newPages[0].type).to.be.equal('orgs');
expect(request.crawler.queue.callCount).to.be.equal(1);
const newRequest = request.crawler.queue.getCall(0).args[0];
expect(newRequest.transitivity).to.be.equal('forceNormal');
expect(newRequest.policy.transitivity).to.be.equal('deepShallow');
expect(newRequest.url).to.be.equal('http://child1');
expect(newRequest.type).to.be.equal('org');
});
it('should queue forceForce page elements with forceNormal transitivity', () => {
const request = new Request('orgs', 'http://test.com/orgs?page=2&per_page=100');
request.transitivity = 'forceForce';
request.policy = TraversalPolicy.update();
request.document = { _metadata: { links: {} }, elements: [{ url: 'http://child1' }] };
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
@ -142,7 +143,7 @@ describe('Collection processing', () => {
processor.page(2, request);
expect(request.crawler.queue.callCount).to.be.equal(1);
const newRequest = request.crawler.queue.getCall(0).args[0];
expect(newRequest.transitivity).to.be.equal('forceNormal');
expect(newRequest.policy.transitivity).to.be.equal('deepShallow');
expect(newRequest.url).to.be.equal('http://child1');
expect(newRequest.type).to.be.equal('org');
});
@ -161,13 +162,13 @@ describe('URN building', () => {
expect(request.crawler.queue.callCount).to.be.at.least(4);
const teamsRequest = request.crawler.queue.getCall(1).args[0];
expect(teamsRequest.context.qualifier).to.be.equal('urn:repo:42');
expect(teamsRequest.context.relation).to.be.equal('repo_teams_relation');
expect(teamsRequest.context.relation).to.be.deep.equal({ origin: 'repo', name: 'teams', type: 'team' } );
request.crawler.queue.reset();
teamsRequest.type = 'teams';
teamsRequest.document = { _metadata: { links: {} }, elements: [{ id: 13, url: 'http://team1' }] };
teamsRequest.crawler = request.crawler;
const teamsPage = processor.collection(teamsRequest);
const teamsPage = processor.process(teamsRequest);
const links = teamsPage._metadata.links;
expect(links.teams.type).to.be.equal('self');
expect(links.teams.hrefs.length).to.be.equal(1);

Просмотреть файл

@ -3,146 +3,132 @@ const chai = require('chai');
const expect = require('chai').expect;
const Request = require('../lib/request.js');
const sinon = require('sinon');
const TraversalPolicy = require('../lib/traversalPolicy');
describe('Request transitivity', () => {
it('will not queueRoot if none transitivity', () => {
it('will not queueRoot if documentOnly processing', () => {
const request = new Request('user', 'http://test.com/users/user1');
request.transitivity = 'none';
request.policy.processing = 'documentOnly';
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
request.queueRoot('foo', 'http://');
expect(request.crawler.queue.callCount).to.be.equal(0);
});
it('will not queueRoots if none transitivity', () => {
it('will not queueRoots if documentOnly processing', () => {
const request = new Request('user', 'http://test.com/users/user1');
request.transitivity = 'none';
request.policy.processing = 'documentOnly';
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
request.queueRoots('foo', 'http://');
expect(request.crawler.queue.callCount).to.be.equal(0);
});
it('will not queueChild if none transitivity', () => {
it('will not queueChild if documentOnly processing', () => {
const request = new Request('user', 'http://test.com/users/user1');
request.transitivity = 'none';
request.policy.processing = 'documentOnly';
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
request.queueChild('foo', 'http://');
expect(request.crawler.queue.callCount).to.be.equal(0);
});
it('will not queueChildren if none transitivity', () => {
it('will not queueChildren if documentOnly processing', () => {
const request = new Request('user', 'http://test.com/users/user1');
request.transitivity = 'none';
request.policy.processing = 'documentOnly';
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
request.queueChildren('foo', 'http://');
expect(request.crawler.queue.callCount).to.be.equal(0);
});
it('will queueRoot normal if normal transitivity', () => {
it('will queueRoot shallow if documentAndRelated processing', () => {
const request = new Request('user', 'http://test.com/users/user1');
request.transitivity = 'normal';
request.fetch = 'none';
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
request.queueRoot('foo', 'http://');
expect(request.crawler.queue.callCount).to.be.equal(1);
expect(request.crawler.queue.getCall(0).args[0].transitivity).to.be.equal('normal');
expect(request.crawler.queue.getCall(0).args[0].fetch).to.be.equal('none');
expect(request.crawler.queue.getCall(0).args[0].policy.transitivity).to.be.equal('shallow');
});
it('will not queueRoot if forceNone transitivity', () => {
it('will not queueRoot if documentAndChildren processing', () => {
const request = new Request('user', 'http://test.com/users/user1');
request.transitivity = 'forceNone';
request.policy.processing = 'documentAndChildren';
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
request.queueRoot('foo', 'http://');
expect(request.crawler.queue.callCount).to.be.equal(0);
});
it('will queueRoot normal if forceNormal transitivity', () => {
it('will queueRoot shallow if deepShallow transitivity', () => {
const request = new Request('user', 'http://test.com/users/user1');
request.transitivity = 'forceNormal';
request.transitivity = 'deepShallow';
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
request.queueRoot('foo', 'http://');
expect(request.crawler.queue.callCount).to.be.equal(1);
expect(request.crawler.queue.getCall(0).args[0].transitivity).to.be.equal('normal');
expect(request.crawler.queue.getCall(0).args[0].policy.transitivity).to.be.equal('shallow');
});
it('will queueRoot forceNormal if forceForce transitivity', () => {
it('will queueRoot deepShallow if deepDeep transitivity', () => {
const request = new Request('user', 'http://test.com/users/user1');
request.transitivity = 'forceForce';
request.policy.transitivity = 'deepDeep';
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
request.queueRoot('foo', 'http://');
expect(request.crawler.queue.callCount).to.be.equal(1);
expect(request.crawler.queue.getCall(0).args[0].transitivity).to.be.equal('forceNormal');
expect(request.crawler.queue.getCall(0).args[0].policy.transitivity).to.be.equal('deepShallow');
});
it('queueRoots will not change transitivity and will carry through fetch', () => {
it('queueRoots will not change policy ', () => {
const request = new Request('user', 'http://test.com/users/user1');
request.transitivity = 'forceForce';
request.fetch = 'force';
request.policy = TraversalPolicy.update();
request.document = { _metadata: { links: { self: { href: 'urn:pick:me' } } } };
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
request.queueRoots('foo', 'http://');
expect(request.crawler.queue.callCount).to.be.equal(1);
const newRequest = request.crawler.queue.getCall(0).args[0];
expect(newRequest.transitivity).to.be.equal('forceForce');
expect(newRequest.fetch).to.be.equal('force');
expect(newRequest.policy).to.be.deep.equal(TraversalPolicy.update());
});
it('will not queueChild if none transitivity', () => {
it('will not queueChild if documentOnly processing', () => {
const request = new Request('user', 'http://test.com/users/user1');
request.transitivity = 'none';
request.policy.processing = 'documentOnly';
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
request.queueChild('foo', 'http://');
expect(request.crawler.queue.callCount).to.be.equal(0);
});
it('will queueChild normal if normal transitivity', () => {
it('will queueChild shallow if shallow transitivity', () => {
const request = new Request('user', 'http://test.com/users/user1');
request.transitivity = 'normal';
request.policy.transitivity = 'shallow';
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
request.queueChild('foo', 'http://');
expect(request.crawler.queue.callCount).to.be.equal(1);
expect(request.crawler.queue.getCall(0).args[0].transitivity).to.be.equal('normal');
expect(request.crawler.queue.getCall(0).args[0].policy.transitivity).to.be.equal('shallow');
});
it('will queueChild force if force transitivity', () => {
it('will queueChild deepShallow if deepShallow transitivity', () => {
const request = new Request('user', 'http://test.com/users/user1');
request.transitivity = 'forceNone';
request.policy.transitivity = 'deepShallow';
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
request.queueChild('foo', 'http://');
expect(request.crawler.queue.callCount).to.be.equal(1);
expect(request.crawler.queue.getCall(0).args[0].transitivity).to.be.equal('forceNone');
expect(request.crawler.queue.getCall(0).args[0].policy.transitivity).to.be.equal('deepShallow');
});
it('will queueChild foceNormal if forceNormal transitivity', () => {
it('will queueChild deepShallow if deepDeep transitivity', () => {
const request = new Request('user', 'http://test.com/users/user1');
request.transitivity = 'forceNormal';
request.policy.transitivity = 'deepDeep';
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
request.queueChild('foo', 'http://');
expect(request.crawler.queue.callCount).to.be.equal(1);
expect(request.crawler.queue.getCall(0).args[0].transitivity).to.be.equal('forceNormal');
});
it('will queueChild foceNormal if forceForce transitivity', () => {
const request = new Request('user', 'http://test.com/users/user1');
request.transitivity = 'forceForce';
request.crawler = { queue: () => { } };
sinon.spy(request.crawler, 'queue');
request.queueChild('foo', 'http://');
expect(request.crawler.queue.callCount).to.be.equal(1);
expect(request.crawler.queue.getCall(0).args[0].transitivity).to.be.equal('forceNormal');
expect(request.crawler.queue.getCall(0).args[0].policy.transitivity).to.be.equal('deepShallow');
});
});
@ -165,7 +151,8 @@ describe('Request link management', () => {
it('will add a : to the qualifier', () => {
const request = new Request('foo', 'http://test');
request.document = { id: 4, _metadata: { links: {} } };
request.addSelfLink('id', 'test');
request.context.qualifier = 'test';
request.addSelfLink();
expect(request.document._metadata.links.self.href.startsWith('test:foo'));
});
});