Merge pull request #7 from Microsoft/jm/etag

Add etag support
This commit is contained in:
Jeff McAffer 2016-11-10 21:34:21 -08:00 коммит произвёл GitHub
Родитель a6b6d33e1e e242c1aaa2
Коммит 0f0ea65c09
3 изменённых файлов: 390 добавлений и 217 удалений

5
.vscode/settings.json поставляемый
Просмотреть файл

@ -2,5 +2,8 @@
{
"jshint.options": {
"esnext": true
}
},
"editor.folding": false,
"editor.tabSize": 2,
"editor.detectIndentation": false
}

Просмотреть файл

@ -1,15 +1,17 @@
const extend = require('extend');
const moment = require('moment');
const parse = require('parse-link-header');
const Q = require('q');
const url = require('url');
const URL = require('url');
const collections = {
orgs: 'org', repos: 'repo', issues: 'issue', issue_comments: 'issue_comment', commits: 'commit', teams: 'team'
orgs: 'org', repos: 'repo', issues: 'issue', issue_comments: 'issue_comment', commits: 'commit', teams: 'team', users: 'user'
};
class Crawler {
constructor(queue, store, requestor, config, logger) {
this.seen = {};
constructor(queue, priorityQueue, store, requestor, config, logger) {
this.queue = queue;
this.priorityQueue = priorityQueue;
this.store = store;
this.requestor = requestor;
this.config = config;
@ -17,16 +19,29 @@ class Crawler {
}
start() {
return this.queue.pop()
return this._pop(this.priorityQueue)
.then(this._pop.bind(this, this.queue))
.then(this._trackStart.bind(this))
.then(this._filter.bind(this))
.then(this._fetch.bind(this))
.then(this._convertToDocument.bind(this))
.then(this._processDocument.bind(this))
.then(this._storeDocument.bind(this))
.then(this._deleteFromQueue.bind(this))
.then(this._markSeen.bind(this))
.then(this._logOutcome.bind(this))
.then(this._startNext.bind(this));
.then(this._startNext.bind(this))
.catch(error => {
this.logger.log('error', `${error.message}`);
});
}
_pop(queue, request = null) {
return request ? Q(request) : queue.pop();
}
_trackStart(request) {
request.start = Date.now();
return Q(request);
}
_startNext() {
@ -34,9 +49,7 @@ class Crawler {
}
_filter(request) {
if (this.seen[request.url]) {
this._markSkip(request, 'Seen');
} else if (this._configFilter(request.type, request.url)) {
if (this._configFilter(request.type, request.url)) {
this._markSkip(request, 'Filtered');
}
return Q.resolve(request);
@ -46,18 +59,49 @@ class Crawler {
if (request.skip) {
return Q.resolve(request);
}
// rewrite the request type for collections remember the collection subType
// Also setup 'page' as the document type to look up for etags etc.
let fetchType = request.type;
let subType = collections[request.type];
if (subType) {
request.type = 'collection';
request.subType = subType;
fetchType = 'page';
}
const self = this;
return this.store.etag(fetchType, request.url).then(etag => {
const options = etag ? { headers: { 'If-None-Match': etag } } : {};
const start = Date.now();
return self.requestor.get(request.url, options).then(githubResponse => {
const status = githubResponse.statusCode;
this._addMeta(request, { status: status, fetch: Date.now() - start });
if (status !== 200 && status !== 304) {
self._markSkip(request, 'Error', `Code: ${status} for: ${request.url}`);
return request;
}
const getCollection = collections.hasOwnProperty(request.type);
const getFunction = getCollection ? this.requestor.getAll : this.requestor.get;
return getFunction.call(this.requestor, request.url)
.then(githubResponse => {
request.response = getCollection ? githubResponse : githubResponse.body;
if (status === 304 && githubResponse.headers.etag === etag) {
// We have the content for this element. If it is immutable, skip.
// Otherwise get it from the store and process.
if (!request.force) {
return self._markSkip(request, 'Unmodified');
}
return self.store.get(fetchType, request.url).then(document => {
request.document = document;
request.response = githubResponse;
// Our store is up to date so don't store the document again
request.store = false;
return request;
});
}
request.document = githubResponse.body;
request.response = githubResponse;
return request;
})
.catch(error => {
// TODO retryable vs non-retryable
return this._markSkip(request, 'Error', error.message);
});
}).catch(error => {
// TODO can this request be requeued?
return this._markSkip(request, 'Error', error.message);
});
}
_convertToDocument(request) {
@ -65,13 +109,18 @@ class Crawler {
return Q.resolve(request);
}
request.response._metadata = {
// If the doc is an array, wrap it in an object to make storage more consistent (Mongo can't store arrays directly)
if (Array.isArray(request.document)) {
request.document = { elements: request.document };
}
request.document._metadata = {
type: request.type,
url: request.url,
etag: request.response.headers.etag,
fetchedAt: moment.utc().toISOString(),
links: {}
};
request.promises = [];
return Q.resolve(request);
}
@ -79,23 +128,19 @@ class Crawler {
if (request.skip) {
return Q.resolve(request);
}
let document = null;
if (collections.hasOwnProperty(request.type)) {
document = this._processCollection(request.response, collections[request.type], request.context);
} else {
const handler = this[request.type];
if (handler && typeof handler === 'function') {
document = handler.call(this, request.response, request.context);
} else {
// TODO log something saying we did not know how to handle the type
}
const handler = this[request.type];
if (!handler) {
this._markSkip(request, 'Error', `No handler found for request type: ${request.type}`);
return request;
}
request.document = document;
request.document = handler.call(this, request);
return Q.resolve(request);
}
_storeDocument(request) {
if (request.skip || !this.store || !request.document) {
// See if we should skip storing the document. Test request.store explicitly for false as it may just not be set.
if (request.skip || !this.store || !request.document || request.store === false) {
return Q.resolve(request);
}
@ -105,83 +150,107 @@ class Crawler {
});
}
_queue(type, url, context) {
if (this._configFilter(type, url)) {
this.logger.log('info', `Skipped queuing ${type} [${url}]`);
} else {
this.queue.push(type, url, context);
}
}
_deleteFromQueue(request) {
if (!request.message) {
return Q.resolve(request);
}
return this.queue.done(request).then(() => { return request; });
}
_markSeen(request) {
// TODO retryable vs non-retryable and re-queue
this.seen[request.url] = true;
return Q.resolve(request);
}
_logOutcome(request) {
const outcome = request.outcome ? request.outcome : 'Processed';
const message = request.message;
this.logger.log('info', `${outcome} ${request.type} [${request.url}] ${message || ''}`);
this._addMeta(request, { total: Date.now() - request.start });
this.logger.log('info', `${outcome} ${request.type} [${request.url}] ${message || ''}`, request.meta);
return request;
}
_addMeta(request, data) {
request.meta = extend({}, request.meta, data);
return request;
}
// =============== Entity Processors ============
_processCollection(document, type, context) {
document.forEach(item => {
this._queue(type, item.url, context);
collection(request) {
// if there are additional pages, queue them up to be processed. Note that these go
// on the high priority queue so they are loaded before they change much.
const linkHeader = request.response.headers.link;
if (linkHeader) {
const links = parse(linkHeader);
for (let i = 2; i <= links.last.page; i++) {
const url = request.url + `?page=${i}&per_page=100`;
const context = { qualifier: request.context.qualifier };
this._queueBase(request, { type: 'page', url: url, subType: request.subType, page: i, force: request.force, context: context }, this.priorityQueue);
}
}
// Rewrite the request and document to be a 'page' and then process.
request.page = 1;
request.document._metadata.type = 'page';
return this.page(request);
}
page(request) {
const document = request.document;
const type = request.subType;
const first = document.elements[0];
const qualifier = request.context.qualifier;
this._linkSelf(request, 'self', `${qualifier}:${type}:pages:${request.page}`);
document.elements.forEach(item => {
this._queueChild(request, type, item.url, qualifier);
});
return null;
}
org(document) {
document._metadata.links.self = { type: 'self', href: `urn:org:${document.id}` };
document._metadata.links.repos = { type: 'siblings', href: `urn:org:${document.id}:repos` };
document._metadata.links.siblings = { type: 'siblings', href: 'urn:org' };
this._queue('repos', document.repos_url);
return document;
}
user(document) {
document._metadata.links.self = { type: 'self', href: `urn:user:${document.id}` };
document._metadata.links.repos = { type: 'siblings', href: `urn:user:${document.id}:repos` };
document._metadata.links.siblings = { type: 'siblings', href: 'urn:user' };
this._queue('repos', document.repos_url);
org(request) {
const document = request.document;
this._addSelfLink(request, 'urn:');
this._linkSiblings(request, 'repos', `urn:org:${document.id}:repos`);
this._linkSiblings(request, 'siblings', 'urn:org');
this._queueChildren(request, 'repos', document.repos_url);
// TODO is this "logins"
this._queueChildren(request, 'users', document.members_url.replace('{/member}', ''));
return document;
}
repo(document) {
document._metadata.links.self = { type: 'self', href: `urn:repo:${document.id}` };
document._metadata.links.owner = { type: 'self', href: `urn:login:${document.owner.id}` };
document._metadata.links.parent = { type: 'self', href: `urn:login:${document.owner.id}` };
document._metadata.links.siblings = { type: 'siblings', href: `urn:login:${document.owner.id}:repos` };
this._queue('login', document.owner.url);
this._queue('issues', document.issues_url.replace('{/number}', ''), { repo: document.id });
this._queue('commits', document.commits_url.replace('{/sha}', ''), { repo: document.id });
user(request) {
const document = request.document;
this._addSelfLink(request, 'urn:');
this._linkSiblings(request, 'repos', `urn:user:${document.id}:repos`);
this._linkSiblings(request, 'siblings', 'urn:user');
this._queueChildren(request, 'repos', document.repos_url);
return document;
}
commit(document, context) {
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:commit:${document.sha}` };
document._metadata.links.siblings = { type: 'siblings', href: `urn:repo:${context.repo}:commits` };
document._metadata.links.repo = { type: 'self', href: `urn:repo:${context.repo}` };
document._metadata.links.parent = document._metadata.links.parent;
repo(request) {
const document = request.document;
this._addSelfLink(request, 'urn:');
this._linkSelf(request, 'owner', `urn:login:${document.owner.id}`);
this._linkSelf(request, 'parent', `urn:login:${document.owner.id}`);
this._linkSiblings(request, 'siblings', `urn:login:${document.owner.id}:repos`);
this._queueRoot(request, 'login', document.owner.url);
this._queueChildren(request, 'issues', document.issues_url.replace('{/number}', ''), { repo: document.id });
this._queueChildren(request, 'commits', document.commits_url.replace('{/sha}', ''), { repo: document.id });
return document;
}
commit(request) {
const document = request.document;
const context = request.context;
this._addSelfLink(request, null, 'sha');
this._linkSelf(request, 'repo', `urn:repo:${context.repo}`);
this._linkSiblings(request, 'siblings', `urn:repo:${context.repo}:commits`);
// TODO not sure what the following line does
// document._metadata.links.parent = document._metadata.links.parent;
if (document.author) {
document._metadata.links.author = { type: 'self', href: `urn:login:${document.author.id}` };
this._queue('login', document.author.url);
this._linkSelf(request, 'author', `urn:login:${document.author.id}`);
this._queueRoot(request, 'login', document.author.url);
}
if (document.committer) {
document._metadata.links.committer = { type: 'self', href: `urn:login:${document.committer.id}` };
this._queue('login', document.committer.url);
this._linkSelf(request, 'committer', `urn:login:${document.committer.id}`);
this._queueRoot(request, 'login', document.committer.url);
}
if (document.files) {
document.files.forEach(file => {
@ -191,231 +260,323 @@ class Crawler {
return document;
}
login(document) {
document._metadata.links.self = { type: 'self', href: `urn:login:${document.id}` };
login(request) {
const document = request.document;
this._addSelfLink(request, 'urn:');
this._linkSelf(request, 'self', `urn:login:${document.id}`);
// TODO should we do repos here and in the user/org?
this._linkSiblings(request, 'repos', `urn:login:${document.id}:repos`);
this._linkSiblings(request, 'siblings', 'urn:login');
if (document.type === 'Organization') {
this._queueRoot(request, 'org', `https://api.github.com/orgs/${document.login}`);
} else if (document.type === 'User') {
this._queueRoot(request, 'user', `https://api.github.com/users/${document.login}`);
}
this._queueChildren(request, 'repos', document.repos_url);
return document;
}
issue(document, context) {
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:issue:${document.id}` };
document._metadata.links.siblings = { type: 'siblings', href: `urn:repo:${context.repo}:issues` };
document._metadata.links.assignees = { type: 'self', hrefs: document.assignees.map(assignee => { return `urn:login:${assignee.id}` }) };
document._metadata.links.repo = { type: 'self', href: `urn:repo:${context.repo}` };
document._metadata.links.parent = document._metadata.links.repo;
document._metadata.links.user = { type: 'self', href: `urn:login:${document.user.id}` };
this._queue('login', document.user.url);
issue(request) {
const document = request.document;
const context = request.context;
this._addSelfLink(request);
this._linkSelf(request, 'assignees', document.assignees.map(assignee => { return `urn:login:${assignee.id}`; }));
this._linkSelf(request, 'repo', `urn:repo:${context.repo}`);
this._linkSelf(request, 'parent', `urn:repo:${context.repo}`);
this._linkSelf(request, 'user', `urn:login:${document.user.id}`);
this._linkSiblings(request, 'siblings', `urn:repo:${context.repo}:issues`);
this._queueRoot(request, 'login', document.user.url);
if (document.assignee) {
document._metadata.links.assignee = { type: 'self', href: `urn:login:${document.assignee.id}` };
this._queue('login', document.assignee.url);
this._linkSelf(request, 'assignee', `urn:login:${document.assignee.id}`);
this._queueRoot(request, 'login', document.assignee.url);
}
if (document.closed_by) {
document._metadata.links.closed_by = { type: 'self', href: `urn:login:${document.closed_by.id}` };
this._queue('login', document.closed_by.url);
this._linkSelf(request, 'closed_by', `urn:login:${document.closed_by.id}`);
this._queueRoot(request, 'login', document.closed_by.url);
}
// milestone
// pull request
// events
// labels
this._queue('issue_comments', document.comments_url, { issue: document.id, repo: context.repo });
this._queueChildren(request, 'issue_comments', document.comments_url, { issue: document.id, repo: context.repo });
return document;
}
issue_comment(document, context) {
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:issue_comment:${document.id}` };
document._metadata.links.user = { type: 'self', href: `urn:login:${document.user.id}` };
document._metadata.links.siblings = { type: 'siblings', href: `urn:repo:${context.repo}:issue:${context.issue}:comments` };
this._queue('login', document.user.url);
issue_comment(request) {
const document = request.document;
const context = request.context;
this._addSelfLink(request);
this._linkSelf(request, 'user', `urn:login:${document.user.id}`);
this._linkSiblings(request, 'siblings', `urn:repo:${context.repo}:issue:${context.issue}:comments`);
this._queue(request, 'login', document.user.url);
return document;
}
team(document, context) {
document._metadata.links.self = { type: 'self', href: `urn:org:${document.organization.id}:team:${document.id}` };
document._metadata.links.org = { type: 'self', href: `urn:org:${document.organization.id}` };
document._metadata.links.login = { type: 'self', href: `urn:login:${document.organization.id}` };
document._metadata.links.siblings = { type: 'siblings', href: `urn:org:${document.organization.id}:teams` };
this._queue('team_members', document.members_url);
this._queue('team_repos', document.repositories_url);
team(request) {
const document = request.document;
this._addSelfLink(request, `urn:org:${document.organization.id}`);
this._linkSelf(request, 'org', `urn:org:${document.organization.id}`);
this._linkSelf(request, 'login', `urn:login:${document.organization.id}`);
this._linkSiblings(request, 'siblings', `urn:org:${document.organization.id}:teams`);
this._queueChildren(request, 'team_members', document.members_url);
this._queueChildren(request, 'team_repos', document.repositories_url);
return document;
}
team_members(document, context) {
document._metadata.links.self = { type: 'self', href: `urn:org:${document.organization.id}:team:${document.id}:members` };
team_members(request) {
const document = request.document;
this._addSelfLink(request, `urn:org:${document.organization.id}`);
return document;
}
team_repos(document, context) {
document._metadata.links.self = { type: 'self', href: `urn:org:${document.organization.id}:team:${document.id}:repos` };
team_repos(request) {
this._addSelfLink(request, `urn:org:${document.organization.id}`);
return document;
}
// =============== Event Processors ============
CommitCommentEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:commit_comment_event:${document.id}` };
document._metadata.links.comment = { type: 'self', href: `urn:repo:${context.repo}:comment:${payload.comment.id}` };
CommitCommentEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(request);
this._linkSelf(request, 'comment', `urn:repo:${context.repo}:comment:${payload.comment.id}`);
// TODO siblings?
this._queue('comment', payload.comment.url);
this._queue(request, 'comment', payload.comment.url);
return document;
}
CreateEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:create_event:${document.id}` };
CreateEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
return document;
}
DeleteEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:delete_event:${document.id}` };
DeleteEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
// TODO do something for interesting deletions e.g., where ref-type === 'repository'
return document;
}
DeploymentEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:deployment_event:${document.id}` };
document._metadata.links.deployment = { type: 'self', href: `urn:repo:${context.repo}:deployment:${payload.deployment.id}` };
this._queue('deployment', payload.deployment.url);
DeploymentEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
this._linkSelf(request, 'deployment', `urn:repo:${context.repo}:deployment:${payload.deployment.id}`);
this._queue(request, 'deployment', payload.deployment.url);
return document;
}
DeploymentStatusEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:deployment_status_event:${document.id}` };
document._metadata.links.deployment_status = { type: 'self', href: `urn:repo:${context.repo}:deployment:${payload.deployment.id}:status:${payload.deployment_status.id}` };
document._metadata.links.deployment = { type: 'self', href: `urn:repo:${context.repo}:deployment:${payload.deployment.id}` };
this._queue('deployment', payload.deployment.url);
DeploymentStatusEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
this._linkSelf(request, 'deployment_status', `urn:repo:${context.repo}:deployment:${payload.deployment.id}:status:${payload.deployment_status.id}`);
this._linkSelf(request, 'deployment', `urn:repo:${context.repo}:deployment:${payload.deployment.id}`);
this._queue(request, 'deployment', payload.deployment.url);
return document;
}
ForkEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:fork_event:${document.id}` };
ForkEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
// TODO figure out what else to do
return document;
}
GollumEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:gollum_event:${document.id}` };
GollumEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
return document;
}
IssueCommentEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:issue_comment_event:${document.id}` };
document._metadata.links.issue = { type: 'self', href: `urn:repo:${context.repo}:issue:${payload.issue.id}` };
document._metadata.links.comment = { type: 'self', href: `urn:repo:${context.repo}:comment:${payload.comment.id}` };
this._queue('comment', payload.comment.url);
this._queue('issue', payload.issue.url);
IssueCommentEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
this._linkSelf(request, 'issue', `urn:repo:${context.repo}:issue:${payload.issue.id}`);
this._linkSelf(request, 'comment', `urn:repo:${context.repo}:comment:${payload.comment.id}`);
this._queue(request, 'comment', payload.comment.url);
this._queue(request, 'issue', payload.issue.url);
return document;
}
IssuesEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:issued_event:${document.id}` };
document._metadata.links.issue = { type: 'self', href: `urn:repo:${context.repo}:issue:${payload.issue.id}` };
this._queue('issue', payload.issue.url);
IssuesEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
this._linkSelf(request, 'issue', `urn:repo:${context.repo}:issue:${payload.issue.id}`);
this._queue(request, 'issue', payload.issue.url);
return document;
}
LabelEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:label_event:${document.id}` };
LabelEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
return document;
}
MemberEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:member_event:${document.id}` };
document._metadata.links.member = { type: 'self', href: `urn:login:${payload.member.id}` };
this._queue('login', payload.member.url);
MemberEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
this._linkSelf(request, 'member', `urn:login:${payload.member.id}`);
this._queueRoot(request, 'login', payload.member.url);
return document;
}
MembershipEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:team:${payload.team.id}:membership_event:${document.id}` };
document._metadata.links.member = { type: 'self', href: `urn:login:${payload.member.id}` };
document._metadata.links.team = { type: 'self', href: `urn:team:${payload.team.id}` };
document._metadata.links.org = { type: 'self', href: `urn:org:${payload.organization.id}` };
this._queue('login', payload.member.url);
this._queue('org', payload.organization.url);
this._queue('team', payload.team.url);
MembershipEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
this._linkSelf(request, 'self', `urn:team:${payload.team.id}:membership_event:${document.id}`);
this._linkSelf(request, 'member', `urn:login:${payload.member.id}`);
this._linkSelf(request, 'team', `urn:team:${payload.team.id}`);
this._linkSelf(request, 'org', `urn:org:${payload.organization.id}`);
this._queueRoot(request, 'login', payload.member.url);
this._queueRoot(request, 'org', payload.organization.url);
this._queue(request, 'team', payload.team.url);
return document;
}
MilestoneEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:milestone_event:${document.id}` };
document._metadata.links.milestone = { type: 'self', href: `urn:repo:${context.repo}:milestone:${payload.milestone.id}` };
this._queue('milestone', payload.milestone.url);
MilestoneEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
this._linkSelf(request, 'milestone', `urn:repo:${context.repo}:milestone:${payload.milestone.id}`);
this._queue(request, 'milestone', payload.milestone.url);
return document;
}
PageBuildEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:page_build_event:${document.id}` };
document._metadata.links.page_build = { type: 'self', href: `urn:repo:${context.repo}:page_builds:${payload.id}` };
this._queue('page_build', payload.build.url);
PageBuildEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
this._linkSelf(request, 'page_build', `urn:repo:${context.repo}:page_builds:${payload.id}`);
this._queue(request, 'page_build', payload.build.url);
return document;
}
PublicEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:public_event:${document.id}` };
PublicEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
return document;
}
PullRequestEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:pull_request_event:${document.id}` };
document._metadata.links.pull = { type: 'self', href: `urn:repo:${context.repo}:pull:${payload.pull_request.id}` };
this._queue('pull', payload.pull_request.url);
PullRequestEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
this._linkSelf(request, 'pull', `urn:repo:${context.repo}:pull:${payload.pull_request.id}`);
this._queue(request, 'pull', payload.pull_request.url);
return document;
}
PullRequestReviewEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:pull_request_review_event:${document.id}` };
document._metadata.links.review = { type: 'self', href: `urn:repo:${context.repo}:pull:${payload.pull_request.id}:review:${payload.review.id}` };
document._metadata.links.pull = { type: 'self', href: `urn:repo:${context.repo}:pull:${payload.pull_request.id}` };
this._queue('pull_review', payload.pull_request.review_comment_url.replace('{/number}', `/${payload.review.id}`));
this._queue('pull', payload.pull_request.url);
PullRequestReviewEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
this._linkSelf(request, 'review', `urn:repo:${context.repo}:pull:${payload.pull_request.id}:review:${payload.review.id}`);
this._linkSelf(request, 'pull', `urn:repo:${context.repo}:pull:${payload.pull_request.id}`);
this._queue(request, 'pull_review', payload.pull_request.review_comment_url.replace('{/number}', `/${payload.review.id}`));
this._queue(request, 'pull', payload.pull_request.url);
return document;
}
PullRequestReviewCommentEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:pull_request_review_comment_event:${document.id}` };
document._metadata.links.comment = { type: 'self', href: `urn:repo:${context.repo}:pull:${payload.pull_request.id}:comment:${payload.comment.id}` };
document._metadata.links.pull = { type: 'self', href: `urn:repo:${context.repo}:pull:${payload.pull_request.id}` };
PullRequestReviewCommentEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
this._linkSelf(request, 'comment', `urn:repo:${context.repo}:pull:${payload.pull_request.id}:comment:${payload.comment.id}`);
this._linkSelf(request, 'pull', `urn:repo:${context.repo}:pull:${payload.pull_request.id}`);
// TODO see if all the various comments can be the same type
this._queue('pull_comment', payload.comment.url);
this._queue('pull', payload.pull_request.url);
this._queue(request, 'pull_comment', payload.comment.url);
this._queue(request, 'pull', payload.pull_request.url);
return document;
}
PushEvent(document, context) {
const payload = _eventHelper(document);
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:push_event:${document.id}` };
PushEvent(request) {
const document = request.document;
const context = request.context;
const payload = this._eventHelper(document);
// TODO figure out what to do with the commits
return document;
}
// =============== Helpers ============
_addSelfLink(request, base = null, key = 'id') {
let qualifier = base ? base : request.context.qualifier;
qualifier = qualifier.endsWith(':') ? qualifier : qualifier + ':';
this._linkSelf(request, 'self', `${qualifier}${request.type}:${request.document[key]}`);
}
_linkSelf(request, name, value) {
const links = request.document._metadata.links;
const key = Array.isArray(value) ? 'hrefs' : 'href';
links[name] = { type: 'self' };
links[name][key] = value;
}
_linkSiblings(request, name, href) {
const links = request.document._metadata.links;
links[name] = { type: 'siblings', href: href };
}
_queue(request, type, url, context, queue = null) {
const newRequest = { type: type, url: url };
newRequest.context = context;
this._queueBase(request, newRequest, queue);
}
_queueRoot(request, type, url) {
this._queueBase(request, { type: type, url: url });
}
_queueChild(request, type, url, qualifier) {
const newRequest = { type: type, url: url };
newRequest.context = request.context || {};
newRequest.context.qualifier = qualifier;
if (request.force) {
newRequest.force = request.force;
}
this._queueBase(request, newRequest);
}
_queueChildren(request, type, url, context = null) {
const newRequest = { type: type, url: url };
const newContext = extend(request.context || {}, context);
newRequest.context = newContext;
newContext.qualifier = request.document._metadata.links.self.href;
if (request.force) {
newRequest.force = request.force;
}
this._queueBase(request, newRequest);
}
// TODO make a queue all and add promises (then) to the code below
_queueBase(request, newRequest, queue = null) {
if (this._configFilter(newRequest.type, newRequest.url)) {
this.logger.log('info', `Skipped queuing ${newRequest.type} [${newRequest.url}]`);
return;
}
queue = queue || this.queue;
request.promises.push(queue.push(newRequest));
}
_configFilter(type, target) {
if (!this.config.orgFilter) {
return false;
}
if (type === 'repo' || type === 'org') {
const parsed = url.parse(target);
if (type === 'repo' || type === 'repos' || type === 'org') {
const parsed = URL.parse(target);
const org = parsed.path.split('/')[2];
return !this.config.orgFilter.has(org.toLowerCase());
}
@ -429,16 +590,24 @@ class Crawler {
return request;
}
_eventHelper(document) {
_eventHelper(request, references) {
const document = request.document;
// TODO understand if the actor is typically the same as the creator or pusher in the payload
document._metadata.links.actor = { type: 'self', href: `urn:login:${document.actor.id}` };
document._metadata.links.repo = { type: 'self', href: `urn:repo:${document.repo.id}` };
document._metadata.links.org = { type: 'self', href: `urn:org:${document.org.id}` };
this._queue('login', document.actor.url);
this._queue('repo', document.repo.url);
this._queue('org', document.org.url);
const repo = document.repo ? document.repo.id : null;
const urn = repo ? `urn:repo:${repo}` : `urn:org:${document.org.id}`;
this._linkSelf(request, 'self', `${urn}:${request.type}:${document.id}`);
this._linkSelf(request, 'actor', `urn:login:${document.actor.id}`);
this._linkSelf(request, 'repo', `urn:repo:${document.repo.id}`);
this._linkSelf(request, 'org', `urn:org:${document.org.id}`);
this._queueRoot(request, 'login', document.actor.url);
this._queueRoot(request, 'repo', document.repo.url);
this._queueRoot(request, 'org', document.org.url);
return document.payload;
}
_isCollectionRequest(request) {
return collections.hasOwnProperty(request.type);
}
}
module.exports = Crawler;

Просмотреть файл

@ -24,6 +24,7 @@
},
"dependencies": {
"moment": "2.15.2",
"parse-link-header": "^0.4.1",
"q": "1.4.1",
"qlimit": "^0.1.1"
},