зеркало из https://github.com/microsoft/ghcrawler.git
Коммит
0f0ea65c09
|
@ -2,5 +2,8 @@
|
|||
{
|
||||
"jshint.options": {
|
||||
"esnext": true
|
||||
}
|
||||
},
|
||||
"editor.folding": false,
|
||||
"editor.tabSize": 2,
|
||||
"editor.detectIndentation": false
|
||||
}
|
601
lib/crawler.js
601
lib/crawler.js
|
@ -1,15 +1,17 @@
|
|||
const extend = require('extend');
|
||||
const moment = require('moment');
|
||||
const parse = require('parse-link-header');
|
||||
const Q = require('q');
|
||||
const url = require('url');
|
||||
const URL = require('url');
|
||||
|
||||
const collections = {
|
||||
orgs: 'org', repos: 'repo', issues: 'issue', issue_comments: 'issue_comment', commits: 'commit', teams: 'team'
|
||||
orgs: 'org', repos: 'repo', issues: 'issue', issue_comments: 'issue_comment', commits: 'commit', teams: 'team', users: 'user'
|
||||
};
|
||||
|
||||
class Crawler {
|
||||
constructor(queue, store, requestor, config, logger) {
|
||||
this.seen = {};
|
||||
constructor(queue, priorityQueue, store, requestor, config, logger) {
|
||||
this.queue = queue;
|
||||
this.priorityQueue = priorityQueue;
|
||||
this.store = store;
|
||||
this.requestor = requestor;
|
||||
this.config = config;
|
||||
|
@ -17,16 +19,29 @@ class Crawler {
|
|||
}
|
||||
|
||||
start() {
|
||||
return this.queue.pop()
|
||||
return this._pop(this.priorityQueue)
|
||||
.then(this._pop.bind(this, this.queue))
|
||||
.then(this._trackStart.bind(this))
|
||||
.then(this._filter.bind(this))
|
||||
.then(this._fetch.bind(this))
|
||||
.then(this._convertToDocument.bind(this))
|
||||
.then(this._processDocument.bind(this))
|
||||
.then(this._storeDocument.bind(this))
|
||||
.then(this._deleteFromQueue.bind(this))
|
||||
.then(this._markSeen.bind(this))
|
||||
.then(this._logOutcome.bind(this))
|
||||
.then(this._startNext.bind(this));
|
||||
.then(this._startNext.bind(this))
|
||||
.catch(error => {
|
||||
this.logger.log('error', `${error.message}`);
|
||||
});
|
||||
}
|
||||
|
||||
_pop(queue, request = null) {
|
||||
return request ? Q(request) : queue.pop();
|
||||
}
|
||||
|
||||
_trackStart(request) {
|
||||
request.start = Date.now();
|
||||
return Q(request);
|
||||
}
|
||||
|
||||
_startNext() {
|
||||
|
@ -34,9 +49,7 @@ class Crawler {
|
|||
}
|
||||
|
||||
_filter(request) {
|
||||
if (this.seen[request.url]) {
|
||||
this._markSkip(request, 'Seen');
|
||||
} else if (this._configFilter(request.type, request.url)) {
|
||||
if (this._configFilter(request.type, request.url)) {
|
||||
this._markSkip(request, 'Filtered');
|
||||
}
|
||||
return Q.resolve(request);
|
||||
|
@ -46,18 +59,49 @@ class Crawler {
|
|||
if (request.skip) {
|
||||
return Q.resolve(request);
|
||||
}
|
||||
// Rewrite the request type for collections and remember the collection subType.
|
||||
// Also setup 'page' as the document type to look up for etags etc.
|
||||
let fetchType = request.type;
|
||||
let subType = collections[request.type];
|
||||
if (subType) {
|
||||
request.type = 'collection';
|
||||
request.subType = subType;
|
||||
fetchType = 'page';
|
||||
}
|
||||
const self = this;
|
||||
return this.store.etag(fetchType, request.url).then(etag => {
|
||||
const options = etag ? { headers: { 'If-None-Match': etag } } : {};
|
||||
const start = Date.now();
|
||||
return self.requestor.get(request.url, options).then(githubResponse => {
|
||||
const status = githubResponse.statusCode;
|
||||
this._addMeta(request, { status: status, fetch: Date.now() - start });
|
||||
if (status !== 200 && status !== 304) {
|
||||
self._markSkip(request, 'Error', `Code: ${status} for: ${request.url}`);
|
||||
return request;
|
||||
}
|
||||
|
||||
const getCollection = collections.hasOwnProperty(request.type);
|
||||
const getFunction = getCollection ? this.requestor.getAll : this.requestor.get;
|
||||
return getFunction.call(this.requestor, request.url)
|
||||
.then(githubResponse => {
|
||||
request.response = getCollection ? githubResponse : githubResponse.body;
|
||||
if (status === 304 && githubResponse.headers.etag === etag) {
|
||||
// We have the content for this element. If it is immutable, skip.
|
||||
// Otherwise get it from the store and process.
|
||||
if (!request.force) {
|
||||
return self._markSkip(request, 'Unmodified');
|
||||
}
|
||||
return self.store.get(fetchType, request.url).then(document => {
|
||||
request.document = document;
|
||||
request.response = githubResponse;
|
||||
// Our store is up to date so don't store the document again.
|
||||
request.store = false;
|
||||
return request;
|
||||
});
|
||||
}
|
||||
request.document = githubResponse.body;
|
||||
request.response = githubResponse;
|
||||
return request;
|
||||
})
|
||||
.catch(error => {
|
||||
// TODO retryable vs non-retryable
|
||||
return this._markSkip(request, 'Error', error.message);
|
||||
});
|
||||
}).catch(error => {
|
||||
// TODO can this request be requeued?
|
||||
return this._markSkip(request, 'Error', error.message);
|
||||
});
|
||||
}
|
||||
|
||||
_convertToDocument(request) {
|
||||
|
@ -65,13 +109,18 @@ class Crawler {
|
|||
return Q.resolve(request);
|
||||
}
|
||||
|
||||
request.response._metadata = {
|
||||
// If the doc is an array, wrap it in an object to make storage more consistent (Mongo can't store arrays directly)
|
||||
if (Array.isArray(request.document)) {
|
||||
request.document = { elements: request.document };
|
||||
}
|
||||
request.document._metadata = {
|
||||
type: request.type,
|
||||
url: request.url,
|
||||
etag: request.response.headers.etag,
|
||||
fetchedAt: moment.utc().toISOString(),
|
||||
links: {}
|
||||
};
|
||||
|
||||
request.promises = [];
|
||||
return Q.resolve(request);
|
||||
}
|
||||
|
||||
|
@ -79,23 +128,19 @@ class Crawler {
|
|||
if (request.skip) {
|
||||
return Q.resolve(request);
|
||||
}
|
||||
let document = null;
|
||||
if (collections.hasOwnProperty(request.type)) {
|
||||
document = this._processCollection(request.response, collections[request.type], request.context);
|
||||
} else {
|
||||
const handler = this[request.type];
|
||||
if (handler && typeof handler === 'function') {
|
||||
document = handler.call(this, request.response, request.context);
|
||||
} else {
|
||||
// TODO log something saying we did not know how to handle the type
|
||||
}
|
||||
const handler = this[request.type];
|
||||
if (!handler) {
|
||||
this._markSkip(request, 'Error', `No handler found for request type: ${request.type}`);
|
||||
return request;
|
||||
}
|
||||
request.document = document;
|
||||
|
||||
request.document = handler.call(this, request);
|
||||
return Q.resolve(request);
|
||||
}
|
||||
|
||||
_storeDocument(request) {
|
||||
if (request.skip || !this.store || !request.document) {
|
||||
// See if we should skip storing the document. Test request.store explicitly for false as it may just not be set.
|
||||
if (request.skip || !this.store || !request.document || request.store === false) {
|
||||
return Q.resolve(request);
|
||||
}
|
||||
|
||||
|
@ -105,83 +150,107 @@ class Crawler {
|
|||
});
|
||||
}
|
||||
|
||||
_queue(type, url, context) {
|
||||
if (this._configFilter(type, url)) {
|
||||
this.logger.log('info', `Skipped queuing ${type} [${url}]`);
|
||||
} else {
|
||||
this.queue.push(type, url, context);
|
||||
}
|
||||
}
|
||||
|
||||
_deleteFromQueue(request) {
|
||||
if (!request.message) {
|
||||
return Q.resolve(request);
|
||||
}
|
||||
|
||||
return this.queue.done(request).then(() => { return request; });
|
||||
}
|
||||
|
||||
_markSeen(request) {
|
||||
// TODO retryable vs non-retryable and re-queue
|
||||
this.seen[request.url] = true;
|
||||
return Q.resolve(request);
|
||||
}
|
||||
|
||||
_logOutcome(request) {
|
||||
const outcome = request.outcome ? request.outcome : 'Processed';
|
||||
const message = request.message;
|
||||
this.logger.log('info', `${outcome} ${request.type} [${request.url}] ${message || ''}`);
|
||||
this._addMeta(request, { total: Date.now() - request.start });
|
||||
this.logger.log('info', `${outcome} ${request.type} [${request.url}] ${message || ''}`, request.meta);
|
||||
return request;
|
||||
}
|
||||
|
||||
_addMeta(request, data) {
|
||||
request.meta = extend({}, request.meta, data);
|
||||
return request;
|
||||
}
|
||||
|
||||
// =============== Entity Processors ============
|
||||
|
||||
_processCollection(document, type, context) {
|
||||
document.forEach(item => {
|
||||
this._queue(type, item.url, context);
|
||||
collection(request) {
|
||||
// if there are additional pages, queue them up to be processed. Note that these go
|
||||
// on the high priority queue so they are loaded before they change much.
|
||||
const linkHeader = request.response.headers.link;
|
||||
if (linkHeader) {
|
||||
const links = parse(linkHeader);
|
||||
for (let i = 2; i <= links.last.page; i++) {
|
||||
const url = request.url + `?page=${i}&per_page=100`;
|
||||
const context = { qualifier: request.context.qualifier };
|
||||
this._queueBase(request, { type: 'page', url: url, subType: request.subType, page: i, force: request.force, context: context }, this.priorityQueue);
|
||||
}
|
||||
}
|
||||
|
||||
// Rewrite the request and document to be a 'page' and then process.
|
||||
request.page = 1;
|
||||
request.document._metadata.type = 'page';
|
||||
return this.page(request);
|
||||
}
|
||||
|
||||
page(request) {
|
||||
const document = request.document;
|
||||
const type = request.subType;
|
||||
const first = document.elements[0];
|
||||
const qualifier = request.context.qualifier;
|
||||
this._linkSelf(request, 'self', `${qualifier}:${type}:pages:${request.page}`);
|
||||
document.elements.forEach(item => {
|
||||
this._queueChild(request, type, item.url, qualifier);
|
||||
});
|
||||
return null;
|
||||
}
|
||||
|
||||
org(document) {
|
||||
document._metadata.links.self = { type: 'self', href: `urn:org:${document.id}` };
|
||||
document._metadata.links.repos = { type: 'siblings', href: `urn:org:${document.id}:repos` };
|
||||
document._metadata.links.siblings = { type: 'siblings', href: 'urn:org' };
|
||||
this._queue('repos', document.repos_url);
|
||||
return document;
|
||||
}
|
||||
|
||||
user(document) {
|
||||
document._metadata.links.self = { type: 'self', href: `urn:user:${document.id}` };
|
||||
document._metadata.links.repos = { type: 'siblings', href: `urn:user:${document.id}:repos` };
|
||||
document._metadata.links.siblings = { type: 'siblings', href: 'urn:user' };
|
||||
this._queue('repos', document.repos_url);
|
||||
org(request) {
|
||||
const document = request.document;
|
||||
this._addSelfLink(request, 'urn:');
|
||||
this._linkSiblings(request, 'repos', `urn:org:${document.id}:repos`);
|
||||
this._linkSiblings(request, 'siblings', 'urn:org');
|
||||
this._queueChildren(request, 'repos', document.repos_url);
|
||||
// TODO is this "logins"
|
||||
this._queueChildren(request, 'users', document.members_url.replace('{/member}', ''));
|
||||
return document;
|
||||
}
|
||||
|
||||
repo(document) {
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${document.id}` };
|
||||
document._metadata.links.owner = { type: 'self', href: `urn:login:${document.owner.id}` };
|
||||
document._metadata.links.parent = { type: 'self', href: `urn:login:${document.owner.id}` };
|
||||
document._metadata.links.siblings = { type: 'siblings', href: `urn:login:${document.owner.id}:repos` };
|
||||
this._queue('login', document.owner.url);
|
||||
this._queue('issues', document.issues_url.replace('{/number}', ''), { repo: document.id });
|
||||
this._queue('commits', document.commits_url.replace('{/sha}', ''), { repo: document.id });
|
||||
user(request) {
|
||||
const document = request.document;
|
||||
this._addSelfLink(request, 'urn:');
|
||||
this._linkSiblings(request, 'repos', `urn:user:${document.id}:repos`);
|
||||
this._linkSiblings(request, 'siblings', 'urn:user');
|
||||
this._queueChildren(request, 'repos', document.repos_url);
|
||||
return document;
|
||||
}
|
||||
|
||||
commit(document, context) {
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:commit:${document.sha}` };
|
||||
document._metadata.links.siblings = { type: 'siblings', href: `urn:repo:${context.repo}:commits` };
|
||||
document._metadata.links.repo = { type: 'self', href: `urn:repo:${context.repo}` };
|
||||
document._metadata.links.parent = document._metadata.links.parent;
|
||||
repo(request) {
|
||||
const document = request.document;
|
||||
this._addSelfLink(request, 'urn:');
|
||||
this._linkSelf(request, 'owner', `urn:login:${document.owner.id}`);
|
||||
this._linkSelf(request, 'parent', `urn:login:${document.owner.id}`);
|
||||
this._linkSiblings(request, 'siblings', `urn:login:${document.owner.id}:repos`);
|
||||
this._queueRoot(request, 'login', document.owner.url);
|
||||
this._queueChildren(request, 'issues', document.issues_url.replace('{/number}', ''), { repo: document.id });
|
||||
this._queueChildren(request, 'commits', document.commits_url.replace('{/sha}', ''), { repo: document.id });
|
||||
return document;
|
||||
}
|
||||
|
||||
commit(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
this._addSelfLink(request, null, 'sha');
|
||||
|
||||
this._linkSelf(request, 'repo', `urn:repo:${context.repo}`);
|
||||
this._linkSiblings(request, 'siblings', `urn:repo:${context.repo}:commits`);
|
||||
// TODO not sure what the following line does
|
||||
// document._metadata.links.parent = document._metadata.links.parent;
|
||||
if (document.author) {
|
||||
document._metadata.links.author = { type: 'self', href: `urn:login:${document.author.id}` };
|
||||
this._queue('login', document.author.url);
|
||||
this._linkSelf(request, 'author', `urn:login:${document.author.id}`);
|
||||
this._queueRoot(request, 'login', document.author.url);
|
||||
}
|
||||
if (document.committer) {
|
||||
document._metadata.links.committer = { type: 'self', href: `urn:login:${document.committer.id}` };
|
||||
this._queue('login', document.committer.url);
|
||||
this._linkSelf(request, 'committer', `urn:login:${document.committer.id}`);
|
||||
this._queueRoot(request, 'login', document.committer.url);
|
||||
}
|
||||
if (document.files) {
|
||||
document.files.forEach(file => {
|
||||
|
@ -191,231 +260,323 @@ class Crawler {
|
|||
return document;
|
||||
}
|
||||
|
||||
login(document) {
|
||||
document._metadata.links.self = { type: 'self', href: `urn:login:${document.id}` };
|
||||
login(request) {
|
||||
const document = request.document;
|
||||
this._addSelfLink(request, 'urn:');
|
||||
this._linkSelf(request, 'self', `urn:login:${document.id}`);
|
||||
// TODO should we do repos here and in the user/org?
|
||||
this._linkSiblings(request, 'repos', `urn:login:${document.id}:repos`);
|
||||
this._linkSiblings(request, 'siblings', 'urn:login');
|
||||
if (document.type === 'Organization') {
|
||||
|
||||
this._queueRoot(request, 'org', `https://api.github.com/orgs/${document.login}`);
|
||||
} else if (document.type === 'User') {
|
||||
|
||||
this._queueRoot(request, 'user', `https://api.github.com/users/${document.login}`);
|
||||
}
|
||||
this._queueChildren(request, 'repos', document.repos_url);
|
||||
return document;
|
||||
}
|
||||
|
||||
issue(document, context) {
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:issue:${document.id}` };
|
||||
document._metadata.links.siblings = { type: 'siblings', href: `urn:repo:${context.repo}:issues` };
|
||||
document._metadata.links.assignees = { type: 'self', hrefs: document.assignees.map(assignee => { return `urn:login:${assignee.id}` }) };
|
||||
document._metadata.links.repo = { type: 'self', href: `urn:repo:${context.repo}` };
|
||||
document._metadata.links.parent = document._metadata.links.repo;
|
||||
document._metadata.links.user = { type: 'self', href: `urn:login:${document.user.id}` };
|
||||
this._queue('login', document.user.url);
|
||||
issue(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
this._addSelfLink(request);
|
||||
this._linkSelf(request, 'assignees', document.assignees.map(assignee => { return `urn:login:${assignee.id}`; }));
|
||||
this._linkSelf(request, 'repo', `urn:repo:${context.repo}`);
|
||||
this._linkSelf(request, 'parent', `urn:repo:${context.repo}`);
|
||||
this._linkSelf(request, 'user', `urn:login:${document.user.id}`);
|
||||
this._linkSiblings(request, 'siblings', `urn:repo:${context.repo}:issues`);
|
||||
this._queueRoot(request, 'login', document.user.url);
|
||||
if (document.assignee) {
|
||||
document._metadata.links.assignee = { type: 'self', href: `urn:login:${document.assignee.id}` };
|
||||
this._queue('login', document.assignee.url);
|
||||
this._linkSelf(request, 'assignee', `urn:login:${document.assignee.id}`);
|
||||
this._queueRoot(request, 'login', document.assignee.url);
|
||||
}
|
||||
if (document.closed_by) {
|
||||
document._metadata.links.closed_by = { type: 'self', href: `urn:login:${document.closed_by.id}` };
|
||||
this._queue('login', document.closed_by.url);
|
||||
this._linkSelf(request, 'closed_by', `urn:login:${document.closed_by.id}`);
|
||||
this._queueRoot(request, 'login', document.closed_by.url);
|
||||
}
|
||||
|
||||
// milestone
|
||||
// pull request
|
||||
// events
|
||||
// labels
|
||||
this._queue('issue_comments', document.comments_url, { issue: document.id, repo: context.repo });
|
||||
this._queueChildren(request, 'issue_comments', document.comments_url, { issue: document.id, repo: context.repo });
|
||||
return document;
|
||||
}
|
||||
|
||||
issue_comment(document, context) {
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:issue_comment:${document.id}` };
|
||||
document._metadata.links.user = { type: 'self', href: `urn:login:${document.user.id}` };
|
||||
document._metadata.links.siblings = { type: 'siblings', href: `urn:repo:${context.repo}:issue:${context.issue}:comments` };
|
||||
this._queue('login', document.user.url);
|
||||
issue_comment(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
this._addSelfLink(request);
|
||||
this._linkSelf(request, 'user', `urn:login:${document.user.id}`);
|
||||
this._linkSiblings(request, 'siblings', `urn:repo:${context.repo}:issue:${context.issue}:comments`);
|
||||
this._queue(request, 'login', document.user.url);
|
||||
return document;
|
||||
}
|
||||
|
||||
team(document, context) {
|
||||
document._metadata.links.self = { type: 'self', href: `urn:org:${document.organization.id}:team:${document.id}` };
|
||||
document._metadata.links.org = { type: 'self', href: `urn:org:${document.organization.id}` };
|
||||
document._metadata.links.login = { type: 'self', href: `urn:login:${document.organization.id}` };
|
||||
document._metadata.links.siblings = { type: 'siblings', href: `urn:org:${document.organization.id}:teams` };
|
||||
this._queue('team_members', document.members_url);
|
||||
this._queue('team_repos', document.repositories_url);
|
||||
team(request) {
|
||||
const document = request.document;
|
||||
this._addSelfLink(request, `urn:org:${document.organization.id}`);
|
||||
this._linkSelf(request, 'org', `urn:org:${document.organization.id}`);
|
||||
this._linkSelf(request, 'login', `urn:login:${document.organization.id}`);
|
||||
this._linkSiblings(request, 'siblings', `urn:org:${document.organization.id}:teams`);
|
||||
this._queueChildren(request, 'team_members', document.members_url);
|
||||
this._queueChildren(request, 'team_repos', document.repositories_url);
|
||||
return document;
|
||||
}
|
||||
|
||||
team_members(document, context) {
|
||||
document._metadata.links.self = { type: 'self', href: `urn:org:${document.organization.id}:team:${document.id}:members` };
|
||||
team_members(request) {
|
||||
const document = request.document;
|
||||
this._addSelfLink(request, `urn:org:${document.organization.id}`);
|
||||
return document;
|
||||
}
|
||||
|
||||
team_repos(document, context) {
|
||||
document._metadata.links.self = { type: 'self', href: `urn:org:${document.organization.id}:team:${document.id}:repos` };
|
||||
team_repos(request) {
|
||||
this._addSelfLink(request, `urn:org:${document.organization.id}`);
|
||||
return document;
|
||||
}
|
||||
|
||||
// =============== Event Processors ============
|
||||
CommitCommentEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:commit_comment_event:${document.id}` };
|
||||
document._metadata.links.comment = { type: 'self', href: `urn:repo:${context.repo}:comment:${payload.comment.id}` };
|
||||
CommitCommentEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(request);
|
||||
this._linkSelf(request, 'comment', `urn:repo:${context.repo}:comment:${payload.comment.id}`);
|
||||
// TODO siblings?
|
||||
this._queue('comment', payload.comment.url);
|
||||
this._queue(request, 'comment', payload.comment.url);
|
||||
return document;
|
||||
}
|
||||
|
||||
CreateEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:create_event:${document.id}` };
|
||||
CreateEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
return document;
|
||||
}
|
||||
|
||||
DeleteEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:delete_event:${document.id}` };
|
||||
DeleteEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
// TODO do something for interesting deletions e.g., where ref-type === 'repository'
|
||||
return document;
|
||||
}
|
||||
|
||||
DeploymentEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:deployment_event:${document.id}` };
|
||||
document._metadata.links.deployment = { type: 'self', href: `urn:repo:${context.repo}:deployment:${payload.deployment.id}` };
|
||||
this._queue('deployment', payload.deployment.url);
|
||||
DeploymentEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
this._linkSelf(request, 'deployment', `urn:repo:${context.repo}:deployment:${payload.deployment.id}`);
|
||||
this._queue(request, 'deployment', payload.deployment.url);
|
||||
return document;
|
||||
}
|
||||
|
||||
DeploymentStatusEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:deployment_status_event:${document.id}` };
|
||||
document._metadata.links.deployment_status = { type: 'self', href: `urn:repo:${context.repo}:deployment:${payload.deployment.id}:status:${payload.deployment_status.id}` };
|
||||
document._metadata.links.deployment = { type: 'self', href: `urn:repo:${context.repo}:deployment:${payload.deployment.id}` };
|
||||
this._queue('deployment', payload.deployment.url);
|
||||
DeploymentStatusEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
this._linkSelf(request, 'deployment_status', `urn:repo:${context.repo}:deployment:${payload.deployment.id}:status:${payload.deployment_status.id}`);
|
||||
this._linkSelf(request, 'deployment', `urn:repo:${context.repo}:deployment:${payload.deployment.id}`);
|
||||
this._queue(request, 'deployment', payload.deployment.url);
|
||||
return document;
|
||||
}
|
||||
|
||||
ForkEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:fork_event:${document.id}` };
|
||||
ForkEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
// TODO figure out what else to do
|
||||
return document;
|
||||
}
|
||||
|
||||
GollumEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:gollum_event:${document.id}` };
|
||||
GollumEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
return document;
|
||||
}
|
||||
|
||||
IssueCommentEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:issue_comment_event:${document.id}` };
|
||||
document._metadata.links.issue = { type: 'self', href: `urn:repo:${context.repo}:issue:${payload.issue.id}` };
|
||||
document._metadata.links.comment = { type: 'self', href: `urn:repo:${context.repo}:comment:${payload.comment.id}` };
|
||||
this._queue('comment', payload.comment.url);
|
||||
this._queue('issue', payload.issue.url);
|
||||
IssueCommentEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
this._linkSelf(request, 'issue', `urn:repo:${context.repo}:issue:${payload.issue.id}`);
|
||||
this._linkSelf(request, 'comment', `urn:repo:${context.repo}:comment:${payload.comment.id}`);
|
||||
this._queue(request, 'comment', payload.comment.url);
|
||||
this._queue(request, 'issue', payload.issue.url);
|
||||
return document;
|
||||
}
|
||||
|
||||
IssuesEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:issued_event:${document.id}` };
|
||||
document._metadata.links.issue = { type: 'self', href: `urn:repo:${context.repo}:issue:${payload.issue.id}` };
|
||||
this._queue('issue', payload.issue.url);
|
||||
IssuesEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
this._linkSelf(request, 'issue', `urn:repo:${context.repo}:issue:${payload.issue.id}`);
|
||||
this._queue(request, 'issue', payload.issue.url);
|
||||
return document;
|
||||
}
|
||||
|
||||
LabelEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:label_event:${document.id}` };
|
||||
LabelEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
return document;
|
||||
}
|
||||
|
||||
MemberEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:member_event:${document.id}` };
|
||||
document._metadata.links.member = { type: 'self', href: `urn:login:${payload.member.id}` };
|
||||
this._queue('login', payload.member.url);
|
||||
MemberEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
this._linkSelf(request, 'member', `urn:login:${payload.member.id}`);
|
||||
this._queueRoot(request, 'login', payload.member.url);
|
||||
return document;
|
||||
}
|
||||
|
||||
MembershipEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:team:${payload.team.id}:membership_event:${document.id}` };
|
||||
document._metadata.links.member = { type: 'self', href: `urn:login:${payload.member.id}` };
|
||||
document._metadata.links.team = { type: 'self', href: `urn:team:${payload.team.id}` };
|
||||
document._metadata.links.org = { type: 'self', href: `urn:org:${payload.organization.id}` };
|
||||
this._queue('login', payload.member.url);
|
||||
this._queue('org', payload.organization.url);
|
||||
this._queue('team', payload.team.url);
|
||||
MembershipEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
this._linkSelf(request, 'self', `urn:team:${payload.team.id}:membership_event:${document.id}`);
|
||||
this._linkSelf(request, 'member', `urn:login:${payload.member.id}`);
|
||||
this._linkSelf(request, 'team', `urn:team:${payload.team.id}`);
|
||||
this._linkSelf(request, 'org', `urn:org:${payload.organization.id}`);
|
||||
this._queueRoot(request, 'login', payload.member.url);
|
||||
this._queueRoot(request, 'org', payload.organization.url);
|
||||
this._queue(request, 'team', payload.team.url);
|
||||
return document;
|
||||
}
|
||||
|
||||
MilestoneEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:milestone_event:${document.id}` };
|
||||
document._metadata.links.milestone = { type: 'self', href: `urn:repo:${context.repo}:milestone:${payload.milestone.id}` };
|
||||
this._queue('milestone', payload.milestone.url);
|
||||
MilestoneEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
this._linkSelf(request, 'milestone', `urn:repo:${context.repo}:milestone:${payload.milestone.id}`);
|
||||
this._queue(request, 'milestone', payload.milestone.url);
|
||||
return document;
|
||||
}
|
||||
|
||||
PageBuildEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:page_build_event:${document.id}` };
|
||||
document._metadata.links.page_build = { type: 'self', href: `urn:repo:${context.repo}:page_builds:${payload.id}` };
|
||||
this._queue('page_build', payload.build.url);
|
||||
PageBuildEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
this._linkSelf(request, 'page_build', `urn:repo:${context.repo}:page_builds:${payload.id}`);
|
||||
this._queue(request, 'page_build', payload.build.url);
|
||||
return document;
|
||||
}
|
||||
|
||||
PublicEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:public_event:${document.id}` };
|
||||
PublicEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
return document;
|
||||
}
|
||||
|
||||
PullRequestEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:pull_request_event:${document.id}` };
|
||||
document._metadata.links.pull = { type: 'self', href: `urn:repo:${context.repo}:pull:${payload.pull_request.id}` };
|
||||
this._queue('pull', payload.pull_request.url);
|
||||
PullRequestEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
this._linkSelf(request, 'pull', `urn:repo:${context.repo}:pull:${payload.pull_request.id}`);
|
||||
this._queue(request, 'pull', payload.pull_request.url);
|
||||
return document;
|
||||
}
|
||||
|
||||
PullRequestReviewEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:pull_request_review_event:${document.id}` };
|
||||
document._metadata.links.review = { type: 'self', href: `urn:repo:${context.repo}:pull:${payload.pull_request.id}:review:${payload.review.id}` };
|
||||
document._metadata.links.pull = { type: 'self', href: `urn:repo:${context.repo}:pull:${payload.pull_request.id}` };
|
||||
this._queue('pull_review', payload.pull_request.review_comment_url.replace('{/number}', `/${payload.review.id}`));
|
||||
this._queue('pull', payload.pull_request.url);
|
||||
PullRequestReviewEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
this._linkSelf(request, 'review', `urn:repo:${context.repo}:pull:${payload.pull_request.id}:review:${payload.review.id}`);
|
||||
this._linkSelf(request, 'pull', `urn:repo:${context.repo}:pull:${payload.pull_request.id}`);
|
||||
this._queue(request, 'pull_review', payload.pull_request.review_comment_url.replace('{/number}', `/${payload.review.id}`));
|
||||
this._queue(request, 'pull', payload.pull_request.url);
|
||||
return document;
|
||||
}
|
||||
|
||||
PullRequestReviewCommentEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:pull_request_review_comment_event:${document.id}` };
|
||||
document._metadata.links.comment = { type: 'self', href: `urn:repo:${context.repo}:pull:${payload.pull_request.id}:comment:${payload.comment.id}` };
|
||||
document._metadata.links.pull = { type: 'self', href: `urn:repo:${context.repo}:pull:${payload.pull_request.id}` };
|
||||
PullRequestReviewCommentEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
this._linkSelf(request, 'comment', `urn:repo:${context.repo}:pull:${payload.pull_request.id}:comment:${payload.comment.id}`);
|
||||
this._linkSelf(request, 'pull', `urn:repo:${context.repo}:pull:${payload.pull_request.id}`);
|
||||
// TODO see if all the various comments can be the same type
|
||||
this._queue('pull_comment', payload.comment.url);
|
||||
this._queue('pull', payload.pull_request.url);
|
||||
this._queue(request, 'pull_comment', payload.comment.url);
|
||||
this._queue(request, 'pull', payload.pull_request.url);
|
||||
return document;
|
||||
}
|
||||
|
||||
PushEvent(document, context) {
|
||||
const payload = _eventHelper(document);
|
||||
document._metadata.links.self = { type: 'self', href: `urn:repo:${context.repo}:push_event:${document.id}` };
|
||||
PushEvent(request) {
|
||||
const document = request.document;
|
||||
const context = request.context;
|
||||
const payload = this._eventHelper(document);
|
||||
// TODO figure out what to do with the commits
|
||||
return document;
|
||||
}
|
||||
|
||||
// =============== Helpers ============
|
||||
|
||||
_addSelfLink(request, base = null, key = 'id') {
|
||||
let qualifier = base ? base : request.context.qualifier;
|
||||
qualifier = qualifier.endsWith(':') ? qualifier : qualifier + ':';
|
||||
this._linkSelf(request, 'self', `${qualifier}${request.type}:${request.document[key]}`);
|
||||
}
|
||||
|
||||
_linkSelf(request, name, value) {
|
||||
const links = request.document._metadata.links;
|
||||
const key = Array.isArray(value) ? 'hrefs' : 'href';
|
||||
links[name] = { type: 'self' };
|
||||
links[name][key] = value;
|
||||
}
|
||||
|
||||
_linkSiblings(request, name, href) {
|
||||
const links = request.document._metadata.links;
|
||||
links[name] = { type: 'siblings', href: href };
|
||||
}
|
||||
|
||||
_queue(request, type, url, context, queue = null) {
|
||||
const newRequest = { type: type, url: url };
|
||||
newRequest.context = context;
|
||||
this._queueBase(request, newRequest, queue);
|
||||
}
|
||||
|
||||
_queueRoot(request, type, url) {
|
||||
this._queueBase(request, { type: type, url: url });
|
||||
}
|
||||
|
||||
_queueChild(request, type, url, qualifier) {
|
||||
const newRequest = { type: type, url: url };
|
||||
newRequest.context = request.context || {};
|
||||
newRequest.context.qualifier = qualifier;
|
||||
if (request.force) {
|
||||
newRequest.force = request.force;
|
||||
}
|
||||
this._queueBase(request, newRequest);
|
||||
}
|
||||
|
||||
_queueChildren(request, type, url, context = null) {
|
||||
const newRequest = { type: type, url: url };
|
||||
const newContext = extend(request.context || {}, context);
|
||||
newRequest.context = newContext;
|
||||
newContext.qualifier = request.document._metadata.links.self.href;
|
||||
if (request.force) {
|
||||
newRequest.force = request.force;
|
||||
}
|
||||
this._queueBase(request, newRequest);
|
||||
}
|
||||
|
||||
// TODO make a queue all and add promises (then) to the code below
|
||||
_queueBase(request, newRequest, queue = null) {
|
||||
if (this._configFilter(newRequest.type, newRequest.url)) {
|
||||
this.logger.log('info', `Skipped queuing ${newRequest.type} [${newRequest.url}]`);
|
||||
return;
|
||||
}
|
||||
queue = queue || this.queue;
|
||||
request.promises.push(queue.push(newRequest));
|
||||
}
|
||||
|
||||
_configFilter(type, target) {
|
||||
if (!this.config.orgFilter) {
|
||||
return false;
|
||||
}
|
||||
if (type === 'repo' || type === 'org') {
|
||||
const parsed = url.parse(target);
|
||||
if (type === 'repo' || type === 'repos' || type === 'org') {
|
||||
const parsed = URL.parse(target);
|
||||
const org = parsed.path.split('/')[2];
|
||||
return !this.config.orgFilter.has(org.toLowerCase());
|
||||
}
|
||||
|
@ -429,16 +590,24 @@ class Crawler {
|
|||
return request;
|
||||
}
|
||||
|
||||
_eventHelper(document) {
|
||||
_eventHelper(request, references) {
|
||||
const document = request.document;
|
||||
// TODO understand if the actor is typically the same as the creator or pusher in the payload
|
||||
document._metadata.links.actor = { type: 'self', href: `urn:login:${document.actor.id}` };
|
||||
document._metadata.links.repo = { type: 'self', href: `urn:repo:${document.repo.id}` };
|
||||
document._metadata.links.org = { type: 'self', href: `urn:org:${document.org.id}` };
|
||||
this._queue('login', document.actor.url);
|
||||
this._queue('repo', document.repo.url);
|
||||
this._queue('org', document.org.url);
|
||||
const repo = document.repo ? document.repo.id : null;
|
||||
const urn = repo ? `urn:repo:${repo}` : `urn:org:${document.org.id}`;
|
||||
this._linkSelf(request, 'self', `${urn}:${request.type}:${document.id}`);
|
||||
this._linkSelf(request, 'actor', `urn:login:${document.actor.id}`);
|
||||
this._linkSelf(request, 'repo', `urn:repo:${document.repo.id}`);
|
||||
this._linkSelf(request, 'org', `urn:org:${document.org.id}`);
|
||||
this._queueRoot(request, 'login', document.actor.url);
|
||||
this._queueRoot(request, 'repo', document.repo.url);
|
||||
this._queueRoot(request, 'org', document.org.url);
|
||||
return document.payload;
|
||||
}
|
||||
|
||||
_isCollectionRequest(request) {
|
||||
return collections.hasOwnProperty(request.type);
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = Crawler;
|
|
@ -24,6 +24,7 @@
|
|||
},
|
||||
"dependencies": {
|
||||
"moment": "2.15.2",
|
||||
"parse-link-header": "^0.4.1",
|
||||
"q": "1.4.1",
|
||||
"qlimit": "^0.1.1"
|
||||
},
|
||||
|
|
Загрузка…
Ссылка в новой задаче