From 8a3e07fd68a3cff056b8108c1c1d82a72690b017 Mon Sep 17 00:00:00 2001 From: Jeff McAffer Date: Fri, 20 Jan 2017 17:49:25 -0800 Subject: [PATCH] revamp traversal so to use maps --- lib/githubFetcher.js | 2 +- lib/githubProcessor.js | 122 ++++++------ lib/request.js | 28 +-- lib/traversalPolicy.2.js | 302 +++++++++++++++++++++++++++++ lib/traversalPolicy.js | 203 +++++--------------- lib/visitorMap.js | 338 +++++++++++++++++++++++++++++++++ test/crawlerTests.js | 27 ++- test/gitHubProcessorTests.js | 357 ++++++++++++++++++----------------- test/githubFetcherTests.js | 59 +++--- test/requestTests.js | 107 ----------- test/visitorMapTests.js | 47 +++++ 11 files changed, 1051 insertions(+), 541 deletions(-) create mode 100644 lib/traversalPolicy.2.js create mode 100644 lib/visitorMap.js create mode 100644 test/visitorMapTests.js diff --git a/lib/githubFetcher.js b/lib/githubFetcher.js index fa31de1..28eda54 100644 --- a/lib/githubFetcher.js +++ b/lib/githubFetcher.js @@ -142,7 +142,7 @@ class GitHubFetcher { _checkGitHubRateLimit(request, response) { const retryAfter = parseInt(response.headers['retry-after'], 10) || 0; if (retryAfter > 0) { - this.logger.info(`Retry-After delay of ${retryAfter} for ${request.toString()}`, request.meta); + this.logger.info(`Retry-After delay of ${retryAfter}s for ${request.toString()}`, request.meta); request.addMeta({ retryAfterDelay: retryAfter }); request.delay(retryAfter * 1000); } diff --git a/lib/githubProcessor.js b/lib/githubProcessor.js index 000934e..c31b0da 100644 --- a/lib/githubProcessor.js +++ b/lib/githubProcessor.js @@ -63,7 +63,7 @@ class GitHubProcessor { for (let i = 2; i <= links.last.page; i++) { const separator = request.url.includes('?') ? 
'&' : '?'; const url = request.url + `${separator}page=${i}&per_page=100`; - const newRequest = new Request(request.type, url, request.context, request.relationship); + const newRequest = new Request(request.type, url, request.context); // Carry this request's transitivity forward to the other pages. newRequest.policy = request.policy; requests.push(newRequest); @@ -88,21 +88,24 @@ class GitHubProcessor { this._processRelation(request, relation); elementType = relation.type; } + // Use the current request's policy as that is assumed to be precomputed for this page's elements. Don't bother queuing + // if we are just going for the pages themselves (e.g., [] map step). + if (request.policy.getCurrentStep() === []) { + return document; + } // Queue up the page elements. Use the same policy as this request as the page itself is more of an implementation - // detail and should not be part of the user model of transitivity. If the current request is a relation, be sure to - // queue a 'reference' relationship since the other side of the relation is not 'contained' by the origin or the relation itself. + // detail and should not be part of the user model of traversal. document.elements.forEach(item => { if (elementType) { const elementQualifier = this.isRootType(elementType) ? 'urn:' : qualifier; const newContext = { qualifier: elementQualifier, history: request.context.history }; - const newRelationship = relation ? 'reference' : request.relationship; - request.queue(newRelationship, elementType, item.url, newContext, null, request.policy); + request.queue(elementType, item.url, request.policy, newContext); } else { // TODO if there is no elementType on a collection then assume it is events. Need to fix this up and // formalize the model of collections where the request carries the payload. 
const baseUrl = request.url.split("?")[0]; const newContext = { history: request.context.history }; - const newRequest = new Request(item.type, `${baseUrl}/${item.id}`, newContext, request.relationship); + const newRequest = new Request(item.type, `${baseUrl}/${item.id}`, newContext); newRequest.payload = { etag: 1, body: item }; newRequest.policy = request.policy; request.queueRequests(newRequest); @@ -118,7 +121,7 @@ class GitHubProcessor { // TODO look at new API for membership differentiation // * hooks // - this._addRootReference(request, 'user', 'user', document.url.replace('/orgs/', '/users/'), `urn:user:${document.id}`); + this._addRoot(request, 'user', 'user', document.url.replace('/orgs/', '/users/'), `urn:user:${document.id}`); this._addCollection(request, 'repos', 'repo', null, `urn:user:${document.id}:repos`); if (document.members_url) { this._addRelation(request, 'members', 'user', document.members_url.replace('{/member}', ''), `${this._getQualifier(request)}:org_members`); @@ -169,10 +172,10 @@ class GitHubProcessor { request.linkCollection('pull_requests', `${document._metadata.links.self.href}:pull_requests`); if (document.organization) { - this._addRootReference(request, 'owner', 'user'); - this._addRootContainer(request, 'organization', 'org'); + this._addRoot(request, 'owner', 'user'); + this._addRoot(request, 'organization', 'org'); } else { - this._addRootContainer(request, 'owner', 'user'); + this._addRoot(request, 'owner', 'user'); } this._addRelation(request, 'teams', 'team'); @@ -205,12 +208,12 @@ class GitHubProcessor { // even if there are no comments to process, add a link to the comment collection for future use request.linkCollection('commit_comments', commentsUrn); } - this._addRootContainer(request, 'repo', 'repo', document.url.replace(/\/commits\/.*/, ''), `${context.qualifier}`); + this._addRoot(request, 'repo', 'repo', document.url.replace(/\/commits\/.*/, ''), `${context.qualifier}`); // TODO some commits have author and 
committer properties, others have email info in a "commit" property // For the former, this code works. For the latter, consider queuing an email lookup and storing a // email key here for the author/committer. - this._addRootReference(request, 'author', 'user'); - this._addRootReference(request, 'committer', 'user'); + this._addRoot(request, 'author', 'user'); + this._addRoot(request, 'committer', 'user'); if (document.files) { document.files.forEach(file => { @@ -229,7 +232,7 @@ class GitHubProcessor { request.linkResource('commit', context.qualifier); request.linkSiblings(`${context.qualifier}:commit_comments`); - this._addRootReference(request, 'user', 'user'); + this._addRoot(request, 'user', 'user'); return document; } @@ -237,15 +240,16 @@ class GitHubProcessor { const document = request.document; const context = request.context; request.addSelfLink(); + request.linkResource('repo', `urn:repo:${document.base.repo.id}`); request.linkSiblings(`${context.qualifier}:pull_requests`); - this._addRootReference(request, 'user', 'user'); - this._addRootReference(request, 'merged_by', 'user'); - this._addRootReference(request, 'assignee', 'user'); + this._addRoot(request, 'user', 'user'); + this._addRoot(request, 'merged_by', 'user'); + this._addRoot(request, 'assignee', 'user'); if (document.head.repo) { - this._addRootContainer(request, 'head', 'repo', document.head.repo.url, `urn:repo:${document.head.repo.id}`); + this._addRoot(request, 'head', 'repo', document.head.repo.url, `urn:repo:${document.head.repo.id}`); } - this._addRootContainer(request, 'base', 'repo', document.base.repo.url, `urn:repo:${document.base.repo.id}`); + this._addRoot(request, 'base', 'repo', document.base.repo.url, `urn:repo:${document.base.repo.id}`); if (document._links.review_comments && document.comments) { this._addCollection(request, 'review_comments', 'review_comment', document._links.review_comments.href); @@ -279,7 +283,7 @@ class GitHubProcessor { 
request.linkResource('pull_request', context.qualifier); request.linkSiblings(`${context.qualifier}:review_comments`); - this._addRootReference(request, 'user', 'user'); + this._addRoot(request, 'user', 'user'); return document; } @@ -297,10 +301,10 @@ class GitHubProcessor { request.linkResource('assignees', assignees); } - this._addRootReference(request, 'user', 'user'); - this._addRootContainer(request, 'repo', 'repo', document.repository_url, context.qualifier); - this._addRootReference(request, 'assignee', 'user'); - this._addRootReference(request, 'closed_by', 'user'); + this._addRoot(request, 'user', 'user'); + this._addRoot(request, 'repo', 'repo', document.repository_url, context.qualifier); + this._addRoot(request, 'assignee', 'user'); + this._addRoot(request, 'closed_by', 'user'); if (document.comments_url && document.comments) { this._addCollection(request, 'issue_comments', 'issue_comment', document.comments_url); } @@ -325,16 +329,16 @@ class GitHubProcessor { request.linkResource('issue', context.qualifier); request.linkSiblings(`${context.qualifier}:issue_comments`); - this._addRootReference(request, 'user', 'user'); + this._addRoot(request, 'user', 'user'); return document; } team(request) { const document = request.document; - request.addSelfLink(); + request.addRootSelfLink(); request.linkSiblings(`urn:org:${document.organization.id}:teams`); - this._addRootContainer(request, 'organization', 'org'); + this._addRoot(request, 'organization', 'org'); this._addRelation(request, 'members', 'user', document.members_url.replace('{/member}', ''), `${this._getQualifier(request)}:team_members`); this._addRelation(request, 'repos', 'repo', document.repositories_url); return document; @@ -346,7 +350,7 @@ class GitHubProcessor { request.addSelfLink(); request.linkSiblings(`${context.qualifier}:deployments`); request.linkResource('commit', `${context.qualifier}:commit:${document.sha}`); - this._addRootReference(request, 'creator', 'user'); + 
this._addRoot(request, 'creator', 'user'); return document; } @@ -629,72 +633,63 @@ class GitHubProcessor { request.linkSiblings(`${qualifier}:${request.type}s`); // TODO understand if the actor is typically the same as the creator or pusher in the payload - this._addRootReference(request, 'actor', 'user'); + this._addRoot(request, 'actor', 'user'); if (repo) { - this._addRootReference(request, 'repo', 'repo'); + this._addRoot(request, 'repo', 'repo'); } - this._addRootReference(request, 'org', 'org'); + this._addRoot(request, 'org', 'org'); return [document, repo, document.payload]; } _addEventResourceReference(request, repo, name, type = name, qualifier = null) { - return this._addEventResource(request, repo, name, type, qualifier, 'reference'); + return this._addEventResource(request, repo, name, type, qualifier); } _addEventResourceContains(request, repo, name, type = name, qualifier = null) { - return this._addEventResource(request, repo, name, type, qualifier, 'contains'); + return this._addEventResource(request, repo, name, type, qualifier); } - _addEventResource(request, repo, name, type = name, qualifier = null, relationship = 'reference') { + _addEventResource(request, repo, name, type = name, qualifier = null) { const payload = request.document.payload; - // if the repo is given then use it. Otherwise, assume the type is a root and construct a urn - qualifier = qualifier || (repo ? `urn:repo:${repo}` : `urn`); const target = payload[name]; if (!target) { - throw new Error(`Payload@${name} missing in ${request.toString()}`); + throw new Error(`payload[${name}] missing in ${request.toString()}`); + } + // if the repo is given then use it. Otherwise, assume the type is a root and construct a urn + qualifier = qualifier || (repo ? `urn:repo:${repo}` : 'urn:'); + const separator = qualifier.endsWith(':') ? 
'' : ':'; + request.linkResource(name, `${qualifier}${separator}${type}:${payload[name].id}`); + const newRequest = new Request(type, payload[name].url, { qualifier: qualifier }); + newRequest.policy = request.getNextPolicy(name); + if (newRequest.policy) { + request.queueRequests(newRequest); } - request.linkResource(name, qualifier + `:${type}:${payload[name].id}`); - const newRequest = new Request(type, payload[name].url, { qualifier: qualifier }, relationship); - newRequest.policy = request.policy.getNextPolicy(request, relationship); - request.queueRequests(newRequest); return request.document; } - // resources are always a contains relationship _addResource(request, name, type, id, url = null, urn = null, qualifier = null) { qualifier = qualifier || this._getQualifier(request); urn = urn || `${qualifier}:${name}:${id}`; url = url || request.document[`${name}_url`]; request.linkResource(name, urn); - request.queue('contains', type, url, { qualifier: qualifier }); + const newPolicy = request.getNextPolicy(name); + request.queue(type, url, newPolicy, { qualifier: qualifier }); } - // collections are always 'contains' relationships _addCollection(request, name, type, url = null, urn = null) { const qualifier = this._getQualifier(request); urn = urn || `${qualifier}:${name}`; url = url || request.document[`${name}_url`]; request.linkCollection(name, urn); - if (this.isRootType(type)) { - return request.queue('contains', name, url, { elementType: type }); - } - const newContext = { qualifier: qualifier, elementType: type }; - newContext.qualifier = request.document._metadata.links.self.href; - request.queue('contains', name, url, newContext); + const newPolicy = request.getNextPolicy(name); + const newContext = { qualifier: request.document._metadata.links.self.href, elementType: type }; + request.queue(name, url, newPolicy, newContext); } - _addRootContainer(request, name, type, url = null, urn = null) { - return this._addRoot(request, 'belongsTo', name, type, 
url, urn); - } - - _addRootReference(request, name, type, url = null, urn = null) { - return this._addRoot(request, 'reference', name, type, url, urn); - } - - _addRoot(request, relationship, name, type, url = null, urn = null) { + _addRoot(request, name, type, url = null, urn = null) { const element = request.document[name]; // If there is no element then we must have both the url and urn as otherwise we don't know how to compute them if (!element && !(urn && url)) { @@ -704,14 +699,15 @@ class GitHubProcessor { urn = urn || `urn:${type}:${element.id}`; url = url || element.url; request.linkResource(name, urn); - request.queue(relationship, type, url); + const newPolicy = request.getNextPolicy(name); + request.queue(type, url, newPolicy); } /** * Relate this document to a collection of other documents of the given type. For example, * a repo to its collaborators which are users. * - * This creates a relationship between the current document being processed and the named + * This creates a relation between the current document being processed and the named * target resource of the given type. This results in a siblings link with the given name * and urn being added to this document and a relation request queued for the given url. * The document produced by processing that url will have matching siblings links (called 'siblings') @@ -731,8 +727,8 @@ class GitHubProcessor { // Also add an 'knownPages' link to help deal with page clean up. // request.linkCollection('knownPages', `${urn}:pages`); const context = { qualifier: qualifier, relation: { origin: request.type, qualifier: urn, type: type, guid: guid } }; - // queue the relation pages as contains as the current resource does 'contain' the list of things to which it is related. 
- request.queue('contains', name, url, context, false); + const newPolicy = request.getNextPolicy(name); + request.queue(name, url, newPolicy, context, false); } /** diff --git a/lib/request.js b/lib/request.js index b110299..e67cc86 100644 --- a/lib/request.js +++ b/lib/request.js @@ -7,29 +7,27 @@ const Policy = require('./traversalPolicy'); * Requests describe a resource to capture and process as well as the context for that processing. */ class Request { - constructor(type, url, context = null, relationship = 'contains') { + constructor(type, url, context = null) { this.type = type; this.url = url; this.context = context || {}; - this.relationship = relationship; - this.policy = Policy.default(); + this.policy = Policy.default('type'); } static adopt(object) { if (object.__proto__ !== Request.prototype) { object.__proto__ = Request.prototype; } - object.policy = object.policy || Policy.default(); + object.policy = object.policy || Policy.default(this.type); object.policy = Request._getExpandedPolicy(object.policy); if (object.policy && object.policy.__proto__ !== Policy.prototype) { object.policy.__proto__ = Policy.prototype; } - this.relationship = this.relationship || 'contains'; return object; } static _getExpandedPolicy(policyOrSpec) { - return typeof policyOrSpec === 'string' ? Policy.getPolicy(policyOrSpec) : policyOrSpec; + return typeof policyOrSpec === 'string' ? Policy.getPolicy(`${policyOrSpec}:${this.type}`) : policyOrSpec; } // Setup some internal context and open this request for handling. @@ -43,8 +41,13 @@ class Request { } _expandPolicy() { + if (!this.policy) { + return this.crawler.queueDead(this); + } if (typeof this.policy === 'string') { - const policy = Policy.getPolicy(this.policy); + // if the policy spec does not include a map, default to using the type of this request as the map name + const spec = this.policy.includes(':') ? 
this.policy : `${this.policy}:${this.type}`; + const policy = Policy.getPolicy(spec); if (!policy) { return this.crawler.queueDead(this); } @@ -128,20 +131,23 @@ class Request { links[name] = { href: href, type: 'relation' }; } + getNextPolicy(name) { + return this.policy.getNextPolicy(name); + } + queueRequests(requests, name = null) { requests = Array.isArray(requests) ? requests : [requests]; const toQueue = requests.filter(request => !this.hasSeen(request)); this.track(this.crawler.queue(toQueue, name)); } - queue(relationship, type, url, context = null, pruneRelation = true, policy = null) { - policy = policy || this.policy.getNextPolicy(this, relationship); + queue(type, url, policy, context = null, pruneRelation = true) { if (!policy) { return; } context = Object.assign({}, this.context, context); context.qualifier = context.qualifier || 'urn:'; - const newRequest = new Request(type, url, context, relationship); + const newRequest = new Request(type, url, context); newRequest.policy = policy; // relations are not transitive so ensure any relation is stripped off if (pruneRelation) { @@ -215,7 +221,7 @@ class Request { createRequeuable() { // Create a new request data structure that has just the things we should queue - const queuable = new Request(this.type, this.url, this.context, this.relationship); + const queuable = new Request(this.type, this.url, this.context); queuable.attemptCount = this.attemptCount; queuable.policy = this.policy; if (this.payload) { diff --git a/lib/traversalPolicy.2.js b/lib/traversalPolicy.2.js new file mode 100644 index 0000000..3a13f69 --- /dev/null +++ b/lib/traversalPolicy.2.js @@ -0,0 +1,302 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +/** +Fetch behavior -- Defines the authoritative source for content. The first word of the name identifies the authority. + * storageOnly - Only use stored content. 
Skip this resource if we don't already have it + * originStorage - Origin rules. Consider storage first and use it if it matches origin. Otherwise, get content from origin + * storageOriginIfMissing - Storage rules. Only if content is missing from storage, get content from origin + * mutables - Use originStorage if the resource is deemed mutable, storageOriginIfMissing if immutable + * originOnly - Always get content from original source + +Freshness -- How age of the resource, relative what we have seen/done before, factors into whether or not process the resource. + * always - process the resource no matter what + * match - process the resource if origin and stored docs do NOT match + * N - process the resource if newer or if the stored copy is N days old + * version - process the resource if the current stored doc's processing version is behind current + * matchOrVersion - process the resource if stored and origin do not match or the stored processed version is out of date + +Processing -- Which processing to do for a given resource. + * documentAndRelated - generate links etc and queue referenced resources for further processing + * documentAndChildren - generate links etc and queue referenced child resources (i.e., not roots) for further processing + * documentOnly - generate links but do not queue any referenced resources + +Transitivity -- How related resources should be queued. We need to define behavior of four different relationships: +Contains, References, Is-a and contained-By (CRIB). For each we talk about transitivity as a number, 0, 1 or 8 (inifinite). +We assume various traversal cutting techniques such as a list of visted nodes. + * broad - 8, 1, 8, 8 an aggressive broad traversal that can start at any point in the graph. This explores all + strong edges and ensure that weak (reference) edges have the referenced node. 
+ * exact - 0, 0, 0, 0 do not explore any other nodes + * neighbors - 1,1,1,1 ensure neighbors exist + + Basically, once you are doing deep traversal, carry that through for all children, but still allow transivity + control when traversing to a root. A deepDeep traversal to a root will queue that root as deepShallow. Similarly, + when traversing with deepShallow, queued roots end up as shallow. This approach gives you the ability to push deep + for one level. + +=============== Scenarios + +Initialization -- Traverse a subgraph ensuring everything is fetched. If something has already been processed, great, assume it is up to date +* fetch = originStorage +* freshness = match +* processing = documentAndRelated +* transitivity = broad + +Update -- Ensure a subgraph up to date. If something has already been processed, get it again +* fetch = originStorage +* freshness = always +* processing = documentAndRelated +* transitivity = broad + +Events -- Given an event, traverse its subgraph until encountering something previously seen. This ensures the event is recorded and the related resources are present. They may not be completely up to date. +* fetch = originStorage +* freshness = match +* processing = documentAndRelated +* transitivity = broad + +Events and update -- Traverse a subgraph until encountering something previously seen. If that +resource is older than N days, ensure the it is updated +// TODO, what is N's relation to match? +* fetch = originStorage +* freshness = N +* processing = documentAndRelated +* transitivity = broad + +Just Reprocess -- Reprocess just the exact resources we have already fetched +* fetch = storageOnly +* freshness = version +* processing = documentOnly +* transitivity = [broad | exact | neighbors] + +Reprocess and Rediscover -- Reprocess the resources we have and traverse to new/missing resources discovered during reprocessing. Process those as desired. 
+* fetch = storageOriginIfMissing +* freshness = version +* processing = documentAndRelated +* transitivity = [broad | exact | neighbors] + +Reprocess and Update -- Reprocess anything that is EITHER older version or out of date. +* fetch = originStorage +* freshness = matchOrVersion +* processing = documentAndRelated +* transitivity = [broad | exact | neighbors] + + */ + +const moment = require('moment'); + +class UpdateTransitivity { + + getNextPolicy(request, relationship) { + const currentRelationship = request.relationship; + if (!currentRelationship || ((currentRelationship === 'contains' || currentRelationship === 'belongsTo') && relationship === currentRelationship)) { + return 'update'; + } + return 'neighbors'; + } + + getShortForm() { + return 'U'; + } + + /** + * Given a request that would not otherwise be processed, answer whether or not its document should be + * traversed to discover additional resources to process. + */ + shouldTraverse(request) { + return true; + } +} + +class BroadTransitivity { + + getNextPolicy(request, relationship) { + const currentRelationship = request.relationship; + if (!currentRelationship || ((currentRelationship === 'contains' || currentRelationship === 'belongsTo') && relationship === currentRelationship)) { + return 'broad'; + } + return 'only'; + } + + getShortForm() { + return 'B'; + } + + /** + * Given a request that would not otherwise be processed, answer whether or not its document should be + * traversed to discover additional resources to process. 
+ */ + shouldTraverse(request) { + return true; + } +} + +class OnlyTransitivity { + + getNextPolicy(request, relationship) { + return null; + } + + getShortForm() { + return 'O'; + } + + shouldTraverse(request) { + return false; + } +} + +class NeighborsTransitivity { + + getNextPolicy(request, relationship) { + return 'only'; + } + + getShortForm() { + return 'N'; + } + + shouldTraverse(request) { + return true; + } +} + +const transitivitySpecs = { + update: new UpdateTransitivity(), + broad: new BroadTransitivity(), + only: new OnlyTransitivity(), + neighbors: new NeighborsTransitivity() +}; + +class TraversalPolicy { + + static getPolicy(name) { + const definition = TraversalPolicy[name]; + return definition ? definition() : null; + } + + static default() { + return new TraversalPolicy('mutables', 'match', 'broad'); + } + + static refresh() { + return new TraversalPolicy('mutables', 'match', 'update'); + } + + static reload() { + return new TraversalPolicy('originStorage', 'match', 'update'); + } + + static events() { + return TraversalPolicy.default(); + } + + static reprocess() { + return new TraversalPolicy('storageOnly', 'version', 'update'); + } + + static reprocessAndDiscover() { + return new TraversalPolicy('storageOriginIfMissing', 'version', 'update'); + } + + static reprocessAndUpdate() { + return new TraversalPolicy('mutables', 'matchOrVersion', 'update'); + } + + static clone(policy) { + return new TraversalPolicy(policy.fetch, policy.freshness, policy.transitivity); + } + + constructor(fetch, freshness, transitivity) { + this.fetch = fetch; + this.freshness = freshness; + this.transitivity = transitivity; + } + + getNextPolicy(request, relationship) { + const transitivity = this._getTransitivitySpec().getNextPolicy(request, relationship); + if (transitivity === null) { + return null; + } + return new TraversalPolicy(this.fetch, this.freshness, transitivity); + } + + /** + * Given a request for which the requisite content has been fetched, 
determine whether or not it needs to be + * processed. + */ + shouldProcess(request, version) { + if (this.freshness === 'always') { + return true; + } + if (this.freshness === 'match') { + // process if the content came from origin then either we did not have it cached or it did not match. Process + return request.contentOrigin === 'origin'; + } + if (typeof this.freshness === 'number') { + // TODO this is not quite right. To tell time freshness we need to get the cached version but if we need to process + // we need the content from origin. Essentially we need to read the processed time with the etag (at that point) + // determine if the content is stale. Testing here is too late. + return moment.diff(request.document._metadata.processedAt, 'hours') > this.freshness * 24; + } + if (this.freshness === 'version' || this.freshness === 'matchOrVersion') { + return !request.document._metadata.version || (request.document._metadata.version < version); + } + throw new Error('Invalid freshness in traversal policy'); + } + + /** + * Given a request that would not otherwise be processed, answer whether or not its document should be + * traversed to discover additional resources to process. + */ + shouldTraverse(request) { + return this._getTransitivitySpec().shouldTraverse(request); + } + + isImmutable(type) { + return ['commit'].includes(type); + } + + /** + * Return the source from which to perform the initial fetch for the given request's resource. + */ + initialFetch(request) { + const mutablesValue = this.isImmutable(request.type) ? 
'storage' : 'etag'; + const result = { storageOnly: 'storage', originStorage: 'etag', originMutable: 'storage', storageOriginIfMissing: 'storage', mutables: mutablesValue, originOnly: 'origin' }[this.fetch]; + if (!result) { + throw new Error(`Fetch policy misconfigured ${this.fetch}`); + } + return result; + } + + /** + * Return the source from which to fetch if the original fetch did not find any content + */ + shouldFetchMissing(request) { + const result = { storageOnly: null, originStorage: 'origin', storageOriginIfMissing: 'origin', mutables: 'origin', originOnly: null }[this.fetch]; + if (result === undefined) { + throw new Error(`Fetch policy misconfigured ${this.fetch}`); + } + return result; + } + + /** + * Return a symbolic short form to uniquely identify this policy. + */ + getShortForm() { + const fetch = { storageOnly: 'S', storageOriginIfMissing: 's', originOnly: 'O', originStorage: 'o', mutables: 'm' }[this.fetch]; + let freshness = { always: 'A', match: 'M', version: 'V', matchOrVersion: 'm' }[this.freshness]; + if (!freshness) { + if (typeof this.policy.freshness === 'number') { + freshness = 'N'; + } + } + const transitivity = this._getTransitivitySpec().getShortForm(); + return fetch + freshness + transitivity; + } + + _getTransitivitySpec() { + return transitivitySpecs[this.transitivity]; + } +} + +module.exports = TraversalPolicy; diff --git a/lib/traversalPolicy.js b/lib/traversalPolicy.js index cd39298..f2a7789 100644 --- a/lib/traversalPolicy.js +++ b/lib/traversalPolicy.js @@ -2,12 +2,6 @@ // Licensed under the MIT License. /** - - - - - - Fetch behavior -- Defines the authoritative source for content. The first word of the name identifies the authority. * storageOnly - Only use stored content. Skip this resource if we don't already have it * originStorage - Origin rules. Consider storage first and use it if it matches origin. 
Otherwise, get content from origin @@ -22,208 +16,115 @@ Freshness -- How age of the resource, relative what we have seen/done before, f * version - process the resource if the current stored doc's processing version is behind current * matchOrVersion - process the resource if stored and origin do not match or the stored processed version is out of date -Processing -- Which processing to do for a given resource. - * documentAndRelated - generate links etc and queue referenced resources for further processing - * documentAndChildren - generate links etc and queue referenced child resources (i.e., not roots) for further processing - * documentOnly - generate links but do not queue any referenced resources - -Transitivity -- How related resources should be queued. We need to define behavior of four different relationships: -Contains, References, Is-a and contained-By (CRIB). For each we talk about transitivity as a number, 0, 1 or 8 (inifinite). -We assume various traversal cutting techniques such as a list of visted nodes. - * broad - 8, 1, 8, 8 an aggressive broad traversal that can start at any point in the graph. This explores all - strong edges and ensure that weak (reference) edges have the referenced node. - * exact - 0, 0, 0, 0 do not explore any other nodes - * neighbors - 1,1,1,1 ensure neighbors exist - - Basically, once you are doing deep traversal, carry that through for all children, but still allow transivity - control when traversing to a root. A deepDeep traversal to a root will queue that root as deepShallow. Similarly, - when traversing with deepShallow, queued roots end up as shallow. This approach gives you the ability to push deep - for one level. - =============== Scenarios Initialization -- Traverse a subgraph ensuring everything is fetched. 
If something has already been processed, great, assume it is up to date * fetch = originStorage * freshness = match -* processing = documentAndRelated -* transitivity = broad Update -- Ensure a subgraph up to date. If something has already been processed, get it again * fetch = originStorage * freshness = always -* processing = documentAndRelated -* transitivity = broad Events -- Given an event, traverse its subgraph until encountering something previously seen. This ensures the event is recorded and the related resources are present. They may not be completely up to date. * fetch = originStorage * freshness = match -* processing = documentAndRelated -* transitivity = broad Events and update -- Traverse a subgraph until encountering something previously seen. If that resource is older than N days, ensure the it is updated // TODO, what is N's relation to match? * fetch = originStorage * freshness = N -* processing = documentAndRelated -* transitivity = broad Just Reprocess -- Reprocess just the exact resources we have already fetched * fetch = storageOnly * freshness = version -* processing = documentOnly -* transitivity = [broad | exact | neighbors] Reprocess and Rediscover -- Reprocess the resources we have and traverse to new/missing resources discovered during reprocessing. Process those as desired. * fetch = storageOriginIfMissing * freshness = version -* processing = documentAndRelated -* transitivity = [broad | exact | neighbors] Reprocess and Update -- Reprocess anything that is EITHER older version or out of date. 
* fetch = originStorage * freshness = matchOrVersion -* processing = documentAndRelated -* transitivity = [broad | exact | neighbors] */ const moment = require('moment'); - -class UpdateTransitivity { - - getNextPolicy(request, relationship) { - const currentRelationship = request.relationship; - if (!currentRelationship || ((currentRelationship === 'contains' || currentRelationship === 'belongsTo') && relationship === currentRelationship)) { - return 'update'; - } - return 'neighbors'; - } - - getShortForm() { - return 'U'; - } - - /** - * Given a request that would not otherwise be processed, answer whether or not its document should be - * traversed to discover additional resources to process. - */ - shouldTraverse(request) { - return true; - } -} - -class BroadTransitivity { - - getNextPolicy(request, relationship) { - const currentRelationship = request.relationship; - if (!currentRelationship || ((currentRelationship === 'contains' || currentRelationship === 'belongsTo') && relationship === currentRelationship)) { - return 'broad'; - } - return 'only'; - } - - getShortForm() { - return 'B'; - } - - /** - * Given a request that would not otherwise be processed, answer whether or not its document should be - * traversed to discover additional resources to process. 
- */ - shouldTraverse(request) { - return true; - } -} - -class OnlyTransitivity { - - getNextPolicy(request, relationship) { - return null; - } - - getShortForm() { - return 'O'; - } - - shouldTraverse(request) { - return false; - } -} - -class NeighborsTransitivity { - - getNextPolicy(request, relationship) { - return 'only'; - } - - getShortForm() { - return 'N'; - } - - shouldTraverse(request) { - return true; - } -} - -const transitivitySpecs = { - update: new UpdateTransitivity(), - broad: new BroadTransitivity(), - only: new OnlyTransitivity(), - neighbors: new NeighborsTransitivity() -}; +const VisitorMap = require('./visitorMap'); class TraversalPolicy { - static getPolicy(name) { - const definition = TraversalPolicy[name]; - return definition ? definition() : null; + static _resolveMapSpec(spec) { + if (!spec) { + return null; + } + if (typeof spec !== 'string') { + return spec; + } + const [mapName, path] = spec.split('@'); + return VisitorMap.getMap(mapName, path); } - static default() { - return new TraversalPolicy('mutables', 'match', 'broad'); + static getPolicy(policySpec) { + const [policyName, mapSpec] = policySpec.split(':'); + const map = TraversalPolicy._resolveMapSpec(mapSpec); + if (!map) { + return null; + } + + const definition = TraversalPolicy[policyName]; + return definition ? 
definition(map) : null; } - static refresh() { - return new TraversalPolicy('mutables', 'match', 'update'); + static default(map) { + return new TraversalPolicy('mutables', 'match', TraversalPolicy._resolveMapSpec(map)); } - static reload() { - return new TraversalPolicy('originStorage', 'match', 'update'); + static refresh(map) { + return new TraversalPolicy('mutables', 'match', TraversalPolicy._resolveMapSpec(map)); } - static events() { - return TraversalPolicy.default(); + static reload(map) { + return new TraversalPolicy('originStorage', 'match', TraversalPolicy._resolveMapSpec(map)); } - static reprocess() { - return new TraversalPolicy('storageOnly', 'version', 'update'); + static reprocess(map) { + return new TraversalPolicy('storageOnly', 'version', TraversalPolicy._resolveMapSpec(map)); } - static reprocessAndDiscover() { - return new TraversalPolicy('storageOriginIfMissing', 'version', 'update'); + static reprocessAndDiscover(map) { + return new TraversalPolicy('storageOriginIfMissing', 'version', TraversalPolicy._resolveMapSpec(map)); } - static reprocessAndUpdate() { - return new TraversalPolicy('mutables', 'matchOrVersion', 'update'); + static always(map) { + return new TraversalPolicy('origin', 'always', TraversalPolicy._resolveMapSpec(map)); + } + + static reprocessAndUpdate(map) { + return new TraversalPolicy('mutables', 'matchOrVersion', TraversalPolicy._resolveMapSpec(map)); } static clone(policy) { - return new TraversalPolicy(policy.fetch, policy.freshness, policy.transitivity); + return new TraversalPolicy(policy.fetch, policy.freshness, policy.map); } - constructor(fetch, freshness, transitivity) { + constructor(fetch, freshness, map) { this.fetch = fetch; this.freshness = freshness; - this.transitivity = transitivity; + this.map = typeof map === 'string' ? 
new VisitorMap(map) : map; } - getNextPolicy(request, relationship) { - const transitivity = this._getTransitivitySpec().getNextPolicy(request, relationship); - if (transitivity === null) { + getNextPolicy(name) { + const map = this.map.getNextMap(name); + if (!map) { return null; } - return new TraversalPolicy(this.fetch, this.freshness, transitivity); + return new TraversalPolicy(this.fetch, this.freshness, map); + } + + getCurrentStep() { + return this.map.getCurrentStep(); } /** @@ -254,8 +155,8 @@ class TraversalPolicy { * Given a request that would not otherwise be processed, answer whether or not its document should be * traversed to discover additional resources to process. */ - shouldTraverse(request) { - return this._getTransitivitySpec().shouldTraverse(request); + shouldTraverse() { + return this.map.hasNextStep(); } isImmutable(type) { @@ -296,13 +197,9 @@ class TraversalPolicy { freshness = 'N'; } } - const transitivity = this._getTransitivitySpec().getShortForm(); - return fetch + freshness + transitivity; + return fetch + freshness; } - _getTransitivitySpec() { - return transitivitySpecs[this.transitivity]; - } } -module.exports = TraversalPolicy; \ No newline at end of file +module.exports = TraversalPolicy; diff --git a/lib/visitorMap.js b/lib/visitorMap.js new file mode 100644 index 0000000..3bccf0e --- /dev/null +++ b/lib/visitorMap.js @@ -0,0 +1,338 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+
+class VisitorMap {
+
+  static getCopy(name) {
+    return VisitorMap.copy(VisitorMap._getMap(name));
+  }
+
+  static copy(node, seen = new Map()) {
+    if (typeof node === 'string' || typeof node === 'function') {
+      return node;
+    }
+    if (seen.get(node)) {
+      return seen.get(node);
+    }
+    // if (Array.isArray(node)) {
+    //   if (Array.isArray(node[0])) {
+    //     return [[VisitorMap.copy(node[0][0], seen)]];
+    //   }
+    //   return [VisitorMap.copy(node[0], seen)];
+    // }
+    const result = {};
+    seen.set(node, result);
+    for (let key in node) {
+      const value = node[key];
+      if (typeof value === 'function') {
+        result[key] = value;
+      }
+      result[key] = VisitorMap.copy(value, seen);
+    }
+    return result;
+  }
+
+  static resolve(step, segment) {
+    return typeof step === 'function' ? step(segment) : step[segment];
+  }
+
+  static _getMap(name) {
+    return mapList[name] || mapList.event[name];
+  }
+
+  static getMap(name, path = '/') {
+    return name ? new VisitorMap(name, path) : null;
+  }
+
+  constructor(name, path = '/') {
+    this.name = name;
+    this.path = path;
+  }
+
+  getNextMap(next) {
+    const separator = this.path.endsWith('/') ? '' : '/';
+    return this.hasNextStep(next) ? new VisitorMap(this.name, this.path + `${separator}${next}`) : null;
+  }
+
+  getNextStep(next) {
+    const current = this.getCurrentStep();
+    return this.navigate(current, next);
+  }
+
+  hasNextStep(next = null) {
+    const current = this.getCurrentStep();
+    // arrays trigger the traversal of a collection/relation but not their contents. Terminal nodes only
+    if (Array.isArray(current)) {
+      return false;
+    }
+    const props = Object.getOwnPropertyNames(current);
+    if (props.length === 0) {
+      return false;
+    }
+    return next ? props.includes(next) : true;
+  }
+
+  getCurrentStep() {
+    const map = this.getMap();
+    if (!map) {
+      throw new Error(`VisitorMap in an invalid state. 
Unknown map: ${this.name}`); + } + return this.navigate(this.getMap(), this.getPath()); + } + + navigate(map, path) { + if (!map) { + throw new Error('VisitorMap in an invalid state. Unknown map.') + } + path = this._resolvePath(path); + let current = map; + let currentPath = []; + for (let i = 0; i < path.length; i++) { + const segment = path[i]; + currentPath.push(segment); + current = VisitorMap.resolve(current, segment); + if (!current) { + return current; + } + } + return current; + } + + getMap() { + return VisitorMap._getMap(this.name); + } + + getPath() { + return this._resolvePath(this.path); + } + + _resolvePath(spec) { + if (Array.isArray(spec)) { + return spec; + } + if (spec === '/') { + return []; + } + return spec.split('/').slice(spec.startsWith('/') ? 1 : 0); + } +} + +module.exports = VisitorMap; + + +// Map building blocks +const self = {}; + +function neighbors() { + return self; +} + +function collection(type) { + // return [type]; + return type; +} + +function relation(type) { + // return [[type]]; + return type; +} + +// /org/repos/collaborators = [], {} +// + +const commit_comment = { + _type: 'commit_comment', + user: self +}; + + +const commit = { + _type: 'commit', + commit_comments: collection(commit_comment), + repo: self, + author: self, + committer: self +}; + +const status = self; + +const issue_comment = { + _type: 'issue_comment', + user: self +}; + +const issue = { + _type: 'issue', + // assignees: collection(user), + user: self, + repo: self, + assignee: self, + closed_by: self, + issue_comments: collection(issue_comment) + // pull_request: pull_request +} + +const review_comment = { + _type: 'review_comment', + user: self +}; + +const review = { + _type: 'review', + user: self, + pull_request: self +}; + +const pull_request = { + _type: 'pull_request', + user: self, + merged_by: self, + assignee: self, + head: self, + base: self, + review: review, + review_comments: review_comment, + statuses: collection(status), + commits: 
collection(commit), + issue: issue, + issue_comments: collection(issue_comment) +} +// patch pull_request into issue as it is a cycle. +issue.pull_request = pull_request; + +const deployment = { + _type: 'deployment', + created_by: self +}; + +const traffic = { + _type: 'traffic', + referrers: self, + paths: self, + views: self, + clones: self +}; + +const team = { + _type: 'team', + organization: self, + members: relation(self), + repos: relation(self) +} + +const repo = { + _type: 'repo', + owner: self, + organization: self, + teams: relation(team), + collaborators: relation(self), + contributors: relation(self), + subscribers: relation(self), + issues: collection(issue), + commits: collection(commit), + events: collection(event) +} + +const user = { + _type: 'user', + repos: collection(repo) +}; + +const org = { + _type: 'org', + repos: collection(repo), + user: user, + members: relation(user), + teams: relation(team) +}; + + +function event(additions = {}) { + const base = { + actor: self, + repo: self, + org: self + }; + return Object.assign({}, base, additions); +} + +const eventList = { + CommitCommentEvent: event({ + commit: commit, + commit_comment: commit_comment + }), + CreateEvent: event(), + DeleteEvent: event(), + DeploymentEvent: event({ + deployment: deployment + }), + DeploymentStatusEvent: event({ + deployment: deployment + }), + ForkEvent: event(), + GollumEvent: event(), + IssueCommentEvent: event({ + issue: issue, + issue_comment: issue_comment + }), + IssuesEvent: event({ + issue: issue, + assignee: self, + label: self + }), + LabelEvent: event(), + MemberEvent: event({ + member: self + }), + MembershipEvent: event({ + member: self, + team: self + }), + MilestoneEvent: event(), + PageBuildEvent: event(), + PublicEvent: event(), + PullRequestEvent: event({ + pull_request: pull_request + }), + PullRequestReviewEvent: event({ + pull_request: pull_request + }), + PullRequestReviewCommentEvent: event({ + pull_request: pull_request, + comment: 
review_comment + }), + PushEvent: event(), + ReleaseEvent: event(), + RepositoryEvent: event({ + repository: self + }), + StatusEvent: event(), + TeamEvent: event({ + repository: self, + team: self + }), + TeamAddEvent: event({ + repository: self, + team: self + }), + WatchEvent: event() +}; + +const mapList = VisitorMap.copy({ + self: self, + neighbors: neighbors, + event: eventList, + org: org, + repo: repo, + user: user, + team: team, + commit: commit, + commit_comment: commit_comment, + deployment: deployment, + issue: issue, + issue_comment: issue_comment, + pull_request: pull_request, + review: review, + review_comment: review_comment, + traffic: traffic +}); diff --git a/test/crawlerTests.js b/test/crawlerTests.js index 7b7ce4f..363fa62 100644 --- a/test/crawlerTests.js +++ b/test/crawlerTests.js @@ -178,7 +178,7 @@ describe('Crawler error handler', () => { const request = crawler._errorHandler(box, error); expect(request.shouldSkip()).to.be.true; expect(request.shouldRequeue()).to.be.true; - expect(request.outcome).to.be.equal('Processing Error'); + expect(request.outcome).to.be.equal('Error'); expect(request.message).to.be.equal(error); }); @@ -350,6 +350,7 @@ describe('Crawler requeue', () => { request.markRequeue(); request._retryQueue = 'normal'; const crawler = createBaseCrawler({ queues: queues }); + request.crawler = crawler; return crawler._requeue(request).then(() => { queue = [].concat.apply([], queue); expect(queue.length).to.be.equal(0); @@ -590,13 +591,14 @@ describe('Crawler process document', () => { }); it('should invoke a handler', () => { - const originalRequest = new Request('test', 'http://test.com'); + const originalRequest = new Request('user', 'http://test.com'); + originalRequest.policy = TraversalPolicy.always('user'); const doc = { _metadata: {} }; originalRequest.document = doc; const crawler = createBaseCrawler(); const processorBox = []; - crawler.processor.test = request => { - processorBox[0] = 42; + crawler.processor.user = 
request => { + processorBox.push(42); request.document.cool = 'content'; return request.document; }; @@ -621,11 +623,13 @@ describe('Crawler process document', () => { }); it('should throw if the handler throws', () => { - const originalRequest = new Request('test', 'http://test.com'); + const originalRequest = new Request('user', 'http://test.com'); + originalRequest.policy = TraversalPolicy.reload('user'); + originalRequest.policy.freshness = 'always'; const doc = { _metadata: {} }; originalRequest.document = doc; const crawler = createBaseCrawler(); - crawler.processor.test = request => { throw new Error('bummer'); }; + crawler.processor.user = request => { throw new Error('bummer'); }; return Q.try(() => { return crawler._processDocument(originalRequest) }).then( @@ -762,7 +766,9 @@ describe('Crawler whole meal deal', () => { const normal = crawler.queues.queueTable['normal']; const priority = crawler.queues.queueTable['priority']; - normal.requests = [new Request('user', 'http://test.com/users/user1')]; + const request = new Request('user', 'http://test.com/users/user1'); + request.policy = TraversalPolicy.reload('user'); + normal.requests = [request]; crawler.fetcher.responses = [createResponse({ id: 42, repos_url: 'http://test.com/users/user1/repos' })]; return Q.try(() => { return crawler.processOne({ name: 'test' }); }).then( () => { @@ -936,7 +942,9 @@ describe('Crawler whole meal deal', () => { const crawler = createFullCrawler(); crawler.store = { upsert: () => { throw new Error('bad upsert') } }; const normal = crawler.queues.queueTable['normal']; - normal.requests = [new Request('user', 'http://test.com/users/user1')]; + const request = new Request('user', 'http://test.com/users/user1'); + request.policy = TraversalPolicy.reload('user'); + normal.requests = [request]; crawler.fetcher.responses = [createResponse({ id: 42, repos_url: 'http://test.com/users/user1/repos' })]; return Q.try(() => { @@ -1133,8 +1141,9 @@ function createBaseStore({etag = 
null, upsert = null, get = null} = {}) { return result; } -function createBaseLog({info = null, warn = null, error = null, verbose = null, silly = null} = {}) { +function createBaseLog({log = null, info = null, warn = null, error = null, verbose = null, silly = null} = {}) { const result = {}; + result.log = log || (() => { }); result.info = info || (() => { }); result.warn = warn || (() => { }); result.error = error || (() => { }); diff --git a/test/gitHubProcessorTests.js b/test/gitHubProcessorTests.js index 17c0120..38be9cd 100644 --- a/test/gitHubProcessorTests.js +++ b/test/gitHubProcessorTests.js @@ -11,9 +11,8 @@ const TraversalPolicy = require('../lib/traversalPolicy'); describe('GitHubProcessor reprocessing', () => { it('will skip if at same version', () => { const processor = new GitHubProcessor(); - const request = new Request('user', 'http://test.com/users/user1'); - request.policy.freshness = 'version'; - request.policy.transitivity = 'only'; + const request = createRequest('user', 'http://test.com/users/user1'); + request.policy = TraversalPolicy.reprocess('user'); request.document = { _metadata: { version: processor.version } }; sinon.stub(processor, 'user', () => { }); processor.process(request); @@ -23,9 +22,8 @@ describe('GitHubProcessor reprocessing', () => { it('will skip and warn if at greater version', () => { const processor = new GitHubProcessor(); - const request = new Request('user', 'http://test.com/users/user1'); - request.policy.freshness = 'version'; - request.policy.transitivity = 'only'; + const request = createRequest('user', 'http://test.com/users/user1'); + request.policy = TraversalPolicy.reprocess('user'); request.document = { _metadata: { version: processor.version + 1 } }; sinon.stub(processor, 'user', () => { }); processor.process(request); @@ -36,7 +34,8 @@ describe('GitHubProcessor reprocessing', () => { it('will process and update if at lesser version', () => { const processor = new GitHubProcessor(); - const request = new 
Request('user', 'http://test.com/users/user1'); + const request = createRequest('user', 'http://test.com/users/user1'); + request.policy = TraversalPolicy.reprocess('user'); request.fetch = 'none'; request.document = { _metadata: { version: processor.version - 1 } }; sinon.stub(processor, 'user', () => { return request.document; }); @@ -49,7 +48,9 @@ describe('GitHubProcessor reprocessing', () => { describe('Collection processing', () => { it('should queue collection pages as broad and elements as broad', () => { - const request = new Request('issues', 'http://test.com/issues', { elementType: 'issue' }); + const request = createRequest('issues', 'http://test.com/issues', { elementType: 'issue' }); + request.policy = TraversalPolicy.refresh('repo@issues'); + request.policy.freshness = 'always'; request.response = { headers: { link: createLinkHeader(request.url, null, 2, 2) } }; @@ -64,14 +65,12 @@ describe('Collection processing', () => { expect(queue.getCall(0).args[1]).to.be.equal('soon'); const newPages = queue.getCall(0).args[0]; expect(newPages.length).to.be.equal(1); - expect(newPages[0].policy.transitivity).to.be.equal('broad'); expect(newPages[0].url).to.be.equal('http://test.com/issues?page=2&per_page=100'); expect(newPages[0].type).to.be.equal('issues'); let newRequest = queue.getCall(1).args[0]; expect(newRequest.length).to.be.equal(1); newRequest = newRequest[0]; - expect(newRequest.policy.transitivity).to.be.equal('broad'); expect(newRequest.url).to.be.equal('http://child1'); expect(newRequest.type).to.be.equal('issue'); }); @@ -79,8 +78,9 @@ describe('Collection processing', () => { describe('URN building', () => { it('should create urn for team members', () => { - const request = new Request('repo', 'http://test.com/foo'); - request.policy = TraversalPolicy.refresh(); + const request = createRequest('repo', 'http://test.com/foo'); + request.policy = TraversalPolicy.refresh('repo'); + request.policy.freshness = 'always'; request.document = { _metadata: 
{ links: {} }, id: 42, owner: { url: 'http://test.com/test' }, teams_url: 'http://test.com/teams', issues_url: 'http://test.com/issues', commits_url: 'http://test.com/commits', collaborators_url: 'http://test.com/collaborators' }; request.crawler = { queue: () => { }, queues: { pushPriority: () => { } } }; const queue = sinon.spy(request.crawler, 'queue'); @@ -135,7 +135,7 @@ describe('URN building', () => { describe('Org processing', () => { it('should link and queue correctly', () => { - const request = new Request('org', 'http://org/9'); + const request = createRequest('org', 'http://org/9'); request.context = {}; const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; @@ -160,20 +160,19 @@ describe('Org processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://users/9', relationship: 'reference', transitivity: 'only' }, - { type: 'repos', url: 'http://repos', relationship: 'contains', transitivity: 'broad' }, - { type: 'members', url: 'http://members', relationship: 'contains', transitivity: 'broad' }, - { type: 'teams', url: 'http://orgs/9/teams', relationship: 'contains', transitivity: 'broad' } + const expected = [ + { type: 'user', url: 'http://users/9', path: '/user' }, + { type: 'repos', url: 'http://repos', qualifier: 'urn:org:9', path: '/repos' }, + { type: 'members', url: 'http://members', qualifier: 'urn:org:9', path: '/members' }, + { type: 'teams', url: 'http://orgs/9/teams', qualifier: 'urn:org:9', path: '/teams' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); }); describe('User processing', () => { it('should link and queue correctly', () => { - const request = new Request('user', 'http://user/9'); - request.context = {}; + const request = createRequest('user', 'http://user/9'); const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; request.document = { @@ 
-192,17 +191,16 @@ describe('User processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'repos', url: 'http://repos', relationship: 'contains', transitivity: 'broad' } + const expected = [ + { type: 'repos', url: 'http://repos', qualifier: 'urn:user:9', path: '/repos' }, ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); }); describe('Repo processing', () => { it('should link and queue correctly', () => { - const request = new Request('repo', 'http://foo/repo/12'); - request.context = {}; + const request = createRequest('repo', 'http://foo/repo/12'); const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; request.document = { @@ -215,6 +213,7 @@ describe('Repo processing', () => { events_url: 'http://events', issues_url: 'http://issues{/number}', pulls_url: 'http://pulls{/number}', + subscribers_count: 1, subscribers_url: 'http://subscribers', teams_url: 'http://teams', organization: { id: 24, url: 'http://org/24' }, @@ -239,22 +238,22 @@ describe('Repo processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/45', relationship: 'reference', transitivity: 'only' }, - { type: 'org', url: 'http://org/24', relationship: 'belongsTo', transitivity: 'only' }, - { type: 'teams', url: 'http://teams', relationship: 'contains', transitivity: 'broad', relation: { origin: 'repo', qualifier: 'urn:repo:12:teams', type: 'team' } }, - { type: 'collaborators', url: 'http://collaborators', relationship: 'contains', transitivity: 'broad', relation: { origin: 'repo', qualifier: 'urn:repo:12:collaborators', type: 'user' } }, - { type: 'contributors', url: 'http://contributors', relationship: 'contains', transitivity: 'broad', relation: { origin: 'repo', qualifier: 'urn:repo:12:contributors', type: 'user' } }, - { type: 'subscribers', url: 'http://subscribers', relationship: 'contains', transitivity: 'broad', 
relation: { origin: 'repo', qualifier: 'urn:repo:12:subscribers', type: 'user' } }, - { type: 'issues', url: 'http://issues', relationship: 'contains', transitivity: 'broad' }, - { type: 'commits', url: 'http://commits', relationship: 'contains', transitivity: 'broad' }, - { type: 'events', url: 'http://events', relationship: 'contains', transitivity: 'broad' } + const expected = [ + { type: 'user', url: 'http://user/45', path: '/owner' }, + { type: 'org', url: 'http://org/24', path: '/organization' }, + { type: 'teams', url: 'http://teams', qualifier: 'urn:repo:12', path: '/teams', relation: { origin: 'repo', qualifier: 'urn:repo:12:teams', type: 'team' } }, + { type: 'collaborators', url: 'http://collaborators', qualifier: 'urn:repo:12', path: '/collaborators', relation: { origin: 'repo', qualifier: 'urn:repo:12:collaborators', type: 'user' } }, + { type: 'contributors', url: 'http://contributors', qualifier: 'urn:repo:12', path: '/contributors', relation: { origin: 'repo', qualifier: 'urn:repo:12:contributors', type: 'user' } }, + { type: 'subscribers', url: 'http://subscribers', qualifier: 'urn:repo:12', path: '/subscribers', relation: { origin: 'repo', qualifier: 'urn:repo:12:subscribers', type: 'user' } }, + { type: 'issues', url: 'http://issues?state=all', qualifier: 'urn:repo:12', path: '/issues', }, + { type: 'commits', url: 'http://commits', qualifier: 'urn:repo:12', path: '/commits', }, + { type: 'events', url: 'http://events', qualifier: 'urn:repo:12', path: '/events', } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); it('should link and queue CreateEvent', () => { - const request = new Request('CreateEvent', 'http://foo'); + const request = createRequest('CreateEvent', 'http://foo'); const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; const payload = { @@ -275,18 +274,18 @@ describe('Repo processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ 
- { type: 'user', url: 'http://user/3', relationship: 'reference', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/4', relationship: 'reference', transitivity: 'only' }, - { type: 'org', url: 'http://org/5', relationship: 'reference', transitivity: 'only' } + const expected = [ + { type: 'user', url: 'http://user/3', path: '/actor' }, + { type: 'repo', url: 'http://repo/4', path: '/repo' }, + { type: 'org', url: 'http://org/5', path: '/org' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); }); describe('Commit processing', () => { it('should link and queue correctly', () => { - const request = new Request('commit', 'http://foo/commit'); + const request = createRequest('commit', 'http://foo/commit'); request.context = { qualifier: 'urn:repo:12' }; const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; @@ -314,19 +313,19 @@ describe('Commit processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/7', relationship: 'reference', transitivity: 'only' }, - { type: 'user', url: 'http://user/15', relationship: 'reference', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/12', relationship: 'belongsTo', transitivity: 'only' }, - { type: 'commit_comments', url: 'http://comments', relationship: 'contains', transitivity: 'broad' } + const expected = [ + { type: 'user', url: 'http://user/7', path: '/author' }, + { type: 'user', url: 'http://user/15', path: '/committer' }, + { type: 'repo', url: 'http://repo/12', path: '/repo' }, + { type: 'commit_comments', url: 'http://comments', qualifier: 'urn:repo:12:commit:6dcb09b5b5', path: '/commit_comments' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); }); describe('Commit comment processing', () => { it('should link and queue correctly', () => { - const request = new Request('commit_comment', 'http://repo/commit/comment'); + const request = 
createRequest('commit_comment', 'http://repo/commit/comment'); request.context = { qualifier: 'urn:repo:12:commit:a1b1' }; const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; @@ -346,14 +345,14 @@ describe('Commit comment processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/7', relationship: 'reference', transitivity: 'only' }, + const expected = [ + { type: 'user', url: 'http://user/7', path: '/user' }, ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); it('should link and queue CommitCommentEvent', () => { - const request = new Request('CommitCommentEvent', 'http://foo/pull'); + const request = createRequest('CommitCommentEvent', 'http://foo/pull'); const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; const payload = { @@ -375,20 +374,20 @@ describe('Commit comment processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/3', relationship: 'reference', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/4', relationship: 'reference', transitivity: 'only' }, - { type: 'org', url: 'http://org/5', relationship: 'reference', transitivity: 'only' }, - { type: 'commit_comment', url: 'http://commit_comment/7', relationship: 'contains', transitivity: 'broad' }, - { type: 'commit', url: 'http://repo/4/commits/a1b1', relationship: 'contains', transitivity: 'broad' } + const expected = [ + { type: 'user', url: 'http://user/3', path: '/actor' }, + { type: 'repo', url: 'http://repo/4', path: '/repo' }, + { type: 'org', url: 'http://org/5', path: '/org' }, + { type: 'commit', url: 'http://repo/4/commits/a1b1', qualifier: 'urn:repo:4', path: '/commit' }, + { type: 'commit_comment', url: 'http://commit_comment/7', qualifier: 'urn:repo:4:commit:a1b1', path: '/commit_comment' } ]; - expectQueued(queue, queued); 
+ expectQueued(queue, expected); }); }); describe('Deployment processing', () => { it('should link and queue correctly', () => { - const request = new Request('deployment', 'http://foo'); + const request = createRequest('deployment', 'http://foo'); request.context = { qualifier: 'urn:repo:12' }; const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; @@ -409,22 +408,24 @@ describe('Deployment processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/7', relationship: 'reference', transitivity: 'only' } + const expected = [ + { type: 'user', url: 'http://user/7', path: '/creator' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); }); describe('Pull Request processing', () => { it('should link and queue correctly', () => { - const request = new Request('pull_request', 'http://foo/pull'); + const request = createRequest('pull_request', 'http://foo/pull'); request.context = { qualifier: 'urn:repo:12' }; const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; request.document = { _metadata: { links: {} }, id: 13, + comments: 1, + commits: 1, assignee: { id: 1, url: 'http://user/1' }, milestone: { id: 26 }, head: { repo: { id: 45, url: 'http://repo/45' } }, @@ -449,6 +450,7 @@ describe('Pull Request processing', () => { assignee: { href: 'urn:user:1', type: 'resource' }, head: { href: 'urn:repo:45', type: 'resource' }, base: { href: 'urn:repo:17', type: 'resource' }, + repo: { href: 'urn:repo:17', type: 'resource' }, review_comments: { href: 'urn:repo:12:pull_request:13:review_comments', type: 'collection' }, commits: { href: 'urn:repo:12:pull_request:13:commits', type: 'collection' }, statuses: { href: 'urn:repo:12:commit:funkySHA:statuses', type: 'collection' }, @@ -457,21 +459,21 @@ describe('Pull Request processing', () => { } expectLinks(document._metadata.links, links); - const 
queued = [ - { type: 'user', url: 'http://user/7', relationship: 'reference', transitivity: 'only' }, - { type: 'user', url: 'http://user/15', relationship: 'reference', transitivity: 'only' }, - { type: 'user', url: 'http://user/1', relationship: 'reference', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/45', relationship: 'belongsTo', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/17', relationship: 'belongsTo', transitivity: 'only' }, - { type: 'review_comments', url: 'http://review_comments', relationship: 'contains', transitivity: 'broad' }, - { type: 'commits', url: 'http://commits', relationship: 'contains', transitivity: 'broad' }, - { type: 'statuses', url: 'http://statuses/funkySHA', relationship: 'contains', transitivity: 'broad' } + const expected = [ + { type: 'user', url: 'http://user/7', path: '/user' }, + { type: 'user', url: 'http://user/15', path: '/merged_by' }, + { type: 'user', url: 'http://user/1', path: '/assignee' }, + { type: 'repo', url: 'http://repo/45', path: '/head' }, + { type: 'repo', url: 'http://repo/17', path: '/base' }, + { type: 'review_comments', url: 'http://review_comments', qualifier: 'urn:repo:12:pull_request:13', path: '/review_comments' }, + { type: 'statuses', url: 'http://statuses/funkySHA', qualifier: 'urn:repo:12:pull_request:13', path: '/statuses' }, + { type: 'commits', url: 'http://commits', qualifier: 'urn:repo:12:pull_request:13', path: '/commits' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); it('should link and queue PullRequestEvent', () => { - const request = new Request('PullRequestEvent', 'http://foo/pull'); + const request = createRequest('PullRequestEvent', 'http://foo/pull'); const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; const payload = { @@ -492,17 +494,17 @@ describe('Pull Request processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 
'http://user/3', relationship: 'reference', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/4', relationship: 'reference', transitivity: 'only' }, - { type: 'org', url: 'http://org/5', relationship: 'reference', transitivity: 'only' }, - { type: 'pull_request', url: 'http://pull_request/1', relationship: 'contains', transitivity: 'broad' } + const expected = [ + { type: 'user', url: 'http://user/3', path: '/actor' }, + { type: 'repo', url: 'http://repo/4', path: '/repo' }, + { type: 'org', url: 'http://org/5', path: '/org' }, + { type: 'pull_request', url: 'http://pull_request/1', qualifier: 'urn:repo:4', path: '/pull_request' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); it('should link and queue PullRequestReviewEvent', () => { - const request = new Request('PullRequestReviewEvent', 'http://foo/pull'); + const request = createRequest('PullRequestReviewEvent', 'http://foo/pull'); const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; const payload = { @@ -523,19 +525,19 @@ describe('Pull Request processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/3', relationship: 'reference', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/4', relationship: 'reference', transitivity: 'only' }, - { type: 'org', url: 'http://org/5', relationship: 'reference', transitivity: 'only' }, - { type: 'pull_request', url: 'http://pull_request/1', relationship: 'contains', transitivity: 'broad' } + const expected = [ + { type: 'user', url: 'http://user/3', path: '/actor' }, + { type: 'repo', url: 'http://repo/4', path: '/repo' }, + { type: 'org', url: 'http://org/5', path: '/org' }, + { type: 'pull_request', url: 'http://pull_request/1', qualifier: 'urn:repo:4', path: '/pull_request' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); }); describe('Pull request/review comment processing', () => { 
it('should link and queue correctly', () => { - const request = new Request('review_comment', 'http://repo/pull_request/comment'); + const request = createRequest('review_comment', 'http://repo/pull_request/comment'); request.context = { qualifier: 'urn:repo:12:pull_request:27' }; const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; @@ -555,14 +557,14 @@ describe('Pull request/review comment processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/7', relationship: 'reference', transitivity: 'only' }, + const expected = [ + { type: 'user', url: 'http://user/7', path: '/user' }, ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); it('should link and queue PullRequestReviewCommentEvent', () => { - const request = new Request('PullRequestReviewCommentEvent', 'http://foo/pull'); + const request = createRequest('PullRequestReviewCommentEvent', 'http://foo/pull'); const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; const payload = { @@ -585,26 +587,27 @@ describe('Pull request/review comment processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/3', relationship: 'reference', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/4', relationship: 'reference', transitivity: 'only' }, - { type: 'org', url: 'http://org/5', relationship: 'reference', transitivity: 'only' }, - { type: 'review_comment', url: 'http://review_comment/7', relationship: 'contains', transitivity: 'broad' }, - { type: 'pull_request', url: 'http://pull_request/1', relationship: 'contains', transitivity: 'broad' } + const expected = [ + { type: 'user', url: 'http://user/3', path: '/actor' }, + { type: 'repo', url: 'http://repo/4', path: '/repo' }, + { type: 'org', url: 'http://org/5', path: '/org' }, + { type: 'review_comment', url: 
'http://review_comment/7', qualifier: 'urn:repo:4:pull_request:1', path: '/comment' }, + { type: 'pull_request', url: 'http://pull_request/1', qualifier: 'urn:repo:4', path: '/pull_request' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); }); describe('Issue processing', () => { it('should link and queue correctly', () => { - const request = new Request('issue', 'http://repo/issue'); + const request = createRequest('issue', 'http://repo/issue'); request.context = { qualifier: 'urn:repo:12' }; const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; request.document = { _metadata: { links: {} }, id: 27, + comments: 1, assignee: { id: 1, url: 'http://user/1' }, assignees: [{ id: 50 }, { id: 51 }], milestone: { id: 26 }, @@ -632,19 +635,19 @@ describe('Issue processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/7', relationship: 'reference', transitivity: 'only' }, - { type: 'user', url: 'http://user/15', relationship: 'reference', transitivity: 'only' }, - { type: 'user', url: 'http://user/1', relationship: 'reference', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/45', relationship: 'belongsTo', transitivity: 'only' }, - { type: 'issue_comments', url: 'http://issue/27/comments', relationship: 'contains', transitivity: 'broad' }, - { type: 'pull_request', url: 'http://pull_request/27', relationship: 'contains', transitivity: 'broad' } + const expected = [ + { type: 'user', url: 'http://user/7', path: '/user' }, + { type: 'user', url: 'http://user/15', path: '/closed_by' }, + { type: 'user', url: 'http://user/1', path: '/assignee' }, + { type: 'repo', url: 'http://repo/45', path: '/repo' }, + { type: 'issue_comments', url: 'http://issue/27/comments', qualifier: 'urn:repo:12:issue:27', path: '/issue_comments' }, + { type: 'pull_request', url: 'http://pull_request/27', qualifier: 'urn:repo:12', path: 
'/pull_request' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); it('should link and queue IssuesEvent', () => { - const request = new Request('IssuesEvent', 'http://foo/pull'); + const request = createRequest('IssuesEvent', 'http://foo/pull'); const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; const payload = { @@ -669,21 +672,21 @@ describe('Issue processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/3', relationship: 'reference', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/4', relationship: 'reference', transitivity: 'only' }, - { type: 'org', url: 'http://org/5', relationship: 'reference', transitivity: 'only' }, - { type: 'user', url: 'http://user/2', relationship: 'reference', transitivity: 'only' }, - { type: 'issue', url: 'http://issue/1', relationship: 'contains', transitivity: 'broad' }, - { type: 'label', url: 'http://label/8', relationship: 'contains', transitivity: 'broad' } + const expected = [ + { type: 'user', url: 'http://user/3', path: '/actor' }, + { type: 'repo', url: 'http://repo/4', path: '/repo' }, + { type: 'org', url: 'http://org/5', path: '/org' }, + { type: 'user', url: 'http://user/2', path: '/assignee' }, + { type: 'issue', url: 'http://issue/1', qualifier: 'urn:repo:4', path: '/issue' }, + { type: 'label', url: 'http://label/8', qualifier: 'urn:repo:4', path: '/label' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); }); describe('Issue comment processing', () => { it('should link and queue correctly', () => { - const request = new Request('issue_comment', 'http://repo/issue/comment'); + const request = createRequest('issue_comment', 'http://repo/issue/comment'); request.context = { qualifier: 'urn:repo:12:issue:27' }; const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; @@ -703,14 +706,14 @@ 
describe('Issue comment processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/7', relationship: 'reference', transitivity: 'only' }, + const expected = [ + { type: 'user', url: 'http://user/7', path: '/user' }, ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); it('should link and queue IssueCommentEvent', () => { - const request = new Request('IssueCommentEvent', 'http://foo/'); + const request = createRequest('IssueCommentEvent', 'http://foo/'); const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; const payload = { @@ -733,20 +736,20 @@ describe('Issue comment processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/3', relationship: 'reference', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/4', relationship: 'reference', transitivity: 'only' }, - { type: 'org', url: 'http://org/5', relationship: 'reference', transitivity: 'only' }, - { type: 'issue_comment', url: 'http://issue_comment/7', relationship: 'contains', transitivity: 'broad' }, - { type: 'issue', url: 'http://issue/1', relationship: 'contains', transitivity: 'broad' } + const expected = [ + { type: 'user', url: 'http://user/3', path: '/actor' }, + { type: 'repo', url: 'http://repo/4', path: '/repo' }, + { type: 'org', url: 'http://org/5', path: '/org' }, + { type: 'issue_comment', url: 'http://issue_comment/7', qualifier: 'urn:repo:4:issue:1', path: '/comment' }, + { type: 'issue', url: 'http://issue/1', qualifier: 'urn:repo:4', path: '/issue' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); }); describe('Status processing', () => { it('should link and queue StatusEvent', () => { - const request = new Request('StatusEvent', 'http://foo/'); + const request = createRequest('StatusEvent', 'http://foo/'); const queue = []; request.crawler = { queue: 
sinon.spy(request => { queue.push.apply(queue, request) }) }; const payload = { @@ -767,19 +770,18 @@ describe('Status processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/3', relationship: 'reference', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/4', relationship: 'reference', transitivity: 'only' }, - { type: 'org', url: 'http://org/5', relationship: 'reference', transitivity: 'only' } + const expected = [ + { type: 'user', url: 'http://user/3', path: '/actor' }, + { type: 'repo', url: 'http://repo/4', path: '/repo' }, + { type: 'org', url: 'http://org/5', path: '/org' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); }); describe('Team processing', () => { it('should link and queue correctly', () => { - const request = new Request('team', 'http://team/66'); - request.context = { qualifier: 'urn' }; + const request = createRequest('team', 'http://team/66'); const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; request.document = { @@ -801,16 +803,16 @@ describe('Team processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'org', url: 'http://orgs/9', relationship: 'belongsTo', transitivity: 'only' }, - { type: 'repos', url: 'http://teams/66/repos', relationship: 'contains', transitivity: 'broad' }, - { type: 'members', url: 'http://teams/66/members', relationship: 'contains', transitivity: 'broad' } + const expected = [ + { type: 'org', url: 'http://orgs/9', path: '/organization' }, + { type: 'repos', url: 'http://teams/66/repos', qualifier: 'urn:team:66', path: '/repos' }, + { type: 'members', url: 'http://teams/66/members', qualifier: 'urn:team:66', path: '/members' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); it('should link and queue TeamEvent', () => { - const request = new Request('TeamEvent', 'http://foo/team'); + const request 
= createRequest('TeamEvent', 'http://foo/team'); const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; const payload = { @@ -831,16 +833,16 @@ describe('Team processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/3', relationship: 'reference', transitivity: 'only' }, - { type: 'org', url: 'http://org/5', relationship: 'reference', transitivity: 'only' }, - { type: 'team', url: 'http://team/7', relationship: 'reference', transitivity: 'only' } + const expected = [ + { type: 'user', url: 'http://user/3', path: '/actor' }, + { type: 'org', url: 'http://org/5', path: '/org' }, + { type: 'team', url: 'http://team/7', path: '/team' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); it('should link and queue TeamEvent with repository', () => { - const request = new Request('TeamEvent', 'http://foo/team'); + const request = createRequest('TeamEvent', 'http://foo/team'); const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; const payload = { @@ -863,17 +865,17 @@ describe('Team processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/3', relationship: 'reference', transitivity: 'only' }, - { type: 'org', url: 'http://org/5', relationship: 'reference', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/6', relationship: 'reference', transitivity: 'only' }, - { type: 'team', url: 'http://team/7', relationship: 'reference', transitivity: 'only' } + const expected = [ + { type: 'user', url: 'http://user/3', path: '/actor' }, + { type: 'org', url: 'http://org/5', path: '/org' }, + { type: 'repo', url: 'http://repo/6', path: '/repository' }, + { type: 'team', url: 'http://team/7', path: '/team' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); it('should link and queue TeamAddEvent', () => 
{ - const request = new Request('TeamAddEvent', 'http://foo/team'); + const request = createRequest('TeamAddEvent', 'http://foo/team'); const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; const payload = { @@ -896,19 +898,19 @@ describe('Team processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/3', relationship: 'reference', transitivity: 'only' }, - { type: 'org', url: 'http://org/5', relationship: 'reference', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/6', relationship: 'reference', transitivity: 'only' }, - { type: 'team', url: 'http://team/7', relationship: 'reference', transitivity: 'only' } + const expected = [ + { type: 'user', url: 'http://user/3', path: '/actor' }, + { type: 'org', url: 'http://org/5', path: '/org' }, + { type: 'repo', url: 'http://repo/6', path: '/repository' }, + { type: 'team', url: 'http://team/7', path: '/team' } ]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); }); describe('Watch processing', () => { it('should link and queue WatchEvent', () => { - const request = new Request('WatchEvent', 'http://foo/watch'); + const request = createRequest('WatchEvent', 'http://foo/watch'); const queue = []; request.crawler = { queue: sinon.spy(request => { queue.push.apply(queue, request) }) }; const payload = { @@ -929,12 +931,12 @@ describe('Watch processing', () => { } expectLinks(document._metadata.links, links); - const queued = [ - { type: 'user', url: 'http://user/3', relationship: 'reference', transitivity: 'only' }, - { type: 'repo', url: 'http://repo/4', relationship: 'reference', transitivity: 'only' }, - { type: 'org', url: 'http://org/5', relationship: 'reference', transitivity: 'only' } + const expected = [ + { type: 'user', url: 'http://user/3', path: '/actor' }, + { type: 'repo', url: 'http://repo/4', path: '/repo' }, + { type: 'org', url: 'http://org/5', path: '/org' } 
]; - expectQueued(queue, queued); + expectQueued(queue, expected); }); }); @@ -956,6 +958,13 @@ describe('Event Finder', () => { // =========================== HELPERS ========================= + +function createRequest(type, url, context = {}) { + const result = new Request(type, url, context); + result.policy = TraversalPolicy.default(type); + return result; +} + function expectLinks(actual, expected) { expect(Object.getOwnPropertyNames(actual).length).to.be.equal(Object.getOwnPropertyNames(expected).length); Object.getOwnPropertyNames(actual).forEach(name => { @@ -980,8 +989,8 @@ function expectQueued(actual, expected) { const er = e.context ? e.context.relation : null; return e.type === a.type && e.url === a.url - && (!e.relationship || e.relationship === a.relationship) - && (!e.transitivity || e.transitivity === a.policy.transitivity) + && (!e.urn || e.urn === a.context.qualifier) + && (!e.path || e.path === a.policy.map.path) && (!er || (er.origin === ar.orgin && er.qualifier === ar.qualifier && er.type === ar.type)); })).to.be.true; }) diff --git a/test/githubFetcherTests.js b/test/githubFetcherTests.js index ffcc67c..43f42f4 100644 --- a/test/githubFetcherTests.js +++ b/test/githubFetcherTests.js @@ -17,7 +17,7 @@ const URL = require('url'); describe('GitHub fetcher', () => { it('should fetch one unseen document', () => { - const request = new Request('foo', 'http://test'); + const request = createRequest('foo', 'http://test'); const responses = [createResponse('test')]; const requestor = createBaseRequestor({ get: () => { return Q(responses.shift()); } }); const store = createBaseStore({ etag: () => { return Q(null); } }); @@ -31,7 +31,7 @@ describe('GitHub fetcher', () => { it('should set proper types for collection requests', () => { const url = 'http://test.com/foo'; - const request = new Request('repos', url); + const request = createRequest('repos', url); let etagArgs = null; let getArgs = null; const responses = [createResponse('test')]; @@ -54,7 
+54,7 @@ describe('GitHub fetcher', () => { }); it('should requeue and delay on 403 forbidden throttling', () => { - const request = new Request('foo', 'http://test'); + const request = createRequest('foo', 'http://test'); const responses = [createResponse('test', 403, null, 0)]; const requestor = createBaseRequestor({ get: () => { @@ -73,8 +73,8 @@ describe('GitHub fetcher', () => { }); it('should delay on backoff throttling', () => { - const request = new Request('foo', 'http://test'); - const resetTime = Date.now() + 2000; + const request = createRequest('foo', 'http://test'); + const resetTime = Math.floor(Date.now() / 1000) + 2; const responses = [createResponse('bar', 200, null, 30, resetTime)]; const requestor = createBaseRequestor({ get: () => { return Q(responses.shift()); } }); const store = createBaseStore({ etag: () => { return Q(null); } }); @@ -83,15 +83,14 @@ describe('GitHub fetcher', () => { expect(request.document).to.be.equal('bar'); expect(request.shouldRequeue()).to.be.false; expect(request.shouldSkip()).to.be.false; - expect(request.nextRequestTime).to.be.equal(resetTime); + expect(request.nextRequestTime).to.be.equal(resetTime * 1000); }); }); it('should delay on Retry-After throttling', () => { - const request = new Request('foo', 'http://test'); - const resetTime = Date.now() + 3000; - const headers = { 'Retry-After': 3 }; - const responses = [createResponse('bar', 200, null, 30, resetTime, headers)]; + const request = createRequest('foo', 'http://test'); + const headers = { 'retry-after': 3 }; + const responses = [createResponse('bar', 200, null, 300, 244123412, headers)]; const requestor = createBaseRequestor({ get: () => { return Q(responses.shift()); } }); const store = createBaseStore({ etag: () => { return Q(null); } }); const fetcher = createBaseFetcher({ requestor: requestor, store: store }); @@ -100,12 +99,13 @@ describe('GitHub fetcher', () => { expect(request.shouldRequeue()).to.be.false; expect(request.shouldSkip()).to.be.false; 
// give at most 100ms for the test to run + const resetTime = Date.now() + 3000; expect(request.nextRequestTime).to.be.within(resetTime, resetTime + 100); }); }); it('should skip 409s', () => { - const request = new Request('foo', 'http://test'); + const request = createRequest('foo', 'http://test'); const responses = [createResponse('test', 409)]; const requestor = createBaseRequestor({ get: () => { return Q(responses.shift()); } }); const store = createBaseStore({ etag: () => { return Q(null); } }); @@ -118,8 +118,8 @@ describe('GitHub fetcher', () => { it('should return cached content and not save and response for 304 with force', () => { const url = 'http://test.com/foo'; - const request = new Request('repos', url); - request.policy = TraversalPolicy.refresh(); + const request = createRequest('repos', url); + request.policy = TraversalPolicy.refresh('self'); let getArgs = null; const responses = [createResponse(null, 304, 42)]; const requestor = createBaseRequestor({ @@ -140,7 +140,7 @@ describe('GitHub fetcher', () => { it('should return cached content and headers for 304', () => { const url = 'http://test.com/foo'; const request = new Request('repos', url); - request.policy = TraversalPolicy.refresh(); + request.policy = TraversalPolicy.refresh('self'); let getArgs = null; const responses = [createResponse(null, 304, 42)]; const requestor = createBaseRequestor({ @@ -160,7 +160,7 @@ describe('GitHub fetcher', () => { }); it('should skip for 304 without force', () => { - const request = new Request('foo', 'http://test'); + const request = createRequest('foo', 'http://test'); const responses = [createResponse(null, 304, 42)]; const requestor = createBaseRequestor({ get: () => { return Q(responses.shift()); } }); const store = createBaseStore({ etag: () => { return Q(42); }, get: () => { return Q({ id: 13, _metadata: { fetchedAt: 3, version: 7 } }); } }); @@ -172,7 +172,7 @@ describe('GitHub fetcher', () => { }); it('should get from origin with originOnly fetch 
policy', () => { - const request = new Request('foo', 'http://test'); + const request = createRequest('foo', 'http://test'); request.policy.fetch = 'originOnly'; const responses = [createResponse('hey there')]; const requestor = createBaseRequestor({ get: () => { return Q(responses.shift()); } }); @@ -184,7 +184,7 @@ describe('GitHub fetcher', () => { }); it('should pull from storage only storageOnly fetch policy', () => { - const request = new Request('foo', 'http://test'); + const request = createRequest('foo', 'http://test'); request.policy.fetch = 'storageOnly'; const store = createBaseStore({ get: () => { return Q({ _metadata: {}, id: 'test' }); } }); const fetcher = createBaseFetcher({ store: store }); @@ -195,7 +195,7 @@ describe('GitHub fetcher', () => { }); it('should throw for bad codes', () => { - const request = new Request('foo', 'http://test'); + const request = createRequest('foo', 'http://test'); const responses = [createResponse('test', 500)]; const requestor = createBaseRequestor({ get: () => { return Q(responses.shift()); } }); const store = createBaseStore({ etag: () => { return Q(null); } }); @@ -208,7 +208,7 @@ describe('GitHub fetcher', () => { }); it('should throw for store etag errors', () => { - const request = new Request('foo', 'http://test'); + const request = createRequest('foo', 'http://test'); const store = createBaseStore({ etag: () => { throw new Error('test'); } }); const fetcher = createBaseFetcher({ store: store }); return Q.try(() => { @@ -220,7 +220,7 @@ describe('GitHub fetcher', () => { }); it('should throw for requestor get errors', () => { - const request = new Request('repos', 'http://test'); + const request = createRequest('repos', 'http://test'); const requestor = createBaseRequestor({ get: () => { throw new Error('test'); } }); const store = createBaseStore({ etag: () => { return Q(42); } }); const fetcher = createBaseFetcher({ requestor: requestor, store: store }); @@ -233,7 +233,7 @@ describe('GitHub fetcher', () => 
{ }); it('should throw for store get errors', () => { - const request = new Request('repos', 'http://test'); + const request = createRequest('repos', 'http://test'); request.policy = TraversalPolicy.refresh(); const responses = [createResponse(null, 304, 42)]; const requestor = createBaseRequestor({ get: () => { return Q(responses.shift()); } }); @@ -248,6 +248,11 @@ describe('GitHub fetcher', () => { }); }); +function createRequest(type, url) { + const result = new Request(type, url); + result.policy = TraversalPolicy.default('self'); + return result; +} function createResponse(body, code = 200, etag = null, remaining = 4000, reset = 0, headers = {}) { return { @@ -269,8 +274,8 @@ function createBaseStore({etag = null, upsert = null, get = null} = {}) { return result; } -function createBaseFetcher({ requestor = createBaseRequestor(), store = createBaseStore(), tokenFactory = createBaseTokenFactory(), options = createBaseOptions() } = {}) { - return new GitHubFetcher(requestor, store, tokenFactory, options.fetcher); +function createBaseFetcher({ requestor = createBaseRequestor(), store = createBaseStore(), tokenFactory = createBaseTokenFactory(), limiter = createBaseLimiter(), options = createBaseOptions() } = {}) { + return new GitHubFetcher(requestor, store, tokenFactory, limiter, options.fetcher); } function createBaseRequestor({ get = null, getAll = null } = {}) { @@ -287,6 +292,14 @@ function createBaseTokenFactory() { }; } +function createBaseLimiter() { + return { + run: (key, operation) => { + return operation(); + } + } +} + function createBaseOptions(logger = createBaseLog()) { return { fetcher: { diff --git a/test/requestTests.js b/test/requestTests.js index 41189c6..62d0ee4 100644 --- a/test/requestTests.js +++ b/test/requestTests.js @@ -7,113 +7,6 @@ const Request = require('../lib/request.js'); const sinon = require('sinon'); const TraversalPolicy = require('../lib/traversalPolicy'); -describe('Request transitivity', () => { - - it('will queue 
contains relationship correctly for broad transitivity', () => { - let request = new Request('user', 'http://test.com/users/user1'); - request.crawler = { queue: () => { } }; - const queue = sinon.spy(request.crawler, 'queue'); - - request.relationship = 'contains'; - request.queue('contains', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(1); - expect(queue.getCall(0).args[0][0].policy.transitivity).to.be.equal('broad'); - - request.relationship = 'belongsTo'; - request.queue('contains', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(2); - expect(queue.getCall(1).args[0][0].policy.transitivity).to.be.equal('only'); - - request.relationship = 'isa'; - request.queue('contains', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(3); - expect(queue.getCall(2).args[0][0].policy.transitivity).to.be.equal('only'); - - request.relationship = 'reference'; - request.queue('contains', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(4); - expect(queue.getCall(3).args[0][0].policy.transitivity).to.be.equal('only'); - }); - - it('will queue belongsTo relationship correctly for broad transitivity', () => { - let request = new Request('user', 'http://test.com/users/user1'); - request.crawler = { queue: () => { } }; - const queue = sinon.spy(request.crawler, 'queue'); - - request.relationship = 'contains'; - request.queue('belongsTo', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(1); - expect(queue.getCall(0).args[0][0].policy.transitivity).to.be.equal('only'); - - request.relationship = 'belongsTo'; - request.queue('belongsTo', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(2); - expect(queue.getCall(1).args[0][0].policy.transitivity).to.be.equal('broad'); - - request.relationship = 'isa'; - request.queue('belongsTo', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(3); - expect(queue.getCall(2).args[0][0].policy.transitivity).to.be.equal('only'); - - request.relationship = 'reference'; - request.queue('belongsTo', 
'foo', 'http://'); - expect(queue.callCount).to.be.equal(4); - expect(queue.getCall(3).args[0][0].policy.transitivity).to.be.equal('only'); - }); - - it('will queue isa relationship correctly for broad transitivity', () => { - let request = new Request('user', 'http://test.com/users/user1'); - request.crawler = { queue: () => { } }; - const queue = sinon.spy(request.crawler, 'queue'); - - request.relationship = 'contains'; - request.queue('isa', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(1); - expect(queue.getCall(0).args[0][0].policy.transitivity).to.be.equal('only'); - - request.relationship = 'belongsTo'; - request.queue('isa', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(2); - expect(queue.getCall(1).args[0][0].policy.transitivity).to.be.equal('only'); - - request.relationship = 'isa'; - request.queue('isa', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(3); - expect(queue.getCall(2).args[0][0].policy.transitivity).to.be.equal('only'); - - request.relationship = 'reference'; - request.queue('isa', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(4); - expect(queue.getCall(3).args[0][0].policy.transitivity).to.be.equal('only'); - }); - - it('will queue reference relationship correctly for broad transitivity', () => { - let request = new Request('user', 'http://test.com/users/user1'); - request.crawler = { queue: () => { } }; - const queue = sinon.spy(request.crawler, 'queue'); - - request.relationship = 'contains'; - request.queue('reference', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(1); - expect(queue.getCall(0).args[0][0].policy.transitivity).to.be.equal('only'); - - request.relationship = 'belongsTo'; - request.queue('reference', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(2); - expect(queue.getCall(1).args[0][0].policy.transitivity).to.be.equal('only'); - - request.relationship = 'isa'; - request.queue('reference', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(3); - 
expect(queue.getCall(2).args[0][0].policy.transitivity).to.be.equal('only'); - - request.relationship = 'reference'; - request.queue('reference', 'foo', 'http://'); - expect(queue.callCount).to.be.equal(4); - expect(queue.getCall(3).args[0][0].policy.transitivity).to.be.equal('only'); - }); -}); - describe('Request context/qualifier', () => { it('will not queueRoot if none transitivity', () => { }); diff --git a/test/visitorMapTests.js b/test/visitorMapTests.js new file mode 100644 index 0000000..c7d9ce0 --- /dev/null +++ b/test/visitorMapTests.js @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +const assert = require('chai').assert; +const expect = require('chai').expect; +const sinon = require('sinon'); +const VisitorMap = require('../lib/visitorMap'); + +describe('Visitor Map', () => { + + it('will get next', () => { + const map = new VisitorMap('org'); + const node = map.getNextStep('repos'); + expect(node._type).to.be.equal('repo'); + }); + + it('will get next giving a self', () => { + const map = new VisitorMap('org', '/repos'); + const node = map.getNextStep('owner'); + expect(Object.getOwnPropertyNames(node).length).to.be.equal(0); + }); + + it('will return undefined for next of self', () => { + const map = new VisitorMap('org', '/repos'); + const node = map.getNextStep('foo'); + expect(node).to.be.undefined; + }); + + it('will return undefined for random next', () => { + const map = new VisitorMap('org'); + const node = map.getNextStep('boo'); + expect(node).to.be.undefined; + }); + + + // it('will get next for collection', () => { + // const map = new VisitorMap('org'); + // const node = map.getNextPolicy('repos'); + // expect(node[0]._type).to.be.equal('repo'); + // }); + + // it('will resolve collections', () => { + // const map = new VisitorMap('org'); + // const node = map.getNextPolicy('repos'); + // expect(VisitorMap.resolve(node)._type).to.be.equal('repo'); + // }); +});