From 0849b33c00ce8ab6709c0c96473b7b6b19a84257 Mon Sep 17 00:00:00 2001 From: Jeff McAffer Date: Sat, 3 Dec 2016 23:22:46 -0800 Subject: [PATCH] get tests running again --- lib/crawler.js | 15 ++++++--- lib/processor.js | 2 +- lib/traversalPolicy.js | 5 ++- test/crawlerTests.js | 76 +++++++++++++++++++++++------------------- test/processorTests.js | 24 +++++++------ test/queueSetTests.js | 13 ++++---- 6 files changed, 74 insertions(+), 61 deletions(-) diff --git a/lib/crawler.js b/lib/crawler.js index 86df90c..17d1b73 100644 --- a/lib/crawler.js +++ b/lib/crawler.js @@ -222,7 +222,7 @@ class Crawler { request.addMeta({ attempt: request.attemptCount }); this.logger.info(`Requeuing attempt ${request.attemptCount} of request ${request.type}@${request.url}`); const queuable = this._createQueuable(request); - return request.newQueue ? this.queues.push(queuable, request.newQueue) : this.queues.repush(request, queuable); + return this.queues.repush(request, queuable); }); } @@ -362,8 +362,10 @@ class Crawler { if (request.shouldSkip()) { return Q(request); } - this._logStartEnd('processing', request, () => request.document = this.processor.process(request)); - return Q(request); + return this._logStartEnd('processing', request, () => { + request.document = this.processor.process(request); + return request; + }); } _logStartEnd(name, request, work) { @@ -372,8 +374,11 @@ class Crawler { this.logger.verbose(`Started ${name} ${uniqueString}`); let result = null; return Q - .try(work) - .then(workResult => { result = workResult; return result; }) + .try(() => { return work(); }) + .then(workResult => { + result = workResult; + return result; + }) .finally(() => { // in the getRequest case we did not have a request to start. Report on the one we found. if (!request && result instanceof Request) { diff --git a/lib/processor.js b/lib/processor.js index 2f2e494..2bb0ae2 100644 --- a/lib/processor.js +++ b/lib/processor.js @@ -36,7 +36,7 @@ class Processor { const requests = []; for (let i = 2; i <= links.last.page; i++) { const url = request.url + `?page=${i}&per_page=100`; - const newRequest = new Request(request.type, url, { qualifier: request.context.qualifier, elementType: request.context.elementType }); + const newRequest = new Request(request.type, url, request.context); newRequest.policy = request.policy; requests.push(newRequest); } diff --git a/lib/traversalPolicy.js b/lib/traversalPolicy.js index 182c919..d4ef3c4 100644 --- a/lib/traversalPolicy.js +++ b/lib/traversalPolicy.js @@ -105,7 +105,7 @@ class TraversalPolicy { return new TraversalPolicy('storageOriginIfMissing', 'version', 'documentAndRelated', 'deepDeep'); } - static reprocess() { + static reprocessAndUpdate() { return new TraversalPolicy('originStorage', 'matchOrVersion', 'documentAndRelated', 'deepDeep'); } @@ -141,8 +141,7 @@ class TraversalPolicy { return null; } const transitivity = { shallow: 'shallow', deepShallow: 'deepShallow', deepDeep: 'deepShallow' }[this.transitivity]; - const freshness = { shallow: this.freshness, deepShallow: this.freshness, deepDeep: 'always' }[this.transitivity]; - return new TraversalPolicy(this.fetch, freshness, this.processing, transitivity); + return new TraversalPolicy(this.fetch, this.freshness, this.processing, transitivity); } /** diff --git a/test/crawlerTests.js b/test/crawlerTests.js index 7b20427..f446791 100644 --- a/test/crawlerTests.js +++ b/test/crawlerTests.js @@ -334,8 +334,7 @@ describe('Crawler fetch', () => { return crawler._fetch(request); }).then( request => assert.fail(), - error => expect(error.message.startsWith('Code: 500')).to.be.true - ); + error => expect(error.message.startsWith('Code 500')).to.be.true); }); it('should throw for store etag errors', () => { @@ -557,21 +556,17 @@ describe('Crawler requeue', () => { const normal = createBaseQueue('normal', { push: request => { queue.push(request); return Q(); } }); const queues = createBaseQueues({ normal: normal }); const crawler = createBaseCrawler({ queues: queues }); - for (let i = 0; i < 5; i++) { - const request = new Request('test', 'http://api.github.com/repo/microsoft/test'); - request.markRequeue(); - request._originQueue = normal; - request.attemptCount = i === 0 ? null : i; - crawler._requeue(request); - expect(request.promises.length).to.be.equal(1); + const request = new Request('test', 'http://api.github.com/repo/microsoft/test'); + request.markRequeue(); + request._originQueue = normal; + return crawler._requeue(request).then(() => { + // expect(request.promises.length).to.be.equal(1); expect(queue.length).to.be.equal(1); expect(queue[0] !== request).to.be.true; expect(queue[0].type === request.type).to.be.true; expect(queue[0].url === request.url).to.be.true; - expect(queue[0].attemptCount).to.be.equal(i + 1); - // pop the request to get ready for the next iteration - queue.shift(); - } + expect(queue[0].attemptCount).to.be.equal(1); + }); }); it('should requeue in deadletter queue after 5 attempts', () => { @@ -585,14 +580,14 @@ describe('Crawler requeue', () => { request.markRequeue(); request._originQueue = normal; const crawler = createBaseCrawler({ queues: queues }); - crawler._requeue(request); - expect(request.promises.length).to.be.equal(1); - expect(queue.length).to.be.equal(0); - expect(deadletterQueue.length).to.be.equal(1); - expect(deadletterQueue[0] !== request).to.be.true; - expect(deadletterQueue[0].type === request.type).to.be.true; - expect(deadletterQueue[0].url === request.url).to.be.true; - expect(deadletterQueue[0].attemptCount).to.be.equal(6); + return crawler._requeue(request).then(() => { + expect(queue.length).to.be.equal(0); + expect(deadletterQueue.length).to.be.equal(1); + expect(deadletterQueue[0] !== request).to.be.true; + expect(deadletterQueue[0].type === request.type).to.be.true; + expect(deadletterQueue[0].url === request.url).to.be.true; + expect(deadletterQueue[0].attemptCount).to.be.equal(6); + }); }); }); @@ -700,12 +695,10 @@ describe('Crawler complete request', () => { error => assert.fail()); }); - it('still dequeues and unlocks if promises fail', () => { - const done = []; - const unlock = []; - const normal = createBaseQueue('normal', { done: request => { done.push(request); return Q(); } }); + it('requeues and unlocks if promises fail', () => { + const normal = createBaseQueue('normal', { push: sinon.spy(() => { return Q(); }) }); const queues = createBaseQueues({ normal: normal }); - const locker = createBaseLocker({ unlock: request => { unlock.push(request); return Q(); } }); + const locker = createBaseLocker({ unlock: sinon.spy(() => { return Q(); }) }); const originalRequest = new Request('test', 'http://test.com'); originalRequest.lock = 42; originalRequest._originQueue = normal; @@ -714,10 +707,12 @@ describe('Crawler complete request', () => { return crawler._completeRequest(originalRequest).then( request => assert.fail(), error => { - expect(done.length).to.be.equal(1); - expect(done[0] === originalRequest).to.be.true; - expect(unlock.length).to.be.equal(1); - expect(unlock[0]).to.be.equal(42); + expect(normal.push.callCount).to.be.equal(1); + const requeued = normal.push.getCall(0).args[0]; + expect(requeued.type).to.be.equal(originalRequest.type); + expect(requeued.url).to.be.equal(originalRequest.url); + expect(locker.unlock.callCount).to.be.equal(1); + expect(locker.unlock.getCall(0).args[0]).to.be.equal(42); }); }); @@ -861,7 +856,7 @@ describe('Crawler process document', () => { const crawler = createBaseCrawler(); crawler.processor.test = request => { throw new Error('bummer'); }; return Q.try(() => { - crawler._processDocument(originalRequest) + return crawler._processDocument(originalRequest) }).then( request => assert.fail(), error => { expect(error.message).to.be.equal('bummer'); }); @@ -987,7 +982,7 @@ describe('Crawler whole meal deal', () => { const context = { name: 'foo', delay: 0 }; return crawler._run(context).then(() => { - expect(context.currentDelay).to.be.approximately(451, 4); + expect(context.currentDelay).to.be.approximately(451, 10); }); }); @@ -1316,8 +1311,14 @@ function createBaseOptions(logger = createBaseLog()) { return { queuing: { logger: logger, - ttl: 1000, - weights: [1] + weights: [1], + parallelPush: 10, + attenuation: { + ttl: 1000 + }, + tracker: { + ttl: 6 * 60 * 1000 + } }, storage: { logger: logger, @@ -1330,9 +1331,13 @@ function createBaseOptions(logger = createBaseLog()) { }, crawler: { logger: logger, - tokenLowerBound: 50, + processingTtl: 60 * 1000, promiseTrace: false, orgList: [], + fetcher: { + tokenLowerBound: 50, + forbiddenDelay: 120000 + } }, requestor: { } @@ -1346,6 +1351,7 @@ function createBaseQueues({ priority = null, normal = null, deadletter = null, o function createBaseQueue(name, { pop = null, push = null, done = null, abandon = null} = {}) { const result = { name: name }; + result.getName = () => { return name; }; result.pop = pop || (() => assert.fail('should not pop')); result.push = push || (() => assert.fail('should not push')); result.done = done || (() => assert.fail('should not done')); diff --git a/test/processorTests.js b/test/processorTests.js index a18179c..8dff0f1 100644 --- a/test/processorTests.js +++ b/test/processorTests.js @@ -45,7 +45,7 @@ describe('Processor reprocessing', () => { describe('Collection processing', () => { it('should queue collection pages as deepShallow and elements as deepShallow', () => { - const request = new Request('issues', 'http://test.com/issues'); + const request = new Request('issues', 'http://test.com/issues', { elementType: 'issue' }); request.policy.transitivity = 'deepShallow'; request.response = { headers: { link: createLinkHeader(request.url, null, 2, 2) } @@ -74,7 +74,7 @@ describe('Collection processing', () => { }); it('should queue deepShallow root collections as deepShallow and elements as shallow', () => { - const request = new Request('orgs', 'http://test.com/orgs'); + const request = new Request('orgs', 'http://test.com/orgs', { elementType: 'org' }); request.policy.transitivity = 'deepShallow'; request.response = { headers: { link: createLinkHeader(request.url, null, 2, 2) } @@ -104,7 +104,7 @@ describe('Collection processing', () => { }); it('should queue forceForce root collection pages as forceForce and elements as forceNormal', () => { - const request = new Request('orgs', 'http://test.com/orgs'); + const request = new Request('orgs', 'http://test.com/orgs', { elementType: 'org' }); request.policy = TraversalPolicy.update(); request.response = { headers: { link: createLinkHeader(request.url, null, 2, 2) } @@ -133,7 +133,7 @@ describe('Collection processing', () => { }); it('should queue forceForce page elements with forceNormal transitivity', () => { - const request = new Request('orgs', 'http://test.com/orgs?page=2&per_page=100'); + const request = new Request('orgs', 'http://test.com/orgs?page=2&per_page=100', { elementType: 'org' }); request.policy = TraversalPolicy.update(); request.document = { _metadata: { links: {} }, elements: [{ url: 'http://child1' }] }; request.crawler = { queue: () => { } }; @@ -162,7 +162,7 @@ describe('URN building', () => { expect(request.crawler.queue.callCount).to.be.at.least(4); const teamsRequest = request.crawler.queue.getCall(1).args[0]; expect(teamsRequest.context.qualifier).to.be.equal('urn:repo:42'); - expect(teamsRequest.context.relation).to.be.deep.equal({ origin: 'repo', name: 'teams', type: 'team' } ); + expect(teamsRequest.context.relation).to.be.deep.equal({ origin: 'repo', name: 'teams', type: 'team' }); request.crawler.queue.reset(); teamsRequest.type = 'teams'; @@ -170,11 +170,13 @@ describe('URN building', () => { teamsRequest.crawler = request.crawler; const teamsPage = processor.process(teamsRequest); const links = teamsPage._metadata.links; - expect(links.teams.type).to.be.equal('self'); - expect(links.teams.hrefs.length).to.be.equal(1); - expect(links.teams.hrefs[0]).to.be.equal('urn:team:13'); - expect(links.repo.type).to.be.equal('self'); + expect(links.resources.type).to.be.equal('resource'); + expect(links.resources.hrefs.length).to.be.equal(1); + expect(links.resources.hrefs[0]).to.be.equal('urn:team:13'); + expect(links.repo.type).to.be.equal('resource'); expect(links.repo.href).to.be.equal('urn:repo:42'); + expect(links.origin.type).to.be.equal('resource'); + expect(links.origin.href).to.be.equal('urn:repo:42'); const teamRequest = request.crawler.queue.getCall(0).args[0]; expect(teamRequest.type).to.be.equal('team'); @@ -187,11 +189,11 @@ describe('URN building', () => { const membersRequest = request.crawler.queue.getCall(0).args[0]; expect(membersRequest.url).to.be.equal('http://team1/members'); expect(membersRequest.context.qualifier).to.be.equal('urn:team:54'); - expect(membersRequest.context.relation).to.be.equal('team_members_relation'); + expect(membersRequest.context.relation).to.be.deep.equal({ name: 'members', origin: 'team', type: 'user' }); const reposRequest = request.crawler.queue.getCall(1).args[0]; expect(reposRequest.url).to.be.equal('http://team1/repos'); expect(reposRequest.context.qualifier).to.be.equal('urn:team:54'); - expect(reposRequest.context.relation).to.be.equal('team_repos_relation'); + expect(reposRequest.context.relation).to.be.deep.equal({ name: 'repos', origin: 'team', type: 'repo' }); }); }); diff --git a/test/queueSetTests.js b/test/queueSetTests.js index 218fe1f..b60f3e6 100644 --- a/test/queueSetTests.js +++ b/test/queueSetTests.js @@ -8,13 +8,13 @@ const sinon = require('sinon'); describe('QueueSet construction', () => { it('should throw on duplicate queue names', () => { - expect(() => new QueueSet([{ name: '1' }, { name: '1' }])).to.throw(Error); + expect(() => new QueueSet([createBaseQueue('1'), createBaseQueue('1')])).to.throw(Error); }); }); describe('QueueSet weighting', () => { it('should create a simple startMap', () => { - const set = new QueueSet([{ name: '1' }, { name: '2' }], null, createOptions([3, 2])); + const set = new QueueSet([createBaseQueue('1'), createBaseQueue('2')], null, createOptions([3, 2])); expect(set.startMap.length).to.be.equal(5); expect(set.startMap[0]).to.be.equal(0); expect(set.startMap[2]).to.be.equal(0); @@ -23,21 +23,21 @@ describe('QueueSet weighting', () => { }); it('should create a default startMap if no weights given', () => { - const set = new QueueSet([{ name: '1' }, { name: '2' }]); + const set = new QueueSet([createBaseQueue('1'), createBaseQueue('2')]); expect(set.startMap.length).to.be.equal(1); expect(set.startMap[0]).to.be.equal(0); }); it('should throw if too many weights are given', () => { - expect(() => new QueueSet([{ name: '1' }, { name: '2' }], null, createOptions([3, 2, 1]))).to.throw(Error); + expect(() => new QueueSet([createBaseQueue('1'), createBaseQueue('2')], null, createOptions([3, 2, 1]))).to.throw(Error); }); it('should throw if no weights are given', () => { - expect(() => new QueueSet([{ name: '1' }, { name: '2' }], null, [])).to.throw(Error); + expect(() => new QueueSet([createBaseQueue('1'), createBaseQueue('2')], null, [])).to.throw(Error); }); it('should create a simple startMap', () => { - const set = new QueueSet([{ name: '1' }, { name: '2' }], null, createOptions([3, 2])); + const set = new QueueSet([createBaseQueue('1'), createBaseQueue('2')], null, createOptions([3, 2])); expect(set.startMap.length).to.be.equal(5); expect(set.startMap[0]).to.be.equal(0); expect(set.startMap[2]).to.be.equal(0); @@ -250,6 +250,7 @@ function createBaseQueues(queues, deadletter, weights = [1]) { function createBaseQueue(name, { pop = null, push = null, done = null, abandon = null, subscribe = null, unsubscribe = null} = {}) { const result = { name: name }; + result.getName = () => { return name; }; result.pop = pop || (() => assert.fail('should not pop')); result.push = push || (() => assert.fail('should not push')); result.done = done || (() => assert.fail('should not done'));