This commit is contained in:
Jeff McAffer 2016-12-03 23:22:46 -08:00
Родитель bf2254f830
Коммит 0849b33c00
6 изменённых файлов: 74 добавлений и 61 удалений

Просмотреть файл

@ -222,7 +222,7 @@ class Crawler {
request.addMeta({ attempt: request.attemptCount });
this.logger.info(`Requeuing attempt ${request.attemptCount} of request ${request.type}@${request.url}`);
const queuable = this._createQueuable(request);
return request.newQueue ? this.queues.push(queuable, request.newQueue) : this.queues.repush(request, queuable);
return this.queues.repush(request, queuable);
});
}
@ -362,8 +362,10 @@ class Crawler {
if (request.shouldSkip()) {
return Q(request);
}
this._logStartEnd('processing', request, () => request.document = this.processor.process(request));
return Q(request);
return this._logStartEnd('processing', request, () => {
request.document = this.processor.process(request);
return request;
});
}
_logStartEnd(name, request, work) {
@ -372,8 +374,11 @@ class Crawler {
this.logger.verbose(`Started ${name} ${uniqueString}`);
let result = null;
return Q
.try(work)
.then(workResult => { result = workResult; return result; })
.try(() => { return work(); })
.then(workResult => {
result = workResult;
return result;
})
.finally(() => {
// in the getRequest case we did not have a request to start. Report on the one we found.
if (!request && result instanceof Request) {

Просмотреть файл

@ -36,7 +36,7 @@ class Processor {
const requests = [];
for (let i = 2; i <= links.last.page; i++) {
const url = request.url + `?page=${i}&per_page=100`;
const newRequest = new Request(request.type, url, { qualifier: request.context.qualifier, elementType: request.context.elementType });
const newRequest = new Request(request.type, url, request.context);
newRequest.policy = request.policy;
requests.push(newRequest);
}

Просмотреть файл

@ -105,7 +105,7 @@ class TraversalPolicy {
return new TraversalPolicy('storageOriginIfMissing', 'version', 'documentAndRelated', 'deepDeep');
}
static reprocess() {
static reprocessAndUpdate() {
return new TraversalPolicy('originStorage', 'matchOrVersion', 'documentAndRelated', 'deepDeep');
}
@ -141,8 +141,7 @@ class TraversalPolicy {
return null;
}
const transitivity = { shallow: 'shallow', deepShallow: 'deepShallow', deepDeep: 'deepShallow' }[this.transitivity];
const freshness = { shallow: this.freshness, deepShallow: this.freshness, deepDeep: 'always' }[this.transitivity];
return new TraversalPolicy(this.fetch, freshness, this.processing, transitivity);
return new TraversalPolicy(this.fetch, this.freshness, this.processing, transitivity);
}
/**

Просмотреть файл

@ -334,8 +334,7 @@ describe('Crawler fetch', () => {
return crawler._fetch(request);
}).then(
request => assert.fail(),
error => expect(error.message.startsWith('Code: 500')).to.be.true
);
error => expect(error.message.startsWith('Code 500')).to.be.true);
});
it('should throw for store etag errors', () => {
@ -557,21 +556,17 @@ describe('Crawler requeue', () => {
const normal = createBaseQueue('normal', { push: request => { queue.push(request); return Q(); } });
const queues = createBaseQueues({ normal: normal });
const crawler = createBaseCrawler({ queues: queues });
for (let i = 0; i < 5; i++) {
const request = new Request('test', 'http://api.github.com/repo/microsoft/test');
request.markRequeue();
request._originQueue = normal;
request.attemptCount = i === 0 ? null : i;
crawler._requeue(request);
expect(request.promises.length).to.be.equal(1);
const request = new Request('test', 'http://api.github.com/repo/microsoft/test');
request.markRequeue();
request._originQueue = normal;
return crawler._requeue(request).then(() => {
// expect(request.promises.length).to.be.equal(1);
expect(queue.length).to.be.equal(1);
expect(queue[0] !== request).to.be.true;
expect(queue[0].type === request.type).to.be.true;
expect(queue[0].url === request.url).to.be.true;
expect(queue[0].attemptCount).to.be.equal(i + 1);
// pop the request to get ready for the next iteration
queue.shift();
}
expect(queue[0].attemptCount).to.be.equal(1);
});
});
it('should requeue in deadletter queue after 5 attempts', () => {
@ -585,14 +580,14 @@ describe('Crawler requeue', () => {
request.markRequeue();
request._originQueue = normal;
const crawler = createBaseCrawler({ queues: queues });
crawler._requeue(request);
expect(request.promises.length).to.be.equal(1);
expect(queue.length).to.be.equal(0);
expect(deadletterQueue.length).to.be.equal(1);
expect(deadletterQueue[0] !== request).to.be.true;
expect(deadletterQueue[0].type === request.type).to.be.true;
expect(deadletterQueue[0].url === request.url).to.be.true;
expect(deadletterQueue[0].attemptCount).to.be.equal(6);
return crawler._requeue(request).then(() => {
expect(queue.length).to.be.equal(0);
expect(deadletterQueue.length).to.be.equal(1);
expect(deadletterQueue[0] !== request).to.be.true;
expect(deadletterQueue[0].type === request.type).to.be.true;
expect(deadletterQueue[0].url === request.url).to.be.true;
expect(deadletterQueue[0].attemptCount).to.be.equal(6);
});
});
});
@ -700,12 +695,10 @@ describe('Crawler complete request', () => {
error => assert.fail());
});
it('still dequeues and unlocks if promises fail', () => {
const done = [];
const unlock = [];
const normal = createBaseQueue('normal', { done: request => { done.push(request); return Q(); } });
it('requeues and unlocks if promises fail', () => {
const normal = createBaseQueue('normal', { push: sinon.spy(() => { return Q(); }) });
const queues = createBaseQueues({ normal: normal });
const locker = createBaseLocker({ unlock: request => { unlock.push(request); return Q(); } });
const locker = createBaseLocker({ unlock: sinon.spy(() => { return Q(); }) });
const originalRequest = new Request('test', 'http://test.com');
originalRequest.lock = 42;
originalRequest._originQueue = normal;
@ -714,10 +707,12 @@ describe('Crawler complete request', () => {
return crawler._completeRequest(originalRequest).then(
request => assert.fail(),
error => {
expect(done.length).to.be.equal(1);
expect(done[0] === originalRequest).to.be.true;
expect(unlock.length).to.be.equal(1);
expect(unlock[0]).to.be.equal(42);
expect(normal.push.callCount).to.be.equal(1);
const requeued = normal.push.getCall(0).args[0];
expect(requeued.type).to.be.equal(originalRequest.type);
expect(requeued.url).to.be.equal(originalRequest.url);
expect(locker.unlock.callCount).to.be.equal(1);
expect(locker.unlock.getCall(0).args[0]).to.be.equal(42);
});
});
@ -861,7 +856,7 @@ describe('Crawler process document', () => {
const crawler = createBaseCrawler();
crawler.processor.test = request => { throw new Error('bummer'); };
return Q.try(() => {
crawler._processDocument(originalRequest)
return crawler._processDocument(originalRequest)
}).then(
request => assert.fail(),
error => { expect(error.message).to.be.equal('bummer'); });
@ -987,7 +982,7 @@ describe('Crawler whole meal deal', () => {
const context = { name: 'foo', delay: 0 };
return crawler._run(context).then(() => {
expect(context.currentDelay).to.be.approximately(451, 4);
expect(context.currentDelay).to.be.approximately(451, 10);
});
});
@ -1316,8 +1311,14 @@ function createBaseOptions(logger = createBaseLog()) {
return {
queuing: {
logger: logger,
ttl: 1000,
weights: [1]
weights: [1],
parallelPush: 10,
attenuation: {
ttl: 1000
},
tracker: {
ttl: 6 * 60 * 1000
}
},
storage: {
logger: logger,
@ -1330,9 +1331,13 @@ function createBaseOptions(logger = createBaseLog()) {
},
crawler: {
logger: logger,
tokenLowerBound: 50,
processingTtl: 60 * 1000,
promiseTrace: false,
orgList: [],
fetcher: {
tokenLowerBound: 50,
forbiddenDelay: 120000
}
},
requestor: {
}
@ -1346,6 +1351,7 @@ function createBaseQueues({ priority = null, normal = null, deadletter = null, o
function createBaseQueue(name, { pop = null, push = null, done = null, abandon = null} = {}) {
const result = { name: name };
result.getName = () => { return name; };
result.pop = pop || (() => assert.fail('should not pop'));
result.push = push || (() => assert.fail('should not push'));
result.done = done || (() => assert.fail('should not done'));

Просмотреть файл

@ -45,7 +45,7 @@ describe('Processor reprocessing', () => {
describe('Collection processing', () => {
it('should queue collection pages as deepShallow and elements as deepShallow', () => {
const request = new Request('issues', 'http://test.com/issues');
const request = new Request('issues', 'http://test.com/issues', { elementType: 'issue' });
request.policy.transitivity = 'deepShallow';
request.response = {
headers: { link: createLinkHeader(request.url, null, 2, 2) }
@ -74,7 +74,7 @@ describe('Collection processing', () => {
});
it('should queue deepShallow root collections as deepShallow and elements as shallow', () => {
const request = new Request('orgs', 'http://test.com/orgs');
const request = new Request('orgs', 'http://test.com/orgs', { elementType: 'org' });
request.policy.transitivity = 'deepShallow';
request.response = {
headers: { link: createLinkHeader(request.url, null, 2, 2) }
@ -104,7 +104,7 @@ describe('Collection processing', () => {
});
it('should queue forceForce root collection pages as forceForce and elements as forceNormal', () => {
const request = new Request('orgs', 'http://test.com/orgs');
const request = new Request('orgs', 'http://test.com/orgs', { elementType: 'org' });
request.policy = TraversalPolicy.update();
request.response = {
headers: { link: createLinkHeader(request.url, null, 2, 2) }
@ -133,7 +133,7 @@ describe('Collection processing', () => {
});
it('should queue forceForce page elements with forceNormal transitivity', () => {
const request = new Request('orgs', 'http://test.com/orgs?page=2&per_page=100');
const request = new Request('orgs', 'http://test.com/orgs?page=2&per_page=100', { elementType: 'org' });
request.policy = TraversalPolicy.update();
request.document = { _metadata: { links: {} }, elements: [{ url: 'http://child1' }] };
request.crawler = { queue: () => { } };
@ -162,7 +162,7 @@ describe('URN building', () => {
expect(request.crawler.queue.callCount).to.be.at.least(4);
const teamsRequest = request.crawler.queue.getCall(1).args[0];
expect(teamsRequest.context.qualifier).to.be.equal('urn:repo:42');
expect(teamsRequest.context.relation).to.be.deep.equal({ origin: 'repo', name: 'teams', type: 'team' } );
expect(teamsRequest.context.relation).to.be.deep.equal({ origin: 'repo', name: 'teams', type: 'team' });
request.crawler.queue.reset();
teamsRequest.type = 'teams';
@ -170,11 +170,13 @@ describe('URN building', () => {
teamsRequest.crawler = request.crawler;
const teamsPage = processor.process(teamsRequest);
const links = teamsPage._metadata.links;
expect(links.teams.type).to.be.equal('self');
expect(links.teams.hrefs.length).to.be.equal(1);
expect(links.teams.hrefs[0]).to.be.equal('urn:team:13');
expect(links.repo.type).to.be.equal('self');
expect(links.resources.type).to.be.equal('resource');
expect(links.resources.hrefs.length).to.be.equal(1);
expect(links.resources.hrefs[0]).to.be.equal('urn:team:13');
expect(links.repo.type).to.be.equal('resource');
expect(links.repo.href).to.be.equal('urn:repo:42');
expect(links.origin.type).to.be.equal('resource');
expect(links.origin.href).to.be.equal('urn:repo:42');
const teamRequest = request.crawler.queue.getCall(0).args[0];
expect(teamRequest.type).to.be.equal('team');
@ -187,11 +189,11 @@ describe('URN building', () => {
const membersRequest = request.crawler.queue.getCall(0).args[0];
expect(membersRequest.url).to.be.equal('http://team1/members');
expect(membersRequest.context.qualifier).to.be.equal('urn:team:54');
expect(membersRequest.context.relation).to.be.equal('team_members_relation');
expect(membersRequest.context.relation).to.be.deep.equal({ name: 'members', origin: 'team', type: 'user' });
const reposRequest = request.crawler.queue.getCall(1).args[0];
expect(reposRequest.url).to.be.equal('http://team1/repos');
expect(reposRequest.context.qualifier).to.be.equal('urn:team:54');
expect(reposRequest.context.relation).to.be.equal('team_repos_relation');
expect(reposRequest.context.relation).to.be.deep.equal({ name: 'repos', origin: 'team', type: 'repo' });
});
});

Просмотреть файл

@ -8,13 +8,13 @@ const sinon = require('sinon');
describe('QueueSet construction', () => {
it('should throw on duplicate queue names', () => {
expect(() => new QueueSet([{ name: '1' }, { name: '1' }])).to.throw(Error);
expect(() => new QueueSet([createBaseQueue('1'), createBaseQueue('1')])).to.throw(Error);
});
});
describe('QueueSet weighting', () => {
it('should create a simple startMap', () => {
const set = new QueueSet([{ name: '1' }, { name: '2' }], null, createOptions([3, 2]));
const set = new QueueSet([createBaseQueue('1'), createBaseQueue('2')], null, createOptions([3, 2]));
expect(set.startMap.length).to.be.equal(5);
expect(set.startMap[0]).to.be.equal(0);
expect(set.startMap[2]).to.be.equal(0);
@ -23,21 +23,21 @@ describe('QueueSet weighting', () => {
});
it('should create a default startMap if no weights given', () => {
const set = new QueueSet([{ name: '1' }, { name: '2' }]);
const set = new QueueSet([createBaseQueue('1'), createBaseQueue('2')]);
expect(set.startMap.length).to.be.equal(1);
expect(set.startMap[0]).to.be.equal(0);
});
it('should throw if too many weights are given', () => {
expect(() => new QueueSet([{ name: '1' }, { name: '2' }], null, createOptions([3, 2, 1]))).to.throw(Error);
expect(() => new QueueSet([createBaseQueue('1'), createBaseQueue('2')], null, createOptions([3, 2, 1]))).to.throw(Error);
});
it('should throw if no weights are given', () => {
expect(() => new QueueSet([{ name: '1' }, { name: '2' }], null, [])).to.throw(Error);
expect(() => new QueueSet([createBaseQueue('1'), createBaseQueue('2')], null, [])).to.throw(Error);
});
it('should create a simple startMap', () => {
const set = new QueueSet([{ name: '1' }, { name: '2' }], null, createOptions([3, 2]));
const set = new QueueSet([createBaseQueue('1'), createBaseQueue('2')], null, createOptions([3, 2]));
expect(set.startMap.length).to.be.equal(5);
expect(set.startMap[0]).to.be.equal(0);
expect(set.startMap[2]).to.be.equal(0);
@ -250,6 +250,7 @@ function createBaseQueues(queues, deadletter, weights = [1]) {
function createBaseQueue(name, { pop = null, push = null, done = null, abandon = null, subscribe = null, unsubscribe = null} = {}) {
const result = { name: name };
result.getName = () => { return name; };
result.pop = pop || (() => assert.fail('should not pop'));
result.push = push || (() => assert.fail('should not push'));
result.done = done || (() => assert.fail('should not done'));