This commit is contained in:
Jeff McAffer 2016-11-20 22:45:18 -08:00
Родитель 40713f46b7
Коммит 5de53a5501
3 изменённых файлов: 217 добавлений и 156 удалений

Просмотреть файл

@ -22,6 +22,9 @@ class Crawler {
// We are done call the done handler and return without continuing the loop // We are done call the done handler and return without continuing the loop
return options.done ? options.done() : null; return options.done ? options.done() : null;
} }
if (delay) {
this.logger.verbose(`Crawler: ${options.name} waiting for ${delay}ms`);
}
setTimeout(() => { this._run(options); }, delay); setTimeout(() => { this._run(options); }, delay);
} }
@ -33,35 +36,39 @@ class Crawler {
} }
options.delay = 0; options.delay = 0;
return Q.try(() => this.processOne(options)) return Q.try(() => this.processOne(options))
.then(this.log(this._computeDelay.bind(this, options))) .then(this.log(this._computeDelay.bind(this, options)), this._panic.bind(this, options))
.then(this.log(this.run.bind(this, options), this.log(this.run.bind(this, options)))); .finally(this.log(this.run.bind(this, options)));
} catch (error) { } catch (error) {
// If for some reason we throw all the way out of start, log and restart the loop // If for some reason we throw all the way out of start, log and restart the loop
this.logger.error(new Error('PANIC! Crawl loop exited unexpectedly')); this._panic(options, error);
this.logger.error(error);
this.run(options); this.run(options);
} }
} }
_computeDelay(options, request) {
_panic(options, error) {
this.logger.error(new Error('PANIC, we should not have gotten here'));
this.logger.error(error);
}
_computeDelay(options, delaySpec) {
let delay = options.delay; let delay = options.delay;
if (delay === -1) { if (delay === -1) {
return request; return delay;
} }
delay = delay || 0; delay = delay || 0;
const now = Date.now(); const now = Date.now();
const requestGate = now + (request.shouldDelay() ? 2000 : 0); const requestGate = now + (delaySpec.shouldDelay() ? 2000 : 0);
const delayGate = request.nextRequestTime || now; const delayGate = delaySpec.nextRequestTime || now;
const nextRequestTime = Math.max(requestGate, delayGate, now); const nextRequestTime = Math.max(requestGate, delayGate, now);
delay = Math.max(0, nextRequestTime - now); delay = Math.max(0, nextRequestTime - now);
options.delay = delay; options.delay = delay;
if (delay) {
this.logger.verbose(`Crawler: ${options.name} waiting for ${delay}ms`);
}
return delay; return delay;
} }
/**
* Process one request cycle. If an error happens during processing, handle it there and
* return a spec describing any delays that should be in .
*/
processOne(options) { processOne(options) {
let requestBox = []; let requestBox = [];
@ -75,13 +82,19 @@ class Crawler {
.catch(this.log(this._errorHandler.bind(this, requestBox))) .catch(this.log(this._errorHandler.bind(this, requestBox)))
.then(this.log(this._completeRequest.bind(this), this._completeRequest.bind(this))) .then(this.log(this._completeRequest.bind(this), this._completeRequest.bind(this)))
.catch(this.log(this._errorHandler.bind(this, requestBox))) .catch(this.log(this._errorHandler.bind(this, requestBox)))
.then(this.log(this._logOutcome.bind(this))); .then(this.log(this._logOutcome.bind(this)))
.catch(this.log(this._errorHandler.bind(this, requestBox)));
} }
_errorHandler(requestBox, error) { _errorHandler(requestBox, error) {
if (requestBox[0]) { if (requestBox[0]) {
if (requestBox[0].type === '_errorTrap') {
// TODO if there is a subsequent error, just capture the first and carry on for now. likely should log
return requestBox[0];
} else {
return requestBox[0].markRequeue('Error', error); return requestBox[0].markRequeue('Error', error);
} }
}
const request = new Request('_errorTrap', null); const request = new Request('_errorTrap', null);
request.markDelay(); request.markDelay();
request.markSkip('Error', error); request.markSkip('Error', error);

Просмотреть файл

@ -1,9 +1,10 @@
const extend = require('extend'); const extend = require('extend');
class Request { class Request {
constructor(type, url) { constructor(type, url, context = null) {
this.type = type; this.type = type;
this.url = url; this.url = url;
this.context = context || {};
this.promises = []; this.promises = [];
} }

Просмотреть файл

@ -880,6 +880,48 @@ describe('Crawler store document', () => {
}); });
}); });
describe('Crawler run', () => {
it('should panic for rejects in processOne', () => {
const crawler = createBaseCrawler();
sinon.stub(crawler, 'run', () => { });
sinon.stub(crawler, 'processOne', () => { return Q.reject('test error') });
sinon.spy(crawler, '_computeDelay');
sinon.spy(crawler, '_panic');
const request = new Request('user', 'http://test.com/users/user1');
const options = { name: 'foo', delay: 0 };
return crawler._run(options).then(() => {
expect(crawler.processOne.callCount).to.be.equal(1);
expect(crawler._computeDelay.callCount).to.be.equal(0);
expect(crawler._panic.callCount).to.be.equal(1);
expect(crawler.run.callCount).to.be.equal(1);
}, error => {
console.log(error);
});
});
it('should panic for errors in processOne', () => {
const crawler = createBaseCrawler();
sinon.stub(crawler, 'run', () => { });
sinon.stub(crawler, 'processOne', () => { throw new Error('test error') });
sinon.spy(crawler, '_computeDelay');
sinon.spy(crawler, '_panic');
const request = new Request('user', 'http://test.com/users/user1');
const options = { name: 'foo', delay: 0 };
return crawler._run(options).then(() => {
expect(crawler.processOne.callCount).to.be.equal(1);
expect(crawler._computeDelay.callCount).to.be.equal(0);
expect(crawler._panic.callCount).to.be.equal(1);
expect(crawler.run.callCount).to.be.equal(1);
}, error => {
console.log(error);
});
});
});
describe('Crawler whole meal deal', () => { describe('Crawler whole meal deal', () => {
it('should delay starting next iteration when markDelay', () => { it('should delay starting next iteration when markDelay', () => {
const crawler = createBaseCrawler(); const crawler = createBaseCrawler();
@ -928,9 +970,8 @@ describe('Crawler whole meal deal', () => {
crawler.queues.normal.requests = [new Request('user', 'http://test.com/users/user1')]; crawler.queues.normal.requests = [new Request('user', 'http://test.com/users/user1')];
crawler.requestor.responses = [createResponse({ id: 42, repos_url: 'http://test.com/users/user1/repos' })]; crawler.requestor.responses = [createResponse({ id: 42, repos_url: 'http://test.com/users/user1/repos' })];
return Q.try(() => { return Q.try(() => { return crawler.processOne({ name: 'test' }); }).then(
return crawler.processOne({ name: 'test' }); () => {
}).then(() => {
expect(crawler.queues.priority.pop.callCount).to.be.equal(1, 'priority call count'); expect(crawler.queues.priority.pop.callCount).to.be.equal(1, 'priority call count');
expect(crawler.queues.normal.pop.callCount).to.be.equal(1, 'normal call count'); expect(crawler.queues.normal.pop.callCount).to.be.equal(1, 'normal call count');
@ -963,8 +1004,9 @@ describe('Crawler whole meal deal', () => {
expect(crawler.queues.normal.done.callCount).to.be.equal(1); expect(crawler.queues.normal.done.callCount).to.be.equal(1);
expect(crawler.logger.error.callCount).to.be.equal(1); expect(crawler.logger.info.callCount).to.be.equal(1);
}); },
error => assert.fail());
}); });
it('should empty request queues', () => { it('should empty request queues', () => {
@ -982,9 +1024,8 @@ describe('Crawler whole meal deal', () => {
crawler.queues.normal = normal; crawler.queues.normal = normal;
crawler.requestor.responses = [createResponse(null, 500)]; crawler.requestor.responses = [createResponse(null, 500)];
return Q.try(() => { return Q.try(() => { return crawler.processOne({ name: 'test' }); }).then(
return crawler.processOne({ name: 'test' }); () => {
}).then(() => {
expect(crawler.queues.priority.pop.callCount).to.be.equal(1); expect(crawler.queues.priority.pop.callCount).to.be.equal(1);
expect(crawler.queues.normal.pop.callCount).to.be.equal(1); expect(crawler.queues.normal.pop.callCount).to.be.equal(1);
@ -1011,7 +1052,8 @@ describe('Crawler whole meal deal', () => {
expect(crawler.logger.error.callCount).to.be.equal(1); expect(crawler.logger.error.callCount).to.be.equal(1);
const error = crawler.logger.error.getCall(0).args[0]; const error = crawler.logger.error.getCall(0).args[0];
expect(error.message).to.be.equal('cant pop'); expect(error.message).to.be.equal('cant pop');
}); },
error => assert.fail());
}); });
it('should handle fetch reject', () => { it('should handle fetch reject', () => {
@ -1020,9 +1062,8 @@ describe('Crawler whole meal deal', () => {
// setup a good request but a server error response // setup a good request but a server error response
crawler.queues.normal.requests = [new Request('user', 'http://test.com/users/user1')]; crawler.queues.normal.requests = [new Request('user', 'http://test.com/users/user1')];
crawler.requestor.responses = [createResponse(null, 500)]; crawler.requestor.responses = [createResponse(null, 500)];
return Q.try(() => { return Q.try(() => { return crawler.processOne({ name: 'test' }); }).then(
return crawler.processOne({ name: 'test' }); () => {
}).then(() => {
expect(crawler.queues.priority.pop.callCount).to.be.equal(1); expect(crawler.queues.priority.pop.callCount).to.be.equal(1);
expect(crawler.queues.normal.pop.callCount).to.be.equal(1); expect(crawler.queues.normal.pop.callCount).to.be.equal(1);
@ -1057,7 +1098,8 @@ describe('Crawler whole meal deal', () => {
expect(crawler.logger.error.callCount).to.be.equal(1); expect(crawler.logger.error.callCount).to.be.equal(1);
const error = crawler.logger.error.getCall(0).args[0]; const error = crawler.logger.error.getCall(0).args[0];
expect(error.message.includes('500')).to.be.true; expect(error.message.includes('500')).to.be.true;
}); },
error => assert.fail());
}); });
it('should handle process document reject', () => { it('should handle process document reject', () => {
@ -1066,9 +1108,8 @@ describe('Crawler whole meal deal', () => {
crawler.queues.normal.requests = [new Request('user', 'http://test.com/users/user1')]; crawler.queues.normal.requests = [new Request('user', 'http://test.com/users/user1')];
crawler.requestor.responses = [createResponse({ id: 42, repos_url: 'http://test.com/users/user1/repos' })]; crawler.requestor.responses = [createResponse({ id: 42, repos_url: 'http://test.com/users/user1/repos' })];
return Q.try(() => { return Q.try(() => { return crawler.processOne({ name: 'test' }); }).then(
return crawler.processOne({ name: 'test' }); () => {
}).then(() => {
expect(crawler.queues.priority.pop.callCount).to.be.equal(1); expect(crawler.queues.priority.pop.callCount).to.be.equal(1);
expect(crawler.queues.normal.pop.callCount).to.be.equal(1); expect(crawler.queues.normal.pop.callCount).to.be.equal(1);
@ -1085,6 +1126,8 @@ describe('Crawler whole meal deal', () => {
expect(requestorGet.callCount).to.be.equal(1); expect(requestorGet.callCount).to.be.equal(1);
expect(requestorGet.getCall(0).args[0]).to.be.equal('http://test.com/users/user1'); expect(requestorGet.getCall(0).args[0]).to.be.equal('http://test.com/users/user1');
expect(crawler._errorHandler.callCount).to.be.equal(1);
const push = crawler.queues.normal.push; const push = crawler.queues.normal.push;
expect(push.callCount).to.be.equal(1); expect(push.callCount).to.be.equal(1);
const newRequest = push.getCall(0).args[0]; const newRequest = push.getCall(0).args[0];
@ -1103,7 +1146,8 @@ describe('Crawler whole meal deal', () => {
expect(crawler.logger.error.callCount).to.be.equal(1); expect(crawler.logger.error.callCount).to.be.equal(1);
const error = crawler.logger.error.getCall(0).args[0]; const error = crawler.logger.error.getCall(0).args[0];
expect(error instanceof Error).to.be.true; expect(error instanceof Error).to.be.true;
}); },
error => assert.fail());
}); });
it('should handle store document reject', () => { it('should handle store document reject', () => {
@ -1112,9 +1156,8 @@ describe('Crawler whole meal deal', () => {
crawler.queues.normal.requests = [new Request('user', 'http://test.com/users/user1')]; crawler.queues.normal.requests = [new Request('user', 'http://test.com/users/user1')];
crawler.requestor.responses = [createResponse({ id: 42, repos_url: 'http://test.com/users/user1/repos' })]; crawler.requestor.responses = [createResponse({ id: 42, repos_url: 'http://test.com/users/user1/repos' })];
return Q.try(() => { return Q.try(() => { return crawler.processOne({ name: 'test' }); }).then(
return crawler.processOne({ name: 'test' }); () => {
}).then(() => {
const unlock = crawler.locker.unlock; const unlock = crawler.locker.unlock;
expect(unlock.callCount).to.be.equal(1); expect(unlock.callCount).to.be.equal(1);
expect(unlock.getCall(0).args[0]).to.be.equal('lockToken'); expect(unlock.getCall(0).args[0]).to.be.equal('lockToken');
@ -1130,7 +1173,8 @@ describe('Crawler whole meal deal', () => {
expect(crawler.logger.error.callCount).to.be.equal(1); expect(crawler.logger.error.callCount).to.be.equal(1);
const error = crawler.logger.error.getCall(0).args[0]; const error = crawler.logger.error.getCall(0).args[0];
expect(error instanceof Error).to.be.true; expect(error instanceof Error).to.be.true;
}); },
error => assert.fail());
}); });
it('should handle complete request reject', () => { it('should handle complete request reject', () => {
@ -1139,9 +1183,8 @@ describe('Crawler whole meal deal', () => {
crawler.queues.normal.requests = [new Request('user', 'http://test.com/users/user1')]; crawler.queues.normal.requests = [new Request('user', 'http://test.com/users/user1')];
crawler.requestor.responses = [createResponse({ id: 42, repos_url: 'http://test.com/users/user1/repos' })]; crawler.requestor.responses = [createResponse({ id: 42, repos_url: 'http://test.com/users/user1/repos' })];
return Q.try(() => { return Q.try(() => { return crawler.processOne({ name: 'test' }); }).then(
return crawler.processOne({ name: 'test' }); () => {
}).then(() => {
const push = crawler.queues.normal.push; const push = crawler.queues.normal.push;
expect(push.callCount).to.be.equal(1); expect(push.callCount).to.be.equal(1);
const newRequest = push.getCall(0).args[0]; const newRequest = push.getCall(0).args[0];
@ -1151,7 +1194,8 @@ describe('Crawler whole meal deal', () => {
expect(crawler.logger.error.callCount).to.be.equal(1); expect(crawler.logger.error.callCount).to.be.equal(1);
const error = crawler.logger.error.getCall(0).args[0]; const error = crawler.logger.error.getCall(0).args[0];
expect(error instanceof Error).to.be.true; expect(error instanceof Error).to.be.true;
}); },
error => assert.fail());
}); });
}); });
@ -1164,7 +1208,7 @@ function createFullCrawler() {
normal.requests = []; normal.requests = [];
sinon.stub(normal, 'pop', () => { return Q(normal.requests.shift()); }); sinon.stub(normal, 'pop', () => { return Q(normal.requests.shift()); });
sinon.stub(normal, 'push', request => { return Q(); }); sinon.stub(normal, 'push', request => { return Q(); });
sinon.spy(normal, 'done'); sinon.stub(normal, 'done', request => { return Q(); });
const queues = createBaseQueues({ priority: priority, normal: normal }); const queues = createBaseQueues({ priority: priority, normal: normal });
@ -1194,6 +1238,9 @@ function createFullCrawler() {
const result = createBaseCrawler({ queues: queues, requestor: requestor, store: store, logger: logger, locker: locker, options: config }); const result = createBaseCrawler({ queues: queues, requestor: requestor, store: store, logger: logger, locker: locker, options: config });
result.processor = processor; result.processor = processor;
sinon.spy(result, '_errorHandler');
return result; return result;
} }