Jeff McAffer 2016-12-21 14:55:51 -08:00
Parent 67b5ff59a2
Commit 0c0d776b0f
5 changed files with 27 additions and 6 deletions

View File

@@ -142,7 +142,7 @@ class Crawler {
// If we could not acquire a lock, requeue. If the "error" is a normal Exceeded scenario, requeue normally
// noting that we could not get a lock. For any other error, requeue and capture the error for debugging.
if (error.message.startsWith('Exceeded')) {
- return request.markRequeue('Requeued', 'Could not lock');
+ return request.markRequeue('Collision', 'Could not lock');
}
return request.markRequeue('Error', error);
});
@@ -213,7 +213,6 @@ class Crawler {
return this._queueDead(request);
}
request.addMeta({ attempt: request.attemptCount });
- this.logger.info(`Requeuing attempt ${request.attemptCount} of request ${request.type}@${request.url}`);
const queuable = this._createQueuable(request);
return this.queues.repush(request, queuable);
});
@@ -239,8 +238,10 @@ class Crawler {
return request.markSkip('no event payload');
}
if (request.payload) {
- request.document = request.payload;
+ // The request already has the document, so no need to fetch. Setup the request as if it was actually fetched.
+ request.document = request.payload.body;
request.contentOrigin = 'origin';
+ request.response = { headers: { etag: request.payload.etag } };
return request;
}
return this._logStartEnd('fetching', request, () => {
@@ -278,6 +279,8 @@ class Crawler {
if (Array.isArray(request.document)) {
request.document = { elements: request.document };
}
+ if (typeof request.document === 'string')
+   console.log('got a string document');
request.document._metadata = metadata;
return Q(request);
}
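For reference, a minimal standalone sketch of the payload short-circuit added above: when a request arrives with a payload attached, the document comes from payload.body and a response is fabricated around payload.etag so downstream processing sees a normally fetched request. The fetchOrShortCircuit helper and the stub request object below are illustrative only, not part of the crawler code.

const Q = require('q');

// Illustrative helper mirroring the payload short-circuit in Crawler._fetch;
// realFetch stands in for the normal fetch path.
function fetchOrShortCircuit(request, realFetch) {
  if (request.payload) {
    // The document is already on the request; fake the fetch results.
    request.document = request.payload.body;
    request.contentOrigin = 'origin';
    request.response = { headers: { etag: request.payload.etag } };
    return Q(request);
  }
  return realFetch(request);
}

// A queued event request (see the GitHubProcessor change below) never hits the network.
const eventRequest = { type: 'PushEvent', url: 'http://repo1/events/42', payload: { etag: 1, body: { id: 42 } } };
fetchOrShortCircuit(eventRequest, () => Q.reject(new Error('unexpected fetch')))
  .then(r => console.log(r.contentOrigin, r.response.headers.etag)); // origin 1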

View File

@@ -47,7 +47,7 @@ class GitHubFetcher {
request.exhaustToken(Date.now() + delay);
request.delay(delay);
request.addMeta({ forbiddenDelay: delay });
- return request.markRequeue(`GitHub throttled ${request.url}`);
+ return request.markRequeue('Throttled', `GitHub throttled ${request.url}`);
}
throw new Error(`Code ${status} for ${request.url}`);
}
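Both this call and the 'Collision' change in the Crawler above now pass markRequeue an outcome plus a message, which is what the crawler test below asserts via request.outcome, request.message, and shouldRequeue(). A rough sketch of that contract follows; the internal requeue flag name is assumed, and the real class is lib/request.js.

// Rough sketch of the Request.markRequeue(outcome, message) contract; only
// outcome, message, and shouldRequeue() come from the code and tests here.
class RequestSketch {
  markRequeue(outcome, message) {
    this._requeueFlag = true;   // assumed internal state
    this.outcome = outcome;     // e.g. 'Collision', 'Throttled', 'Error'
    this.message = message;     // human-readable detail for logs/debugging
    return this;
  }
  shouldRequeue() {
    return this._requeueFlag === true;
  }
}

const r = new RequestSketch().markRequeue('Throttled', 'GitHub throttled http://example/repo');
console.log(r.shouldRequeue(), r.outcome); // true 'Throttled'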

View File

@@ -305,8 +305,9 @@ class GitHubProcessor {
const newRequests = newEvents.map(event => {
// make sure the URL here is unique. Even though it will not actually be fetched (the content
// is in the payload), it will need to be unique for the queue tagging/optimization
+ // Events are immutable (and we can't fetch them later) so set the etag to a constant
const newRequest = new Request(event.type, `${request.url}/${event.id}`);
- newRequest.payload = event;
+ newRequest.payload = { etag: 1, body: event };
return newRequest;
});
request.queue(newRequests);
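Putting the producer side together: each new event becomes its own request with a unique URL (for queue tagging) and a constant etag, since the event body rides along in the payload and can never be refetched. The repo URL, events, and require path below are made-up sample data.

const Request = require('./lib/request.js');  // path assumed; adjust to the caller's location

const repoEventsUrl = 'http://repo1/events';  // sample data only
const sampleEvents = [{ id: 7, type: 'PushEvent' }, { id: 8, type: 'IssuesEvent' }];
const newRequests = sampleEvents.map(event => {
  const newRequest = new Request(event.type, `${repoEventsUrl}/${event.id}`);
  newRequest.payload = { etag: 1, body: event };  // constant etag; body carries the immutable event
  return newRequest;
});
// request.queue(newRequests) then hands these to the crawler, where the payload
// short-circuit shown above supplies the document without a fetch.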

View File

@@ -118,7 +118,7 @@ describe('Crawler get request', () => {
return crawler._getRequest(requestBox, { name: 'test' }).then(
request => {
expect(request.shouldRequeue()).to.be.true;
- expect(request.outcome).to.be.equal('Requeued');
+ expect(request.outcome).to.be.equal('Collision');
expect(request.message).to.be.equal('Could not lock');
},
request => assert.fail());

View File

@@ -2,6 +2,7 @@ const assert = require('chai').assert;
const chai = require('chai');
const expect = require('chai').expect;
const GitHubProcessor = require('../lib/githubProcessor.js');
+ const Q = require('q');
const Request = require('../lib/request.js');
const sinon = require('sinon');
const TraversalPolicy = require('../lib/traversalPolicy');
@@ -1001,6 +1002,22 @@ describe('Watch processing', () => {
});
});
+ describe('Event Finder', () => {
+   it('will skip duplicates', () => {
+     const docs = { 'http://repo1/events/3': '{ id: 3 }', 'http://repo1/events/4': '{ id: 4}' };
+     const store = { get: (type, url) => { return Q(docs[url]); } }
+     const events = [];
+     for (let i = 0; i < 20; i++) {
+       events.push({ id: i, repo: { url: 'http://repo1' } })
+     }
+     const processor = new GitHubProcessor();
+     processor.store = store;
+     processor._findNew(events).then(newEvents => {
+       expect(newEvents.length).to.be.equal(18);
+     });
+   });
+ });
// =========================== HELPERS =========================
function expectLinks(actual, expected) {
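The new test above pins down _findNew's observable behaviour: of 20 events, the two whose URLs are already in the store (ids 3 and 4) are dropped, leaving 18. A plausible reading of that behaviour, inferred only from the test (the real implementation is in lib/githubProcessor.js and may differ, for example in the type argument passed to the store):

const Q = require('q');

// Inferred-from-test sketch: an event counts as "new" when the store has no
// document at <repo url>/events/<event id>. The 'event' type argument is an assumption.
function findNewSketch(store, events) {
  const lookups = events.map(event =>
    store.get('event', `${event.repo.url}/events/${event.id}`).then(found => (found ? null : event)));
  return Q.all(lookups).then(results => results.filter(event => event !== null));
}

// Against the fixtures in the test above, this resolves to 18 of the 20 events.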