Mirror of https://github.com/microsoft/ghcrawler.git
update payload mechanism
Parent: 67b5ff59a2
Commit: 0c0d776b0f
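In short: event payloads attached to queued requests change from the bare event object to an envelope carrying both the body and an etag, and the crawler's fetch step now serves that envelope directly instead of fetching from GitHub. The sketch below is a condensed, runnable illustration of that round trip, not the project's code: the Request class here is a stub standing in for lib/request.js, and the two helper functions paraphrase the GitHubProcessor and Crawler hunks that follow.

    // Minimal runnable sketch of the payload round trip introduced in this commit.
    class Request {
      constructor(type, url) {
        this.type = type;
        this.url = url;
      }
    }

    // Producer side (as in the GitHubProcessor hunk): wrap the event in an envelope with a
    // constant etag, since events are immutable and cannot be fetched again later.
    function queueEvent(parentUrl, event) {
      const newRequest = new Request(event.type, `${parentUrl}/${event.id}`);
      newRequest.payload = { etag: 1, body: event };
      return newRequest;
    }

    // Consumer side (as in the Crawler fetch hunk): if the payload is already on the request,
    // skip fetching and set the request up as if the document had come from the origin.
    function fetchFromPayload(request) {
      if (!request.payload) {
        return null; // the real code falls through to a network fetch here
      }
      request.document = request.payload.body;
      request.contentOrigin = 'origin';
      request.response = { headers: { etag: request.payload.etag } };
      return request;
    }

    // Round trip: the queued event comes back as the document without any network call.
    const queued = queueEvent('http://repo1/events', { id: 3, type: 'PushEvent' });
    console.log(fetchFromPayload(queued).document); // { id: 3, type: 'PushEvent' }
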
@@ -142,7 +142,7 @@ class Crawler {
       // If we could not acquire a lock, requeue. If the "error" is a normal Exceeded scenario, requeue normally
       // noting that we could not get a lock. For any other error, requeue and capture the error for debugging.
       if (error.message.startsWith('Exceeded')) {
-        return request.markRequeue('Requeued', 'Could not lock');
+        return request.markRequeue('Collision', 'Could not lock');
       }
       return request.markRequeue('Error', error);
     });
@@ -213,7 +213,6 @@ class Crawler {
         return this._queueDead(request);
       }
       request.addMeta({ attempt: request.attemptCount });
       this.logger.info(`Requeuing attempt ${request.attemptCount} of request ${request.type}@${request.url}`);
       const queuable = this._createQueuable(request);
       return this.queues.repush(request, queuable);
     });
@@ -239,8 +238,10 @@ class Crawler {
         return request.markSkip('no event payload');
       }
       if (request.payload) {
-        request.document = request.payload;
+        // The request already has the document, so no need to fetch. Setup the request as if it was actually fetched.
+        request.document = request.payload.body;
         request.contentOrigin = 'origin';
+        request.response = { headers: { etag: request.payload.etag } };
         return request;
       }
       return this._logStartEnd('fetching', request, () => {
@@ -278,6 +279,8 @@ class Crawler {
       if (Array.isArray(request.document)) {
         request.document = { elements: request.document };
       }
+      if (typeof request.document === 'string')
+        console.log('got a string document');
       request.document._metadata = metadata;
       return Q(request);
     }

@@ -47,7 +47,7 @@ class GitHubFetcher {
         request.exhaustToken(Date.now() + delay);
         request.delay(delay);
         request.addMeta({ forbiddenDelay: delay });
-        return request.markRequeue(`GitHub throttled ${request.url}`);
+        return request.markRequeue('Throttled', `GitHub throttled ${request.url}`);
       }
       throw new Error(`Code ${status} for ${request.url}`);
     }

@@ -305,8 +305,9 @@ class GitHubProcessor {
     const newRequests = newEvents.map(event => {
       // make sure the URL here is unique. Even though it will not actually be fetched (the content
       // is in the payload), it will need to be unique for the queue tagging/optimization
+      // Events are immutable (and we can't fetch them later) so set the etag to a constant
       const newRequest = new Request(event.type, `${request.url}/${event.id}`);
-      newRequest.payload = event;
+      newRequest.payload = { etag: 1, body: event };
       return newRequest;
     });
     request.queue(newRequests);

@@ -118,7 +118,7 @@ describe('Crawler get request', () => {
     return crawler._getRequest(requestBox, { name: 'test' }).then(
       request => {
         expect(request.shouldRequeue()).to.be.true;
-        expect(request.outcome).to.be.equal('Requeued');
+        expect(request.outcome).to.be.equal('Collision');
         expect(request.message).to.be.equal('Could not lock');
       },
       request => assert.fail());

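The assertions above pin down the two-argument markRequeue(outcome, message) form this commit standardizes on ('Collision', 'Throttled', and 'Error' at the call sites). As a rough mental model only, not the project's actual Request implementation, the contract implied by the call sites and tests looks something like this:

    // Hypothetical sketch: markRequeue records an outcome label and a message and marks the
    // request so shouldRequeue() reports true. This is NOT the real Request class.
    class RequestSketch {
      markRequeue(outcome, message) {
        this.outcome = outcome;   // e.g. 'Collision', 'Throttled', 'Error'
        this.message = message;   // e.g. 'Could not lock'
        this._requeue = true;
        return this;
      }
      shouldRequeue() {
        return this._requeue === true;
      }
    }

    // Mirrors the updated test expectations.
    const sketch = new RequestSketch().markRequeue('Collision', 'Could not lock');
    console.log(sketch.shouldRequeue(), sketch.outcome, sketch.message); // true 'Collision' 'Could not lock'
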
@@ -2,6 +2,7 @@ const assert = require('chai').assert;
 const chai = require('chai');
 const expect = require('chai').expect;
 const GitHubProcessor = require('../lib/githubProcessor.js');
+const Q = require('q');
 const Request = require('../lib/request.js');
 const sinon = require('sinon');
 const TraversalPolicy = require('../lib/traversalPolicy');
@@ -1001,6 +1002,22 @@ describe('Watch processing', () => {
   });
 });
+
+describe('Event Finder', () => {
+  it('will skip duplicates', () => {
+    const docs = { 'http://repo1/events/3': '{ id: 3 }', 'http://repo1/events/4': '{ id: 4}' };
+    const store = { get: (type, url) => { return Q(docs[url]); } }
+    const events = [];
+    for (let i = 0; i < 20; i++) {
+      events.push({ id: i, repo: { url: 'http://repo1' } })
+    }
+    const processor = new GitHubProcessor();
+    processor.store = store;
+    processor._findNew(events).then(newEvents => {
+      expect(newEvents.length).to.be.equal(18);
+    });
+  });
+});

 // =========================== HELPERS =========================

 function expectLinks(actual, expected) {
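For context on the new Event Finder test: the store already holds documents for events 3 and 4, twenty events are generated, and 18 are expected back, so _findNew evidently keeps only the events whose document is not yet stored. The following is a plausible shape consistent with that test, assuming a promise-returning store.get keyed by `${event.repo.url}/events/${event.id}`; it is inferred from the test, not taken from githubProcessor.js.

    const Q = require('q');

    // Hypothetical _findNew shape: resolve each event against the store and keep the ones
    // that have no stored document yet.
    function findNewSketch(store, events) {
      return Q.all(events.map(event =>
        store.get('event', `${event.repo.url}/events/${event.id}`).then(found => (found ? null : event))
      )).then(results => results.filter(event => event !== null));
    }

    // Mirrors the test setup: 2 of 20 events are already stored, so 18 come back.
    const docs = { 'http://repo1/events/3': '{ id: 3 }', 'http://repo1/events/4': '{ id: 4 }' };
    const store = { get: (type, url) => Q(docs[url]) };
    const events = [];
    for (let i = 0; i < 20; i++) {
      events.push({ id: i, repo: { url: 'http://repo1' } });
    }
    findNewSketch(store, events).then(newEvents => console.log(newEvents.length)); // 18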