add mutables notion and optimize commit/comment processing

This commit is contained in:
Jeff McAffer 2017-01-13 09:29:53 -08:00
Родитель bb80ed725b
Коммит ee9984a32f
4 изменённых файлов: 46 добавлений и 22 удалений

Просмотреть файл

@ -29,8 +29,7 @@ class GitHubFetcher {
if (initial === 'storage') {
return this._fetchFromStorage(request);
}
const checkEtag = request.policy.fetch === 'originStorage';
return this._fetchFromGitHub(request, checkEtag);
return this._fetchFromGitHub(request, initial === 'etag');
}
_fetchFromGitHub(request, checkEtag) {
@ -97,10 +96,11 @@ class GitHubFetcher {
_addTokenToUrl(request, options) {
let token = options.headers.authorization;
if (!token) {
return request.url
return request.url;
}
const urlSpec = URL.parse(request.url, true);
urlSpec.query.access_token = token.slice(6);
delete urlSpec.search;
delete options.headers.authorization;
return URL.format(urlSpec);
}
@ -120,8 +120,9 @@ class GitHubFetcher {
_requeueBenched(request, benchTime) {
request.delayUntil(benchTime);
request.addMeta({ benchDelay: benchTime - Date.now() });
return request.markRequeue('Benched', `Wait for token while getting ${request.url}`);
const benchDelay = benchTime - Date.now();
request.addMeta({ benchDelay: benchDelay });
return request.markRequeue('Benched', `Waiting ${benchDelay} for a token`);
}
_requeueThrottled(request) {
@ -129,7 +130,7 @@ class GitHubFetcher {
request.exhaustToken(Date.now() + delay);
request.delay(delay);
request.addMeta({ forbiddenDelay: delay });
return request.markRequeue('Throttled', `GitHub throttled ${request.url}`);
return request.markRequeue('Throttled', 'GitHub secondary throttling kicked in');
}
_fetchFromStorage(request) {

Просмотреть файл

@ -23,11 +23,14 @@ class GitHubProcessor {
}
const oldVersion = request.document._metadata.version;
if (!request.policy.shouldProcess(request, this.version)) {
if (request.policy.shouldProcess(request, this.version)) {
request.processMode = 'process';
} else {
// We are not going to process but may still need to traverse the doc to get to referenced docs that
// do need processing. If so, mark the request for no saving (already have good content) and carry on.
// Otherwise, skip the doc altogether.
if (request.policy.shouldTraverse(request)) {
request.processMode = 'traverse';
request.markNoSave();
} else {
request.markSkip('Excluded', `Traversal policy excluded this resource`);
@ -84,12 +87,14 @@ class GitHubProcessor {
elementType = relation.type;
}
// Queue up the page elements. Use the same policy as this request as the page itself is more of an implementation
// detail and should not be part of the user model of transitivity.
// detail and should not be part of the user model of transitivity. If the current request is a relation, be sure to
// queue a 'reference' relationship since the other side of the relation is not 'contained' by the origin or the relation itself.
document.elements.forEach(item => {
if (elementType) {
const elementQualifier = this.isRootType(elementType) ? 'urn:' : qualifier;
const newContext = { qualifier: elementQualifier, history: request.context.history };
request.queue(request.relationship, elementType, item.url, newContext, request.policy);
const newRelationship = relation ? 'reference' : request.relationship;
request.queue(newRelationship, elementType, item.url, newContext, request.policy);
} else {
// TODO if there is no elementType on a collection then assume it is events. Need to fix this up and
// formalize the model of collections where the request carries the payload.
@ -186,7 +191,10 @@ class GitHubProcessor {
request.addSelfLink('sha');
request.linkSiblings(`${context.qualifier}:commits`);
if (document.comments_url) {
// Most often there actually are no comments. Get the comments if we think there will be some and this resource is being processed (vs. traversed).
// Note that if we are doing event processing, new comments will be added to the list dynamically so the only reason we need to refetch the
// comment list in general is if we think we missed some events.
if (document.comments_url && (document.commit.comment_count > 0 && request.processMode === 'process')) {
this._addCollection(request, 'commit_comments', 'commit_comment', document.comments_url, `${document._metadata.links.self.href}:commit_comments`);
}
this._addRootContainer(request, 'repo', 'repo', document.url.replace(/\/commits\/.*/, ''), `${context.qualifier}`);
@ -714,7 +722,8 @@ class GitHubProcessor {
// Also add a 'knownPages' link to help deal with page clean up.
// request.linkCollection('knownPages', `${urn}:pages`);
const context = { qualifier: qualifier, relation: { origin: request.type, qualifier: urn, type: type, guid: guid } };
request.queue('reference', name, url, context, false);
// Queue the relation pages as 'contains' since the current resource does 'contain' the list of things to which it is related.
request.queue('contains', name, url, context, false);
}
/**

Просмотреть файл

@ -127,7 +127,8 @@ class Request {
}
queueRequests(requests, name = null) {
const toQueue = (Array.isArray(requests) ? requests : [requests]).filter(request => !this.hasSeen(request));
requests = Array.isArray(requests) ? requests : [requests];
const toQueue = requests.filter(request => !this.hasSeen(request));
this.track(this.crawler.queue(toQueue, name));
}

Просмотреть файл

@ -8,10 +8,11 @@
Fetch behavior
Fetch behavior -- Defines the authoritative source for content. The first word of the name identifies the authority.
* storageOnly - Only use stored content. Skip this resource if we don't already have it
* originStorage - Use stored content if it is up to date. Otherwise, get content from original source
* storageOriginIfMissing - Use stored content. If missing, get content from original source
* originStorage - Origin rules. Consider storage first and use it if it matches origin. Otherwise, get content from origin
* storageOriginIfMissing - Storage rules. Only if content is missing from storage, get content from origin
* mutables - Use originStorage if the resource is deemed mutable, storageOriginIfMissing if immutable
* originOnly - Always get content from original source
Freshness -- How the age of the resource, relative to what we have seen/done before, factors into whether or not to process the resource.
@ -180,10 +181,14 @@ class TraversalPolicy {
}
static default() {
return new TraversalPolicy('originStorage', 'match', 'broad');
return new TraversalPolicy('mutables', 'match', 'broad');
}
static refresh() {
return new TraversalPolicy('mutables', 'match', 'update');
}
static reload() {
return new TraversalPolicy('originStorage', 'match', 'update');
}
@ -200,7 +205,7 @@ class TraversalPolicy {
}
static reprocessAndUpdate() {
return new TraversalPolicy('originStorage', 'matchOrVersion', 'update');
return new TraversalPolicy('mutables', 'matchOrVersion', 'update');
}
static clone(policy) {
@ -230,10 +235,13 @@ class TraversalPolicy {
return true;
}
if (this.freshness === 'match') {
// process if the content we got did NOT come from the cache (i.e., either is it new or never seen)
return request.origin !== 'cacheOfOrigin';
// If the content came from origin then either we did not have it cached or it did not match, so process it.
return request.contentOrigin === 'origin';
}
if (typeof this.freshness === 'number') {
// TODO this is not quite right. To tell time freshness we need to get the cached version but if we need to process
// we need the content from origin. Essentially we need to read the processed time with the etag and (at that point)
// determine if the content is stale. Testing here is too late.
return moment.diff(request.document._metadata.processedAt, 'hours') > this.freshness * 24;
}
if (this.freshness === 'version' || this.freshness === 'matchOrVersion') {
@ -250,11 +258,16 @@ class TraversalPolicy {
return this._getTransitivitySpec().shouldTraverse(request);
}
isImmutable(type) {
return ['commit'].includes(type);
}
/**
* Return the source from which to perform the initial fetch for the given request's resource.
*/
initialFetch(request) {
const result = { storageOnly: 'storage', originStorage: 'origin', storageOriginIfMissing: 'storage', originOnly: 'origin' }[this.fetch];
const mutablesValue = this.isImmutable(request.type) ? 'storage' : 'etag';
const result = { storageOnly: 'storage', originStorage: 'etag', originMutable: 'storage', storageOriginIfMissing: 'storage', mutables: mutablesValue, originOnly: 'origin' }[this.fetch];
if (!result) {
throw new Error(`Fetch policy misconfigured ${this.fetch}`);
}
@ -265,7 +278,7 @@ class TraversalPolicy {
* Return the source from which to fetch if the original fetch did not find any content
*/
shouldFetchMissing(request) {
const result = { storageOnly: null, originStorage: 'origin', storageOriginIfMissing: 'origin', originOnly: null }[this.fetch];
const result = { storageOnly: null, originStorage: 'origin', storageOriginIfMissing: 'origin', mutables: 'origin', originOnly: null }[this.fetch];
if (result === undefined) {
throw new Error(`Fetch policy misconfigured ${this.fetch}`);
}
@ -276,7 +289,7 @@ class TraversalPolicy {
* Return a symbolic short form to uniquely identify this policy.
*/
getShortForm() {
const fetch = { storageOnly: 'S', storageOriginIfMissing: 's', originOnly: 'O', originStorage: 'o' }[this.fetch];
const fetch = { storageOnly: 'S', storageOriginIfMissing: 's', originOnly: 'O', originStorage: 'o', mutables: 'm' }[this.fetch];
let freshness = { always: 'A', match: 'M', version: 'V', matchOrVersion: 'm' }[this.freshness];
if (!freshness) {
if (typeof this.policy.freshness === 'number') {