зеркало из https://github.com/microsoft/ghcrawler.git
add mutables notion and optimize commit/comment processing
This commit is contained in:
Родитель
bb80ed725b
Коммит
ee9984a32f
|
@ -29,8 +29,7 @@ class GitHubFetcher {
|
|||
if (initial === 'storage') {
|
||||
return this._fetchFromStorage(request);
|
||||
}
|
||||
const checkEtag = request.policy.fetch === 'originStorage';
|
||||
return this._fetchFromGitHub(request, checkEtag);
|
||||
return this._fetchFromGitHub(request, initial === 'etag');
|
||||
}
|
||||
|
||||
_fetchFromGitHub(request, checkEtag) {
|
||||
|
@ -97,10 +96,11 @@ class GitHubFetcher {
|
|||
_addTokenToUrl(request, options) {
|
||||
let token = options.headers.authorization;
|
||||
if (!token) {
|
||||
return request.url
|
||||
return request.url;
|
||||
}
|
||||
const urlSpec = URL.parse(request.url, true);
|
||||
urlSpec.query.access_token = token.slice(6);
|
||||
delete urlSpec.search;
|
||||
delete options.headers.authorization;
|
||||
return URL.format(urlSpec);
|
||||
}
|
||||
|
@ -120,8 +120,9 @@ class GitHubFetcher {
|
|||
|
||||
_requeueBenched(request, benchTime) {
|
||||
request.delayUntil(benchTime);
|
||||
request.addMeta({ benchDelay: benchTime - Date.now() });
|
||||
return request.markRequeue('Benched', `Wait for token while getting ${request.url}`);
|
||||
const benchDelay = benchTime - Date.now();
|
||||
request.addMeta({ benchDelay: benchDelay });
|
||||
return request.markRequeue('Benched', `Waiting ${benchDelay} for a token`);
|
||||
}
|
||||
|
||||
_requeueThrottled(request) {
|
||||
|
@ -129,7 +130,7 @@ class GitHubFetcher {
|
|||
request.exhaustToken(Date.now() + delay);
|
||||
request.delay(delay);
|
||||
request.addMeta({ forbiddenDelay: delay });
|
||||
return request.markRequeue('Throttled', `GitHub throttled ${request.url}`);
|
||||
return request.markRequeue('Throttled', 'GitHub secondary throttling kicked in');
|
||||
}
|
||||
|
||||
_fetchFromStorage(request) {
|
||||
|
|
|
@ -23,11 +23,14 @@ class GitHubProcessor {
|
|||
}
|
||||
|
||||
const oldVersion = request.document._metadata.version;
|
||||
if (!request.policy.shouldProcess(request, this.version)) {
|
||||
if (request.policy.shouldProcess(request, this.version)) {
|
||||
request.processMode = 'process';
|
||||
} else {
|
||||
// We are not going to process but may still need to traverse the doc to get to referenced docs that
|
||||
// do need processing. If so, mark the request for no saving (already have good content) and carry on.
|
||||
// Otherwise, skip the doc altogether.
|
||||
if (request.policy.shouldTraverse(request)) {
|
||||
request.processMode = 'traverse';
|
||||
request.markNoSave();
|
||||
} else {
|
||||
request.markSkip('Excluded', `Traversal policy excluded this resource`);
|
||||
|
@ -84,12 +87,14 @@ class GitHubProcessor {
|
|||
elementType = relation.type;
|
||||
}
|
||||
// Queue up the page elements. Use the same policy as this request as the page itself is more of an implementation
|
||||
// detail and should not be part of the user model of transitivity.
|
||||
// detail and should not be part of the user model of transitivity. If the current request is a relation, be sure to
|
||||
// queue a 'reference' relationship since the other side of the relation is not 'contained' by the origin or the relation itself.
|
||||
document.elements.forEach(item => {
|
||||
if (elementType) {
|
||||
const elementQualifier = this.isRootType(elementType) ? 'urn:' : qualifier;
|
||||
const newContext = { qualifier: elementQualifier, history: request.context.history };
|
||||
request.queue(request.relationship, elementType, item.url, newContext, request.policy);
|
||||
const newRelationship = relation ? 'reference' : request.relationship;
|
||||
request.queue(newRelationship, elementType, item.url, newContext, request.policy);
|
||||
} else {
|
||||
// TODO if there is no elementType on a collection then assume it is events. Need to fix this up and
|
||||
// formalize the model of collections where the request carries the payload.
|
||||
|
@ -186,7 +191,10 @@ class GitHubProcessor {
|
|||
request.addSelfLink('sha');
|
||||
request.linkSiblings(`${context.qualifier}:commits`);
|
||||
|
||||
if (document.comments_url) {
|
||||
// Most often there actually are no comments. Get the comments if we think there will be some and this resource is being processed (vs. traversed).
|
||||
// Note that if we are doing event processing, new comments will be added to the list dynamically so the only reason we need to refetch the
|
||||
// comment list in general is if we think we missed some events.
|
||||
if (document.comments_url && (document.commit.comment_count > 0 && request.processMode === 'process')) {
|
||||
this._addCollection(request, 'commit_comments', 'commit_comment', document.comments_url, `${document._metadata.links.self.href}:commit_comments`);
|
||||
}
|
||||
this._addRootContainer(request, 'repo', 'repo', document.url.replace(/\/commits\/.*/, ''), `${context.qualifier}`);
|
||||
|
@ -714,7 +722,8 @@ class GitHubProcessor {
|
|||
// Also add a 'knownPages' link to help deal with page clean up.
|
||||
// request.linkCollection('knownPages', `${urn}:pages`);
|
||||
const context = { qualifier: qualifier, relation: { origin: request.type, qualifier: urn, type: type, guid: guid } };
|
||||
request.queue('reference', name, url, context, false);
|
||||
// queue the relation pages as contains as the current resource does 'contain' the list of things to which it is related.
|
||||
request.queue('contains', name, url, context, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -127,7 +127,8 @@ class Request {
|
|||
}
|
||||
|
||||
queueRequests(requests, name = null) {
|
||||
const toQueue = (Array.isArray(requests) ? requests : [requests]).filter(request => !this.hasSeen(request));
|
||||
requests = Array.isArray(requests) ? requests : [requests];
|
||||
const toQueue = requests.filter(request => !this.hasSeen(request));
|
||||
this.track(this.crawler.queue(toQueue, name));
|
||||
}
|
||||
|
||||
|
|
|
@ -8,10 +8,11 @@
|
|||
|
||||
|
||||
|
||||
Fetch behavior
|
||||
Fetch behavior -- Defines the authoritative source for content. The first word of the name identifies the authority.
|
||||
* storageOnly - Only use stored content. Skip this resource if we don't already have it
|
||||
* originStorage - Use stored content if it is up to date. Otherwise, get content from original source
|
||||
* storageOriginIfMissing - Use stored content. If missing, get content from original source
|
||||
* originStorage - Origin rules. Consider storage first and use it if it matches origin. Otherwise, get content from origin
|
||||
* storageOriginIfMissing - Storage rules. Only if content is missing from storage, get content from origin
|
||||
* mutables - Use originStorage if the resource is deemed mutable, storageOriginIfMissing if immutable
|
||||
* originOnly - Always get content from original source
|
||||
|
||||
Freshness -- How the age of the resource, relative to what we have seen/done before, factors into whether or not to process the resource.
|
||||
|
@ -180,10 +181,14 @@ class TraversalPolicy {
|
|||
}
|
||||
|
||||
static default() {
|
||||
return new TraversalPolicy('originStorage', 'match', 'broad');
|
||||
return new TraversalPolicy('mutables', 'match', 'broad');
|
||||
}
|
||||
|
||||
static refresh() {
|
||||
return new TraversalPolicy('mutables', 'match', 'update');
|
||||
}
|
||||
|
||||
static reload() {
|
||||
return new TraversalPolicy('originStorage', 'match', 'update');
|
||||
}
|
||||
|
||||
|
@ -200,7 +205,7 @@ class TraversalPolicy {
|
|||
}
|
||||
|
||||
static reprocessAndUpdate() {
|
||||
return new TraversalPolicy('originStorage', 'matchOrVersion', 'update');
|
||||
return new TraversalPolicy('mutables', 'matchOrVersion', 'update');
|
||||
}
|
||||
|
||||
static clone(policy) {
|
||||
|
@ -230,10 +235,13 @@ class TraversalPolicy {
|
|||
return true;
|
||||
}
|
||||
if (this.freshness === 'match') {
|
||||
// process if the content we got did NOT come from the cache (i.e., either is it new or never seen)
|
||||
return request.origin !== 'cacheOfOrigin';
|
||||
// process if the content came from origin: in that case either we did not have it cached or it did not match.
|
||||
return request.contentOrigin === 'origin';
|
||||
}
|
||||
if (typeof this.freshness === 'number') {
|
||||
// TODO this is not quite right. To tell time freshness we need to get the cached version but if we need to process
|
||||
// we need the content from origin. Essentially we need to read the processed time with the etag (at that point)
|
||||
// determine if the content is stale. Testing here is too late.
|
||||
return moment.diff(request.document._metadata.processedAt, 'hours') > this.freshness * 24;
|
||||
}
|
||||
if (this.freshness === 'version' || this.freshness === 'matchOrVersion') {
|
||||
|
@ -250,11 +258,16 @@ class TraversalPolicy {
|
|||
return this._getTransitivitySpec().shouldTraverse(request);
|
||||
}
|
||||
|
||||
isImmutable(type) {
|
||||
return ['commit'].includes(type);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the source from which to perform the initial fetch for the given request's resource.
|
||||
*/
|
||||
initialFetch(request) {
|
||||
const result = { storageOnly: 'storage', originStorage: 'origin', storageOriginIfMissing: 'storage', originOnly: 'origin' }[this.fetch];
|
||||
const mutablesValue = this.isImmutable(request.type) ? 'storage' : 'etag';
|
||||
const result = { storageOnly: 'storage', originStorage: 'etag', originMutable: 'storage', storageOriginIfMissing: 'storage', mutables: mutablesValue, originOnly: 'origin' }[this.fetch];
|
||||
if (!result) {
|
||||
throw new Error(`Fetch policy misconfigured ${this.fetch}`);
|
||||
}
|
||||
|
@ -265,7 +278,7 @@ class TraversalPolicy {
|
|||
* Return the source from which to fetch if the original fetch did not find any content
|
||||
*/
|
||||
shouldFetchMissing(request) {
|
||||
const result = { storageOnly: null, originStorage: 'origin', storageOriginIfMissing: 'origin', originOnly: null }[this.fetch];
|
||||
const result = { storageOnly: null, originStorage: 'origin', storageOriginIfMissing: 'origin', mutables: 'origin', originOnly: null }[this.fetch];
|
||||
if (result === undefined) {
|
||||
throw new Error(`Fetch policy misconfigured ${this.fetch}`);
|
||||
}
|
||||
|
@ -276,7 +289,7 @@ class TraversalPolicy {
|
|||
* Return a symbolic short form to uniquely identify this policy.
|
||||
*/
|
||||
getShortForm() {
|
||||
const fetch = { storageOnly: 'S', storageOriginIfMissing: 's', originOnly: 'O', originStorage: 'o' }[this.fetch];
|
||||
const fetch = { storageOnly: 'S', storageOriginIfMissing: 's', originOnly: 'O', originStorage: 'o', mutables: 'm' }[this.fetch];
|
||||
let freshness = { always: 'A', match: 'M', version: 'V', matchOrVersion: 'm' }[this.freshness];
|
||||
if (!freshness) {
|
||||
if (typeof this.policy.freshness === 'number') {
|
||||
|
|
Загрузка…
Ссылка в новой задаче