зеркало из https://github.com/microsoft/ghcrawler.git
Storage stream (#171)
* Resolve most of security vulnerabilities * Add eslint * Replace createBlockBlobFromText with createWriteStreamToBlockBlob * Remove memory cache for upsert, get, and etag
This commit is contained in:
Родитель
b37be19929
Коммит
3dfcc087e1
|
@ -13,7 +13,7 @@ const CrawlerService = require('./lib/crawlerService');
|
|||
const Q = require('q');
|
||||
const QueueSet = require('./providers/queuing/queueSet');
|
||||
const redlock = require('redlock');
|
||||
const RefreshingConfig = require('refreshing-config');
|
||||
const RefreshingConfig = require('@microsoft/refreshing-config');
|
||||
const RefreshingConfigRedis = require('refreshing-config-redis');
|
||||
|
||||
let logger = null;
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
41
package.json
41
package.json
|
@ -25,46 +25,45 @@
|
|||
"url": "https://github.com/microsoft/ghcrawler"
|
||||
},
|
||||
"dependencies": {
|
||||
"@microsoft/refreshing-config": "^0.1.3",
|
||||
"amqp10": "noodlefrenzy/node-amqp10#issue295",
|
||||
"amqplib": "^0.5.1",
|
||||
"async": "^2.6.0",
|
||||
"azure-sb": "^0.10.6",
|
||||
"azure-storage": "^1.3.2",
|
||||
"azure-sb": "^0.11.0",
|
||||
"azure-storage": "^2.10.3",
|
||||
"body-parser": "^1.15.2",
|
||||
"connect-redis": "^3.1.0",
|
||||
"debug": "^2.6.1",
|
||||
"express": "^4.14.0",
|
||||
"connect-redis": "^3.4.1",
|
||||
"debug": "^4.1.1",
|
||||
"express": "^4.17.1",
|
||||
"express-init": "^1.1.0",
|
||||
"express-joi": "^0.3.1",
|
||||
"extend": "3.0.2",
|
||||
"ghrequestor": "^0.1.6",
|
||||
"ghrequestor": "^0.1.7",
|
||||
"htmlencode": "0.0.4",
|
||||
"ip": "^1.1.4",
|
||||
"memory-cache": "^0.1.6",
|
||||
"ip": "^1.1.5",
|
||||
"memory-cache": "^0.2.0",
|
||||
"mkdirp": "^0.5.1",
|
||||
"moment": "^2.22.2",
|
||||
"moment": "^2.24.0",
|
||||
"mongodb": "2.2.11",
|
||||
"morgan": "^1.7.0",
|
||||
"node-uuid": "^1.4.7",
|
||||
"painless-config": "^0.1.0",
|
||||
"morgan": "^1.9.1",
|
||||
"node-uuid": "^1.4.8",
|
||||
"painless-config": "^0.1.1",
|
||||
"parse-link-header": "^0.4.1",
|
||||
"promise-retry": "1.1.1",
|
||||
"q": "1.4.1",
|
||||
"q": "1.5.1",
|
||||
"qlimit": "^0.1.1",
|
||||
"redis": "2.6.3",
|
||||
"redis-metrics": "^0.4.1",
|
||||
"redis": "2.8.0",
|
||||
"redis-metrics": "^1.3.1",
|
||||
"redis-rate-limiter": "github:jeffmcaffer/redis-rate-limiter",
|
||||
"redlock": "2.0.1",
|
||||
"refreshing-config": "^0.1.2",
|
||||
"refreshing-config-redis": "^0.1.0",
|
||||
"tmp": "0.0.33"
|
||||
},
|
||||
"devDependencies": {
|
||||
"chai": "^3.5.0",
|
||||
"grunt": "^1.0.1",
|
||||
"grunt-mocha-test": "^0.13.2",
|
||||
"chai": "^4.2.0",
|
||||
"eslint": "^5.16.0",
|
||||
"istanbul": "^0.4.5",
|
||||
"mocha": "^3.1.2",
|
||||
"sinon": "^1.17.6"
|
||||
"mocha": "^6.1.4",
|
||||
"sinon": "^2.4.1"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,6 +5,7 @@ const async = require('async');
|
|||
const azure = require('azure-storage');
|
||||
const memoryCache = require('memory-cache');
|
||||
const Q = require('q');
|
||||
const { Readable } = require('stream');
|
||||
const URL = require('url');
|
||||
|
||||
class AzureStorageDocStore {
|
||||
|
@ -33,7 +34,6 @@ class AzureStorageDocStore {
|
|||
upsert(document) {
|
||||
const deferred = Q.defer();
|
||||
const blobName = this._getBlobNameFromDocument(document);
|
||||
const text = JSON.stringify(document);
|
||||
const blobMetadata = {
|
||||
version: document._metadata.version,
|
||||
etag: document._metadata.etag,
|
||||
|
@ -47,23 +47,22 @@ class AzureStorageDocStore {
|
|||
blobMetadata.extra = JSON.stringify(document._metadata.extra);
|
||||
}
|
||||
const options = { metadata: blobMetadata, contentSettings: { contentType: 'application/json' } };
|
||||
this.service.createBlockBlobFromText(this.name, blobName, text, options, (error, result, response) => {
|
||||
if (error) {
|
||||
const dataStream = new Readable();
|
||||
dataStream.push(JSON.stringify(document));
|
||||
dataStream.push(null);
|
||||
dataStream
|
||||
.pipe(this.service.createWriteStreamToBlockBlob(this.name, blobName, options))
|
||||
.on('error', (error) => {
|
||||
return deferred.reject(error);
|
||||
}
|
||||
memoryCache.put(document._metadata.url, { etag: document._metadata.etag, document: document }, this.options.ttl);
|
||||
deferred.resolve(blobName);
|
||||
});
|
||||
})
|
||||
.on('finish', () => {
|
||||
deferred.resolve(blobName);
|
||||
});
|
||||
return deferred.promise;
|
||||
}
|
||||
|
||||
// TODO: Consistency on whether key is a URL or URN
|
||||
get(type, key) {
|
||||
const cached = memoryCache.get(key);
|
||||
if (cached) {
|
||||
return Q(cached.document);
|
||||
}
|
||||
|
||||
const deferred = Q.defer();
|
||||
const blobName = this._getBlobNameFromKey(type, key);
|
||||
this.service.getBlobToText(this.name, blobName, (error, text, blob, response) => {
|
||||
|
@ -71,7 +70,6 @@ class AzureStorageDocStore {
|
|||
return deferred.reject(error);
|
||||
}
|
||||
const result = JSON.parse(text);
|
||||
memoryCache.put(key, { etag: result._metadata.etag, document: result }, this.options.ttl);
|
||||
deferred.resolve(result);
|
||||
});
|
||||
return deferred.promise;
|
||||
|
@ -79,11 +77,6 @@ class AzureStorageDocStore {
|
|||
|
||||
// TODO: Consistency on whether key is a URL or URN
|
||||
etag(type, key) {
|
||||
const cached = memoryCache.get(key);
|
||||
if (cached) {
|
||||
return Q(cached.etag);
|
||||
}
|
||||
|
||||
const deferred = Q.defer();
|
||||
const blobName = this._getBlobNameFromKey(type, key);
|
||||
this.service.getBlobMetadata(this.name, blobName, (error, blob, response) => {
|
||||
|
@ -100,7 +93,6 @@ class AzureStorageDocStore {
|
|||
const deferred = Q.defer();
|
||||
async.doWhilst(
|
||||
callback => {
|
||||
var started = new Date().getTime();
|
||||
this.service.listBlobsSegmented(this.name, continuationToken, { include: azure.BlobUtilities.BlobListingDetails.METADATA, location: azure.StorageUtilities.LocationMode.PRIMARY_THEN_SECONDARY }, function (err, result, response) {
|
||||
// metricsClient.trackDependency(url.parse(blobService.host.primaryHost).hostname, 'listBlobsSegmented', (new Date().getTime() - started), !err, "Http", { 'Container name': 'download', 'Continuation token present': result == null ? false : (result.continuationToken != null), 'Blob count': result == null ? 0 : result.entries.length });
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче