Merge ospo-ghcrawler into ghcrawler

William Bartholomew 2017-03-09 16:22:33 -08:00
Parent ba5d1cddf1
Commit 7393e7c7fe
49 changed files with 4937 additions and 4 deletions

1
.dockerignore Normal file

@@ -0,0 +1 @@
node_modules/

2
.gitignore vendored

@@ -2,3 +2,5 @@ node_modules/
typings/
npm-debug.log
coverage/
.DS_Store
nohup*

211
.vscode/launch.json vendored

@@ -1,6 +1,215 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Mocha",
"type": "node",
"request": "launch",
"program": "${workspaceRoot}/node_modules/mocha/bin/_mocha",
"stopOnEntry": false,
"args": [
"${workspaceRoot}/test/unit/*.js"
],
"cwd": "${workspaceRoot}",
"runtimeExecutable": null,
"runtimeArgs": [
"--nolazy"
],
"env": {
"NODE_ENV": "localhost"
},
"console": "internalConsole"
},
{
"name": "Coverage",
"type": "node",
"request": "launch",
"program": "${workspaceRoot}/node_modules/istanbul/lib/cli.js",
"stopOnEntry": false,
"args": [
"cover",
"${workspaceRoot}/node_modules/mocha/bin/_mocha",
"${workspaceRoot}/test/unit/*.js"
],
"cwd": "${workspaceRoot}",
"runtimeExecutable": null,
"runtimeArgs": [
"--nolazy"
],
"env": {
"NODE_ENV": "localhost"
},
"console": "internalConsole"
},
{
"name": "Docker Infrastructure",
"type": "node",
"request": "launch",
"program": "${workspaceRoot}/bin/www.js",
"stopOnEntry": false,
"args": [],
"cwd": "${workspaceRoot}",
"preLaunchTask": null,
"runtimeExecutable": null,
"runtimeArgs": [
"--nolazy"
],
"env": {
"NODE_ENV": "localhost",
"CRAWLER_MODE": "StandardWithoutEvents",
"CRAWLER_NAME": "crawlerdocker",
"CRAWLER_QUEUE_PREFIX": "crawlerdocker",
"CRAWLER_OPTIONS_PROVIDER": "redis",
"CRAWLER_REDIS_TLS": "",
"CRAWLER_REDIS_URL": "localhost",
"CRAWLER_REDIS_PORT": "6379",
"CRAWLER_STORE_PROVIDER": "mongo",
"CRAWLER_MONGO_URL": "mongodb://localhost:27017/ghcrawler",
"CRAWLER_QUEUE_PROVIDER": "amqp",
"CRAWLER_AMQP_URL": "amqp://localhost:5672",
"CRAWLER_RABBIT_MANAGER_ENDPOINT": "http://guest:guest@localhost:15672"
},
"console": "internalConsole",
"sourceMaps": false,
"outFiles": []
},
{
"name": "Local Standard",
"type": "node",
"request": "launch",
"program": "${workspaceRoot}/bin/www.js",
"stopOnEntry": false,
"args": [],
"cwd": "${workspaceRoot}",
"preLaunchTask": null,
"runtimeExecutable": null,
"runtimeArgs": [
"--nolazy"
],
"env": {
"NODE_ENV": "localhost",
"CRAWLER_MODE": "Standard",
"CRAWLER_OPTIONS_PROVIDER": "redis"
},
"console": "internalConsole",
"sourceMaps": false,
"outFiles": []
},
{
"name": "Prod Standard",
"type": "node",
"request": "launch",
"program": "${workspaceRoot}/bin/www.js",
"cwd": "${workspaceRoot}",
"env": {
"NODE_ENV": "production",
"CRAWLER_MODE": "Standard",
"CRAWLER_OPTIONS_PROVIDER": "redis",
"NODE_TLS_REJECT_UNAUTHORIZED": "0"
},
"console": "internalConsole"
},
{
"name": "Local StandardWithoutEvents",
"type": "node",
"request": "launch",
"program": "${workspaceRoot}/bin/www.js",
"stopOnEntry": false,
"args": [],
"cwd": "${workspaceRoot}",
"preLaunchTask": null,
"runtimeExecutable": null,
"runtimeArgs": [
"--nolazy"
],
"env": {
"NODE_ENV": "localhost",
"CRAWLER_MODE": "Standard",
"CRAWLER_EVENT_PROVIDER": "none",
"CRAWLER_OPTIONS_PROVIDER": "redis",
"DEBUG.off": "amqp10:client,amqp10:link:receiver"
},
"console": "internalConsole",
"sourceMaps": false,
"outFiles": []
},
{
"name": "Dev StandardWithoutEvents",
"type": "node",
"request": "launch",
"program": "${workspaceRoot}/bin/www.js",
"stopOnEntry": false,
"args": [],
"cwd": "${workspaceRoot}",
"preLaunchTask": null,
"runtimeExecutable": null,
"runtimeArgs": [
"--nolazy"
],
"env": {
"NODE_ENV": "development",
"CRAWLER_OPTIONS_PROVIDER": "redis"
},
"console": "internalConsole",
"sourceMaps": false,
"outFiles": []
},
{
"name": "In Memory",
"type": "node",
"request": "launch",
"program": "${workspaceRoot}/bin/www.js",
"stopOnEntry": false,
"args": [],
"cwd": "${workspaceRoot}",
"preLaunchTask": null,
"runtimeExecutable": null,
"runtimeArgs": [
"--nolazy"
],
"env": {
"NODE_ENV": "localhost"
},
"console": "internalConsole",
"sourceMaps": false,
"outFiles": []
},
{
"name": "Local with Webhooks",
"type": "node",
"request": "launch",
"program": "${workspaceRoot}/bin/www.js",
"stopOnEntry": false,
"args": [],
"cwd": "${workspaceRoot}",
"preLaunchTask": null,
"runtimeExecutable": null,
"runtimeArgs": [
"--nolazy"
],
"env": {
"NODE_ENV": "localhost",
"CRAWLER_MODE": "Standard",
"CRAWLER_EVENT_PROVIDER": "webhook",
"CRAWLER_OPTIONS_PROVIDER": "redis",
"NODE_TLS_REJECT_UNAUTHORIZED": "0"
},
"console": "internalConsole",
"sourceMaps": false,
"outFiles": []
},
{
"name": "Attach",
"type": "node",
"request": "attach",
"port": 5858,
"address": "localhost",
"restart": false,
"sourceMaps": false,
"outFiles": [],
"localRoot": "${workspaceRoot}",
"remoteRoot": null
},
{
"name": "Mocha",
"type": "node",
@@ -57,7 +266,7 @@
"name": "Attach to Process",
"type": "node",
"request": "attach",
"processId": "${command.PickProcess}",
"processId": "${command:PickProcess}",
"port": 5858,
"sourceMaps": false,
"outFiles": []

5
.vscode/settings.json vendored

@@ -5,5 +5,8 @@
},
"editor.folding": false,
"editor.tabSize": 2,
"editor.detectIndentation": false
"editor.detectIndentation": false,
"editor.formatOnType": true,
"editor.insertSpaces": true,
"files.trimTrailingWhitespace": true
}

19
Dockerfile Normal file

@@ -0,0 +1,19 @@
FROM node:6.9.5
EXPOSE 3000
EXPOSE 5858
RUN mkdir -p /opt/ospo-ghcrawler
# use changes to package.json to force Docker not to use the cache
# when we change our application's nodejs dependencies:
RUN npm install -g nodemon
ADD package.json /tmp/package.json
RUN cd /tmp && npm install --production
RUN cp -a /tmp/node_modules /opt/ospo-ghcrawler/
WORKDIR /opt/ospo-ghcrawler
ENV PATH="/opt/ospo-ghcrawler/bin:$PATH"
ADD . /opt/ospo-ghcrawler
CMD ["npm", "start"]

159
README.md

@@ -2,6 +2,165 @@
![License](https://img.shields.io/github/license/Microsoft/ghcrawler.svg)
![Downloads](https://img.shields.io/npm/dt/ghcrawler.svg)
# OSPO GHCrawler
[GHCrawler](https://github.com/Microsoft/ghcrawler) is a service that systematically walks GitHub APIs and harvests data about a specified set of repos and orgs. What you find here is the *getting started* infrastructure for running that system. The crawler can be configured to use a variety of queuing technologies (e.g., AMQP 1.0 and AMQP 0.9 compatible queues such as Azure Service Bus and RabbitMQ, respectively) and storage technologies (e.g., Azure Blob and MongoDB). You can create your own infrastructure plugins to use different technologies.
# Running in-memory
The easiest way to try out the crawler is to run it in memory. You can get up and running in a couple of minutes. This approach does not scale and is not persistent, but it's dead simple.
1. Clone the [Microsoft/ospo-ghcrawler](https://github.com/Microsoft/ospo-ghcrawler.git) repo.
1. Run ```npm install``` in the cloned repo directory to install the prerequisites.
1. Run the crawler using ```node bin/www.js```.
Once the service is up and running, you should see some crawler-related messages in the console output every few seconds. You can control the crawler using either the ```cc``` command line tool or a browser-based dashboard, both of which are described below. Note that since you are running in memory, if you kill the crawler process, all work will be lost. This mode is great for playing around with the crawler or testing.
# Running Crawler-In-A-Box (CIABatta)
If you want to persist the data gathered and create some insight dashboards in a small to medium production system, you can run the crawler in Docker with Mongo, Rabbit, and Redis using the Crawler-in-a-box (CIABatta) approach. This setup also includes Metabase for building browser-based insights and gives you a browser-based control panel for observing and controlling the crawler service.
***NOTE*** This is an evolving solution and the steps for running will be simplified with published, ready-to-use images on Docker Hub. For now, follow these steps:
1. Clone the [Microsoft/ospo-ghcrawler](https://github.com/Microsoft/ospo-ghcrawler.git) and [Microsoft/crawler-dashboard](https://github.com/Microsoft/crawler-dashboard.git) repos.
1. In a command prompt go to ```ospo-ghcrawler/docker``` and run ```docker-compose up```.
Once the containers are up and running, you should see some crawler-related messages in the container's console output every few seconds. You can control the crawler using either the ```cc``` command line tool or a browser-based dashboard, both of which are described below.
You can also hook up directly to the crawler infrastructure. By default the containers expose a number of endpoints at different ports on localhost. Note that if you have trouble starting the containers due to port conflicts, either shut down your services using these ports or edit the docker/docker-compose.yml file to change the ports.
* Crawler Dashboard (4000) -- Open http://localhost:4000 in your browser to see what's happening and control some behaviors and configurations
* Crawler (3000) -- Direct access to the REST API for the crawler
* MongoDB (27017 and 28017) -- Direct access to the Mongo DB
* Redis (6379) -- Observe what's happening in Redis. Not much else for you to do here
* RabbitMQ (5672 and 15672) -- Hit http://localhost:15672 with a browser to see and manage the RabbitMQ queues
* Metabase (5000) -- Hit http://localhost:5000 to get live insights in your browser via Metabase
# Deploying native
For ultimate flexibility, the crawler and associated bits can be run directly on VMs or as an app service. This structure typically uses cloud-based infrastructure for queuing, storage and Redis. For example, this project comes with adapters for Azure Service Bus queuing and Azure Blob storage. The APIs on these adapters are very slim, so it is easy for you to implement (and contribute) more.
***Setting up this operating mode is a bit more involved and is not yet documented.***
# Controlling the crawler
Given a running crawler service (see above), you can control it using either a simple command line app or a browser-based dashboard.
## ```cc``` command line
The *crawler-cli* (aka ```cc```) can run interactively or as a single-command processor and enables a number of basic operations. For now the crawler-cli is not published as an npm package. Instead, [clone its repo](https://github.com/Microsoft/crawler-cli.git), run ```npm install``` and run the command line using
```
node bin/cc -i
```
The app's built-in help has general usage info and more details can be found in [the project's readme](https://github.com/Microsoft/crawler-cli/blob/develop/README.md). A typical command sequence, shown in the snippet below, starts ```cc``` in interactive mode, configures the crawler with a set of GitHub tokens, sets the org filtering, and then queues and starts the processing of the org.
```
> node bin/cc -i
http://localhost:3000> tokens 43984b2344ca575d0f0e097efd97#public 972bbdfe098098fa9ce082309#admin
http://localhost:3000> orgs contoso-d
http://localhost:3000> queue contoso-d
http://localhost:3000> start 5
http://localhost:3000> exit
>
```
## Browser dashboard
The crawler dashboard gives you live feedback on what the crawler is doing as well as better control over the crawler's queues and configuration. Some configurations (e.g., Docker) include and start the dashboard for free. If you need to deploy the dashboard explicitly, clone the [Microsoft/crawler-dashboard](https://github.com/Microsoft/crawler-dashboard.git) repo and follow the instructions in [the README found there](https://github.com/Microsoft/crawler-dashboard/blob/develop/README.md).
Once the dashboard service is up and running, point your browser at the dashboard endpoint (http://localhost:4000 by default).
Note that the dashboard does not report queue message rates (top right graph) when used with the memory-based crawler service, as that mechanism requires Redis to record activity.
# Tips
* Clearing queues -- In its normal Redis configuration, the crawler uses Redis to keep track of what is in the queues. This deduplicates queuing and dramatically reduces the number of requests needing to be processed. In the crawler dashboard it is possible to recreate the queues. This is a convenient way to clear them out. When doing this, however, you must also clear the associated keys from Redis that cache the queue content. Typically those keys are of the form ```<environment>:<queue provider>:<path>```. So for an AMQP (i.e., RabbitMQ) setup running on the local machine the keys would look like ```localhost:amqp:<path>```. Use whatever Redis client you like to clear these keys (possibly thousands); see the sketch after these tips for one way to do it. We use Redis Desktop Manager but any tool that does the job will do. Adding this Redis clearing to the dashboard's recreate function is on the list of things to implement.
* Starting the crawler -- Due to some caching issues, you may need to ```Stop``` the crawler in the dashboard before you can start it using the update button as described above.
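For illustration, here is a minimal sketch of that key clearing using the ```redis``` npm client. The key pattern and batch size are assumptions based on the description above, not a supported tool.
```
// Hypothetical cleanup sketch: scan for cached queue keys and delete them in batches.
// Assumes a local Redis and the localhost:amqp:* key pattern described above.
const redis = require('redis');
const client = redis.createClient(6379, 'localhost');

function clearQueueKeys(pattern, cursor = '0') {
  client.scan(cursor, 'MATCH', pattern, 'COUNT', '1000', (error, reply) => {
    if (error) throw error;
    const [next, keys] = reply;
    if (keys.length) client.del(keys);
    if (next !== '0') return clearQueueKeys(pattern, next);
    client.quit();
  });
}

clearQueueKeys('localhost:amqp:*');
```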
# Known issues
It is clearly early days for the crawler so there are a number of things left to do. These will be collected in repo issues. Note that the remaining issue set has yet to be populated.
Broadly speaking there are several types of work:
* Documentation -- The crawler code itself is relatively straightforward but some of the architecture, control and extensibility points are not called out.
* Ease of use -- There are a number of places where running and managing the crawler is just clumsy and error-prone
* Completeness -- There are a few functional gaps in certain scenarios that need to be addressed.
* Docker configuration -- Several items remain to make the Docker configuration real
* Analysis and insights -- Metabase is supplied in the Docker configuration but relatively little has been done with analyzing the harvested data.
## Runtime
### Docker items
1. Data persistence
1. Create separate docker-compose for general usage vs development
* Development should use local source code and enable Node debugging
* Both should allow end to end crawling with a single command (e.g. crawl orgName githubToken)
1. Publish images for Crawler Dashboard and Crawler to Docker Hub
## Updating the default Metabase for Docker configurations:
The Metabase configured by default has some canned queries and a dashboard. If you want to clear that out and start fresh, do the following:
1. Ensure you're starting from a completely clean container (docker-compose down && docker-compose up).
1. Crawl a small org to populate Mongo so you have schema/sample data to work with.
1. Open the Metabase URL and configure the questions, dashboard, etc. you want
1. REMEMBER: Any changes you make will be persisted
1. Copy the Metabase database by changing to the docker/metabase folder in the ospo-ghcrawler repository and running:
```docker cp docker_metabase_1:/var/opt/metabase/dockercrawler.db.mv.db .```
Production Docker deployment using Kubernetes or the like has been discussed but not yet planned. If you have a desire to do this, please open an issue or, better yet, a PR and let's see what can be done.
# Working with the code
### Build
`npm install`
### Unit test
`npm test`
### Integration test
`npm run integration`
### Run
`node ./bin/www.js`
# Stale content
1. Start the service crawling by going to the Crawler Dashboard at [http://localhost:4000](http://localhost:4000). On the righthand side, change ```crawler/count``` to 1 and click the ```Update``` button.
## Configuration
```
{
"NODE_ENV": "localhost",
"CRAWLER_MODE": "Standard",
"CRAWLER_OPTIONS_PROVIDER": ["defaults" | "memory" | "redis"],
"CRAWLER_INSIGHTS_KEY": "[SECRET]",
"CRAWLER_ORGS_FILE": "../orgs",
"CRAWLER_GITHUB_TOKENS": "[SECRET]",
"CRAWLER_REDIS_URL": "peoplesvc-dev.redis.cache.windows.net",
"CRAWLER_REDIS_ACCESS_KEY": "[SECRET]",
"CRAWLER_REDIS_PORT": 6380,
"CRAWLER_QUEUE_PROVIDER": "amqp10",
"CRAWLER_AMQP10_URL": "amqps://RootManageSharedAccessKey:[SECRET]@ghcrawlerdev.servicebus.windows.net",
"CRAWLER_QUEUE_PREFIX": "ghcrawlerdev",
"CRAWLER_STORE_PROVIDER": "azure",
"CRAWLER_STORAGE_NAME": "ghcrawlerdev",
"CRAWLER_STORAGE_ACCOUNT": "ghcrawlerdev",
"CRAWLER_STORAGE_KEY": "[SECRET]",
"CRAWLER_DOCLOG_STORAGE_ACCOUNT": "ghcrawlerdev",
"CRAWLER_DOCLOG_STORAGE_KEY": "[SECRET]"
}
```
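As a sketch of how these settings are consumed (assuming painless-config's environment-variable lookup, so exporting the variables above has the same effect as a config file), provider selection looks roughly like:
```
const config = require('painless-config');

// Each CRAWLER_* setting is looked up by name with a code-level default.
const storeProvider = config.get('CRAWLER_STORE_PROVIDER') || 'azure';
const queueProvider = config.get('CRAWLER_QUEUE_PROVIDER') || 'amqp10';
console.log(`store: ${storeProvider}, queuing: ${queueProvider}`);
```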
# Contributing
The project team is more than happy to take contributions and suggestions.
To start working, run ```npm install``` in the repository folder to install the required dependencies. See the usage section for pointers on how to run.
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
# GHCrawler
A robust GitHub API crawler that walks a queue of GitHub entities transitively retrieving and storing their contents. GHCrawler is great for:

82
app.js Normal file

@@ -0,0 +1,82 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const appInsights = require('applicationinsights');
const auth = require('./middleware/auth');
const bodyParser = require('body-parser');
const config = require('painless-config');
const CrawlerService = require('ghcrawler').crawlerService;
const express = require('express');
const logger = require('morgan');
const mockInsights = require('./providers/logger/mockInsights');
const CrawlerFactory = require('./lib/crawlerFactory');
const sendHelper = require('./middleware/sendHelper');
auth.initialize(config.get('CRAWLER_SERVICE_AUTH_TOKEN') || 'secret', config.get('CRAWLER_SERVICE_FORCE_AUTH'));
mockInsights.setup(config.get('CRAWLER_INSIGHTS_KEY') || 'mock', true);
const mode = config.get('CRAWLER_MODE') || '';
const service = CrawlerFactory.createService(mode);
const app = express();
app.use(logger('dev'));
app.use(sendHelper());
// If we should be listening for webhooks, add the route before the json body parser so we get the raw bodies.
// Note also that the GitHub doc says events are capped at 5mb
app.use('/webhook', bodyParser.raw({ limit: '5mb', type: '*/*' }), require('./routes/webhook')(service, config.get('CRAWLER_WEBHOOK_SECRET')));
// It's safe to set the limit to 2mb.
app.use(bodyParser.json({ limit: '2mb' }));
app.use('/status', require('./routes/status')(service));
app.use('/config', require('./routes/config')(service));
app.use('/requests', require('./routes/requests')(service));
app.use('/queues', require('./routes/queues')(service));
app.use('/deadletters', require('./routes/deadletters')(service));
// Keep AlwaysOn from flooding logs with errors
app.get('/', function (request, response, next) {
response.helpers.send.noContent();
});
// Catch 404 and forward to error handler
const requestHandler = function (request, response, next) {
let error = { message: 'Not Found' };
error.status = 404;
error.success = false;
next(error);
};
app.use(requestHandler);
// Hang the service init code off a route middleware. Doesn't really matter which one.
requestHandler.init = (app, callback) => {
service.ensureInitialized().then(
() => {
service.run();
console.log('Service initialized');
// call the callback but with no args. An arg indicates an error.
callback();
},
error => {
console.log(`Service initialization error: ${error.message}`);
console.dir(error);
callback(error);
});
};
// Error handlers
const handler = function (error, request, response, next) {
appInsights.client.trackException(error, { name: 'SvcRequestFailure' });
if (response.headersSent) {
return next(error);
}
response.status(error.status || 500);
let propertiesToSerialize = ['success', 'message'];
if (app.get('env') !== 'production') {
propertiesToSerialize.push('stack');
}
// Properties on Error object aren't enumerable so need to explicitly list properties to serialize
response.send(JSON.stringify(error, propertiesToSerialize));
response.end();
};
app.use(handler);
module.exports = app;

88
bin/www.js Normal file

@@ -0,0 +1,88 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const app = require('../app');
const config = require('painless-config');
const http = require('http');
const init = require('express-init');
/**
* Get port from environment and store in Express.
*/
let port = normalizePort(config.get('CRAWLER_SERVICE_PORT') || process.env.PORT || '3000');
port = port === 'random' ? null : port;
app.set('port', port);
const server = http.createServer(app);
// initialize the apps (if they have async init functions) and start listening
init(app, error => {
if (error) {
console.log('Error initializing the Express app: ' + error);
throw new Error(error);
}
server.listen(port);
});
server.on('error', onError);
server.on('listening', onListening);
/**
* Normalize a port into a number, string, or false.
*/
function normalizePort(val) {
const normalizedPort = parseInt(val, 10);
if (isNaN(normalizedPort)) {
// named pipe
return val;
}
if (normalizedPort >= 0) {
// port number
return normalizedPort;
}
return false;
}
/**
* Event listener for HTTP server 'error' event.
*/
function onError(error) {
if (error.syscall !== 'listen') {
throw error;
}
const bind = typeof port === 'string'
? 'Pipe ' + port
: 'Port ' + port;
// handle specific listen errors with friendly messages
switch (error.code) {
case 'EACCES':
console.error(bind + ' requires elevated privileges');
process.exit(1);
break;
case 'EADDRINUSE':
console.error(bind + ' is already in use');
process.exit(1);
break;
default:
throw error;
}
}
/**
* Event listener for HTTP server 'listening' event.
*/
function onListening() {
const addr = server.address();
const bind = typeof addr === 'string'
? 'pipe ' + addr
: 'port ' + addr.port;
console.log(`Crawler service listening on ${bind}`);
}

56
docker/docker-compose.dev.yml Normal file

@@ -0,0 +1,56 @@
mongo:
image: mongo:latest
ports:
- "27017:27017"
- "28017:28017"
redis:
image: redis:latest
ports:
- "6379:6379"
rabbitmq:
image: rabbitmq:management
ports:
- "5672:5672"
- "15672:15672"
metabase:
image: metabase/metabase:latest
ports:
- "5000:3000"
links:
- mongo
dashboard:
build: ../../crawler-dashboard
ports:
- "4000:4000"
environment:
- NODE_ENV=localhost
- DEBUG_ALLOW_HTTP=1
- CRAWLER_REDIS_URL=redis
- CRAWLER_NAME=crawlerdocker
links:
- redis
crawler:
build: ..
command: node --debug=5858 ./bin/www.js
links:
- mongo
- redis
- rabbitmq
- dashboard
ports:
- "3000:3000"
- "5858:5858"
volumes:
- ../..:/opt
environment:
- NODE_ENV=localhost
- CRAWLER_NAME=crawlerdocker
- CRAWLER_QUEUE_PREFIX=crawlerdocker
- CRAWLER_MODE=StandardWithoutEvents
- CRAWLER_REDIS_URL=redis
- CRAWLER_STORE_PROVIDER=mongo
- CRAWLER_MONGO_URL=mongodb://mongo:27017/ghcrawler
- CRAWLER_QUEUE_PROVIDER=amqp
- CRAWLER_AMQP_URL=amqp://rabbitmq:5672
- CRAWLER_GITHUB_TOKENS=${CRAWLER_GITHUB_TOKENS}
- CRAWLER_OPTIONS_PROVIDER=redis

54
docker/docker-compose.yml Normal file

@@ -0,0 +1,54 @@
mongo:
image: mongo:latest
ports:
- "27017:27017"
- "28017:28017"
redis:
image: redis:latest
ports:
- "6379:6379"
rabbitmq:
image: rabbitmq:management
ports:
- "5672:5672"
- "15672:15672"
metabase:
build: ./metabase
ports:
- "5000:3000"
links:
- mongo
dashboard:
build: ../../crawler-dashboard
ports:
- "4000:4000"
environment:
- NODE_ENV=localhost
- DEBUG_ALLOW_HTTP=1
- CRAWLER_REDIS_URL=redis
- CRAWLER_NAME=crawlerdocker
- CRAWLER_SERVICE_URL=http://crawler:3000
links:
- redis
- crawler
crawler:
build: ..
links:
- mongo
- redis
- rabbitmq
ports:
- "3000:3000"
environment:
- NODE_ENV=localhost
- CRAWLER_NAME=crawlerdocker
- CRAWLER_MODE=StandardWithoutEvents
- CRAWLER_REDIS_URL=redis
- CRAWLER_STORE_PROVIDER=mongo
- CRAWLER_MONGO_URL=mongodb://mongo:27017/ghcrawler
- CRAWLER_QUEUE_PREFIX=crawlerdocker
- CRAWLER_QUEUE_PROVIDER=amqp
- CRAWLER_AMQP_URL=amqp://rabbitmq:5672
- CRAWLER_RABBIT_MANAGER_ENDPOINT=http://guest:guest@rabbitmq:15672
- CRAWLER_GITHUB_TOKENS=${CRAWLER_GITHUB_TOKENS}
- CRAWLER_OPTIONS_PROVIDER=redis

5
docker/metabase/Dockerfile Normal file

@@ -0,0 +1,5 @@
FROM metabase/metabase:latest
EXPOSE 3000
RUN mkdir -p /var/opt/metabase
ENV MB_DB_FILE=/var/opt/metabase/dockercrawler.db
COPY dockercrawler.db.mv.db /var/opt/metabase/

Binary data
docker/metabase/dockercrawler.db.mv.db Normal file

Binary file not shown.

681
lib/crawlerFactory.js Normal file

@@ -0,0 +1,681 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const mockInsights = require('../providers/logger/mockInsights');
const LimitedTokenFactory = require('../providers/fetcher/limitedTokenFactory');
const TokenFactory = require('../providers/fetcher/tokenFactory');
const ComputeLimiter = require('../providers/limiting/computeLimiter');
const InMemoryRateLimiter = require('../providers/limiting/inmemoryRateLimiter');
const Amqp10Queue = require('../providers/queuing/amqp10Queue');
const AttenuatedQueue = require('../providers/queuing/attenuatedQueue');
const InMemoryCrawlQueue = require('../providers/queuing/inmemorycrawlqueue');
const RabbitQueueManager = require('../providers/queuing/rabbitQueueManager');
const RedisRequestTracker = require('../providers/queuing/redisRequestTracker');
const ServiceBusQueueManager = require('../providers/queuing/serviceBusQueueManager');
const InMemoryDocStore = require('../providers/storage/inmemoryDocStore');
const DeltaStore = require('../providers/storage/deltaStore');
const MongoDocStore = require('../providers/storage/mongodocstore');
const AzureStorageDocStore = require('../providers/storage/storageDocStore');
const UrlToUrnMappingStore = require('../providers/storage/urlToUrnMappingStore');
const amqp10 = require('amqp10');
const appInsights = require('applicationinsights');
const aiLogger = require('winston-azure-application-insights').AzureApplicationInsightsLogger;
const AzureStorage = require('azure-storage');
const config = require('painless-config');
const Crawler = require('ghcrawler').crawler;
const CrawlerService = require('ghcrawler').crawlerService;
const fs = require('fs');
const GitHubFetcher = require('../index').githubFetcher;
const GitHubProcessor = require('../index').githubProcessor;
const ip = require('ip');
const moment = require('moment');
const policy = require('../index').policy;
const Q = require('q');
const QueueSet = require('../index').queueSet;
const redis = require('redis');
const RedisMetrics = require('redis-metrics');
const RedisRateLimiter = require('redis-rate-limiter');
const redlock = require('redlock');
const RefreshingConfig = require('refreshing-config');
const RefreshingConfigRedis = require('refreshing-config-redis');
const request = require('request');
const Request = require('../index').request;
const requestor = require('ghrequestor');
const winston = require('winston');
const AmqpClient = amqp10.Client;
const AmqpPolicy = amqp10.Policy;
let factoryLogger = null;
let redisClient = null;
class CrawlerFactory {
static getDefaultOptions() {
return {
crawler: {
name: config.get('CRAWLER_NAME') || 'crawler',
count: 0,
pollingDelay: 5000,
processingTtl: 60 * 1000,
promiseTrace: false,
orgList: CrawlerFactory.loadOrgs()
},
fetcher: {
tokenLowerBound: 50,
metricsStore: 'redis',
callCapStore: 'memory',
callCapWindow: 1, // seconds
callCapLimit: 30, // calls
computeLimitStore: 'memory',
computeWindow: 15, // seconds
computeLimit: 15000, // milliseconds
baselineFrequency: 60, // seconds
deferDelay: 500
},
queuing: {
provider: config.get('CRAWLER_QUEUE_PROVIDER') || 'amqp10',
queueName: config.get('CRAWLER_QUEUE_PREFIX') || 'crawler',
credit: 100,
weights: { events: 10, immediate: 3, soon: 2, normal: 3, later: 2 },
messageSize: 240,
parallelPush: 10,
pushRateLimit: 200,
metricsStore: 'redis',
events: {
provider: config.get('CRAWLER_EVENT_PROVIDER') || 'webhook',
topic: config.get('CRAWLER_EVENT_TOPIC_NAME') || 'crawler',
queueName: config.get('CRAWLER_EVENT_QUEUE_NAME') || 'crawler'
},
attenuation: {
ttl: 3000
},
tracker: {
// driftFactor: 0.01,
// retryCount: 3,
// retryDelay: 200,
// locking: true,
// lockTtl: 1000,
ttl: 60 * 60 * 1000
}
},
storage: {
ttl: 3 * 1000,
provider: config.get('CRAWLER_STORE_PROVIDER') || 'azure',
delta: {
provider: config.get('CRAWLER_DELTA_PROVIDER')
}
},
locker: {
provider: 'redis',
retryCount: 3,
retryDelay: 200
}
};
}
static createService(name) {
factoryLogger.info('appInitStart');
const crawlerName = config.get('CRAWLER_NAME') || 'crawler';
const optionsProvider = config.get('CRAWLER_OPTIONS_PROVIDER') || 'memory';
const subsystemNames = ['crawler', 'fetcher', 'queuing', 'storage', 'locker'];
const crawlerPromise = CrawlerFactory.createRefreshingOptions(crawlerName, subsystemNames, optionsProvider).then(options => {
factoryLogger.info(`creating refreshingOption completed`);
name = name || 'InMemory';
factoryLogger.info(`begin create crawler of type ${name}`);
const crawler = CrawlerFactory[`create${name}Crawler`](options);
return [crawler, options];
});
return new CrawlerService(crawlerPromise);
}
static createStandardCrawler(options) {
factoryLogger.info(`creating standard Crawler Started`);
return CrawlerFactory.createCrawler(options);
}
static createInMemoryCrawler(options) {
CrawlerFactory._configureInMemoryOptions(options);
return CrawlerFactory.createCrawler(options);
}
static _configureInMemoryOptions(options) {
factoryLogger.info(`create in memory options`);
options.crawler.count = 1;
options.fetcher.computeLimitStore = 'memory';
options.fetcher.metricsStore = null;
delete options.queuing.events.provider;
options.queuing.provider = 'memory';
options.queuing.metricsStore = null;
options.locker.provider = 'memory';
options.storage.provider = 'memory';
return options;
}
static _decorateOptions(options) {
Object.getOwnPropertyNames(options).forEach(key => {
const logger = CrawlerFactory.createLogger(true);
options[key].logger = logger;
const capitalized = key.charAt(0).toUpperCase() + key.slice(1);
const metricsFactory = CrawlerFactory[`create${capitalized}Metrics`];
if (metricsFactory) {
factoryLogger.info('Creating metrics factory', { factory: capitalized });
logger.metrics = metricsFactory(options.crawler.name, options[key]);
}
});
}
static createCrawler(options, { queues = null, store = null, deadletters = null, locker = null, fetcher = null, processor = null } = {}) {
CrawlerFactory._decorateOptions(options);
queues = queues || CrawlerFactory.createQueues(options.queuing);
store = store || CrawlerFactory.createStore(options.storage);
deadletters = deadletters || CrawlerFactory.createDeadletterStore(options.storage);
locker = locker || CrawlerFactory.createLocker(options.locker);
fetcher = fetcher || CrawlerFactory.createGitHubFetcher(store, options.fetcher);
processor = processor || new GitHubProcessor(store);
const result = new Crawler(queues, store, deadletters, locker, fetcher, processor, options.crawler);
result.initialize = CrawlerFactory._initialize.bind(result);
return result;
}
static _initialize() {
return Q.try(this.queues.subscribe.bind(this.queues))
.then(this.store.connect.bind(this.store))
.then(this.deadletters.connect.bind(this.deadletters));
}
static createRefreshingOptions(crawlerName, subsystemNames, provider = 'redis') {
factoryLogger.info(`creating refreshing options with crawlerName:${crawlerName}`);
const result = {};
provider = provider.toLowerCase();
return Q.all(subsystemNames.map(subsystemName => {
factoryLogger.info(`creating refreshing options promise with crawlerName:${crawlerName} subsystemName ${subsystemName} provider ${provider}`);
let config = null;
if (provider === 'redis') {
config = CrawlerFactory.createRedisRefreshingConfig(crawlerName, subsystemName);
} else if (provider === 'memory') {
config = CrawlerFactory.createInMemoryRefreshingConfig();
} else {
throw new Error(`Invalid options provider setting ${provider}`);
}
return config.getAll().then(values => {
factoryLogger.info(`creating refreshingOption config get completed`);
const defaults = CrawlerFactory.getDefaultOptions();
return CrawlerFactory.initializeSubsystemOptions(values, defaults[subsystemName]).then(resolved => {
factoryLogger.info(`subsystem options initialized`);
result[subsystemName] = values;
});
});
})).then(() => { return result; });
}
static initializeSubsystemOptions(config, defaults) {
if (Object.getOwnPropertyNames(config).length > 1) {
return Q(config);
}
return Q.all(Object.getOwnPropertyNames(defaults).map(optionName => {
return config._config.set(optionName, defaults[optionName]);
})).then(() => { return config._config.getAll(); });
}
static createRedisRefreshingConfig(crawlerName, subsystemName) {
factoryLogger.info('Create refreshing redis config', { crawlerName: crawlerName, subsystemName: subsystemName });
const redisClient = CrawlerFactory.getRedisClient(CrawlerFactory.createLogger(true));
const key = `${crawlerName}:options:${subsystemName}`;
const channel = `${key}-channel`;
const configStore = new RefreshingConfigRedis.RedisConfigStore(redisClient, key);
const config = new RefreshingConfig.RefreshingConfig(configStore)
.withExtension(new RefreshingConfigRedis.RedisPubSubRefreshPolicyAndChangePublisher(redisClient, channel));
return config;
}
static createInMemoryRefreshingConfig(values = {}) {
factoryLogger.info('create in memory refreshing config');
const configStore = new RefreshingConfig.InMemoryConfigStore(values);
const config = new RefreshingConfig.RefreshingConfig(configStore)
.withExtension(new RefreshingConfig.InMemoryPubSubRefreshPolicyAndChangePublisher());
return config;
}
static createGitHubFetcher(store, options) {
factoryLogger.info('create github fetcher');
const requestor = CrawlerFactory.createRequestor();
const tokenFactory = CrawlerFactory.createTokenFactory(options);
const limiter = CrawlerFactory.createComputeLimiter(options);
return new GitHubFetcher(requestor, store, tokenFactory, limiter, options);
}
static createTokenFactory(options) {
factoryLogger.info('create token factory');
const factory = new TokenFactory(config.get('CRAWLER_GITHUB_TOKENS'), options);
const limiter = CrawlerFactory.createTokenLimiter(options);
return new LimitedTokenFactory(factory, limiter, options);
}
static createRequestor() {
factoryLogger.info('create requestor');
return requestor.defaults({
// turn off the requestor's throttle management mechanism in favor of ours
forbiddenDelay: 0,
delayOnThrottle: false
});
}
static createFetcherMetrics(crawlerName, options) {
factoryLogger.info('create fetcher metrics', { metricsStore: options.metricsStore });
if (options.metricsStore !== 'redis') {
return null;
}
const metrics = new RedisMetrics({ client: CrawlerFactory.getRedisClient(options.logger) });
const names = ['fetch'];
const result = {};
names.forEach(name => {
const fullName = `${crawlerName}:github:${name}`;
result[name] = metrics.counter(fullName, { timeGranularity: 'second', namespace: 'crawlermetrics' }); // Stored in Redis as {namespace}:{name}:{period}
});
return result;
}
static createTokenLimiter(options) {
factoryLogger.info('create token limiter', { capStore: options.callCapStore });
return options.callCapStore === 'redis'
? CrawlerFactory.createRedisTokenLimiter(CrawlerFactory.getRedisClient(options.logger), options)
: CrawlerFactory.createInMemoryTokenLimiter(options);
}
static createRedisTokenLimiter(redisClient, options) {
factoryLogger.info('create redis token limiter', { callCapWindow: options.callCapWindow, callCapLimit: options.callCapLimit });
const ip = '';
return RedisRateLimiter.create({
redis: redisClient,
key: request => `${ip}:token:${request.key}`,
window: () => options.callCapWindow || 1,
limit: () => options.callCapLimit
});
}
static createInMemoryTokenLimiter(options) {
factoryLogger.info('create in memory token limiter', { callCapWindow: options.callCapWindow, callCapLimit: options.callCapLimit });
return InMemoryRateLimiter.create({
key: request => 'token:' + request.key,
window: () => options.callCapWindow || 1,
limit: () => options.callCapLimit
});
}
static createComputeLimiter(options) {
factoryLogger.info('create compute limiter', { computeLimitStore: options.computeLimitStore });
const limiter = options.computeLimitStore === 'redis'
? CrawlerFactory.createRedisComputeLimiter(CrawlerFactory.getRedisClient(options.logger), options)
: CrawlerFactory.createInMemoryComputeLimiter(options);
options.baselineUpdater = CrawlerFactory._networkBaselineUpdater.bind(null, options.logger);
return new ComputeLimiter(limiter, options);
}
static _networkBaselineUpdater(logger) {
return Q.allSettled([0, 1, 2, 3].map(number => {
return Q.delay(number * 50).then(() => {
const deferred = Q.defer();
request({
url: 'https://api.github.com/rate_limit',
headers: {
'User-Agent': 'ghrequestor'
},
time: true
}, (error, response, body) => {
if (error) {
return deferred.reject(error);
}
deferred.resolve(response.elapsedTime);
});
return deferred.promise;
});
})).then(times => {
let total = 0;
let count = 0;
for (let index in times) {
if (times[index].state === 'fulfilled') {
total += times[index].value;
count++;
}
}
const result = Math.floor(total / count);
logger.info(`New GitHub request baseline: ${result}`);
return result;
});
}
static createRedisComputeLimiter(redisClient, options) {
const address = ip.address().toString();
factoryLogger.info('create redis compute limiter', { address: address, computeWindow: options.computeWindow, computeLimit: options.computeLimit });
return RedisRateLimiter.create({
redis: redisClient,
key: request => `${address}:compute:${request.key}`,
incr: request => request.amount,
window: () => options.computeWindow || 15,
limit: () => options.computeLimit || 15000
});
}
static createInMemoryComputeLimiter(options) {
factoryLogger.info('create in memory compute limiter', { computeWindow: options.computeWindow, computeLimit: options.computeLimit });
return InMemoryRateLimiter.create({
key: request => 'compute:' + request.key,
incr: request => request.amount,
window: () => options.computeWindow || 15,
limit: () => options.computeLimit || 15000
});
}
static createStore(options) {
const provider = options.provider || 'azure';
factoryLogger.info(`Create store for provider ${provider}`);
let store = null;
switch (provider) {
case 'azure': {
store = CrawlerFactory.createRedisAndStorageStore(options);
break;
}
case 'mongo': {
store = CrawlerFactory.createMongoStore(options);
break;
}
case 'memory': {
store = new InMemoryDocStore(true);
break;
}
default: throw new Error(`Invalid store provider: ${provider}`);
}
store = CrawlerFactory.createDeltaStore(store, options);
return store;
}
static createMongoStore(options) {
return new MongoDocStore(config.get('CRAWLER_MONGO_URL'), options);
}
static createRedisAndStorageStore(options, name = null) {
factoryLogger.info(`creating azure store`, { name: name });
const baseStore = CrawlerFactory.createAzureStorageStore(options, name);
return new UrlToUrnMappingStore(baseStore, CrawlerFactory.getRedisClient(options.logger), baseStore.name, options);
}
static createAzureStorageStore(options, name = null) {
factoryLogger.info(`creating azure storage store`);
name = name || config.get('CRAWLER_STORAGE_NAME');
const account = config.get('CRAWLER_STORAGE_ACCOUNT');
const key = config.get('CRAWLER_STORAGE_KEY');
const blobService = CrawlerFactory.createBlobService(account, key);
return new AzureStorageDocStore(blobService, name, options);
}
static createDeadletterStore(options) {
const provider = options.provider || 'azure';
factoryLogger.info(`Create deadletter store for provider ${provider}`);
switch (provider) {
case 'azure': {
return CrawlerFactory.createAzureStorageStore(options, config.get('CRAWLER_STORAGE_NAME') + '-deadletter');
}
case 'mongo': {
return CrawlerFactory.createMongoStore(options);
}
case 'memory': {
return new InMemoryDocStore(true);
}
default: throw new Error(`Invalid store provider: ${provider}`);
}
}
static createDeltaStore(inner, options) {
if (!options.delta || !options.delta.provider) {
return inner;
}
factoryLogger.info(`creating delta store`);
switch (options.delta.provider) {
case 'azure': {
return CrawlerFactory.createAzureDeltaStore(inner, null, options);
}
default: throw new Error(`Invalid delta store provider: ${options.delta.provider}`);
}
}
static createAzureDeltaStore(inner, name = null, options = {}) {
name = name || config.get('CRAWLER_DELTA_STORAGE_NAME') || `${config.get('CRAWLER_STORAGE_NAME')}-log`;
const account = config.get('CRAWLER_DELTA_STORAGE_ACCOUNT') || config.get('CRAWLER_STORAGE_ACCOUNT');
const key = config.get('CRAWLER_DELTA_STORAGE_KEY') || config.get('CRAWLER_STORAGE_KEY');
factoryLogger.info('creating delta store', { name: name, account: account });
const blobService = CrawlerFactory.createBlobService(account, key);
return new DeltaStore(inner, blobService, name, options);
}
static getRedisClient(logger) {
factoryLogger.info('retrieving redis client');
if (redisClient) {
return redisClient;
}
const url = config.get('CRAWLER_REDIS_URL');
const port = config.get('CRAWLER_REDIS_PORT');
const key = config.get('CRAWLER_REDIS_ACCESS_KEY');
const tls = config.get('CRAWLER_REDIS_TLS') === 'true';
redisClient = CrawlerFactory.createRedisClient(url, key, port, tls, logger);
return redisClient;
}
static createRedisClient(url, key, port, tls, logger) {
factoryLogger.info(`creating redis client`, { url: url, port: port, tls: tls });
const options = {};
if (key) {
options.auth_pass = key;
}
if (tls) {
options.tls = {
servername: url
};
}
const redisClient = redis.createClient(port, url, options);
redisClient.on('error', error => logger.info(`Redis client error: ${error}`));
redisClient.on('reconnecting', properties => logger.info(`Redis client reconnecting: ${JSON.stringify(properties)}`));
setInterval(() => {
redisClient.ping(err => {
if (err) {
logger.info(`Redis client ping failure: ${err}`);
}
});
}, 60 * 1000);
return redisClient;
}
static createBlobService(account, key) {
factoryLogger.info(`creating blob service`);
const retryOperations = new AzureStorage.ExponentialRetryPolicyFilter();
return AzureStorage.createBlobService(account, key).withFilter(retryOperations);
}
static createLocker(options) {
factoryLogger.info(`creating locker`, { provider: options.provider });
if (options.provider === 'memory') {
return CrawlerFactory.createNolock();
}
return new redlock([CrawlerFactory.getRedisClient(options.logger)], {
driftFactor: 0.01,
retryCount: options.retryCount,
retryDelay: options.retryDelay
});
}
static createLogger(echo = false, level = 'info') {
mockInsights.setup(config.get('CRAWLER_INSIGHTS_KEY') || 'mock', echo);
const result = new winston.Logger();
result.add(aiLogger, {
insights: appInsights,
treatErrorsAsExceptions: true,
exitOnError: false,
level: level
});
return result;
}
static createRequestTracker(prefix, options) {
let locker = null;
if (options.tracker.locking) {
locker = new redlock([CrawlerFactory.getRedisClient(options.logger)], options.tracker);
} else {
locker = CrawlerFactory.createNolock();
}
return new RedisRequestTracker(prefix, CrawlerFactory.getRedisClient(options.logger), locker, options);
}
static createNolock() {
return { lock: () => null, unlock: () => { } };
}
static createQueues(options) {
const provider = options.provider || 'amqp10';
if (provider === 'amqp10') {
return CrawlerFactory.createAmqp10Queues(options);
} else if (provider === 'amqp') {
return CrawlerFactory.createAmqpQueues(options);
} else if (provider === 'memory') {
return CrawlerFactory.createMemoryQueues(options);
} else {
throw new Error(`Invalid queue provider option: ${provider}`);
}
}
static createAmqpQueues(options) {
const managementEndpoint = config.get('CRAWLER_RABBIT_MANAGER_ENDPOINT');
const url = config.get('CRAWLER_AMQP_URL');
const manager = new RabbitQueueManager(url, managementEndpoint);
const env = process.env.NODE_ENV;
const tracker = CrawlerFactory.createRequestTracker(`${env}:AMQP:${options.queueName}`, options);
return CrawlerFactory.createQueueSet(manager, tracker, options);
}
static createAmqp10Queues(options) {
const managementEndpoint = config.get('CRAWLER_SERVICEBUS_MANAGER_ENDPOINT');
const amqpUrl = config.get('CRAWLER_AMQP10_URL');
const manager = new ServiceBusQueueManager(amqpUrl, managementEndpoint);
const env = process.env.NODE_ENV;
const tracker = CrawlerFactory.createRequestTracker(`${env}:AMQP10:${options.queueName}`, options);
return CrawlerFactory.createQueueSet(manager, tracker, options);
}
static createMemoryQueues(options) {
const manager = {
createQueueChain: (name, tracker, options) => {
return CrawlerFactory.createMemoryQueue(name, options);
}
};
return CrawlerFactory.createQueueSet(manager, null, options);
}
static createQueueSet(manager, tracker, options) {
const immediate = manager.createQueueChain('immediate', tracker, options);
const soon = manager.createQueueChain('soon', tracker, options);
const normal = manager.createQueueChain('normal', tracker, options);
const later = manager.createQueueChain('later', tracker, options);
const queues = CrawlerFactory.addEventQueue(manager, [immediate, soon, normal, later], options);
return new QueueSet(queues, options);
}
static createMemoryQueue(name, options) {
return new AttenuatedQueue(new InMemoryCrawlQueue(name, options), options);
}
static addEventQueue(manager, queues, options) {
if (options.events.provider && options.events.provider !== 'none') {
queues.unshift(CrawlerFactory.createEventQueue(manager, options));
}
return queues;
}
static createEventQueue(manager, options) {
if (options.events.provider === 'amqp10') {
return CrawlerFactory.createAmqp10EventSubscription(options);
}
if (options.events.provider === 'webhook') {
return manager.createQueueChain('events', null, options);
}
throw new Error(`No event provider for ${options.events.provider}`);
}
static createAmqp10EventSubscription(options) {
const amqpUrl = config.get('CRAWLER_EVENT_AMQP10_URL');
const actualClient = new AmqpClient(AmqpPolicy.ServiceBusQueue);
const client = actualClient.connect(amqpUrl).then(() => { return actualClient; });
const formatter = new EventFormatter(options);
const queueName = `${options.events.topic}/Subscriptions/${options.events.queueName}`;
const result = new Amqp10Queue(client, 'events', queueName, formatter.format.bind(formatter), null, options);
result.mode = { receive: 'receive' };
return result;
}
static createQueuingMetrics(crawlerName, options) {
if (options.metricsStore !== 'redis') {
return null;
}
const metrics = new RedisMetrics({ client: CrawlerFactory.getRedisClient(options.logger) });
const queueNames = ['immediate', 'soon', 'normal', 'later', 'events'];
const operations = ['push', 'repush', 'done', 'abandon'];
const queuesMetrics = {};
const queueNamePrefix = options.queueName;
queueNames.forEach(queueName => {
queuesMetrics[queueName] = {};
operations.forEach(operation => {
const name = `${queueNamePrefix}:${queueName}:${operation}`;
queuesMetrics[queueName][operation] = metrics.counter(name, { timeGranularity: 'second', namespace: 'crawlermetrics' }); // Stored in Redis as {namespace}:{name}:{period}
});
});
return queuesMetrics;
}
static loadOrgs() {
let orgList = config.get('CRAWLER_ORGS');
if (orgList) {
orgList = orgList.split(';').map(entry => entry.toLowerCase().trim());
} else {
orgList = CrawlerFactory._loadLines(config.get('CRAWLER_ORGS_FILE'));
}
return orgList;
}
static _loadLines(path) {
if (!path || !fs.existsSync(path)) {
return [];
}
let result = fs.readFileSync(path, 'utf8');
result = result.split(/\s/);
return result.filter(line => { return line; }).map(line => { return line.toLowerCase(); });
}
}
factoryLogger = CrawlerFactory.createLogger(true);
module.exports = CrawlerFactory;
class EventFormatter {
constructor(options) {
this.options = options;
this.logger = options.logger;
}
format(message) {
// The message here is expected to be a WEBHOOK event. Use the information included to identify the
// repo or org to poll for new events.
const type = message.applicationProperties.event;
const event = message.body;
const eventsUrl = event.repository ? event.repository.events_url : event.organization.events_url;
const result = new Request('event_trigger', `${eventsUrl}`);
result.payload = { body: event, etag: 1, fetchedAt: moment.utc().toISOString() };
// requests directly off the event feed do not need exclusivity
result.requiresLock = false;
// if the event is for a private repo, mark the request as needing private access.
if (event.repository && event.repository.private) {
result.context.repoType = 'private';
}
// mark it to be retried on the immediate queue as we don't want to requeue it on this shared topic
result._retryQueue = 'immediate';
return result;
}
}
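For reference, a minimal sketch of driving the factory directly, mirroring what app.js and bin/www.js do ('InMemory' is the default mode when CRAWLER_MODE is unset; paths assume you run from the repository root):
```
const CrawlerFactory = require('./lib/crawlerFactory');

// Build an in-memory crawler service and start processing.
const service = CrawlerFactory.createService('InMemory');
service.ensureInitialized()
  .then(() => service.run())
  .catch(error => console.error(`Service initialization error: ${error.message}`));
```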

25
middleware/auth.js Normal file

@@ -0,0 +1,25 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
let force = false;
let token = null;
function initialize(tokenValue, forceValue = false) {
force = forceValue;
token = tokenValue;
}
exports.initialize = initialize;
function validate(request, response, next) {
// if running on localhost, don't bother to validate
if ((!token || process.env.NODE_ENV === 'localhost') && !force) {
return next();
}
// TODO temporary poor man's token management
if (request.header('X-token') === token) {
return next();
}
response.status(403).send('Authentication required');
}
exports.validate = validate;
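A hedged usage sketch (the route and token value here are illustrative; app.js does the real initialization from CRAWLER_SERVICE_AUTH_TOKEN):
```
const express = require('express');
const auth = require('./middleware/auth');

auth.initialize('secret', false);

const app = express();
// Unless NODE_ENV is localhost, requests must carry the shared token in the X-token header.
app.get('/status', auth.validate, (request, response) => {
  response.json({ status: 'ok' });
});
```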

13
middleware/promiseWrap.js Normal file

@@ -0,0 +1,13 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const Q = require('q');
function wrap(genFn) {
const cr = Q.async(genFn);
return function (req, res, next) {
cr(req, res, next).catch(next);
};
}
module.exports = wrap;
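For example, a generator-based route handler could be wrapped like this (the route and ```lookupSomething``` helper are hypothetical):
```
const express = require('express');
const wrap = require('./middleware/promiseWrap');

const router = express.Router();
// Any rejection inside the generator flows to Express's error handler via next().
router.get('/example', wrap(function* (request, response, next) {
  const result = yield lookupSomething(request.query.id); // hypothetical promise-returning helper
  response.json(result);
}));
```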

29
middleware/sendHelper.js Normal file

@@ -0,0 +1,29 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const express = require('express');
const htmlencode = require('htmlencode').htmlEncode;
function create() {
return function (request, response, next) {
response.helpers = response.helpers || {};
response.helpers.send = {
context: {
request: request,
response: response
},
noContent: noContent,
partialHtml: partialHtml
};
next();
};
}
module.exports = create;
function noContent() {
this.context.response.sendStatus(204).end();
}
function partialHtml(title, html) {
this.context.response.type('html').status(200).end('<html><head><title>' + htmlencode(title) + '</title></head><body>' + html + '</body></html>');
}
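A usage sketch mirroring how app.js installs this middleware and then replies from a route:
```
const express = require('express');
const sendHelper = require('./middleware/sendHelper');

const app = express();
app.use(sendHelper());

app.get('/', (request, response) => {
  response.helpers.send.noContent(); // replies 204 with no body
});
```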

package.json

@@ -28,7 +28,36 @@
"node-uuid": "^1.4.7",
"parse-link-header": "^0.4.1",
"q": "1.4.1",
"qlimit": "^0.1.1"
"qlimit": "^0.1.1",
"amqp10": "noodlefrenzy/node-amqp10#issue295",
"amqplib": "^0.5.1",
"applicationinsights": "^0.17.0",
"azure-common": "github:geneh/azure-common",
"azure-sb": "geneh/azure-sb",
"azure-storage": "^1.3.2",
"body-parser": "^1.15.2",
"connect-redis": "^3.1.0",
"debug": "^2.6.1",
"express": "^4.14.0",
"express-init": "^1.1.0",
"express-joi": "^0.3.1",
"ghcrawler": "0.2.22",
"ghrequestor": "^0.1.6",
"htmlencode": "0.0.4",
"ip": "^1.1.4",
"memory-cache": "^0.1.6",
"mongodb": "2.2.11",
"morgan": "^1.7.0",
"painless-config": "^0.1.0",
"promise-retry": "1.1.1",
"redis": "2.6.3",
"redis-metrics": "^0.4.1",
"redis-rate-limiter": "jeffmcaffer/redis-rate-limiter",
"redlock": "2.0.1",
"refreshing-config": "^0.1.2",
"refreshing-config-redis": "^0.1.0",
"winston": "2.2.0",
"winston-azure-application-insights": "^1.1.1"
},
"devDependencies": {
"chai": "^3.5.0",
@@ -38,4 +67,4 @@
"mocha": "^3.1.2",
"sinon": "^1.17.6"
}
}
}

58
providers/fetcher/limitedTokenFactory.js Normal file

@@ -0,0 +1,58 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const Q = require('q');
class LimitedTokenFactory {
constructor(factory, limiter, options) {
this.factory = factory;
this.limiter = limiter;
this.logger = options.logger;
this.options = options;
}
/**
* Find all of the tokens that match the given traits and return a random one that is
* not on the bench. If no candidates are found, return either the soonest time one will
* come off the bench or null if there simply were none.
*/
getToken(traits) {
const token = this.factory.getToken(traits);
if (token === null || typeof token === 'number') {
return Q(token);
}
const deferred = Q.defer();
const key = token.slice(0, 4);
this.limiter({ key: key }, (error, rate) => {
if (error) {
return deferred.reject(error);
}
if (rate.over) {
// too many asks for this token too fast, exhaust this token for a bit to cool down.
const now = Date.now();
const delay = Math.floor((this.options.clientCallCapWindow || 1000) / 4);
let restoreTime = this.exhaust(token, now + delay);
restoreTime = restoreTime || now;
this.logger.info(`Exceeded call cap for token ${token.slice(0, 4)}. Benched until ${restoreTime - now}ms from now`);
return deferred.resolve(restoreTime);
}
deferred.resolve(token);
});
return deferred.promise;
}
/**
* Mark the given token as exhausted until the given time and return the time at which it will be restored.
* If the token is already on the bench, its restore time is unaffected. Null is returned if the token
* could not be found.
**/
exhaust(value, until) {
return this.factory.exhaust(value, until);
}
setTokens(tokens) {
this.factory.setTokens(tokens);
}
}
module.exports = LimitedTokenFactory;
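A sketch of consuming ```getToken``` (construction mirrors ```createTokenFactory``` in lib/crawlerFactory.js; the token specs and limits are placeholders):
```
const LimitedTokenFactory = require('./providers/fetcher/limitedTokenFactory');
const TokenFactory = require('./providers/fetcher/tokenFactory');
const InMemoryRateLimiter = require('./providers/limiting/inmemoryRateLimiter');

const options = { logger: console, clientCallCapWindow: 1000 };
const limiter = InMemoryRateLimiter.create({
  key: request => 'token:' + request.key,
  window: () => 1,
  limit: () => 30
});
const factory = new LimitedTokenFactory(new TokenFactory('aaaa1111#public;bbbb2222#admin', options), limiter, options);

// Resolves to a token value, a restore time (number) when all matches are benched, or null.
factory.getToken([['admin']]).then(token => console.log(token));
```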

105
providers/fetcher/tokenFactory.js Normal file

@@ -0,0 +1,105 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
class TokenFactory {
static createToken(spec) {
const parts = spec.split('#');
const value = parts[0];
const traits = parts[1].split(',');
return { value: value, traits: traits };
}
constructor(tokens, options) {
this.setTokens(tokens);
this.options = options;
}
setTokens(spec) {
if (!spec) {
this.tokens = [];
return;
}
if (Array.isArray(spec)) {
this.tokens = spec;
return;
}
const tokenSpecs = spec.split(';');
this.tokens = tokenSpecs.map(spec => TokenFactory.createToken(spec));
}
/**
* Given a collection of trait sets, find the first set that has any number of matching tokens in the
* factory. From that set return a random one that is not on the bench. If all candidates are benched,
* return the soonest time one will come off the bench. If no matching tokens are found for a given
* set, move on to the next set. If no tokens match any of the sets, return null.
*/
getToken(desiredTraitSets) {
desiredTraitSets = (!desiredTraitSets || desiredTraitSets.length === 0) ? [[]] : desiredTraitSets;
for (let i = 0; i < desiredTraitSets.length; i++) {
const token = this._getToken(desiredTraitSets[i]);
if (token) {
return token;
}
}
return null;
}
_getToken(desiredTraits) {
let minBench = Number.MAX_SAFE_INTEGER;
const now = Date.now();
const candidates = this.tokens.filter(token => {
if (this._traitsMatch(token.traits, desiredTraits)) {
if (!token.benchUntil || now > token.benchUntil) {
return true;
}
minBench = Math.min(token.benchUntil, minBench);
return false;
}
return false;
});
if (candidates.length === 0) {
return minBench === Number.MAX_SAFE_INTEGER ? null : minBench;
}
const index = Math.floor(Math.random() * candidates.length);
return candidates[index].value;
}
/**
* Mark the given token as exhausted until the given time and return the time at which it will be restored.
* If the token is already on the bench, its restore time is unaffected. Null is returned if the token
* could not be found.
**/
exhaust(value, until) {
const now = Date.now();
let result = null;
this.tokens.filter(token => token.value === value).forEach(token => {
// If the token is not benched or the bench time is passed, update the bench time. Otherwise, leave it as is.
if (!token.benchUntil || now > token.benchUntil) {
result = token.benchUntil = until;
} else {
result = token.benchUntil;
}
});
return result;
}
// desired can be an array of traits or an array of arrays of traits if there are fallbacks
_traitsMatch(available, desired) {
if (desired.length === 0) {
return true;
}
// just a single trait. See that it is available
if (typeof desired === 'string') {
return available.includes(desired);
}
// An array of traits. Make sure available includes them all
if (typeof desired[0] === 'string') {
return desired.every(trait => { return available.includes(trait); });
}
return false;
}
}
module.exports = TokenFactory;
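For instance (token values are placeholders), the ```;```-separated spec format parses as shown below. This is the same ```value#trait,trait``` shape used by CRAWLER_GITHUB_TOKENS and the cc ```tokens``` command.
```
const TokenFactory = require('./providers/fetcher/tokenFactory');

const factory = new TokenFactory('aaaa1111#public;bbbb2222#private,admin', {});
console.log(factory.getToken([['admin']])); // bbbb2222
console.log(factory.getToken([['missing'], ['public']])); // falls back to the second set: aaaa1111
console.log(factory.getToken([['missing']])); // null
```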

80
providers/limiting/computeLimiter.js Normal file

@@ -0,0 +1,80 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const moment = require('moment');
const Q = require('q');
class ComputeLimiter {
constructor(limiter, options) {
this.limiter = limiter;
this.options = options;
this.updater = options.baselineUpdater;
this.nextUpdate = moment();
this.baseline = options.defaultBaseline || 500;
}
/**
* Consume the given amount of resources relative to the identified key assuming that the preallocated amount has already
* been accounted for. If the resource limit for that key has been exceeded, exhaust the key using the supplied function.
*
* Also update the baseline if the given amount is lower than the current baseline.
*
* @return {object} An object describing what happened. If limit is available, the object will have a 'remaining'
* property indicating the number of resources left. If no limit is available, the returned object has an 'overage'
* property indicating how far over the limit you've gone, a 'reset' property indicating when more limit will be
* available, and an 'updated' property indicating whether this overrun changed the reset time.
*/
consume(key, amount, preallocated, exhaust) {
this._update();
// In between updates, lower the baseline bar if we see something faster than the current baseline
this.baseline = Math.min(amount, this.baseline);
const consumedAmount = amount - this.baseline - preallocated;
return this.allocate(key, consumedAmount, exhaust);
}
/**
* Consume the given amount of resources relative to the identified key. If the resource limit for that key has been exceeded,
* exhaust the key using the supplied function.
*
* @return {object} An object describing what happened. If limit is available, the object will have a 'remaining'
* property indicating the number of resources left. If no limit is available, the returned object has an 'overage'
* property indicating how far over the limit you've gone, a 'reset' property indicating when more limit will be
* available, and an 'updated' property indicating whether this overrun changed the reset time.
*/
allocate(key, amount, exhaust) {
const deferred = Q.defer();
this.limiter({ key: key, amount: amount }, (error, rate) => {
if (error) {
return deferred.reject(error);
}
if (rate.over) {
const now = Date.now();
const resetTime = now + Math.floor(rate.window * 1000 / 4);
const actualResetTime = exhaust(resetTime);
const overage = rate.current - rate.limit;
return deferred.resolve({ overage: overage, reset: actualResetTime, updated: resetTime === actualResetTime });
}
deferred.resolve({ remaining: rate.limit - rate.current });
});
return deferred.promise;
}
_update() {
const now = moment();
if (!this.updater || now.isBefore(this.nextUpdate)) {
return;
}
this.nextUpdate = now.add(this.options.baselineFrequency || 60, 's');
setTimeout(() =>
Q
.try(this.updater)
.then(baseline => {
if (baseline) {
this.baseline = baseline;
}
}),
1);
}
}
module.exports = ComputeLimiter;


@@ -0,0 +1,39 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const memoryCache = require('memory-cache');
class InMemoryRateLimiter {
static create(settings) {
const limiter = new InMemoryRateLimiter(settings);
return limiter.get.bind(limiter);
}
constructor(settings) {
this.options = settings;
}
get(request, callback) {
// prefix the key as the memoryCache is shared across the process
const key = `ratelimit:${this.options.key(request)}`;
const incr = this.options.incr ? this.options.incr(request) : 1;
let current = memoryCache.get(key);
if (!current) {
current = { count: 0 };
// add to the cache. Note, our window is in seconds for compatibility with the redis-rate-limiter API,
// but the memory cache works in milliseconds
memoryCache.put(key, current, this.options.window() * 1000);
}
current.count += incr;
callback(null, {
key: key,
current: current.count,
limit: this.options.limit(),
window: this.options.window(),
over: current.count > this.options.limit()
});
}
}
module.exports = InMemoryRateLimiter;
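
The two classes above compose: ComputeLimiter calls its limiter with { key, amount } and the callback contract shown here, so the in-memory limiter can back it directly. A wiring sketch (the numbers and require paths are assumptions):

const ComputeLimiter = require('./computeLimiter');
const InMemoryRateLimiter = require('./inmemoryRateLimiter');
const limiter = InMemoryRateLimiter.create({
  key: request => request.key,       // ComputeLimiter passes { key, amount }
  incr: request => request.amount,
  window: () => 60,                  // seconds
  limit: () => 10000                 // compute units allowed per window
});
const compute = new ComputeLimiter(limiter, { defaultBaseline: 500 });
// Charge a 1200-unit call against 'github:core' with nothing preallocated.
// The exhaust callback here simply accepts the proposed reset time.
compute.consume('github:core', 1200, 0, resetTime => resetTime).then(result => {
  if (result.remaining !== undefined) {
    console.log(`remaining: ${result.remaining}`);
  } else {
    console.log(`over by ${result.overage}, reset at ${result.reset}`);
  }
});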


@@ -0,0 +1,83 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const appInsights = require("applicationinsights");
class MockInsights {
constructor(client = null) {
this.client = client;
}
static setup(key = null, echo = false) {
// exit if we are already set up
if (appInsights.client instanceof MockInsights) {
return;
}
if (!key || key === 'mock') {
appInsights.client = new MockInsights();
} else {
appInsights
.setup(key)
.setAutoCollectPerformance(false)
.setAutoCollectDependencies(false)
.start();
if (echo) {
appInsights.client = new MockInsights(appInsights.client);
}
}
}
trackEvent(name, properties, measurements) {
console.log(`Event: ${name}, properties: ${JSON.stringify(properties)}`);
if (this.client) {
this.client.trackEvent(name, properties, measurements);
}
}
trackException(error, properties) {
console.log('trackException:');
console.dir(error);
properties = properties || {};
if (error && error._type) {
properties.type = error._type;
properties.url = error._url;
properties.cid = error._cid;
}
if (this.client) {
this.client.trackException(error, properties);
}
}
trackMetric(name, value, count, min, max, stdDev) {
console.log(`Metric: ${name} = ${value}`);
if (this.client) {
this.client.trackMetric(name, value, count, min, max, stdDev);
}
}
trackRequest(request, response, properties) {
console.log('Request: ');
if (this.client) {
this.client.trackRequest(request, response, properties);
}
}
trackTrace(message, severityLevel = 1, properties = null) {
// const severities = ['Verbose', 'Info', 'Warning', 'Error', 'Critical'];
const severities = ['V', 'I', 'W', 'E', 'C'];
const hasProperties = properties && Object.keys(properties).length > 0;
const propertyString = hasProperties ? `${JSON.stringify(properties)}` : '';
console.log(`[${severities[severityLevel]}] ${message}${propertyString}`);
if (this.client) {
this.client.trackTrace(message, severityLevel, properties);
}
}
trackDependency(name, commandName, elapsedTimeMs, success, dependencyTypeName, properties, dependencyKind, async, dependencySource) {
console.log(`Dependency: ${name}`);
if (this.client) {
this.client.trackDependency(name, commandName, elapsedTimeMs, success, dependencyTypeName, properties, dependencyKind, async, dependencySource);
}
}
}
module.exports = MockInsights;
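
A setup sketch (the 'mock' key and require path are assumptions): with no real instrumentation key, telemetry is echoed to the console instead of being sent to Application Insights.

const appInsights = require('applicationinsights');
const MockInsights = require('./mockInsights');
MockInsights.setup('mock'); // console-only client
appInsights.client.trackEvent('crawler:started', { mode: 'Standard' });
appInsights.client.trackTrace('queue drained', 2, { queue: 'normal' }); // logs: [W] queue drained...
// With a real key and echo=true, the same calls both log locally and report (env var name assumed):
// MockInsights.setup(process.env.APPINSIGHTS_INSTRUMENTATION_KEY, true);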


@@ -0,0 +1,335 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const amqp10 = require('amqp10');
const moment = require('moment');
const Q = require('q');
const qlimit = require('qlimit');
const AmqpPolicy = amqp10.Policy;
class Amqp10Queue {
constructor(client, name, queueName, formatter, manager, options) {
this.debug = require('debug')(`crawler:queuing:amqp10:${queueName}`);
this.debug.log = console.info.bind(console);
this.client = client;
this.name = name;
this.queueName = queueName;
this.messageFormatter = formatter;
this.manager = manager;
this.options = options;
this.logger = options.logger;
this.mode = { receive: 'receive', send: 'send' };
this.currentAmqpCredit = options.credit || 10;
this.options._config.on('changed', this._reconfigure.bind(this));
this.receiver = null;
this.sender = null;
this.messages = [];
}
subscribe() {
this._silly('subscribe: enter');
if (this.receiver && this.sender) {
this._silly('subscribe: exit (already subscribed)');
return Q();
}
const receive = this.mode.receive === 'receive';
const send = this.mode.send === 'send';
return this.client.then(client => {
const queuePromise = this.manager ? this.manager.createQueue(this.queueName) : Q();
return queuePromise.then(() => {
const size = (this.options.messageSize || 200) * 1024;
const basePolicy = {
senderLink: { attach: { maxMessageSize: size } },
receiverLink: { attach: { maxMessageSize: size } }
};
const receivePolicy = AmqpPolicy.Utils.RenewOnSettle(this.currentAmqpCredit || 10, 1, basePolicy).receiverLink;
return Q.spread([
receive ? client.createReceiver(this.queueName, receivePolicy) : Q(null),
send ? client.createSender(this.queueName, basePolicy.senderLink) : Q(null)
], (receiver, sender) => {
this.logger.info(`Connecting to ${this.queueName}`);
if (sender) {
this.sender = sender;
sender.on('errorReceived', err => {
this._logReceiverSenderError(err, 'sender');
});
sender.on('attached', () => {
this.logger.info(`Sender attached to ${this.queueName}`);
});
sender.on('detached', () => {
this.logger.info(`Sender detached from ${this.queueName}`);
});
}
if (receiver) {
this.receiver = receiver;
receiver.on('message', message => {
this._silly('receiver: message received');
this.messages.push(message);
});
receiver.on('errorReceived', err => {
this._logReceiverSenderError(err, 'receiver');
});
receiver.on('attached', () => {
this.logger.info(`Receiver attached to ${this.queueName}`);
});
receiver.on('detached', () => {
this.logger.info(`Receiver detached from ${this.queueName}`);
});
}
process.once('SIGINT', () => {
this._silly('client: disconnecting due to SIGINT');
client.disconnect();
});
this._silly('subscribe: exit');
return Q();
});
});
}).catch(error => {
this.logger.error(`${this.queueName} could not be instantiated. Error: ${error}`);
this._silly('subscribe: exit (error)');
});
}
unsubscribe() {
this._silly('unsubscribe: enter');
this.logger.info(`Detaching from ${this.queueName}`);
if (this.sender) {
this._silly('unsubscribe: detaching sender');
this.sender.detach({ closed: true });
}
if (this.receiver) {
this._silly('unsubscribe: detaching receiver');
this.receiver.detach({ closed: true });
}
this.receiver = null;
this.sender = null;
this.messages = [];
this._silly('unsubscribe: exit');
return Q();
}
push(requests) {
this._silly('push: enter');
if (!this.sender) {
this._silly('push: exit (no sender)');
return Q();
}
requests = Array.isArray(requests) ? requests : [requests];
this._silly(`push: pushing ${requests.length} requests`);
let body = null;
return Q.all(requests.map(qlimit(this.options.parallelPush || 1)(request => {
this._incrementMetric('push');
this._silly(`push: ${request.type} ${request.url} (state: ${this.sender.state()})`);
body = JSON.stringify(request);
return this.sender.send(body);
}))).then(
result => {
this._silly('push: exit');
return result;
},
error => {
// if there was a force detach, a reattach should come soon so try resending
// after a short delay.
if (error.message && error.message.includes('force')) {
return Q.delay(500).then(() => this.sender.send(body));
}
throw error;
});
}
pop() {
this._silly('pop: enter');
const message = this._findMessage();
if (!message || !message.body || !this.receiver) {
this._silly('pop: exit (nothing to pop)');
return Q(null);
}
this._incrementMetric('pop');
const request = this.messageFormatter(message);
if (!request) {
// We are never going to process this message (no formatter). Make sure to accept the message to
// ensure the queuing system gives back the credits.
this._accept(message, 'pop');
this._silly('pop: exit (message formatter returned null)');
return Q(null);
}
request._message = message;
this._silly(`pop: exit (${request.type} ${request.url})`);
return Q(request);
}
_findMessage() {
this._silly('_findMessage: enter');
// Clean up and trim off any messages that have actually expired according to the queuing system
const now = moment();
const validIndex = this.messages.findIndex(message => now.isBefore(message.messageAnnotations['x-opt-locked-until']));
if (validIndex < 0) {
this._silly('_findMessage: exit (all expired)');
return null;
}
// remove any expired messages. Make sure to release them so the AMQP client does the proper accounting and sends more messages.
const expired = this.messages.splice(0, validIndex);
if (expired && expired.length > 0) {
this.logger.info(`Releasing ${expired.length} expired messages from ${this.queueName}.`);
expired.forEach(message => this._release(message, 'pop'));
}
// Find a candidate message -- one that is not expired or deferred
const candidateIndex = this.messages.findIndex(message =>
now.isBefore(message.messageAnnotations['x-opt-locked-until']) && (!message._deferUntil || message._deferUntil.isBefore(now)));
if (candidateIndex < 0) {
this._silly('_findMessage: exit (all expired or deferred)');
return null;
}
const result = this.messages[candidateIndex];
this.messages.splice(candidateIndex, 1);
this._silly('_findMessage: exit');
return result;
}
_release(message, caller) {
this._silly('_release: enter');
try {
return Q(this.receiver.release(message)).then(result => {
this._silly('_release: exit');
return result;
});
} catch (error) {
this.logger.info(`Could not release message for ${this.queueName}. Caller: ${caller} Error: ${error.message}`);
this._silly('_release: exit (error)');
return Q();
}
}
_accept(message, caller) {
this._silly('_accept: enter');
try {
return Q(this.receiver.accept(message)).then(result => {
this._silly('_accept: exit');
return result;
});
} catch (error) {
this.logger.info(`Could not accept message for ${this.queueName}. Caller: ${caller} Error: ${error.message}`);
this._silly('_accept: exit (error)');
return Q();
}
}
done(request) {
this._silly('done: enter');
if (!request || !request._message || !this.receiver) {
this._silly('done: exit (nothing to do)');
return Q();
}
// delete the message so a subsequent abandon or done does not retry the ack/nak
this._incrementMetric('done');
const message = request._message;
delete request._message;
return this._accept(message, 'done').then(result => {
this._silly(`done: exit (ACKed: ${request.type} ${request.url})`);
return result;
});
}
/**
* Don't give up on the given request but also don't immediately try it again -- defer it for a later attempt
*/
defer(request) {
this._silly('defer: enter');
const message = request._message;
if (!message) {
this._silly('defer: exit (nothing to do)');
return;
}
this._incrementMetric('defer');
// TODO allow the caller to pass in the wake up time.
message._deferUntil = moment().add(500, 'ms');
this.messages.push(message);
delete request._message;
this._silly(`defer: exit (DEFERed: ${request.type} ${request.url})`);
}
abandon(request) {
this._silly('abandon: enter');
if (!request || !request._message || !this.receiver) {
this._silly('abandon: nothing to do');
return Q();
}
// delete the message so a subsequent abandon or done does not retry the ack/nak
this._incrementMetric('abandon');
const message = request._message;
delete request._message;
return this._release(message, 'abandon').then(result => {
this._silly(`abandon: exit (NAKed: ${request.type} ${request.url})`);
return result;
});
}
flush() {
this._silly('flush: enter');
if (!this.manager) {
this._silly('flush: exit (no manager)');
return Q();
}
return Q
.try(this.unsubscribe.bind(this))
.then(this.manager.flushQueue.bind(this.manager, this.queueName))
.then(this.subscribe.bind(this))
.then(() => {
this._silly('flush: exit');
return this;
});
}
getInfo() {
if (!this.manager) {
return Q(null);
}
return this.manager.getInfo(this.queueName).then(info => {
if (!info) {
return null;
}
info.metricsName = `${this.options.queueName}:${this.name}`;
return info;
});
}
getName() {
return this.name;
}
_reconfigure(current, changes) {
if (changes.some(patch => patch.path === '/credit') && this.currentAmqpCredit !== this.options.credit) {
this.logger.info(`Reconfiguring AMQP 1.0 credit from ${this.currentAmqpCredit} to ${this.options.credit} for ${this.getName()}`);
this.receiver.addCredits(this.options.credit - this.currentAmqpCredit);
this.currentAmqpCredit = this.options.credit;
}
return Q();
}
_incrementMetric(operation) {
const metrics = this.logger.metrics;
if (metrics && metrics[this.name] && metrics[this.name][operation]) {
metrics[this.name][operation].incr();
}
}
_silly(message) {
if (this.logger) {
this.logger.silly(message);
}
this.debug(message);
}
_logReceiverSenderError(err, type) {
if (err.condition === 'amqp:link:detach-forced' || err.condition === 'amqp:connection:forced') {
this.logger.info(`${this.queueName} - ${type} timeout: ${err.condition}`);
} else {
this.logger.error(err, `${this.queueName} - ${type} error`);
}
}
}
module.exports = Amqp10Queue;
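
A consume-loop sketch over a subscribed queue (construction is elided; ServiceBusQueueManager below shows how the client, formatter, and manager are supplied, and handle is an assumed request handler that returns a promise):

const Q = require('q');
function drain(queue, handle) {
  return queue.subscribe().then(function next() {
    return queue.pop().then(request => {
      if (!request) {
        // nothing poppable right now (empty, all expired, or all deferred)
        return Q.delay(1000).then(next);
      }
      return handle(request).then(
        () => queue.done(request),    // accept: settles the message and returns credit
        () => queue.abandon(request)  // release: the message becomes deliverable again
      ).then(next);
    });
  });
}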


@@ -0,0 +1,175 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const amqp = require('amqplib');
const Q = require('q');
const qlimit = require('qlimit');
class AmqpQueue {
constructor(manager, name, formatter, options) {
this.manager = manager;
this.messageFormatter = formatter;
this.name = name;
this.queueName = `${options.queueName}-${name}`;
this.options = options;
this.logger = options.logger;
this.pending = [];
this.channel = null;
}
subscribe() {
if (this.channel && (this.channel.isFulfilled() || this.channel.isPending())) {
return this.channel;
}
return this._reconnect();
}
_reconnect() {
const url = this.manager.url;
if (this.channel) {
this.logger.warn(`Reconnecting ${this.queueName} using AMQP`);
}
// Create a channel promise that is (or will be) connected to the queue.
const self = this;
this.channel = Q
.try(() => {
const socketOptions = self.options.socketOptions || {};
return amqp.connect(url, socketOptions);
})
.then(connection => {
connection.on('error', self._reconnect.bind(self));
process.once('SIGINT', function () { connection.close(); });
return connection.createConfirmChannel().then(channel => {
channel.on('error', self._reconnect.bind(self));
return channel.assertQueue(self.queueName, { durable: true }).then(() => channel);
});
})
.then(
channel => {
this.logger.info(`Connected ${this.queueName} using AMQP`);
return channel;
},
error => {
this.logger.warn(`Reconnection failed for ${this.queueName} using AMQP`, error);
return Q.delay(5000).then(self._reconnect.bind(self));
});
return this.channel;
}
unsubscribe() {
if (this.channel) {
this.channel.then(channel => { return channel.close(); });
}
return Q();
}
push(requests) {
requests = Array.isArray(requests) ? requests : [requests];
return Q.all(requests.map(qlimit(this.options.parallelPush || 1)(request => {
const body = JSON.stringify(request);
const deferred = Q.defer();
this.channel.then(channel => {
channel.sendToQueue(this.queueName, new Buffer(body), {}, (err, ok) => {
if (err) {
return deferred.reject(err);
}
this._incrementMetric('push');
const attemptString = request.attemptCount ? ` (attempt ${request.attemptCount})` : '';
this.logger.verbose(`Queued ${request.policy.getShortForm()} ${request.toString()}${attemptString}`);
deferred.resolve();
});
});
return deferred.promise;
})));
}
pop() {
const self = this;
const message = this.pending.shift();
if (message) {
return Q(this._handleMessage(message));
}
return this.channel.then(channel => {
return channel.get(self.queueName).then(response => {
if (!response) {
return null;
}
return this._handleMessage(response);
});
});
}
_handleMessage(response) {
this._incrementMetric('pop');
const message = new Buffer(response.content).toString();
const request = this.messageFormatter(message);
request._message = response;
return request;
}
done(request) {
if (request._message) {
// delete the message so a subsequent abandon or done does not retry the ack/nak
this._incrementMetric('done');
const message = request._message;
delete request._message;
this._silly(`ACKed: ${request.type} ${request.url}`);
// ACK and don't worry if it fails. The request will go back on the queue and be processed again.
this.channel.then(channel => channel.ack(message));
}
return Q();
}
defer(request) {
if (request._message) {
this._incrementMetric('defer');
// TODO this is likely not enough. See the code in the amqp10 queue
this.pending.push(request._message);
delete request._message;
this._silly(`Deferred: ${request.type} ${request.url}`);
}
return Q();
}
abandon(request) {
if (request._message) {
// delete the message so a subsequent abandon or done does not retry the ack/nak
this._incrementMetric('abandon');
const message = request._message;
delete request._message;
this._silly(`NAKed: ${request.type} ${request.url}`);
this.channel.then(channel => channel.nack(message));
}
return Q();
}
flush() {
return this.manager.flushQueue(this.queueName).then(() => this);
}
getInfo() {
return this.manager.getInfo(this.queueName).then(info => {
info.metricsName = `${this.options.queueName}:${this.name}`;
return info;
});
}
getName() {
return this.name;
}
_incrementMetric(operation) {
const metrics = this.logger.metrics;
if (metrics && metrics[this.name] && metrics[this.name][operation]) {
metrics[this.name][operation].incr();
}
}
_silly(message) {
if (this.logger) {
this.logger.silly(message);
}
}
}
module.exports = AmqpQueue;


@@ -0,0 +1,59 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const memoryCache = require('memory-cache');
const NestedQueue = require('./nestedQueue');
const Q = require('q');
const qlimit = require('qlimit');
class AttenuatedQueue extends NestedQueue {
constructor(queue, options) {
super(queue);
this.options = options;
this.logger = options.logger;
}
push(requests) {
const self = this;
requests = Array.isArray(requests) ? requests : [requests];
return Q.all(requests.map(qlimit(this.options.parallelPush || 1)(request => {
return self._pushOne(request);
})));
}
_pushOne(request) {
// Include the attempt count in the key. This allows for one concurrent requeue
const attemptCount = request.attemptCount || 0;
const key = `t:${attemptCount}:${request.toUniqueString()}`;
let entry = memoryCache.get(key);
if (entry) {
// We've seen this request recently. The push is either in progress (and may fail) or already done.
// Either way, tack handlers onto the (potentially) pending promise such that success is passed through
// and rejection causes this call's request to be pushed again (i.e., retried). Ensure that the new
// promise is stored for the next caller. This guards against the JavaScript lockstep tick execution
// where multiple "chains" all read, then all update, then all write, thereby missing the fact that
// several chains are writing.
const attemptString = attemptCount ? `(attempt ${request.attemptCount}) ` : '';
this.logger.verbose(`Attenuated ${attemptString}${request.type}@${request.url}`);
// overwrite the promise so we keep the same ttl as the original
entry.promise = entry.promise.catch(error => {
return this.queue.push(request);
});
request.queueOutcome = 'Attenuated';
return entry.promise;
}
entry = {
timestamp: Date.now(),
promise: this.queue.push(request)
};
const ttl = this.options.attenuation.ttl || 1000;
memoryCache.put(key, entry, ttl);
return entry.promise;
}
_log(message) {
return this.logger ? this.logger.silly(message) : null;
}
}
module.exports = AttenuatedQueue;
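
A sketch of the deduplication described above, run against the in-memory queue that follows (the logger stub, request object, and require paths are assumptions): two pushes of the same request inside the ttl window result in a single underlying push.

const Q = require('q');
const AttenuatedQueue = require('./attenuatedQueue');
const InMemoryCrawlQueue = require('./inmemoryCrawlQueue');
const logger = { silly: () => { }, verbose: console.log, metrics: null };
const queue = new AttenuatedQueue(new InMemoryCrawlQueue('normal', { logger }), {
  attenuation: { ttl: 1000 },
  logger
});
// 'request' is assumed to be a ghcrawler Request (it must supply toUniqueString()).
Q.all([queue.push(request), queue.push(request)])
  .then(() => queue.getInfo())
  .then(info => console.log(info.count)); // 1, not 2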


@@ -0,0 +1,79 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const extend = require('extend');
const Q = require('q');
const Request = require('ghcrawler').request;
class InMemoryCrawlQueue {
constructor(name, options) {
this.name = name;
this.queue = [];
this.options = options;
this.logger = options.logger;
}
getName() {
return this.name;
}
push(requests) {
this._incrementMetric('push');
requests = Array.isArray(requests) ? requests : [requests];
requests = requests.map(request => extend(true, {}, request));
this.queue = this.queue.concat(requests);
return Q.resolve();
}
subscribe() {
return Q(null);
}
pop() {
const result = this.queue.shift();
if (!result) {
return Q();
}
this._incrementMetric('pop');
return Q.resolve(Request.adopt(result));
}
done() {
this._incrementMetric('done');
return Q(null);
}
// We popped but cannot process right now (e.g., no rate limit). Stash it away and allow it to be popped later.
defer(request) {
this._incrementMetric('defer');
// TODO likely need to do more here. see the amqp10 code
this.queue.push(request);
}
abandon(request) {
this._incrementMetric('abandon');
this.queue.unshift(request);
return Q.resolve();
}
flush() {
this.queue = [];
return Q(this);
}
getInfo() {
return Q({
count: this.queue.length,
metricsName: this.name
});
}
_incrementMetric(operation) {
const metrics = this.logger.metrics;
if (metrics && metrics[this.name] && metrics[this.name][operation]) {
metrics[this.name][operation].incr();
}
}
}
module.exports = InMemoryCrawlQueue;


@@ -0,0 +1,50 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
class NestedQueue {
constructor(queue) {
this.queue = queue;
}
push(requests) {
return this.queue.push(requests);
}
pop() {
return this.queue.pop();
}
done(request) {
return this.queue.done(request);
}
defer(request) {
return this.queue.defer(request);
}
abandon(request) {
return this.queue.abandon(request);
}
subscribe() {
return this.queue.subscribe();
}
unsubscribe() {
return this.queue.unsubscribe();
}
flush() {
return this.queue.flush();
}
getInfo() {
return this.queue.getInfo();
}
getName() {
return this.queue.getName();
}
}
module.exports = NestedQueue;


@@ -0,0 +1,58 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const AmqpQueue = require('./amqpQueue');
const AttenuatedQueue = require('./attenuatedQueue');
const Q = require('q');
const request = require('request');
const Request = require('ghcrawler').request;
const TrackedQueue = require('./trackedQueue');
class RabbitQueueManager {
constructor(amqpUrl, managementEndpoint) {
this.url = amqpUrl;
this.managementEndpoint = managementEndpoint;
}
createQueueChain(name, tracker, options) {
const formatter = message => {
return Request.adopt(JSON.parse(message));
};
let queue = new AmqpQueue(this, name, formatter, options);
if (tracker) {
queue = new TrackedQueue(queue, tracker, options);
}
return new AttenuatedQueue(queue, options);
}
flushQueue(name) {
return this._call('delete', `${this.managementEndpoint}/api/queues/%2f/${name}/contents`, `Could not flush queue ${name}`, false);
}
getInfo(name) {
return this._call('get', `${this.managementEndpoint}/api/queues/%2f/${name}`, `Could not get info for queue ${name}`).then(info => {
return { count: info.messages };
});
}
_call(method, url, errorMessage, json = true, body = null) {
const deferred = Q.defer();
const options = {};
if (json) {
options.json = json;
}
if (body) {
options.body = body;
}
request[method](url, options, (error, response, body) => {
if (error || response.statusCode > 299) {
const detail = error ? error.message : (typeof body === 'string' ? body : body.message);
return deferred.reject(new Error(`${errorMessage}: ${detail}.`));
}
deferred.resolve(body);
});
return deferred.promise;
}
}
module.exports = RabbitQueueManager;
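
A construction sketch (broker endpoints match a default local RabbitMQ; the logger stub and require path are assumptions): the chain layers attenuation (and optional tracking) over the raw AMQP queue.

const RabbitQueueManager = require('./rabbitQueueManager');
const manager = new RabbitQueueManager('amqp://localhost:5672', 'http://guest:guest@localhost:15672');
const logger = { silly: () => { }, verbose: console.log, info: console.log, warn: console.warn, error: console.error };
const queue = manager.createQueueChain('normal', null, {
  queueName: 'crawler',       // the broker queue becomes 'crawler-normal'
  parallelPush: 1,
  attenuation: { ttl: 1000 },
  logger
});
queue.subscribe()
  .then(() => queue.getInfo())
  .then(info => console.log(info.count)); // message count via the management API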


@@ -0,0 +1,52 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const limiter = require('../limiting/inmemoryRateLimiter');
const NestedQueue = require('./nestedQueue');
const Q = require('q');
const qlimit = require('qlimit');
const debug = require('debug')('crawler:queuing:ratelimitedpushqueue');
debug.log = console.info.bind(console);
class RateLimitedPushQueue extends NestedQueue {
constructor(queue, limiter, options) {
super(queue);
this.limiter = limiter;
this.options = options;
}
push(requests) {
debug('push: enter');
const self = this;
requests = Array.isArray(requests) ? requests : [requests];
return Q.all(requests.map(qlimit(self.options.parallelPush || 1)(request => {
return self._pushOne(request);
}))).then(result => {
debug('push: exit (success)');
return result;
});
}
_pushOne(request) {
debug(`_pushOne(${request.toUniqueString()}): enter`);
const deferred = Q.defer();
const self = this;
this.limiter(null, (error, rate) => {
if (error) {
debug(`_pushOne(${request.toUniqueString()}): exit (error)`);
return deferred.reject(error);
}
if (rate.over) {
return deferred.resolve(Q.delay(Math.floor((self.options.pushRateWindow || 2) * 1000 / 4)).then(() => {
debug(`_pushOne(${request.toUniqueString()}): exit (delayed)`);
return self._pushOne(request);
}));
}
debug(`_pushOne(${request.toUniqueString()}): exit (success)`);
deferred.resolve(self.queue.push(request));
});
return deferred.promise;
}
}
module.exports = RateLimitedPushQueue;


@@ -0,0 +1,164 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const Q = require('q');
const debug = require('debug')('crawler:queuing:requesttracker:redis');
debug.log = console.info.bind(console);
class RedisRequestTracker {
constructor(prefix, redisClient, locker, options) {
this.prefix = prefix;
this.redisClient = redisClient;
this.locker = locker;
this.options = options;
this.logger = options.logger;
}
track(request, operation) {
debug(`track(${request.toUniqueString()}): enter`);
const key = this._getKey(request);
const self = this;
return this._lockExecuteUnlock(key + '-lock', () => {
return self._getTag(key, request).then(timestamp => {
if (timestamp) {
const diff = Date.now() - parseInt(timestamp, 10);
const attemptString = request.attemptCount ? `(attempt ${request.attemptCount}) ` : '';
self.logger.verbose(`Bounced ${attemptString}${request.type}@${request.url} from ${diff}ms ago`, request.meta);
request.queueOutcome = 'Bounced';
debug(`track(${request.toUniqueString()}): exit (bounced)`);
return Q();
}
return operation(request).then(result => {
debug(`track(${request.toUniqueString()}): operation success`);
return self._setTag(key, request)
.then(success => {
debug(`track(${request.toUniqueString()}): exit (success)`);
return result;
});
});
});
});
}
untrack(request) {
debug(`untrack(${request.toUniqueString()}): enter`);
const key = this._getKey(request);
const self = this;
return this._lockExecuteUnlock(key + '-lock', () => {
return self._removeTag(key, request);
}).then(request => {
debug(`untrack(${request.toUniqueString()}): exit (success)`);
return request;
});
}
flush() {
debug(`flush: enter`);
// delete all the redis keys being maintained for this queue.
const deferred = Q.defer();
const pattern = `${this.prefix}:*`;
const count = 10000;
this.redisClient.eval(this._getDeleteScript(), 0, pattern, count, (error, result) => {
if (error) {
debug(`flush: exit (error)`);
return deferred.reject(error);
}
debug(`flush: exit (success)`);
deferred.resolve(result);
});
return deferred.promise;
}
_lockExecuteUnlock(key, operation) {
const self = this;
let lock = null;
return Q
.try(() => {
return self.locker.lock(key, self.options.tracker.lockTtl);
})
.then(acquiredLock => {
lock = acquiredLock;
return operation();
})
.finally(() => {
if (lock) {
Q
.try(() => {
return self.locker.unlock(lock);
})
.catch(error =>
self._log(`FAILED to unlock ${key}`));
}
});
}
_getTag(key, request) {
const deferred = Q.defer();
this.redisClient.get(key, function (err, reply) {
if (err) {
return deferred.reject(err);
}
deferred.resolve(reply);
});
return deferred.promise;
}
_setTag(key, request) {
const deferred = Q.defer();
const self = this;
const ttl = this.options.tracker.ttl || 60000;
this.redisClient.set([key, Date.now().toString(), 'PX', ttl, 'NX'], function (err, reply) {
// resolve even if the set failed. Failure to track is not fatal to the queuing operation
// const message = err ? 'Failed to track' : 'Tracked';
// self._log(`${message} tag: ${key}`);
deferred.resolve(err ? null : request);
});
return deferred.promise;
}
_removeTag(key, request) {
const deferred = Q.defer();
const self = this;
this.redisClient.del(key, function (err, reply) {
if (err) {
// This is a BIG deal. If we fail to remove here then other agents will think that everything is ok
// and decline to queue new entries for this request when in fact, we may not be successful here.
// Log all the info and then ensure that we fail this and cause the request to get requeued
self.logger.error(new Error(`Failed to remove tracking tag: ${request.type}@${request.url}`), request.meta);
return deferred.reject(err);
}
self._log(`Untracked tag: ${key}`);
deferred.resolve(request);
});
return deferred.promise;
}
_getKey(request) {
return `${this.prefix}:${request.toUniqueString()}`;
}
_log(message) {
return this.logger ? this.logger.silly(message) : null;
}
_getDeleteScript() {
return `
local done = false;
local cursor = "0"
redis.replicate_commands()
repeat
local result = redis.call("SCAN", cursor, "match", ARGV[1], "count", ARGV[2])
cursor = result[1];
local keys = result[2];
for i, key in ipairs(keys) do
redis.call("DEL", key);
end
if cursor == "0" then
done = true;
end
until done
return true;
`;
}
}
module.exports = RedisRequestTracker;
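
A sketch of the tracked-push flow (redisClient, locker, queue, request, and logger are assumed to be configured elsewhere): only the first push of a given request within the tracking ttl reaches the queue; repeats are bounced.

const RedisRequestTracker = require('./redisRequestTracker');
const tracker = new RedisRequestTracker('crawler:normal', redisClient, locker, {
  logger,
  tracker: { lockTtl: 1000, ttl: 60000 }
});
tracker.track(request, r => queue.push(r))                 // tags the key and pushes
  .then(() => tracker.track(request, r => queue.push(r)))  // bounced: resolves without pushing
  .then(() => tracker.untrack(request));                   // clears the tag for future queuing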


@@ -0,0 +1,128 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const amqp10 = require('amqp10');
const Amqp10Queue = require('./amqp10Queue');
const AttenuatedQueue = require('./attenuatedQueue');
const azureCommon = require('azure-common');
const InMemoryRateLimiter = require('../limiting/inmemoryRateLimiter');
const RateLimitedPushQueue = require('./ratelimitedPushQueue');
const Request = require('ghcrawler').request;
const serviceBus = require('azure-sb');
const TrackedQueue = require('./trackedQueue');
const Q = require('q');
const AmqpClient = amqp10.Client;
const AmqpPolicy = amqp10.Policy;
class ServiceBusQueueManager {
constructor(amqpUrl, managementEndpoint) {
this.amqpUrl = amqpUrl;
this.managementEndpoint = managementEndpoint;
this.client = null;
const retryOperations = new azureCommon.ExponentialRetryPolicyFilter();
this.serviceBusService = serviceBus.createServiceBusService(managementEndpoint).withFilter(retryOperations);
}
createQueueClient(name, formatter, options) {
return this._createClient(name, `${options.queueName}-${name}`, formatter, options);
}
createSubscriptionClient(name, topic, subscription, formatter, options) {
return this._createClient(name, `${topic}/Subscriptions/${subscription}`, formatter, options);
}
_createClient(name, queueName, formatter, options) {
return new Amqp10Queue(this._getClient(), name, queueName, formatter, this, options);
}
_getClient() {
if (this.client) {
return this.client;
}
const actualClient = new AmqpClient(AmqpPolicy.ServiceBusQueue);
this.client = actualClient.connect(this.amqpUrl).then(() => { return actualClient; });
return this.client;
}
createQueueChain(name, tracker, options) {
const formatter = message => {
// make sure the message/request object is copied to enable deferral scenarios (i.e., the request is modified
// and then put back on the in-memory queue)
return Request.adopt(Object.assign({}, message.body));
};
let queue = this.createQueueClient(name, formatter, options);
if (tracker) {
queue = new TrackedQueue(queue, tracker, options);
}
if (options.pushRateLimit) {
const limiter = InMemoryRateLimiter.create({
key: () => 'queue:' + name,
window: () => options.pushRateWindow || 2,
limit: () => options.pushRateLimit || 300
});
queue = new RateLimitedPushQueue(queue, limiter, options);
}
return new AttenuatedQueue(queue, options);
}
flushQueue(name) {
return Q()
.then(this.deleteQueue.bind(this, name))
.then(this.createQueue.bind(this, name));
}
deleteQueue(name) {
const deferred = Q.defer();
this.serviceBusService.deleteQueue(name, error => {
if (error) {
return deferred.reject(error);
}
deferred.resolve();
});
return deferred.promise;
}
createQueue(name) {
const options = {
EnablePartitioning: true,
LockDuration: 'PT5M',
DefaultMessageTimeToLive: 'P10675199D',
MaxDeliveryCount: '10000000'
};
const deferred = Q.defer();
this.serviceBusService.createQueueIfNotExists(name, options, (error, created, response) => {
if (error) {
return deferred.reject(error);
}
deferred.resolve(response.body);
});
return deferred.promise;
}
getInfo(name) {
// TODO see if this works for subscriptions?
const deferred = Q.defer();
this.serviceBusService.getQueue(name, (error, queue) => {
if (error) {
if (error.code === 'QueueNotFound') {
return deferred.resolve(null);
}
return deferred.reject(error);
}
// length of queue (active messages ready to read)
let activeMessageCount;
try {
activeMessageCount = queue.CountDetails['d2p1:ActiveMessageCount'];
} catch (e) {
activeMessageCount = 0;
}
deferred.resolve({ count: activeMessageCount });
});
return deferred.promise;
}
}
module.exports = ServiceBusQueueManager;
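
With pushRateLimit set, createQueueChain adds a RateLimitedPushQueue beneath the outer AttenuatedQueue; a sketch (connection values, logger, and the _config stub are assumptions -- Amqp10Queue subscribes to the config object's 'changed' event):

const ServiceBusQueueManager = require('./serviceBusQueueManager');
const manager = new ServiceBusQueueManager(amqpUrl, managementEndpoint);
const queue = manager.createQueueChain('events', null, {
  queueName: 'crawler',
  pushRateLimit: 300,          // at most 300 pushes...
  pushRateWindow: 2,           // ...per 2-second window; overflow retries after window/4
  credit: 10,                  // AMQP link credit
  messageSize: 200,            // KB; multiplied by 1024 for maxMessageSize
  attenuation: { ttl: 1000 },
  logger,
  _config: { on: () => { } }
});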


@@ -0,0 +1,65 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const NestedQueue = require('./nestedQueue');
const Q = require('q');
const qlimit = require('qlimit');
const debug = require('debug')('crawler:queuing:trackedqueue');
debug.log = console.info.bind(console);
class TrackedQueue extends NestedQueue {
constructor(queue, tracker, options) {
super(queue);
this.tracker = tracker;
this.options = options;
this.logger = options.logger;
}
push(requests) {
debug('push: enter');
const self = this;
requests = Array.isArray(requests) ? requests : [requests];
return Q.all(requests.map(qlimit(self.options.parallelPush || 1)(request => {
return self.tracker.track(request, self.queue.push.bind(self.queue));
}))).then(result => {
debug('push: exit (success)');
return result;
});
}
pop() {
debug('pop: enter');
const self = this;
return this.queue.pop().then(request => {
if (!request) {
debug('pop: exit (no request)');
return null;
}
return self.tracker.untrack(request).then(
() => {
debug('pop: exit (untracked)');
return request;
},
error => {
// if we cannot untrack, abandon the popped message and fail the pop.
return self.abandon(request).finally(() => {
debug('pop: exit (abandoned)');
throw error;
});
});
});
}
flush() {
debug('flush: enter');
const self = this;
return this.tracker.flush().then(() => {
return self.queue.flush();
}).then(result => {
debug('flush: exit (success)');
return result;
});
}
}
module.exports = TrackedQueue;


@@ -0,0 +1,109 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const moment = require('moment');
const uuid = require('node-uuid');
const Q = require('q');
class DeltaStore {
constructor(baseStore, blobService, name, options) {
this.baseStore = baseStore;
this.service = blobService;
this.name = name;
this.options = options;
this.blobPromise = null;
this.blobSequenceNumber = 1;
this.uniqueBlobId = uuid.v4(); // Avoid clashes in multi-process environments
this.currentHour = moment.utc().format('HH');
}
connect() {
return this.baseStore.connect().then(() => {
return this._createContainer(this.name);
});
}
upsert(document) {
return this.baseStore.upsert(document).then(() => {
const text = JSON.stringify(document) + '\n';
return this._append(text);
});
}
get(type, key) {
return this.baseStore.get(type, key);
}
etag(type, key) {
return this.baseStore.etag(type, key);
}
close() {
return this.baseStore.close();
}
_createContainer(name) {
const createContainerIfNotExists = Q.nbind(this.service.createContainerIfNotExists, this.service);
return createContainerIfNotExists(name);
}
_append(text) {
return this._azureAppend(this.name, this._getBlobName(), text).catch(error => {
// If this is a non-recoverable error rethrow
if (error.statusCode !== 404 && error.statusCode !== 409) {
throw error;
}
// if a new blob is being created, wait for that to finish and then append our text
// be sure to recurse here as the newly created blob may have been on the time unit boundary
// and this new content should be written in the new time block.
if (this.blobPromise) {
return this.blobPromise.then(() => {
return this._append(text);
});
}
this.blobPromise = this._createBlob(text).finally(() => {
this.blobPromise = null;
});
return this.blobPromise;
});
}
_createBlob(text) {
// Walk over the sequence of blobs until we find one that can take the text. Create a new blob if needed.
// First try to append to the current blob to ensure we are not overwriting something
return this._azureAppend(this.name, this._getBlobName(), text).catch(error => {
if (error.statusCode === 409) {
this.blobSequenceNumber++;
return this._createBlob(text);
}
if (error.statusCode === 404) {
return this._azureCreate(this.name, this._getBlobName(), text);
}
throw error;
});
}
_azureCreate(containerName, blobName, text) {
return Q.nbind(this.service.createAppendBlobFromText, this.service)(containerName, blobName, text);
}
_azureAppend(containerName, blobName, text) {
return Q.nbind(this.service.appendBlockFromText, this.service)(containerName, blobName, text);
}
_getBlobName() {
const now = moment.utc();
const year = now.format('YYYY');
const month = now.format('MM');
const day = now.format('DD');
const hour = now.format('HH');
const formatted = now.format('YYYY_MM_DD_HH');
if (hour !== this.currentHour) {
this.currentHour = hour;
this.blobSequenceNumber = 1;
}
return `v1/${year}/${month}/${day}/${formatted}_${this.blobSequenceNumber}_${this.uniqueBlobId}.json`;
}
}
module.exports = DeltaStore;
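
A write-path sketch (the base store, connection string, and document are assumptions; the in-memory doc store below would serve as a minimal base): each upsert writes to the base store and then appends one JSON line to an hourly append blob, rolling the sequence number on 409 conflicts and hour boundaries.

const azure = require('azure-storage');
const DeltaStore = require('./deltaStore');
const blobService = azure.createBlobService(process.env.AZURE_STORAGE_CONNECTION_STRING);
const store = new DeltaStore(baseStore, blobService, 'deltas', {});
store.connect()
  .then(() => store.upsert(document))
  // blob names look like v1/2017/03/09/2017_03_09_16_1_<uuid>.json
  .then(() => store.close());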


@@ -0,0 +1,75 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const Q = require('q');
class InmemoryDocStore {
constructor() {
this.collections = {};
}
connect() {
return Q(null);
}
upsert(document) {
const selfHref = document._metadata.links.self.href;
const type = document._metadata.type;
let collection = this.collections[type];
if (!collection) {
collection = {};
this.collections[type] = collection;
}
collection[selfHref] = document;
return Q(document);
}
get(type, url) {
// TODO interesting question as to what a mongo store would do if the doc does not exist.
const collection = this.collections[type];
if (!collection) {
return Q.reject();
}
return collection[url] ? Q(collection[url]) : Q.reject();
}
etag(type, url) {
// TODO interesting question as to what a mongo store would do if the doc does not exist.
const collection = this.collections[type];
if (!collection) {
return Q(null);
}
let result = collection[url];
result = result ? result._metadata.etag : null;
return Q(result);
}
listDocuments(pattern) {
const result = [];
for (let type in this.collections) {
const collection = this.collections[type];
for (let url in collection) {
result.push(collection[url]._metadata);
}
}
return Q(result);
}
delete(type, url) {
const collection = this.collections[type];
if (!collection) {
return Q(null);
}
delete collection[url];
return Q(true);
}
count(pattern) {
return this.listDocuments(pattern).then(results => { return results.length; });
}
close() {
this.collections = {};
}
}
module.exports = InmemoryDocStore;


@@ -0,0 +1,82 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const memoryCache = require('memory-cache');
const Mongo = require('mongodb');
const promiseRetry = require('promise-retry');
const Q = require('q');
class MongoDocStore {
constructor(url, options) {
this.url = url;
this.options = options;
this.client = Mongo.MongoClient;
}
connect() {
return promiseRetry((retry, number) => {
return this.client.connect(this.url).then(db => {
this.db = db;
})
.catch(retry);
});
}
upsert(document) {
const selfHref = document._metadata.links.self.href;
const collection = this.db.collection(document._metadata.type);
return collection.updateOne({ '_metadata.links.self.href': selfHref }, document, { upsert: true }).then(result => {
memoryCache.put(document._metadata.url, { etag: document._metadata.etag, document: document }, this.options.ttl);
return result;
});
}
get(type, url) {
const cached = memoryCache.get(url);
if (cached) {
return Q(cached.document);
}
return this.db.collection(type).findOne({ '_metadata.url': url }).then(value => {
if (value) {
memoryCache.put(url, { etag: value._metadata.etag, document: value }, this.options.ttl);
return value;
}
return null;
});
}
etag(type, url) {
const cached = memoryCache.get(url);
if (cached) {
return Q(cached.etag);
}
return this.db.collection(type).findOne({ '_metadata.url': url }).then(value => {
if (value) {
memoryCache.put(url, { etag: value._metadata.etag, document: value }, this.options.ttl);
return value._metadata.etag;
}
return null;
});
}
listDocuments(pattern) {
// TODO implement
return Q([]);
}
delete(type, url) {
// TODO implement
return Q(true);
}
count(pattern) {
// TODO implement
return Q(0);
}
close() {
this.db.close();
}
}
module.exports = MongoDocStore;


@@ -0,0 +1,218 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const async = require('async');
const azure = require('azure-storage');
const memoryCache = require('memory-cache');
const Q = require('q');
const URL = require('url');
class AzureStorageDocStore {
constructor(blobService, name, options) {
this.service = blobService;
this.name = name;
this.options = options;
this._getBlobNameFromKey = this.options.blobKey === 'url' ? this._getBlobNameFromUrl : this._getBlobNameFromUrn;
}
connect() {
return this._createContainer(this.name);
}
_createContainer(name) {
const deferred = Q.defer();
this.service.createContainerIfNotExists(name, (error, result, response) => {
if (error) {
return deferred.reject(error);
}
deferred.resolve(this.service);
});
return deferred.promise;
}
upsert(document) {
const deferred = Q.defer();
const blobName = this._getBlobNameFromDocument(document);
const text = JSON.stringify(document);
const blobMetadata = {
version: document._metadata.version,
etag: document._metadata.etag,
type: document._metadata.type,
url: document._metadata.url,
urn: document._metadata.links.self.href,
fetchedAt: document._metadata.fetchedAt,
processedAt: document._metadata.processedAt
};
if (document._metadata.extra) {
blobMetadata.extra = JSON.stringify(document._metadata.extra);
}
const options = { metadata: blobMetadata, contentSettings: { contentType: 'application/json' } };
this.service.createBlockBlobFromText(this.name, blobName, text, options, (error, result, response) => {
if (error) {
return deferred.reject(error);
}
memoryCache.put(document._metadata.url, { etag: document._metadata.etag, document: document }, this.options.ttl);
deferred.resolve(blobName);
});
return deferred.promise;
}
get(type, key) {
const cached = memoryCache.get(key);
if (cached) {
return Q(cached.document);
}
const deferred = Q.defer();
const blobName = this._getBlobNameFromKey(type, key);
this.service.getBlobToText(this.name, blobName, (error, text, blob, response) => {
if (error) {
return deferred.reject(error);
}
const result = JSON.parse(text);
memoryCache.put(key, { etag: result._metadata.etag, document: result }, this.options.ttl);
deferred.resolve(result);
});
return deferred.promise;
}
etag(type, key) {
const cached = memoryCache.get(key);
if (cached) {
return Q(cached.etag);
}
const deferred = Q.defer();
const blobName = this._getBlobNameFromKey(type, key);
this.service.getBlobMetadata(this.name, blobName, (error, blob, response) => {
deferred.resolve(error ? null : blob.metadata.etag);
});
return deferred.promise;
}
list(pattern) {
const blobPattern = this._getBlobPathFromUrn(null, pattern);
var entries = [];
var continuationToken = null;
const deferred = Q.defer();
async.doWhilst(
callback => {
var started = new Date().getTime();
this.service.listBlobsSegmentedWithPrefix(this.name, blobPattern, continuationToken, { include: azure.BlobUtilities.BlobListingDetails.METADATA, location: azure.StorageUtilities.LocationMode.PRIMARY_THEN_SECONDARY }, function (err, result, response) {
// metricsClient.trackDependency(url.parse(blobService.host.primaryHost).hostname, 'listBlobsSegmented', (new Date().getTime() - started), !err, "Http", { 'Container name': 'download', 'Continuation token present': result == null ? false : (result.continuationToken != null), 'Blob count': result == null ? 0 : result.entries.length });
if (err) {
continuationToken = null;
// metricsClient.trackError(err);
return callback(err);
}
continuationToken = result.continuationToken || null;
entries = entries.concat(result.entries.map(entry => {
const blobMetadata = entry.metadata;
if (blobMetadata.extra) {
blobMetadata.extra = JSON.parse(blobMetadata.extra);
}
return blobMetadata;
}));
callback(null);
});
},
function () {
return continuationToken !== null && entries.length < 10000;
},
function (err) {
if (err) {
return deferred.reject(err);
}
deferred.resolve(entries);
});
return deferred.promise;
}
delete(type, key) {
const deferred = Q.defer();
const blobName = this._getBlobNameFromKey(type, key);
this.service.deleteBlob(this.name, blobName, (error, response) => {
if (error) {
return deferred.reject(error);
}
deferred.resolve(true);
});
return deferred.promise;
}
count(pattern, force = false) {
const key = `${this.name}:count:${pattern || ''}`;
if (!force) {
const cachedCount = memoryCache.get(key);
if (cachedCount) {
return Q(cachedCount);
}
}
const blobPattern = this._getBlobPathFromUrn(null, pattern);
var entryCount = 0;
var continuationToken = null;
const deferred = Q.defer();
async.doWhilst(
callback => {
this.service.listBlobsSegmentedWithPrefix(this.name, blobPattern, continuationToken, { location: azure.StorageUtilities.LocationMode.PRIMARY_THEN_SECONDARY }, function (err, result, response) {
if (err) {
continuationToken = null;
return callback(err);
}
continuationToken = result.continuationToken || null;
entryCount += result.entries.length;
callback(null);
});
},
function () {
return continuationToken !== null;
},
function (err) {
if (err) {
return deferred.reject(err);
}
memoryCache.put(key, entryCount, 60000);
deferred.resolve(entryCount);
});
return deferred.promise;
}
close() {
return Q();
}
_getBlobNameFromDocument(document) {
const type = document._metadata.type;
if (this.options.blobKey === 'url') {
return this._getBlobNameFromUrl(type, document._metadata.url);
}
return this._getBlobNameFromUrn(type, document._metadata.links.self.href);
}
_getBlobNameFromUrl(type, url) {
if (!(url.startsWith('http:') || url.startsWith('https:'))) {
return url;
}
const parsed = URL.parse(url, true);
return `${type}${parsed.path.toLowerCase()}.json`;
}
_getBlobPathFromUrn(type, urn) {
if (!urn) {
return '';
}
if (!urn.startsWith('urn:')) {
return urn;
}
const pathed = urn.startsWith('urn:') ? urn.slice(4) : urn;
return pathed.replace(/:/g, '/').toLowerCase();
}
_getBlobNameFromUrn(type, urn) {
if (!urn.startsWith('urn:')) {
return urn;
}
return `${this._getBlobPathFromUrn(type, urn)}.json`;
}
}
module.exports = AzureStorageDocStore;


@@ -0,0 +1,72 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const Q = require('q');
class UrltoUrnMappingStore {
constructor(baseStore, redisClient, name, options) {
this.baseStore = baseStore;
this.redisClient = redisClient;
this.name = name;
this.options = options;
}
connect() {
return this.baseStore.connect();
}
upsert(document) {
return this.baseStore.upsert(document).then(blobName => {
const url = document._metadata.url;
const urn = document._metadata.links.self.href;
const deferred = Q.defer();
this.redisClient.hmset(this.name, [urn, blobName, url, blobName], this._callbackToPromise(deferred));
return deferred.promise;
});
}
get(type, url) {
return this._getUrnForUrl(url).then(urn => {
if (!urn) {
throw new Error(`Document not found at ${url}`);
}
return this.baseStore.get(type, urn);
});
}
etag(type, url) {
return this._getUrnForUrl(url).then(urn => {
return urn ? this.baseStore.etag(type, urn) : null;
});
}
list(pattern) {
return this.baseStore.list(pattern);
}
delete(type, url) {
return this.baseStore.delete(type, url);
}
count(pattern) {
return this.baseStore.count(pattern);
}
close() {
return this.baseStore.close();
}
_getUrnForUrl(url) {
const deferred = Q.defer();
this.redisClient.hget(this.name, url, this._callbackToPromise(deferred));
return deferred.promise;
}
_callbackToPromise(deferred) {
return (error, value) => {
return error ? deferred.reject(error) : deferred.resolve(value);
};
}
}
module.exports = UrltoUrnMappingStore;

52 routes/config.js Normal file

@@ -0,0 +1,52 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const auth = require('../middleware/auth');
const express = require('express');
const Q = require('q');
const wrap = require('../middleware/promiseWrap');
let crawlerService = null;
const router = express.Router();
router.patch('/', auth.validate, wrap(function* (request, response, next) {
const sorted = collectPatches(request.body);
yield Q.all(Object.getOwnPropertyNames(sorted).map(key => {
return crawlerService.options[key]._config.apply(sorted[key]);
}));
response.sendStatus(200);
}));
router.get('/', auth.validate, function (request, response, next) {
const result = Object.assign({}, crawlerService.options);
Object.getOwnPropertyNames(result).forEach(subsystemName => {
result[subsystemName] = Object.assign({}, result[subsystemName]);
delete result[subsystemName]._config;
delete result[subsystemName].logger;
});
response.json(result).status(200).end();
});
router.post('/tokens', auth.validate, (request, response, next) => {
const body = request.body;
crawlerService.fetcher.tokenFactory.setTokens(body);
response.sendStatus(200);
});
function setup(service) {
crawlerService = service;
return router;
}
function collectPatches(patches) {
return patches.reduce((result, patch) => {
const segments = patch.path.split('/');
const key = segments[1];
result[key] = result[key] || [];
patch.path = '/' + segments.slice(2).join('/');
result[key].push(patch);
return result;
}, {});
}
module.exports = setup;
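
For reference, collectPatches strips the leading path segment from each JSON patch and groups the patches by that subsystem name; a sketch of its effect (values illustrative):

collectPatches([
  { op: 'replace', path: '/crawler/count', value: 2 },
  { op: 'replace', path: '/queuing/credit', value: 50 }
]);
// => { crawler: [ { op: 'replace', path: '/count', value: 2 } ],
//      queuing: [ { op: 'replace', path: '/credit', value: 50 } ] }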

44 routes/deadletters.js Normal file

@@ -0,0 +1,44 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const auth = require('../middleware/auth');
const express = require('express');
const expressJoi = require('express-joi');
const Request = require('ghcrawler').request;
const wrap = require('../middleware/promiseWrap');
let crawlerService = null;
const router = express.Router();
router.head('/', auth.validate, wrap(function* (request, response) {
const count = yield crawlerService.getDeadletterCount();
response.setHeader('X-Total-Count', count);
response.status(204).end();
}));
router.get('/', auth.validate, wrap(function* (request, response) {
const requests = yield crawlerService.listDeadletters();
response.setHeader('X-Total-Count', requests.length);
response.json(requests);
}));
router.get('/:urn', auth.validate, wrap(function* (request, response) {
const document = yield crawlerService.getDeadletter(request.params.urn);
response.json(document);
}));
router.delete('/:urn', auth.validate, wrap(function* (request, response) {
let requeue = request.query.requeue;
if (requeue) {
yield crawlerService.requeueDeadletter(request.params.urn, requeue);
} else {
yield crawlerService.deleteDeadletter(request.params.urn);
}
response.status(204).end();
}));
function setup(service) {
crawlerService = service;
return router;
}
module.exports = setup;

31 routes/queues.js Normal file

@@ -0,0 +1,31 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const auth = require('../middleware/auth');
const express = require('express');
const wrap = require('../middleware/promiseWrap');
let crawlerService = null;
const router = express.Router();
router.put('/:name', auth.validate, wrap(function* (request, response) {
const result = yield crawlerService.flushQueue(request.params.name);
if (!result) {
return response.sendStatus(404);
}
response.sendStatus(200);
}));
router.get('/:name/info', auth.validate, wrap(function* (request, response) {
const info = yield crawlerService.getQueueInfo(request.params.name);
if (!info) {
return response.sendStatus(404);
}
response.json(info);
}));
function setup(service) {
crawlerService = service;
return router;
}
module.exports = setup;

87 routes/requests.js Normal file

@@ -0,0 +1,87 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const auth = require('../middleware/auth');
const express = require('express');
const expressJoi = require('express-joi');
const Request = require('ghcrawler').request;
const TraversalPolicy = require('ghcrawler').traversalPolicy;
const wrap = require('../middleware/promiseWrap');
const requestsSchema = {
queue: expressJoi.Joi.types.String().alphanum().min(2).max(50).required(),
count: expressJoi.Joi.types.Number().integer().min(0).max(100)
};
const queueSchema = {
name: expressJoi.Joi.types.String().alphanum().min(2).max(50).required()
};
let crawlerService = null;
const router = express.Router();
router.post('/:queue?', auth.validate, wrap(function* (request, response) {
const result = yield queueRequests(request.body, request.params.queue || 'normal');
if (!result) {
return response.sendStatus(404);
}
response.sendStatus(201);
}));
router.get('/:queue', auth.validate, expressJoi.joiValidate(requestsSchema), wrap(function* (request, response) {
const requests = yield crawlerService.getRequests(request.params.queue, parseInt(request.query.count, 10), false);
if (!requests) {
return response.sendStatus(404);
}
response.json(requests);
}));
router.delete('/:queue', auth.validate, expressJoi.joiValidate(requestsSchema), wrap(function* (request, response) {
const requests = yield crawlerService.getRequests(request.params.queue, parseInt(request.query.count, 10), true);
if (!requests) {
return response.sendStatus(404);
}
response.json(requests);
}));
function queueRequests(requestSpecs, queueName) {
requestSpecs = Array.isArray(requestSpecs) ? requestSpecs : [requestSpecs];
const requests = requestSpecs.map(spec => rationalizeRequest(spec));
return crawlerService.queue(requests, queueName).catch(error => {
if (error.message && error.message.startsWith('Queue not found')) {
return null;
}
throw error;
});
}
function rationalizeRequest(request) {
// String specs are expanded into full request objects before adoption.
if (typeof request === 'string') {
request = buildRequestFromSpec(request);
}
return Request.adopt(request);
}
function buildRequestFromSpec(spec) {
let crawlType = null;
let crawlUrl = 'https://api.github.com/';
if (spec.indexOf('/') > -1) {
crawlType = 'repo';
crawlUrl += 'repos/' + spec;
} else {
crawlType = 'org';
crawlUrl += 'orgs/' + spec;
}
return {
"type": crawlType,
"url": crawlUrl,
"policy": "default"
};
}
function setup(service) {
crawlerService = service;
return router;
}
module.exports = setup;
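A POST body may be a single spec or an array, and each spec is either a full request object or a shorthand string: a bare name queues an org crawl, a name containing a slash queues a repo crawl. Illustrative payloads (the org and repo names are placeholders):

// Shorthand string specs:
// 'contoso' => { type: 'org', url: 'https://api.github.com/orgs/contoso', policy: 'default' }
// 'contoso/widgets' => { type: 'repo', url: 'https://api.github.com/repos/contoso/widgets', policy: 'default' }
const shorthand = ['contoso', 'contoso/widgets'];
// Equivalent explicit form for the org case; both shapes pass through Request.adopt.
const explicit = { type: 'org', url: 'https://api.github.com/orgs/contoso', policy: 'default' };
// Either can be POSTed to /requests, or to /requests/:queue to target a specific queue.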

27
routes/status.js Normal file

@@ -0,0 +1,27 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const auth = require('../middleware/auth');
const express = require('express');
let crawlerService = null;
const router = express.Router();
router.get('/', auth.validate, function (request, response, next) {
// Get some of the live, non-configurable values and put them at the root of the result
const result = {};
result.actualCount = crawlerService.status();
const loop = crawlerService.loops[0];
if (loop) {
result.delay = loop.options.delay || 0;
}
response.status(200).send(result);
});
function setup(service) {
crawlerService = service;
return router;
}
module.exports = setup;

71
routes/webhook.js Normal file

@@ -0,0 +1,71 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const crypto = require('crypto');
const express = require('express');
const moment = require('moment');
const Request = require('ghcrawler').request;
const wrap = require('../middleware/promiseWrap');
let crawlerService = null;
let webhookSecret = null;
const router = express.Router();
router.post('/', wrap(function* (request, response, next) {
if (crawlerService.options.queuing.events.provider !== 'webhook') {
return warn(request, response, 'Webhooks not enabled');
}
getLogger().verbose('Received', `Webhook event`, {delivery: request.headers['x-github-delivery']});
const signature = request.headers['x-hub-signature'];
const eventType = request.headers['x-github-event'];
if (!signature || !eventType) {
return fatal(request, response, 'Missing signature or event type on GitHub webhook');
}
const data = request.body;
const computedSignature = 'sha1=' + crypto.createHmac('sha1', webhookSecret).update(data).digest('hex');
// timingSafeEqual throws on length mismatch, so check the length first.
if (signature.length !== computedSignature.length || !crypto.timingSafeEqual(Buffer.from(signature), Buffer.from(computedSignature))) {
return fatal(request, response, 'X-Hub-Signature does not match blob signature');
}
const event = JSON.parse(data);
const eventsUrl = event.repository ? event.repository.events_url : event.organization.events_url;
const result = new Request('event_trigger', eventsUrl);
result.payload = { body: event, etag: 1, fetchedAt: moment.utc().toISOString() };
// Requests coming directly off the event feed do not need exclusivity.
result.requiresLock = false;
// if the event is for a private repo, mark the request as needing private access.
if (event.repository && event.repository.private) {
result.context.repoType = 'private';
}
yield crawlerService.queue(result, 'events');
getLogger().info('Queued', `Webhook event for ${eventsUrl}`, {delivery: request.headers['x-github-delivery']});
response.status(200).end();
}));
function warn(request, response, message) {
getLogger().warn(message, { delivery: request.headers['x-github-delivery'] });
response.status(500);
response.setHeader('content-type', 'text/plain');
response.end(JSON.stringify(message));
}
function fatal(request, response, error) {
getLogger().error(error, { delivery: request.headers['x-github-delivery']});
response.status(400);
response.setHeader('content-type', 'text/plain');
response.end(JSON.stringify(error));
}
function getLogger() {
return crawlerService.crawler.logger;
}
function setup(service, secret) {
crawlerService = service;
webhookSecret = secret;
return router;
}
module.exports = setup;
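The handler only accepts deliveries whose X-Hub-Signature matches the HMAC-SHA1 of the raw body under the shared secret. A sketch of producing a valid test delivery (the secret and payload are placeholders):

const crypto = require('crypto');
const secret = 'test-secret'; // must match the secret handed to setup(service, secret)
const body = JSON.stringify({ organization: { events_url: 'https://api.github.com/orgs/test/events' } });
// GitHub prefixes the hex digest with the algorithm name.
const signature = 'sha1=' + crypto.createHmac('sha1', secret).update(body).digest('hex');
// POST the body with headers:
//   x-hub-signature: <signature>
//   x-github-event: organization
//   x-github-delivery: <any unique id>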


@@ -0,0 +1,181 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const Amqp10Queue = require('../../providers/queuing/amqp10Queue');
const config = require('painless-config');
const expect = require('chai').expect;
const CrawlerFactory = require('../../lib/crawlerFactory');
const Q = require('q');
const Request = require('ghcrawler').request;
const url = config.get('CRAWLER_AMQP10_URL'); // URL should be: amqps://<keyName>:<key>@<host>
const name = 'test';
const formatter = message => {
Request.adopt(message);
return message;
};
const options = {
logger: CrawlerFactory.createLogger(true, 'silly'),
queueName: 'ghcrawler',
credit: 2,
_config: { on: () => { } }
};
describe('AMQP 1.0 Integration', () => {
before(() => {
if (!url) {
throw new Error('CRAWLER_AMQP10_URL not configured.');
}
return drainTestQueue(100);
});
it('Should pop no message if the queue is empty', () => {
const amqpQueue = new Amqp10Queue(url, name, formatter, options);
return amqpQueue.subscribe().then(() => {
return amqpQueue.pop().then(message => {
expect(message).to.be.null;
return amqpQueue.unsubscribe();
});
});
});
it('Should push, pop and ack a message', (done) => {
const amqpQueue = new Amqp10Queue(url, name, formatter, options);
amqpQueue.subscribe().then(() => {
let msg = new Request('user', 'http://test.com/users/user1');
amqpQueue.push(msg).then(() => {
setTimeout(() => {
amqpQueue.pop().then(message => {
expect(message).to.exist;
expect(message instanceof Request).to.be.true;
amqpQueue.done(message).then(() => {
amqpQueue.unsubscribe().then(() => done());
});
});
}, 500);
});
});
});
it('Should push, pop and ack a message, then pop no message from the empty queue', (done) => {
const amqpQueue = new Amqp10Queue(url, name, formatter, options);
amqpQueue.subscribe().then(() => {
let msg = new Request('user', 'http://test.com/users/user2');
amqpQueue.push(msg).then(() => {
setTimeout(() => {
amqpQueue.pop().then(message => {
expect(message).to.exist;
expect(message instanceof Request).to.be.true;
amqpQueue.done(message).then(() => {
amqpQueue.pop().then(emptyMessage => {
expect(emptyMessage).to.be.null;
amqpQueue.unsubscribe().then(() => done());
});
});
});
}, 500);
});
});
});
it('Should push, pop, abandon, pop and ack a message', (done) => {
const amqpQueue = new Amqp10Queue(url, name, formatter, options);
amqpQueue.subscribe().then(() => {
let msg = new Request('user', 'http://test.com/users/user3');
amqpQueue.push(msg).then(() => {
setTimeout(() => {
amqpQueue.pop().then(message => {
expect(message).to.exist;
expect(message instanceof Request).to.be.true;
amqpQueue.abandon(message).then(() => {
setTimeout(() => {
amqpQueue.pop().then(abandonedMessage => {
expect(abandonedMessage).to.exist;
expect(abandonedMessage instanceof Request).to.be.true;
amqpQueue.done(abandonedMessage).then(() => {
amqpQueue.unsubscribe().then(() => done());
});
});
}, 500);
});
});
}, 500);
});
});
});
it('Should subscribe, unsubscribe, subscribe, push, pop, ack.', (done) => {
const amqpQueue = new Amqp10Queue(url, name, formatter, options);
const msg = new Request('user', 'http://test.com/users/user4');
amqpQueue.subscribe().delay(200).then(() => {
amqpQueue.unsubscribe().then(() => {
amqpQueue.subscribe().delay(200).then(() => {
amqpQueue.push(msg).delay(1000).then(() => {
amqpQueue.pop().then(message => {
expect(message).to.be.not.null;
amqpQueue.done(message).then(() => {
amqpQueue.unsubscribe().then(() => done());
});
});
});
});
});
});
});
it('Should push without connecting, fail, then try unsubscribing', (done) => {
const amqpQueue = new Amqp10Queue(url, name, formatter, options);
const msg = new Request('user', 'http://test.com/users/user4');
amqpQueue.push(msg).then(message => { }, reason => {
expect(reason).to.be.not.null;
amqpQueue.unsubscribe().then(() => done());
});
});
it('Should push pop and ack 10 messages when initial credit is 10', () => {
const pushPromises = [];
const popPromises = [];
options.credit = 10;
const amqpQueue = new Amqp10Queue(url, name, formatter, options);
return amqpQueue.subscribe().delay(2000).then(() => {
for (let i = 1; i <= 10; i++) {
let msg = new Request('user', 'http://test.com/users/user' + i);
pushPromises.push(amqpQueue.push(msg));
}
return Q.all(pushPromises).then(() => {
for (let i = 1; i <= 10; i++) {
popPromises.push(amqpQueue.pop().then(message => {
expect(message).to.exist;
expect(message instanceof Request).to.be.true;
return amqpQueue.done(message);
}));
}
return Q.all(popPromises).then(() => {
return amqpQueue.unsubscribe();
});
});
});
});
});
function drainTestQueue(numOfMessages) {
console.log('Draining the test queue.');
const deferred = Q.defer();
const popPromises = [];
options.credit = numOfMessages;
const amqpQueue = new Amqp10Queue(url, name, formatter, options);
amqpQueue.subscribe().then(() => {
setTimeout(() => { // Wait for messages to be read.
for (let i = 0; i < numOfMessages; i++) {
popPromises.push(amqpQueue.pop().then(message => {
amqpQueue.done(message);
}));
}
Q.all(popPromises).then(() => {
amqpQueue.unsubscribe().then(() => deferred.resolve());
});
}, 2000);
});
return deferred.promise;
}


@@ -0,0 +1,63 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const expect = require('chai').expect;
const CrawlerFactory = require('../../lib/crawlerFactory');
const Q = require('q');
const qlimit = require('qlimit');
const sinon = require('sinon');
let deltaStore;
describe('Delta Store Integration', function () {
this.timeout(5000);
before(() => {
const baseStore = {
connect: () => logAndResolve('connect'),
upsert: () => logAndResolve('upsert'),
get: () => logAndResolve('get'),
etag: () => logAndResolve('etag'),
close: () => logAndResolve('close')
};
deltaStore = CrawlerFactory.createDeltaStore(baseStore);
});
it('Should connect, get, etag and close', () => {
return Q.all([
deltaStore.connect(),
deltaStore.get('test', 'test'),
deltaStore.etag('test', 'test'),
deltaStore.close()
]);
});
it('Should connect and upsert twice', () => {
return deltaStore.connect()
.then(() => { return deltaStore.upsert({ test: process.hrtime().join(' ') }); })
.then(() => { return deltaStore.upsert({ test: process.hrtime().join(' ') }); });
});
it('Should connect and upsert many times', () => {
sinon.spy(deltaStore, '_azureAppend');
const document = { abc: 1 };
const docs = [];
for (let i = 0; i < 50; i++) {
docs.push(document);
}
let counter = 0;
return deltaStore.connect().then(() => {
return Q.all(docs.map(qlimit(10)(doc => {
console.log(++counter);
return deltaStore.upsert(doc);
})));
}).then(() => {
expect(deltaStore._azureAppend.callCount).to.be.equal(50);
});
});
});
function logAndResolve(name) {
console.log(`Called baseStore.${name}()`);
return Q();
}


@@ -0,0 +1,98 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const expect = require('chai').expect;
const DeltaStore = require('../../providers/storage/deltaStore');
const Q = require('q');
const sinon = require('sinon');
let baseStore;
describe('Logging Store', () => {
beforeEach(() => {
baseStore = {
connect: sinon.spy(() => Q()),
upsert: sinon.spy(() => Q()),
get: sinon.spy(() => Q()),
etag: sinon.spy(() => Q()),
close: sinon.spy(() => Q())
};
});
afterEach(() => {
baseStore.connect.reset();
baseStore.upsert.reset();
baseStore.get.reset();
baseStore.etag.reset();
baseStore.close.reset();
});
it('Should connect, get, etag and close', () => {
let blobService = {
createContainerIfNotExists: sinon.spy((name, cb) => { cb(null); })
};
let deltaStore = new DeltaStore(baseStore, blobService, 'test');
return Q.all([
deltaStore.connect(),
deltaStore.get('test', 'test'),
deltaStore.etag('test', 'test'),
deltaStore.close()
]).then(() => {
expect(blobService.createContainerIfNotExists.callCount).to.be.equal(1);
expect(baseStore.connect.callCount).to.be.equal(1);
expect(baseStore.upsert.callCount).to.be.equal(0);
expect(baseStore.get.callCount).to.be.equal(1);
expect(baseStore.etag.callCount).to.be.equal(1);
expect(baseStore.close.callCount).to.be.equal(1);
});
});
it('Should upsert ten times', () => {
let blobService = {
createAppendBlobFromText: sinon.spy((name, blobName, text, cb) => { cb(); }),
appendBlockFromText: sinon.spy((name, blobName, text, cb) => { cb(); })
};
const deltaStore = new DeltaStore(baseStore, blobService, 'test');
const promises = [];
for (let i = 0; i < 10; i++) {
promises.push(deltaStore.upsert({ test: true }));
}
return Q.all(promises).then(() => {
expect(blobService.createAppendBlobFromText.callCount).to.be.equal(0);
expect(blobService.appendBlockFromText.callCount).to.be.equal(10);
expect(baseStore.upsert.callCount).to.be.equal(10);
expect(deltaStore.blobSequenceNumber).to.be.equal(1);
expect(deltaStore.name).to.be.equal('test');
});
});
it('Should create blob if not exists', () => {
const appendResponses = [{ statusCode: 404 }, { statusCode: 404 }, null];
let blobService = {
createAppendBlobFromText: sinon.spy((name, blobName, text, cb) => { cb(); }),
appendBlockFromText: sinon.spy((name, blobName, text, cb) => { cb(appendResponses.shift()); })
};
const deltaStore = new DeltaStore(baseStore, blobService, 'test');
return deltaStore.upsert({ test: true }).then(() => {
expect(blobService.createAppendBlobFromText.callCount).to.be.equal(1);
expect(blobService.appendBlockFromText.callCount).to.be.above(1);
expect(baseStore.upsert.callCount).to.be.equal(1);
expect(deltaStore.blobSequenceNumber).to.be.equal(1);
});
});
it('Should increment blob sequence number', () => {
const appendResponses = [{ statusCode: 409 }, { statusCode: 409 }, { statusCode: 404 }, null];
let blobService = {
createAppendBlobFromText: sinon.spy((name, blobName, text, cb) => { cb(); }),
appendBlockFromText: sinon.spy((name, blobName, text, cb) => { cb(appendResponses.shift()); })
};
const deltaStore = new DeltaStore(baseStore, blobService, 'test');
return deltaStore.upsert({ test: true }).then(() => {
expect(blobService.createAppendBlobFromText.callCount).to.be.equal(1);
expect(blobService.appendBlockFromText.callCount).to.be.above(1);
expect(baseStore.upsert.callCount).to.be.equal(1);
expect(deltaStore.blobSequenceNumber).to.be.equal(2);
});
});
});
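The same mocked blobService pattern extends to other scenarios by seeding appendResponses differently. One hypothetical addition (not part of this commit; whether deltaStore surfaces non-404/409 errors unchanged is an assumption, not something these tests establish):

it('should surface unexpected append errors (hypothetical)', () => {
  const appendResponses = [{ statusCode: 500 }];
  const blobService = {
    createAppendBlobFromText: sinon.spy((name, blobName, text, cb) => { cb(); }),
    appendBlockFromText: sinon.spy((name, blobName, text, cb) => { cb(appendResponses.shift()); })
  };
  const deltaStore = new DeltaStore(baseStore, blobService, 'test');
  return deltaStore.upsert({ test: true }).then(
    () => { throw new Error('expected upsert to reject'); },
    error => expect(error.statusCode).to.be.equal(500));
});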


@@ -0,0 +1,238 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const expect = require('chai').expect;
const CrawlerFactory = require('../../lib/crawlerFactory');
const Q = require('q');
const Request = require('ghcrawler').request;
const sinon = require('sinon');
let crawler = null;
let spies = {};
describe('Simple processing', () => {
it('should process an org and the resources it links to', () => {
return createCrawler().then(newCrawler => {
crawler = newCrawler;
const request = new Request('org', 'https://api.github.com/orgs/test');
return crawler.queue(request)
.then(processOne)
.then(checkDoc.bind(null, 'org', 'urn:org:1', 4))
.then(processOne)
.then(checkDoc.bind(null, 'user', 'urn:user:1', 1))
.then(processOne)
.then(checkDoc.bind(null, 'repos', 'urn:org:1:repos:page:1', 0))
.then(processOne)
.then(checkDoc.bind(null, 'members', 'urn:org:1:members:page:1', 2))
.then(processOne)
.then(checkDoc.bind(null, 'teams', 'urn:org:1:teams:page:1', 1))
.then(processOne)
.then(checkDoc.bind(null, 'repos', 'urn:org:1:repos:page:1', 0))
.then(processOne)
.then(checkDoc.bind(null, 'user', 'urn:user:2', 1)) // queued as a member of the org
.then(processOne)
.then(checkDoc.bind(null, 'team', 'urn:team:20', 2))
.then(processOne)
.then(checkDoc.bind(null, 'repos', 'urn:user:2:repos:page:1', 0))
.then(processOne)
.then(checkDoc.bind(null, 'members', 'urn:team:20:members:page:1', 0))
.then(processOne)
.then(checkDoc.bind(null, 'repos', 'urn:team:20:repos:page:1', 0))
.then(processOne)
.then(processOne)
.then(processOne);
});
});
});
function processOne() {
resetCrawlerSpies(crawler);
return crawler.processOne({ loopName: 'test' });
}
function checkDoc(type, urn, queuedCount) {
const doc = crawler.store.collections[type][urn];
expect(!!doc).to.be.equal(true, urn);
expect(doc._metadata.links.self.href).to.be.equal(urn, urn);
const queued = gatherQueued(spies.queueSpy);
expect(queued.length).to.be.equal(queuedCount, urn);
}
function gatherQueued(spy) {
let result = [];
for (let i = 0; i < spy.callCount; i++) {
result = result.concat(spy.getCall(i).args[0]);
}
return result;
}
function createCrawler() {
const service = CrawlerFactory.createService('InMemory');
return service.ensureInitialized().then(() => {
const crawler = service.crawler;
crawler.options.orgList = null;
crawler.fetcher = new TestFetcher();
return spyOnCrawler(crawler);
});
}
function spyOnCrawler(crawler) {
spies = {};
spies.pushSpy = sinon.spy(crawler.queues, 'push');
spies.queueSpy = sinon.spy(crawler, 'queue');
return crawler;
}
function resetCrawlerSpies(crawler) {
for (let spy in spies) {
spies[spy].reset();
}
return crawler;
}
class TestFetcher {
constructor() {
this.resources = resources;
}
fetch(request) {
const response = this.resources[request.url];
if (!response) {
return Q.reject(new Error('Not found: ' + request.url));
}
response.statusCode = 200;
request.document = response.body;
request.contentOrigin = 'origin';
request.response = response;
return Q(request);
}
}
const resources = {
'https://api.github.com/orgs/test': {
body: {
"id": 1,
"url": "https://api.github.com/orgs/test",
"repos_url": "https://api.github.com/orgs/test/repos",
"members_url": "https://api.github.com/orgs/test/members{/member}",
}
},
'https://api.github.com/orgs/test/repos': {
body: [
// {
// "url": "https://api.github.com/repos/test/repo1",
// }
]
},
'https://api.github.com/users/test/repos': {
body: []
},
'https://api.github.com/users/test': {
body:
{
"id": 1,
"url": "https://api.github.com/users/test",
"repos_url": "https://api.github.com/users/test/repos",
}
},
'https://api.github.com/users/user2': {
body:
{
"id": 2,
"url": "https://api.github.com/users/user2",
"repos_url": "https://api.github.com/users/user2/repos",
}
},
'https://api.github.com/users/user2/repos': {
body: [
// {
// "url": "https://api.github.com/repos/user2/repo2",
// }
]
},
'https://api.github.com/teams/20': {
body:
{
"id": 20,
"members_url": "https://api.github.com/teams/20/members{/member}",
"repositories_url": "https://api.github.com/teams/20/repos",
"members_count": 3,
"repos_count": 10,
"organization": {
"id": 1,
"url": "https://api.github.com/orgs/test",
}
}
},
'https://api.github.com/teams/20/repos': {
body: []
},
'https://api.github.com/teams/20/members': {
body: []
},
'https://api.github.com/repos/test/repo1': {
body: {
"id": 10,
"owner": {
"id": 1,
"url": "https://api.github.com/users/test",
},
"collaborators_url": "http://api.github.com/repos/test/repo1/collaborators{/collaborator}",
"commits_url": "http://api.github.com/repos/test/repo1/commits{/sha}",
"contributors_url": "http://api.github.com/repos/test/repo1/contributors",
"events_url": "http://api.github.com/repos/test/repo1/events",
"issues_url": "http://api.github.com/repos/test/repo1/issues{/number}",
"pulls_url": "http://api.github.com/repos/test/repo1/pulls{/number}",
"subscribers_url": "http://api.github.com/repos/test/repo1/subscribers",
"teams_url": "http://api.github.com/repos/test/repo1/teams",
"subscribers_count": 42,
"organization": {
"id": 1,
"url": "https://api.github.com/orgs/test",
}
}
}, 'https://api.github.com/repos/user2/repo2': {
body: {
"id": 11,
"owner": {
"id": 2,
"url": "https://api.github.com/users/user2",
},
"collaborators_url": "http://api.github.com/repos/user2/repo2/collaborators{/collaborator}",
"commits_url": "http://api.github.com/repos/user2/repo2/commits{/sha}",
"contributors_url": "http://api.github.com/repos/user2/repo2/contributors",
"events_url": "http://api.github.com/repos/user2/repo2/events",
"issues_url": "http://api.github.com/repos/user2/repo2/issues{/number}",
"pulls_url": "http://api.github.com/repos/user2/repo2/pulls{/number}",
"subscribers_url": "http://api.github.com/repos/user2/repo2/subscribers",
"teams_url": "http://api.github.com/repos/user2/repo2/teams",
}
},
'https://api.github.com/repos/test/repo1/collaborators': {
body: [
{
"url": "https://api.github.com/users/test",
},
{
"url": "https://api.github.com/users/user2",
}
]
},
'https://api.github.com/orgs/test/members': {
body: [
{
"url": "https://api.github.com/users/test",
},
{
"url": "https://api.github.com/users/user2",
}
]
},
'https://api.github.com/orgs/test/teams': {
body: [
{
"url": "https://api.github.com/teams/20",
}
]
}
};


@@ -0,0 +1,476 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const assert = require('chai').assert;
const chai = require('chai');
const expect = require('chai').expect;
const Q = require('q');
const redlock = require('redlock');
const Request = require('ghcrawler').request;
const RequestTracker = require('../../providers/queuing/redisRequestTracker.js');
const sinon = require('sinon');
describe('NON Locking Request Tracker track', () => {
it('should set the tag and call the operation', () => {
const redis = createRedisClient({ get: sinon.spy((key, cb) => { cb(null, null); }), set: sinon.spy((values, cb) => { cb(null); }) });
const locker = createNolock();
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
const operation = sinon.spy(() => { return Q(24); });
return tracker.track(request, operation).then(() => {
expect(operation.callCount).to.be.equal(1);
expect(redis.get.callCount).to.be.equal(1);
expect(redis.set.callCount).to.be.equal(1);
expect(parseInt(redis.set.getCall(0).args[0][1])).to.be.approximately(Date.now(), 10);
});
});
it('should reject and not call the operation if could not read tag', () => {
const redis = createRedisClient({ get: sinon.spy((key, cb) => { cb(new Error('fail!')); }), set: sinon.spy(() => { }) });
const locker = createNolock();
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
const operation = sinon.spy(() => { });
return tracker.track(request, operation).then(
() => assert.fail(),
error => {
expect(error.message).to.be.equal('fail!');
expect(redis.get.callCount).to.be.equal(1);
expect(operation.callCount).to.be.equal(0);
expect(redis.set.callCount).to.be.equal(0);
});
});
it('should not tag if the operation fails', () => {
const redis = createRedisClient({ get: sinon.spy((key, cb) => { cb(null, null); }), set: sinon.spy((values, cb) => { }) });
const locker = createNolock();
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
const operation = sinon.spy(() => { throw new Error('fail!'); });
return tracker.track(request, operation).then(
() => assert.fail(),
error => {
expect(error.message).to.be.equal('fail!');
expect(operation.callCount).to.be.equal(1);
expect(redis.get.callCount).to.be.equal(1);
expect(redis.set.callCount).to.be.equal(0);
});
});
it('should not fail if everything works and tagging fails', () => {
const redis = createRedisClient({ get: sinon.spy((key, cb) => { cb(null, null); }), set: sinon.spy((values, cb) => { cb(new Error('fail!')); }) });
const locker = createNolock();
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
const operation = sinon.spy(() => Q(24));
return tracker.track(request, operation).then(
result => {
expect(result).to.be.equal(24);
expect(operation.callCount).to.be.equal(1);
expect(redis.get.callCount).to.be.equal(1);
expect(redis.set.callCount).to.be.equal(1);
});
});
it('should skip the operation if already tagged', () => {
const redis = createRedisClient({ get: sinon.spy((key, cb) => { cb(null, 13); }) });
const locker = createNolock();
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
const operation = sinon.spy(() => Q(24));
return tracker.track(request, operation).then(
result => {
expect(result).to.not.be.equal(24);
expect(operation.callCount).to.be.equal(0);
expect(redis.get.callCount).to.be.equal(1);
});
});
});
describe('NON locking Request Tracker untrack', () => {
it('should remove the tag', () => {
const redis = createRedisClient({ del: sinon.spy((key, cb) => { cb(null); }) });
const locker = createNolock();
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
return tracker.untrack(request).then(() => {
expect(redis.del.callCount).to.be.equal(1);
expect(redis.del.getCall(0).args[0].startsWith('test:')).to.be.true;
});
});
it('will reject if tag removal fails', () => {
const redis = createRedisClient({ del: sinon.spy((key, cb) => { cb(new Error('fail!')); }) });
const locker = createNolock();
const tracker = createTracker('test', redis, locker);
tracker.logger = { error: sinon.spy(error => { }) };
const request = new Request('org', 'http://test.com');
return tracker.untrack(request).then(
() => assert.fail(),
error => {
expect(error.message).to.be.equal('fail!');
expect(tracker.logger.error.callCount).to.be.equal(1);
expect(tracker.logger.error.getCall(0).args[0].message.startsWith('Failed')).to.be.true;
expect(redis.del.callCount).to.be.equal(1);
expect(redis.del.getCall(0).args[0].startsWith('test:')).to.be.true;
});
});
});
describe('Locking Request Tracker track', () => {
it('should set the tag and call the operation having locked and unlocked', () => {
const redis = createRedisClient({ get: sinon.spy((key, cb) => { cb(null, null); }), set: sinon.spy((values, cb) => { cb(null); }) });
const locker = createRedlock();
locker.lock = sinon.spy(() => { return Q({ value: 42 }); });
locker.unlock = sinon.spy(lock => { return Q(); });
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
const operation = sinon.spy(() => { return Q(24); });
return tracker.track(request, operation).then(() => {
expect(locker.lock.callCount).to.be.equal(1);
expect(locker.unlock.callCount).to.be.equal(1);
expect(locker.unlock.getCall(0).args[0].value).to.be.equal(42);
expect(operation.callCount).to.be.equal(1);
expect(redis.get.callCount).to.be.equal(1);
expect(redis.set.callCount).to.be.equal(1);
expect(parseInt(redis.set.getCall(0).args[0][1])).to.be.approximately(Date.now(), 10);
});
});
it('should reject and not attempt tagging or call the operation if could not lock', () => {
const redis = createRedisClient({ get: sinon.spy(() => { }), set: sinon.spy(() => { }) });
const locker = createRedlock();
locker.lock = sinon.spy(() => { throw new redlock.LockError('fail!'); });
locker.unlock = sinon.spy(lock => { return Q(); });
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
const operation = sinon.spy(() => { });
return tracker.track(request, operation).then(
() => assert.fail(),
error => {
expect(error.message).to.be.equal('fail!');
expect(locker.lock.callCount).to.be.equal(1);
expect(locker.unlock.callCount).to.be.equal(0);
expect(operation.callCount).to.be.equal(0);
expect(redis.get.callCount).to.be.equal(0);
expect(redis.set.callCount).to.be.equal(0);
});
});
it('should reject and not call the operation if could not read tag', () => {
const redis = createRedisClient({ get: sinon.spy((key, cb) => { cb(new Error('fail!')); }), set: sinon.spy(() => { }) });
const locker = createRedlock();
locker.lock = sinon.spy(() => { return Q({ value: 42 }); });
locker.unlock = sinon.spy(lock => { return Q(); });
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
const operation = sinon.spy(() => { });
return tracker.track(request, operation).then(
() => assert.fail(),
error => {
expect(error.message).to.be.equal('fail!');
expect(locker.lock.callCount).to.be.equal(1);
expect(locker.unlock.callCount).to.be.equal(1);
expect(locker.unlock.getCall(0).args[0].value).to.be.equal(42);
expect(redis.get.callCount).to.be.equal(1);
expect(operation.callCount).to.be.equal(0);
expect(redis.set.callCount).to.be.equal(0);
});
});
it('should not tag if the operation fails', () => {
const redis = createRedisClient({ get: sinon.spy((key, cb) => { cb(null, null); }), set: sinon.spy((values, cb) => { }) });
const locker = createRedlock();
locker.lock = sinon.spy(() => { return Q({ value: 42 }); });
locker.unlock = sinon.spy(() => { return Q(); });
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
const operation = sinon.spy(() => { throw new Error('fail!'); });
return tracker.track(request, operation).then(
() => assert.fail(),
error => {
expect(error.message).to.be.equal('fail!');
expect(locker.lock.callCount).to.be.equal(1);
expect(operation.callCount).to.be.equal(1);
expect(locker.unlock.callCount).to.be.equal(1);
expect(locker.unlock.getCall(0).args[0].value).to.be.equal(42);
expect(redis.get.callCount).to.be.equal(1);
expect(redis.set.callCount).to.be.equal(0);
});
});
it('should not fail if everything works and tagging fails', () => {
const redis = createRedisClient({ get: sinon.spy((key, cb) => { cb(null, null); }), set: sinon.spy((values, cb) => { cb(new Error('fail!')); }) });
const locker = createRedlock();
locker.lock = sinon.spy(() => { return Q({ value: 42 }); });
locker.unlock = sinon.spy(() => { return Q(); });
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
const operation = sinon.spy(() => Q(24));
return tracker.track(request, operation).then(
result => {
expect(result).to.be.equal(24);
expect(locker.lock.callCount).to.be.equal(1);
expect(operation.callCount).to.be.equal(1);
expect(locker.unlock.callCount).to.be.equal(1);
expect(locker.unlock.getCall(0).args[0].value).to.be.equal(42);
expect(redis.get.callCount).to.be.equal(1);
expect(redis.set.callCount).to.be.equal(1);
});
});
it('should not fail if everything works and unlock fails', () => {
const redis = createRedisClient({ get: sinon.spy((key, cb) => { cb(null, null); }), set: sinon.spy((values, cb) => { cb(new Error('fail!')); }) });
const locker = createRedlock();
locker.lock = sinon.spy(() => { return Q({ value: 42 }); });
locker.unlock = sinon.spy(() => { throw new Error('fail!'); });
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
const operation = sinon.spy(() => Q(24));
return tracker.track(request, operation).then(
result => {
expect(result).to.be.equal(24);
expect(locker.lock.callCount).to.be.equal(1);
expect(operation.callCount).to.be.equal(1);
expect(locker.unlock.callCount).to.be.equal(1);
expect(locker.unlock.getCall(0).args[0].value).to.be.equal(42);
expect(redis.get.callCount).to.be.equal(1);
expect(redis.set.callCount).to.be.equal(1);
});
});
it('should skip the operation if already tagged', () => {
const redis = createRedisClient({ get: sinon.spy((key, cb) => { cb(null, 13); }) });
const locker = createRedlock();
locker.lock = sinon.spy(() => { return Q({ value: 42 }); });
locker.unlock = sinon.spy(() => { return Q(); });
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
const operation = sinon.spy(() => Q(24));
return tracker.track(request, operation).then(
result => {
expect(result).to.not.be.equal(24);
expect(locker.lock.callCount).to.be.equal(1);
expect(operation.callCount).to.be.equal(0);
expect(locker.unlock.callCount).to.be.equal(1);
expect(locker.unlock.getCall(0).args[0].value).to.be.equal(42);
expect(redis.get.callCount).to.be.equal(1);
});
});
});
describe('Request Tracker untrack', () => {
it('should remove the tag having locked and unlocked', () => {
const redis = createRedisClient({ del: sinon.spy((key, cb) => { cb(null); }) });
const locker = createRedlock();
locker.lock = sinon.spy(() => { return Q({ value: 42 }); });
locker.unlock = sinon.spy(lock => { return Q(); });
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
return tracker.untrack(request).then(() => {
expect(locker.lock.callCount).to.be.equal(1);
expect(locker.unlock.callCount).to.be.equal(1);
expect(locker.unlock.getCall(0).args[0].value).to.be.equal(42);
expect(redis.del.callCount).to.be.equal(1);
expect(redis.del.getCall(0).args[0].startsWith('test:')).to.be.true;
});
});
it('will reject and not remove the tag if locking fails', () => {
const redis = createRedisClient({ del: sinon.spy((key, cb) => { cb(null); }) });
const locker = createRedlock();
locker.lock = sinon.spy(() => { throw new redlock.LockError('fail!'); });
locker.unlock = sinon.spy(lock => { return Q(); });
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
return tracker.untrack(request).then(
() => assert.fail(),
error => {
expect(error.message).to.be.equal('fail!');
expect(locker.lock.callCount).to.be.equal(1);
expect(locker.unlock.callCount).to.be.equal(0);
expect(redis.del.callCount).to.be.equal(0);
});
});
it('will reject if tag removal fails', () => {
const redis = createRedisClient({ del: sinon.spy((key, cb) => { cb(new Error('fail!')); }) });
const locker = createRedlock();
locker.lock = sinon.spy(() => { return Q({ value: 42 }); });
locker.unlock = sinon.spy(lock => { return Q(); });
const tracker = createTracker('test', redis, locker);
tracker.logger = { error: sinon.spy(error => { }) };
const request = new Request('org', 'http://test.com');
return tracker.untrack(request).then(
() => assert.fail(),
error => {
expect(error.message).to.be.equal('fail!');
expect(tracker.logger.error.callCount).to.be.equal(1);
expect(tracker.logger.error.getCall(0).args[0].message.startsWith('Failed')).to.be.true;
expect(locker.lock.callCount).to.be.equal(1);
expect(locker.unlock.callCount).to.be.equal(1);
expect(locker.unlock.getCall(0).args[0].value).to.be.equal(42);
expect(redis.del.callCount).to.be.equal(1);
expect(redis.del.getCall(0).args[0].startsWith('test:')).to.be.true;
});
});
it('will resolve and remove the tag even if unlock fails', () => {
const redis = createRedisClient({ del: sinon.spy((key, cb) => { cb(null); }) });
const locker = createRedlock();
locker.lock = sinon.spy(() => { return Q({ value: 42 }); });
locker.unlock = sinon.spy(lock => { throw new redlock.LockError('fail!'); });
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
return tracker.untrack(request).then(
result => {
expect(locker.lock.callCount).to.be.equal(1);
expect(locker.unlock.callCount).to.be.equal(1);
expect(locker.unlock.getCall(0).args[0].value).to.be.equal(42);
expect(redis.del.callCount).to.be.equal(1);
expect(redis.del.getCall(0).args[0].startsWith('test:')).to.be.true;
},
error =>
assert.fail()
);
});
});
describe('Request Tracker concurrency', () => {
it('should remove the tag having locked and unlocked', () => {
const getResponses = [null, 13];
const redis = createRedisClient({
get: delaySpy((key, cb) => { cb(null, getResponses.shift()); }),
set: delaySpy((values, cb) => { cb(null); }),
del: delaySpy((key, cb) => { cb(null); })
});
const locker = createRedlock();
// Both concurrent callers are allowed to acquire the lock; the tag check is what deduplicates the operation.
locker.lock = delaySpy(() => { return Q({ value: 42 }); });
locker.unlock = delaySpy(lock => { return Q(); });
const tracker = createTracker('test', redis, locker);
const request = new Request('org', 'http://test.com');
const operation = sinon.spy(() => Q(24));
const path1 = tracker.track(request, operation);
const path2 = tracker.track(request, operation);
return Q.all([path1, path2]).spread((one, two) => {
expect(locker.lock.callCount).to.be.equal(2);
expect(locker.unlock.callCount).to.be.equal(2);
expect(operation.callCount).to.be.equal(1);
expect(redis.get.callCount).to.be.equal(2);
expect(redis.set.callCount).to.be.equal(1);
expect(parseInt(redis.set.getCall(0).args[0][1])).to.be.approximately(Date.now(), 10);
});
});
});
// TODO: a single spy built on "arguments" would cover every arity, but arrow functions
// do not bind their own "arguments" (it resolves to the enclosing factory's scope),
// so dispatch on the wrapped function's arity instead.
function delaySpy(f, time = 2) {
if (f.length === 0)
return delaySpy0(f, time);
if (f.length === 1)
return delaySpy1(f, time);
if (f.length === 2)
return delaySpy2(f, time);
}
function delaySpy0(f, time = 2) {
return sinon.spy(() => {
const self = this;
return Q.delay(time).then(() => { return f.apply(self, []); });
});
}
function delaySpy1(f, time = 2) {
return sinon.spy(x => {
const self = this;
if (typeof x === 'function') {
setTimeout(() => { f.apply(self, [x]); }, time);
} else {
return Q.delay(time).then(() => { return f.apply(self, [x]); });
}
});
}
function delaySpy2(f, time = 2) {
return sinon.spy((x, y) => {
const self = this;
if (typeof y === 'function') {
setTimeout(() => { f.apply(self, [x, y]); }, time);
} else {
return Q.delay(time).then(() => { return f.apply(self, [x, y]); });
}
});
}
function createRedisClient({ get = null, set = null, del = null } = {}) {
const result = {};
result.get = get || (() => assert.fail('should not get'));
result.set = set || (() => assert.fail('should not set'));
result.del = del || (() => assert.fail('should not del'));
return result;
}
function createRedlock({ lock = null, extend = null, unlock = null } = {}) {
const result = {};
result.lock = lock || (() => assert.fail('should not lock'));
result.extend = extend || (() => assert.fail('should not extend'));
result.unlock = unlock || (() => assert.fail('should not unlock'));
return result;
}
function createNolock() {
const result = {};
result.lock = () => null;
result.extend = () => assert.fail('should not extend');
result.unlock = () => { };
return result;
}
function createTracker(prefix, redisClient = createRedisClient(), locker = createNolock(), options = createOptions()) {
return new RequestTracker(prefix, redisClient, locker, options);
}
function createOptions() {
return {
logger: createBaseLog(),
tracker: {
lockTtl: 1000,
ttl: 6 * 60 * 1000
}
};
}
function createBaseLog({ info = null, warn = null, error = null, verbose = null, silly = null } = {}) {
const result = {};
result.info = info || (() => { });
result.warn = warn || (() => { });
result.error = error || (() => { });
result.verbose = verbose || (message => { console.log(message); });
result.silly = silly || (message => { console.log(message); });
result.level = 'silly';
return result;
}
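Read together, these cases pin down the shape of track(): acquire the lock, read the tag, skip the operation when a tag already exists, otherwise run it and tag on a best-effort basis, and always attempt to unlock while tolerating unlock failures. A rough reconstruction, inferred from the expectations above rather than copied from redisRequestTracker.js (getTag, setTag, and lockKey are hypothetical helpers):

function track(request, operation) {
  return Q.fcall(() => locker.lock(lockKey(request), options.tracker.lockTtl)).then(lock =>
    getTag(request)
      .then(tag => {
        if (tag) {
          return null; // already tracked elsewhere: skip the operation
        }
        return Q.fcall(operation).then(result =>
          setTag(request).then(() => result, () => result)); // tagging failures are tolerated
      })
      .finally(() => Q.fcall(() => locker.unlock(lock)).catch(() => null)));
}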


@@ -0,0 +1,24 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const expect = require('chai').expect;
const TokenFactory = require('../../providers/fetcher/tokenFactory');
describe('Token Factory', () => {
it('should find a token with multiple desired traits', () => {
const factory = new TokenFactory('1111#admin,private,push;2222#public', null);
let token = null;
token = factory.getToken([]);
expect(token).to.be.not.null;
token = factory.getToken();
expect(token).to.be.not.null;
token = factory.getToken([['admin'], ['admin'], ['public']]);
expect(token).to.be.equal('1111');
token = factory.getToken(['public']);
expect(token).to.be.equal('2222');
});
});
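The constructor argument encodes the token pool inline: semicolons separate tokens, and each token is followed by '#' and a comma-separated list of its traits. A parsing sketch for that format (a hypothetical helper; the real tokenFactory.js may structure this differently):

// '1111#admin,private,push;2222#public' =>
//   [{ value: '1111', traits: ['admin', 'private', 'push'] },
//    { value: '2222', traits: ['public'] }]
function parseTokenSpec(spec) {
  return spec.split(';').map(entry => {
    const [value, traits] = entry.split('#');
    return { value: value, traits: traits ? traits.split(',') : [] };
  });
}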