Initial code moved from ospo-ghcrawler

This commit is contained in:
Jeff McAffer 2017-02-19 20:23:04 -08:00
Parent 4803fdb99d
Commit 9708c7fff6
4 changed files with 297 additions and 0 deletions

@@ -1,3 +1,53 @@
# Crawler command line
[GHCrawler](https://github.com/Microsoft/ospo-ghcrawler.git) is a utility for walking GitHub APIs and tracking GitHub events. This command line app allows you to control various aspects of a crawler's behavior. There is some overlap in function with the [Crawler Dashboard](https://github.com/Microsoft/crawler-dashboard.git).
This project also includes a simple Node client library for talking to a crawler.
# Controlling a crawler
The ```cc``` utility is in the ```bin``` directory of this repo. It can be run interactively or as a single command processor. The general format of using the command line is
```node cc [options] [command]```
where the available options are:
```-i``` -- Run in interactive mode
```-s <url>``` -- Control the crawler service running at the given url. Defaults to http://localhost:3000. You can also set the CRAWLER_SERVICE_URL environment variable.
```-t``` -- The crawler service API token to use. This can also be supplied via the ```CRAWLER_SERVICE_AUTH_TOKEN``` environment variable. If not defined in either place, the default ```"secret"``` value is used.
and the available commands are:
```start [count]``` -- Start the crawler processing with count concurrent operations. If count is not specified, 1 is the default. On a reasonably fast network a count of 10 to 15 should be sufficient. This also depends on how many tokens you are using.
```stop``` -- Stop crawler processing. The crawler service is left running but it stops pulling requests off the queue.
```queue <requests...>``` -- Queues the given requests for processing. The requests parameter is a list of GitHub "org" and/or "org/repo" names.
```orgs <orgs...>``` -- Set the crawler to traverse only the GitHub orgs named in the given list.
```config``` -- Dumps the crawler service's configuration to the console.
```tokens <spec...>``` -- Set the GitHub tokens to be used by the crawler when calling GitHub APIs. The spec value is a list of token specs. Each spec has the form ```<token>#<trait>,<trait>...``` where the token is a GitHub OAuth or Personal Access token and the comma-separated list of traits identifies what permissions the token has. The available traits are: ```public```, ```admin```, and ```private```. You can list as many tokens and traits as you like. Note that you can also configure the GitHub tokens via the ```CRAWLER_GITHUB_TOKENS``` environment variable instead, **before starting the crawler**. For example, ```export CRAWLER_GITHUB_TOKENS="<token1>#public <token2>#admin"```.
A typical sequence, shown in the snippet below, configures the crawler with a set of tokens, sets the org filter, and then queues and starts processing of the org.
```
> node bin/cc
http://localhost:3000> tokens 43984b2344ca575d0f0e097efd97#public 972bbdfe098098fa9ce082309#admin
http://localhost:3000> orgs contoso-d
http://localhost:3000> queue contoso-d
http://localhost:3000> start 5
http://localhost:3000> exit
>
```
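Because ```cc``` is also a single command processor, the same sequence can be run one command per invocation; each run executes the given command against the service and exits. The expected output lines below come from the command implementations in ```bin/cc```; the org name is the same illustrative one used above.
```
> node bin/cc orgs contoso-d
Configured org list
> node bin/cc queue contoso-d
Queued 1 requests
> node bin/cc start 5
Started crawler processing
```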
# API
API doc is coming.
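Until then, the Node client library in this commit can be used directly. Below is a minimal sketch, assuming a local checkout of this repo (run from the repo root) and a crawler service at the default URL; ```contoso-d``` is the illustrative org from above.
```
const CrawlerClient = require('./lib/crawlerClient');

// URL and token fall back to CRAWLER_SERVICE_URL / CRAWLER_SERVICE_AUTH_TOKEN
// and then to the built-in defaults when not supplied.
const client = new CrawlerClient('http://localhost:3000', 'secret');

// Queue an org for processing, then start five concurrent processors.
client.queueRequests(['contoso-d'])
  .then(() => client.configureCount(5))
  .then(() => console.log('Queued and started'))
  .catch(error => console.error(error.message));
```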
# Contributing
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

bin/cc (new file)

@@ -0,0 +1,115 @@
#!/usr/bin/env node
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const commander = require('commander');
const CrawlerClient = require('../lib/crawlerClient');
const readline = require('readline');
const split = require('argv-split');
const Q = require('q');
const crawlerClient = new CrawlerClient(process.env.CRAWLER_SERVICE_URL || 'http://localhost:3000', process.env.CRAWLER_SERVICE_AUTH_TOKEN || 'secret');
let promise = Q();
const commands = getCommands();
if (!process.argv.slice(2).length) {
commands.help();
}
commands.parse(process.argv);
promise.then(() => {
if (commands.interactive) {
startReplLoop(commands);
}
});
function getCommands() {
const commands = new commander.Command();
commands
.version('0.0.1')
.option('-i, --interactive', 'Run in interactive mode. Otherwise the given command is executed and this tool exits.')
.option('-s, --service <url>', 'URL of the crawler service', url => crawlerClient.crawlerUrl = url)
.option('-t, --token <token>', 'Token for talking to the crawler service', token => crawlerClient.authToken = token);
commands
.command('help')
.description('Print out this message')
.action(() => commands.outputHelp());
commands
.command('stop')
.description('Stop all processing in the crawler')
.action(() => configureCount(0));
commands
.command('queue <requests...>')
.description('Queue the given list of orgs and/or repos to be processed.')
.action(requests => queueRequests(requests));
commands
.command('start [count]')
.description('Start the crawler processing requests with [count] concurrency')
.action(count => configureCount(count || 1));
commands
.command('orgs <orgs...>')
.description('Configure the crawler to process requests from only the given GitHub organizations')
.action(orgs => configureOrgs(orgs));
commands
.command('config')
.description('Dump the current crawler configuration to the console')
.action(dumpConfig);
commands
.command('tokens <tokens...>')
.description('Set the GitHub tokens to be used by the crawler. The parameter is a list of <token>#<trait>[,<trait>]* where the possible traits are "admin", "public", and "private"')
.action(tokens => setTokens(tokens));
commands
.command('exit')
.description('Exit this tool')
.action(() => process.exit(0));
return commands;
}
function startReplLoop(commands) {
const rl = readline.createInterface({
input: process.stdin,
output: process.stdout
});
rl.setPrompt(crawlerClient.crawlerUrl + '> ');
rl.prompt();
rl.on('line', (line) => {
const command = split(line);
// pad up the command line to keep commander happy
command.unshift('node', 'cc');
commands.parse(command);
promise
.catch(error => console.log(error.message))
.finally(() => {
promise = Q();
rl.prompt();
});
});
}
function configureCount(count) {
count = Math.max(count, 0);
const patch = [
{ "op": "replace", "path": "/crawler/count", "value": count }
];
promise = crawlerClient.configureCrawler(patch).then(() => console.log(`${count ? 'Started' : 'Stopped'} crawler processing`));
}
function configureOrgs(orgs) {
const patch = [
{ "op": "replace", "path": "/crawler/orgList", "value": orgs }
];
promise = crawlerClient.configureCrawler(patch).then(() => console.log('Configured org list'));
}
function dumpConfig() {
promise = crawlerClient.getConfiguration().then(config => console.dir(config));
}
function setTokens(tokens) {
promise = crawlerClient.setTokens(tokens).then(() => console.log('Tokens set'));
}
function queueRequests(specs) {
promise = crawlerClient.queueRequests(specs).then(() => console.log(`Queued ${specs.length} requests`));
}

lib/crawlerClient.js (new file)

@@ -0,0 +1,108 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
const request = require('request');
const Q = require('q');
class CrawlerClient {
constructor(url, token) {
// Use the supplied values, falling back to the environment and then to defaults
this.crawlerUrl = url || process.env.CRAWLER_SERVICE_URL || 'http://localhost:3000';
this.authToken = token || process.env.CRAWLER_SERVICE_AUTH_TOKEN || 'secret';
}
configureCount(count) {
count = Math.max(count, 0);
const patch = [
{ "op": "replace", "path": "/crawler/count", "value": count }
];
return this.configureCrawler(patch);
}
configureOrgs(orgs) {
const patch = [
{ "op": "replace", "path": "/crawler/orgList", "value": orgs }
];
return this.configureCrawler(patch);
}
configureCrawler(patch) {
const deferred = Q.defer();
request.patch(`${this.crawlerUrl}/config`, {
headers: {
'X-token': this.authToken
},
json: true,
body: patch
}, (error, response, body) => {
if (error) {
return deferred.reject(new Error(`Failed to configure crawler: ${error.message}.`));
}
if (response.statusCode !== 200) {
return deferred.reject(new Error(`Failed to configure crawler: ${body}.`));
}
deferred.resolve();
});
return deferred.promise;
}
getConfiguration() {
const deferred = Q.defer();
request.get(`${this.crawlerUrl}/config`, {
headers: {
'X-token': this.authToken
},
json: true,
}, (error, response, body) => {
if (error) {
return deferred.reject(new Error(`Failed to get crawler configuration: ${error.message}.`));
}
if (response.statusCode !== 200) {
return deferred.reject(new Error(`Failed to get crawler configuration: ${body}.`));
}
deferred.resolve(body);
});
return deferred.promise;
}
setTokens(tokens) {
const deferred = Q.defer();
request.put(`${this.crawlerUrl}/tokens`, {
headers: {
'X-token': this.authToken
},
body: tokens.join(';')
}, (error, response, body) => {
if (error) {
return deferred.reject(new Error(`Failed to set tokens: ${error.message}.`));
}
if (response.statusCode !== 200) {
return deferred.reject(new Error(`Failed to set tokens: ${body}.`));
}
deferred.resolve(null);
});
return deferred.promise;
}
queueRequests(requests, queueName = 'normal') {
const deferred = Q.defer();
request.post(`${this.crawlerUrl}/requests/${queueName}`, {
headers: {
'X-token': this.authToken
},
json: true,
body: requests
}, (error, response, body) => {
if (error) {
return deferred.reject(new Error(`Failed to queue requests: ${error.message}.`));
}
if (response.statusCode !== 200) {
return deferred.reject(new Error(`Failed to queue requests: ${body}.`));
}
deferred.resolve();
});
return deferred.promise;
}
}
module.exports = CrawlerClient;
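Since ```configureCrawler``` sends an RFC 6902-style JSON Patch to the service's ```/config``` endpoint, callers are not limited to the single-setting helpers above; several settings can be changed in one round trip. A sketch, assuming the same patch paths used by ```configureCount``` and ```configureOrgs```:
```
const CrawlerClient = require('./lib/crawlerClient');
const client = new CrawlerClient();

// One PATCH request that sets the org filter and the concurrency together.
client.configureCrawler([
  { op: 'replace', path: '/crawler/orgList', value: ['contoso-d'] },
  { op: 'replace', path: '/crawler/count', value: 5 }
]).then(() => console.log('Crawler reconfigured'), error => console.error(error.message));
```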

package.json (new file)

@@ -0,0 +1,24 @@
{
"name": "crawler-cli",
"version": "0.1.0",
"description": "A simple command line app for controlling a GitHub Crawler",
"scripts": {
"start": "node ./bin/cc.js"
},
"keywords": [
"GitHub",
"API",
"crawler"
],
"author": "Jeff McAffer",
"license": "MIT",
"dependencies": {
"argv-split": "^1.0.1",
"commander": "^2.9.0",
"painless-config": "^0.1.0",
"q": "1.4.1",
"readline": "^1.3.0",
"request": "^2.79.0"
},
"devDependencies": {}
}