Initial code moved from ospo-ghcrawler
This commit is contained in:
Parent: 4803fdb99d
Commit: 9708c7fff6

README.md (50 lines changed)
@@ -1,3 +1,53 @@
# Crawler command line

[GHCrawler](https://github.com/Microsoft/ospo-ghcrawler.git) is a utility for walking GitHub APIs and tracking GitHub events. This command line app allows you to control various aspects of a crawler's behavior. There is some overlap in function with the [Crawler Dashboard](https://github.com/Microsoft/crawler-dashboard.git).

This project also has a simple Node client library for talking to a crawler.

# Controlling a crawler

The ```cc``` utility is in the ```bin``` directory of this repo. It can be run interactively or as a single command processor. The general format of using the command line is

```node cc [options] [command]```

where the available options are:

```-i``` -- Run in interactive mode

```-s <url>``` -- Control the crawler service running at the given url. Defaults to http://localhost:3000. You can also set the ```CRAWLER_SERVICE_URL``` environment variable.

```-t``` -- The crawler service API token to use. This can also be supplied via the ```CRAWLER_SERVICE_AUTH_TOKEN``` environment variable. If not defined in either place, the default ```"secret"``` value is used.

and the available commands are:

```start [count]``` -- Start the crawler processing with count concurrent operations. If count is not specified, 1 is the default. On a reasonably fast network a count of 10 to 15 should be sufficient. This also depends on how many tokens you are using.

```stop``` -- Stop crawler processing. The crawler service is left running but it stops pulling requests off the queue.

```queue <requests...>``` -- Queue the given requests for processing. The requests parameter is a list of GitHub "org" and/or "org/repo" names.

```orgs <orgs...>``` -- Set the crawler to traverse only the GitHub orgs named in the given list.

```config``` -- Dump the crawler service's configuration to the console.

```tokens <spec...>``` -- Set the GitHub tokens to be used by the crawler when calling GitHub APIs. The spec value is a list of token specs. Each spec has the form ```<token>#<trait>,<trait>...``` where the token is a GitHub OAuth or Personal Access token and the comma-separated list of traits identifies what permissions the token has. The available traits are: ```public```, ```admin```, ```private```. You can list as many tokens and traits as you like. Note that you can also configure the GitHub tokens with the ```CRAWLER_GITHUB_TOKENS``` environment variable instead **before starting the crawler**. For example, ```export CRAWLER_GITHUB_TOKENS="<token1>#public <token2>#admin"```.

A typical sequence shown in the snippet below configures the crawler with a set of tokens, configures the org filter set and then queues and starts the processing of the org.

```
> node bin/cc
http://localhost:3000> tokens 43984b2344ca575d0f0e097efd97#public 972bbdfe098098fa9ce082309#admin
http://localhost:3000> orgs contoso-d
http://localhost:3000> queue contoso-d
http://localhost:3000> start 5
http://localhost:3000> exit
>
```

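The same commands can also be issued one at a time in single command mode. Below is a sketch of the equivalent non-interactive calls (the token value is the same placeholder used above; add ```-s <url>``` if the service is not on localhost):

```
> node bin/cc tokens 43984b2344ca575d0f0e097efd97#public
> node bin/cc orgs contoso-d
> node bin/cc queue contoso-d
> node bin/cc start 5
```
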
# API

API doc is coming.

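In the meantime, here is a minimal sketch of driving a crawler from Node with the client in this repo (the require path assumes your script sits at the repo root; the URL, token, and org are placeholders):

```
const CrawlerClient = require('./lib/crawlerClient');

const client = new CrawlerClient('http://localhost:3000', 'secret');
client.setTokens(['<token>#public'])
  .then(() => client.configureOrgs(['contoso-d']))
  .then(() => client.queueRequests(['contoso-d']))
  .then(() => client.configureCount(5))
  .then(() => console.log('Queued contoso-d and started 5 concurrent operations'))
  .catch(error => console.log(error.message));
```
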
# Contributing

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

bin/cc.js
@@ -0,0 +1,115 @@
#!/usr/bin/env node
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

const commander = require('commander');
const CrawlerClient = require('../lib/crawlerClient');
const readline = require('readline');
const split = require('argv-split');
const Q = require('q');

// Target service and token default to the environment; -s and -t override them below.
let crawlerUrl = process.env.CRAWLER_SERVICE_URL || 'http://localhost:3000';
let authToken = process.env.CRAWLER_SERVICE_AUTH_TOKEN || 'secret';
const crawlerClient = new CrawlerClient(crawlerUrl, authToken);
let promise = Q();

const commands = getCommands();
if (!process.argv.slice(2).length) {
  commands.help();
}
commands.parse(process.argv);
promise.then(() => {
  if (commands.interactive) {
    startReplLoop(commands);
  }
});

function getCommands() {
  const commands = new commander.Command();
  commands
    .version('0.0.1')
    .option('-i, --interactive', 'Run in interactive mode. Otherwise the given command is executed and this tool exits.')
    .option('-s, --service <url>', 'URL of the crawler service', url => crawlerClient.crawlerUrl = crawlerUrl = url)
    .option('-t, --token <token>', 'Token for talking to the crawler service', token => crawlerClient.authToken = authToken = token);
  commands
    .command('help')
    .description('Print out this message')
    .action(() => commands.outputHelp());
  commands
    .command('stop')
    .description('Stop all processing in the crawler')
    .action(() => configureCount(0));
  commands
    .command('queue <requests...>')
    .description('Queue the given list of orgs and/or repos to be processed.')
    .action(requests => queueRequests(requests));
  commands
    .command('start [count]')
    .description('Start the crawler processing requests with [count] concurrency')
    .action(count => configureCount(count || 1));
  commands
    .command('orgs <orgs...>')
    .description('Configure the crawler to process requests from only the given GitHub organizations')
    .action(orgs => configureOrgs(orgs));
  commands
    .command('config')
    .description('Dump the current crawler configuration to the console')
    .action(dumpConfig);
  commands
    .command('tokens <tokens...>')
    .description('Set the GitHub tokens to be used by the crawler. The parameter is a list of <token>#<trait>[,<trait>]* where the possible traits are "admin", "public", and "private"')
    .action(tokens => setTokens(tokens));
  commands
    .command('exit')
    .description('Exit this tool')
    .action(() => process.exit(0));
  return commands;
}

function startReplLoop(commands) {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout
  });

  rl.setPrompt(crawlerUrl + '> ');
  rl.prompt();

  rl.on('line', (line) => {
    const command = split(line);
    // pad up the command line to keep commander happy
    command.unshift('node', 'cc');
    commands.parse(command);
    promise
      .catch(error => console.log(error.message))
      .finally(() => {
        promise = Q();
        rl.prompt();
      });
  });
}

function configureCount(count) {
  count = Math.max(count, 0);
  // JSON Patch document applied to the crawler service's /config endpoint
  const patch = [
    { "op": "replace", "path": "/crawler/count", "value": count }
  ];
  promise = crawlerClient.configureCrawler(patch).then(() => console.log(`${count ? 'Started' : 'Stopped'} crawler processing`));
}

function configureOrgs(orgs) {
  const patch = [
    { "op": "replace", "path": "/crawler/orgList", "value": orgs }
  ];
  promise = crawlerClient.configureCrawler(patch).then(() => console.log('Configured org list'));
}

function dumpConfig() {
  promise = crawlerClient.getConfiguration().then(config => console.dir(config));
}

function setTokens(tokens) {
  promise = crawlerClient.setTokens(tokens).then(() => console.log('Tokens set'));
}

function queueRequests(specs) {
  promise = crawlerClient.queueRequests(specs).then(() => console.log(`Queued ${specs.length} requests`));
}

lib/crawlerClient.js
@@ -0,0 +1,108 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

const request = require('request');
const Q = require('q');

class CrawlerClient {

  constructor(url, token) {
    this.crawlerUrl = url || process.env.CRAWLER_SERVICE_URL || 'http://localhost:3000';
    this.authToken = token || process.env.CRAWLER_SERVICE_AUTH_TOKEN || 'secret';
  }

  configureCount(count) {
    count = Math.max(count, 0);
    const patch = [
      { "op": "replace", "path": "/crawler/count", "value": count }
    ];
    return this.configureCrawler(patch);
  }

  configureOrgs(orgs) {
    const patch = [
      { "op": "replace", "path": "/crawler/orgList", "value": orgs }
    ];
    return this.configureCrawler(patch);
  }

  configureCrawler(patch) {
    const deferred = Q.defer();
    request.patch(`${this.crawlerUrl}/config`, {
      headers: {
        'X-token': this.authToken
      },
      json: true,
      body: patch
    }, (error, response, body) => {
      if (error) {
        return deferred.reject(new Error(`Failed to configure crawler: ${error.message}.`));
      }
      if (response.statusCode !== 200) {
        return deferred.reject(new Error(`Failed to configure crawler: ${body}.`));
      }
      deferred.resolve();
    });
    return deferred.promise;
  }

  getConfiguration() {
    const deferred = Q.defer();
    request.get(`${this.crawlerUrl}/config`, {
      headers: {
        'X-token': this.authToken
      },
      json: true
    }, (error, response, body) => {
      if (error) {
        return deferred.reject(new Error(`Failed to get crawler configuration: ${error.message}.`));
      }
      if (response.statusCode !== 200) {
        return deferred.reject(new Error(`Failed to get crawler configuration: ${body}.`));
      }
      deferred.resolve(body);
    });
    return deferred.promise;
  }

  setTokens(tokens) {
    const deferred = Q.defer();
    request.put(`${this.crawlerUrl}/tokens`, {
      headers: {
        'X-token': this.authToken
      },
      body: tokens.join(';')
    }, (error, response, body) => {
      if (error) {
        return deferred.reject(new Error(`Failed to set tokens: ${error.message}.`));
      }
      if (response.statusCode !== 200) {
        return deferred.reject(new Error(`Failed to set tokens: ${body}.`));
      }
      deferred.resolve(null);
    });
    return deferred.promise;
  }

  queueRequests(requests, queueName = 'normal') {
    const deferred = Q.defer();
    request.post(`${this.crawlerUrl}/requests/${queueName}`, {
      headers: {
        'X-token': this.authToken
      },
      json: true,
      body: requests
    }, (error, response, body) => {
      if (error) {
        return deferred.reject(new Error(`Failed to queue requests: ${error.message}.`));
      }
      if (response.statusCode !== 200) {
        return deferred.reject(new Error(`Failed to queue requests: ${body}.`));
      }
      deferred.resolve();
    });
    return deferred.promise;
  }
}

module.exports = CrawlerClient;

package.json
@@ -0,0 +1,24 @@
{
  "name": "crawler-cli",
  "version": "0.1.0",
  "description": "A simple command line app for controlling a GitHub Crawler",
  "scripts": {
    "start": "node ./bin/cc.js"
  },
  "keywords": [
    "GitHub",
    "API",
    "crawler"
  ],
  "author": "Jeff McAffer",
  "license": "MIT",
  "dependencies": {
    "argv-split": "^1.0.1",
    "commander": "^2.9.0",
    "painless-config": "^0.1.0",
    "q": "1.4.1",
    "readline": "^1.3.0",
    "request": "^2.79.0"
  },
  "devDependencies": {}
}