From 9708c7fff6e104493be5f89e96537e6bc157f9ac Mon Sep 17 00:00:00 2001
From: Jeff McAffer
Date: Sun, 19 Feb 2017 20:23:04 -0800
Subject: [PATCH] Initial code moved from ospo-ghcrawler

---
 README.md        |  50 ++++++++++++++++++++++
 bin/cc           | 115 +++++++++++++++++++++++++++++++++++++++++++++++
 crawlerClient.js | 108 ++++++++++++++++++++++++++++++++++++++++++++
 package.json     |  24 ++++++++++
 4 files changed, 297 insertions(+)
 create mode 100644 bin/cc
 create mode 100644 crawlerClient.js
 create mode 100644 package.json

diff --git a/README.md b/README.md
index 8624b3d..0a0ab2b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,53 @@
+# Crawler command line
+
+[GHCrawler](https://github.com/Microsoft/ospo-ghcrawler.git) is a utility for walking GitHub APIs and tracking GitHub events. This command line app allows you to control various
+aspects of a crawler's behavior. There is some overlap in function with the [Crawler Dashboard](https://github.com/Microsoft/crawler-dashboard.git).
+
+This project also has a simple Node client library for talking to a crawler.
+
+# Controlling a crawler
+
+The ```cc``` utility is in the ```bin``` directory of this repo. It can be run interactively or as a single command processor. The general form of the command line is
+
+```node cc [options] [command]```
+
+where the available options are:
+
+```-i``` -- Run in interactive mode.
+
+```-s <url>``` -- Control the crawler service running at the given URL. Defaults to http://localhost:3000. You can also set the ```CRAWLER_SERVICE_URL``` environment variable.
+
+```-t <token>``` -- The crawler service API token to use. This can also be supplied via the ```CRAWLER_SERVICE_AUTH_TOKEN``` environment variable. If not defined in either place, the default ```"secret"``` value is used.
+
+and the available commands are:
+
+```start [count]``` -- Start the crawler processing with ```count``` concurrent operations. If ```count``` is not specified, it defaults to 1. On a reasonably fast network a count of 10 to 15 should be sufficient; this also depends on how many tokens you are using.
+
+```stop``` -- Stop crawler processing. The crawler service is left running but it stops pulling requests off the queue.
+
+```queue <requests>``` -- Queue the given requests for processing. The ```requests``` parameter is a list of GitHub "org" and/or "org/repo" names.
+
+```orgs <orgs>``` -- Set the crawler to traverse only the GitHub orgs named in the given list.
+
+```config``` -- Dump the crawler service's configuration to the console.
+
+```tokens <specs>``` -- Set the GitHub tokens to be used by the crawler when calling GitHub APIs. The ```specs``` value is a list of token specs. Each spec has the form ```<token>#<trait>,<trait>...``` where the token is a GitHub OAuth or Personal Access token and the comma-separated list of traits identifies what permissions the token has. The available traits are ```public```, ```admin```, and ```private```. You can list as many tokens and traits as you like. Note that you can also configure the GitHub tokens via the ```CRAWLER_GITHUB_TOKENS``` environment variable instead, **before starting the crawler**. For example, ```export CRAWLER_GITHUB_TOKENS="<token1>#public <token2>#admin"```.
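+
+When used as a single command processor, each invocation runs one command against the service and exits. For example (the service URL and token shown here are placeholders for your own values):
+
+```
+node bin/cc -s http://localhost:3000 -t secret queue contoso-d
+node bin/cc -s http://localhost:3000 -t secret start 5
+```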
+
+A typical sequence, shown in the snippet below, configures the crawler with a set of tokens, sets the org filter, and then queues and starts processing of the org.
+
+```
+> node bin/cc
+http://localhost:3000> tokens 43984b2344ca575d0f0e097efd97#public 972bbdfe098098fa9ce082309#admin
+http://localhost:3000> orgs contoso-d
+http://localhost:3000> queue contoso-d
+http://localhost:3000> start 5
+http://localhost:3000> exit
+>
+```
+
+# API
+API doc is coming.
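+
+Until then, the promise-based client in ```crawlerClient.js``` (added below) can be driven directly from Node. Here is a minimal sketch; the service URL, token, and org values are examples to replace with your own:
+
+```
+const CrawlerClient = require('./crawlerClient');
+
+const client = new CrawlerClient('http://localhost:3000', 'secret');
+client.setTokens(['43984b2344ca575d0f0e097efd97#public'])
+  .then(() => client.configureOrgs(['contoso-d']))   // traverse only this org
+  .then(() => client.queueRequests(['contoso-d']))   // queue the org for processing
+  .then(() => client.configureCount(5))              // run 5 concurrent operations
+  .catch(error => console.error(error.message));
+```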
+
 # Contributing
 
 This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
diff --git a/bin/cc b/bin/cc
new file mode 100644
index 0000000..79015d4
--- /dev/null
+++ b/bin/cc
@@ -0,0 +1,115 @@
+#!/usr/bin/env node
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+const commander = require('commander');
+const CrawlerClient = require('../crawlerClient');
+const readline = require('readline');
+const split = require('argv-split');
+const Q = require('q');
+
+const crawlerClient = new CrawlerClient(process.env.CRAWLER_SERVICE_URL || 'http://localhost:3000', process.env.CRAWLER_SERVICE_AUTH_TOKEN || 'secret');
+let promise = Q();
+
+const commands = getCommands();
+if (!process.argv.slice(2).length) {
+  commands.help();
+}
+commands.parse(process.argv);
+promise.then(() => {
+  if (commands.interactive) {
+    startReplLoop(commands);
+  }
+}, error => console.log(error.message));
+
+function getCommands() {
+  const commands = new commander.Command();
+  commands
+    .version('0.0.1')
+    .option('-i, --interactive', 'Run in interactive mode. Otherwise the given command is executed and this tool exits.')
+    .option('-s, --service <url>', 'URL of the crawler service', url => crawlerClient.crawlerUrl = url)
+    .option('-t, --token <token>', 'Token for talking to the crawler service', token => crawlerClient.authToken = token);
+  commands
+    .command('help')
+    .description('Print out this message')
+    .action(() => commands.outputHelp());
+  commands
+    .command('stop')
+    .description('Stop all processing in the crawler')
+    .action(() => configureCount(0));
+  commands
+    .command('queue <requests...>')
+    .description('Queue the given list of orgs and/or repos to be processed.')
+    .action(requests => queueRequests(requests));
+  commands
+    .command('start [count]')
+    .description('Start the crawler processing requests with [count] concurrency')
+    .action(count => configureCount(count || 1));
+  commands
+    .command('orgs <orgs...>')
+    .description('Configure the crawler to process requests from only the given GitHub organizations')
+    .action(orgs => configureOrgs(orgs));
+  commands
+    .command('config')
+    .description('Dump the current crawler configuration to the console')
+    .action(dumpConfig);
+  commands
+    .command('tokens <tokens...>')
+    .description('Set the GitHub tokens to be used by the crawler. The parameter is a list of <token>#<trait>[,<trait>]* where the possible traits are "admin", "public", and "private"')
+    .action(tokens => setTokens(tokens));
+  commands
+    .command('exit')
+    .description('Exit this tool')
+    .action(() => process.exit(0));
+  return commands;
+}
+
+function startReplLoop(commands) {
+  const rl = readline.createInterface({
+    input: process.stdin,
+    output: process.stdout
+  });
+
+  rl.setPrompt(crawlerClient.crawlerUrl + '> ');
+  rl.prompt();
+
+  rl.on('line', (line) => {
+    const command = split(line);
+    // pad up the command line to keep commander happy
+    command.unshift('node', 'cc');
+    commands.parse(command);
+    promise
+      .catch(error => console.log(error.message))
+      .finally(() => {
+        promise = Q();
+        rl.prompt();
+      });
+  });
+}
+
+function configureCount(count) {
+  count = Math.max(count, 0);
+  const patch = [
+    { "op": "replace", "path": "/crawler/count", "value": count }
+  ];
+  promise = crawlerClient.configureCrawler(patch).then(() => console.log(`${count ? 'Started' : 'Stopped'} crawler processing`));
+}
+
+function configureOrgs(orgs) {
+  const patch = [
+    { "op": "replace", "path": "/crawler/orgList", "value": orgs }
+  ];
+  promise = crawlerClient.configureCrawler(patch).then(() => console.log('Configured org list'));
+}
+
+function dumpConfig() {
+  promise = crawlerClient.getConfiguration().then(config => console.dir(config));
+}
+
+function setTokens(tokens) {
+  promise = crawlerClient.setTokens(tokens).then(() => console.log('Tokens set'));
+}
+
+function queueRequests(specs) {
+  promise = crawlerClient.queueRequests(specs).then(() => console.log(`Queued ${specs.length} requests`));
+}
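For reference, the configuration commands above are implemented as JSON Patch requests against the crawler service's ```/config``` endpoint, authenticated with the ```X-token``` header (see ```configureCrawler``` in the client below). A sketch of the patch documents that ```start 5``` and ```orgs contoso-d``` produce:

```
// JSON Patch bodies built by configureCount(5) and configureOrgs(['contoso-d']);
// both are sent via PATCH <service-url>/config with the X-token header.
const startPatch = [{ op: 'replace', path: '/crawler/count', value: 5 }];
const orgsPatch = [{ op: 'replace', path: '/crawler/orgList', value: ['contoso-d'] }];
console.log(JSON.stringify(startPatch), JSON.stringify(orgsPatch));
```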
diff --git a/crawlerClient.js b/crawlerClient.js
new file mode 100644
index 0000000..3c9276b
--- /dev/null
+++ b/crawlerClient.js
@@ -0,0 +1,108 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+const request = require('request');
+const Q = require('q');
+
+class CrawlerClient {
+
+  constructor(url, token) {
+    this.crawlerUrl = url || process.env.CRAWLER_SERVICE_URL || 'http://localhost:3000';
+    this.authToken = token || process.env.CRAWLER_SERVICE_AUTH_TOKEN || 'secret';
+  }
+
+  configureCount(count) {
+    count = Math.max(count, 0);
+    const patch = [
+      { "op": "replace", "path": "/crawler/count", "value": count }
+    ];
+    return this.configureCrawler(patch);
+  }
+
+  configureOrgs(orgs) {
+    const patch = [
+      { "op": "replace", "path": "/crawler/orgList", "value": orgs }
+    ];
+    return this.configureCrawler(patch);
+  }
+
+  configureCrawler(patch) {
+    const deferred = Q.defer();
+    request.patch(`${this.crawlerUrl}/config`, {
+      headers: {
+        'X-token': this.authToken
+      },
+      json: true,
+      body: patch
+    }, (error, response, body) => {
+      if (error) {
+        return deferred.reject(new Error(`Failed to configure crawler: ${error.message}.`));
+      }
+      if (response.statusCode !== 200) {
+        return deferred.reject(new Error(`Failed to configure crawler: ${body}.`));
+      }
+      deferred.resolve();
+    });
+    return deferred.promise;
+  }
+
+  getConfiguration() {
+    const deferred = Q.defer();
+    request.get(`${this.crawlerUrl}/config`, {
+      headers: {
+        'X-token': this.authToken
+      },
+      json: true
+    }, (error, response, body) => {
+      if (error) {
+        return deferred.reject(new Error(`Failed to get crawler configuration: ${error.message}.`));
+      }
+      if (response.statusCode !== 200) {
+        return deferred.reject(new Error(`Failed to get crawler configuration: ${body}.`));
+      }
+      deferred.resolve(body);
+    });
+    return deferred.promise;
+  }
+
+  setTokens(tokens) {
+    const deferred = Q.defer();
+    request.put(`${this.crawlerUrl}/tokens`, {
+      headers: {
+        'X-token': this.authToken
+      },
+      body: tokens.join(';')
+    }, (error, response, body) => {
+      if (error) {
+        return deferred.reject(new Error(`Failed to set tokens: ${error.message}.`));
+      }
+      if (response.statusCode !== 200) {
+        return deferred.reject(new Error(`Failed to set tokens: ${body}.`));
+      }
+      deferred.resolve(null);
+    });
+    return deferred.promise;
+  }
+
+  queueRequests(requests, queueName = 'normal') {
+    const deferred = Q.defer();
+    request.post(`${this.crawlerUrl}/requests/${queueName}`, {
+      headers: {
+        'X-token': this.authToken
+      },
+      json: true,
+      body: requests
+    }, (error, response, body) => {
+      if (error) {
+        return deferred.reject(new Error(`Failed to queue requests: ${error.message}.`));
+      }
+      if (response.statusCode !== 200) {
+        return deferred.reject(new Error(`Failed to queue requests: ${body}.`));
+      }
+      deferred.resolve();
+    });
+    return deferred.promise;
+  }
+}
+
+module.exports = CrawlerClient;
\ No newline at end of file
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..8e1dd7a
--- /dev/null
+++ b/package.json
@@ -0,0 +1,24 @@
+{
+  "name": "crawler-cli",
+  "version": "0.1.0",
+  "description": "A simple command line app for controlling a GitHub Crawler",
+  "scripts": {
+    "start": "node ./bin/cc"
+  },
+  "keywords": [
+    "GitHub",
+    "API",
+    "crawler"
+  ],
+  "author": "Jeff McAffer",
+  "license": "MIT",
+  "dependencies": {
+    "argv-split": "^1.0.1",
+    "commander": "^2.9.0",
+    "painless-config": "^0.1.0",
+    "q": "1.4.1",
+    "readline": "^1.3.0",
+    "request": "^2.79.0"
+  },
+  "devDependencies": {}
+}
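With the ```package.json``` above, the tool can also be launched through npm; arguments placed after ```--``` are forwarded to the start script. A usage sketch, assuming dependencies have been installed in the repo root:

```
npm install
npm start -- -i
```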