- Resolves #110
Fred Park 2017-08-04 14:47:16 -07:00
Parent c5fa85adcb
Commit 8a396f0e18
5 changed files: 367 additions and 9 deletions

View file

@@ -5,6 +5,8 @@
### Added
- `random` and `file` task factories. See task factory guide for more
information.
- Summary statistics: `pool stats` and `jobs stats`. See the usage doc for
more information.
- Delete unusable nodes from pool with `--all-unusable` option for
`pool delnode`
- CNTK-GPU-Infiniband-IntelMPI recipe
@@ -34,6 +36,8 @@ guide for more information.
- Job priority support
- Job migration support
- Compute node fill type support
- New commands: `jobs enable` and `jobs disable`. Please see the usage doc
for more information.
- From Scratch: Step-by-Step guide
- Azure Cloud Shell information

View file

@@ -45,6 +45,7 @@ import tempfile
import time
# non-stdlib imports
import azure.batch.models as batchmodels
import dateutil.tz
# local imports
from . import autoscale
from . import crypto
@@ -820,6 +821,178 @@ def del_pool(batch_client, config, pool_id=None):
    return True


def pool_stats(batch_client, config, pool_id=None):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        str) -> None
    """Get pool stats
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str pool_id: pool id
    """
    if util.is_none_or_empty(pool_id):
        pool_id = settings.pool_id(config)
    try:
        pool = batch_client.pool.get(
            pool_id=pool_id,
            pool_get_options=batchmodels.PoolGetOptions(expand='stats'),
        )
    except batchmodels.batch_error.BatchErrorException as ex:
        if 'The specified pool does not exist' in ex.message.value:
            logger.error('pool {} does not exist'.format(pool_id))
            return
        # re-raise unrecognized errors instead of leaving pool unbound
        raise
    if pool.stats is not None and pool.stats.usage_stats is not None:
        usage_stats = '{} * Total core hours: {} (last updated: {})'.format(
            os.linesep,
            pool.stats.usage_stats.dedicated_core_time,
            pool.stats.usage_stats.last_update_time,
        )
    else:
        usage_stats = ''
    nodes = list(batch_client.compute_node.list(pool_id))
    nsc = []
    runnable_nodes = 0
    for key, value in _node_state_counts(nodes)._asdict().items():
        if key == 'running' or key == 'idle':
            runnable_nodes += value
        nsc.append(' * {}: {}'.format(key, value))
    node_up_times = []
    node_alloc_times = []
    node_start_times = []
    tasks_run = []
    tasks_running = []
    now = datetime.datetime.now(dateutil.tz.tzutc())
    for node in nodes:
        if node.last_boot_time is not None:
            node_up_times.append((now - node.last_boot_time).total_seconds())
        if (node.start_task_info is not None and
                node.start_task_info.end_time is not None):
            node_alloc_times.append(
                (node.start_task_info.end_time -
                 node.allocation_time).total_seconds()
            )
            node_start_times.append(
                (node.start_task_info.end_time -
                 node.last_boot_time).total_seconds()
            )
        tasks_run.append(node.total_tasks_run)
        tasks_running.append(node.running_tasks_count)
    total_running_tasks = sum(tasks_running)
    runnable_task_slots = runnable_nodes * pool.max_tasks_per_node
    total_task_slots = (
        pool.current_dedicated_nodes + pool.current_low_priority_nodes
    ) * pool.max_tasks_per_node
    busy_task_slots_fraction = (
        # guard the actual divisor: runnable slots can be zero even when
        # the pool still has (non-runnable) nodes
        0 if runnable_task_slots == 0 else
        total_running_tasks / runnable_task_slots
    )
    version = 'N/A'
    for md in pool.metadata:
        if md.name == settings.get_metadata_version_name():
            version = md.value
            break
    log = [
        '* Batch Shipyard version: {}'.format(version),
        '* Total nodes: {}'.format(
            pool.current_dedicated_nodes + pool.current_low_priority_nodes
        ),
        ' * Dedicated nodes: {0} ({1:.1f}% of target){2}'.format(
            pool.current_dedicated_nodes,
            100 * (
                1 if pool.target_dedicated_nodes == 0 else
                pool.current_dedicated_nodes / pool.target_dedicated_nodes),
            usage_stats,
        ),
        ' * Low Priority nodes: {0} ({1:.1f}% of target)'.format(
            pool.current_low_priority_nodes,
            100 * (
                1 if pool.target_low_priority_nodes == 0 else
                pool.current_low_priority_nodes /
                pool.target_low_priority_nodes)
        ),
        '* Node states:',
        os.linesep.join(nsc),
    ]
    if len(node_up_times) > 0:
        log.extend([
            '* Node uptime:',
            ' * Mean: {}'.format(
                datetime.timedelta(
                    seconds=(sum(node_up_times) / len(node_up_times)))
            ),
            ' * Min: {}'.format(
                datetime.timedelta(seconds=min(node_up_times))
            ),
            ' * Max: {}'.format(
                datetime.timedelta(seconds=max(node_up_times))
            ),
        ])
    if len(node_alloc_times) > 0:
        log.extend([
            '* Time taken for node creation to ready:',
            ' * Mean: {}'.format(
                datetime.timedelta(
                    seconds=(sum(node_alloc_times) / len(node_alloc_times)))
            ),
            ' * Min: {}'.format(
                datetime.timedelta(seconds=min(node_alloc_times))
            ),
            ' * Max: {}'.format(
                datetime.timedelta(seconds=max(node_alloc_times))
            ),
        ])
    if len(node_start_times) > 0:
        log.extend([
            '* Time taken for last boot startup (includes prep):',
            ' * Mean: {}'.format(
                datetime.timedelta(
                    seconds=(sum(node_start_times) / len(node_start_times)))
            ),
            ' * Min: {}'.format(
                datetime.timedelta(seconds=min(node_start_times))
            ),
            ' * Max: {}'.format(
                datetime.timedelta(seconds=max(node_start_times))
            ),
        ])
    if len(tasks_running) > 0:
        log.extend([
            '* Running tasks:',
            ' * Sum: {}'.format(total_running_tasks),
            ' * Mean: {}'.format(total_running_tasks / len(tasks_running)),
            ' * Min: {}'.format(min(tasks_running)),
            ' * Max: {}'.format(max(tasks_running)),
        ])
    if len(tasks_run) > 0:
        log.extend([
            '* Total tasks run:',
            ' * Sum: {}'.format(sum(tasks_run)),
            ' * Mean: {}'.format(sum(tasks_run) / len(tasks_run)),
            ' * Min: {}'.format(min(tasks_run)),
            ' * Max: {}'.format(max(tasks_run)),
        ])
    log.extend([
        '* Task scheduling slots:',
        ' * Busy: {0} ({1:.2f}% of runnable)'.format(
            total_running_tasks, 100 * busy_task_slots_fraction
        ),
        ' * Available: {0} ({1:.2f}% of runnable)'.format(
            runnable_task_slots - total_running_tasks,
            100 * (1 - busy_task_slots_fraction)
        ),
        ' * Runnable: {0} ({1:.2f}% of total)'.format(
            runnable_task_slots,
            100 * (
                runnable_task_slots / total_task_slots
                if total_task_slots > 0 else 0
            ),
        ),
        ' * Total: {}'.format(total_task_slots),
    ])
    logger.info('statistics summary for pool {}{}{}'.format(
        pool_id, os.linesep, os.linesep.join(log)))
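Aside: the uptime/allocation/startup blocks above repeat one mean/min/max
aggregation. A small helper along these lines (hypothetical, not part of this
commit) would capture the pattern:

```
import datetime


def _summarize_seconds(label, samples):
    # hypothetical helper: render mean/min/max of a list of durations in
    # seconds using the same bullet format pool_stats and job_stats emit
    if len(samples) == 0:
        return []
    return [
        '* {}:'.format(label),
        ' * Mean: {}'.format(
            datetime.timedelta(seconds=sum(samples) / len(samples))),
        ' * Min: {}'.format(datetime.timedelta(seconds=min(samples))),
        ' * Max: {}'.format(datetime.timedelta(seconds=max(samples))),
    ]
```

Each `if len(...) > 0:` block above would then reduce to a call such as
`log.extend(_summarize_seconds('Node uptime', node_up_times))`.
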


def pool_autoscale_disable(batch_client, config):
    # type: (batch.BatchServiceClient, dict) -> None
    """Disable autoscale formula
@@ -1069,6 +1242,131 @@ def update_job_with_pool(batch_client, config, jobid=None, poolid=None):
        job_id, poolid))


def job_stats(batch_client, config, jobid=None):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        str) -> None
    """Job stats
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str jobid: job id to query
    """
    if jobid is not None:
        try:
            job = batch_client.job.get(
                job_id=jobid,
                job_get_options=batchmodels.JobGetOptions(expand='stats'),
            )
        except batchmodels.batch_error.BatchErrorException as ex:
            if 'The specified job does not exist' in ex.message.value:
                raise RuntimeError('job {} does not exist'.format(jobid))
            # re-raise unrecognized errors instead of leaving job unbound
            raise
        jobs = [job]
    else:
        jobs = list(batch_client.job.list(
            job_list_options=batchmodels.JobListOptions(expand='stats')))
    job_count = 0
    job_times = []
    task_times = []
    task_wall_times = []
    task_counts = batchmodels.TaskCounts(0, 0, 0, 0, 0, 'validated')
    total_tasks = 0
    for job in jobs:
        job_count += 1
        # get task counts
        tc = batch_client.job.get_task_counts(job_id=job.id)
        task_counts.active += tc.active
        task_counts.running += tc.running
        task_counts.completed += tc.completed
        task_counts.succeeded += tc.succeeded
        task_counts.failed += tc.failed
        total_tasks += tc.active + tc.running + tc.completed
        if (tc.validation_status !=
                batchmodels.TaskCountValidationStatus.validated):
            task_counts.validation_status = tc.validation_status
        if job.execution_info.end_time is not None:
            job_times.append(
                (job.execution_info.end_time -
                 job.execution_info.start_time).total_seconds())
        # get task-level execution info
        tasks = batch_client.task.list(
            job_id=job.id,
            task_list_options=batchmodels.TaskListOptions(
                filter='(state eq \'running\') or (state eq \'completed\')',
                select='id,state,stats,executionInfo',
            ))
        for task in tasks:
            if task.stats is not None:
                task_wall_times.append(
                    task.stats.wall_clock_time.total_seconds())
            if (task.execution_info is not None and
                    task.execution_info.end_time is not None):
                task_times.append(
                    (task.execution_info.end_time -
                     task.execution_info.start_time).total_seconds())
    log = [
        '* Total jobs: {}'.format(job_count),
        '* Total tasks: {} ({})'.format(
            total_tasks, task_counts.validation_status
        ),
        ' * Active: {}'.format(task_counts.active),
        ' * Running: {}'.format(task_counts.running),
        ' * Completed: {}'.format(task_counts.completed),
        ' * Succeeded: {0} ({1:.2f}% of completed)'.format(
            task_counts.succeeded,
            100 * task_counts.succeeded / task_counts.completed
            if task_counts.completed > 0 else 0
        ),
        ' * Failed: {0} ({1:.2f}% of completed)'.format(
            task_counts.failed,
            100 * task_counts.failed / task_counts.completed
            if task_counts.completed > 0 else 0
        ),
    ]
    if len(job_times) > 0:
        log.extend([
            '* Job creation to completion time:',
            ' * Mean: {}'.format(
                datetime.timedelta(seconds=(sum(job_times) / len(job_times)))
            ),
            ' * Min: {}'.format(
                datetime.timedelta(seconds=min(job_times))
            ),
            ' * Max: {}'.format(
                datetime.timedelta(seconds=max(job_times))
            ),
        ])
    if len(task_times) > 0:
        log.extend([
            '* Task end-to-end time:',
            ' * Mean: {}'.format(
                datetime.timedelta(seconds=(sum(task_times) / len(task_times)))
            ),
            ' * Min: {}'.format(
                datetime.timedelta(seconds=min(task_times))
            ),
            ' * Max: {}'.format(
                datetime.timedelta(seconds=max(task_times))
            ),
        ])
    if len(task_wall_times) > 0:
        log.extend([
            '* Task command walltime (running and completed):',
            ' * Mean: {}'.format(
                datetime.timedelta(
                    seconds=(sum(task_wall_times) / len(task_wall_times)))
            ),
            ' * Min: {}'.format(
                datetime.timedelta(seconds=min(task_wall_times))
            ),
            ' * Max: {}'.format(
                datetime.timedelta(seconds=max(task_wall_times))
            ),
        ])
    logger.info('statistics summary for {}{}{}'.format(
        'job {}'.format(jobid) if jobid is not None else 'all jobs',
        os.linesep, os.linesep.join(log)))
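Aside: the task listing above pushes its filter to the service as OData and
projects only the fields the computation reads. Narrowing the same query to
completed tasks only would look like this (a sketch; assumes a constructed
`batch_client`, and `myjob` is a placeholder job id):

```
import azure.batch.models as batchmodels

# OData filter evaluated server-side; select trims the returned fields
opts = batchmodels.TaskListOptions(
    filter='state eq \'completed\'',
    select='id,state,stats,executionInfo',
)
for task in batch_client.task.list(job_id='myjob', task_list_options=opts):
    print(task.id, task.state)
```
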


def disable_jobs(
        batch_client, config, disable_tasks_action, jobid=None,
        disabling_state_ok=False, term_tasks=False):
@@ -1661,15 +1959,17 @@ def terminate_tasks(
        raise


def list_nodes(batch_client, config, pool_id=None, nodes=None):
    # type: (batch.BatchServiceClient, dict, str, list) -> None
    """Get a list of nodes
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str pool_id: pool id
    :param list nodes: list of nodes
    """
    if util.is_none_or_empty(pool_id):
        pool_id = settings.pool_id(config)
    logger.debug('listing nodes for pool {}'.format(pool_id))
    if nodes is None:
        nodes = batch_client.compute_node.list(pool_id)
@@ -2325,7 +2625,6 @@ def check_jobs_for_auto_pool(config):
        else:
            autopool.append(True)
    if autopool.count(False) == len(autopool):
        return False
    elif autopool.count(True) == len(autopool):
        logger.debug('autopool detected for jobs')

View file

@@ -2509,6 +2509,17 @@ def action_pool_listimages(batch_client, config):
    _list_docker_images(batch_client, config)


def action_pool_stats(batch_client, config, pool_id):
    # type: (batchsc.BatchServiceClient, dict, str) -> None
    """Action: Pool Stats
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param str pool_id: pool id
    """
    batch.pool_stats(batch_client, config, pool_id=pool_id)


def action_pool_autoscale_disable(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """Action: Pool Autoscale Disable
@@ -2867,6 +2878,17 @@ def action_jobs_enable(batch_client, config, jobid):
    batch.enable_jobs(batch_client, config, jobid=jobid)


def action_jobs_stats(batch_client, config, job_id):
    # type: (batchsc.BatchServiceClient, dict, str) -> None
    """Action: Jobs Stats
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param str job_id: job id
    """
    batch.job_stats(batch_client, config, jobid=job_id)


def action_storage_del(
        blob_client, queue_client, table_client, config, clear_tables, poolid):
    # type: (azureblob.BlockBlobService, azurequeue.QueueService,

View file

@@ -388,6 +388,8 @@ file
  * `--requeue` requeue running tasks
  * `--terminate` terminate running tasks
  * `--wait` wait for running tasks to complete
* `stats` will generate a statistics summary of a job or jobs
  * `--jobid` will query the specified job instead of all jobs
* `term` will terminate jobs found in the jobs configuration file. If an
autopool is specified for all jobs and a jobid option is not specified,
the storage associated with the autopool will be cleaned up.
@@ -456,12 +458,13 @@ The `pool` command has the following sub-commands:
    dsu         Delete an SSH user from all nodes in pool
    grls        Get remote login settings for all nodes in...
    list        List all pools in the Batch account
    listimages  List Docker images in a pool
    listnodes   List nodes in pool
    listskus    List available VM configurations available to...
    rebootnode  Reboot a node or nodes in a pool
    resize      Resize a pool
    ssh         Interactively login via SSH to a node in a...
    stats       Get statistics about a pool
    udi         Update Docker images in a pool
```
* `add` will add the pool defined in the pool configuration file to the * `add` will add the pool defined in the pool configuration file to the
@@ -514,6 +517,9 @@ configuration file
the pool to connect to as listed by `grls`
  * `--nodeid` is the node id to connect to in the pool
  * `--tty` allocates a pseudo-terminal
* `stats` will generate a statistics summary of the pool (see the sketch
after this list)
  * `--poolid` will query the specified pool instead of the pool from the
pool configuration file
* `udi` will update Docker images on all compute nodes of the pool. This
command requires a valid SSH user.
  * `--image` will restrict the update to just the image or image:tag
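
The sketch referenced above: driving the same statistics from Python rather
than the CLI. The account name, key, endpoint, and ids are placeholders;
`convoy.batch` is this repository's module, and `config` is normally built by
shipyard from the configuration files:

```
import azure.batch.batch_auth as batchauth
import azure.batch.batch_service_client as batchsc

from convoy import batch

# placeholder credentials -- substitute real Batch account values
credentials = batchauth.SharedKeyCredentials('myaccount', '<base64-key>')
batch_client = batchsc.BatchServiceClient(
    credentials, base_url='https://myaccount.<region>.batch.azure.com')
config = {}  # normally populated from the shipyard configuration files

batch.pool_stats(batch_client, config, pool_id='mypool')  # pool summary
batch.job_stats(batch_client, config)  # all jobs
batch.job_stats(batch_client, config, jobid='myjob')  # a single job
```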

View file

@@ -1178,7 +1178,7 @@ def pool_dsu(ctx):
@aad_options
@pass_cli_context
def pool_ssh(ctx, cardinal, nodeid, tty, command):
    """Interactively login via SSH to a node in a pool"""
    ctx.initialize_for_batch()
    convoy.fleet.action_pool_ssh(
        ctx.batch_client, ctx.config, cardinal, nodeid, tty, command)
@@ -1253,11 +1253,25 @@ def pool_udi(ctx, image, digest, ssh):
@aad_options
@pass_cli_context
def pool_listimages(ctx):
    """List Docker images in a pool"""
    ctx.initialize_for_batch()
    convoy.fleet.action_pool_listimages(ctx.batch_client, ctx.config)


@pool.command('stats')
@click.option('--poolid', help='Get stats on specified pool')
@common_options
@batch_options
@keyvault_options
@aad_options
@pass_cli_context
def pool_stats(ctx, poolid):
    """Get statistics about a pool"""
    ctx.initialize_for_batch()
    convoy.fleet.action_pool_stats(
        ctx.batch_client, ctx.config, pool_id=poolid)


@pool.group()
@pass_cli_context
def autoscale(ctx):
@@ -1530,6 +1544,19 @@ def jobs_enable(ctx, jobid):
    convoy.fleet.action_jobs_enable(ctx.batch_client, ctx.config, jobid)


@jobs.command('stats')
@click.option('--jobid', help='Get stats only on the specified job id')
@common_options
@batch_options
@keyvault_options
@aad_options
@pass_cli_context
def jobs_stats(ctx, jobid):
    """Get statistics about jobs"""
    ctx.initialize_for_batch()
    convoy.fleet.action_jobs_stats(ctx.batch_client, ctx.config, job_id=jobid)


@cli.group()
@pass_cli_context
def data(ctx):