devops-pipelines/sla.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SLA Investigation\n",
    "1. Run all cells (Menu > Cell > Run All Cells).\n",
    "2. View the report at the bottom."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false,
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Inputs of the investigation. The cell carries the \"parameters\" tag, presumably so a\n",
    "# notebook runner (e.g. papermill) can override these values -- TODO confirm.\n",
    "# Converted with akn.to_kusto_datetime() and sent to the SLA query as 'TriggerTime'.\n",
    "triggerTime = \"2019-08-08T23:50:00.0000000Z\"\n",
    "# Sent to the SLA query as 'ScaleUnit', wrapped in double quotes.\n",
    "scaleUnit = \"tfs-wcus-0\"\n",
    "# Sent to the SLA query as 'Service', wrapped in double quotes.\n",
    "service = \"tfs\"\n",
    "# Converted with akn.to_kusto_timespan() and sent to the SLA query as 'Lookback'.\n",
    "lookback = \"1h\"\n",
    "# Sent to the SLA query as 'Region', wrapped in double quotes; empty by default\n",
    "# (presumably meaning \"no region filter\" -- verify against the query file).\n",
    "region = \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "\n",
    "# Install packages, import dependencies and set up the workspace root.\n",
    "# The output of this cell (including pip's log) is swallowed by %%capture.\n",
    "import sys\n",
    "\n",
    "# Run pip through the interpreter of the running kernel so the packages are installed\n",
    "# into the environment this notebook executes in; a bare `pip` found on PATH may\n",
    "# belong to a different Python installation.\n",
    "!\"{sys.executable}\" -m pip install --upgrade pip azure-kusto-notebooks\n",
    "\n",
    "import concurrent.futures\n",
    "import datetime\n",
    "import os\n",
    "\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from azure.kusto.notebooks import utils as akn\n",
    "\n",
    "# Publish the Table Schema representation of data frames for frontends that support it.\n",
    "pd.options.display.html.table_schema = True\n",
    "\n",
    "# The query files are addressed relative to the workspace root, so step out of\n",
    "# 'devops-pipelines' when the notebook was started from inside that folder.\n",
    "if os.path.basename(os.getcwd()) == 'devops-pipelines':\n",
    "    os.chdir(os.pardir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# Create the Kusto client that every query in this notebook goes through.\n",
    "# Authentication is interactive: you will need to copy the token into a browser\n",
    "# window for AAD auth.\n",
    "kusto_cluster_url = 'https://vso.kusto.windows.net'\n",
    "client = akn.get_client(kusto_cluster_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# Find the orchestrations that violate SLA.\n",
    "query = os.path.join('devops-pipelines', 'queries', 'sla', 'SLADurationAnalysis.csl')\n",
    "\n",
    "# Time values go through the akn converters; plain strings are sent wrapped in double quotes.\n",
    "params = {\n",
    "    'TriggerTime': akn.to_kusto_datetime(triggerTime),\n",
    "    'Lookback': akn.to_kusto_timespan(lookback),\n",
    "}\n",
    "string_params = (('Service', service), ('Region', region), ('ScaleUnit', scaleUnit))\n",
    "params.update({name: '\"' + value + '\"' for name, value in string_params})\n",
    "\n",
    "violations = akn.execute_file(client, database='VSO', path=query, params=params)\n",
    "# violations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# Collect the problematic orchestration ids and group them by scale unit.\n",
    "def column_ordinal(table, name):\n",
    "    \"\"\"Return the ordinal of the first column of `table` called `name`, or None if there is none.\"\"\"\n",
    "    for column in table.columns:\n",
    "        if column.column_name == name:\n",
    "            return column.ordinal\n",
    "    return None\n",
    "\n",
    "result = violations.primary_results[0]\n",
    "oid_column_index = column_ordinal(result, 'OrchestrationId')\n",
    "su_column_index = column_ordinal(result, 'ScaleUnit')\n",
    "\n",
    "# scale unit -> list of orchestration ids, in row order\n",
    "by_su = {}\n",
    "for row in result.rows:\n",
    "    scale_unit, orchestration_id = row[su_column_index], row[oid_column_index]\n",
    "    by_su.setdefault(scale_unit, []).append(orchestration_id)\n",
    "\n",
    "# The scale unit(s) with the most problems; ties are all kept, sorted by name.\n",
    "max_problems = max((len(oids) for oids in by_su.values()), default=0)\n",
    "max_scale_units = sorted(name for name, oids in by_su.items() if len(oids) == max_problems)\n",
    "\n",
    "# for su, oids in by_su.items():\n",
    "#     print(su)\n",
    "#     for oid in oids:\n",
    "#         print('   ', oid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# Collect the visualization data set of every violating orchestration.\n",
    "# The queries run concurrently, one per row of the violation result.\n",
    "query = os.path.join('devops-pipelines', 'queries', 'sla', 'SLAVisualization.csl')\n",
    "with concurrent.futures.ThreadPoolExecutor() as executor:\n",
    "    history_futures = [\n",
    "        executor.submit(akn.execute_file, client, 'VSO', query,\n",
    "                        {\n",
    "                            'ScaleUnit': '\"' + r[su_column_index] + '\"',\n",
    "                            'OrchestrationId': '\"' + r[oid_column_index] + '\"'\n",
    "                        })\n",
    "        for r in result.rows\n",
    "    ]\n",
    "    # Read the results in submission order (not completion order) so the data frames,\n",
    "    # and therefore the graphs drawn from them, come out in the same order on every run.\n",
    "    histories = [future.result() for future in history_futures]\n",
    "\n",
    "# Convert to data frames, again keeping the order of the violation rows.\n",
    "primary_results = [h.primary_results[0] for h in histories]\n",
    "with concurrent.futures.ThreadPoolExecutor() as executor:\n",
    "    dataframe_futures = [executor.submit(akn.to_dataframe, r) for r in primary_results]\n",
    "    dataframes = [future.result() for future in dataframe_futures]\n",
    "# Drop the reference to the raw query responses; only the data frames are used from here on.\n",
    "histories = None\n",
    "\n",
    "# Try to filter out false positives: a plan only counts when all of the required\n",
    "# phases have been recorded for it.\n",
    "required_phases = ('RunAgentJob.SendJob', 'RunAgentJob.JobCompleted')\n",
    "filtered_dataframes = [df for df in dataframes\n",
    "                       if all(p in df['PhaseName'].values for p in required_phases)]\n",
    "number_of_false_positives = len(dataframes) - len(filtered_dataframes)\n",
    "dataframes = filtered_dataframes\n",
    "plans_out_of_sla = [df['PlanId'].iat[0] for df in dataframes]\n",
    "number_of_violations = len(dataframes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# Defaults reported when there is nothing to analyze.\n",
    "worst_phaseName, worst_count, worst_team = '', 0, ''\n",
    "\n",
    "if dataframes:\n",
    "    # What was the worst phase? Count the Level == 2 rows per phase and keep the top one.\n",
    "    combined = pd.concat(dataframes, ignore_index=True)\n",
    "    level_two = combined.loc[combined['Level'] == 2]\n",
    "    phase_counts = level_two.groupby(['PhaseName']).size().to_frame('Count')\n",
    "    top_phase = phase_counts.nlargest(1, 'Count')\n",
    "    if not top_phase.empty:\n",
    "        worst_phaseName = top_phase.index[0]\n",
    "        worst_count = top_phase['Count'].iat[0]\n",
    "        # the part of the phase name before the first '.' is used as the team\n",
    "        worst_team = worst_phaseName.partition('.')[0]\n",
    "\n",
    "    # What was the worst plan? The plan id with the most rows in the violation result.\n",
    "    violations_df = akn.to_dataframe(violations.primary_results[0])\n",
    "    plan_counts = violations_df.groupby(['PlanId']).size().to_frame('Count')\n",
    "    top_plan = plan_counts.nlargest(1, 'Count')\n",
    "    plan_with_most_violations = top_plan.index[0]\n",
    "    plan_with_most_violations_count = top_plan['Count'].iat[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# Print the report: what was found, followed by a conclusion on whether to open an ICM.\n",
    "if number_of_false_positives:\n",
    "    print(number_of_false_positives, 'plans are likely missing kusto data and were ignored.')\n",
    "\n",
    "if number_of_violations > 0:\n",
    "    for scale_unit in max_scale_units:\n",
    "        print(max_problems, 'of the problems were in', scale_unit)\n",
    "\n",
    "    # singular/plural wording of the summary line\n",
    "    noun, verb = ('plans', 'are') if number_of_violations > 1 else ('plan', 'is')\n",
    "    print(number_of_violations, noun, 'had no apparent data problems and', verb, 'out of SLA.')\n",
    "\n",
    "    if plan_with_most_violations in plans_out_of_sla:\n",
    "        print(plan_with_most_violations, 'had the most violations with', plan_with_most_violations_count)\n",
    "\n",
    "    if worst_phaseName:\n",
    "        quoted_phase = '\"' + worst_phaseName + '\"'\n",
    "        print(quoted_phase, 'was the slowest phase in', worst_count,\n",
    "              'of the', number_of_violations, 'SLA violations.')\n",
    "\n",
    "    print('\\nConclusion:')\n",
    "    if number_of_violations > 5:\n",
    "        print('This is likely a real problem. Open icm against scale units:', max_scale_units)\n",
    "        print('Initially route it to:       ', worst_team)\n",
    "    else:\n",
    "        print('Too much uncertainty -- do not open any ICMs.')\n",
    "\n",
    "        # Many ignored plans relative to the problem count of the worst scale unit(s)\n",
    "        # hint at missing kusto data rather than slow plans.\n",
    "        if number_of_false_positives:\n",
    "            ignored_ratio = float(number_of_false_positives) / float(max_problems)\n",
    "            if ignored_ratio > .5:\n",
    "                for scale_unit in max_scale_units:\n",
    "                    print(scale_unit, 'might be unhealthy based on the number of plans missing kusto data.')\n",
    "else:\n",
    "    print('no problems detected')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "plt.rcdefaults()\n",
    "\n",
    "# Draw one bar chart per plan out of SLA: PercentDifference of every recorded phase.\n",
    "# At most `max_graphs` plans are drawn to keep the figure manageable.\n",
    "max_graphs = 25\n",
    "\n",
    "if dataframes:\n",
    "    number_of_graphs = min(max_graphs, len(dataframes))\n",
    "    # squeeze=False always returns a 2-D array of axes, so a single graph needs no special case\n",
    "    fig, axes = plt.subplots(nrows=number_of_graphs, ncols=1, figsize=(8, 6 * number_of_graphs),\n",
    "                             constrained_layout=True, squeeze=False)\n",
    "    # zip stops after `number_of_graphs` data frames because there are only that many axes\n",
    "    for df, ax in zip(dataframes, axes[:, 0]):\n",
    "        ax.axhline(0, color='k')\n",
    "\n",
    "        x = df['PhaseName']\n",
    "        xpos = np.arange(len(x))\n",
    "        y = df['PercentDifference']\n",
    "        plan_id = df['PlanId'].iloc[0]\n",
    "        # the first violation row of the plan supplies scale unit and durations for the title\n",
    "        violation_row = violations_df.loc[violations_df['PlanId'] == plan_id]\n",
    "        title = '\\n'.join([\n",
    "            # str() like the other fields, so a non-string plan id cannot break the concatenation\n",
    "            'plan id:' + str(plan_id),\n",
    "            'scale unit:'     + str(violation_row['ScaleUnit'].iloc[0]),\n",
    "            'definition:'     + str(df['DefinitionName'].iloc[0]),\n",
    "            'plan duration: ' + str(violation_row['PlanDuration'].iloc[0]),\n",
    "            'sla duration: '  + str(violation_row['TotalSLADuration'].iloc[0]),\n",
    "        ])\n",
    "        ax.set_title(title)\n",
    "\n",
    "        ax.bar(x=xpos, height=y)\n",
    "        ax.set_xticks(xpos)\n",
    "        ax.set_xticklabels(x, rotation=45, ha=\"right\")\n",
    "\n",
    "# output_filename = 'analysis.svg'\n",
    "# plt.savefig(output_filename, format='svg')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "nteract": {
   "version": "nteract-on-jupyter@2.1.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}