devops-pipelines/impact.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Impact Investigation\n",
    "1. Run all cells.\n",
    "1. View report at the bottom."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false,
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "su = \"tfs-wcus-0\"\n",
    "start = \"2019-08-08T23:50:00.0000000Z\"\n",
    "end = \"2019-08-09T00:24:36.0000000Z\"\n",
    "url = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\"\n",
    "baseUrl = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\"\n",
    "service = \"tfs\"\n",
    "hub = \"Build\"\n",
    "locationName = \"tfsprodwcus0\"\n",
    "mdmAccount = \"VSO-TFS\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "!pip install nimport azure-kusto-notebooks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# Import the things we use\n",
    "\n",
    "# Note you can also use kql https://docs.microsoft.com/en-us/azure/data-explorer/kqlmagic\n",
    "# %kql is single line magic\n",
    "# %%kql is cell magic\n",
    "\n",
    "# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n",
    "# https://ipython.readthedocs.io/en/stable/inte/magics.html\n",
    "from IPython.display import display, HTML, Markdown, Javascript, clear_output\n",
    "\n",
    "# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n",
    "import pandas as pd\n",
    "pd.options.display.html.table_schema = False\n",
    "from pandas import Series, DataFrame\n",
    "from datetime import datetime, timedelta, timezone\n",
    "from urllib.parse import urlencode, quote_plus\n",
    "from requests.utils import requote_uri\n",
    "import time\n",
    "import numpy as np\n",
    "from matplotlib import pyplot as plt\n",
    "from nimport.utils import tokenize, open_nb\n",
    "import json\n",
    "import os\n",
    "import calendar as cal\n",
    "import concurrent.futures\n",
    "from azure.kusto.notebooks import utils as akn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "params = {\n",
    "    \"su\": su,\n",
    "    \"start\": start,\n",
    "    \"end\": end,\n",
    "    \"url\": url,\n",
    "    \"baseUrl\": baseUrl,\n",
    "    \"service\": service\n",
    "}\n",
    "root = 'devops-pipelines' if os.path.basename(os.getcwd()) != 'devops-pipelines' else ''\n",
    "queryPath = os.path.join(root, 'queries')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# authenticate kusto client\n",
    "# you will need to copy the token into a browser window for AAD auth. \n",
    "client = akn.get_client('https://vso.kusto.windows.net')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# authenticate kusto client\n",
    "# you will need to copy the token into a browser window for AAD auth. \n",
    "icm_client = akn.get_client('https://icmcluster.kusto.windows.net')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "q_loc = os.path.join(queryPath, \"LocationName.csl\")\n",
    "q_whatChanged = os.path.join(queryPath, \"WhatChanged.csl\")\n",
    "q_mdm = os.path.join(queryPath, \"MDMAccount.csl\")\n",
    "\n",
    "impactPath = os.path.join(queryPath, \"impact\")\n",
    "q_commands = os.path.join(impactPath, \"CommandsReason.csl\")\n",
    "q_commandsAT = os.path.join(impactPath, \"CommandsAT.csl\")\n",
    "q_commandsDb = os.path.join(impactPath, \"CommandsDb.csl\")\n",
    "q_gen2 = os.path.join(impactPath, \"Gen2GCSpikes.csl\")\n",
    "q_dep = os.path.join(impactPath, \"Dependencies.csl\")\n",
    "with concurrent.futures.ThreadPoolExecutor() as executor:\n",
    "    # materialize location name immediately as we need this for other queries\n",
    "    p1 = executor.submit(akn.execute_file, client, 'VSO', q_loc, params)\n",
    "    locationNameResult = akn.to_dataframe_from_future(p1)\n",
    "    locationName = locationNameResult[\"Tenant\"][0]\n",
    "    params[\"locationName\"] = locationName\n",
    "    p2 = executor.submit(akn.execute_file, client, 'VSO', q_whatChanged, params)\n",
    "    \n",
    "    p4 = executor.submit(akn.execute_file, client, 'VSO', q_commandsAT, params)\n",
    "    p5 = executor.submit(akn.execute_file, client, 'VSO', q_commandsDb, params)        \n",
    "    p6 = executor.submit(akn.execute_file, client, 'VSO', q_commands, params)\n",
    "    \n",
    "    p7 = executor.submit(akn.execute_file, icm_client, 'IcmDataWarehouse', \n",
    "                          os.path.join(queryPath, 'ActiveIncidents.csl'), params)\n",
    "    \n",
    "    p8 = executor.submit(akn.execute_file, client, 'VSO', q_gen2, params)\n",
    "    p9 = executor.submit(akn.execute_file, client, 'VSO', q_mdm, params)\n",
    "    p10 = executor.submit(akn.execute_file, client, 'VSO', q_dep, params)\n",
    "\n",
    "q_whatChanged_df = akn.to_dataframe_from_future(p2)\n",
    "\n",
    "q_commandsAT_df = akn.to_dataframe_from_future(p4)\n",
    "\n",
    "q_commandsDb_df = akn.to_dataframe_from_future(p5)\n",
    "\n",
    "q_commands_df = akn.to_dataframe_from_future(p6)\n",
    "\n",
    "q_activeIncidentsResultDf = akn.to_dataframe_from_future(p7)\n",
    "\n",
    "q_gen2Df = akn.to_dataframe_from_future(p8)\n",
    "\n",
    "q_mdmDf = akn.to_dataframe_from_future(p9)\n",
    "params[\"mdmAccount\"] = q_mdmDf[\"monitoringAccount\"][0]\n",
    "\n",
    "q_depDf = akn.to_dataframe_from_future(p10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "print('=' * 50)\n",
    "print('Report!')\n",
    "print('=' * 50, '\\n\\n')\n",
    "\n",
    "# jarvis params\n",
    "jarvisParams = {\n",
    "  'su': su, \n",
    "  'start': akn.get_time(start, -10), \n",
    "  'end': akn.get_time(end, 10), \n",
    "  'service': service,\n",
    "  'location': locationName,\n",
    "  'account': params[\"mdmAccount\"]\n",
    "}\n",
    "\n",
    "display(params)\n",
    "\n",
    "startTime = akn.to_datetime(start)\n",
    "# jarvis\n",
    "jarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/DevOpsReports/DevOpsReports\"\"\" \\\n",
    "    \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n",
    "    \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n",
    "    \"\"\"{\"query\":\"//*[id='LocationName']\",\"key\":\"value\",\"replacement\":\"%(location)s\"},\"\"\" \\\n",
    "    \"\"\"{\"query\":\"//dataSources\",\"key\":\"namespace\",\"replacement\":\"%(su)s\"},\"\"\" \\\n",
    "    \"\"\"{\"query\":\"//dataSources\",\"key\":\"account\",\"replacement\":\"%(account)s\"},\"\"\" \\\n",
    "    \"\"\"{\"query\":\"//*[id='ApplicationEndpoint']\",\"key\":\"regex\",\"replacement\":\"*%(location)s*\"},\"\"\" \\\n",
    "    \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n",
    "    \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams;\n",
    "print('Jarvis dashboard link:\\n', requote_uri(jarvisLink), '\\n')\n",
    "    \n",
    "# slow failed reason analysis\n",
    "print()\n",
    "print('Is it slow commands or failed commands? =============================')\n",
    "freq = q_commands_df[\"Frequency\"]\n",
    "coefficientOfVariance = freq.std()/freq.mean()\n",
    "failedCount = q_commands_df[q_commands_df[\"Reason\"] == \"failed\"][\"Frequency\"].values[0]\n",
    "slowCount = q_commands_df[q_commands_df[\"Reason\"] == \"slow\"][\"Frequency\"].values[0]\n",
    "reason = \"failed or slow\"\n",
    "if coefficientOfVariance > 0.5:\n",
    "    if failedCount > slowCount:\n",
    "        reason = \"failed\"\n",
    "    else:\n",
    "        reason = \"slow\"\n",
    "else:\n",
    "    print(\"Slow and failed commands are too close, both might be contributing...\")\n",
    "if reason:\n",
    "    print(\"Probably due to %s commands; Failed - %s, Slow - %s\" % (reason, failedCount, slowCount))\n",
    "\n",
    "# slow failed reason for AT?\n",
    "print()\n",
    "print('Is it %s because of AT? =============================' % (reason))\n",
    "failed = q_commandsAT_df[q_commandsAT_df[\"Reason\"] == \"failed\"]\n",
    "slow = q_commandsAT_df[q_commandsAT_df[\"Reason\"] == \"slow\"]\n",
    "data = q_commandsAT_df\n",
    "if reason == \"failed\":\n",
    "    data = failed\n",
    "elif reason == \"slow\":\n",
    "    data = slow\n",
    "\n",
    "coefficientOfVariance = data[\"Frequency\"].std()/data[\"Frequency\"].mean()\n",
    "    \n",
    "if coefficientOfVariance > 0.5:\n",
    "    print(\"Found variance in AT's for %s commands\" % (reason))\n",
    "    print(data.head(30))\n",
    "else:\n",
    "    print(\"Seems be same across AT's for %s commands\" % (reason))\n",
    "    \n",
    "# slow failed reason for Db?\n",
    "print()\n",
    "print('Is it %s because of Db? =============================' % (reason))\n",
    "failed = q_commandsDb_df[q_commandsDb_df[\"Reason\"] == \"failed\"]\n",
    "slow = q_commandsDb_df[q_commandsDb_df[\"Reason\"] == \"slow\"]\n",
    "data = q_commandsDb_df\n",
    "if reason == \"failed\":\n",
    "    data = failed\n",
    "elif reason == \"slow\":\n",
    "    data = slow\n",
    "\n",
    "coefficientOfVariance = data[\"Frequency\"].std()/data[\"Frequency\"].mean()\n",
    "    \n",
    "if coefficientOfVariance > 0.5:\n",
    "    print(\"Found variance in Db's for %s commands\" % (reason))\n",
    "    print(\"Suffix '%s' to database server name\" % (\".database.windows.net\"))\n",
    "    print(\"Prefix '%s' to database name\" % (params[\"service\"] + \"_\" + params[\"locationName\"] + \"_\"))\n",
    "    print(data.head(30))\n",
    "else:\n",
    "    print(\"Seems be same across Db's for %s commands\" % (reason))    \n",
    "    \n",
    "# what changed? analysis\n",
    "print()\n",
    "print('What changed? =============================')\n",
    "if(len(q_whatChanged_df.index) == 0):\n",
    "    print(\"...no relevant config changes recorded during this period.\")\n",
    "else:\n",
    "    up_prefix = \"\"\n",
    "    mit_prefix = \"\"\n",
    "    vip_prefix = \"\"\n",
    "    f_prefix = \"\"\n",
    "    text = \"\"\n",
    "    for index, row in q_whatChanged_df.iterrows():\n",
    "        delta = startTime.replace(tzinfo=None) - row.TIMESTAMP.replace(tzinfo=None)\n",
    "        when = \"before\"\n",
    "        if delta.total_seconds() < 0:\n",
    "            when = \"after\"\n",
    "            delta = row.TIMESTAMP.replace(tzinfo=None) - startTime.replace(tzinfo=None)\n",
    "        hoursHappened = delta.total_seconds()//3600\n",
    "        considerTime = hoursHappened <= 1\n",
    "        def getText(row):\n",
    "            return \"\"\"%s %s %s (%s days %s hours %s minutes %s the start time) \\n\\n\"\"\" % (row.TIMESTAMP, row.title, row.buildNumber, delta.days, delta.seconds//3600, delta.seconds//60, when)\n",
    "        if(row.title.lower().find('upgrade') != -1):\n",
    "            if not up_prefix:\n",
    "                up_prefix += \"Looks like, there's upgrade...\\n\\n\"\n",
    "            text += getText(row)\n",
    "        if(row.title.lower().find('mitigation') != -1):\n",
    "            if considerTime and not mit_prefix:\n",
    "                mit_prefix += \"Looks like, there are some mitigations by health agent...\\n\\n\"\n",
    "            text += getText(row)\n",
    "        if(row.title.lower().find('vip') != -1):\n",
    "            if considerTime and not vip_prefix:\n",
    "                vip_prefix += \"Looks like, there is VIP swap...\\n\\n\"\n",
    "            text += getText(row)  \n",
    "        if(row.title.lower().find('feature flag') != -1):\n",
    "            if considerTime and not f_prefix:\n",
    "                f_prefix += \"Looks like, some feature flags are enabled...\\n\\n\"\n",
    "            text += getText(row)\n",
    "    if text:\n",
    "        print(up_prefix + mit_prefix + f_prefix + vip_prefix + text)\n",
    "    else:\n",
    "        print(\"...no relevant changes during this period.\")\n",
    "        \n",
    "# active incidents?\n",
    "print()\n",
    "print('Active incidents? =============================')\n",
    "otherIncidentsCount = 0;\n",
    "for index, row in q_activeIncidentsResultDf.iterrows():\n",
    "    if(row.Title.find(\"Customer Impact Monitor\") == -1):\n",
    "        otherIncidentsCount+=1;\n",
    "        \n",
    "if(otherIncidentsCount > 0):\n",
    "    print(\"We found some incidents during the time period, check if they are related...\")\n",
    "    # styling\n",
    "    def make_clickable(url, text):\n",
    "        return '{0}'.format(url)\n",
    "\n",
    "    newDf = q_activeIncidentsResultDf.assign(URL=[*map(lambda x: make_clickable(\"\"\"https://icm.ad.msft.net/imp/v3/incidents/details/%s/home\"\"\" % (x), \"ICMLink\"), q_activeIncidentsResultDf.IncidentId)])\n",
    "    print(\"ICM link to copy - \" + \"https://icm.ad.msft.net/imp/v3/incidents/details/INCIDENTID/home\")\n",
    "    print(newDf[['IncidentId','Severity','Title']])\n",
    "else:\n",
    "    print(\"No active incidents that could be related are found...\")  \n",
    "    \n",
    "print()\n",
    "print('Dependencies insights =============================')\n",
    "r = q_depDf.describe()\n",
    "redis = r[\"avg_RedisExecutionTimeInMs\"]\n",
    "s2s = r[\"avg_VssClientExecutionTimeInMs\"]\n",
    "sql = r[\"avg_SqlExecutionTimeInMs\"]\n",
    "queue = r[\"avg_QueueTimeInMs\"]\n",
    "maxs = [redis[\"max\"], s2s[\"max\"], sql[\"max\"], queue[\"max\"]]\n",
    "means = [redis[\"mean\"], s2s[\"mean\"], sql[\"mean\"], queue[\"mean\"]]\n",
    "up = redis[\"75%\"]\n",
    "lo = redis[\"25%\"]\n",
    "IQR = up - lo\n",
    "redisAnom = q_depDf[q_depDf[\"avg_RedisExecutionTimeInMs\"] > up + 3.5*IQR][[\"PreciseTimeStamp\", \"avg_RedisExecutionTimeInMs\"]].sort_values(by='PreciseTimeStamp') \n",
    "up = s2s[\"75%\"]\n",
    "lo = s2s[\"25%\"]\n",
    "IQR = up - lo\n",
    "s2sAnom = q_depDf[q_depDf[\"avg_VssClientExecutionTimeInMs\"] > up + 3.5*IQR][[\"PreciseTimeStamp\", \"avg_VssClientExecutionTimeInMs\"]].sort_values(by='PreciseTimeStamp')\n",
    "up = sql[\"75%\"]\n",
    "lo = sql[\"25%\"]\n",
    "IQR = up - lo\n",
    "sqlAnom = q_depDf[q_depDf[\"avg_SqlExecutionTimeInMs\"] > up + 3.5*IQR][[\"PreciseTimeStamp\", \"avg_SqlExecutionTimeInMs\"]].sort_values(by='PreciseTimeStamp')\n",
    "up = queue[\"75%\"]\n",
    "lo = queue[\"25%\"]\n",
    "IQR = up - lo\n",
    "queueAnom = q_depDf[q_depDf[\"avg_QueueTimeInMs\"] > up + 3.5*IQR][[\"PreciseTimeStamp\", \"avg_QueueTimeInMs\"]].sort_values(by='PreciseTimeStamp')\n",
    "reasons = [\"Redis\", \"S2S\", \"Sql\", \"RequestsQueued\"]\n",
    "anomdata = [redisAnom, s2sAnom, sqlAnom, queueAnom]\n",
    "anom = [len(redisAnom), len(s2sAnom), len(sqlAnom), len(queueAnom)]\n",
    "top2Anom = np.argsort(anom)[::-1][:2]\n",
    "whenMax = [\n",
    "    redisAnom[redisAnom[\"avg_RedisExecutionTimeInMs\"]==maxs[0]],\n",
    "    s2sAnom[s2sAnom[\"avg_VssClientExecutionTimeInMs\"]==maxs[1]],\n",
    "    sqlAnom[sqlAnom[\"avg_SqlExecutionTimeInMs\"]==maxs[2]],\n",
    "    queueAnom[queueAnom[\"avg_QueueTimeInMs\"]==maxs[3]]\n",
    "]\n",
    "if len(top2Anom) > 0:\n",
    "    print(\"Found top anomolies...\")\n",
    "    for i in top2Anom:\n",
    "        # Wow, such a pain to convert numpy time to python time...\n",
    "        dt64 = whenMax[i][\"PreciseTimeStamp\"].values[0]\n",
    "        ts = (dt64 - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')\n",
    "        whenMaxTime = datetime.utcfromtimestamp(ts)\n",
    "        delta = startTime.replace(tzinfo=None) - whenMaxTime.replace(tzinfo=None)\n",
    "        when = \"before\"\n",
    "        if delta.total_seconds() < 0:\n",
    "            when = \"after\"\n",
    "            delta = whenMaxTime.replace(tzinfo=None) - startTime.replace(tzinfo=None)\n",
    "        whenAnom = \"\"\"%s days %s hours %s minutes %s the start time - %s\"\"\" % (delta.days, delta.seconds//3600, delta.seconds//60, when, startTime)\n",
    "        print(\" %s (#buckets %s) (max %s) (mean %s) (Max is %s)\" % (reasons[i], anom[i], maxs[i], means[i], whenAnom))    \n",
    "        display(anomdata[i])\n",
    "else:\n",
    "    print(\"Can't say there is...?\")\n",
    "\n",
    "# additional insights\n",
    "print()\n",
    "print('Additional insights =============================')\n",
    "w3wpGc = q_gen2Df[ q_gen2Df[\"ProcessName\"] == \"w3wp\" ].sort_values(by=['Number'], ascending=False)\n",
    "jaGc = q_gen2Df[ q_gen2Df[\"ProcessName\"] == \"TfsJobAgent\" ].sort_values(by=['Number'], ascending=False)\n",
    "now3wpGc = True\n",
    "if len(w3wpGc) > 0:\n",
    "    x = w3wpGc[:2]\n",
    "    if len(x) == 1:\n",
    "        xCount = x[\"Count\"].values\n",
    "        if xCount[0] > 25:\n",
    "            print(\"INSIGHT: There's a new spike in w3wp gc...\")\n",
    "            now3wpGc = False\n",
    "    if len(x) == 2:\n",
    "        xCount = x[\"Count\"].values\n",
    "        if xCount[1] > xCount[0]:\n",
    "            print(\"INSIGHT: There's a spike in w3wp gc...\")\n",
    "            now3wpGc = False\n",
    "if now3wpGc:\n",
    "    print(\"INSIGHT: No gc spikes found from w3wp...\")\n",
    "    \n",
    "nojaGc = True            \n",
    "if len(jaGc) > 0:\n",
    "    x = jaGc[:2]\n",
    "    if len(x) == 1:\n",
    "        xCount = x[\"Count\"].values\n",
    "        if xCount[0] > 25:\n",
    "            print(\"INSIGHT: There's a new spike in ja gc...\")\n",
    "            nojaGc = False\n",
    "    if len(x) == 2:\n",
    "        xCount = x[\"Count\"].values\n",
    "        if xCount[1] > xCount[0] and xCount[1] > 25:\n",
    "            print(\"INSIGHT: There's a spike in ja gc...\")  \n",
    "            nojaGc = False\n",
    "if nojaGc:\n",
    "    print(\"INSIGHT: No ja gc spikes found from w3wp...\")            "
   ]
  }
 ],
 "metadata": {
  "kernel_info": {
   "name": "python3"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "nteract": {
   "version": "0.14.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}