{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Orchestration delays Investigation\n",
    "1. Run all cells.\n",
    "1. Scroll down and watch for any authentication messages.\n",
    "1. View the report at the bottom."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false,
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# These are just defaults; they will be overwritten if you use the nimport pip package.\n",
    "start = \"2019-08-08T23:50:00.0000000Z\"\n",
    "end = \"2019-08-09T00:24:36.0000000Z\"\n",
    "service = \"tfs\"\n",
    "hub = \"Build\"\n",
    "su = \"tfs-wcus-0\"\n",
    "url = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\"\n",
    "baseUrl = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "# Install with the kernel's own interpreter so the packages land in the environment this notebook runs in.\n",
    "import sys\n",
    "!\"{sys.executable}\" -m pip install nimport azure-kusto-notebooks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# Import the things we use\n",
    "\n",
    "# Note you can also use kql https://docs.microsoft.com/en-us/azure/data-explorer/kqlmagic\n",
    "# %kql is single line magic\n",
    "# %%kql is cell magic\n",
    "\n",
    "# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n",
    "# https://ipython.readthedocs.io/en/stable/interactive/magics.html\n",
    "from IPython.display import display, HTML, Markdown, Javascript, clear_output\n",
    "\n",
    "# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n",
    "import pandas as pd\n",
    "pd.options.display.html.table_schema = True\n",
    "from pandas import Series, DataFrame\n",
    "from datetime import datetime, timedelta, timezone\n",
    "from urllib.parse import urlencode, quote_plus\n",
    "from requests.utils import requote_uri\n",
    "import time\n",
    "import numpy as np\n",
"from matplotlib import pyplot as plt\n",
    "from nimport.utils import tokenize, open_nb\n",
    "import json\n",
    "import os\n",
    "import calendar as cal\n",
    "import concurrent.futures\n",
    "from azure.kusto.notebooks import utils as akn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# Values substituted into every query file.\n",
    "params = {\n",
    "    \"su\": su,\n",
    "    \"start\": start,\n",
    "    \"end\": end,\n",
    "    \"url\": url,\n",
    "    \"baseUrl\": baseUrl,\n",
    "    \"service\": service,\n",
    "    \"hub\": hub\n",
    "}\n",
    "# Prefix paths with 'devops-pipelines' unless the working directory already is that folder.\n",
    "root = 'devops-pipelines' if os.path.basename(os.getcwd()) != 'devops-pipelines' else ''\n",
    "queryPath = os.path.join(root, 'queries')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Authenticate Kusto clients"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# authenticate kusto client for the VSO cluster\n",
    "# you will need to copy the token into a browser window for AAD auth.\n",
    "client = akn.get_client('https://vso.kusto.windows.net')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# authenticate kusto client for the ICM cluster\n",
    "# you will need to copy the token into a browser window for AAD auth.\n",
    "icm_client = akn.get_client('https://icmcluster.kusto.windows.net')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run the investigation queries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "q_loc = os.path.join(queryPath, \"LocationName.csl\")\n",
    "q_whatChanged = os.path.join(queryPath, \"WhatChanged.csl\")\n",
    "q_haActions = os.path.join(queryPath, \"HealthAgentActions.csl\")\n",
    "q_mdm = os.path.join(queryPath, \"MDMAccount.csl\")\n",
    "q_activeIncidents = os.path.join(queryPath, \"ActiveIncidents.csl\")\n",
    "\n",
    "delaysPath = os.path.join(queryPath, \"delays\")\n",
    "q_affectedAccounts = os.path.join(delaysPath, \"AffectedAccounts.csl\")\n",
    "q_abusers = os.path.join(delaysPath, \"Abusers.csl\")\n",
    "# Same file as q_affectedAccounts; the second name is kept for backward compatibility.\n",
    "q_affAccounts = q_affectedAccounts\n",
    "q_delayedAccountsAreAbusers = os.path.join(delaysPath, \"DelayedAccountsAreAbusers.csl\")\n",
    "q_whatDelayed = os.path.join(delaysPath, \"WhatDelayed.csl\")\n",
    "q_load = os.path.join(delaysPath, \"Load.csl\")\n",
    "\n",
    "\n",
    "def first_value(df, column, description):\n",
    "    '''Return the first value of `column`; raise ValueError with a readable message when `df` has no rows.'''\n",
    "    if len(df.index) == 0:\n",
    "        raise ValueError(\"No %s found for scale unit %s; check the notebook parameters.\" % (description, su))\n",
    "    return df[column].iloc[0]\n",
    "\n",
    "\n",
    "def run_vso(executor, queryFile):\n",
    "    '''Submit `queryFile` against the VSO database with the shared `params` and return the future.'''\n",
    "    return executor.submit(akn.execute_file, client, 'VSO', queryFile, params)\n",
    "\n",
    "\n",
    "with concurrent.futures.ThreadPoolExecutor() as executor:\n",
    "    # materialize location name immediately as we need this for other queries\n",
    "    locationNameResult = akn.to_dataframe_from_future(run_vso(executor, q_loc))\n",
    "    locationName = first_value(locationNameResult, \"Tenant\", \"location name\")\n",
    "    params[\"locationName\"] = locationName\n",
    "\n",
    "    whatChangedFuture = run_vso(executor, q_whatChanged)\n",
    "    haActionsFuture = run_vso(executor, q_haActions)\n",
    "    # AffectedAccounts.csl feeds two result frames below, so it is executed only once\n",
    "    affectedAccountsFuture = run_vso(executor, q_affectedAccounts)\n",
    "    abusersFuture = run_vso(executor, q_abusers)\n",
    "    delayedAccountsAreAbusersFuture = run_vso(executor, q_delayedAccountsAreAbusers)\n",
    "    whatDelayedFuture = run_vso(executor, q_whatDelayed)\n",
    "    loadFuture = run_vso(executor, q_load)\n",
    "    activeIncidentsFuture = executor.submit(akn.execute_file, icm_client, 'IcmDataWarehouse',\n",
    "                                            q_activeIncidents, params)\n",
    "    mdmFuture = run_vso(executor, q_mdm)\n",
    "\n",
    "q_whatChanged_df = akn.to_dataframe_from_future(whatChangedFuture)\n",
    "q_haActions_df = akn.to_dataframe_from_future(haActionsFuture)\n",
    "q_affectedAccountsResultDf = akn.to_dataframe_from_future(affectedAccountsFuture)\n",
    "# Same query result under the name the plotting cell uses; copied so the two frames stay independent.\n",
    "q_affAccounts_df = q_affectedAccountsResultDf.copy()\n",
    "\n",
    "abusersDf = akn.to_dataframe_from_future(abusersFuture)\n",
    "# hosts that are both affected by the delays and listed by the abusers query\n",
    "finalabusersList = np.intersect1d(q_affectedAccountsResultDf[\"HostId\"].values, abusersDf[\"HostId\"].values)\n",
    "\n",
    "q_delayedAccountsAreAbusers_df = akn.to_dataframe_from_future(delayedAccountsAreAbusersFuture)\n",
    "q_whatDelayedResultDf = akn.to_dataframe_from_future(whatDelayedFuture)\n",
    "q_loadResultDf = akn.to_dataframe_from_future(loadFuture)\n",
    "\n",
    "q_activeIncidentsResultDf = akn.to_dataframe_from_future(activeIncidentsFuture)\n",
    "\n",
    "q_mdmDf = akn.to_dataframe_from_future(mdmFuture)\n",
    "params[\"mdmAccount\"] = first_value(q_mdmDf, \"monitoringAccount\", \"MDM monitoring account\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "query = os.path.join(delaysPath, \"OrchestrationLogSpike.csl\")\n",
    "with concurrent.futures.ThreadPoolExecutor() as executor:\n",
    "    # one query per delayed host, all running concurrently\n",
    "    sfs = [\n",
    "        executor.submit(akn.execute_file, client, 'VSO', query, {**params, \"hostId\": delayedHostId})\n",
    "        for delayedHostId in q_delayedAccountsAreAbusers_df[\"HostId\"].values\n",
    "    ]\n",
    "    # read results in submission order so the report lists hosts in the same order on every run\n",
    "    primary_results = [s.result().primary_results[0] for s in sfs]\n",
    "\n",
    "# convert to data frames (Executor.map keeps the input order)\n",
    "with concurrent.futures.ThreadPoolExecutor() as executor:\n",
    "    spikeResultsDfs = list(executor.map(akn.to_dataframe, primary_results))"
   ]
  },
  {
"cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Delays timeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# visualize delays\n",
    "import plotly\n",
    "from plotly import graph_objs as go\n",
    "\n",
    "# Change and mitigation events carry no y value of their own; pin them at a fixed height on the delay axis.\n",
    "EVENT_MARKER_Y = 50\n",
    "\n",
    "delays = go.Scatter(\n",
    "    x=q_affAccounts_df[\"PreciseTimeStamp\"],\n",
    "    y=q_affAccounts_df[\"MessageDelayInSeconds\"],\n",
    "    mode='lines',\n",
    "    name='Delays in seconds',\n",
    "    text=q_affAccounts_df['Name']\n",
    ")\n",
    "\n",
    "# NOTE(review): the report cell reads `title` from this frame; confirm WhatChanged.csl also returns `Name`.\n",
    "changed = go.Scatter(\n",
    "    x=q_whatChanged_df[\"TIMESTAMP\"],\n",
    "    y=np.repeat(EVENT_MARKER_Y, len(q_whatChanged_df.index)),\n",
    "    mode='lines+markers',\n",
    "    name='What Changed',\n",
    "    text=q_whatChanged_df[\"Name\"],\n",
    "    marker=dict(\n",
    "        size=32,\n",
    "        # one colour per change, taken from its row position: reproducible and always the right length\n",
    "        color=np.arange(len(q_whatChanged_df.index)),\n",
    "        colorscale='Viridis'\n",
    "    )\n",
    ")\n",
    "\n",
    "mitigations = go.Scatter(\n",
    "    x=q_haActions_df[\"PreciseTimeStamp\"],\n",
    "    y=np.repeat(EVENT_MARKER_Y, len(q_haActions_df.index)),\n",
    "    mode='markers',\n",
    "    name='Mitigations',\n",
    "    # column arithmetic (rather than a row-wise apply) still yields a Series when there are no mitigations\n",
    "    text=q_haActions_df[\"MitigationName\"].astype(str) + \" \" + q_haActions_df[\"RoleInstance\"].astype(str),\n",
    "    marker=dict(\n",
    "        size=10,\n",
    "        color='rgba(152, 0, 0, .8)',\n",
    "        line=dict(\n",
    "            width=2,\n",
    "            color='rgb(0, 0, 0)'\n",
    "        )\n",
    "    )\n",
    ")\n",
    "\n",
    "layout = go.Layout(\n",
    "    title='Orchestration delays, changes and mitigations',\n",
    "    xaxis=dict(title='Time'),\n",
    "    yaxis=dict(title='Message delay (seconds)')\n",
    ")\n",
    "plotly.offline.iplot(go.Figure(data=[delays, changed, mitigations], layout=layout))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# utility functions\n",
    "content = ''\n",
    "\n",
    "\n",
    "def r(*args):\n",
    "    '''Append one line to the markdown report: str() of every argument, concatenated.'''\n",
    "    global content\n",
    "    content += ''.join([str(a) for a in args]) + '\\n'\n",
    "\n",
    "\n",
    "def pandas_df_to_markdown_table(df):\n",
    "    '''Render a DataFrame as a markdown table: header row, --- separator row, then one row per record.\n",
    "\n",
    "    Values are written with str(); None/NaN become empty cells. Pipes are escaped and line breaks\n",
    "    replaced by spaces so that a value cannot break the table layout.\n",
    "    '''\n",
    "    def cell(value):\n",
    "        if value is None or (isinstance(value, float) and value != value):\n",
    "            return ''\n",
    "        return str(value).replace('|', '\\\\|').replace('\\r', ' ').replace('\\n', ' ')\n",
    "\n",
    "    def line(values):\n",
    "        return '|' + '|'.join(values) + '|'\n",
    "\n",
    "    rows = [line(cell(c) for c in df.columns), line('---' for _ in df.columns)]\n",
    "    rows.extend(line(cell(v) for v in record) for record in df.itertuples(index=False, name=None))\n",
    "    return '\\n'.join(rows) + '\\n'\n",
    "\n",
    "\n",
    "JARVIS_DASHBOARD_ROOT = \"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/\"\n",
    "JARVIS_QUERY_TEMPLATE = (\n",
    "    \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\"\n",
    "    \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\"\n",
    "    \"\"\"{\"query\":\"//*[id='LocationName']\",\"key\":\"value\",\"replacement\":\"%(location)s\"},\"\"\"\n",
    "    \"\"\"{\"query\":\"//dataSources\",\"key\":\"namespace\",\"replacement\":\"%(su)s\"},\"\"\"\n",
    "    \"\"\"{\"query\":\"//dataSources\",\"key\":\"account\",\"replacement\":\"%(account)s\"},\"\"\"\n",
    "    \"\"\"{\"query\":\"//*[id='ApplicationEndpoint']\",\"key\":\"regex\",\"replacement\":\"*%(location)s*\"},\"\"\"\n",
    "    \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\"\n",
    "    \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\"\n",
    ")\n",
    "\n",
    "\n",
    "def jarvis_dashboard_link(dashboard, linkParams):\n",
    "    '''Return the Jarvis URL for `dashboard` (a path below VSO-ServiceInsights).\n",
    "\n",
    "    `linkParams` must provide service, location, su, account, start and end.\n",
    "    '''\n",
    "    return JARVIS_DASHBOARD_ROOT + dashboard + JARVIS_QUERY_TEMPLATE % linkParams"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# report!\n",
    "content = ''  # start from scratch so re-running this cell does not duplicate the report\n",
    "startTime = akn.to_datetime(start)\n",
    "\n",
    "r('# OK SO WHAT HAPPENED')\n",
    "r('|parameter|value|')\n",
    "r('|---|---|')\n",
    "r('|startTime|', startTime, '|')\n",
    "r('|endTime|', akn.to_datetime(end), '|')\n",
    "r('|scale unit|', su, '|')\n",
    "r('|service|', service, '|')\n",
    "\n",
    "# jarvis params\n",
    "jarvisParams = {\n",
    "    'su': su,\n",
    "    'start': akn.get_time(start, -10),\n",
    "    'end': akn.get_time(end, 10),\n",
    "    'service': service,\n",
    "    'location': locationName,\n",
    "    'account': params[\"mdmAccount\"]\n",
    "}\n",
    "\n",
    "# what changed? analysis\n",
    "r('## What changed?')\n",
    "# (keyword searched in the change title, header shown once, header only when the change is near the start time)\n",
    "changeKinds = [\n",
    "    ('upgrade', \"Looks like, there's upgrade...\\n\\n\", False),\n",
    "    ('mitigation', \"Looks like, there are some mitigations by health agent...\\n\\n\", True),\n",
    "    ('vip', \"Looks like, there is VIP swap...\\n\\n\", True),\n",
    "    ('feature flag', \"Looks like, some feature flags are enabled...\\n\\n\", True),\n",
    "]\n",
    "if len(q_whatChanged_df.index) == 0:\n",
    "    r(\"...no relevant config changes recorded during this period.\")\n",
    "else:\n",
    "    startNaive = startTime.replace(tzinfo=None)\n",
    "    headers = {}\n",
    "    text = \"\"\n",
    "    for index, row in q_whatChanged_df.iterrows():\n",
    "        delta = startNaive - row[\"TIMESTAMP\"].replace(tzinfo=None)\n",
    "        when = \"before\"\n",
    "        if delta.total_seconds() < 0:\n",
    "            when = \"after\"\n",
    "            delta = -delta\n",
    "        # whole hours between the change and the start time must be 0 or 1\n",
    "        considerTime = delta.total_seconds() // 3600 <= 1\n",
    "        title = row[\"title\"].lower()\n",
    "        matched = False\n",
    "        for keyword, header, nearStartOnly in changeKinds:\n",
    "            if keyword not in title:\n",
    "                continue\n",
    "            matched = True\n",
    "            if considerTime or not nearStartOnly:\n",
    "                headers.setdefault(keyword, header)\n",
    "        if matched:\n",
    "            # a change is listed once even when its title matches several keywords\n",
    "            text += \"%s %s %s (%s days %s hours %s minutes %s the start time) \\n\\n\" % (\n",
    "                row[\"TIMESTAMP\"], row[\"title\"], row[\"buildNumber\"],\n",
    "                delta.days, delta.seconds // 3600, (delta.seconds % 3600) // 60, when)\n",
    "    if text:\n",
    "        r(''.join(headers[keyword] for keyword, _, _ in changeKinds if keyword in headers) + text)\n",
    "    else:\n",
    "        r(\"...no relevant changes during this period.\")\n",
    "\n",
    "# active incidents?\n",
    "r('## Active incidents?')\n",
    "# incidents titled \"Kalypso: Build Orchestrator Delays ICM\" are not counted\n",
    "otherIncidentsCount = sum(\n",
    "    1 for index, incident in q_activeIncidentsResultDf.iterrows()\n",
    "    if \"Kalypso: Build Orchestrator Delays ICM\" not in incident[\"Title\"]\n",
    ")\n",
    "if otherIncidentsCount > 0:\n",
    "    r(\"INSIGHT: There were incidents recorded during this period. These might be related:\")\n",
    "    incidentLinks = [\n",
    "        \"[%s](https://icm.ad.msft.net/imp/v3/incidents/details/%s/home)\" % (incidentId, incidentId)\n",
    "        for incidentId in q_activeIncidentsResultDf[\"IncidentId\"]\n",
    "    ]\n",
    "    newDf = q_activeIncidentsResultDf.assign(URL=incidentLinks)\n",
    "    r(\"\\n\")\n",
    "    r(pandas_df_to_markdown_table(newDf[['URL', 'Severity', 'Title']]))\n",
    "else:\n",
    "    r(\"...no relevant incidents during this period.\")\n",
    "\n",
    "r('## Queue Load')\n",
    "QUEUED_PER_MINUTE_THRESHOLD = 500\n",
    "# values of the third column for the DTPlanQueued rows\n",
    "ar = q_loadResultDf[q_loadResultDf[\"Name\"] == \"DTPlanQueued\"].values[:, 2]\n",
    "if len(ar) == 0:\n",
    "    # np.amax raises on an empty array, so report the missing data instead\n",
    "    r('...no DTPlanQueued data recorded during this period.')\n",
    "elif np.any(ar > QUEUED_PER_MINUTE_THRESHOLD):\n",
    "    r('INSIGHT: There was a high rate of jobs queued during this period (max: ', np.amax(ar), ' / minute)...')\n",
    "else:\n",
    "    r('...everything looks good? (max: ', np.amax(ar), ' / minute)')\n",
    "\n",
    "r('## Orchestration phase Load')\n",
    "if len(spikeResultsDfs) == 0:\n",
    "    r('...no delayed accounts to analyze.')\n",
    "for spikeResultDf in spikeResultsDfs:\n",
    "    if len(spikeResultDf.index) == 0:\n",
    "        # no rows for this host: there is no HostId to read and nothing to describe\n",
    "        continue\n",
    "    countResult = spikeResultDf[\"C\"].describe()\n",
    "    hostId = spikeResultDf[\"HostId\"].values[0]\n",
    "    upper = countResult[\"75%\"]\n",
    "    lower = countResult[\"25%\"]\n",
    "    # Wondering what's going on here? We detect anomalies, see https://www.purplemath.com/modules/boxwhisk3.htm\n",
    "    IQR = upper - lower\n",
    "    countResultOfInterest = spikeResultDf[spikeResultDf[\"C\"] > upper + 1.5 * IQR].head(5)\n",
    "    # unique commands, first occurrence first\n",
    "    unqCommands = list(dict.fromkeys(countResultOfInterest[\"Command\"].values))\n",
    "    if len(unqCommands) > 0:\n",
    "        commands = ','.join(str(e) for e in unqCommands)\n",
    "        r(\"INSIGHT: Found anomalies for these phases in order highest to lowest for host %s: %s, max being %s \\n\" % (hostId, commands, countResult[\"max\"]))\n",
    "        r(pandas_df_to_markdown_table(countResultOfInterest[[\"Command\", \"C\"]]))\n",
    "        newParams = dict(params)\n",
    "        newParams[\"command\"] = unqCommands[0]\n",
    "        newParams[\"hostId\"] = hostId\n",
    "        if \"PlanCompleted\" in commands:\n",
    "            if \"StartPlan\" in commands or \"PlanStarted\" in commands:\n",
    "                r(\"\\nTIP: Lot of jobs might have started... creating this spike\")\n",
    "            else:\n",
    "                r(\"\\nTIP: Jobs that are queued long ago might have completed now... creating this spike\")\n",
    "        r(\"\\nConsider running these queries by changing times, if you need to dig in further: \\n\")\n",
    "        r(\"```\\n\" + tokenize(os.path.join(delaysPath, \"OrchestrationLogSpikeTip.csl\"), newParams) + \"\\n```\")\n",
    "        r(\"```\\n\" + tokenize(q_load, newParams) + \"\\n```\")\n",
    "    else:\n",
    "        r('...everything looks good?')\n",
    "\n",
    "# ja load\n",
    "r('## JA Load')\n",
    "PENDING_JOBS_THRESHOLD = 50\n",
    "q_whatDelayedResultPendingJobsDf = q_whatDelayedResultDf[q_whatDelayedResultDf[\"Pivot\"] == \"\\\\JobService(_Total)\\\\Total Pending Jobs\"]\n",
    "pendingJobs = q_whatDelayedResultPendingJobsDf[\"avg_CounterValue\"].values\n",
    "if np.any(pendingJobs > PENDING_JOBS_THRESHOLD):\n",
    "    max_pending_jobs = np.max(pendingJobs)\n",
    "    r(\"INSIGHT: There was a high number of pending jobs during this period (max was %s). Note that this is for jobs including all priorities (even low priority ones)\" % (max_pending_jobs))\n",
    "\n",
    "    open_nb(os.path.join(root, 'ja.ipynb'), params, redirect=False)\n",
    "    jaUrl = baseUrl + \"/devops-pipelines/ja.ipynb\"\n",
    "    r('\\n\\n[JobAgent investigation notebook](', requote_uri(jaUrl), ')')\n",
    "    r('\\n\\n[JobAgent health dashboard](', requote_uri(jarvis_dashboard_link(\"PlatformViews/Compute-JA\", jarvisParams)), ')')\n",
    "else:\n",
    "    r('...everything looks good?')\n",
    "\n",
    "# abuse detection?\n",
    "r('## What users are impacted?')\n",
    "if len(finalabusersList) > 0:\n",
    "    r('Found abusers -- this alert is likely a false alarm.')\n",
    "r(pandas_df_to_markdown_table(q_delayedAccountsAreAbusers_df))\n",
    "\n",
    "# more analysis?\n",
    "r('## More analysis')\n",
    "SLAParams = {\n",
    "    \"triggerTime\": params[\"start\"],\n",
    "    \"scaleUnit\": params[\"su\"],\n",
    "    \"service\": params[\"service\"],\n",
    "    \"lookback\": \"1h\",\n",
    "    \"region\": \"\"\n",
    "}\n",
    "# the notebook URLs are built inline so the `url` notebook parameter is not overwritten\n",
    "open_nb(os.path.join(root, 'sla.ipynb'), SLAParams, redirect=False)\n",
    "r('\\n\\n[SLA investigation notebook](', requote_uri(baseUrl + \"/devops-pipelines/sla.ipynb\"), ')')\n",
    "\n",
    "open_nb(os.path.join(root, 'impact.ipynb'), params, redirect=False)\n",
    "r('\\n\\n[Customer impact investigation notebook](', requote_uri(baseUrl + \"/devops-pipelines/impact.ipynb\"), ')')\n",
    "\n",
    "# Scale unit health\n",
    "r('\\n\\n[Scale unit health dashboard (' + su + ', ' + service + ')](', requote_uri(jarvis_dashboard_link(\"DevOpsReports/DevOpsReports\", jarvisParams)), ')')\n",
    "\n",
    "Markdown(content)"
   ]
  }
 ],
 "metadata": {
"kernel_info": { "name": "python3" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" }, "nteract": { "version": "0.14.5" } }, "nbformat": 4, "nbformat_minor": 0 }