devops-pipelines/delays.ipynb

520 строки
21 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Orchestration delays Investigation\n",
"1. Run all cells.\n",
"1. Scroll down and check for any authentication prompts.\n",
"1. View report at the bottom."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false,
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Defaults only: this cell is tagged \"parameters\", so these values may be\n",
"# overwritten when the notebook is opened with parameters (e.g. via nimport).\n",
"\n",
"# What to investigate: service, hub and scale unit\n",
"service = 'tfs'\n",
"hub = 'Build'\n",
"su = 'tfs-wcus-0'\n",
"\n",
"# Time window to inspect (ISO 8601 timestamps with a Z suffix)\n",
"start = '2019-08-08T23:50:00.0000000Z'\n",
"end = '2019-08-09T00:24:36.0000000Z'\n",
"\n",
"# Where the hosting project and this notebook live\n",
"baseUrl = 'https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK'\n",
"url = 'https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false,
"tags": [
"debug"
]
},
"outputs": [],
"source": [
"%%capture\n",
"# Install the helper packages this notebook imports below.\n",
"# %pip (rather than !pip) installs into the environment of the running\n",
"# kernel instead of whichever pip executable happens to be first on PATH.\n",
"%pip install nimport azure-kusto-notebooks"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# Import the things we use\n",
"\n",
"# Note you can also use kql https://docs.microsoft.com/en-us/azure/data-explorer/kqlmagic\n",
"# %kql is single line magic\n",
"# %%kql is cell magic\n",
"\n",
"# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n",
"# https://ipython.readthedocs.io/en/stable/interactive/magics.html\n",
"from IPython.display import display, HTML, Markdown, Javascript, clear_output\n",
"\n",
"# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n",
"import pandas as pd\n",
"pd.options.display.html.table_schema = True\n",
"from pandas import Series, DataFrame\n",
"from datetime import datetime, timedelta, timezone\n",
"from urllib.parse import urlencode, quote_plus\n",
"from requests.utils import requote_uri\n",
"import time\n",
"import numpy as np\n",
"from matplotlib import pyplot as plt\n",
"from nimport.utils import tokenize, open_nb\n",
"import json\n",
"import os\n",
"import calendar as cal\n",
"import concurrent.futures\n",
"from azure.kusto.notebooks import utils as akn"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# Notebook parameters bundled into the dict that is handed to every query file.\n",
"params = dict(\n",
"    su=su,\n",
"    start=start,\n",
"    end=end,\n",
"    url=url,\n",
"    baseUrl=baseUrl,\n",
"    service=service,\n",
"    hub=hub,\n",
")\n",
"\n",
"# The queries folder sits inside devops-pipelines; only prefix that folder\n",
"# when the working directory is not devops-pipelines already.\n",
"if os.path.basename(os.getcwd()) == 'devops-pipelines':\n",
"    root = ''\n",
"else:\n",
"    root = 'devops-pipelines'\n",
"queryPath = os.path.join(root, 'queries')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# Kusto client for the VSO cluster.\n",
"# AAD auth: you will need to copy the token into a browser window.\n",
"client = akn.get_client(\"https://vso.kusto.windows.net\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# Second Kusto client, this one for the ICM cluster (used for the incidents query).\n",
"# AAD auth: you will need to copy the token into a browser window.\n",
"icm_client = akn.get_client(\"https://icmcluster.kusto.windows.net\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# Query files kept directly in the queries folder\n",
"q_loc = os.path.join(queryPath, \"LocationName.csl\")\n",
"q_whatChanged = os.path.join(queryPath, \"WhatChanged.csl\")\n",
"q_haActions = os.path.join(queryPath, \"HealthAgentActions.csl\")\n",
"q_mdm = os.path.join(queryPath, \"MDMAccount.csl\")\n",
"q_activeIncidents = os.path.join(queryPath, \"ActiveIncidents.csl\")\n",
"\n",
"# Query files kept in queries/delays\n",
"delaysPath = os.path.join(queryPath, \"delays\")\n",
"q_affectedAccounts = os.path.join(delaysPath, \"AffectedAccounts.csl\")\n",
"# Alias kept for compatibility: both names have always pointed at the same file.\n",
"q_affAccounts = q_affectedAccounts\n",
"q_abusers = os.path.join(delaysPath, \"Abusers.csl\")\n",
"q_delayedAccountsAreAbusers = os.path.join(delaysPath, \"DelayedAccountsAreAbusers.csl\")\n",
"q_whatDelayed = os.path.join(delaysPath, \"WhatDelayed.csl\")\n",
"q_load = os.path.join(delaysPath, \"Load.csl\")\n",
"\n",
"with concurrent.futures.ThreadPoolExecutor() as executor:\n",
"    # materialize location name immediately as we need this for other queries\n",
"    locationFuture = executor.submit(akn.execute_file, client, 'VSO', q_loc, params)\n",
"    locationNameResult = akn.to_dataframe_from_future(locationFuture)\n",
"    locationName = locationNameResult[\"Tenant\"][0]\n",
"    params[\"locationName\"] = locationName\n",
"\n",
"    # The remaining queries do not depend on each other: submit them all and\n",
"    # read the results once the pool has drained (on leaving the with block).\n",
"    whatChangedFuture = executor.submit(akn.execute_file, client, 'VSO', q_whatChanged, params)\n",
"    haActionsFuture = executor.submit(akn.execute_file, client, 'VSO', q_haActions, params)\n",
"\n",
"    # AffectedAccounts.csl was previously submitted twice with identical\n",
"    # parameters; it is now executed once and the result is shared.\n",
"    affectedAccountsFuture = executor.submit(akn.execute_file, client, 'VSO', q_affectedAccounts, params)\n",
"    abusersFuture = executor.submit(akn.execute_file, client, 'VSO', q_abusers, params)\n",
"    delayedAccountsAreAbusersFuture = executor.submit(\n",
"        akn.execute_file, client, 'VSO', q_delayedAccountsAreAbusers, params)\n",
"    whatDelayedFuture = executor.submit(akn.execute_file, client, 'VSO', q_whatDelayed, params)\n",
"    loadFuture = executor.submit(akn.execute_file, client, 'VSO', q_load, params)\n",
"\n",
"    # incidents come from the ICM cluster, everything else from VSO\n",
"    activeIncidentsFuture = executor.submit(\n",
"        akn.execute_file, icm_client, 'IcmDataWarehouse', q_activeIncidents, params)\n",
"    mdmFuture = executor.submit(akn.execute_file, client, 'VSO', q_mdm, params)\n",
"\n",
"q_whatChanged_df = akn.to_dataframe_from_future(whatChangedFuture)\n",
"q_haActions_df = akn.to_dataframe_from_future(haActionsFuture)\n",
"\n",
"q_affectedAccountsResultDf = akn.to_dataframe_from_future(affectedAccountsFuture)\n",
"# Same result under the second name that the plotting cell reads.\n",
"q_affAccounts_df = q_affectedAccountsResultDf\n",
"\n",
"# hosts that show up both as affected and as abusers\n",
"abusersDf = akn.to_dataframe_from_future(abusersFuture)\n",
"finalabusersList = np.intersect1d(q_affectedAccountsResultDf[\"HostId\"].values, abusersDf[\"HostId\"].values)\n",
"\n",
"q_delayedAccountsAreAbusers_df = akn.to_dataframe_from_future(delayedAccountsAreAbusersFuture)\n",
"q_whatDelayedResultDf = akn.to_dataframe_from_future(whatDelayedFuture)\n",
"q_loadResultDf = akn.to_dataframe_from_future(loadFuture)\n",
"\n",
"q_activeIncidentsResultDf = akn.to_dataframe_from_future(activeIncidentsFuture)\n",
"\n",
"# the MDM account is needed later for the Jarvis dashboard links\n",
"q_mdmDf = akn.to_dataframe_from_future(mdmFuture)\n",
"params[\"mdmAccount\"] = q_mdmDf[\"monitoringAccount\"][0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# Run OrchestrationLogSpike.csl once per host listed in q_delayedAccountsAreAbusers_df.\n",
"query = os.path.join(delaysPath, \"OrchestrationLogSpike.csl\")\n",
"with concurrent.futures.ThreadPoolExecutor() as executor:\n",
"    sfs = []\n",
"    for delayedHostId in q_delayedAccountsAreAbusers_df[\"HostId\"].values:\n",
"        # the shared parameters plus the host this particular query is about\n",
"        hostParams = dict(params, hostId=delayedHostId)\n",
"        sfs.append(executor.submit(akn.execute_file, client, 'VSO', query, hostParams))\n",
"    # results are gathered in completion order, not submission order\n",
"    sfsResults = [finished.result() for finished in concurrent.futures.as_completed(sfs)]\n",
"\n",
"# convert to data frames\n",
"primary_results = [response.primary_results[0] for response in sfsResults]\n",
"spikeResultsDfs = None\n",
"with concurrent.futures.ThreadPoolExecutor() as executor:\n",
"    dataframe_futures = [executor.submit(akn.to_dataframe, table) for table in primary_results]\n",
"    spikeResultsDfs = [finished.result() for finished in concurrent.futures.as_completed(dataframe_futures)]\n",
"# the raw responses are not needed any more\n",
"sfsResults = None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# Report plumbing: each section appends markdown to `content`, which is\n",
"# rendered once at the very end of this cell.\n",
"content = ''\n",
"\n",
"\n",
"def r(*args):\n",
"    '''Append one line to the markdown report: str() of every argument, concatenated.'''\n",
"    global content\n",
"    content = content + ''.join(map(str, args)) + '\\n'\n",
"\n",
"\n",
"startTime = akn.to_datetime(start)\n",
"endTime = akn.to_datetime(end)\n",
"\n",
"# report header: a two-column table with the investigation parameters\n",
"r('# OK SO WHAT HAPPENED')\n",
"r('|parameter|value|')\n",
"r('|---|---|')\n",
"headerRows = (\n",
"    ('startTime', startTime),\n",
"    ('endTime', endTime),\n",
"    ('scale unit', su),\n",
"    ('service', service),\n",
")\n",
"for headerLabel, headerValue in headerRows:\n",
"    r('|', headerLabel, '|', headerValue, '|')\n",
"\n",
"# Values substituted into the Jarvis dashboard links further down; start and\n",
"# end go through akn.get_time with offsets of -10 and +10 respectively.\n",
"jarvisParams = dict(\n",
"    su=su,\n",
"    start=akn.get_time(start, -10),\n",
"    end=akn.get_time(end, 10),\n",
"    service=service,\n",
"    location=locationName,\n",
"    account=params[\"mdmAccount\"],\n",
")\n",
"\n",
"# abuse detection?\n",
"r('## What users are impacted?')\n",
"abusersFound = len(finalabusersList) > 0\n",
"if abusersFound:\n",
"    r('INSIGHT: Found abusers -- this alert is likely a false alarm.')\n",
"delayedAccountsTable = akn.pandas_df_to_markdown_table(q_delayedAccountsAreAbusers_df)\n",
"r(delayedAccountsTable)\n",
"\n",
"\n",
"\n",
"# what changed? analysis\n",
"r('## What changed?')\n",
"if q_whatChanged_df.empty:\n",
"    r(\"...no relevant config changes recorded during this period.\")\n",
"else:\n",
"    # Seconds between every change and the start of the window. Both sides are\n",
"    # made timezone-naive so that they can be subtracted. assign() returns a\n",
"    # new frame, so the query result q_whatChanged_df itself is left untouched.\n",
"    t0 = startTime.replace(tzinfo=None)\n",
"    history = q_whatChanged_df.assign(\n",
"        RelativeSeconds=q_whatChanged_df.TIMESTAMP.apply(\n",
"            lambda ts: (ts.replace(tzinfo=None) - t0).total_seconds()))\n",
"    # only changes within an hour (before or after) of the start are relevant\n",
"    relevant = history[history.RelativeSeconds.abs() < 3600]\n",
"\n",
"    # analysis: look for well-known keywords in the titles of the relevant changes\n",
"    titles = [str(title).lower() for title in relevant.title.values]\n",
"    keywordInsights = [\n",
"        ('upgrade', 'INSIGHT: there were database upgrades in progress'),\n",
"        ('mitigation', 'INSIGHT: there were mitigations in progress'),\n",
"        ('vip', 'INSIGHT: there was a vip swap just before this period.'),\n",
"        ('feature flag', 'INSIGHT: there were feature flag changes right before this period.'),\n",
"    ]\n",
"    for keyword, insight in keywordInsights:\n",
"        if any(keyword in title for title in titles):\n",
"            r(insight)\n",
"\n",
"    # full table\n",
"    r(akn.pandas_df_to_markdown_table(relevant[['TIMESTAMP', 'RelativeSeconds', 'title']]))\n",
" \n",
" \n",
"# active incidents?\n",
"r('## Active incidents?')\n",
"# Incidents whose title contains the marker below are left out of the count\n",
"# (presumably they are the incidents raised for this very alert -- TODO confirm).\n",
"otherIncidentsCount = 0\n",
"if q_activeIncidentsResultDf is not None and not q_activeIncidentsResultDf.empty:\n",
"    otherIncidentsCount = sum(\n",
"        1 for incidentTitle in q_activeIncidentsResultDf.Title\n",
"        if \"Kalypso: Build Orchestrator Delays ICM\" not in incidentTitle)\n",
"\n",
"# Decided outside of the emptiness check so that the section always says\n",
"# something, including when the query returned no rows at all.\n",
"if otherIncidentsCount > 0:\n",
"    r(\"INSIGHT: There were incidents recorded during this period. These might be related:\")\n",
"    # extra column with a markdown link to every incident\n",
"    newDf = q_activeIncidentsResultDf.assign(URL=[\n",
"        \"[%s](https://icm.ad.msft.net/imp/v3/incidents/details/%s/home)\" % (incidentId, incidentId)\n",
"        for incidentId in q_activeIncidentsResultDf.IncidentId])\n",
"    r(\"\\n\")\n",
"    r(akn.pandas_df_to_markdown_table(newDf[['URL','Severity','Title']]))\n",
"else:\n",
"    r(\"...no relevant incidents during this period.\")\n",
" \n",
" \n",
"r('## Queue Load')\n",
"# third column of the DTPlanQueued rows of the load result; it is reported\n",
"# below as a per-minute figure\n",
"planQueuedRows = q_loadResultDf[q_loadResultDf[\"Name\"] == \"DTPlanQueued\"]\n",
"ar = planQueuedRows.values[:, 2]\n",
"highLoadPositions = np.where(ar > 500)[0]\n",
"ar_max = '?' if len(ar) == 0 else np.amax(ar)\n",
"if len(highLoadPositions) == 0:\n",
"    r('...everything looks good? (max: ', ar_max, ' / minute)')\n",
"else:\n",
"    r('INSIGHT: There was a high rate of jobs queued during this period (max: ', ar_max, ' / minute)...')\n",
" \n",
"r('## Orchestration phase Load')\n",
"for spikeResultDf in spikeResultsDfs:\n",
"    # A host whose query returned no rows has no HostId to read and nothing to analyse.\n",
"    if spikeResultDf.empty:\n",
"        r('...everything looks good?')\n",
"        continue\n",
"\n",
"    countResult = spikeResultDf.C.describe()\n",
"    hostId = spikeResultDf[\"HostId\"].values[0]\n",
"    upper = countResult[\"75%\"]\n",
"    lower = countResult[\"25%\"]\n",
"    # Wondering what's going on here? We detect anomalies as box-plot outliers,\n",
"    # i.e. counts above Q3 + 1.5 * IQR, see https://www.purplemath.com/modules/boxwhisk3.htm\n",
"    IQR = upper - lower\n",
"    countResultOfInterest = spikeResultDf[spikeResultDf[\"C\"] > upper + 1.5 * IQR].head(5)\n",
"    # unique commands, first-seen order preserved\n",
"    unqCommands = list(dict.fromkeys(countResultOfInterest[\"Command\"].values))\n",
"    if len(unqCommands) > 0:\n",
"        r(\"INSIGHT: Found anomalies for these phases in order highest to lowest for host: \", hostId)\n",
"\n",
"        # print commands table\n",
"        r(akn.pandas_df_to_markdown_table(countResultOfInterest[[\"Command\", \"C\"]]))\n",
"\n",
"        if \"PlanCompleted\" in unqCommands:\n",
"            if \"StartPlan\" in unqCommands or \"PlanStarted\" in unqCommands:\n",
"                r(\"\\nTIP: An unusual number of plans were started during this period.\")\n",
"            else:\n",
"                r(\"\\nTIP: Jobs that are queued long ago might have completed now... creating this spike\")\n",
"\n",
"        # ready-to-run follow-up queries for the first anomalous command of this host\n",
"        newParams = dict(params)\n",
"        newParams[\"command\"] = unqCommands[0]\n",
"        newParams[\"hostId\"] = hostId\n",
"        r(akn.details_md('Kusto query for analyzing spike:',\n",
"                         tokenize(os.path.join(queryPath, \"delays\", \"OrchestrationLogSpikeTip.csl\"), newParams)))\n",
"        r(akn.details_md('Kusto for analyzing load:', tokenize(q_load, newParams)))\n",
"    else:\n",
"        r('...everything looks good?')\n",
" \n",
"def _jarvis_dashboard_link(dashboard):\n",
"    '''Build the URL of a Jarvis dashboard from the values in jarvisParams.\n",
"\n",
"    dashboard -- dashboard path, appended to https://jarvis-west.dc.ad.msft.net/dashboard/\n",
"\n",
"    Returns the URL unquoted; callers pass it through requote_uri.\n",
"    '''\n",
"    query = (\n",
"        \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\"\n",
"        \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\"\n",
"        \"\"\"{\"query\":\"//*[id='LocationName']\",\"key\":\"value\",\"replacement\":\"%(location)s\"},\"\"\"\n",
"        \"\"\"{\"query\":\"//dataSources\",\"key\":\"namespace\",\"replacement\":\"%(su)s\"},\"\"\"\n",
"        \"\"\"{\"query\":\"//dataSources\",\"key\":\"account\",\"replacement\":\"%(account)s\"},\"\"\"\n",
"        \"\"\"{\"query\":\"//*[id='ApplicationEndpoint']\",\"key\":\"regex\",\"replacement\":\"*%(location)s*\"},\"\"\"\n",
"        \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\"\n",
"        \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\"\n",
"    ) % jarvisParams\n",
"    return \"https://jarvis-west.dc.ad.msft.net/dashboard/\" + dashboard + query\n",
"\n",
"\n",
"# ja load\n",
"r()\n",
"r('## JA Load')\n",
"# Backslashes are written escaped: \"\\J\" and \"\\T\" are not valid escape sequences.\n",
"pendingJobsPivot = \"\\\\JobService(_Total)\\\\Total Pending Jobs\"\n",
"q_whatDelayedResultPendingJobsDf = q_whatDelayedResultDf[q_whatDelayedResultDf.Pivot == pendingJobsPivot]\n",
"pendingJobs = q_whatDelayedResultPendingJobsDf.avg_CounterValue.values\n",
"pendingGreaterThan50Result = np.where(pendingJobs > 50)\n",
"if len(pendingGreaterThan50Result[0]) > 0:\n",
"    max_pending_jobs = np.max(pendingJobs)\n",
"    r(\"INSIGHT: There was a high number of pending jobs during this period (max was %s). Note that this is for jobs including all priorities (even low priority ones)\" % (max_pending_jobs))\n",
"\n",
"    open_nb(os.path.join(root, 'ja.ipynb'), params, redirect=False)\n",
"    jaUrl = baseUrl + \"/devops-pipelines/ja.ipynb\"\n",
"    r('\\n\\n[JobAgent investigation notebook](', requote_uri(jaUrl), ')')\n",
"\n",
"    jaJarvisLink = _jarvis_dashboard_link(\"VSO-ServiceInsights/PlatformViews/Compute-JA\")\n",
"    r('\\n\\n[JobAgent health dashboard](', requote_uri(jaJarvisLink), ')')\n",
"else:\n",
"    r('...everything looks good?')\n",
"\n",
"\n",
"# more analysis?\n",
"r('## What should we look at next?')\n",
"# The follow-up links get their own names so that the notebook parameter\n",
"# `url` is not overwritten by running this cell.\n",
"SLAParams = {\n",
"    \"triggerTime\": params[\"start\"],\n",
"    \"scaleUnit\": params[\"su\"],\n",
"    \"service\": params[\"service\"],\n",
"    \"lookback\": \"1h\",\n",
"    \"region\": \"\"\n",
"}\n",
"open_nb(os.path.join(root, 'sla.ipynb'), SLAParams, redirect=False)\n",
"slaUrl = baseUrl + \"/devops-pipelines/sla.ipynb\"\n",
"r('\\n\\n[SLA investigation notebook](', requote_uri(slaUrl), ')')\n",
"\n",
"open_nb(os.path.join(root, 'impact.ipynb'), params, redirect=False)\n",
"impactUrl = baseUrl + \"/devops-pipelines/impact.ipynb\"\n",
"r('\\n\\n[Customer impact investigation notebook](', requote_uri(impactUrl), ')')\n",
"\n",
"# Scale unit health\n",
"jarvisLink = _jarvis_dashboard_link(\"VSO-ServiceInsights/DevOpsReports/DevOpsReports\")\n",
"r('\\n\\n[Scale unit health dashboard (' + su + ', ' + service + ')](', requote_uri(jarvisLink), ')')\n",
"\n",
"# last expression of the cell: render the assembled report\n",
"Markdown(content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# visualize delays\n",
"import plotly\n",
"from plotly import graph_objs as go\n",
"\n",
"# message delay over time for the affected accounts\n",
"delays = go.Scatter(\n",
"    x=q_affAccounts_df[\"PreciseTimeStamp\"],\n",
"    y=q_affAccounts_df[\"MessageDelayInSeconds\"],\n",
"    mode='lines',\n",
"    name='Delays in seconds',\n",
"    text=q_affAccounts_df['Name'],\n",
")\n",
"\n",
"# The report cell reads the description of a change from the \"title\" column,\n",
"# so fall back to that column when the result carries no \"Name\" column.\n",
"changedTextColumn = \"Name\" if \"Name\" in q_whatChanged_df.columns else \"title\"\n",
"changedCount = len(q_whatChanged_df)\n",
"changed = go.Scatter(\n",
"    x=q_whatChanged_df[\"TIMESTAMP\"],\n",
"    # changes have no delay value of their own: draw them on a fixed line at y=50\n",
"    y=np.repeat(50, changedCount),\n",
"    mode='lines+markers',\n",
"    name='What Changed',\n",
"    text=q_whatChanged_df[changedTextColumn],\n",
"    marker=dict(\n",
"        size=32,\n",
"        # One colour value per point, identical on every run (this used to be\n",
"        # 500 random numbers no matter how many changes there were).\n",
"        color=np.arange(changedCount),\n",
"        colorscale='Viridis'\n",
"    )\n",
")\n",
"\n",
"mitigations = go.Scatter(\n",
"    x=q_haActions_df[\"PreciseTimeStamp\"],\n",
"    y=np.repeat(50, len(q_haActions_df)),\n",
"    mode='markers',\n",
"    name='Mitigations',\n",
"    # Column-wise concatenation: unlike a row-wise apply it also yields a\n",
"    # Series when there are no mitigations at all.\n",
"    text=q_haActions_df[\"MitigationName\"].astype(str) + q_haActions_df[\"RoleInstance\"].astype(str),\n",
"    marker=dict(\n",
"        size=10,\n",
"        color='rgba(152, 0, 0, .8)',\n",
"        line=dict(\n",
"            width=2,\n",
"            color='rgb(0, 0, 0)'\n",
"        )\n",
"    )\n",
")\n",
"\n",
"data = [delays, changed, mitigations]\n",
"plotly.offline.iplot(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernel_info": {
"name": "python3"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"nteract": {
"version": "0.14.5"
}
},
"nbformat": 4,
"nbformat_minor": 0
}