devops-pipelines/delays.ipynb

459 строки
18 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Orchestration delays Investigation\n",
"1. Run all cells.\n",
"1. View report at the bottom."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false,
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# These are just defaults will be overwritten if you use nimport pip\n",
"su=\"tfs-cus-1\"\n",
"start=\"2019-07-20T16:00:00.0000000Z\"\n",
"end=\"2019-07-20T16:33:36.0000000Z\"\n",
"url=\"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\"\n",
"baseUrl=\"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\"\n",
"service=\"tfs\"\n",
"hub=\"Build\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"#%%capture\n",
"# This isn't needed if you are bootstraping\n",
"!pip install nimport azure-kusto-notebooks"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# Import the things we use\n",
"\n",
"# Note you can also use kql https://docs.microsoft.com/en-us/azure/data-explorer/kqlmagic\n",
"# %kql is single line magic\n",
"# %%kql is cell magic\n",
"\n",
"# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n",
"# https://ipython.readthedocs.io/en/stable/inte/magics.html\n",
"from IPython.display import display, HTML, Markdown, Javascript, clear_output\n",
"\n",
"# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n",
"import pandas as pd\n",
"pd.options.display.html.table_schema = True\n",
"from pandas import Series, DataFrame\n",
"from datetime import datetime, timedelta, timezone\n",
"from urllib.parse import urlencode, quote_plus\n",
"from requests.utils import requote_uri\n",
"import time\n",
"import numpy as np\n",
"from matplotlib import pyplot as plt\n",
"from nimport.utils import tokenize, open_nb\n",
"import json\n",
"import os\n",
"import calendar as cal\n",
"import concurrent.futures\n",
"from azure.kusto.notebooks import utils as akn"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"params = {\n",
" \"su\": su,\n",
" \"start\": start,\n",
" \"end\": end,\n",
" \"url\": url,\n",
" \"baseUrl\": baseUrl,\n",
" \"service\": service,\n",
" \"hub\": hub\n",
"}\n",
"root = 'devops-pipelines' if os.path.basename(os.getcwd()) != 'devops-pipelines' else ''\n",
"queryPath = os.path.join(root, 'queries')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# authenticate kusto client\n",
"# you will need to copy the token into a browser window for AAD auth. \n",
"client = akn.get_client('https://vso.kusto.windows.net', 'VSO')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# authenticate kusto client\n",
"# you will need to copy the token into a browser window for AAD auth. \n",
"icm_client = akn.get_client('https://icmcluster.kusto.windows.net', 'IcMDataWarehouse')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"q_loc = os.path.join(queryPath, \"LocationName.csl\")\n",
"q_whatChanged = os.path.join(queryPath, \"WhatChanged.csl\")\n",
"q_vipSwap = os.path.join(queryPath, \"VIPSwap.csl\")\n",
"q_haActions = os.path.join(queryPath, \"HealthAgentActions.csl\")\n",
"\n",
"delaysPath = os.path.join(queryPath, \"delays\")\n",
"q_affectedAccounts = os.path.join(delaysPath, \"AffectedAccounts.csl\")\n",
"q_abusers = os.path.join(delaysPath, \"Abusers.csl\")\n",
"q_affAccounts = os.path.join(delaysPath, \"AffectedAccounts.csl\")\n",
"q_delayedAccountsAreAbusers = os.path.join(delaysPath, \"DelayedAccountsAreAbusers.csl\")\n",
"q_whatDelayed = os.path.join(delaysPath, \"WhatDelayed.csl\")\n",
"q_load = os.path.join(delaysPath, \"Load.csl\")\n",
"\n",
"with concurrent.futures.ThreadPoolExecutor() as executor:\n",
" # materialize location name immediately as we need this for other queries\n",
" p1 = executor.submit(akn.execute_file, client, 'VSO', q_loc, params)\n",
" locationNameResult = akn.to_dataframe_from_future(p1)\n",
" locationName = locationNameResult[\"Tenant\"][0]\n",
" params[\"locationName\"] = locationName\n",
" p2 = executor.submit(akn.execute_file, client, 'VSO', q_whatChanged, params)\n",
" p3 = executor.submit(akn.execute_file, client, 'VSO', q_vipSwap, params)\n",
" p4 = executor.submit(akn.execute_file, client, 'VSO', q_haActions, params) \n",
" \n",
" p5 = executor.submit(akn.execute_file, client, 'VSO', q_affectedAccounts, params)\n",
" p6 = executor.submit(akn.execute_file, client, 'VSO', q_abusers, params)\n",
" p7 = executor.submit(akn.execute_file, client, 'VSO', q_affAccounts, params)\n",
" p8 = executor.submit(akn.execute_file, client, 'VSO', q_delayedAccountsAreAbusers, params)\n",
" p9 = executor.submit(akn.execute_file, client, 'VSO', q_whatDelayed, params)\n",
" p10 = executor.submit(akn.execute_file, client, 'VSO', q_load, params)\n",
" \n",
" p11 = executor.submit(akn.execute_file, icm_client, 'IcmDataWarehouse', \n",
" os.path.join(queryPath, 'ActiveIncidents.csl'), params)\n",
"\n",
"q_whatChanged_df = akn.to_dataframe_from_future(p2)\n",
"vipSwapResultDf = akn.to_dataframe_from_future(p3)\n",
"q_haActions_df = akn.to_dataframe_from_future(p4)\n",
"q_affectedAccountsResultDf = akn.to_dataframe_from_future(p5)\n",
"\n",
"abusersDf = akn.to_dataframe_from_future(p6)\n",
"finalabusersList = np.intersect1d(q_affectedAccountsResultDf[\"HostId\"].values, abusersDf[\"HostId\"].values);\n",
"\n",
"q_affAccounts_df = akn.to_dataframe_from_future(p7)\n",
"q_delayedAccountsAreAbusers_df = akn.to_dataframe_from_future(p8)\n",
"q_whatDelayedResultDf = akn.to_dataframe_from_future(p9)\n",
"q_loadResultDf = akn.to_dataframe_from_future(p10)\n",
"\n",
"q_activeIncidentsResultDf = akn.to_dataframe_from_future(p11)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# visualize delays\n",
"import plotly\n",
"from plotly import graph_objs as go\n",
"delays = go.Scatter(\n",
" x=q_affAccounts_df[\"PreciseTimeStamp\"],\n",
" y=q_affAccounts_df[\"MessageDelayInSeconds\"],\n",
" mode = 'lines',\n",
" name = 'Delays in seconds',\n",
" text= q_affAccounts_df['Name']\n",
")\n",
"\n",
"changed = go.Scatter(\n",
" x=q_whatChanged_df[\"TIMESTAMP\"],\n",
" y=np.repeat(50, len(q_whatChanged_df[\"TIMESTAMP\"].values)),\n",
" mode = 'lines+markers',\n",
" name = 'What Changed',\n",
" text = q_whatChanged_df[\"Name\"],\n",
" marker=dict(\n",
" size=32,\n",
" color = np.random.randn(500),\n",
" colorscale='Viridis'\n",
" )\n",
")\n",
"\n",
"mitigations = go.Scatter(\n",
" x=q_haActions_df[\"PreciseTimeStamp\"],\n",
" y=np.repeat(50, len(q_haActions_df[\"PreciseTimeStamp\"].values)),\n",
" mode = 'markers',\n",
" name = 'Mitigations',\n",
" text = q_haActions_df[[\"MitigationName\", \"RoleInstance\"]].apply(lambda x: ''.join(x), axis=1),\n",
" marker = dict(\n",
" size = 10,\n",
" color = 'rgba(152, 0, 0, .8)',\n",
" line = dict(\n",
" width = 2,\n",
" color = 'rgb(0, 0, 0)'\n",
" )\n",
" )\n",
")\n",
"\n",
"data = [delays, changed, mitigations]\n",
"plotly.offline.iplot(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# utility functions\n",
"content = ''\n",
"def r(*args):\n",
" '''construct a markdown report'''\n",
" global content\n",
" content += ''.join([str(a) for a in args]) + '\\n'\n",
"\n",
"def pandas_df_to_markdown_table(df):\n",
" from IPython.display import Markdown, display\n",
" fmt = ['---' for i in range(len(df.columns))]\n",
" df_fmt = pd.DataFrame([fmt], columns=df.columns)\n",
" df_formatted = pd.concat([df_fmt, df])\n",
" return df_formatted.to_csv(sep=\"|\", index=False)\n",
"\n",
"# report! \n",
"r('# OK SO WHAT HAPPENED')\n",
"r('|parameter|value|')\n",
"r('|---|---|')\n",
"r('|startTime|', akn.to_datetime(start), '|')\n",
"r('|endTime|', akn.to_datetime(end), '|')\n",
"r('|scale unit|', su, '|')\n",
"r('|service|', service, '|')\n",
"\n",
"# jarvis params\n",
"jarvisParams = {\n",
" 'su': su, \n",
" 'start': akn.get_time(start, -10), \n",
" 'end': akn.get_time(end, 10), \n",
" 'service': service \n",
"}\n",
"\n",
"# what changed? analysis\n",
"r('## What changed?')\n",
"if(len(q_whatChanged_df.index) == 0):\n",
" r(\"...no relevant config changes recorded during this period.\")\n",
"else:\n",
" up_prefix = \"\"\n",
" mit_prefix = \"\"\n",
" text = \"\"\n",
" for index, row in q_whatChanged_df.iterrows():\n",
" if(row.title.lower().find('upgrade') != -1):\n",
" if not up_prefix:\n",
" up_prefix += \"Looks like, there's upgrade in progress...\\n\\n\"\n",
" text += \"\"\"%s %s %s \\n\\n\"\"\" % (row.TIMESTAMP, row.title, row.buildNumber)\n",
" if(row.title.lower().find('mitigation') != -1):\n",
" if not mit_prefix:\n",
" mit_prefix += \"Looks like, there are some mitigations by health agent...\\n\\n\"\n",
" text += \"\"\"%s %s %s \\n\\n\"\"\" % (row.TIMESTAMP, row.title, row.buildNumber)\n",
" \n",
" if text:\n",
" r(up_prefix + mit_prefix + text)\n",
" else:\n",
" r(pandas_df_to_markdown_table(q_whatChanged_df))\n",
" \n",
" \n",
" \n",
"# active incidents?\n",
"r('## Active incidents?')\n",
"otherIncidentsCount = 0;\n",
"for index, row in q_activeIncidentsResultDf.iterrows():\n",
" if(row.Title.find(\"Kalypso: Build Orchestrator Delays ICM\") == -1):\n",
" otherIncidentsCount += 1\n",
" \n",
"if(otherIncidentsCount > 0):\n",
" r(\"INSIGHT: There were incidents recorded during this period. These might be related:\")\n",
" \n",
" def make_clickable(url, text):\n",
" '''styling'''\n",
" return '{0}'.format(url)\n",
"\n",
" newDf = q_activeIncidentsResultDf.assign(URL=[*map(lambda x: make_clickable(\"\"\"https://icm.ad.msft.net/imp/v3/incidents/details/%s/home\"\"\" % (x), \"ICMLink\"), q_activeIncidentsResultDf.IncidentId)])\n",
" r(\"ICM link to copy - \", \"https://icm.ad.msft.net/imp/v3/incidents/details/INCIDENTID/home\", \"\\n\")\n",
" r(pandas_df_to_markdown_table(newDf[['IncidentId','Severity','Title']]))\n",
"else:\n",
" r(\"...no relevant incidents during this period.\")\n",
"\n",
" \n",
" \n",
" \n",
"r('## Queue Load')\n",
"ar = q_loadResultDf[q_loadResultDf[\"Name\"] == \"DTPlanQueued\"].values[:, 2]\n",
"queuedGreatherThan100 = np.where(ar > 100)\n",
"if len(queuedGreatherThan100[0]) > 0:\n",
" r('INSIGHT: There was a high rate of jobs queued during this perdiod (max: ', np.amax(ar), ' / minute).')\n",
"else: \n",
" r('...everything looks good?')\n",
"\n",
" \n",
" \n",
"# ja load\n",
"r('## JA Load')\n",
"q_whatDelayedResultPendingJobsDf = q_whatDelayedResultDf[q_whatDelayedResultDf.Pivot == \"\\JobService(_Total)\\Total Pending Jobs\"]\n",
"pendingGreaterThan10Result = np.where(q_whatDelayedResultPendingJobsDf.avg_CounterValue.values > 10)\n",
"if len(pendingGreaterThan10Result[0]) > 0:\n",
" max_pending_jobs = np.max(q_whatDelayedResultPendingJobsDf.avg_CounterValue.values)\n",
" r(\"INSIGHT: There was a high number of pending jobs during this period (max was %s).\" % (max_pending_jobs)) \n",
" \n",
" open_nb(os.path.join(root, 'ja.ipynb'), params, redirect=False)\n",
" jaUrl = baseUrl + \"/devops-pipelines/ja.ipynb\"\n",
" r('\\n\\n[JobAgent investigation notebook](', requote_uri(jaUrl), ')')\n",
"\n",
" jaJarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/PlatformViews/Compute-JA\"\"\" \\\n",
" \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n",
" \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n",
" \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n",
" \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams\n",
" r('\\n\\n[JobAgent health dashboard](', requote_uri(jaJarvisLink), ')')\n",
"else:\n",
" r('...everything looks good?')\n",
" \n",
"# abuse detection?\n",
"r('## What users are impacted?')\n",
"if len(finalabusersList) > 0:\n",
" r('Found abusers -- this alert is likely a false alarm.')\n",
"r(pandas_df_to_markdown_table(q_delayedAccountsAreAbusers_df))\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"# vip swap\n",
"r('## Vip Swap?')\n",
"if len(vipSwapResultDf.index) > 0:\n",
" viptime = vipSwapResultDf[\"TIMESTAMP\"][0]\n",
" starttime = akn.to_datetime(start)\n",
" delta = starttime.replace(tzinfo=None) - viptime.replace(tzinfo=None)\n",
" if delta.total_seconds() > 0:\n",
" r(\"\"\"INSIGHT: vip swap recorded %s days %s hours %s minutes before the start time (at %s)\"\"\" % \n",
" (delta.days, delta.seconds//3600, (delta.seconds//60) % 60, viptime))\n",
" else:\n",
" r('...no swaps recorded during this period')\n",
"else:\n",
" r('...no swaps recorded during this period')\n",
"\n",
" \n",
" \n",
" \n",
"# more analysis? \n",
"r('## More analysis')\n",
"url = baseUrl + \"/devops-pipelines/sla.ipynb\"\n",
"SLAParams = {\n",
" \"triggerTime\": params[\"start\"],\n",
" \"scaleUnit\": params[\"su\"],\n",
" \"service\": params[\"service\"]\n",
"}\n",
"open_nb(os.path.join(root, 'sla.ipynb'), SLAParams, redirect=False)\n",
"r('\\n\\n[SLA investigation notebook](', requote_uri(url), ')') \n",
"\n",
"url = baseUrl + \"/devops-pipelines/impact.ipynb\"\n",
"open_nb(os.path.join(root, 'impact.ipynb'), params, redirect=False)\n",
"r('\\n\\n[Customer impact investigation notebook](', requote_uri(url), ')') \n",
"\n",
"# Scale unit health\n",
"jarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/DevOpsReports/TFS DevOpsReports\"\"\" \\\n",
" \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n",
" \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n",
" \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n",
" \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams\n",
"r('\\n\\n[Scale unit health dashboard (' + su + ', ' + service + ')](', requote_uri(jarvisLink), ')')\n",
"\n",
"\n",
"Markdown(content)\n",
"# print(content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernel_info": {
"name": "python3"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"nteract": {
"version": "0.14.5"
}
},
"nbformat": 4,
"nbformat_minor": 0
}