459 строки
18 KiB
Plaintext
459 строки
18 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Orchestration delays Investigation\n",
|
|
"1. Run all cells.\n",
|
|
"1. View report at the bottom."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false,
|
|
"tags": [
|
|
"parameters"
|
|
]
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# These are just defaults; they will be overwritten if you use the nimport pip package\n",
|
|
"su=\"tfs-cus-1\"\n",
|
|
"start=\"2019-07-20T16:00:00.0000000Z\"\n",
|
|
"end=\"2019-07-20T16:33:36.0000000Z\"\n",
|
|
"url=\"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\"\n",
|
|
"baseUrl=\"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\"\n",
|
|
"service=\"tfs\"\n",
|
|
"hub=\"Build\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"#%%capture\n",
|
|
"# This isn't needed if you are bootstrapping\n",
|
|
"%pip install nimport azure-kusto-notebooks"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Import the things we use\n",
|
|
"\n",
|
|
"# Note you can also use kql https://docs.microsoft.com/en-us/azure/data-explorer/kqlmagic\n",
|
|
"# %kql is single line magic\n",
|
|
"# %%kql is cell magic\n",
|
|
"\n",
|
|
"# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n",
|
|
"# https://ipython.readthedocs.io/en/stable/interactive/magics.html\n",
|
|
"from IPython.display import display, HTML, Markdown, Javascript, clear_output\n",
|
|
"\n",
|
|
"# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n",
|
|
"import pandas as pd\n",
|
|
"pd.options.display.html.table_schema = True\n",
|
|
"from pandas import Series, DataFrame\n",
|
|
"from datetime import datetime, timedelta, timezone\n",
|
|
"from urllib.parse import urlencode, quote_plus\n",
|
|
"from requests.utils import requote_uri\n",
|
|
"import time\n",
|
|
"import numpy as np\n",
|
|
"from matplotlib import pyplot as plt\n",
|
|
"from nimport.utils import tokenize, open_nb\n",
|
|
"import json\n",
|
|
"import os\n",
|
|
"import calendar as cal\n",
|
|
"import concurrent.futures\n",
|
|
"from azure.kusto.notebooks import utils as akn"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"params = {\n",
|
|
" \"su\": su,\n",
|
|
" \"start\": start,\n",
|
|
" \"end\": end,\n",
|
|
" \"url\": url,\n",
|
|
" \"baseUrl\": baseUrl,\n",
|
|
" \"service\": service,\n",
|
|
" \"hub\": hub\n",
|
|
"}\n",
|
|
"root = 'devops-pipelines' if os.path.basename(os.getcwd()) != 'devops-pipelines' else ''\n",
|
|
"queryPath = os.path.join(root, 'queries')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# authenticate kusto client\n",
|
|
"# you will need to copy the token into a browser window for AAD auth. \n",
|
|
"client = akn.get_client('https://vso.kusto.windows.net', 'VSO')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# authenticate the IcM kusto client\n",
|
|
"# you will need to copy the token into a browser window for AAD auth. \n",
|
|
"icm_client = akn.get_client('https://icmcluster.kusto.windows.net', 'IcMDataWarehouse')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"q_loc = os.path.join(queryPath, \"LocationName.csl\")\n",
|
|
"q_whatChanged = os.path.join(queryPath, \"WhatChanged.csl\")\n",
|
|
"q_vipSwap = os.path.join(queryPath, \"VIPSwap.csl\")\n",
|
|
"q_haActions = os.path.join(queryPath, \"HealthAgentActions.csl\")\n",
|
|
"\n",
|
|
"delaysPath = os.path.join(queryPath, \"delays\")\n",
|
|
"q_affectedAccounts = os.path.join(delaysPath, \"AffectedAccounts.csl\")\n",
|
|
"q_abusers = os.path.join(delaysPath, \"Abusers.csl\")\n",
|
|
"q_affAccounts = os.path.join(delaysPath, \"AffectedAccounts.csl\")\n",
|
|
"q_delayedAccountsAreAbusers = os.path.join(delaysPath, \"DelayedAccountsAreAbusers.csl\")\n",
|
|
"q_whatDelayed = os.path.join(delaysPath, \"WhatDelayed.csl\")\n",
|
|
"q_load = os.path.join(delaysPath, \"Load.csl\")\n",
|
|
"\n",
|
|
"with concurrent.futures.ThreadPoolExecutor() as executor:\n",
|
|
" # materialize location name immediately as we need this for other queries\n",
|
|
" p1 = executor.submit(akn.execute_file, client, 'VSO', q_loc, params)\n",
|
|
" locationNameResult = akn.to_dataframe_from_future(p1)\n",
|
|
" locationName = locationNameResult[\"Tenant\"][0]\n",
|
|
" params[\"locationName\"] = locationName\n",
|
|
" p2 = executor.submit(akn.execute_file, client, 'VSO', q_whatChanged, params)\n",
|
|
" p3 = executor.submit(akn.execute_file, client, 'VSO', q_vipSwap, params)\n",
|
|
" p4 = executor.submit(akn.execute_file, client, 'VSO', q_haActions, params) \n",
|
|
" \n",
|
|
" p5 = executor.submit(akn.execute_file, client, 'VSO', q_affectedAccounts, params)\n",
|
|
" p6 = executor.submit(akn.execute_file, client, 'VSO', q_abusers, params)\n",
|
|
" p7 = executor.submit(akn.execute_file, client, 'VSO', q_affAccounts, params)\n",
|
|
" p8 = executor.submit(akn.execute_file, client, 'VSO', q_delayedAccountsAreAbusers, params)\n",
|
|
" p9 = executor.submit(akn.execute_file, client, 'VSO', q_whatDelayed, params)\n",
|
|
" p10 = executor.submit(akn.execute_file, client, 'VSO', q_load, params)\n",
|
|
" \n",
|
|
" p11 = executor.submit(akn.execute_file, icm_client, 'IcmDataWarehouse', \n",
|
|
" os.path.join(queryPath, 'ActiveIncidents.csl'), params)\n",
|
|
"\n",
|
|
"q_whatChanged_df = akn.to_dataframe_from_future(p2)\n",
|
|
"vipSwapResultDf = akn.to_dataframe_from_future(p3)\n",
|
|
"q_haActions_df = akn.to_dataframe_from_future(p4)\n",
|
|
"q_affectedAccountsResultDf = akn.to_dataframe_from_future(p5)\n",
|
|
"\n",
|
|
"abusersDf = akn.to_dataframe_from_future(p6)\n",
|
|
"finalabusersList = np.intersect1d(q_affectedAccountsResultDf[\"HostId\"].values, abusersDf[\"HostId\"].values);\n",
|
|
"\n",
|
|
"q_affAccounts_df = akn.to_dataframe_from_future(p7)\n",
|
|
"q_delayedAccountsAreAbusers_df = akn.to_dataframe_from_future(p8)\n",
|
|
"q_whatDelayedResultDf = akn.to_dataframe_from_future(p9)\n",
|
|
"q_loadResultDf = akn.to_dataframe_from_future(p10)\n",
|
|
"\n",
|
|
"q_activeIncidentsResultDf = akn.to_dataframe_from_future(p11)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# visualize delays\n",
|
|
"import plotly\n",
|
|
"from plotly import graph_objs as go\n",
|
|
"delays = go.Scatter(\n",
|
|
" x=q_affAccounts_df[\"PreciseTimeStamp\"],\n",
|
|
" y=q_affAccounts_df[\"MessageDelayInSeconds\"],\n",
|
|
" mode = 'lines',\n",
|
|
" name = 'Delays in seconds',\n",
|
|
" text= q_affAccounts_df['Name']\n",
|
|
")\n",
|
|
"\n",
|
|
"changed = go.Scatter(\n",
|
|
"    x=q_whatChanged_df[\"TIMESTAMP\"],\n",
|
|
"    y=np.repeat(50, len(q_whatChanged_df[\"TIMESTAMP\"].values)),\n",
|
|
"    mode = 'lines+markers',\n",
|
|
"    name = 'What Changed',\n",
|
|
"    text = q_whatChanged_df[\"Name\"],\n",
|
|
"    marker=dict(\n",
|
|
"        size=32,\n",
|
|
"        # one deterministic colour value per row, so re-running the cell reproduces the same figure\n",
|
|
"        color = np.arange(len(q_whatChanged_df)),\n",
|
|
"        colorscale='Viridis'\n",
|
|
"    )\n",
|
|
")\n",
|
|
"\n",
|
|
"mitigations = go.Scatter(\n",
|
|
" x=q_haActions_df[\"PreciseTimeStamp\"],\n",
|
|
" y=np.repeat(50, len(q_haActions_df[\"PreciseTimeStamp\"].values)),\n",
|
|
" mode = 'markers',\n",
|
|
" name = 'Mitigations',\n",
|
|
" text = q_haActions_df[[\"MitigationName\", \"RoleInstance\"]].apply(lambda x: ''.join(x), axis=1),\n",
|
|
" marker = dict(\n",
|
|
" size = 10,\n",
|
|
" color = 'rgba(152, 0, 0, .8)',\n",
|
|
" line = dict(\n",
|
|
" width = 2,\n",
|
|
" color = 'rgb(0, 0, 0)'\n",
|
|
" )\n",
|
|
" )\n",
|
|
")\n",
|
|
"\n",
|
|
"data = [delays, changed, mitigations]\n",
|
|
"plotly.offline.iplot(data)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# utility functions\n",
|
|
"content = ''\n",
|
|
"def r(*args):\n",
|
|
"    '''Append one line to the markdown report: all arguments stringified and concatenated, then a newline.'''\n",
|
|
"    global content\n",
|
|
"    line = ''.join(map(str, args))\n",
|
|
"    content = content + line + '\\n'\n",
|
|
"\n",
|
|
"def pandas_df_to_markdown_table(df):\n",
|
|
"    '''Return df as pipe-delimited text: the header row, a row of '---' cells, then the data rows.'''\n",
|
|
"    # the '---' row sits directly under the header that to_csv writes\n",
|
|
"    separator_row = pd.DataFrame([['---'] * len(df.columns)], columns=df.columns)\n",
|
|
"    return pd.concat([separator_row, df]).to_csv(sep=\"|\", index=False)\n",
|
|
"\n",
|
|
"# report! \n",
|
|
"r('# OK SO WHAT HAPPENED')\n",
|
|
"r('|parameter|value|')\n",
|
|
"r('|---|---|')\n",
|
|
"r('|startTime|', akn.to_datetime(start), '|')\n",
|
|
"r('|endTime|', akn.to_datetime(end), '|')\n",
|
|
"r('|scale unit|', su, '|')\n",
|
|
"r('|service|', service, '|')\n",
|
|
"\n",
|
|
"# jarvis params\n",
|
|
"jarvisParams = {\n",
|
|
" 'su': su, \n",
|
|
" 'start': akn.get_time(start, -10), \n",
|
|
" 'end': akn.get_time(end, 10), \n",
|
|
" 'service': service \n",
|
|
"}\n",
|
|
"\n",
|
|
"# what changed? analysis\n",
|
|
"r('## What changed?')\n",
|
|
"if(len(q_whatChanged_df.index) == 0):\n",
|
|
" r(\"...no relevant config changes recorded during this period.\")\n",
|
|
"else:\n",
|
|
" up_prefix = \"\"\n",
|
|
" mit_prefix = \"\"\n",
|
|
" text = \"\"\n",
|
|
" for index, row in q_whatChanged_df.iterrows():\n",
|
|
" if(row.title.lower().find('upgrade') != -1):\n",
|
|
" if not up_prefix:\n",
|
|
" up_prefix += \"Looks like, there's upgrade in progress...\\n\\n\"\n",
|
|
" text += \"\"\"%s %s %s \\n\\n\"\"\" % (row.TIMESTAMP, row.title, row.buildNumber)\n",
|
|
" if(row.title.lower().find('mitigation') != -1):\n",
|
|
" if not mit_prefix:\n",
|
|
" mit_prefix += \"Looks like, there are some mitigations by health agent...\\n\\n\"\n",
|
|
" text += \"\"\"%s %s %s \\n\\n\"\"\" % (row.TIMESTAMP, row.title, row.buildNumber)\n",
|
|
" \n",
|
|
" if text:\n",
|
|
" r(up_prefix + mit_prefix + text)\n",
|
|
" else:\n",
|
|
" r(pandas_df_to_markdown_table(q_whatChanged_df))\n",
|
|
" \n",
|
|
" \n",
|
|
" \n",
|
|
"# active incidents?\n",
|
|
"r('## Active incidents?')\n",
|
|
"otherIncidentsCount = 0;\n",
|
|
"for index, row in q_activeIncidentsResultDf.iterrows():\n",
|
|
" if(row.Title.find(\"Kalypso: Build Orchestrator Delays ICM\") == -1):\n",
|
|
" otherIncidentsCount += 1\n",
|
|
" \n",
|
|
"if(otherIncidentsCount > 0):\n",
|
|
" r(\"INSIGHT: There were incidents recorded during this period. These might be related:\")\n",
|
|
" \n",
|
|
"    def make_clickable(url, text):\n",
|
|
"        '''Return an HTML anchor that displays `text` and links to `url`.'''\n",
|
|
"        return '<a href=\"{0}\">{1}</a>'.format(url, text)\n",
|
|
"\n",
|
|
" newDf = q_activeIncidentsResultDf.assign(URL=[*map(lambda x: make_clickable(\"\"\"https://icm.ad.msft.net/imp/v3/incidents/details/%s/home\"\"\" % (x), \"ICMLink\"), q_activeIncidentsResultDf.IncidentId)])\n",
|
|
" r(\"ICM link to copy - \", \"https://icm.ad.msft.net/imp/v3/incidents/details/INCIDENTID/home\", \"\\n\")\n",
|
|
" r(pandas_df_to_markdown_table(newDf[['IncidentId','Severity','Title']]))\n",
|
|
"else:\n",
|
|
" r(\"...no relevant incidents during this period.\")\n",
|
|
"\n",
|
|
" \n",
|
|
" \n",
|
|
" \n",
|
|
"r('## Queue Load')\n",
|
|
"ar = q_loadResultDf[q_loadResultDf[\"Name\"] == \"DTPlanQueued\"].values[:, 2]\n",
|
|
"queuedGreatherThan100 = np.where(ar > 100)\n",
|
|
"if len(queuedGreatherThan100[0]) > 0:\n",
|
|
"    r('INSIGHT: There was a high rate of jobs queued during this period (max: ', np.amax(ar), ' / minute).')\n",
|
|
"else: \n",
|
|
"    r('...everything looks good?')\n",
|
|
"\n",
|
|
" \n",
|
|
" \n",
|
|
"# ja load\n",
|
|
"r('## JA Load')\n",
|
|
"q_whatDelayedResultPendingJobsDf = q_whatDelayedResultDf[q_whatDelayedResultDf.Pivot == \"\\\\JobService(_Total)\\\\Total Pending Jobs\"]\n",
|
|
"pendingGreaterThan10Result = np.where(q_whatDelayedResultPendingJobsDf.avg_CounterValue.values > 10)\n",
|
|
"if len(pendingGreaterThan10Result[0]) > 0:\n",
|
|
" max_pending_jobs = np.max(q_whatDelayedResultPendingJobsDf.avg_CounterValue.values)\n",
|
|
" r(\"INSIGHT: There was a high number of pending jobs during this period (max was %s).\" % (max_pending_jobs)) \n",
|
|
" \n",
|
|
" open_nb(os.path.join(root, 'ja.ipynb'), params, redirect=False)\n",
|
|
" jaUrl = baseUrl + \"/devops-pipelines/ja.ipynb\"\n",
|
|
" r('\\n\\n[JobAgent investigation notebook](', requote_uri(jaUrl), ')')\n",
|
|
"\n",
|
|
" jaJarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/PlatformViews/Compute-JA\"\"\" \\\n",
|
|
" \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n",
|
|
" \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n",
|
|
" \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n",
|
|
" \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams\n",
|
|
" r('\\n\\n[JobAgent health dashboard](', requote_uri(jaJarvisLink), ')')\n",
|
|
"else:\n",
|
|
" r('...everything looks good?')\n",
|
|
" \n",
|
|
"# abuse detection?\n",
|
|
"r('## What users are impacted?')\n",
|
|
"if len(finalabusersList) > 0:\n",
|
|
" r('Found abusers -- this alert is likely a false alarm.')\n",
|
|
"r(pandas_df_to_markdown_table(q_delayedAccountsAreAbusers_df))\n",
|
|
" \n",
|
|
" \n",
|
|
" \n",
|
|
" \n",
|
|
" \n",
|
|
"# vip swap\n",
|
|
"r('## Vip Swap?')\n",
|
|
"if len(vipSwapResultDf.index) > 0:\n",
|
|
" viptime = vipSwapResultDf[\"TIMESTAMP\"][0]\n",
|
|
" starttime = akn.to_datetime(start)\n",
|
|
" delta = starttime.replace(tzinfo=None) - viptime.replace(tzinfo=None)\n",
|
|
" if delta.total_seconds() > 0:\n",
|
|
" r(\"\"\"INSIGHT: vip swap recorded %s days %s hours %s minutes before the start time (at %s)\"\"\" % \n",
|
|
" (delta.days, delta.seconds//3600, (delta.seconds//60) % 60, viptime))\n",
|
|
" else:\n",
|
|
" r('...no swaps recorded during this period')\n",
|
|
"else:\n",
|
|
" r('...no swaps recorded during this period')\n",
|
|
"\n",
|
|
" \n",
|
|
" \n",
|
|
" \n",
|
|
"# more analysis? \n",
|
|
"r('## More analysis')\n",
|
|
"url = baseUrl + \"/devops-pipelines/sla.ipynb\"\n",
|
|
"SLAParams = {\n",
|
|
" \"triggerTime\": params[\"start\"],\n",
|
|
" \"scaleUnit\": params[\"su\"],\n",
|
|
" \"service\": params[\"service\"]\n",
|
|
"}\n",
|
|
"open_nb(os.path.join(root, 'sla.ipynb'), SLAParams, redirect=False)\n",
|
|
"r('\\n\\n[SLA investigation notebook](', requote_uri(url), ')') \n",
|
|
"\n",
|
|
"url = baseUrl + \"/devops-pipelines/impact.ipynb\"\n",
|
|
"open_nb(os.path.join(root, 'impact.ipynb'), params, redirect=False)\n",
|
|
"r('\\n\\n[Customer impact investigation notebook](', requote_uri(url), ')') \n",
|
|
"\n",
|
|
"# Scale unit health\n",
|
|
"jarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/DevOpsReports/TFS DevOpsReports\"\"\" \\\n",
|
|
" \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n",
|
|
" \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n",
|
|
" \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n",
|
|
" \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams\n",
|
|
"r('\\n\\n[Scale unit health dashboard (' + su + ', ' + service + ')](', requote_uri(jarvisLink), ')')\n",
|
|
"\n",
|
|
"\n",
|
|
"Markdown(content)\n",
|
|
"# print(content)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernel_info": {
|
|
"name": "python3"
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.4"
|
|
},
|
|
"nteract": {
|
|
"version": "0.14.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0
|
|
}
|