devops-pipelines/impact.ipynb

363 строки
14 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Impact Investigation\n",
"1. Run all cells.\n",
"1. View report at the bottom."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Notebook parameters (this cell carries the `parameters` tag).\n",
"\n",
"# Service and scale unit under investigation.\n",
"service = \"tfs\"\n",
"su = \"tfs-cus-1\"\n",
"hub = \"Build\"\n",
"locationName = \"tfsprodcus1\"  # overwritten later with the LocationName query result\n",
"\n",
"# Time range of the issue.\n",
"start = \"2019-07-20T16:00:00.0000000Z\"\n",
"end = \"2019-07-20T16:33:36.0000000Z\"\n",
"\n",
"# Links into the notebook project.\n",
"baseUrl = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\"\n",
"url = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\""
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false,
"tags": [
"parameters"
]
}
},
{
"cell_type": "code",
"source": [
"%%capture\n",
"# Install through the kernel's own interpreter so the packages land in the\n",
"# environment this notebook actually runs in; a bare `pip` found on PATH may\n",
"# belong to a different Python installation.\n",
"import sys\n",
"!{sys.executable} -m pip install nimport azure-kusto-notebooks"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Import the things we use\n",
"\n",
"# Note you can also use kql https://docs.microsoft.com/en-us/azure/data-explorer/kqlmagic\n",
"# %kql is single line magic\n",
"# %%kql is cell magic\n",
"\n",
"# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n",
"# https://ipython.readthedocs.io/en/stable/interactive/magics.html\n",
"from IPython.display import display, HTML, Markdown, Javascript, clear_output\n",
"\n",
"# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n",
"import pandas as pd\n",
"pd.options.display.html.table_schema = True\n",
"from pandas import Series, DataFrame\n",
"from datetime import datetime, timedelta, timezone\n",
"from urllib.parse import urlencode, quote_plus\n",
"from requests.utils import requote_uri\n",
"import time\n",
"import numpy as np\n",
"from matplotlib import pyplot as plt\n",
"from nimport.utils import tokenize, open_nb\n",
"import json\n",
"import os\n",
"import calendar as cal\n",
"import concurrent.futures\n",
"from azure.kusto.notebooks import utils as akn"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Parameters passed along with every query file that gets executed.\n",
"params = dict(\n",
"    su=su,\n",
"    start=start,\n",
"    end=end,\n",
"    url=url,\n",
"    baseUrl=baseUrl,\n",
"    service=service,\n",
")\n",
"\n",
"# Only prefix the folder name when the working directory is not already\n",
"# the devops-pipelines folder itself.\n",
"if os.path.basename(os.getcwd()) == 'devops-pipelines':\n",
"    root = ''\n",
"else:\n",
"    root = 'devops-pipelines'\n",
"queryPath = os.path.join(root, 'queries')"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Create the Kusto client for the vso cluster.\n",
"# Authentication is interactive: you will need to copy the token into a browser window for AAD auth.\n",
"client = akn.get_client(\n",
"    \"https://vso.kusto.windows.net\",\n",
"    \"VSO\",\n",
")"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Create the Kusto client for the icmcluster cluster (incident data).\n",
"# Authentication is interactive: you will need to copy the token into a browser window for AAD auth.\n",
"icm_client = akn.get_client(\n",
"    \"https://icmcluster.kusto.windows.net\",\n",
"    \"IcMDataWarehouse\",\n",
")"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Query files: shared ones sit directly in the queries folder, impact-specific\n",
"# ones in its `impact` sub-folder.\n",
"q_loc = os.path.join(queryPath, \"LocationName.csl\")\n",
"q_whatChanged = os.path.join(queryPath, \"WhatChanged.csl\")\n",
"q_vipSwap = os.path.join(queryPath, \"VIPSwap.csl\")\n",
"\n",
"impactPath = os.path.join(queryPath, \"impact\")\n",
"q_commands = os.path.join(impactPath, \"CommandsReason.csl\")\n",
"q_commandsAT = os.path.join(impactPath, \"CommandsAT.csl\")\n",
"q_commandsDb = os.path.join(impactPath, \"CommandsDb.csl\")\n",
"\n",
"with concurrent.futures.ThreadPoolExecutor() as executor:\n",
"    # materialize location name immediately as we need this for other queries\n",
"    p1 = executor.submit(akn.execute_file, client, 'VSO', q_loc, params)\n",
"    locationNameResult = akn.to_dataframe_from_future(p1)\n",
"    # Fail with a readable message instead of an opaque lookup error when the\n",
"    # location lookup comes back empty.\n",
"    if len(locationNameResult.index) == 0:\n",
"        raise ValueError(\"LocationName query returned no rows for params %r; check the notebook parameters\" % (params,))\n",
"    locationName = locationNameResult[\"Tenant\"].iloc[0]\n",
"    params[\"locationName\"] = locationName\n",
"\n",
"    # The remaining queries are independent of each other and run concurrently;\n",
"    # leaving the `with` block waits for all of them to finish.\n",
"    p2 = executor.submit(akn.execute_file, client, 'VSO', q_whatChanged, params)\n",
"    p3 = executor.submit(akn.execute_file, client, 'VSO', q_vipSwap, params)\n",
"\n",
"    p4 = executor.submit(akn.execute_file, client, 'VSO', q_commandsAT, params)\n",
"    p5 = executor.submit(akn.execute_file, client, 'VSO', q_commandsDb, params)\n",
"    p6 = executor.submit(akn.execute_file, client, 'VSO', q_commands, params)\n",
"\n",
"    p7 = executor.submit(akn.execute_file, icm_client, 'IcmDataWarehouse',\n",
"                         os.path.join(queryPath, 'ActiveIncidents.csl'), params)\n",
"\n",
"# Materialize every result as a DataFrame for the report cell.\n",
"q_whatChanged_df = akn.to_dataframe_from_future(p2)\n",
"\n",
"vipSwapResultDf = akn.to_dataframe_from_future(p3)\n",
"\n",
"q_commandsAT_df = akn.to_dataframe_from_future(p4)\n",
"\n",
"q_commandsDb_df = akn.to_dataframe_from_future(p5)\n",
"\n",
"q_commands_df = akn.to_dataframe_from_future(p6)\n",
"\n",
"q_activeIncidentsResultDf = akn.to_dataframe_from_future(p7)"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"def frequencyFor(commandsDf, reasonName):\n",
"    \"\"\"Return the Frequency of the first `reasonName` row, or 0 when there is no such row.\"\"\"\n",
"    matches = commandsDf[commandsDf[\"Reason\"] == reasonName][\"Frequency\"].values\n",
"    return matches[0] if len(matches) > 0 else 0\n",
"\n",
"\n",
"def reportVariance(commandsDf, scope, reason, hints=()):\n",
"    \"\"\"Print whether `reason` commands are unevenly spread across `scope`.\n",
"\n",
"    commandsDf -- frame with `Reason` and `Frequency` columns\n",
"    scope      -- label used in the printed messages (AT or Db)\n",
"    reason     -- 'failed' or 'slow' to analyse only those rows; any other value uses every row\n",
"    hints      -- extra lines printed before the data when variance is found\n",
"    \"\"\"\n",
"    print()\n",
"    print('Is it %s because of %s? =============================' % (reason, scope))\n",
"    if reason in (\"failed\", \"slow\"):\n",
"        data = commandsDf[commandsDf[\"Reason\"] == reason]\n",
"    else:\n",
"        data = commandsDf\n",
"    frequency = data[\"Frequency\"]\n",
"    # Coefficient of variation; a NaN result (fewer than two rows) compares False below.\n",
"    spread = frequency.std()/frequency.mean()\n",
"    if spread > 0.5:\n",
"        print(\"Found variance in %s's for %s commands\" % (scope, reason))\n",
"        for hint in hints:\n",
"            print(hint)\n",
"        print(data.head(30))\n",
"    else:\n",
"        print(\"Seems be same across %s's for %s commands\" % (scope, reason))\n",
"\n",
"\n",
"print('=' * 50)\n",
"print('Report!')\n",
"print('=' * 50, '\\n\\n')\n",
"\n",
"# jarvis params\n",
"# NOTE(review): akn.get_time presumably widens the window by the given offset -- confirm the unit.\n",
"jarvisParams = {\n",
"    'su': su,\n",
"    'start': akn.get_time(start, -10),\n",
"    'end': akn.get_time(end, 10),\n",
"    'service': service\n",
"}\n",
"\n",
"# jarvis\n",
"jarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/DevOpsReports/TFS DevOpsReports\"\"\" \\\n",
"    \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n",
"    \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n",
"    \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n",
"    \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams\n",
"print('Jarvis dashboard link:\\n', requote_uri(jarvisLink), '\\n')\n",
"\n",
"#\n",
"# vip swap\n",
"print()\n",
"print('Vip Swap? =============================')\n",
"if len(vipSwapResultDf.index) > 0:\n",
"    viptime = vipSwapResultDf[\"TIMESTAMP\"][0]\n",
"    starttime = akn.to_datetime(start)\n",
"    # Drop tzinfo on both sides so the two values can always be subtracted.\n",
"    delta = starttime.replace(tzinfo=None) - viptime.replace(tzinfo=None)\n",
"    if delta.total_seconds() > 0:\n",
"        print(\"\"\"VIP SWAP happened: %s days %s hours %s minutes ago (%s) (issue start: %s)\"\"\" % (delta.days, delta.seconds//3600, (delta.seconds//60) % 60, viptime, start))\n",
"    else:\n",
"        print('...no swaps recorded in the given time range')\n",
"else:\n",
"    print('...no swaps recorded in the given time range')\n",
"\n",
"# slow failed reason analysis\n",
"print()\n",
"print('Is it slow commands or failed commands? =============================')\n",
"freq = q_commands_df[\"Frequency\"]\n",
"coefficientOfVariance = freq.std()/freq.mean()\n",
"# A reason with no row is counted as 0 instead of raising IndexError.\n",
"failedCount = frequencyFor(q_commands_df, \"failed\")\n",
"slowCount = frequencyFor(q_commands_df, \"slow\")\n",
"# When exactly one of the two counts is zero the picture is clear even if the\n",
"# coefficient of variation cannot be computed (a single row gives NaN).\n",
"oneSided = (failedCount == 0) != (slowCount == 0)\n",
"reason = \"failed or slow\"\n",
"if coefficientOfVariance > 0.5 or oneSided:\n",
"    if failedCount > slowCount:\n",
"        reason = \"failed\"\n",
"    else:\n",
"        reason = \"slow\"\n",
"else:\n",
"    print(\"Slow and failed commands are too close, both might be contributing...\")\n",
"print(\"Probably due to %s commands; Failed - %s, Slow - %s\" % (reason, failedCount, slowCount))\n",
"\n",
"# slow failed reason for AT?\n",
"reportVariance(q_commandsAT_df, \"AT\", reason)\n",
"\n",
"# slow failed reason for Db?\n",
"reportVariance(q_commandsDb_df, \"Db\", reason, hints=[\n",
"    \"Suffix '%s' to database server name\" % (\".database.windows.net\"),\n",
"    \"Prefix '%s' to database name\" % (params[\"service\"] + \"_\" + params[\"locationName\"] + \"_\"),\n",
"])\n",
"\n",
"# what changed? analysis\n",
"print()\n",
"print('What changed? =============================')\n",
"if len(q_whatChanged_df.index) == 0:\n",
"    print(\"No relevant changes found...\")\n",
"else:\n",
"    up_prefix = \"\"\n",
"    mit_prefix = \"\"\n",
"    text = \"\"\n",
"    for index, row in q_whatChanged_df.iterrows():\n",
"        loweredTitle = row.title.lower()\n",
"        # A title mentioning both keywords is deliberately listed under both.\n",
"        if 'upgrade' in loweredTitle:\n",
"            if not up_prefix:\n",
"                up_prefix += \"Looks like, there's upgrade in progress...\\n\"\n",
"            text += \"\"\"%s %s %s \\n\"\"\" % (row.TIMESTAMP, row.title, row.buildNumber)\n",
"        if 'mitigation' in loweredTitle:\n",
"            if not mit_prefix:\n",
"                mit_prefix += \"Looks like, there are some mitigations by health agent...\\n\"\n",
"            # One line per row, same format as the upgrade rows.\n",
"            text += \"\"\"%s %s %s \\n\"\"\" % (row.TIMESTAMP, row.title, row.buildNumber)\n",
"\n",
"    if text:\n",
"        print(up_prefix + mit_prefix + text)\n",
"    else:\n",
"        # Changes exist but none is an upgrade or mitigation: show them as they are.\n",
"        print(q_whatChanged_df)\n",
"\n",
"# active incidents?\n",
"print()\n",
"print('Active incidents? =============================')\n",
"# Incidents raised by the customer impact monitor itself do not count as \"other\" incidents.\n",
"otherIncidentsCount = sum(\n",
"    1 for index, row in q_activeIncidentsResultDf.iterrows()\n",
"    if row.Title.find(\"TFS Customer Impact Monitor\") == -1\n",
")\n",
"\n",
"if otherIncidentsCount > 0:\n",
"    print(\"We found some incidents during the time period, check if they are related...\")\n",
"    print(\"ICM link to copy - \" + \"https://icm.ad.msft.net/imp/v3/incidents/details/INCIDENTID/home\")\n",
"    print(q_activeIncidentsResultDf[['IncidentId','Severity','Title']])\n",
"else:\n",
"    print(\"No active incidents that could be related are found...\")"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
}
],
"metadata": {
"kernel_info": {
"name": "python3"
},
"kernelspec": {
"name": "python3",
"language": "python",
"display_name": "Python 3"
},
"language_info": {
"name": "python",
"version": "3.7.4",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"nteract": {
"version": "0.14.5"
}
},
"nbformat": 4,
"nbformat_minor": 0
}