devops-pipelines/sla.ipynb

315 строки
10 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SLA Investigation\n",
"1. Run all cells! (click on Menu > Cell > Run All Cells)\n",
"1. View report at the bottom."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false,
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"triggerTime = \"2019-10-15T20:21:54.0330000Z\"\n",
"scaleUnit = \"pipelines-ghub-eus2-2\"\n",
"service = \"pipelines\"\n",
"lookback = \"1h\"\n",
"region = \"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"%%capture \n",
"\n",
"# install packages, setup workspace root\n",
"!pip install --upgrade pip azure-kusto-notebooks\n",
"import os\n",
"import sys\n",
"import datetime\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"pd.options.display.html.table_schema = True\n",
"import concurrent.futures\n",
"from azure.kusto.notebooks import utils as akn\n",
"\n",
"# cwd should be workspace root\n",
"if os.path.basename(os.getcwd()) == 'devops-pipelines':\n",
" os.chdir(os.pardir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# authenticate kusto client\n",
"# you will need to copy the token into a browser window for AAD auth. \n",
"client = akn.get_client('https://vso.kusto.windows.net')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# find orchestrations that violate SLA\n",
"params = {\n",
" 'TriggerTime': akn.to_kusto_datetime(triggerTime),\n",
" 'Lookback': akn.to_kusto_timespan(lookback),\n",
" 'Service': '\"' + service + '\"', \n",
" 'Region': '\"' + region + '\"',\n",
" 'ScaleUnit': '\"' + scaleUnit + '\"'\n",
"}\n",
"query = os.path.join('devops-pipelines', 'queries', 'sla', 'SLADurationAnalysis.csl')\n",
"violations = akn.execute_file(client, database='VSO', path=query, params=params)\n",
"# violations"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# collect problematic orchestration ids\n",
"result = violations.primary_results[0]\n",
"oid_column_index = next((c.ordinal for c in result.columns if c.column_name == 'OrchestrationId'), None)\n",
"su_column_index = next((c.ordinal for c in result.columns if c.column_name == 'ScaleUnit'), None)\n",
"\n",
"# group\n",
"by_su = {}\n",
"for r in result.rows:\n",
" su = r[su_column_index]\n",
" oid = r[oid_column_index]\n",
" l = by_su.get(su, [])\n",
" by_su[su] = l\n",
" l.append(oid)\n",
"\n",
"max_scale_units = []\n",
"max_problems = 0\n",
"for k,v in by_su.items():\n",
" c = len(v)\n",
" if c > max_problems:\n",
" max_problems = c\n",
" max_scale_units = [k]\n",
" elif c == max_problems:\n",
" max_scale_units.append(k)\n",
"max_scale_units.sort()\n",
"\n",
"# for su, oids in by_su.items():\n",
"# print(su)\n",
"# for oid in oids:\n",
"# print(' ', oid)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"# collect visualization data sets\n",
"query = os.path.join('devops-pipelines', 'queries', 'sla', 'SLAVisualization.csl')\n",
"with concurrent.futures.ThreadPoolExecutor() as executor:\n",
" hfs = [executor.submit(akn.execute_file, client, 'VSO', query, \n",
" {\n",
" 'ScaleUnit': '\"' + r[su_column_index] + '\"', \n",
" 'OrchestrationId': '\"' + r[oid_column_index] + '\"'\n",
" }) for r in result.rows]\n",
" histories = [h.result() for h in concurrent.futures.as_completed(hfs)]\n",
"\n",
"# convert to data frames\n",
"primary_results = [h.primary_results[0] for h in histories]\n",
"dataframes = None\n",
"with concurrent.futures.ThreadPoolExecutor() as executor:\n",
" dataframe_futures = [executor.submit(akn.to_dataframe, r) for r in primary_results]\n",
" dataframes = [dff.result() for dff in concurrent.futures.as_completed(dataframe_futures)]\n",
"histories = None\n",
"\n",
"# try to filter out false positives? at least a certain number of phases must have been recorded.\n",
"required_phases = ('RunAgentJob.SendJob', 'RunAgentJob.JobCompleted')\n",
"filtered_dataframes = [df for df in dataframes if all([p in df['PhaseName'].values for p in required_phases])]\n",
"number_of_false_positives = len(dataframes) - len(filtered_dataframes)\n",
"dataframes = filtered_dataframes\n",
"plans_out_of_sla = [df['PlanId'].iat[0] for df in dataframes]\n",
"number_of_violations = len(dataframes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"worst_phaseName = ''\n",
"worst_count = 0\n",
"worst_team = ''\n",
"\n",
"if dataframes:\n",
" # what was the worst phase?\n",
" combined = pd.concat(dataframes, ignore_index=True)\n",
" df = combined.loc[combined['Level'] == 2].groupby(['PhaseName']).size().to_frame('Count').nlargest(1, 'Count')\n",
" if len(df.index) > 0:\n",
" worst_phaseName = df.index[0]\n",
" worst_count = df.iat[0, 0]\n",
" worst_team = worst_phaseName.split('.')[0]\n",
" \n",
" # what was the worst plan?\n",
" violations_df = akn.to_dataframe(violations.primary_results[0])\n",
" df = violations_df.groupby(['PlanId']).size().to_frame('Count').nlargest(1, 'Count')\n",
" plan_with_most_violations = df.index[0]\n",
" plan_with_most_violations_count = df.iat[0, 0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"if number_of_false_positives:\n",
" print(number_of_false_positives, 'plans are likely missing kusto data and were ignored.')\n",
"if number_of_violations <= 0:\n",
" print('no problems detected')\n",
"else:\n",
" for su in max_scale_units:\n",
" print(max_problems, 'of the problems were in', su)\n",
" \n",
" print(number_of_violations, \n",
" 'plans' if number_of_violations > 1 else 'plan', \n",
" 'had no apparent data problems and', \n",
" 'are' if number_of_violations > 1 else 'is', \n",
" 'out of SLA.')\n",
" \n",
" if plan_with_most_violations in plans_out_of_sla:\n",
" print(plan_with_most_violations, 'had the most violations with', plan_with_most_violations_count)\n",
" \n",
" if worst_phaseName:\n",
" print('\"' + worst_phaseName + '\"', 'was the slowest phase in', worst_count, \n",
" 'of the', number_of_violations, 'SLA violations.')\n",
" \n",
" print ('\\nConclusion:')\n",
" if number_of_violations > 5: \n",
" print('This is likely a real problem. Open icm against scale units:', max_scale_units)\n",
" print('Initially route it to: ', worst_team)\n",
" else: \n",
" print('Too much uncertainty -- do not open any ICMs.')\n",
" \n",
" if number_of_false_positives and float(number_of_false_positives) / float(max_problems) > .5:\n",
" for su in max_scale_units:\n",
" print(su, 'might be unhealthy based on the number of plans missing kusto data.')\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"inputHidden": false,
"outputHidden": false
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"plt.rcdefaults()\n",
"\n",
"if dataframes:\n",
" number_of_graphs = min(25, len(dataframes))\n",
" fig, axes = plt.subplots(nrows=number_of_graphs, ncols=1, figsize=(8, 6 * number_of_graphs), constrained_layout=True)\n",
" for i in range(number_of_graphs):\n",
" df = dataframes[i]\n",
" ax = axes[i] if number_of_graphs > 1 else axes\n",
" ax.axhline(0, color='k')\n",
"\n",
" x = df['PhaseName']\n",
" xpos = np.arange(len(x))\n",
" y = df['PercentDifference']\n",
" plan_id = df['PlanId'].iloc[0]\n",
" violation_row = violations_df.loc[violations_df['PlanId'] == plan_id]\n",
" title = '\\n'.join([\n",
" 'plan id:' + plan_id,\n",
" 'scale unit:' + str(violation_row['ScaleUnit'].iloc[0]),\n",
" 'definition:' + str(df['DefinitionName'].iloc[0]),\n",
" 'plan duration: ' + str(violation_row['PlanDuration'].iloc[0]),\n",
" 'sla duration: ' + str(violation_row['TotalSLADuration'].iloc[0]),\n",
" ])\n",
" ax.title.set_text(title)\n",
"\n",
" ax.bar(x=xpos, height=y)\n",
" ax.set_xticks(xpos)\n",
" ax.set_xticklabels(x, rotation=45, ha=\"right\")\n",
"\n",
"# output_filename = 'analysis.svg'\n",
"# plt.savefig(output_filename, format='svg')"
]
}
],
"metadata": {
"kernel_info": {
"name": "python3"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"nteract": {
"version": "0.15.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}