315 строки
10 KiB
Plaintext
315 строки
10 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# SLA Investigation\n",
|
|
"1. Run all cells! (click on Menu > Cell > Run All Cells)\n",
|
|
"1. View report at the bottom."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false,
|
|
"tags": [
|
|
"parameters"
|
|
]
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"triggerTime = \"2019-10-15T20:21:54.0330000Z\"\n",
|
|
"scaleUnit = \"pipelines-ghub-eus2-2\"\n",
|
|
"service = \"pipelines\"\n",
|
|
"lookback = \"1h\"\n",
|
|
"region = \"\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"%%capture \n",
|
|
"\n",
|
|
"# install packages, setup workspace root\n",
|
|
"!pip install --upgrade pip azure-kusto-notebooks\n",
|
|
"import os\n",
|
|
"import sys\n",
|
|
"import datetime\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import matplotlib\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"pd.options.display.html.table_schema = True\n",
|
|
"import concurrent.futures\n",
|
|
"from azure.kusto.notebooks import utils as akn\n",
|
|
"\n",
|
|
"# cwd should be workspace root\n",
|
|
"if os.path.basename(os.getcwd()) == 'devops-pipelines':\n",
|
|
" os.chdir(os.pardir)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# authenticate kusto client\n",
|
|
"# you will need to copy the token into a browser window for AAD auth. \n",
|
|
"client = akn.get_client('https://vso.kusto.windows.net')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# find orchestrations that violate SLA\n",
|
|
"params = {\n",
|
|
" 'TriggerTime': akn.to_kusto_datetime(triggerTime),\n",
|
|
" 'Lookback': akn.to_kusto_timespan(lookback),\n",
|
|
" 'Service': '\"' + service + '\"', \n",
|
|
" 'Region': '\"' + region + '\"',\n",
|
|
" 'ScaleUnit': '\"' + scaleUnit + '\"'\n",
|
|
"}\n",
|
|
"query = os.path.join('devops-pipelines', 'queries', 'sla', 'SLADurationAnalysis.csl')\n",
|
|
"violations = akn.execute_file(client, database='VSO', path=query, params=params)\n",
|
|
"# violations"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# collect problematic orchestration ids\n",
|
|
"result = violations.primary_results[0]\n",
|
|
"oid_column_index = next((c.ordinal for c in result.columns if c.column_name == 'OrchestrationId'), None)\n",
|
|
"su_column_index = next((c.ordinal for c in result.columns if c.column_name == 'ScaleUnit'), None)\n",
|
|
"\n",
|
|
"# group\n",
|
|
"by_su = {}\n",
|
|
"for r in result.rows:\n",
|
|
" su = r[su_column_index]\n",
|
|
" oid = r[oid_column_index]\n",
|
|
" l = by_su.get(su, [])\n",
|
|
" by_su[su] = l\n",
|
|
" l.append(oid)\n",
|
|
"\n",
|
|
"max_scale_units = []\n",
|
|
"max_problems = 0\n",
|
|
"for k,v in by_su.items():\n",
|
|
" c = len(v)\n",
|
|
" if c > max_problems:\n",
|
|
" max_problems = c\n",
|
|
" max_scale_units = [k]\n",
|
|
" elif c == max_problems:\n",
|
|
" max_scale_units.append(k)\n",
|
|
"max_scale_units.sort()\n",
|
|
"\n",
|
|
"# for su, oids in by_su.items():\n",
|
|
"# print(su)\n",
|
|
"# for oid in oids:\n",
|
|
"# print(' ', oid)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# collect visualization data sets\n",
|
|
"query = os.path.join('devops-pipelines', 'queries', 'sla', 'SLAVisualization.csl')\n",
|
|
"with concurrent.futures.ThreadPoolExecutor() as executor:\n",
|
|
" hfs = [executor.submit(akn.execute_file, client, 'VSO', query, \n",
|
|
" {\n",
|
|
" 'ScaleUnit': '\"' + r[su_column_index] + '\"', \n",
|
|
" 'OrchestrationId': '\"' + r[oid_column_index] + '\"'\n",
|
|
" }) for r in result.rows]\n",
|
|
" histories = [h.result() for h in concurrent.futures.as_completed(hfs)]\n",
|
|
"\n",
|
|
"# convert to data frames\n",
|
|
"primary_results = [h.primary_results[0] for h in histories]\n",
|
|
"dataframes = None\n",
|
|
"with concurrent.futures.ThreadPoolExecutor() as executor:\n",
|
|
" dataframe_futures = [executor.submit(akn.to_dataframe, r) for r in primary_results]\n",
|
|
" dataframes = [dff.result() for dff in concurrent.futures.as_completed(dataframe_futures)]\n",
|
|
"histories = None\n",
|
|
"\n",
|
|
"# try to filter out false positives? at least a certain number of phases must have been recorded.\n",
|
|
"required_phases = ('RunAgentJob.SendJob', 'RunAgentJob.JobCompleted')\n",
|
|
"filtered_dataframes = [df for df in dataframes if all([p in df['PhaseName'].values for p in required_phases])]\n",
|
|
"number_of_false_positives = len(dataframes) - len(filtered_dataframes)\n",
|
|
"dataframes = filtered_dataframes\n",
|
|
"plans_out_of_sla = [df['PlanId'].iat[0] for df in dataframes]\n",
|
|
"number_of_violations = len(dataframes)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"worst_phaseName = ''\n",
|
|
"worst_count = 0\n",
|
|
"worst_team = ''\n",
|
|
"\n",
|
|
"if dataframes:\n",
|
|
" # what was the worst phase?\n",
|
|
" combined = pd.concat(dataframes, ignore_index=True)\n",
|
|
" df = combined.loc[combined['Level'] == 2].groupby(['PhaseName']).size().to_frame('Count').nlargest(1, 'Count')\n",
|
|
" if len(df.index) > 0:\n",
|
|
" worst_phaseName = df.index[0]\n",
|
|
" worst_count = df.iat[0, 0]\n",
|
|
" worst_team = worst_phaseName.split('.')[0]\n",
|
|
" \n",
|
|
" # what was the worst plan?\n",
|
|
" violations_df = akn.to_dataframe(violations.primary_results[0])\n",
|
|
" df = violations_df.groupby(['PlanId']).size().to_frame('Count').nlargest(1, 'Count')\n",
|
|
" plan_with_most_violations = df.index[0]\n",
|
|
" plan_with_most_violations_count = df.iat[0, 0]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"if number_of_false_positives:\n",
|
|
" print(number_of_false_positives, 'plans are likely missing kusto data and were ignored.')\n",
|
|
"if number_of_violations <= 0:\n",
|
|
" print('no problems detected')\n",
|
|
"else:\n",
|
|
" for su in max_scale_units:\n",
|
|
" print(max_problems, 'of the problems were in', su)\n",
|
|
" \n",
|
|
" print(number_of_violations, \n",
|
|
" 'plans' if number_of_violations > 1 else 'plan', \n",
|
|
" 'had no apparent data problems and', \n",
|
|
" 'are' if number_of_violations > 1 else 'is', \n",
|
|
" 'out of SLA.')\n",
|
|
" \n",
|
|
" if plan_with_most_violations in plans_out_of_sla:\n",
|
|
" print(plan_with_most_violations, 'had the most violations with', plan_with_most_violations_count)\n",
|
|
" \n",
|
|
" if worst_phaseName:\n",
|
|
" print('\"' + worst_phaseName + '\"', 'was the slowest phase in', worst_count, \n",
|
|
" 'of the', number_of_violations, 'SLA violations.')\n",
|
|
" \n",
|
|
" print ('\\nConclusion:')\n",
|
|
" if number_of_violations > 5: \n",
|
|
" print('This is likely a real problem. Open icm against scale units:', max_scale_units)\n",
|
|
" print('Initially route it to: ', worst_team)\n",
|
|
" else: \n",
|
|
" print('Too much uncertainty -- do not open any ICMs.')\n",
|
|
" \n",
|
|
" if number_of_false_positives and float(number_of_false_positives) / float(max_problems) > .5:\n",
|
|
" for su in max_scale_units:\n",
|
|
" print(su, 'might be unhealthy based on the number of plans missing kusto data.')\n",
|
|
" "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"inputHidden": false,
|
|
"outputHidden": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"%matplotlib inline\n",
|
|
"plt.rcdefaults()\n",
|
|
"\n",
|
|
"if dataframes:\n",
|
|
" number_of_graphs = min(25, len(dataframes))\n",
|
|
" fig, axes = plt.subplots(nrows=number_of_graphs, ncols=1, figsize=(8, 6 * number_of_graphs), constrained_layout=True)\n",
|
|
" for i in range(number_of_graphs):\n",
|
|
" df = dataframes[i]\n",
|
|
" ax = axes[i] if number_of_graphs > 1 else axes\n",
|
|
" ax.axhline(0, color='k')\n",
|
|
"\n",
|
|
" x = df['PhaseName']\n",
|
|
" xpos = np.arange(len(x))\n",
|
|
" y = df['PercentDifference']\n",
|
|
" plan_id = df['PlanId'].iloc[0]\n",
|
|
" violation_row = violations_df.loc[violations_df['PlanId'] == plan_id]\n",
|
|
" title = '\\n'.join([\n",
|
|
" 'plan id:' + plan_id,\n",
|
|
" 'scale unit:' + str(violation_row['ScaleUnit'].iloc[0]),\n",
|
|
" 'definition:' + str(df['DefinitionName'].iloc[0]),\n",
|
|
" 'plan duration: ' + str(violation_row['PlanDuration'].iloc[0]),\n",
|
|
" 'sla duration: ' + str(violation_row['TotalSLADuration'].iloc[0]),\n",
|
|
" ])\n",
|
|
" ax.title.set_text(title)\n",
|
|
"\n",
|
|
" ax.bar(x=xpos, height=y)\n",
|
|
" ax.set_xticks(xpos)\n",
|
|
" ax.set_xticklabels(x, rotation=45, ha=\"right\")\n",
|
|
"\n",
|
|
"# output_filename = 'analysis.svg'\n",
|
|
"# plt.savefig(output_filename, format='svg')"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernel_info": {
|
|
"name": "python3"
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.4"
|
|
},
|
|
"nteract": {
|
|
"version": "0.15.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|