This commit is contained in:
yash 2019-07-25 17:29:54 -04:00
Родитель 961ba3dc03
Коммит 7c1ff16047
14 изменённых файлов: 888 добавлений и 344 удалений

331
.gitignore поставляемый
Просмотреть файл

@ -1,330 +1,3 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
**/Properties/launchSettings.json
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# CodeRush
.cr/
# Python Tools for Visual Studio (PTVS)
__pycache__/
Kqlmagic_temp_files
.ipynb_checkpoints/*
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/

Просмотреть файл

@ -1,3 +1,5 @@
# Devops-pipelines
Warehouse of notebooks that contain queries to help in root-causing pipeline delays in Azure DevOps
# Contributing

579
delays.ipynb Normal file
Просмотреть файл

@ -0,0 +1,579 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"## Orchestration delays investigation\n",
"This notebook runs various Kusto queries and produces a report.\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# These are just defaults will be overwritten if you use https://github.com/yaananth/nimport\n",
"su=\"tfs-cus-1\"\n",
"start=\"2019-07-20T16:00:00.0000000Z\"\n",
"end=\"2019-07-20T16:33:36.0000000Z\"\n",
"url=\"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\"\n",
"baseUrl=\"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\"\n",
"service=\"tfs\"\n",
"hub=\"Build\""
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false,
"tags": [
"parameters"
]
}
},
{
"cell_type": "code",
"source": [
"# Import the things we use\n",
"\n",
"# Note you can also use kql https://docs.microsoft.com/en-us/azure/data-explorer/kqlmagic\n",
"# %kql is single line magic\n",
"# %%kql is cell magic\n",
"\n",
"# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n",
"# https://ipython.readthedocs.io/en/stable/interactive/magics.html\n",
"from IPython.display import display, HTML, Markdown, Javascript, clear_output\n",
"\n",
"# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n",
"import pandas as pd\n",
"from pandas import Series, DataFrame\n",
"from datetime import datetime, timedelta, timezone\n",
"from urllib.parse import urlencode, quote_plus\n",
"from requests.utils import requote_uri\n",
"import time\n",
"import numpy as np\n",
"from matplotlib import pyplot as plt\n",
"from nimport.utils import tokenize, open_nb\n",
"import json\n",
"import os"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"params = {\n",
" \"su\": su,\n",
" \"start\": start,\n",
" \"end\": end,\n",
" \"url\": url,\n",
" \"baseUrl\": baseUrl,\n",
" \"service\": service,\n",
" \"hub\": hub\n",
"}\n",
"root = 'pipeline-delays' if os.path.basename(os.getcwd()) != 'pipeline-delays' else ''\n",
" "
],
"outputs": [],
"execution_count": 4,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"%load_ext Kqlmagic"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# kusto/python utilities\n",
"def getTime(timestamp, d):\n",
" return int((time.mktime(getDateTime(timestamp).timetuple()) + (d * 60)) * 1000);\n",
" \n",
"def getDateTime(timestamp):\n",
"    s = timestamp[:23] + 'Z' # truncate to 3 fractional-second digits so strptime's %f can parse\n",
" for f in (\"%Y-%m-%d %H:%M:%S.%fZ\", \"%Y-%m-%dT%H:%M:%S.%fZ\"):\n",
" try:\n",
" return datetime.strptime(s, f);\n",
" except:\n",
" pass\n",
" \n",
"def getKustoQuery(csl_filename, params):\n",
"\n",
" return tokenize(os.path.join(root, 'queries', csl_filename), params)\n",
" \n",
"states = {\n",
" \"ja\": \"ja\",\n",
" \"healthagent\": \"healthagent\"\n",
"};\n",
"\n",
"state = \"\";"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Active incidents?\n",
"# ActiveIncidents.csl\n",
"%kql AzureDataExplorer://tenant=\"Microsoft.com\";code;cluster='Icmcluster';database='IcMDataWarehouse' \n",
"q_activeIncidents = getKustoQuery(\"ActiveIncidents.csl\", params)\n",
"q_activeIncidentsResult = %kql -query q_activeIncidents\n",
"q_activeIncidentsResultDf = q_activeIncidentsResult.to_dataframe()"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Note: KQL has a bug where it copies the wrong code (the previous cluster's auth code instead of current) when we try to auth to multiple clusters\n",
"# Copy the code manually instead, if you already closed the dialog, copy code and go to https://microsoft.com/devicelogin\n",
"%kql AzureDataExplorer://tenant=\"Microsoft.com\";code;cluster='VSO';database='VSO'"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# AffectedAccounts.csl\n",
"q_affectedAccounts = getKustoQuery(\"AffectedAccounts.csl\", params)\n",
"q_affectedAccountsResult = %kql -query q_affectedAccounts\n",
"q_affectedAccountsResultDf = q_affectedAccountsResult.to_dataframe()"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Abusers.csl\n",
"q_abusers = getKustoQuery(\"Abusers.csl\", {})\n",
"Abusers = %kql -query q_abusers\n",
"abusersDf = Abusers.to_dataframe();\n",
"finalabusersList = np.intersect1d(q_affectedAccountsResultDf[\"HostId\"].values, abusersDf[\"HostId\"].values);"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"%%kql\n",
"locationNameResult <<\n",
"let _su = su;\n",
"let _service = service;\n",
"ActivityLog\n",
"| where ScaleUnit == _su\n",
"| where Service =~ _service\n",
"| project Tenant\n",
"| take 1"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"locationName = locationNameResult[0]['Tenant']\n",
"params[\"locationName\"] = locationName"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# AffectedAccounts.csl\n",
"q_affAccounts = getKustoQuery(\"AffectedAccounts.csl\", params)\n",
"q_affAccounts_r = %kql -query q_affAccounts\n",
"q_affAccounts_df = q_affAccounts_r.to_dataframe();"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# DelayedAccountsAreAbusers.csl\n",
"q_delayedAccountsAreAbusers = getKustoQuery(\"DelayedAccountsAreAbusers.csl\", params)\n",
"q_delayedAccountsAreAbusers_r = %kql -query q_delayedAccountsAreAbusers\n",
"q_delayedAccountsAreAbusers_df = q_delayedAccountsAreAbusers_r.to_dataframe()"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# WhatChanged.csl\n",
"q_whatChanged = getKustoQuery(\"WhatChanged.csl\", params)\n",
"q_whatChanged_r = %kql -query q_whatChanged\n",
"q_whatChanged_df = q_whatChanged_r.to_dataframe();"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# HealthAgentActions.csl\n",
"q_haActions = getKustoQuery(\"HealthAgentActions.csl\", params)\n",
"q_haActions_r = %kql -query q_haActions\n",
"q_haActions_df = q_haActions_r.to_dataframe();"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# VIPSwap.csl\n",
"q_vipSwap = getKustoQuery(\"VIPSwap.csl\", params)\n",
"vipSwapResult = %kql -query q_vipSwap\n",
"vipSwapResultDf = vipSwapResult.to_dataframe()"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# WhatDelayed.csl\n",
"q_whatDelayed = getKustoQuery(\"WhatDelayed.csl\", params)\n",
"q_whatDelayedResult = %kql -query q_whatDelayed\n",
"q_whatDelayedResultDf = q_whatDelayedResult.to_dataframe()"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Load.csl\n",
"q_load = getKustoQuery(\"Load.csl\", params)\n",
"q_loadResult = %kql -query q_load\n",
"q_loadResultDf = q_loadResult.to_dataframe()"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# https://plot.ly/python/graphing-multiple-chart-types/\n",
"\n",
"# https://plot.ly/python/line-and-scatter/\n",
"import plotly.graph_objs as go\n",
"import plotly\n",
"plotly.offline.init_notebook_mode(connected=True)"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# visualize delays\n",
"delays = go.Scatter(\n",
" x=q_affAccounts_df[\"PreciseTimeStamp\"],\n",
" y=q_affAccounts_df[\"MessageDelayInSeconds\"],\n",
" mode = 'lines',\n",
" name = 'Delays in seconds',\n",
" text= q_affAccounts_df['Name']\n",
")\n",
"\n",
"changed = go.Scatter(\n",
" x=q_whatChanged_df[\"TIMESTAMP\"],\n",
" y=np.repeat(50, len(q_whatChanged_df[\"TIMESTAMP\"].values)),\n",
" mode = 'lines+markers',\n",
" name = 'What Changed',\n",
" text = q_whatChanged_df[\"Name\"],\n",
" marker=dict(\n",
" size=32,\n",
" color = np.random.randn(500),\n",
" colorscale='Viridis'\n",
" )\n",
")\n",
"\n",
"mitigations = go.Scatter(\n",
" x=q_haActions_df[\"PreciseTimeStamp\"],\n",
" y=np.repeat(50, len(q_haActions_df[\"PreciseTimeStamp\"].values)),\n",
" mode = 'markers',\n",
" name = 'Mitigations',\n",
" text = q_haActions_df[[\"MitigationName\", \"RoleInstance\"]].apply(lambda x: ''.join(x), axis=1),\n",
" marker = dict(\n",
" size = 10,\n",
" color = 'rgba(152, 0, 0, .8)',\n",
" line = dict(\n",
" width = 2,\n",
" color = 'rgb(0, 0, 0)'\n",
" )\n",
" )\n",
")\n",
"\n",
"data = [delays, changed, mitigations]\n",
"plotly.offline.iplot(data)"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"print('=' * 50)\n",
"print('Report!')\n",
"print('=' * 50, '\\n\\n')\n",
"\n",
"jarvisParams = {'su': su, 'start': getTime(start, -10), 'end': getTime(end, 10), 'service': service }\n",
"\n",
"# jarvis\n",
"jarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/DevOpsReports/TFS DevOpsReports\"\"\" \\\n",
" \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n",
" \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n",
" \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n",
" \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams;\n",
"print('Jarvis dashboard link:\\n', requote_uri(jarvisLink), '\\n')\n",
"\n",
"# what changed? analysis\n",
"print()\n",
"print('What changed? =============================')\n",
"if(len(q_whatChanged_df.index) == 0):\n",
" print(\"No relevant changes found...\")\n",
"else:\n",
" up_prefix = \"\";\n",
" mit_prefix = \"\";\n",
" text = \"\";\n",
" for index, row in q_whatChanged_df.iterrows():\n",
" if(row.title.lower().find('upgrade') != -1):\n",
" if not up_prefix:\n",
" up_prefix += \"Looks like, there's upgrade in progress...\\n\\n\";\n",
" text += \"\"\"%s %s %s \\n\"\"\" % (row.TIMESTAMP, row.title, row.buildNumber);\n",
" if(row.title.lower().find('mitigation') != -1):\n",
" if not mit_prefix:\n",
" mit_prefix += \"Looks like, there are some mitigations by health agent...\\n\\n\";\n",
" state += states[\"healthagent\"];\n",
" text += \"\"\"%s %s %s\"\"\" % (row.TIMESTAMP, row.title, row.buildNumber);\n",
" \n",
" if text:\n",
" print(up_prefix + mit_prefix + text)\n",
" else:\n",
" print(q_whatChanged_df)\n",
" \n",
"# active incidents?\n",
"print()\n",
"print('Active incidents? =============================')\n",
"otherIncidentsCount = 0;\n",
"for index, row in q_activeIncidentsResultDf.iterrows():\n",
" if(row.Title.find(\"Kalypso: Build Orchestrator Delays ICM\") == -1):\n",
" otherIncidentsCount+=1;\n",
" \n",
"if(otherIncidentsCount > 0):\n",
" print(\"We found some incidents during the time period, check if they are related...\")\n",
" # styling\n",
" def make_clickable(url, text):\n",
" return '{0}'.format(url)\n",
"\n",
" newDf = q_activeIncidentsResultDf.assign(URL=[*map(lambda x: make_clickable(\"\"\"https://icm.ad.msft.net/imp/v3/incidents/details/%s/home\"\"\" % (x), \"ICMLink\"), q_activeIncidentsResultDf.IncidentId)])\n",
" print(\"ICM link to copy - \" + \"https://icm.ad.msft.net/imp/v3/incidents/details/INCIDENTID/home\")\n",
" print(newDf[['IncidentId','Severity','Title']])\n",
"else:\n",
" print(\"No active incidents that could be related are found...\")\n",
"\n",
"print()\n",
"print('Queue Load =============================')\n",
"ar = q_loadResultDf[q_loadResultDf[\"Name\"] == \"DTPlanQueued\"].values[:, 2]\n",
"queuedGreatherThan100 = np.where(ar > 100)\n",
"if len(queuedGreatherThan100[0]) > 0:\n",
" print(\"\"\"More than 100 requests are queued in 1 minute (Actual: %s)...could be a load issue\"\"\" % (np.amax(ar)))\n",
"else: \n",
" print('...everything looks good?')\n",
"# ja load\n",
"print()\n",
"print('JA Load =============================')\n",
"q_whatDelayedResultPendingJobsDf = q_whatDelayedResultDf[q_whatDelayedResultDf.Pivot == \"\\JobService(_Total)\\Total Pending Jobs\"]\n",
"pendingGreaterThan10Result = np.where(q_whatDelayedResultPendingJobsDf.avg_CounterValue.values > 10)\n",
"if len(pendingGreaterThan10Result[0]) > 0:\n",
" print(\"We are seeing high pending jobs from job agent (highest being %s total pending jobs in 1 min), could be an issue with job agents...\" % (np.max(q_whatDelayedResultPendingJobsDf.avg_CounterValue.values))) \n",
" # update state\n",
" state += states[\"ja\"]\n",
" \n",
" open_nb(os.path.join(root, 'ja.ipynb'), params, redirect=False)\n",
" jaUrl = baseUrl + \"/pipeline-delays/ja.ipynb\"\n",
" print('Investigate job agent related issues by going here:', requote_uri(jaUrl), '\\n')\n",
"\n",
" jaJarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/PlatformViews/Compute-JA\"\"\" \\\n",
" \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n",
" \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n",
" \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n",
" \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams;\n",
" print('Jarvis dashboard link for job agents:\\n', requote_uri(jaJarvisLink), '\\n')\n",
"else:\n",
" print('...everything looks good?')\n",
" \n",
"# abuse detection?\n",
"print()\n",
"print('What users are impacted? =============================')\n",
"if len(finalabusersList) > 0:\n",
" print('Found abusers - !!')\n",
"print(q_delayedAccountsAreAbusers_df)\n",
" \n",
"#\n",
"# vip swap\n",
"print()\n",
"print('Vip Swap? =============================')\n",
"if len(vipSwapResultDf.index) > 0:\n",
" viptime = vipSwapResultDf[\"TIMESTAMP\"][0]\n",
" starttime = getDateTime(start)\n",
" delta = starttime.replace(tzinfo=None) - viptime.replace(tzinfo=None)\n",
" if delta.total_seconds() > 0:\n",
" print(\"\"\"VIP SWAP happened: %s days %s hours %s minutes ago (%s) (issue start: %s)\"\"\" % (delta.days, delta.seconds//3600, (delta.seconds//60) % 60, viptime, start))\n",
" else:\n",
" print('...no swaps recorded in the given time range')\n",
"else:\n",
" print('...no swaps recorded in the given time range')\n",
"\n",
"# more analysis? \n",
"print()\n",
"print('More analysis =============================')\n",
"if os.path.exists(\"SLAInvestigation/sla.ipynb\"):\n",
" slaUrl = baseUrl + \"/SLAInvestigation/sla.ipynb\"\n",
" print('Investigate SLA by going here:', requote_uri(slaUrl), '\\n') "
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
}
],
"metadata": {
"kernel_info": {
"name": "python3"
},
"kernelspec": {
"name": "python3",
"language": "python",
"display_name": "Python 3"
},
"language_info": {
"name": "python",
"version": "3.7.4",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"nteract": {
"version": "0.14.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

121
ja.ipynb Normal file
Просмотреть файл

@ -0,0 +1,121 @@
{
"cells": [
{
"cell_type": "code",
"source": [
"su = \"tfs-cus-1\"\n",
"start = \"2019-07-20T16:00:00.0000000Z\"\n",
"end = \"2019-07-20T16:33:36.0000000Z\"\n",
"url = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\"\n",
"baseUrl = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\"\n",
"locationName = \"tfsprodcus1\"\n",
"service = \"tfs\"\n",
"hub = \"Build\""
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false,
"tags": [
"parameters"
]
}
},
{
"cell_type": "code",
"source": [
"# This isn't needed if you are bootstrapping\n",
"#!pip install Kqlmagic --no-cache-dir --upgrade\n",
"!pip install nimport --no-cache-dir --upgrade\n",
"%load_ext nimport"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"params = {\n",
" \"su\": su,\n",
" \"start\": start,\n",
" \"end\": end,\n",
" \"url\": url,\n",
" \"baseUrl\": baseUrl,\n",
" \"locationName\": locationName,\n",
" \"service\": service,\n",
" \"hub\": hub\n",
"}"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Let's clone our repo\n",
"%nimport container=\"yaananth/azuredevops-ja\" path=\"ja.ipynb\" provider=\"github\" providerOptions={\"clone\":\"true\"}"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"from nimport.utils import open_nb, redirectTo\n",
"open_nb(\"azuredevops-ja/ja.ipynb\", params, redirect=False)\n",
"redirectTo(\"azuredevops-ja/ja.ipynb\", baseUrl)"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
}
],
"metadata": {
"kernel_info": {
"name": "python3"
},
"kernelspec": {
"name": "python3",
"language": "python",
"display_name": "Python 3"
},
"language_info": {
"name": "python",
"version": "3.7.4",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"nteract": {
"version": "0.14.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

8
queries/Abusers.csl Normal file
Просмотреть файл

@ -0,0 +1,8 @@
TraceLightRailLog
| where ServiceName =~ 'mms'
| where Command == 'Stop-ServiceHost'
| where Message startswith 'HostId = '
| extend HostId = tostring(split(Message, ' ')[2])
| summarize by HostId
| union (ServiceHostAggregated | where StatusReason in ("abuse", "Abuse") | summarize by HostId)
| distinct HostId

Просмотреть файл

@ -0,0 +1,9 @@
let scaleUnit = {su};
let startTime = todatetime({start}) - 5hr;
let endTime = todatetime({end}) + 5hr;
//38 is the VSTS Tenant ID in IcM
getincidents(38,startTime, endTime)
//| where Severity < 3 and IsOutage == 1 and isnull(ParentIncidentId)
| project IncidentId, Severity, CreateDate, Title, Status, OwningTeamName
| where Title contains scaleUnit
| order by CreateDate desc

Просмотреть файл

@ -0,0 +1,28 @@
let scaleUnit = {su};
let startTime = todatetime({start}) - 15min;
let endTime = todatetime({end}) + 15min;
let service = {service};
let hubName = {hub};
let threshold = 10;
let step = totimespan("1m");
let window = totimespan("5m");
let affectedAccounts =
ProductTrace
| where PreciseTimeStamp between (startTime .. endTime)
| where Service =~ service
| where ScaleUnit =~ scaleUnit
| where Tracepoint == 15010000
| where Layer matches regex strcat(hubName, ".*_ActivityDispatcher")
| extend DbDelay = extract('read from db delay (.*),', 1, Message, typeof(timespan))
| extend BufferDelay = extract('buffer read delay (.*)', 1, Message, typeof(timespan))
| extend MessageDelay = DbDelay + BufferDelay
| summarize avg(MessageDelay) by ServiceHost, Layer, bin(PreciseTimeStamp, step)
| extend Threshold = strcat(threshold, 's')
| where avg_MessageDelay > totimespan(Threshold);
ServiceHostAggregated()
| join (affectedAccounts) on $left.HostId == $right.ServiceHost
| where Service =~ service
| where HostType == 4
| project Name, HostId, Layer, PreciseTimeStamp, MessageDelayInSeconds = avg_MessageDelay / 1s, DatabaseName, Threshold
| order by PreciseTimeStamp desc
// | order by MessageDelayInSeconds desc

Просмотреть файл

@ -0,0 +1,32 @@
// Impacted accounts in time window, and are they known abusers
//
let startTime = todatetime({start}) - 15m;
let endTime = todatetime({end}) + 15m;
let service = {service};
let hubName = {hub};
let scaleUnit = {su};
let Abusers = TraceLightRailLog
| where ServiceName =~ 'mms'
| where Command == 'Stop-ServiceHost'
| where Message startswith 'HostId = '
| extend HostId = tostring(split(Message, ' ')[2])
| summarize by HostId
| union (ServiceHostAggregated | where StatusReason in ("abuse", "Abuse") | summarize by HostId)
| distinct HostId;
let ActivityDispatcherDelays = ProductTrace
| where PreciseTimeStamp between (startTime .. endTime)
| where Service =~ service
| where ScaleUnit =~ scaleUnit
| where Tracepoint == 15010000
| where Layer matches regex strcat(hubName, ".*_ActivityDispatcher")
| extend DbDelay = extract('read from db delay (.*),', 1, Message, typeof(timespan))
| extend BufferDelay = extract('buffer read delay (.*)', 1, Message, typeof(timespan))
| extend MessageDelayInSeconds = toint((DbDelay + BufferDelay) / 1s)
| join kind=leftouter (ServiceHostAggregated() | where Service =~ service | where HostType == 4 | summarize by HostId, Name)
on $left.ServiceHost == $right.HostId;
// table
//
ActivityDispatcherDelays
| summarize AvgMessageDelay=round(avg(MessageDelayInSeconds)) by Name, HostId
| extend Abuser = iff(HostId in (Abusers), "yep", "")
| order by AvgMessageDelay desc

21
queries/Delays.csl Normal file
Просмотреть файл

@ -0,0 +1,21 @@
let scaleUnit = {su};
let startTime = todatetime({start}) - 15min;
let endTime = todatetime({end}) + 15min;
let service = {service};
let hubName = {hub};
let threshold = 10;
let step = totimespan("1m");
let window = totimespan("5m");
let tenant = {locationName};
ProductTrace
| where PreciseTimeStamp between (startTime .. endTime)
| where Service =~ service
| where ScaleUnit =~ scaleUnit
| where Tracepoint == 15010000
| where Layer matches regex strcat(hubName, ".*_ActivityDispatcher")
| extend DbDelay = extract('read from db delay (.*),', 1, Message, typeof(timespan))
| extend BufferDelay = extract('buffer read delay (.*)', 1, Message, typeof(timespan))
| extend MessageDelayInSeconds = toint((DbDelay + BufferDelay) / 1s)
| join kind=leftouter (ServiceHostAggregated() | where Service =~ service | where HostType == 4 | summarize by HostId, Name)
on $left.ServiceHost == $right.HostId
| project PreciseTimeStamp, Name, HostId, DbDelay, BufferDelay, MessageDelayInSeconds

Просмотреть файл

@ -0,0 +1,14 @@
let scaleUnit = {su};
let startTime = todatetime({start}) - 5hr;
let endTime = todatetime({end}) + 5hr;
let service = {service};
let hubName = {hub};
let threshold = 10;
let step = totimespan("1m");
let window = totimespan("5m");
let tenant = {locationName};
VssHealthAgentActions
| where PreciseTimeStamp between (startTime .. endTime)
| where Service =~ service
| where ScaleUnit =~ scaleUnit
| project PreciseTimeStamp, RoleInstance, MitigationName, Directory, ActionName

21
queries/Load.csl Normal file
Просмотреть файл

@ -0,0 +1,21 @@
// orchestrator kpi's
let scaleUnit = {su};
let startTime = todatetime({start}) - 15min;
let endTime = todatetime({end}) + 15min;
let service = {service};
let hubName = {hub};
let interval = 1m;
KPI
| where PreciseTimeStamp between (startTime .. endTime)
| where Service =~ service
| where ScaleUnit =~ scaleUnit
| where Metrics contains "DTPlan" or Metrics contains "DTAgent" or Metrics contains "DTJob"
| extend DataObj = parsejson(Metrics)
| extend MetricsObjArr = parsejson(DataObj.metrics)
| extend MetricsObj = MetricsObjArr[0]
| extend Name = tostring(MetricsObj.name)
| extend DisplayName = MetricsObj.displayName
| extend Value = todouble(MetricsObj.value)
| project PreciseTimeStamp, DataObj, Name, DisplayName, Value, MetricsObj
| summarize sum(Value) by Name, bin(PreciseTimeStamp, interval)
| render timechart

1
queries/VIPSwap.csl Normal file
Просмотреть файл

@ -0,0 +1 @@
Last_VIP_Swap_Time() | where ScaleUnit == {locationName}

12
queries/WhatChanged.csl Normal file
Просмотреть файл

@ -0,0 +1,12 @@
let scaleUnit = {su};
let startTime = todatetime({start}) - 5hr;
let endTime = todatetime({end}) + 5hr;
let service = {service};
let hubName = {hub};
let threshold = 10;
let step = totimespan("1m");
let window = totimespan("5m");
let tenant = {locationName};
WhatChangedRange(tenant=tenant, startTime, endTime-startTime)
| extend Name=strcat(['title'], "@", tostring(TIMESTAMP))
| project TIMESTAMP, Name

23
queries/WhatDelayed.csl Normal file
Просмотреть файл

@ -0,0 +1,23 @@
let scaleUnit = {su};
let startTime = todatetime({start}) - 15min;
let endTime = todatetime({end}) + 15min;
let service = {service};
let hubName = {hub};
let interval = 1m;
CounterEvent
| where PreciseTimeStamp between (startTime .. endTime)
| where Service =~ service
| where ScaleUnit =~ scaleUnit
| where Role == 'JobAgent'
| where CounterName startswith strcat("\\TFS Services:Orchestration(", hubName) or CounterName startswith "\\TFS Services:JobService(_Total)"
| extend NameOnly = extract("\\)\\\\(.*)$", 1, CounterName, typeof(string))
| where NameOnly in (
'Total Pending Jobs',
'Pending Job Age',
'Average Activity Message Delay',
'Average Activity Job Delay',
'Average Activity Execution Time'
)
| extend Pivot = replace("(TFS Services:)|(Orchestration\\(Build-)","", CounterName)
| summarize avg(CounterValue) by Pivot, bin(PreciseTimeStamp, interval)
| render timechart