This commit is contained in:
yash 2019-07-25 17:29:54 -04:00
Parent 961ba3dc03
Commit 7c1ff16047
14 changed files with 888 additions and 344 deletions

333
.gitignore vendored
View file

@@ -1,330 +1,3 @@
Kqlmagic_temp_files
.ipynb_checkpoints/*
*.pyc
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
**/Properties/launchSettings.json
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# CodeRush
.cr/
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/

README.md
View file

@@ -1,14 +1,16 @@
# Devops-pipelines
Warehouse of notebooks containing queries to help in root-causing pipeline delays in Azure DevOps.

# Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.

When you submit a pull request, a CLA bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repos using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

579
delays.ipynb Normal file
View file

@@ -0,0 +1,579 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"## Orchestration delays investigation\n",
"This notebook makes various kusto queries and produces a report.\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# These are just defaults will be overwritten if you use https://github.com/yaananth/nimport\n",
"su=\"tfs-cus-1\"\n",
"start=\"2019-07-20T16:00:00.0000000Z\"\n",
"end=\"2019-07-20T16:33:36.0000000Z\"\n",
"url=\"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\"\n",
"baseUrl=\"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\"\n",
"service=\"tfs\"\n",
"hub=\"Build\""
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false,
"tags": [
"parameters"
]
}
},
{
"cell_type": "code",
"source": [
"# Import the things we use\n",
"\n",
"# Note you can also use kql https://docs.microsoft.com/en-us/azure/data-explorer/kqlmagic\n",
"# %kql is single line magic\n",
"# %%kql is cell magic\n",
"\n",
"# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n",
"# https://ipython.readthedocs.io/en/stable/interactive/magics.html\n",
"from IPython.display import display, HTML, Markdown, Javascript, clear_output\n",
"\n",
"# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n",
"import pandas as pd\n",
"from pandas import Series, DataFrame\n",
"from datetime import datetime, timedelta, timezone\n",
"from urllib.parse import urlencode, quote_plus\n",
"from requests.utils import requote_uri\n",
"import time\n",
"import numpy as np\n",
"from matplotlib import pyplot as plt\n",
"from nimport.utils import tokenize, open_nb\n",
"import json\n",
"import os"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"params = {\n",
" \"su\": su,\n",
" \"start\": start,\n",
" \"end\": end,\n",
" \"url\": url,\n",
" \"baseUrl\": baseUrl,\n",
" \"service\": service,\n",
" \"hub\": hub\n",
"}\n",
"root = 'pipeline-delays' if os.path.basename(os.getcwd()) != 'pipeline-delays' else ''\n",
" "
],
"outputs": [],
"execution_count": 4,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"%load_ext Kqlmagic"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# kusto/python utilities\n",
"def getTime(timestamp, d):\n",
" return int((time.mktime(getDateTime(timestamp).timetuple()) + (d * 60)) * 1000);\n",
" \n",
"def getDateTime(timestamp):\n",
" s = timestamp[:23] + 'Z' # only allow 5 decimals of precision\n",
" for f in (\"%Y-%m-%d %H:%M:%S.%fZ\", \"%Y-%m-%dT%H:%M:%S.%fZ\"):\n",
" try:\n",
" return datetime.strptime(s, f);\n",
" except:\n",
" pass\n",
" \n",
"def getKustoQuery(csl_filename, params):\n",
"\n",
" return tokenize(os.path.join(root, 'queries', csl_filename), params)\n",
" \n",
"states = {\n",
" \"ja\": \"ja\",\n",
" \"healthagent\": \"healthagent\"\n",
"};\n",
"\n",
"state = \"\";"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Active incidents?\n",
"# ActiveIncidents.csl\n",
"%kql AzureDataExplorer://tenant=\"Microsoft.com\";code;cluster='Icmcluster';database='IcMDataWarehouse' \n",
"q_activeIncidents = getKustoQuery(\"ActiveIncidents.csl\", params)\n",
"q_activeIncidentsResult = %kql -query q_activeIncidents\n",
"q_activeIncidentsResultDf = q_activeIncidentsResult.to_dataframe()"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Note: KQL has a bug where it copies the wrong code (the previous cluster's auth code instead of current) when we try to auth to multiple clusters\n",
"# Copy the code manually instead, if you already closed the dialog, copy code and go to https://microsoft.com/devicelogin\n",
"%kql AzureDataExplorer://tenant=\"Microsoft.com\";code;cluster='VSO';database='VSO'"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# AffectedAccounts.csl\n",
"q_affectedAccounts = getKustoQuery(\"AffectedAccounts.csl\", params)\n",
"q_affectedAccountsResult = %kql -query q_affectedAccounts\n",
"q_affectedAccountsResultDf = q_affectedAccountsResult.to_dataframe()"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Abusers.csl\n",
"q_abusers = getKustoQuery(\"Abusers.csl\", {})\n",
"Abusers = %kql -query q_abusers\n",
"abusersDf = Abusers.to_dataframe();\n",
"finalabusersList = np.intersect1d(q_affectedAccountsResultDf[\"HostId\"].values, abusersDf[\"HostId\"].values);"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"%%kql\n",
"locationNameResult <<\n",
"let _su = su;\n",
"let _service = service;\n",
"ActivityLog\n",
"| where ScaleUnit == _su\n",
"| where Service =~ _service\n",
"| project Tenant\n",
"| take 1"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"locationName = locationNameResult[0]['Tenant']\n",
"params[\"locationName\"] = locationName"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# AffectedAccounts.csl\n",
"q_affAccounts = getKustoQuery(\"AffectedAccounts.csl\", params)\n",
"q_affAccounts_r = %kql -query q_affAccounts\n",
"q_affAccounts_df = q_affAccounts_r.to_dataframe();"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# DelayedAccountsAreAbusers.csl\n",
"q_delayedAccountsAreAbusers = getKustoQuery(\"DelayedAccountsAreAbusers.csl\", params)\n",
"q_delayedAccountsAreAbusers_r = %kql -query q_delayedAccountsAreAbusers\n",
"q_delayedAccountsAreAbusers_df = q_delayedAccountsAreAbusers_r.to_dataframe()"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# WhatChanged.csl\n",
"q_whatChanged = getKustoQuery(\"WhatChanged.csl\", params)\n",
"q_whatChanged_r = %kql -query q_whatChanged\n",
"q_whatChanged_df = q_whatChanged_r.to_dataframe();"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# HealthAgentActions.csl\n",
"q_haActions = getKustoQuery(\"HealthAgentActions.csl\", params)\n",
"q_haActions_r = %kql -query q_haActions\n",
"q_haActions_df = q_haActions_r.to_dataframe();"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# VIPSwap.csl\n",
"q_vipSwap = getKustoQuery(\"VIPSwap.csl\", params)\n",
"vipSwapResult = %kql -query q_vipSwap\n",
"vipSwapResultDf = vipSwapResult.to_dataframe()"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# WhatDelayed.csl\n",
"q_whatDelayed = getKustoQuery(\"WhatDelayed.csl\", params)\n",
"q_whatDelayedResult = %kql -query q_whatDelayed\n",
"q_whatDelayedResultDf = q_whatDelayedResult.to_dataframe()"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Load.csl\n",
"q_load = getKustoQuery(\"Load.csl\", params)\n",
"q_loadResult = %kql -query q_load\n",
"q_loadResultDf = q_loadResult.to_dataframe()"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# https://plot.ly/python/graphing-multiple-chart-types/\n",
"\n",
"# https://plot.ly/python/line-and-scatter/\n",
"import plotly.graph_objs as go\n",
"import plotly\n",
"plotly.offline.init_notebook_mode(connected=True)"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# visualize delays\n",
"delays = go.Scatter(\n",
" x=q_affAccounts_df[\"PreciseTimeStamp\"],\n",
" y=q_affAccounts_df[\"MessageDelayInSeconds\"],\n",
" mode = 'lines',\n",
" name = 'Delays in seconds',\n",
" text= q_affAccounts_df['Name']\n",
")\n",
"\n",
"changed = go.Scatter(\n",
" x=q_whatChanged_df[\"TIMESTAMP\"],\n",
" y=np.repeat(50, len(q_whatChanged_df[\"TIMESTAMP\"].values)),\n",
" mode = 'lines+markers',\n",
" name = 'What Changed',\n",
" text = q_whatChanged_df[\"Name\"],\n",
" marker=dict(\n",
" size=32,\n",
" color = np.random.randn(500),\n",
" colorscale='Viridis'\n",
" )\n",
")\n",
"\n",
"mitigations = go.Scatter(\n",
" x=q_haActions_df[\"PreciseTimeStamp\"],\n",
" y=np.repeat(50, len(q_haActions_df[\"PreciseTimeStamp\"].values)),\n",
" mode = 'markers',\n",
" name = 'Mitigations',\n",
" text = q_haActions_df[[\"MitigationName\", \"RoleInstance\"]].apply(lambda x: ''.join(x), axis=1),\n",
" marker = dict(\n",
" size = 10,\n",
" color = 'rgba(152, 0, 0, .8)',\n",
" line = dict(\n",
" width = 2,\n",
" color = 'rgb(0, 0, 0)'\n",
" )\n",
" )\n",
")\n",
"\n",
"data = [delays, changed, mitigations]\n",
"plotly.offline.iplot(data)"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"print('=' * 50)\n",
"print('Report!')\n",
"print('=' * 50, '\\n\\n')\n",
"\n",
"jarvisParams = {'su': su, 'start': getTime(start, -10), 'end': getTime(end, 10), 'service': service }\n",
"\n",
"# jarvis\n",
"jarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/DevOpsReports/TFS DevOpsReports\"\"\" \\\n",
" \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n",
" \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n",
" \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n",
" \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams;\n",
"print('Jarvis dashboard link:\\n', requote_uri(jarvisLink), '\\n')\n",
"\n",
"# what changed? analysis\n",
"print()\n",
"print('What changed? =============================')\n",
"if(len(q_whatChanged_df.index) == 0):\n",
" print(\"No relevant changes found...\")\n",
"else:\n",
" up_prefix = \"\";\n",
" mit_prefix = \"\";\n",
" text = \"\";\n",
" for index, row in q_whatChanged_df.iterrows():\n",
" if(row.title.lower().find('upgrade') != -1):\n",
" if not up_prefix:\n",
" up_prefix += \"Looks like, there's upgrade in progress...\\n\\n\";\n",
" text += \"\"\"%s %s %s \\n\"\"\" % (row.TIMESTAMP, row.title, row.buildNumber);\n",
" if(row.title.lower().find('mitigation') != -1):\n",
" if not mit_prefix:\n",
" mit_prefix += \"Looks like, there are some mitigations by health agent...\\n\\n\";\n",
" state += states[\"healthagent\"];\n",
" text += \"\"\"%s %s %s\"\"\" % (row.TIMESTAMP, row.title, row.buildNumber);\n",
" \n",
" if text:\n",
" print(up_prefix + mit_prefix + text)\n",
" else:\n",
" print(q_whatChanged_df)\n",
" \n",
"# active incidents?\n",
"print()\n",
"print('Active incidents? =============================')\n",
"otherIncidentsCount = 0;\n",
"for index, row in q_activeIncidentsResultDf.iterrows():\n",
" if(row.Title.find(\"Kalypso: Build Orchestrator Delays ICM\") == -1):\n",
" otherIncidentsCount+=1;\n",
" \n",
"if(otherIncidentsCount > 0):\n",
" print(\"We found some incidents during the time period, check if they are related...\")\n",
" # styling\n",
" def make_clickable(url, text):\n",
" return '{0}'.format(url)\n",
"\n",
" newDf = q_activeIncidentsResultDf.assign(URL=[*map(lambda x: make_clickable(\"\"\"https://icm.ad.msft.net/imp/v3/incidents/details/%s/home\"\"\" % (x), \"ICMLink\"), q_activeIncidentsResultDf.IncidentId)])\n",
" print(\"ICM link to copy - \" + \"https://icm.ad.msft.net/imp/v3/incidents/details/INCIDENTID/home\")\n",
" print(newDf[['IncidentId','Severity','Title']])\n",
"else:\n",
" print(\"No active incidents that could be related are found...\")\n",
"\n",
"print()\n",
"print('Queue Load =============================')\n",
"ar = q_loadResultDf[q_loadResultDf[\"Name\"] == \"DTPlanQueued\"].values[:, 2]\n",
"queuedGreatherThan100 = np.where(ar > 100)\n",
"if len(queuedGreatherThan100[0]) > 0:\n",
" print(\"\"\"More than 100 requests are queued in 1 minute (Actual: %s)...could be a load issue\"\"\" % (np.amax(ar)))\n",
"else: \n",
" print('...everything looks good?')\n",
"# ja load\n",
"print()\n",
"print('JA Load =============================')\n",
"q_whatDelayedResultPendingJobsDf = q_whatDelayedResultDf[q_whatDelayedResultDf.Pivot == \"\\JobService(_Total)\\Total Pending Jobs\"]\n",
"pendingGreaterThan10Result = np.where(q_whatDelayedResultPendingJobsDf.avg_CounterValue.values > 10)\n",
"if len(pendingGreaterThan10Result[0]) > 0:\n",
" print(\"We are seeing high pending jobs from job agent (highest being %s total pending jobs in 1 min), could be an issue with job agents...\" % (np.max(q_whatDelayedResultPendingJobsDf.avg_CounterValue.values))) \n",
" # update state\n",
" state += states[\"ja\"]\n",
" \n",
" open_nb(os.path.join(root, 'ja.ipynb'), params, redirect=False)\n",
" jaUrl = baseUrl + \"/pipeline-delays/ja.ipynb\"\n",
" print('Investigate job agent related issues by going here:', requote_uri(jaUrl), '\\n')\n",
"\n",
" jaJarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/PlatformViews/Compute-JA\"\"\" \\\n",
" \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n",
" \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n",
" \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n",
" \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams;\n",
" print('Jarvis dashboard link for job agents:\\n', requote_uri(jaJarvisLink), '\\n')\n",
"else:\n",
" print('...everything looks good?')\n",
" \n",
"# abuse detection?\n",
"print()\n",
"print('What users are impacted? =============================')\n",
"if len(finalabusersList) > 0:\n",
" print('Found abusers - !!')\n",
"print(q_delayedAccountsAreAbusers_df)\n",
" \n",
"#\n",
"# vip swap\n",
"print()\n",
"print('Vip Swap? =============================')\n",
"if len(vipSwapResultDf.index) > 0:\n",
" viptime = vipSwapResultDf[\"TIMESTAMP\"][0]\n",
" starttime = getDateTime(start)\n",
" delta = starttime.replace(tzinfo=None) - viptime.replace(tzinfo=None)\n",
" if delta.total_seconds() > 0:\n",
" print(\"\"\"VIP SWAP happened: %s days %s hours %s minutes ago (%s) (issue start: %s)\"\"\" % (delta.days, delta.seconds//3600, (delta.seconds//60) % 60, viptime, start))\n",
" else:\n",
" print('...no swaps recorded in the given time range')\n",
"else:\n",
" print('...no swaps recorded in the given time range')\n",
"\n",
"# more analysis? \n",
"print()\n",
"print('More analysis =============================')\n",
"if os.path.exists(\"SLAInvestigation/sla.ipynb\"):\n",
" slaUrl = baseUrl + \"/SLAInvestigation/sla.ipynb\"\n",
" print('Investigate SLA by going here:', requote_uri(slaUrl), '\\n') "
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
}
],
"metadata": {
"kernel_info": {
"name": "python3"
},
"kernelspec": {
"name": "python3",
"language": "python",
"display_name": "Python 3"
},
"language_info": {
"name": "python",
"version": "3.7.4",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"nteract": {
"version": "0.14.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
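
The report cell in delays.ipynb converts the ISO-8601 start/end parameters into epoch milliseconds, padded by ±10 minutes, to build the Jarvis dashboard links. A standalone sketch of that conversion, reusing the notebook's getDateTime/getTime helpers with the sample parameter defaults (the bare except is narrowed to ValueError here):

# Standalone sketch of the timestamp handling behind the Jarvis links in delays.ipynb.
# Helpers mirror the notebook's getDateTime/getTime; values are the sample parameter defaults.
import time
from datetime import datetime

def getDateTime(timestamp):
    s = timestamp[:23] + 'Z'  # trim fractional seconds so %f can parse them
    for f in ("%Y-%m-%d %H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%S.%fZ"):
        try:
            return datetime.strptime(s, f)
        except ValueError:
            pass

def getTime(timestamp, d):
    # epoch milliseconds shifted by d minutes; time.mktime uses local time, as in the notebook
    return int((time.mktime(getDateTime(timestamp).timetuple()) + d * 60) * 1000)

start = "2019-07-20T16:00:00.0000000Z"
end = "2019-07-20T16:33:36.0000000Z"
print(getTime(start, -10), getTime(end, 10))  # feeds globalStartTime/globalEndTime in the Jarvis URL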

121
ja.ipynb Normal file
View file

@@ -0,0 +1,121 @@
{
"cells": [
{
"cell_type": "code",
"source": [
"su = \"tfs-cus-1\"\n",
"start = \"2019-07-20T16:00:00.0000000Z\"\n",
"end = \"2019-07-20T16:33:36.0000000Z\"\n",
"url = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\"\n",
"baseUrl = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\"\n",
"locationName = \"tfsprodcus1\"\n",
"service = \"tfs\"\n",
"hub = \"Build\""
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false,
"tags": [
"parameters"
]
}
},
{
"cell_type": "code",
"source": [
"# This isn't needed if you are bootstraping\n",
"#!pip install Kqlmagic --no-cache-dir --upgrade\n",
"!pip install nimport --no-cache-dir --upgrade\n",
"%load_ext nimport"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"params = {\n",
" \"su\": su,\n",
" \"start\": start,\n",
" \"end\": end,\n",
" \"url\": url,\n",
" \"baseUrl\": baseUrl,\n",
" \"locationName\": locationName,\n",
" \"service\": service,\n",
" \"hub\": hub\n",
"}"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Let's clone our repo\n",
"%nimport container=\"yaananth/azuredevops-ja\" path=\"ja.ipynb\" provider=\"github\" providerOptions={\"clone\":\"true\"}"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
},
{
"cell_type": "code",
"source": [
"from nimport.utils import open_nb, redirectTo\n",
"open_nb(\"azuredevops-ja/ja.ipynb\", params, redirect=False)\n",
"redirectTo(\"azuredevops-ja/ja.ipynb\", baseUrl)"
],
"outputs": [],
"execution_count": null,
"metadata": {
"collapsed": false,
"inputHidden": false,
"outputHidden": false
}
}
],
"metadata": {
"kernel_info": {
"name": "python3"
},
"kernelspec": {
"name": "python3",
"language": "python",
"display_name": "Python 3"
},
"language_info": {
"name": "python",
"version": "3.7.4",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"nteract": {
"version": "0.14.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
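
The .csl files below reference their inputs as {su}, {start}, {end}, {service}, {hub} and {locationName}; both notebooks render them through getKustoQuery, which delegates to tokenize from nimport.utils. A minimal sketch of that substitution, assuming tokenize does plain placeholder replacement (the quoting below is a guess, not nimport's actual behavior):

# Hypothetical stand-in for nimport.utils.tokenize: fill {param} placeholders in a .csl
# template with string literals. The quoting/escaping rules are an assumption.
import os

def render_csl(csl_path, params):
    with open(csl_path) as f:
        query = f.read()
    for key, value in params.items():
        query = query.replace("{%s}" % key, "'%s'" % value)
    return query

params = {
    "su": "tfs-cus-1",
    "start": "2019-07-20T16:00:00.0000000Z",
    "end": "2019-07-20T16:33:36.0000000Z",
    "service": "tfs",
    "hub": "Build",
    "locationName": "tfsprodcus1",
}
print(render_csl(os.path.join("queries", "Delays.csl"), params))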

8
queries/Abusers.csl Normal file
View file

@@ -0,0 +1,8 @@
TraceLightRailLog
| where ServiceName =~ 'mms'
| where Command == 'Stop-ServiceHost'
| where Message startswith 'HostId = '
| extend HostId = tostring(split(Message, ' ')[2])
| summarize by HostId
| union (ServiceHostAggregated | where StatusReason in ("abuse", "Abuse") | summarize by HostId)
| distinct HostId
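
The first branch of this query recovers the host ID from MMS trace messages of the form "HostId = ..." by splitting on spaces. A one-line Python check of that indexing (the message text is invented, shaped only to satisfy the startswith filter above):

# Mirror of split(Message, ' ')[2] from the query above, on an invented message.
message = "HostId = 0f45c3a1-1111-2222-3333-444455556666 stopped for abuse"
host_id = message.split(" ")[2]
print(host_id)  # 0f45c3a1-1111-2222-3333-444455556666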

9
queries/ActiveIncidents.csl Normal file
View file

@@ -0,0 +1,9 @@
let scaleUnit = {su};
let startTime = todatetime({start}) - 5hr;
let endTime = todatetime({end}) + 5hr;
//38 is the VSTS Tenant ID in IcM
getincidents(38,startTime, endTime)
//| where Severity < 3 and IsOutage == 1 and isnull(ParentIncidentId)
| project IncidentId, Severity, CreateDate, Title, Status, OwningTeamName
| where Title contains scaleUnit
| order by CreateDate desc

28
queries/AffectedAccounts.csl Normal file
View file

@@ -0,0 +1,28 @@
let scaleUnit = {su};
let startTime = todatetime({start}) - 15min;
let endTime = todatetime({end}) + 15min;
let service = {service};
let hubName = {hub};
let threshold = 10;
let step = totimespan("1m");
let window = totimespan("5m");
let affectedAccounts =
ProductTrace
| where PreciseTimeStamp between (startTime .. endTime)
| where Service =~ service
| where ScaleUnit =~ scaleUnit
| where Tracepoint == 15010000
| where Layer matches regex strcat(hubName, ".*_ActivityDispatcher")
| extend DbDelay = extract('read from db delay (.*),', 1, Message, typeof(timespan))
| extend BufferDelay = extract('buffer read delay (.*)', 1, Message, typeof(timespan))
| extend MessageDelay = DbDelay + BufferDelay
| summarize avg(MessageDelay) by ServiceHost, Layer, bin(PreciseTimeStamp, step)
| extend Threshold = strcat(threshold, 's')
| where avg_MessageDelay > totimespan(Threshold);
ServiceHostAggregated()
| join (affectedAccounts) on $left.HostId == $right.ServiceHost
| where Service =~ service
| where HostType == 4
| project Name, HostId, Layer, PreciseTimeStamp, MessageDelayInSeconds = avg_MessageDelay / 1s, DatabaseName, Threshold
| order by PreciseTimeStamp desc
// | order by MessageDelayInSeconds desc
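
The per-message delay above is reconstructed from the tracepoint 15010000 message text with two regex extracts and then compared against the 10-second threshold. A small Python mirror of that parsing on an invented message (the exact trace wording is inferred from the regexes, not confirmed):

# Mirror of the two Kusto extract() calls above, applied to an invented trace message.
# The message format is assumed from the regexes; real ProductTrace text may differ.
import re
from datetime import timedelta

def parse_timespan(s):
    # Kusto timespan literal "hh:mm:ss.fffffff" -> timedelta
    h, m, sec = s.split(":")
    return timedelta(hours=int(h), minutes=int(m), seconds=float(sec))

message = "read from db delay 00:00:07.5000000, buffer read delay 00:00:04.2500000"
db_delay = parse_timespan(re.search(r"read from db delay (.*),", message).group(1))
buffer_delay = parse_timespan(re.search(r"buffer read delay (.*)", message).group(1))

message_delay = db_delay + buffer_delay
print(message_delay.total_seconds())  # 11.75 -> above the 10 s threshold used in the query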

32
queries/DelayedAccountsAreAbusers.csl Normal file
View file

@@ -0,0 +1,32 @@
// Impacted accounts in time window, and are they known abusers
//
let startTime = todatetime({start}) - 15m;
let endTime = todatetime({end}) + 15m;
let service = {service};
let hubName = {hub};
let scaleUnit = {su};
let Abusers = TraceLightRailLog
| where ServiceName =~ 'mms'
| where Command == 'Stop-ServiceHost'
| where Message startswith 'HostId = '
| extend HostId = tostring(split(Message, ' ')[2])
| summarize by HostId
| union (ServiceHostAggregated | where StatusReason in ("abuse", "Abuse") | summarize by HostId)
| distinct HostId;
let ActivityDispatcherDelays = ProductTrace
| where PreciseTimeStamp between (startTime .. endTime)
| where Service =~ service
| where ScaleUnit =~ scaleUnit
| where Tracepoint == 15010000
| where Layer matches regex strcat(hubName, ".*_ActivityDispatcher")
| extend DbDelay = extract('read from db delay (.*),', 1, Message, typeof(timespan))
| extend BufferDelay = extract('buffer read delay (.*)', 1, Message, typeof(timespan))
| extend MessageDelayInSeconds = toint((DbDelay + BufferDelay) / 1s)
| join kind=leftouter (ServiceHostAggregated() | where Service =~ service | where HostType == 4 | summarize by HostId, Name)
on $left.ServiceHost == $right.HostId;
// table
//
ActivityDispatcherDelays
| summarize AvgMessageDelay=round(avg(MessageDelayInSeconds)) by Name, HostId
| extend Abuser = iff(HostId in (Abusers), "yep", "")
| order by AvgMessageDelay desc

21
queries/Delays.csl Normal file
View file

@@ -0,0 +1,21 @@
let scaleUnit = {su};
let startTime = todatetime({start}) - 15min;
let endTime = todatetime({end}) + 15min;
let service = {service};
let hubName = {hub};
let threshold = 10;
let step = totimespan("1m");
let window = totimespan("5m");
let tenant = {locationName};
ProductTrace
| where PreciseTimeStamp between (startTime .. endTime)
| where Service =~ service
| where ScaleUnit =~ scaleUnit
| where Tracepoint == 15010000
| where Layer matches regex strcat(hubName, ".*_ActivityDispatcher")
| extend DbDelay = extract('read from db delay (.*),', 1, Message, typeof(timespan))
| extend BufferDelay = extract('buffer read delay (.*)', 1, Message, typeof(timespan))
| extend MessageDelayInSeconds = toint((DbDelay + BufferDelay) / 1s)
| join kind=leftouter (ServiceHostAggregated() | where Service =~ service | where HostType == 4 | summarize by HostId, Name)
on $left.ServiceHost == $right.HostId
| project PreciseTimeStamp, Name, HostId, DbDelay, BufferDelay, MessageDelayInSeconds

14
queries/HealthAgentActions.csl Normal file
View file

@@ -0,0 +1,14 @@
let scaleUnit = {su};
let startTime = todatetime({start}) - 5hr;
let endTime = todatetime({end}) + 5hr;
let service = {service};
let hubName = {hub};
let threshold = 10;
let step = totimespan("1m");
let window = totimespan("5m");
let tenant = {locationName};
VssHealthAgentActions
| where PreciseTimeStamp between (startTime .. endTime)
| where Service =~ service
| where ScaleUnit =~ scaleUnit
| project PreciseTimeStamp, RoleInstance, MitigationName, Directory, ActionName

21
queries/Load.csl Normal file
View file

@@ -0,0 +1,21 @@
// Orchestrator KPIs
let scaleUnit = {su};
let startTime = todatetime({start}) - 15min;
let endTime = todatetime({end}) + 15min;
let service = {service};
let hubName = {hub};
let interval = 1m;
KPI
| where PreciseTimeStamp between (startTime .. endTime)
| where Service =~ service
| where ScaleUnit =~ scaleUnit
| where Metrics contains "DTPlan" or Metrics contains "DTAgent" or Metrics contains "DTJob"
| extend DataObj = parsejson(Metrics)
| extend MetricsObjArr = parsejson(DataObj.metrics)
| extend MetricsObj = MetricsObjArr[0]
| extend Name = tostring(MetricsObj.name)
| extend DisplayName = MetricsObj.displayName
| extend Value = todouble(MetricsObj.value)
| project PreciseTimeStamp, DataObj, Name, DisplayName, Value, MetricsObj
| summarize sum(Value) by Name, bin(PreciseTimeStamp, interval)
| render timechart
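
delays.ipynb reads this result and flags a load problem when any 1-minute DTPlanQueued bin exceeds 100 queued requests. A toy reproduction of that check against a hand-built frame shaped like the query output (Name, PreciseTimeStamp, sum_Value):

# Toy version of the "Queue Load" check in delays.ipynb, mirroring its column-index access.
import numpy as np
import pandas as pd

q_loadResultDf = pd.DataFrame({
    "Name": ["DTPlanQueued", "DTPlanQueued", "DTAgentAssigned"],
    "PreciseTimeStamp": pd.to_datetime(["2019-07-20 16:01", "2019-07-20 16:02", "2019-07-20 16:01"]),
    "sum_Value": [42.0, 180.0, 10.0],
})

ar = q_loadResultDf[q_loadResultDf["Name"] == "DTPlanQueued"].values[:, 2]  # sum_Value column
queuedGreaterThan100 = np.where(ar > 100)
if len(queuedGreaterThan100[0]) > 0:
    print("More than 100 requests are queued in 1 minute (Actual: %s)... could be a load issue" % np.amax(ar))
else:
    print("...everything looks good?")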

1
queries/VIPSwap.csl Normal file
View file

@@ -0,0 +1 @@
Last_VIP_Swap_Time() | where ScaleUnit == {locationName}
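
delays.ipynb compares the swap TIMESTAMP returned here against the issue start time and reports the gap as days/hours/minutes. A worked example of that arithmetic with made-up times:

# Worked example of the VIP-swap age calculation from delays.ipynb, with made-up times.
from datetime import datetime

viptime = datetime(2019, 7, 19, 9, 15)    # pretend Last_VIP_Swap_Time() result
starttime = datetime(2019, 7, 20, 16, 0)  # issue start from the parameters cell
delta = starttime - viptime

# prints: 1 days 6 hours 45 minutes
print("VIP SWAP happened: %s days %s hours %s minutes ago"
      % (delta.days, delta.seconds // 3600, (delta.seconds // 60) % 60))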

12
queries/WhatChanged.csl Normal file
View file

@@ -0,0 +1,12 @@
let scaleUnit = {su};
let startTime = todatetime({start}) - 5hr;
let endTime = todatetime({end}) + 5hr;
let service = {service};
let hubName = {hub};
let threshold = 10;
let step = totimespan("1m");
let window = totimespan("5m");
let tenant = {locationName};
WhatChangedRange(tenant=tenant, startTime, endTime-startTime)
| extend Name=strcat(['title'], "@", tostring(TIMESTAMP))
| project TIMESTAMP, Name

23
queries/WhatDelayed.csl Normal file
View file

@@ -0,0 +1,23 @@
let scaleUnit = {su};
let startTime = todatetime({start}) - 15min;
let endTime = todatetime({end}) + 15min;
let service = {service};
let hubName = {hub};
let interval = 1m;
CounterEvent
| where PreciseTimeStamp between (startTime .. endTime)
| where Service =~ service
| where ScaleUnit =~ scaleUnit
| where Role == 'JobAgent'
| where CounterName startswith strcat("\\TFS Services:Orchestration(", hubName) or CounterName startswith "\\TFS Services:JobService(_Total)"
| extend NameOnly = extract("\\)\\\\(.*)$", 1, CounterName, typeof(string))
| where NameOnly in (
'Total Pending Jobs',
'Pending Job Age',
'Average Activity Message Delay',
'Average Activity Job Delay',
'Average Activity Execution Time'
)
| extend Pivot = replace("(TFS Services:)|(Orchestration\\(Build-)","", CounterName)
| summarize avg(CounterValue) by Pivot, bin(PreciseTimeStamp, interval)
| render timechart