[devops] Improved diagnostics and implement deadlocked process termination. (#21317)
* Unify the code to collect diagnostic information about a bot. * Unify some of the cleanup code to prepare a bot as well. * Implement code to terminate processes on a bot that have used more than a day of CPU time (presumably these processes are stuck for some reason).
This commit is contained in:
Родитель
2f7c126ac9
Коммит
48f5591829
|
@ -12,6 +12,15 @@ df -h
|
|||
# We don't care about errors in this section, we just want to clean as much as possible
|
||||
set +e
|
||||
|
||||
# Clean workspace
|
||||
(
	# Runs in a subshell so that neither the 'cd' nor REPO_PATH leak into the
	# rest of the script.
	#
	# The checkout directory is "<default working directory>/<repository name>",
	# where the repository name is the last path component of
	# BUILD_REPOSITORY_NAME. Both variables must be non-empty: otherwise the
	# computed path would collapse to "/" and we would try to clean from there.
	if [[ -n "${SYSTEM_DEFAULTWORKINGDIRECTORY:-}" && -n "${BUILD_REPOSITORY_NAME:-}" ]]; then
		# The '$' in front of SYSTEM_DEFAULTWORKINGDIRECTORY is required;
		# without it the path is a literal relative directory name that
		# never exists, and the cleanup is silently skipped.
		REPO_PATH="$SYSTEM_DEFAULTWORKINGDIRECTORY/$(basename "$BUILD_REPOSITORY_NAME")"
		if test -d "$REPO_PATH"; then
			# Errors are not fatal in this section (set +e), so only run the
			# destructive 'git clean' if we really ended up inside the
			# repository.
			cd "$REPO_PATH" && git clean -xfd
		fi
	fi
)
|
||||
|
||||
# Delete all the simulator devices. These can take up a lot of space over time (I've seen 100+GB on the bots)
|
||||
/Applications/Xcode.app/Contents/Developer/usr/bin/simctl delete all
|
||||
|
||||
|
@ -118,7 +127,9 @@ XCODE_SELECT=$(xcode-select -p)
|
|||
|
||||
for oldXcode in "${oldXcodes[@]}"; do
|
||||
if [ "$XCODE_SELECT" != "$oldXcode/Contents/Developer" ]; then
|
||||
sudo rm -Rf "$oldXcode"
|
||||
if test -d "$oldXcode"; then
|
||||
sudo rm -Rf "$oldXcode"
|
||||
fi
|
||||
else
|
||||
echo "Not removing $oldXcode because is the currently selected one."
|
||||
fi
|
||||
|
@ -126,6 +137,13 @@ done
|
|||
|
||||
DIR="$(dirname "${BASH_SOURCE[0]}")"
|
||||
"$DIR"/clean-simulator-runtime.sh
|
||||
"$DIR"/kill-deadlocked-processes.sh
|
||||
|
||||
# Remove legacy Xamarin/MonoTouch stuff
|
||||
sudo rm -Rf /Developer/MonoTouch
|
||||
sudo rm -Rf /Library/Frameworks/Xamarin.iOS.framework
|
||||
sudo rm -Rf /Library/Frameworks/Xamarin.Mac.framework
|
||||
ls -R /Library/Frameworks
|
||||
|
||||
# Print disk status after cleaning
|
||||
df -h
|
||||
|
|
|
@ -1,4 +0,0 @@
|
|||
#!/bin/bash -ex
|
||||
|
||||
rm -Rvf package
|
||||
time make -C xamarin-macios/ git-clean-all
|
|
@ -5,6 +5,11 @@
|
|||
set -o pipefail
|
||||
IFS=$'\n\t'
|
||||
|
||||
# delete all watchOS simulators, we don't need them anymore
|
||||
for i in $(xcrun simctl runtime list | grep "watchOS.*Ready" | sed -e 's/.* - //' -e 's/ .*//'); do
|
||||
xcrun simctl runtime delete "$i"
|
||||
done
|
||||
|
||||
xcrun simctl runtime list -j > simruntime.json
|
||||
cat simruntime.json
|
||||
|
||||
|
|
|
@ -1,6 +0,0 @@
|
|||
#!/bin/bash -ex
|
||||
|
||||
sudo rm -Rf /Developer/MonoTouch
|
||||
sudo rm -Rf /Library/Frameworks/Xamarin.iOS.framework
|
||||
sudo rm -Rf /Library/Frameworks/Xamarin.Mac.framework
|
||||
ls -R /Library/Frameworks
|
|
@ -0,0 +1,27 @@
|
|||
#!/bin/bash -e

# Kills processes owned by the current user that have accumulated more than a
# day of CPU time, on the assumption that such processes are deadlocked or
# spinning.
#
# NOTE(review): only constructs available in old bash versions are used (no
# mapfile/readarray) - presumably because this runs with the stock /bin/bash
# on the macOS bots; confirm before modernizing.

# The '-e' in the shebang is lost when the script is started as
# 'bash kill-deadlocked-processes.sh', so enable it explicitly as well.
set -e

# CPU time threshold, in minutes (24h * 60min = 1440min).
MAX_CPU_MINUTES=1440

# Rewrites the first "<minutes>:<SS>.<FF>" CPU time found on every line read
# from stdin into separate words ("<minutes> m <SS> s <FF> ms"), so that the
# minutes become a field of their own. Lines without such a value pass
# through unchanged.
split_cputime ()
{
	sed -e 's/\([0-9]*\):\([0-9][0-9]\)\.\([0-9][0-9]\)/\1 m \2 s \3 ms/'
}

# Succeeds if $1 is a plain decimal number strictly greater than
# MAX_CPU_MINUTES. Anything else (empty, "00:00:01", ...) is rejected instead
# of being handed to the arithmetic evaluator, where it would be a syntax
# error. The '10#' prefix keeps values with leading zeros from being read as
# octal.
exceeds_cpu_limit ()
{
	[[ "$1" =~ ^[0-9]+$ ]] && (( 10#$1 > MAX_CPU_MINUTES ))
}

echo "Looking for processes that have been stuck for more than a day, and will try to kill them."

# Collect the list of processes for the current user, including the CPU time.
# We then split the CPU time into separate fields, so that it's easier to figure out the total number of minutes later on.
# The list is sorted numerically on the leading field (the minutes), highest first.
PROCESSES=()
while IFS='' read -r line; do
	PROCESSES+=("$line")
done < <(ps -o cputime=,pid=,user=,lstart=,args= -U "${USER:-$(id -un)}" -w -w -w | split_cputime | sort -nr)

for process in "${PROCESSES[@]}"; do
	# Fields after split_cputime:
	#   0: minutes  1: "m"  2: seconds  3: "s"  4: fraction  5: "ms"  6: pid ...
	IFS=" " read -r -a FIELDS <<< "$process"
	minutes=${FIELDS[0]}
	pid=${FIELDS[6]}

	echo "$process"

	# looking for processes that have spent more than a day using the CPU
	# Only act when the pid field is numeric too, so that a line with an
	# unexpected layout can never turn into a bogus kill target.
	if exceeds_cpu_limit "$minutes" && [[ "$pid" =~ ^[0-9]+$ ]]; then
		echo " This process has been stuck for more than $minutes minutes, so assuming it's deadlocked and we'll try to kill it:"
		echo " kill -9 $pid"
		# kill reports problems on stderr; merge it into stdout so that the
		# message gets indented like the rest of the output. Failing to kill a
		# process (e.g. it already exited) must not abort the cleanup.
		kill -9 "$pid" 2>&1 | sed 's/^/ /' || true
	fi
done

echo "No (more) processes stuck for more than a day."
|
|
@ -1,5 +1,5 @@
|
|||
# Dump the environment to see what we're working with.
|
||||
& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY\xamarin-macios\tools\devops\automation\scripts\show_env.ps1"
|
||||
& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY\$($Env:BUILD_REPOSITORY_NAME.Split('/')[1])\tools\devops\automation\scripts\show_bot_info.ps1"
|
||||
|
||||
# Set a few variables
|
||||
$Env:DOTNET = "$Env:BUILD_SOURCESDIRECTORY\xamarin-macios\tests\dotnet\Windows\bin\dotnet\dotnet.exe"
|
||||
|
|
|
@ -8,13 +8,14 @@ if ($IsMacOS -or $IsLinux) {
|
|||
Write-Host "COMPUTERNAME: ${env:COMPUTERNAME}"
|
||||
}
|
||||
|
||||
gci env: | format-table -autosize
|
||||
|
||||
gci env: | format-table -autosize | Out-String -Width 8192
|
||||
|
||||
gci env: | format-table -autosize -wrap
|
||||
Get-ChildItem env: | Sort-Object -Property Name | Format-Table -AutoSize | Out-String -Width 81920
|
||||
|
||||
if ($IsMacOS) {
|
||||
Write-Host ""
|
||||
Write-Host "## Uptime"
|
||||
Write-Host ""
|
||||
uptime
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "## System profile"
|
||||
Write-Host ""
|
||||
|
@ -25,10 +26,24 @@ if ($IsMacOS) {
|
|||
Write-Host ""
|
||||
ifconfig | grep 'inet '
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "## Top processes (ps)"
|
||||
Write-Host ""
|
||||
ps aux
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "## Top processes"
|
||||
Write-Host "## Python3 location:"
|
||||
Write-Host ""
|
||||
top -l 1 -o TIME
|
||||
which python3
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "## Pip3 version:"
|
||||
Write-Host ""
|
||||
pip3 -V
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "## Hardware info"
|
||||
Write-Host ""
|
||||
ioreg -l | grep -e Manufacturer -e 'Vendor Name'
|
||||
}
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
Write-Host "Python3 location"
|
||||
which python3
|
||||
|
||||
Write-Host "Pip3 version"
|
||||
pip3 -V
|
||||
|
|
@ -27,8 +27,8 @@ steps:
|
|||
repositoryAlias: ${{ parameters.repositoryAlias }}
|
||||
commit: ${{ parameters.commit }}
|
||||
|
||||
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
|
||||
displayName: 'Show Environment'
|
||||
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
|
||||
displayName: 'Show Bot Info'
|
||||
|
||||
- pwsh: |
|
||||
if (Test-Path "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/Artifacts" -PathType Container) {
|
||||
|
|
|
@ -65,8 +65,8 @@ steps:
|
|||
name: decisions
|
||||
displayName: 'Make decisions'
|
||||
|
||||
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
|
||||
displayName: 'Show Environment'
|
||||
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
|
||||
displayName: 'Show Bot Info'
|
||||
|
||||
- pwsh: |
|
||||
Import-Module $Env:SYSTEM_DEFAULTWORKINGDIRECTORY/xamarin-macios/tools/devops/automation/scripts/MaciosCI.psd1
|
||||
|
|
|
@ -75,8 +75,8 @@ steps:
|
|||
name: decisions
|
||||
displayName: 'Make decisions'
|
||||
|
||||
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
|
||||
displayName: 'Show Environment'
|
||||
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
|
||||
displayName: 'Show Bot Info'
|
||||
|
||||
- pwsh: |
|
||||
Import-Module $Env:SYSTEM_DEFAULTWORKINGDIRECTORY/xamarin-macios/tools/devops/automation/scripts/MaciosCI.psd1
|
||||
|
|
|
@ -17,30 +17,14 @@ steps:
|
|||
- bash: $(Build.SourcesDirectory)/xamarin-macios/tools/devops/automation/scripts/bash/fix-github-ssh-key.sh
|
||||
displayName: 'Fix GitHub SSH host key'
|
||||
|
||||
- bash: cd $(System.DefaultWorkingDirectory)/xamarin-macios/ && git clean -xdf
|
||||
displayName: 'Clean workspace'
|
||||
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
|
||||
displayName: 'Show Bot Info'
|
||||
|
||||
- bash: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/bash/clean-bot.sh
|
||||
- bash: '$SYSTEM_DEFAULTWORKINGDIRECTORY/${BUILD_REPOSITORY_NAME/#*\//}/tools/devops/automation/scripts/bash/clean-bot.sh'
|
||||
displayName: 'Clean bot'
|
||||
continueOnError: true
|
||||
timeoutInMinutes: 60
|
||||
|
||||
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
|
||||
displayName: 'Show Environment'
|
||||
|
||||
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_python_env.ps1
|
||||
displayName: 'Show Python information'
|
||||
|
||||
- bash: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/bash/delete-library-dirs.sh
|
||||
displayName: 'Delete library folders'
|
||||
timeoutInMinutes: 5
|
||||
|
||||
- bash: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/bash/clean-results-dir.sh
|
||||
workingDirectory: $(System.DefaultWorkingDirectory)
|
||||
displayName: 'Clear results directory'
|
||||
timeoutInMinutes: 5
|
||||
continueOnError: true
|
||||
|
||||
- bash: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/bash/remove-ui-prompt.sh
|
||||
env:
|
||||
OSX_KEYCHAIN_PASS: ${{ parameters.keyringPass }}
|
||||
|
|
|
@ -64,18 +64,16 @@ steps:
|
|||
condition: succeededOrFailed() # we do not care about the previous process cleanup
|
||||
continueOnError: true
|
||||
|
||||
- bash: cd $(System.DefaultWorkingDirectory)/xamarin-macios/ && git clean -xdf
|
||||
displayName: 'Clean workspace'
|
||||
|
||||
# download the packages that have been created, install them, later download the zip files that contain the already built
|
||||
# tests and execute them.
|
||||
|
||||
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
|
||||
displayName: 'Show Environment'
|
||||
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
|
||||
displayName: 'Show Bot Info'
|
||||
|
||||
- bash: |
|
||||
ioreg -l | grep -e Manufacturer -e 'Vendor Name'
|
||||
displayName: 'Dump Hardware'
|
||||
- bash: '$SYSTEM_DEFAULTWORKINGDIRECTORY/${BUILD_REPOSITORY_NAME/#*\//}/tools/devops/automation/scripts/bash/clean-bot.sh'
|
||||
displayName: 'Clean bot'
|
||||
continueOnError: true
|
||||
timeoutInMinutes: 60
|
||||
|
||||
- bash: |
|
||||
if [[ $(ioreg -l | grep -e 'VMware' | wc -l) -ne 0 ]]; then
|
||||
|
@ -95,11 +93,6 @@ steps:
|
|||
|
||||
displayName: 'Set VM Vendor'
|
||||
|
||||
- bash: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/bash/clean-bot.sh
|
||||
displayName: 'Clean bot'
|
||||
continueOnError: true
|
||||
timeoutInMinutes: 60
|
||||
|
||||
# Use a cmdlet to check if the space available in the devices root system is larger than 50 gb. If there is not
|
||||
# enough space available it:
|
||||
# 1. Set the status of the build to error. It is not a failure since no tests have been ran.
|
||||
|
@ -109,7 +102,7 @@ steps:
|
|||
Import-Module $Env:SYSTEM_DEFAULTWORKINGDIRECTORY\xamarin-macios\tools\devops\automation\scripts\MaciosCI.psd1
|
||||
|
||||
if ( -not (Test-HDFreeSpace -Size 5)) {
|
||||
Set-Content -Path "$GITHUB_FAILURE_COMMENT_FILE" -Value "Not enough free space in the host."
|
||||
Set-Content -Path "$GITHUB_FAILURE_COMMENT_FILE" -Value "Not enough free space in the host $Env:AGENT_MACHINENAME."
|
||||
exit 1
|
||||
}
|
||||
env:
|
||||
|
|
|
@ -99,7 +99,7 @@ steps:
|
|||
Import-Module ./MaciosCI.psd1
|
||||
|
||||
if ( -not (Test-HDFreeSpace -Size 20)) {
|
||||
New-GitHubComment -Header "Tests failed catastrophically on $Env:CONTEXT" -Emoji ":fire:" -Description "Not enough free space in the host."
|
||||
New-GitHubComment -Header "Tests failed catastrophically on $Env:CONTEXT" -Emoji ":fire:" -Description "Not enough free space in the host $Env:AGENT_MACHINENAME."
|
||||
Stop-Pipeline
|
||||
}
|
||||
env:
|
||||
|
|
|
@ -49,8 +49,8 @@ steps:
|
|||
|
||||
- template: download-artifacts.yml
|
||||
|
||||
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
|
||||
displayName: 'Show Environment'
|
||||
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
|
||||
displayName: 'Show Bot Info'
|
||||
|
||||
# build a message with all the content of all tests, to do so, we get the labels and to pass them to pwsh we do a join with ;
|
||||
# as the separator
|
||||
|
|
|
@ -37,8 +37,8 @@ steps:
|
|||
repositoryAlias: ${{ parameters.repositoryAlias }}
|
||||
commit: ${{ parameters.commit }}
|
||||
|
||||
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
|
||||
displayName: 'Dump Environment'
|
||||
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
|
||||
displayName: 'Show Bot Info'
|
||||
|
||||
- ${{ if or(contains(variables['Build.Reason'], 'ResourceTrigger'), contains(variables['Build.Reason'], 'BuildCompletion'), contains(variables['Build.DefinitionName'], 'xamarin-macios-ci-tests'), contains(variables['Build.DefinitionName'], 'xamarin-macios-pr-tests')) }}:
|
||||
- download: macios
|
||||
|
@ -127,9 +127,6 @@ steps:
|
|||
displayName: "Write and verify id_rsa"
|
||||
continueOnError: true
|
||||
|
||||
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
|
||||
displayName: 'Show Environment'
|
||||
|
||||
- pwsh: |
|
||||
Import-Module $Env:SYSTEM_DEFAULTWORKINGDIRECTORY\\xamarin-macios\\tools\\devops\\automation\\scripts\\MaciosCI.psd1
|
||||
ssh -v -i "$(ID_RSA_PATH)" -o IdentitiesOnly=yes -o StrictHostKeyChecking=no builder@$Env:MAC_AGENT_IP pwd
|
||||
|
|
|
@ -19,8 +19,8 @@ steps:
|
|||
repositoryAlias: ${{ parameters.repositoryAlias }}
|
||||
commit: ${{ parameters.commit }}
|
||||
|
||||
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
|
||||
displayName: 'Dump Environment'
|
||||
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
|
||||
displayName: 'Show Bot Info'
|
||||
|
||||
- task: AzureKeyVault@2
|
||||
inputs:
|
||||
|
|
|
@ -33,6 +33,14 @@ steps:
|
|||
- checkout: maccore
|
||||
persistCredentials: true # hugely important, else there are some scripts that check a single file from maccore that will fail
|
||||
|
||||
# The PowerShell call operator (&) is required here: a quoted string on its own is an expression that is just written to the output, it does not run the script.
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
|
||||
displayName: 'Show Bot Info'
|
||||
|
||||
- bash: '$SYSTEM_DEFAULTWORKINGDIRECTORY/${BUILD_REPOSITORY_NAME/#*\//}/tools/devops/automation/scripts/bash/clean-bot.sh'
|
||||
displayName: 'Clean bot'
|
||||
continueOnError: true
|
||||
timeoutInMinutes: 60
|
||||
|
||||
- bash: $(Build.SourcesDirectory)/xamarin-macios/tools/devops/automation/scripts/disable-codeql-arm64.sh
|
||||
displayName: 'Disable CodeQL on arm64'
|
||||
name: disableCodeQLOnArm64
|
||||
|
|
Загрузка…
Ссылка в новой задаче