[devops] Improve diagnostics and implement deadlocked process termination. (#21317)

* Unify the code to collect diagnostic information about a bot.
* Unify some of the cleanup code to prepare a bot as well.
* Implement code to terminate processes on a bot that have used more than a day
  of CPU time (presumably these processes are stuck for some reason).
This commit is contained in:
Rolf Bjarne Kvinge 2024-10-11 13:26:45 +02:00 коммит произвёл GitHub
Родитель 2f7c126ac9
Коммит 48f5591829
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
18 изменённых файлов: 105 добавлений и 74 удалений

Просмотреть файл

@ -12,6 +12,15 @@ df -h
# We don't care about errors in this section, we just want to clean as much as possible
set +e
# Clean workspace
(
  # The workspace is the checkout of the current repository: a directory named
  # after the last path component of BUILD_REPOSITORY_NAME, located inside
  # SYSTEM_DEFAULTWORKINGDIRECTORY. This runs in a subshell so that the 'cd'
  # doesn't affect the rest of the script.
  #
  # Don't do anything if either variable is empty, otherwise the path would be
  # resolved relative to the root of the filesystem.
  if test -n "${SYSTEM_DEFAULTWORKINGDIRECTORY:-}" && test -n "${BUILD_REPOSITORY_NAME:-}"; then
    REPO_PATH="$SYSTEM_DEFAULTWORKINGDIRECTORY/$(basename "$BUILD_REPOSITORY_NAME")"
    if test -d "$REPO_PATH"; then
      # Errors are ignored in this section (set +e), so make sure 'git clean'
      # only runs if we actually managed to enter the repository.
      cd "$REPO_PATH" && git clean -xfd
    fi
  fi
)
# Delete all the simulator devices. These can take up a lot of space over time (I've seen 100+GB on the bots)
/Applications/Xcode.app/Contents/Developer/usr/bin/simctl delete all
@ -118,7 +127,9 @@ XCODE_SELECT=$(xcode-select -p)
for oldXcode in "${oldXcodes[@]}"; do
if [ "$XCODE_SELECT" != "$oldXcode/Contents/Developer" ]; then
sudo rm -Rf "$oldXcode"
if test -d "$oldXcode"; then
sudo rm -Rf "$oldXcode"
fi
else
echo "Not removing $oldXcode because is the currently selected one."
fi
@ -126,6 +137,13 @@ done
DIR="$(dirname "${BASH_SOURCE[0]}")"
"$DIR"/clean-simulator-runtime.sh
"$DIR"/kill-deadlocked-processes.sh
# Remove legacy Xamarin/MonoTouch stuff
sudo rm -Rf /Developer/MonoTouch
sudo rm -Rf /Library/Frameworks/Xamarin.iOS.framework
sudo rm -Rf /Library/Frameworks/Xamarin.Mac.framework
ls -R /Library/Frameworks
# Print disk status after cleaning
df -h

Просмотреть файл

@ -1,4 +0,0 @@
#!/bin/bash -ex
rm -Rvf package
time make -C xamarin-macios/ git-clean-all

Просмотреть файл

@ -5,6 +5,11 @@
set -o pipefail
IFS=$'\n\t'
# Delete all watchOS simulator runtimes, we don't need them anymore.
#
# Only the lines of 'simctl runtime list' that mention watchOS followed by
# "Ready" are considered. The two sed expressions reduce each such line to a
# single token: the first one removes everything up to and including the last
# " - ", the second one removes everything from the first remaining space
# onwards. That token is what gets passed to 'simctl runtime delete'
# (presumably the runtime identifier - verify against the output format of
# 'simctl runtime list').
#
# The loop relies on word splitting of the unquoted command substitution; with
# IFS set to newline+tab above, that means one iteration per output line.
for i in $(xcrun simctl runtime list | grep "watchOS.*Ready" | sed -e 's/.* - //' -e 's/ .*//'); do
xcrun simctl runtime delete "$i"
done
xcrun simctl runtime list -j > simruntime.json
cat simruntime.json

Просмотреть файл

@ -1,6 +0,0 @@
#!/bin/bash -ex
sudo rm -Rf /Developer/MonoTouch
sudo rm -Rf /Library/Frameworks/Xamarin.iOS.framework
sudo rm -Rf /Library/Frameworks/Xamarin.Mac.framework
ls -R /Library/Frameworks

Просмотреть файл

@ -0,0 +1,27 @@
#!/bin/bash -e
#
# Kills processes owned by the current user that have accumulated more than a
# day of CPU time, on the assumption that they are deadlocked.
#
# Every process is printed (sorted by CPU time, highest first), so that the log
# shows what was running whether or not anything ended up being killed.
#
# NOTE: this is written to work with old versions of bash (no mapfile, etc.).

# The options in the shebang are lost if the script is executed as
# 'bash <script>', so enable exit-on-error explicitly as well.
set -e

# Processes that have used more CPU time than this many minutes are killed
# (24h * 60min = 1440min).
MAX_CPU_MINUTES=1440

# Succeeds if $1 consists solely of decimal digits (and isn't empty).
is_number() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

# Lists the processes for the current user, one per line, including the CPU time.
# The CPU time (matched as <digits>:<2 digits>.<2 digits>) is split into
# separate fields ("<minutes> m <seconds> s <fraction> ms"), so that the total
# number of minutes is the first field of each line and the pid the seventh.
# Note that the fraction printed with the "ms" label is copied verbatim from
# the two digits after the decimal point.
list_processes() {
  ps -o cputime=,pid=,user=,lstart=,args= -U "${USER:-$(id -un)}" -w -w -w \
    | sed -e 's/\([0-9]*\):\([0-9][0-9]\)\.\([0-9][0-9]\)/\1 m \2 s \3 ms/' \
    | sort -nr
}

# Forcibly kills the process with pid $1, indenting anything 'kill' prints.
# 'kill' reports problems on stderr, so stderr has to be redirected into the
# pipe for the indentation to apply. Failures are ignored on purpose: the
# process may already be gone, and this is a best-effort cleanup.
kill_process() {
  kill -9 "$1" 2>&1 | sed 's/^/ /' || true
}

echo "Looking for processes that have been stuck for more than a day, and will try to kill them."

PROCESSES=()
while IFS='' read -r line; do
  PROCESSES+=("$line")
done < <(list_processes)

for process in "${PROCESSES[@]}"; do
  IFS=" " read -r -a FIELDS <<< "$process"
  minutes=${FIELDS[0]}
  pid=${FIELDS[6]}
  echo "$process"

  # If the CPU time wasn't in the expected format, the fields won't be where
  # we expect them. Never kill anything based on a line we couldn't parse.
  if ! is_number "$minutes" || ! is_number "$pid"; then
    continue
  fi

  # looking for processes that have spent more than a day using the CPU.
  # The 10# prefix forces base 10, so that a value with a leading zero isn't
  # interpreted as an (invalid) octal number.
  if (( 10#$minutes > MAX_CPU_MINUTES )); then
    echo " This process has been stuck for more than $minutes minutes, so assuming it's deadlocked and we'll try to kill it:"
    echo " kill -9 $pid"
    kill_process "$pid"
  fi
done
echo "No (more) processes stuck for more than a day."

Просмотреть файл

@ -1,5 +1,5 @@
# Dump the environment to see what we're working with.
& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY\xamarin-macios\tools\devops\automation\scripts\show_env.ps1"
& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY\$($Env:BUILD_REPOSITORY_NAME.Split('/')[1])\tools\devops\automation\scripts\show_bot_info.ps1"
# Set a few variables
$Env:DOTNET = "$Env:BUILD_SOURCESDIRECTORY\xamarin-macios\tests\dotnet\Windows\bin\dotnet\dotnet.exe"

Просмотреть файл

@ -8,13 +8,14 @@ if ($IsMacOS -or $IsLinux) {
Write-Host "COMPUTERNAME: ${env:COMPUTERNAME}"
}
gci env: | format-table -autosize
gci env: | format-table -autosize | Out-String -Width 8192
gci env: | format-table -autosize -wrap
Get-ChildItem env: | Sort-Object -Property Name | Format-Table -AutoSize | Out-String -Width 81920
if ($IsMacOS) {
Write-Host ""
Write-Host "## Uptime"
Write-Host ""
uptime
Write-Host ""
Write-Host "## System profile"
Write-Host ""
@ -25,10 +26,24 @@ if ($IsMacOS) {
Write-Host ""
ifconfig | grep 'inet '
Write-Host ""
Write-Host "## Top processes (ps)"
Write-Host ""
ps aux
Write-Host ""
Write-Host "## Top processes"
Write-Host "## Python3 location:"
Write-Host ""
top -l 1 -o TIME
which python3
Write-Host ""
Write-Host "## Pip3 version:"
Write-Host ""
pip3 -V
Write-Host ""
Write-Host "## Hardware info"
Write-Host ""
ioreg -l | grep -e Manufacturer -e 'Vendor Name'
}

Просмотреть файл

@ -1,6 +0,0 @@
Write-Host "Python3 location"
which python3
Write-Host "Pip3 version"
pip3 -V

Просмотреть файл

@ -27,8 +27,8 @@ steps:
repositoryAlias: ${{ parameters.repositoryAlias }}
commit: ${{ parameters.commit }}
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
displayName: 'Show Environment'
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
displayName: 'Show Bot Info'
- pwsh: |
if (Test-Path "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/Artifacts" -PathType Container) {

Просмотреть файл

@ -65,8 +65,8 @@ steps:
name: decisions
displayName: 'Make decisions'
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
displayName: 'Show Environment'
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
displayName: 'Show Bot Info'
- pwsh: |
Import-Module $Env:SYSTEM_DEFAULTWORKINGDIRECTORY/xamarin-macios/tools/devops/automation/scripts/MaciosCI.psd1

Просмотреть файл

@ -75,8 +75,8 @@ steps:
name: decisions
displayName: 'Make decisions'
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
displayName: 'Show Environment'
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
displayName: 'Show Bot Info'
- pwsh: |
Import-Module $Env:SYSTEM_DEFAULTWORKINGDIRECTORY/xamarin-macios/tools/devops/automation/scripts/MaciosCI.psd1

Просмотреть файл

@ -17,30 +17,14 @@ steps:
- bash: $(Build.SourcesDirectory)/xamarin-macios/tools/devops/automation/scripts/bash/fix-github-ssh-key.sh
displayName: 'Fix GitHub SSH host key'
- bash: cd $(System.DefaultWorkingDirectory)/xamarin-macios/ && git clean -xdf
displayName: 'Clean workspace'
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
displayName: 'Show Bot Info'
- bash: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/bash/clean-bot.sh
- bash: '$SYSTEM_DEFAULTWORKINGDIRECTORY/${BUILD_REPOSITORY_NAME/#*\//}/tools/devops/automation/scripts/bash/clean-bot.sh'
displayName: 'Clean bot'
continueOnError: true
timeoutInMinutes: 60
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
displayName: 'Show Environment'
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_python_env.ps1
displayName: 'Show Python information'
- bash: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/bash/delete-library-dirs.sh
displayName: 'Delete library folders'
timeoutInMinutes: 5
- bash: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/bash/clean-results-dir.sh
workingDirectory: $(System.DefaultWorkingDirectory)
displayName: 'Clear results directory'
timeoutInMinutes: 5
continueOnError: true
- bash: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/bash/remove-ui-prompt.sh
env:
OSX_KEYCHAIN_PASS: ${{ parameters.keyringPass }}

Просмотреть файл

@ -64,18 +64,16 @@ steps:
condition: succeededOrFailed() # we do not care about the previous process cleanup
continueOnError: true
- bash: cd $(System.DefaultWorkingDirectory)/xamarin-macios/ && git clean -xdf
displayName: 'Clean workspace'
# download the packages that have been created, install them, later download the zip files that contain the already built
# tests and execute them.
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
displayName: 'Show Environment'
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
displayName: 'Show Bot Info'
- bash: |
ioreg -l | grep -e Manufacturer -e 'Vendor Name'
displayName: 'Dump Hardware'
- bash: '$SYSTEM_DEFAULTWORKINGDIRECTORY/${BUILD_REPOSITORY_NAME/#*\//}/tools/devops/automation/scripts/bash/clean-bot.sh'
displayName: 'Clean bot'
continueOnError: true
timeoutInMinutes: 60
- bash: |
if [[ $(ioreg -l | grep -e 'VMware' | wc -l) -ne 0 ]]; then
@ -95,11 +93,6 @@ steps:
displayName: 'Set VM Vendor'
- bash: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/bash/clean-bot.sh
displayName: 'Clean bot'
continueOnError: true
timeoutInMinutes: 60
# Use a cmdlet to check if the space available in the devices root system is larger than 50 gb. If there is not
# enough space available it:
# 1. Set the status of the build to error. It is not a failure since no tests have been run.
@ -109,7 +102,7 @@ steps:
Import-Module $Env:SYSTEM_DEFAULTWORKINGDIRECTORY\xamarin-macios\tools\devops\automation\scripts\MaciosCI.psd1
if ( -not (Test-HDFreeSpace -Size 5)) {
Set-Content -Path "$GITHUB_FAILURE_COMMENT_FILE" -Value "Not enough free space in the host."
Set-Content -Path "$GITHUB_FAILURE_COMMENT_FILE" -Value "Not enough free space in the host $Env:AGENT_MACHINENAME."
exit 1
}
env:

Просмотреть файл

@ -99,7 +99,7 @@ steps:
Import-Module ./MaciosCI.psd1
if ( -not (Test-HDFreeSpace -Size 20)) {
New-GitHubComment -Header "Tests failed catastrophically on $Env:CONTEXT" -Emoji ":fire:" -Description "Not enough free space in the host."
New-GitHubComment -Header "Tests failed catastrophically on $Env:CONTEXT" -Emoji ":fire:" -Description "Not enough free space in the host $Env:AGENT_MACHINENAME."
Stop-Pipeline
}
env:

Просмотреть файл

@ -49,8 +49,8 @@ steps:
- template: download-artifacts.yml
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
displayName: 'Show Environment'
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
displayName: 'Show Bot Info'
# build a message with all the content of all tests, to do so, we get the labels and to pass them to pwsh we do a join with ;
# as the separator

Просмотреть файл

@ -37,8 +37,8 @@ steps:
repositoryAlias: ${{ parameters.repositoryAlias }}
commit: ${{ parameters.commit }}
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
displayName: 'Dump Environment'
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
displayName: 'Show Bot Info'
- ${{ if or(contains(variables['Build.Reason'], 'ResourceTrigger'), contains(variables['Build.Reason'], 'BuildCompletion'), contains(variables['Build.DefinitionName'], 'xamarin-macios-ci-tests'), contains(variables['Build.DefinitionName'], 'xamarin-macios-pr-tests')) }}:
- download: macios
@ -127,9 +127,6 @@ steps:
displayName: "Write and verify id_rsa"
continueOnError: true
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
displayName: 'Show Environment'
- pwsh: |
Import-Module $Env:SYSTEM_DEFAULTWORKINGDIRECTORY\\xamarin-macios\\tools\\devops\\automation\\scripts\\MaciosCI.psd1
ssh -v -i "$(ID_RSA_PATH)" -o IdentitiesOnly=yes -o StrictHostKeyChecking=no builder@$Env:MAC_AGENT_IP pwd

Просмотреть файл

@ -19,8 +19,8 @@ steps:
repositoryAlias: ${{ parameters.repositoryAlias }}
commit: ${{ parameters.commit }}
- pwsh: $(System.DefaultWorkingDirectory)/xamarin-macios/tools/devops/automation/scripts/show_env.ps1
displayName: 'Dump Environment'
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
displayName: 'Show Bot Info'
- task: AzureKeyVault@2
inputs:

Просмотреть файл

@ -33,6 +33,14 @@ steps:
- checkout: maccore
persistCredentials: true # hugely important, else there are some scripts that check a single file from maccore that will fail
# The call operator (&) is required here: without it PowerShell just evaluates the quoted path as a string (and prints it) instead of executing the script.
- pwsh: '& "$Env:SYSTEM_DEFAULTWORKINGDIRECTORY/$($Env:BUILD_REPOSITORY_NAME.Split(''/'')[1])/tools/devops/automation/scripts/show_bot_info.ps1"'
displayName: 'Show Bot Info'
- bash: '$SYSTEM_DEFAULTWORKINGDIRECTORY/${BUILD_REPOSITORY_NAME/#*\//}/tools/devops/automation/scripts/bash/clean-bot.sh'
displayName: 'Clean bot'
continueOnError: true
timeoutInMinutes: 60
- bash: $(Build.SourcesDirectory)/xamarin-macios/tools/devops/automation/scripts/disable-codeql-arm64.sh
displayName: 'Disable CodeQL on arm64'
name: disableCodeQLOnArm64