Feature: Add hnsremediator for hns crash in Windows nodes (#4975)

* Feature: Add hnsremediator for hns crash in Windows nodes
This commit is contained in:
vepasupu 2022-10-07 21:32:36 +05:30 коммит произвёл GitHub
Родитель 02d8455c5c
Коммит 220edded4b
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
2 изменённых файлов: 103 добавлений и 0 удалений

Просмотреть файл

@ -0,0 +1,57 @@
<#
.DESCRIPTION
HNS service may crash and HNS policies will be purged after it is restarted.
We use this script to restart kubeproxy to recover the node from the hns crash.
Start sequence:
1. windowsnodereset.ps1 deletes hns-remediator-task if hns-remediator-task exists
2. windowsnodereset.ps1 deletes "C:\k\hns.pid" if "C:\k\hns.pid" exists
3. windowsnodereset.ps1 resets all services, hns, csi, kubeproxy, kubelet, etc.
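4. windowsnodereset.ps1 creates hns-remediator-task, which runs this script every minute.
The interval is currently hardcoded to 1 minute in Register-HNSRemediatorScriptTask; it is not read from $Global:ClusterConfiguration.Services.HNSRemediator.IntervalInMinutes in c:\k\kubeclusterconfig.json.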
NOTES:
1. We cannot run hns-remediator-task with an interval of less than 1 minute since the RepetitionInterval parameter value in New-JobTrigger must be at least 1 minute.
2. When the node crashes or is rebooted, hns-remediator-task may restart kubeproxy before windowsnodereset.ps1 is executed.
It should have no impact since windowsnodereset.ps1 always deletes hns-remediator-task and then deletes "C:\k\hns.pid" before stopping kubeproxy
#>
# Log file written by this script.
$LogPath = "c:\k\hnsremediator.log"
# File that stores the PID of the HNS service observed by the previous run.
$hnsPIDFilePath = "C:\k\hns.pid"
# Becomes $True once the PID file is known to exist.
$isInitialized = $False

# Prefixes each pipeline item with the current time in round-trip ("o") format.
filter Timestamp {
    $now = Get-Date -Format o
    "${now}: $_"
}

# Appends a timestamped message to $LogPath and also emits it to the pipeline.
function Write-Log {
    param($message)
    $message | Timestamp | Tee-Object -FilePath $LogPath -Append
}
# Detect an HNS restart by comparing the current PID of the hns service with
# the PID recorded in $hnsPIDFilePath by a previous run of this script.
$isInitialized = Test-Path -Path $hnsPIDFilePath
$id = Get-WmiObject -Class Win32_Service -Filter "name='hns'" | Select-Object -ExpandProperty ProcessId

if (!$isInitialized) {
    # No PID file yet: record the current PID as the baseline.
    Write-Log "Initializing with creating $hnsPIDFilePath. PID of HNS service is $id"
    echo $id > $hnsPIDFilePath
    $isInitialized = $True
}

$lastId = Get-Content $hnsPIDFilePath
if ($lastId -ne $id) {
    Write-Log "The PID of HNS service was changed from $lastId to $id"

    Write-Log "Restarting kubeproxy service"
    try {
        # -ErrorAction Stop turns a failed restart into a terminating error so
        # it can be logged instead of being reported as a success.
        Restart-Service kubeproxy -ErrorAction Stop
    } catch {
        Write-Log "Failed to restart kubeproxy service: $_"
        # The PID file is intentionally left unchanged so that the next run
        # still sees the PID mismatch and retries the restart.
        exit 1
    }
    Write-Log "Restarted kubeproxy service"

    # Record the new PID only after kubeproxy was restarted successfully.
    echo $id > $hnsPIDFilePath

    $calicoService = Get-Service -Name CalicoFelix -ErrorAction Ignore
    if ($calicoService) {
        Write-Log "Restarting Calico services"
        # CalicoFelix depends on CalicoNode
        # https://github.com/projectcalico/calico/blob/master/node/windows-packaging/CalicoWindows/start-calico.ps1#L20
        # https://github.com/projectcalico/calico/blob/35b0c499dc0b01d228cf70ba942afe4eb1b6a961/node/windows-packaging/CalicoWindows/felix/felix-service.ps1#L21
        # Best effort: errors from the Calico restarts are deliberately ignored.
        Restart-Service CalicoNode -ErrorAction Ignore
        Restart-Service CalicoFelix -ErrorAction Ignore
        Write-Log "Restarted Calico services"
    }
}

Просмотреть файл

@ -27,6 +27,8 @@ Write-Log "Entering windowsnodereset.ps1"
Import-Module $global:HNSModule
# Remove hns-remediator-task and C:\k\hns.pid before any service is stopped, so the
# remediator does not act on a stale PID while this script resets the node.
Unregister-HNSRemediatorScriptTask
#
# Stop services
#
@ -46,6 +48,48 @@ if ($global:EnableHostsConfigAgent) {
Stop-Service hosts-config-agent
}
# Registers the scheduled task "hns-remediator-task", which runs
# c:\k\hnsremediator.ps1 as SYSTEM with the highest run level, repeating
# indefinitely at a fixed interval.
function Register-HNSRemediatorScriptTask {
    # Hardcoding RepetitionInterval to 1 Minute
    # Making it variable would need a new parameter to be added under windowsProfile
    Write-Log "Creating a scheduled task to run hnsremediator.ps1"

    $everyMinute = New-TimeSpan -Minutes 1
    $taskParts = @{
        Action      = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-File `"c:\k\hnsremediator.ps1`""
        Principal   = New-ScheduledTaskPrincipal -UserId SYSTEM -LogonType ServiceAccount -RunLevel Highest
        Trigger     = New-JobTrigger -Once -At (Get-Date).Date -RepeatIndefinitely -RepetitionInterval $everyMinute
        Description = "hns-remediator-task"
    }

    $task = New-ScheduledTask @taskParts
    Register-ScheduledTask -TaskName "hns-remediator-task" -InputObject $task
}
# Removes the scheduled task "hns-remediator-task" (if it exists) and then
# deletes C:\k\hns.pid (if it exists), retrying the deletion a bounded number
# of times. Takes no parameters; progress is reported through Write-Log.
#
# NOTE(review): the diff hunks suggest this function is invoked near the top of
# the script, before this definition is reached. PowerShell needs a function to
# be defined before the call executes - confirm the definition order.
function Unregister-HNSRemediatorScriptTask {
    if (Get-ScheduledTask -TaskName "hns-remediator-task" -ErrorAction Ignore) {
        Write-Log "Deleting the scheduled task hns-remediator-task"
        Unregister-ScheduledTask -TaskName "hns-remediator-task" -Confirm:$false
    }

    $hnsPIDFile = "C:\k\hns.pid"
    if (!(Test-Path $hnsPIDFile)) {
        return
    }

    # Remove this file since PID of HNS service may have been changed after node crashes or is rebooted
    # It should not always fail since hns-remediator-task is unregistered.
    # We set the max retry count to 20 to avoid dead loop for unknown issues.
    $maxRetries = 20
    for ($retryCount = 0; $retryCount -lt $maxRetries; $retryCount++) {
        Write-Log "Deleting $hnsPIDFile"
        Remove-Item -Path $hnsPIDFile -Force -Confirm:$false -ErrorAction Ignore

        if (!(Test-Path $hnsPIDFile)) {
            Write-Log "$hnsPIDFile is deleted"
            return
        }

        # The file may not be deleted successfully because hnsremediator.ps1 is still writing the logs
        # Do not log each failed attempt to reduce log
        Start-Sleep -Milliseconds 500
    }

    # All attempts failed: report it once so a stale PID file is not left behind silently.
    Write-Log "Failed to delete $hnsPIDFile after $maxRetries attempts"
}
# Due to a bug in hns there is a race where it picks up the incorrect IPv6 address from the node in some cases.
# Hns service has to be restarted after the node internal IPv6 address is available when dual-stack is enabled.
# TODO Remove this once the bug is fixed in hns.
@ -109,4 +153,6 @@ Start-Service kubelet
Write-Log "Do not start kubeproxy service since kubelet will restart kubeproxy"
# Re-create hns-remediator-task now that the services have been reset; the task
# runs c:\k\hnsremediator.ps1 periodically.
Register-HNSRemediatorScriptTask
Write-Log "Exiting windowsnodereset.ps1"