зеркало из https://github.com/Azure/aks-engine.git
Feature: Add hnsremediator for hns crash in Windows nodes (#4975)
* Feature: Add hnsremediator for hns crash in Windows nodes
This commit is contained in:
Родитель
02d8455c5c
Коммит
220edded4b
|
@ -0,0 +1,57 @@
|
|||
<#
|
||||
.DESCRIPTION
|
||||
HNS service may crash and HNS policies will be purged after it is restarted.
|
||||
We use this script to restart kubeproxy to recover the node from the hns crash.
|
||||
Start sequence:
|
||||
1. windowsnodereset.ps1 deletes hns-remediator-task if hns-remediator-task exists
|
||||
2. windowsnodereset.ps1 deletes "C:\k\hns.pid" if "C:\k\hns.pid" exists
|
||||
3. windowsnodereset.ps1 resets all services, hns, csi, kubeproxy, kubelet, etc.
|
||||
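4. windowsnodereset.ps1 creates hns-remediator-task, which runs this script every minute.
   The interval is currently hardcoded in Register-HNSRemediatorScriptTask; it is not read from
   $Global:ClusterConfiguration.Services.HNSRemediator.IntervalInMinutes in c:\k\kubeclusterconfig.json,
   and a value of 0 there does not disable the task.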
|
||||
NOTES:
|
||||
1. We cannot run hns-remediator-task with an interval less than 1 minute since the RepetitionInterval parameter value in New-JobTrigger must be at least 1 minute.
|
||||
2. When the node crashes or is rebooted, hns-remediator-task may restart kubeproxy before windowsnodereset.ps1 is executed.
|
||||
It should have no impact since windowsnodereset.ps1 always deletes hns-remediator-task and then deletes "C:\k\hns.pid" before stopping kubeproxy
|
||||
#>
|
||||
|
||||
# File that Write-Log appends every timestamped message to.
$LogPath        = "c:\k\hnsremediator.log"

# File in which this script stores the HNS process ID it saw on its previous run.
$hnsPIDFilePath = "C:\k\hns.pid"

# Set to $True further down once the PID file is known to exist.
$isInitialized  = $False
|
||||
|
||||
# Prefixes each pipeline item with the current time in round-trip ("o") format,
# followed by ": ".
filter Timestamp {
    $now = Get-Date -Format o
    "${now}: $_"
}
|
||||
|
||||
# Writes a timestamped message to the pipeline and appends the same line to $LogPath.
#   $message - the text (or objects) to log; each item is stamped by the Timestamp filter.
function Write-Log {
    param($message)

    $message |
        Timestamp |
        Tee-Object -FilePath $LogPath -Append
}
|
||||
|
||||
# The PID file is present only if an earlier run of this script has already recorded a PID.
$isInitialized = Test-Path -Path $hnsPIDFilePath

# Process ID currently reported by WMI for the hns service.
$id = Get-WmiObject -Class Win32_Service -Filter "name='hns'" | Select-Object -ExpandProperty ProcessId

if (-not $isInitialized) {
    # First run since the PID file was removed: store the current PID as the baseline.
    Write-Log "Initializing with creating $hnsPIDFilePath. PID of HNS service is $id"
    Write-Output $id > $hnsPIDFilePath
    $isInitialized = $True
}

# Compare the PID stored by the previous run with the one observed now.
$lastId = Get-Content $hnsPIDFilePath
if ($lastId -ne $id) {
    Write-Log "The PID of HNS service was changed from $lastId to $id"

    # NOTE(review): the PID file is rewritten before the restart below is attempted, so a
    # failed Restart-Service is not retried on the next run - confirm this is intended.
    Write-Output $id > $hnsPIDFilePath

    Write-Log "Restarting kubeproxy service"
    Restart-Service kubeproxy
    Write-Log "Restarted kubeproxy service"

    # Calico services are restarted only when the CalicoFelix service exists on this node.
    if (Get-Service -Name CalicoFelix -ErrorAction Ignore) {
        Write-Log "Restarting Calico services"

        # CalicoFelix depends on CalicoNode, so CalicoNode is restarted first.
        # https://github.com/projectcalico/calico/blob/master/node/windows-packaging/CalicoWindows/start-calico.ps1#L20
        # https://github.com/projectcalico/calico/blob/35b0c499dc0b01d228cf70ba942afe4eb1b6a961/node/windows-packaging/CalicoWindows/felix/felix-service.ps1#L21
        foreach ($calicoServiceName in 'CalicoNode', 'CalicoFelix') {
            Restart-Service $calicoServiceName -ErrorAction Ignore
        }

        Write-Log "Restarted Calico services"
    }
}
|
|
@ -27,6 +27,8 @@ Write-Log "Entering windowsnodereset.ps1"
|
|||
|
||||
Import-Module $global:HNSModule
|
||||
|
||||
# Remove the remediator task (and its PID file) before any service is stopped.
# NOTE(review): in this diff the call sits above the `function Unregister-HNSRemediatorScriptTask`
# definition. PowerShell only knows a function after its `function` statement has executed, so
# unless the function is defined earlier (outside the visible hunks) this call fails with
# "term not recognized" - confirm, and move the function definitions above this line if so.
Unregister-HNSRemediatorScriptTask
|
||||
|
||||
#
|
||||
# Stop services
|
||||
#
|
||||
|
@ -46,6 +48,48 @@ if ($global:EnableHostsConfigAgent) {
|
|||
Stop-Service hosts-config-agent
|
||||
}
|
||||
|
||||
# Registers the scheduled task "hns-remediator-task", which runs c:\k\hnsremediator.ps1
# as SYSTEM with the highest run level, repeating indefinitely.
# Emits whatever Write-Log and Register-ScheduledTask write to the pipeline.
function Register-HNSRemediatorScriptTask {
    # The repetition interval is hardcoded to 1 minute.
    # Making it configurable would need a new parameter under windowsProfile.
    Write-Log "Creating a scheduled task to run hnsremediator.ps1"

    $taskName    = "hns-remediator-task"
    $everyMinute = New-TimeSpan -Minutes 1

    $taskAction    = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-File `"c:\k\hnsremediator.ps1`""
    $taskPrincipal = New-ScheduledTaskPrincipal -UserId SYSTEM -LogonType ServiceAccount -RunLevel Highest
    # Anchored at midnight today and repeated forever at the interval above.
    $taskTrigger   = New-JobTrigger -Once -At (Get-Date).Date -RepeatIndefinitely -RepetitionInterval $everyMinute

    $taskSettings = @{
        Action      = $taskAction
        Principal   = $taskPrincipal
        Trigger     = $taskTrigger
        Description = $taskName
    }
    $taskDefinition = New-ScheduledTask @taskSettings

    Register-ScheduledTask -TaskName $taskName -InputObject $taskDefinition
}
|
||||
|
||||
# Removes the scheduled task "hns-remediator-task" (if registered) and then deletes the
# HNS PID file C:\k\hns.pid (if present), retrying the deletion up to 20 times.
# Takes no parameters. Emits whatever Write-Log writes to the pipeline.
function Unregister-HNSRemediatorScriptTask {
    if (Get-ScheduledTask -TaskName "hns-remediator-task" -ErrorAction Ignore) {
        Write-Log "Deleting the scheduled task hns-remediator-task"
        Unregister-ScheduledTask -TaskName "hns-remediator-task" -Confirm:$false
    }

    $hnsPIDFile = "C:\k\hns.pid"
    if (-not (Test-Path $hnsPIDFile)) {
        return
    }

    # Remove this file since the PID of the HNS service may have changed after the node
    # crashed or was rebooted.
    # Deletion should not keep failing because hns-remediator-task is unregistered above;
    # the retry count is capped at 20 so an unknown issue cannot cause an endless loop.
    $maxRetries = 20

    # Logged once, not once per attempt, to keep the log small.
    Write-Log "Deleting $hnsPIDFile"

    for ($attempt = 1; $attempt -le $maxRetries; $attempt++) {
        Remove-Item -Path $hnsPIDFile -Force -Confirm:$false -ErrorAction Ignore

        # The file may not be deleted successfully because hnsremediator.ps1 is still running.
        if (-not (Test-Path $hnsPIDFile)) {
            Write-Log "$hnsPIDFile is deleted"
            return
        }

        # Individual failed attempts are not logged; wait before the next try,
        # but do not sleep after the final attempt.
        if ($attempt -lt $maxRetries) {
            Start-Sleep -Milliseconds 500
        }
    }

    # Make an exhausted retry budget visible instead of failing silently.
    Write-Log "Failed to delete $hnsPIDFile after $maxRetries attempts"
}
|
||||
|
||||
# Due to a bug in hns there is a race where it picks up the incorrect IPv6 address from the node in some cases.
|
||||
# Hns service has to be restarted after the node internal IPv6 address is available when dual-stack is enabled.
|
||||
# TODO Remove this once the bug is fixed in hns.
|
||||
|
@ -109,4 +153,6 @@ Start-Service kubelet
|
|||
|
||||
Write-Log "Do not start kubeproxy service since kubelet will restart kubeproxy"
|
||||
|
||||
Register-HNSRemediatorScriptTask
|
||||
|
||||
Write-Log "Exiting windowsnodereset.ps1"
|
||||
|
|
Загрузка…
Ссылка в новой задаче