diff --git a/Kubernetes/windows/debug/faulttolerance/faultAnalysis.ps1 b/Kubernetes/windows/debug/faulttolerance/faultAnalysis.ps1
new file mode 100644
index 00000000..62bf9d77
--- /dev/null
+++ b/Kubernetes/windows/debug/faulttolerance/faultAnalysis.ps1
@@ -0,0 +1,62 @@
+# Scans the Windows event logs of the node it runs on and prints a fault summary:
+#   - unexpected terminations of the Host Network Service (HNS) recorded by the
+#     Service Control Manager in the System log
+#   - known faults that were handled gracefully, i.e. the code path of a fix ran
+# faultTolerance.ps1 matches the phrases "HNS crash detected", "HNS crash not detected"
+# and "gracefully handled" in this output, so do not reword them.
+
+# Enlist the fixed crashes to detect codepath execution.
+# faultStr is a -like wildcard matched against the event message; bugId is printed in the report.
+$fixLogs = @(
+    [pscustomobject]@{
+        faultStr='*ElbDsrPolicy-Update-Failure*';
+        bugId='41071049';
+    },
+    [pscustomobject]@{
+        faultStr='*Network-Not-Found*';
+        bugId='42521831';
+    }
+)
+
+$errStr=""
+$crashDetected=$false
+
+# Get-WinEvent raises an error when no event matches the filter, which is the normal
+# healthy case here, so that error is silenced. The pipeline is wrapped in @() so the
+# result is always an array: empty for no match, one element for a single match.
+# The wildcard before "terminated" makes sure the usual Service Control Manager wording
+# "The <display name> service terminated unexpectedly" is matched too.
+$hnsCrashEvts=@(Get-WinEvent -FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } -ErrorAction SilentlyContinue |
+    Where-Object Message -like "*The Host Network Service*terminated unexpectedly*" |
+    ForEach-Object { $_.TimeCreated })
+if($hnsCrashEvts.Count -gt 0) {
+    $crashDetected=$true
+    # Log HNS Crashes
+    $errStr += "HNS crash detected at ";
+    foreach ($ts in $hnsCrashEvts) {
+        $errStr += "("+$ts+") ";
+    }
+    $errStr += "`n";
+}
+
+# Read the HNS admin log once and filter it in memory for every known fault,
+# instead of querying the whole log again for each entry.
+$hnsAdminEvts=@(Get-WinEvent -FilterHashtable @{logname = 'Microsoft-Windows-Host-Network-Service-Admin' } -ErrorAction SilentlyContinue |
+    Select-Object -Property TimeCreated, Message)
+
+foreach($fixLog in $fixLogs) {
+    $faultEvent=@($hnsAdminEvts | Where-Object Message -like $fixLog.faultStr | ForEach-Object { $_.TimeCreated })
+    if ($faultEvent.Count -gt 0) {
+        $errStr += "Bug #" + $fixLog.bugId + " gracefully handled at ";
+        foreach ($ts in $faultEvent) {
+            $errStr += "("+$ts+") ";
+        }
+        $errStr += "`n";
+    }
+}
+
+if (-not $crashDetected) {
+    Write-Host "$(Get-Date) HNS crash not detected"
+}
+
+Write-Host $errStr;
diff --git a/Kubernetes/windows/debug/faulttolerance/faultTolerance.md b/Kubernetes/windows/debug/faulttolerance/faultTolerance.md
new file mode 100644
index 00000000..3bbe8942
--- /dev/null
+++ b/Kubernetes/windows/debug/faulttolerance/faultTolerance.md
@@ -0,0 +1,36 @@
+# faultTolerance.ps1
+
+This tool analyzes Host Network Service (HNS) faults and provides a concise summary, mitigation steps, or automatic mitigation of issues.
+
+## Instructions for AKS cluster
+
+### With PowerShell access to the cluster (kubectl)
+
+1. Run the **faultTolerance.ps1** script from a PowerShell session that has access to the AKS cluster, using this command:
+```
+    .\faultTolerance.ps1
+    daemonset.apps/faulttolerance created
+    Sleep for a minute for fault tolerance pods to be up...
+    **No HNS crashes detected in the cluster**
+    Sleep for an hour before deleting the fault tolerance pods automatically...
+    daemonset.apps "faulttolerance" deleted
+```
+
+### Without PowerShell access to the cluster (kubectl)
+
+1. Apply the **faulttolerance.yaml** manifest on the AKS cluster using these commands:
+```
+    Cleanup the previous instance of the daemon set and re-apply.
+
+    kubectl delete -f faulttolerance.yaml
+    kubectl apply -f faulttolerance.yaml
+```
+
+2. Wait for 5 minutes, then redirect the output of the following command to a text file and provide it to the support engineer.
+```
+    kubectl logs -l name=faulttolerance --all-containers=true
+
+    Example:
+    kubectl logs -l name=faulttolerance --all-containers=true >> faulttolerance.txt
+    Provide the generated faulttolerance.txt
+```
\ No newline at end of file
diff --git a/Kubernetes/windows/debug/faulttolerance/faultTolerance.ps1 b/Kubernetes/windows/debug/faulttolerance/faultTolerance.ps1
new file mode 100644
index 00000000..a0d865dc
--- /dev/null
+++ b/Kubernetes/windows/debug/faulttolerance/faultTolerance.ps1
@@ -0,0 +1,113 @@
+# Deploys the faulttolerance DaemonSet, which runs faultAnalysis.ps1 on the Windows
+# Server 2019 nodes of the cluster, collects the pod logs, prints a per-node fault
+# analysis report and finally removes the DaemonSet again.
+# Requires kubectl on the PATH, configured for the target cluster.
+
+$faultToleranceYaml = @'
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: faulttolerance
+  labels:
+    app: faulttolerance
+spec:
+  selector:
+    matchLabels:
+      name: faulttolerance
+  template:
+    metadata:
+      labels:
+        name: faulttolerance
+    spec:
+      securityContext:
+        windowsOptions:
+          hostProcess: true
+          runAsUserName: "NT AUTHORITY\\SYSTEM"
+      hostNetwork: true
+      containers:
+      - name: faulttolerance
+        image: mcr.microsoft.com/windows/servercore:1809
+        args:
+        - powershell.exe
+        - -Command
+        - "$BaseDir = \"c:\\k\\debug\"; Invoke-WebRequest -UseBasicParsing \"https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/faulttolerance/faultAnalysis.ps1\" -OutFile $BaseDir\\faultAnalysis.ps1; c:\\k\\debug\\faultAnalysis.ps1; start-sleep 3600;"
+        imagePullPolicy: IfNotPresent
+        volumeMounts:
+        - name: kube-path
+          mountPath: C:\k
+      volumes:
+      - name: kube-path
+        hostPath:
+          path: C:\k
+      nodeSelector:
+        kubernetes.azure.com/os-sku: Windows2019
+'@
+
+# Cleanup a previous instance of the daemon set before re-applying it.
+$faultToleranceYaml | kubectl delete --ignore-not-found=true -f -
+
+$faultToleranceYaml | kubectl apply -f -
+if ($LASTEXITCODE -ne 0) {
+    # Without the pods there is nothing to analyze; stop instead of sleeping and
+    # then reporting nodes without any analysis result.
+    throw "kubectl apply failed (exit code $LASTEXITCODE); fault analysis aborted."
+}
+Write-Output "Sleep for a minute for fault tolerance pods to be up..."
+Start-Sleep 60
+
+# Windows Server 2019 nodes that have not delivered an analysis verdict yet.
+$pendingNodes = New-Object System.Collections.ArrayList
+# -split with a filter instead of .Split(): empty kubectl output yields no names
+# rather than a single empty name or a method call on $null.
+$nodes = @((kubectl get nodes -o jsonpath="{.items[*].metadata.name}") -split '\s+' | Where-Object { $_ })
+foreach ($node in $nodes) {
+    $nodeImage = kubectl get node $node -o jsonpath="{.status.nodeInfo.osImage}"
+
+    if ("$nodeImage".Trim() -eq "Windows Server 2019 Datacenter") {
+        [void]$pendingNodes.Add($node)
+    }
+}
+
+# Nodes whose pod log reports an HNS crash.
+$crashedNodes = New-Object System.Collections.ArrayList
+$report=""
+$pods = @((kubectl get pods -o jsonpath="{.items[*].metadata.name}") -split '\s+' | Where-Object { $_ })
+foreach ($pod in $pods) {
+    if ($pod.StartsWith('faulttolerance')) {
+        # if hns crashed - get the reason
+        $nodeName = kubectl get pod $pod -o jsonpath="{.spec.nodeName}"
+        $podLog = kubectl logs $pod
+        # Join the log lines explicitly; adding the line array to a string would
+        # flatten it into one space-separated line.
+        $podLogText = $podLog -join "`n"
+        if ($podLog -like "*HNS crash not detected*") {
+            $pendingNodes.Remove($nodeName)
+        } elseif ($podLog -like "*HNS crash detected*") {
+            $pendingNodes.Remove($nodeName)
+            [void]$crashedNodes.Add($nodeName)
+        }
+        if (($podLog -like "*gracefully handled*") -or ($podLog -like "*HNS crash detected*")) {
+            # Generate Analysis Report
+            $report += $nodeName+" - Fault Analysis Report: `n"+$podLogText+"`n"
+        }
+    }
+}
+
+if ($report -ne "") {
+    Write-Host $report -ForegroundColor black -BackgroundColor white
+}
+
+if ($crashedNodes.Count -gt 0) {
+    Write-Host "HNS crashed on nodes: $crashedNodes" -ForegroundColor darkred -BackgroundColor white
+}
+if ($pendingNodes.Count -gt 0) {
+    # A missing verdict is not a crash: the pod may still be starting or pulling its image.
+    Write-Host "No fault analysis result from nodes: $pendingNodes" -ForegroundColor darkyellow -BackgroundColor white
+}
+if (($crashedNodes.Count -eq 0) -and ($pendingNodes.Count -eq 0)) {
+    Write-Host "No HNS crashes detected in the cluster" -ForegroundColor darkgreen -BackgroundColor white
+}
+
+
+Write-Output "Sleep for an hour before deleting the fault tolerance pods automatically..."
+Start-Sleep 3600
+$faultToleranceYaml | kubectl delete --ignore-not-found=true -f -
diff --git a/Kubernetes/windows/debug/faulttolerance/faulttolerance.yaml b/Kubernetes/windows/debug/faulttolerance/faulttolerance.yaml
new file mode 100644
index 00000000..0b604432
--- /dev/null
+++ b/Kubernetes/windows/debug/faulttolerance/faulttolerance.yaml
@@ -0,0 +1,40 @@
+# DaemonSet that runs faultAnalysis.ps1 as a host-process container on every
+# node selected by the nodeSelector below and then idles for an hour so the
+# analysis output stays available through "kubectl logs -l name=faulttolerance".
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: faulttolerance
+  labels:
+    app: faulttolerance
+spec:
+  selector:
+    matchLabels:
+      name: faulttolerance
+  template:
+    metadata:
+      labels:
+        name: faulttolerance
+    spec:
+      securityContext:
+        windowsOptions:
+          hostProcess: true
+          runAsUserName: "NT AUTHORITY\\SYSTEM"
+      hostNetwork: true
+      containers:
+      - name: faulttolerance
+        image: mcr.microsoft.com/windows/servercore:1809
+        args:
+        - powershell.exe
+        - -Command
+        - "$BaseDir = \"c:\\k\\debug\"; Invoke-WebRequest -UseBasicParsing \"https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/faulttolerance/faultAnalysis.ps1\" -OutFile $BaseDir\\faultAnalysis.ps1; c:\\k\\debug\\faultAnalysis.ps1; start-sleep 3600;"
+        imagePullPolicy: IfNotPresent
+        volumeMounts:
+        - name: kube-path
+          mountPath: C:\k
+      volumes:
+      - name: kube-path
+        hostPath:
+          path: C:\k
+      nodeSelector:
+        kubernetes.azure.com/os-sku: Windows2019