Azure · djsly · Apr 10, 2026 · Apr 2, 2026 · Apr 2, 2026 · Apr 7, 2026
@@ -13,15 +13,13 @@ import (
 	"github.com/Azure/agentbaker/e2e/toolkit"
 	"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
 	"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8"
-	"github.com/stretchr/testify/require"
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	v1 "k8s.io/api/core/v1"
 	errorsk8s "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/wait"
-	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
 	"k8s.io/client-go/tools/clientcmd"
@@ -150,53 +148,49 @@ func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string,
 
 func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssName string) string {
 	defer toolkit.LogStepf(t, "waiting for node %s to be ready", vmssName)()
-	var node *corev1.Node = nil
-	watcher, err := k.Typed.CoreV1().Nodes().Watch(ctx, metav1.ListOptions{})
-	require.NoError(t, err, "failed to start watching nodes")
-	defer watcher.Stop()
-
-	for event := range watcher.ResultChan() {
-		if event.Type != watch.Added && event.Type != watch.Modified {
-			continue
-		}
-
-		var nodeFromEvent *corev1.Node
-		switch v := event.Object.(type) {
-		case *corev1.Node:
-			nodeFromEvent = v
+	var lastNode *corev1.Node // most recent node whose name has the vmssName prefix; reported on failure
 
-		default:
-			t.Logf("skipping object type %T", event.Object)
-			continue
+	err := wait.PollUntilContextTimeout(ctx, 10*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) {
+		nodes, err := k.Typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+		if err != nil {
+			t.Logf("error listing nodes: %v", err)
+			return false, nil // list errors are logged and retried on the next poll instead of aborting the wait
 		}
 
-		if !strings.HasPrefix(nodeFromEvent.Name, vmssName) {
-			continue
-		}
+		for i := range nodes.Items {
+			node := &nodes.Items[i]
+			if !strings.HasPrefix(node.Name, vmssName) {
+				continue
+			}
 
-		// found the right node. Use it!
-		node = nodeFromEvent
-		nodeTaints, _ := json.Marshal(node.Spec.Taints)
-		nodeConditions, _ := json.Marshal(node.Status.Conditions)
+			lastNode = node
+			nodeTaints, _ := json.Marshal(node.Spec.Taints)
+			nodeConditions, _ := json.Marshal(node.Status.Conditions)
 
-		for _, cond := range node.Status.Conditions {
-			if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
-				t.Logf("node %s is ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions))
-				return node.Name
+			for _, cond := range node.Status.Conditions {
+				if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
+					t.Logf("node %s is ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions))
+					return true, nil
+				}
 			}
+
+			t.Logf("node %s is not ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions))
 		}
 
-		t.Logf("node %s is not ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions))
-	}
+		return false, nil
+	})
 
-	if node == nil {
-		t.Fatalf("%q haven't appeared in k8s API server", vmssName)
+	if err != nil {
+		if lastNode == nil {
+			t.Fatalf("%q haven't appeared in k8s API server: %v", vmssName, err)
+			return ""
+		}
+		nodeString, _ := json.Marshal(lastNode)
+		t.Fatalf("failed to wait for %q (%s) to be ready: %v. Status: %+v. Detail: %s", vmssName, lastNode.Name, err, lastNode.Status, string(nodeString))
 		return ""
 	}
 
-	nodeString, _ := json.Marshal(node)
-	t.Fatalf("failed to wait for %q (%s) to be ready %+v. Detail: %s", vmssName, node.Name, node.Status, string(nodeString))
-	return node.Name
+	return lastNode.Name
 }
 
 // GetPodNetworkDebugPodForNode returns a pod that's a member of the 'debugnonhost' daemonset running in the cluster - this will return

@@ -1553,16 +1553,57 @@ func ValidateNPDFilesystemCorruption(ctx context.Context, s *Scenario) {
 	}
 	execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "NPD Custom Plugin configuration for FilesystemCorruptionProblem not found")
 
+	// Log the NPD plugin config and check script for diagnostics
+	diagCmd := []string{
+		"set -x",
+		"cat /etc/node-problem-detector.d/custom-plugin-monitor/custom-fs-corruption-monitor.json",
+		"echo '--- check_fs_corruption.sh ---'",
+		"cat /etc/node-problem-detector.d/plugin/check_fs_corruption.sh",
+	}
+	diagResult := execScriptOnVMForScenario(ctx, s, strings.Join(diagCmd, "\n"))
+	s.T.Logf("NPD filesystem corruption plugin config and script:\nstdout:\n%s\nstderr:\n%s", diagResult.stdout, diagResult.stderr)
+
+	// Simulate filesystem corruption by replacing the check script with one that
+	// always reports corruption. This is the most reliable approach because:
+	// - On cgroup v2 (Ubuntu 24.04), systemd protects service cgroups from external
+	//   process migration, so we cannot inject journal entries under containerd.service
+	// - Writing to /proc/PID/fd/2 on a journal stream socket is unreliable
+	// - The test's goal is to verify NPD is installed, configured, and correctly sets
+	//   node conditions — not to unit-test the journal grep mechanism
 	command = []string{
 		"set -ex",
-		// Simulate a filesystem corruption problem
-		"sudo systemd-run --unit=docker --no-block bash -c 'echo \"structure needs cleaning\"'",
+		`sudo cp /etc/node-problem-detector.d/plugin/check_fs_corruption.sh /etc/node-problem-detector.d/plugin/check_fs_corruption.sh.bak`,
+		`printf '#!/bin/bash\necho "Found '\''structure needs cleaning'\'' in containerd journal."\nexit 1\n' | sudo tee /etc/node-problem-detector.d/plugin/check_fs_corruption.sh > /dev/null`,
+		`sudo chmod +x /etc/node-problem-detector.d/plugin/check_fs_corruption.sh`,
 	}
-	execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "Failed to simulate filesystem corruption problem")
+	execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "Failed to replace check_fs_corruption.sh to simulate corruption")
+	defer func() {
+		restoreCmd := []string{
+			"set -ex",
+			`if [ -f /etc/node-problem-detector.d/plugin/check_fs_corruption.sh.bak ]; then`,
+			`  sudo mv /etc/node-problem-detector.d/plugin/check_fs_corruption.sh.bak /etc/node-problem-detector.d/plugin/check_fs_corruption.sh`,
+			`  sudo chmod +x /etc/node-problem-detector.d/plugin/check_fs_corruption.sh`,
+			`fi`,
+		}
+		restoreResult := execScriptOnVMForScenario(ctx, s, strings.Join(restoreCmd, "\n"))
+		s.T.Logf("Restored original check_fs_corruption.sh:\nstdout:\n%s\nstderr:\n%s", restoreResult.stdout, restoreResult.stderr)
+	}()
+
+	// Verify the replacement script works correctly
+	verifyCmd := []string{
+		"set -x",
+		"cat /etc/node-problem-detector.d/plugin/check_fs_corruption.sh",
+		"echo '--- manual check script run ---'",
+		"sudo /etc/node-problem-detector.d/plugin/check_fs_corruption.sh; echo \"exit_code=$?\"",
+	}
+	verifyResult := execScriptOnVMForScenario(ctx, s, strings.Join(verifyCmd, "\n"))
+	s.T.Logf("Simulation verification:\nstdout:\n%s\nstderr:\n%s", verifyResult.stdout, verifyResult.stderr)
 
-	// Wait for NPD to detect the problem using Kubernetes native waiting
+	// Wait for NPD to detect the problem. NPD's custom plugin monitor polls
+	// every 5 minutes. With continuous simulation, the first check cycle after
+	// our start should detect it. Use 8 minutes as a safety margin.
 	var filesystemCorruptionProblem *corev1.NodeCondition
-	err := wait.PollUntilContextTimeout(ctx, 10*time.Second, 6*time.Minute, true, func(ctx context.Context) (bool, error) {
+	err := wait.PollUntilContextTimeout(ctx, 10*time.Second, 8*time.Minute, true, func(ctx context.Context) (bool, error) {
 		node, err := s.Runtime.Cluster.Kube.Typed.CoreV1().Nodes().Get(ctx, s.Runtime.VM.KubeName, metav1.GetOptions{})
 		if err != nil {
 			s.T.Logf("Failed to get node %q: %v", s.Runtime.VM.KubeName, err)
@@ -1581,7 +1622,7 @@ func ValidateNPDFilesystemCorruption(ctx context.Context, s *Scenario) {
 
 	require.NotNil(s.T, filesystemCorruptionProblem, "expected FilesystemCorruptionProblem condition to be present on node")
 	require.Equal(s.T, corev1.ConditionTrue, filesystemCorruptionProblem.Status, "expected FilesystemCorruptionProblem condition to be True on node")
-	require.Contains(s.T, filesystemCorruptionProblem.Message, "Found 'structure needs cleaning' in Docker journal.", "expected FilesystemCorruptionProblem condition message to contain: Found 'structure needs cleaning' in Docker journal.")
+	require.Contains(s.T, filesystemCorruptionProblem.Message, "Found 'structure needs cleaning' in containerd journal.", "expected FilesystemCorruptionProblem condition message to contain: Found 'structure needs cleaning' in containerd journal.")
 }
 
 func ValidateEnableNvidiaResource(ctx context.Context, s *Scenario) {

@@ -952,7 +952,7 @@ configGPUDrivers() {
     fi
 
     retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL
-    retrycmd_if_failure 120 5 300 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
+    retrycmd_if_failure 120 5 30 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
     retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
 
     # Fix the NVIDIA /dev/char link issue (Mariner/AzureLinux only)
@@ -981,9 +981,9 @@ validateGPUDrivers() {
     retrycmd_if_failure 24 5 25 nvidia-modprobe -u -c0 && echo "gpu driver loaded" || configGPUDrivers || exit $ERR_GPU_DRIVERS_START_FAIL
 
     if which nvidia-smi; then
-        SMI_RESULT=$(retrycmd_if_failure 24 5 300 nvidia-smi)
+        SMI_RESULT=$(retrycmd_if_failure 24 5 30 nvidia-smi)
     else
-        SMI_RESULT=$(retrycmd_if_failure 24 5 300 $GPU_DEST/bin/nvidia-smi)
+        SMI_RESULT=$(retrycmd_if_failure 24 5 30 $GPU_DEST/bin/nvidia-smi)
     fi
     SMI_STATUS=$?
     if [ "$SMI_STATUS" -ne 0 ]; then

@@ -215,10 +215,10 @@ AKS_AAD_SERVER_APP_ID="6dae42f8-4368-4678-94ff-3960e28e3630"
 # Long running functions can use this helper to gracefully handle global CSE timeout, avoiding exiting with 124 error code without extra context.
 check_cse_timeout() {
     shouldLog="${1:-true}"
-    maxDurationSeconds=780 # 780 seconds = 13 minutes
+    maxDurationSeconds=${CSE_MAX_DURATION_SECONDS:-780}
     if [ -z "${CSE_STARTTIME_SECONDS:-}" ]; then
         if [ "$shouldLog" = "true" ]; then
-            echo "Warning: CSE_STARTTIME_SECONDS environment variable is not set."
+            echo "Warning: CSE_STARTTIME_SECONDS environment variable is not set." >&2
         fi
         # Return 0 to avoid in case CSE_STARTTIME_SECONDS is not set - for example during image build or if something went wrong in cse_start.sh
         return 0
@@ -247,15 +247,42 @@ _retrycmd_internal() {
     local exitStatus=0
 
     for i in $(seq 1 "$retries"); do
-        timeout "$timeoutVal" "${@}"
+        # Only apply CSE timeout guards when CSE_STARTTIME_SECONDS is set (i.e. in a real CSE run).
+        # Skipping the guard during VHD build or other non-CSE callers avoids noisy
+        # "CSE_STARTTIME_SECONDS is not set" warnings in those contexts.
+        if [ -n "${CSE_STARTTIME_SECONDS:-}" ]; then
+            # Check CSE timeout BEFORE starting each attempt. This prevents launching a new long-running
+            # operation (e.g. a 300-600s GPU install) when we are already near the global provisioning
+            # timeout, which would push total CSE execution past the 16-minute client window.
+            if ! check_cse_timeout "$shouldLog"; then
+                echo "CSE timeout approaching, exiting early." >&2
+                return 2
+            fi
+        fi
+
+        # Cap per-attempt timeout to remaining CSE budget so a single attempt cannot overrun
+        # the global provisioning window even when per-attempt timeouts are large.
+        local effectiveTimeout="$timeoutVal"
+        if [ -n "${CSE_STARTTIME_SECONDS:-}" ]; then
+            local remainingCseTime=$(( ${CSE_MAX_DURATION_SECONDS:-780} - ( $(date +%s) - CSE_STARTTIME_SECONDS ) ))
+            if [ "$remainingCseTime" -lt 1 ]; then
+                echo "No CSE time remaining, exiting early." >&2
+                return 2
+            fi
+            if [ "$effectiveTimeout" -gt "$remainingCseTime" ]; then
+                effectiveTimeout="$remainingCseTime"
+            fi
+        fi
+
+        timeout "$effectiveTimeout" "${@}"
         exitStatus=$?
 
         if [ "$exitStatus" -eq 0 ]; then
             break
         fi
 
-        # Check if CSE timeout is approaching - exit early to avoid 124 exit code from the global timeout
-        if ! check_cse_timeout "$shouldLog"; then
+        # Check again after failure, before sleeping, to exit as early as possible.
+        if [ -n "${CSE_STARTTIME_SECONDS:-}" ] && ! check_cse_timeout "$shouldLog"; then
             echo "CSE timeout approaching, exiting early." >&2
             return 2
         fi
@@ -308,39 +335,117 @@ retrycmd_nslookup() {
 
 _retry_file_curl_internal() {
     # checksToRun are conditions that need to pass to stop the retry loop. If not passed, eval command will return 0, because checksToRun will be interpreted as an empty string.
-    retries=$1; waitSleep=$2; timeout=$3; filePath=$4; url=$5; checksToRun=( "${@:6}" )
+    # maxBudget (4th arg): if > 0, the total wall-clock seconds this operation is allowed to spend across all retries.
+    # A value of 0 disables the per-operation budget (falls back to the global CSE timeout guard only). Returns 0 once checksToRun passes, 1 if it still fails after the last retry, 2 if the budget or the global CSE timeout is exhausted.
+    local retries=$1 waitSleep=$2 timeout=$3 maxBudget=${4:-0} filePath=$5 url=$6
+    local checksToRun=( "${@:7}" )
+    local opStartTime i
+    opStartTime=$(date +%s)
     echo "${retries} file curl retries"
     for i in $(seq 1 $retries); do
-        # Use eval to execute the checksToRun string as a command
-        ( eval "$checksToRun" ) && break || if [ "$i" -eq "$retries" ]; then
-            return 1
+        # Check if the result is already valid (from a previous attempt or pre-existing file)
+        ( eval "$checksToRun" ) && break
+        # Check per-operation budget if set -- prevents a single download from consuming the entire CSE window.
+        # Also cap the per-attempt timeout to the remaining budget so a single curl can't overrun it.
+        local effectiveTimeout=$timeout
+        if [ "${maxBudget}" -gt 0 ]; then
+            local opElapsed
+            opElapsed=$(( $(date +%s) - opStartTime ))
+            if [ "$opElapsed" -ge "$maxBudget" ]; then
+                echo "Operation budget of ${maxBudget}s exceeded after ${opElapsed}s, exiting early." >&2
+                return 2
+            fi
+            local remainingBudget=$(( maxBudget - opElapsed ))
+            if [ "$effectiveTimeout" -gt "$remainingBudget" ]; then
+                effectiveTimeout=$remainingBudget
+            fi
         fi
-        # check if global cse timeout is approaching
-        if ! check_cse_timeout; then
+        # check if global cse timeout is approaching (only in real CSE runs)
+        if [ -n "${CSE_STARTTIME_SECONDS:-}" ] && ! check_cse_timeout; then
             echo "CSE timeout approaching, exiting early." >&2
             return 2
-        else
-            if [ "$i" -gt 1 ]; then
-                sleep $waitSleep
+        fi
+
+        if [ "$i" -gt 1 ]; then
+            local sleepDuration=$waitSleep
+            if [ "${maxBudget}" -gt 0 ]; then
+                local preSleepElapsed
+                preSleepElapsed=$(( $(date +%s) - opStartTime ))
+                local preSleepRemaining=$(( maxBudget - preSleepElapsed ))
+                if [ "$preSleepRemaining" -le 0 ]; then
+                    echo "Operation budget of ${maxBudget}s exceeded after ${preSleepElapsed}s, exiting early." >&2
+                    return 2
+                fi
+                if [ "$sleepDuration" -gt "$preSleepRemaining" ]; then
+                    sleepDuration=$preSleepRemaining
+                fi
             fi
-            timeout $timeout curl -fsSLv $url -o $filePath > $CURL_OUTPUT 2>&1
-            if [ "$?" -ne 0 ]; then
-                cat $CURL_OUTPUT
+            sleep "$sleepDuration"
+        fi
+
+        # Re-check budget after sleep and cap timeout accordingly
+        if [ "${maxBudget}" -gt 0 ]; then
+            local postSleepElapsed
+            postSleepElapsed=$(( $(date +%s) - opStartTime ))
+            if [ "$postSleepElapsed" -ge "$maxBudget" ]; then
+                echo "Operation budget of ${maxBudget}s exceeded after ${postSleepElapsed}s, exiting early." >&2
+                return 2
+            fi
+            local postSleepRemaining=$(( maxBudget - postSleepElapsed ))
+            if [ "$effectiveTimeout" -gt "$postSleepRemaining" ]; then
+                effectiveTimeout=$postSleepRemaining
+            fi
+        fi
+
+        timeout "$effectiveTimeout" curl -fsSLv "$url" -o "$filePath" > "$CURL_OUTPUT" 2>&1
+        if [ "$?" -ne 0 ]; then
+            cat "$CURL_OUTPUT"
+        fi
+
+        # On the last attempt, do a final check so every retry gets a curl attempt
+        if [ "$i" -eq "$retries" ]; then
+            if ! ( eval "$checksToRun" ); then
+                return 1
             fi
         fi
     done
 }
 
+# Usage: retrycmd_get_tarball <retries> <wait_sleep> <timeout_seconds> <tarball> <url> [max_budget_s=0]
+# Backward-compatible with old 4-arg callers: <retries> <wait_sleep> <tarball> <url>
+# When the 3rd arg is empty or contains any non-digit character (e.g. a file path), the old signature is assumed and timeout defaults to 60s.
+# timeout_seconds: integer seconds only (do not use duration suffixes like 60s or 5m); a suffixed value would be treated as the tarball path of the old signature.
+# max_budget_s: optional per-operation budget in seconds (0 = no cap). Ignored when CSE_STARTTIME_SECONDS is unset. Returns the exit status of _retry_file_curl_internal.
 retrycmd_get_tarball() {
-    tar_retries=$1; wait_sleep=$2; tarball=$3; url=$4
-    check_tarball_valid="[ -f \"$tarball\" ] && tar -tzf \"$tarball\""
-    _retry_file_curl_internal "$tar_retries" "$wait_sleep" 60 "$tarball" "$url" "$check_tarball_valid"
+    local tar_retries=$1; local wait_sleep=$2
+    case "$3" in
+        ''|*[!0-9]*)
+            # Empty or non-numeric 3rd arg: old 4-arg signature <retries> <wait_sleep> <tarball> <url>; no per-operation budget
+            local timeout=60; local tarball=$3; local url=$4; local max_budget=0
+            ;;
+        *)
+            # All-digit 3rd arg: new signature <retries> <wait_sleep> <timeout> <tarball> <url> [max_budget] (5 args, optional 6th)
+            local timeout=$3; local tarball=$4; local url=$5; local max_budget=${6:-0}
+            ;;
+    esac
+    # Only apply a per-operation budget during real CSE runs; during VHD build (CSE_STARTTIME_SECONDS unset) use no cap.
+    if [ -z "${CSE_STARTTIME_SECONDS:-}" ]; then
+        max_budget=0
+    fi
+    local check_tarball_valid="[ -f \"$tarball\" ] && tar -tzf \"$tarball\""
+    _retry_file_curl_internal "$tar_retries" "$wait_sleep" "$timeout" "$max_budget" "$tarball" "$url" "$check_tarball_valid"
 }
 
+# Usage: retrycmd_curl_file <retries> <wait_sleep> <timeout> <filepath> <url> [max_budget_s=0]
+# max_budget_s: optional per-operation budget in seconds (0 = no cap). Ignored when CSE_STARTTIME_SECONDS is unset. Returns the exit status of _retry_file_curl_internal.
 retrycmd_curl_file() {
-    curl_retries=$1; wait_sleep=$2; timeout=$3; filepath=$4; url=$5
-    check_file_exists="[ -f \"$filepath\" ]"
-    _retry_file_curl_internal "$curl_retries" "$wait_sleep" "$timeout" "$filepath" "$url" "$check_file_exists"
+    local curl_retries=$1 wait_sleep=$2 timeout=$3 filepath=$4 url=$5 max_budget=${6:-0}
+    # Only apply a per-operation budget during real CSE runs; during VHD build (CSE_STARTTIME_SECONDS unset) use no cap.
+    if [ -z "${CSE_STARTTIME_SECONDS:-}" ]; then
+        max_budget=0
+    fi
+    # Success check passed to the retry helper: the download counts as done as soon as the target file exists.
+    local check_file_exists="[ -f \"$filepath\" ]"
+    _retry_file_curl_internal "$curl_retries" "$wait_sleep" "$timeout" "$max_budget" "$filepath" "$url" "$check_file_exists"
 }
 
 retrycmd_pull_from_registry_with_oras() {