Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 66 additions & 27 deletions e2e/kube.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (
"github.com/Azure/agentbaker/e2e/config"
"github.com/Azure/agentbaker/e2e/toolkit"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/stretchr/testify/require"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -147,55 +147,94 @@ func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string,

func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssName string) string {
defer toolkit.LogStepf(t, "waiting for node %s to be ready", vmssName)()
var node *corev1.Node = nil
watcher, err := k.Typed.CoreV1().Nodes().Watch(ctx, metav1.ListOptions{})
require.NoError(t, err, "failed to start watching nodes")
defer watcher.Stop()
var node *corev1.Node

for event := range watcher.ResultChan() {
if event.Type != watch.Added && event.Type != watch.Modified {
continue
for {
if ctx.Err() != nil {
break
}

var nodeFromEvent *corev1.Node
switch v := event.Object.(type) {
case *corev1.Node:
nodeFromEvent = v

default:
t.Logf("skipping object type %T", event.Object)
// List existing nodes first to catch any that appeared while we weren't watching,
// and use the list's resource version to start the watch without missing events.
nodeList, err := k.Typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
t.Logf("failed to list nodes: %v, retrying...", err)
time.Sleep(5 * time.Second)
continue
Comment on lines +159 to 163
Copy link

Copilot AI Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The retry backoff uses time.Sleep, which ignores context cancellation. If ctx is already done (e.g., deadline exceeded), list/watch will error and this will still sleep an extra 5s before exiting, delaying test teardown. Consider using a context-aware wait (e.g., wait.PollUntilContextCancel / select on ctx.Done()) so the function returns promptly when ctx is canceled.

Copilot uses AI. Check for mistakes.
}

if !strings.HasPrefix(nodeFromEvent.Name, vmssName) {
for i := range nodeList.Items {
n := &nodeList.Items[i]
if !strings.HasPrefix(n.Name, vmssName) {
continue
}
node = n
if name, ready := checkNodeReady(t, node); ready {
return name
}
}

watcher, err := k.Typed.CoreV1().Nodes().Watch(ctx, metav1.ListOptions{
ResourceVersion: nodeList.ResourceVersion,
})
if err != nil {
t.Logf("failed to start watching nodes: %v, retrying...", err)
time.Sleep(5 * time.Second)
continue
Comment on lines +159 to 183
Copy link

Copilot AI Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This retries on any List/Watch error for the full parent timeout. For permanent failures (e.g., RBAC Forbidden/Unauthorized, invalid watch options), this will just spin for up to 35m and make failures harder to diagnose. Consider detecting non-transient API errors (via k8s api/errors helpers) and failing fast with a clear message, while still retrying transient network/timeouts.

Copilot uses AI. Check for mistakes.
}

// found the right node. Use it!
node = nodeFromEvent
nodeTaints, _ := json.Marshal(node.Spec.Taints)
nodeConditions, _ := json.Marshal(node.Status.Conditions)
for event := range watcher.ResultChan() {
if event.Type != watch.Added && event.Type != watch.Modified {
continue
}

nodeFromEvent, ok := event.Object.(*corev1.Node)
if !ok {
t.Logf("skipping object type %T", event.Object)
continue
}

for _, cond := range node.Status.Conditions {
if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
t.Logf("node %s is ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions))
return node.Name
if !strings.HasPrefix(nodeFromEvent.Name, vmssName) {
continue
}

node = nodeFromEvent
if name, ready := checkNodeReady(t, node); ready {
watcher.Stop()
return name
}
}

t.Logf("node %s is not ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions))
watcher.Stop()
Copy link

Copilot AI Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The log message says the watch is being re-established even when the watch ended due to ctx cancellation/timeout. Consider checking ctx.Err() after the ResultChan loop and only logging/retrying when ctx is still active to avoid misleading output at the end of a timed-out test.

Suggested change
watcher.Stop()
watcher.Stop()
if err := ctx.Err(); err != nil {
t.Logf("watch for node %q closed due to context completion: %v", vmssName, err)
break
}

Copilot uses AI. Check for mistakes.
t.Logf("watch for node %q closed, re-establishing...", vmssName)
}

if node == nil {
t.Fatalf("%q haven't appeared in k8s API server", vmssName)
t.Fatalf("%q never appeared in k8s API server", vmssName)
return ""
}

nodeString, _ := json.Marshal(node)
t.Fatalf("failed to wait for %q (%s) to be ready %+v. Detail: %s", vmssName, node.Name, node.Status, string(nodeString))
t.Fatalf("failed to wait for %q (%s) to be ready: %s", vmssName, node.Name, string(nodeString))
return node.Name
}

// checkNodeReady logs the node's status and returns (nodeName, true) if the node is ready.
// Readiness means a NodeReady condition with status ConditionTrue; in all cases the node's
// taints and conditions are logged as JSON for debugging.
func checkNodeReady(t testing.TB, node *corev1.Node) (string, bool) {
	taintsJSON, _ := json.Marshal(node.Spec.Taints)
	condsJSON, _ := json.Marshal(node.Status.Conditions)

	ready := false
	for _, c := range node.Status.Conditions {
		if c.Type == corev1.NodeReady && c.Status == corev1.ConditionTrue {
			ready = true
			break
		}
	}

	if ready {
		t.Logf("node %s is ready. Taints: %s Conditions: %s", node.Name, string(taintsJSON), string(condsJSON))
		return node.Name, true
	}

	t.Logf("node %s is not ready. Taints: %s Conditions: %s", node.Name, string(taintsJSON), string(condsJSON))
	return "", false
}

// GetPodNetworkDebugPodForNode returns a pod that's a member of the 'debugnonhost' daemonset running in the cluster - this will return
// the name of the pod that is running on the node created for specifically for the test case which is running validation checks.
func (k *Kubeclient) GetPodNetworkDebugPodForNode(ctx context.Context, kubeNodeName string) (*corev1.Pod, error) {
Expand Down
Loading