-
Notifications
You must be signed in to change notification settings - Fork 250
fix(e2e): retry watch in WaitUntilNodeReady when channel closes #8225
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -12,7 +12,7 @@ import ( | |||||||||||||
| "github.com/Azure/agentbaker/e2e/config" | ||||||||||||||
| "github.com/Azure/agentbaker/e2e/toolkit" | ||||||||||||||
| "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" | ||||||||||||||
| "github.com/stretchr/testify/require" | ||||||||||||||
|
|
||||||||||||||
| appsv1 "k8s.io/api/apps/v1" | ||||||||||||||
| corev1 "k8s.io/api/core/v1" | ||||||||||||||
| v1 "k8s.io/api/core/v1" | ||||||||||||||
|
|
@@ -147,55 +147,94 @@ func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string, | |||||||||||||
|
|
||||||||||||||
| func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssName string) string { | ||||||||||||||
| defer toolkit.LogStepf(t, "waiting for node %s to be ready", vmssName)() | ||||||||||||||
| var node *corev1.Node = nil | ||||||||||||||
| watcher, err := k.Typed.CoreV1().Nodes().Watch(ctx, metav1.ListOptions{}) | ||||||||||||||
| require.NoError(t, err, "failed to start watching nodes") | ||||||||||||||
| defer watcher.Stop() | ||||||||||||||
| var node *corev1.Node | ||||||||||||||
|
|
||||||||||||||
| for event := range watcher.ResultChan() { | ||||||||||||||
| if event.Type != watch.Added && event.Type != watch.Modified { | ||||||||||||||
| continue | ||||||||||||||
| for { | ||||||||||||||
| if ctx.Err() != nil { | ||||||||||||||
| break | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| var nodeFromEvent *corev1.Node | ||||||||||||||
| switch v := event.Object.(type) { | ||||||||||||||
| case *corev1.Node: | ||||||||||||||
| nodeFromEvent = v | ||||||||||||||
|
|
||||||||||||||
| default: | ||||||||||||||
| t.Logf("skipping object type %T", event.Object) | ||||||||||||||
| // List existing nodes first to catch any that appeared while we weren't watching, | ||||||||||||||
| // and use the list's resource version to start the watch without missing events. | ||||||||||||||
| nodeList, err := k.Typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) | ||||||||||||||
| if err != nil { | ||||||||||||||
| t.Logf("failed to list nodes: %v, retrying...", err) | ||||||||||||||
| time.Sleep(5 * time.Second) | ||||||||||||||
| continue | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| if !strings.HasPrefix(nodeFromEvent.Name, vmssName) { | ||||||||||||||
| for i := range nodeList.Items { | ||||||||||||||
| n := &nodeList.Items[i] | ||||||||||||||
| if !strings.HasPrefix(n.Name, vmssName) { | ||||||||||||||
| continue | ||||||||||||||
| } | ||||||||||||||
| node = n | ||||||||||||||
| if name, ready := checkNodeReady(t, node); ready { | ||||||||||||||
| return name | ||||||||||||||
| } | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| watcher, err := k.Typed.CoreV1().Nodes().Watch(ctx, metav1.ListOptions{ | ||||||||||||||
| ResourceVersion: nodeList.ResourceVersion, | ||||||||||||||
| }) | ||||||||||||||
| if err != nil { | ||||||||||||||
| t.Logf("failed to start watching nodes: %v, retrying...", err) | ||||||||||||||
| time.Sleep(5 * time.Second) | ||||||||||||||
| continue | ||||||||||||||
|
Comment on lines
+159
to
183
|
||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| // found the right node. Use it! | ||||||||||||||
| node = nodeFromEvent | ||||||||||||||
| nodeTaints, _ := json.Marshal(node.Spec.Taints) | ||||||||||||||
| nodeConditions, _ := json.Marshal(node.Status.Conditions) | ||||||||||||||
| for event := range watcher.ResultChan() { | ||||||||||||||
| if event.Type != watch.Added && event.Type != watch.Modified { | ||||||||||||||
| continue | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| nodeFromEvent, ok := event.Object.(*corev1.Node) | ||||||||||||||
| if !ok { | ||||||||||||||
| t.Logf("skipping object type %T", event.Object) | ||||||||||||||
| continue | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| for _, cond := range node.Status.Conditions { | ||||||||||||||
| if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { | ||||||||||||||
| t.Logf("node %s is ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) | ||||||||||||||
| return node.Name | ||||||||||||||
| if !strings.HasPrefix(nodeFromEvent.Name, vmssName) { | ||||||||||||||
| continue | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| node = nodeFromEvent | ||||||||||||||
| if name, ready := checkNodeReady(t, node); ready { | ||||||||||||||
| watcher.Stop() | ||||||||||||||
| return name | ||||||||||||||
| } | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| t.Logf("node %s is not ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) | ||||||||||||||
| watcher.Stop() | ||||||||||||||
|
||||||||||||||
| watcher.Stop() | |
| watcher.Stop() | |
| if err := ctx.Err(); err != nil { | |
| t.Logf("watch for node %q closed due to context completion: %v", vmssName, err) | |
| break | |
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The retry backoff uses time.Sleep, which ignores context cancellation. If ctx is already done (e.g., deadline exceeded), list/watch will error and this will still sleep an extra 5s before exiting, delaying test teardown. Consider using a context-aware wait (e.g., wait.PollUntilContextCancel / select on ctx.Done()) so the function returns promptly when ctx is canceled.