diff --git a/cmd/main.go b/cmd/main.go index 43ae63f9c..46a244de1 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -369,6 +369,15 @@ func main() { os.Exit(1) } } + if slices.Contains(mainConfig.EnabledControllers, "hypervisor-overcommit-controller") { + hypervisorOvercommitController := &nova.HypervisorOvercommitController{} + hypervisorOvercommitController.Client = multiclusterClient + if err := hypervisorOvercommitController.SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", + "controller", "HypervisorOvercommitController") + os.Exit(1) + } + } if slices.Contains(mainConfig.EnabledControllers, "manila-decisions-pipeline-controller") { controller := &manila.FilterWeigherPipelineController{ Monitor: filterWeigherPipelineMonitor, diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index 200ba3ff3..c38b8bfc4 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -113,6 +113,7 @@ cortex-scheduling-controllers: enabledControllers: - nova-pipeline-controllers - nova-deschedulings-executor + - hypervisor-overcommit-controller - explanation-controller - reservations-controller enabledTasks: @@ -120,6 +121,11 @@ cortex-scheduling-controllers: # Endpoints configuration for reservations controller endpoints: novaExternalScheduler: "http://localhost:8080/scheduler/nova/external" + # OvercommitMappings is a list of mappings that map hypervisor traits to + # overcommit ratios. Note that this list is applied in order, so if there + # are multiple mappings applying to the same hypervisors, the last mapping + # in this list will override the previous ones. + overcommitMappings: [] cortex-knowledge-controllers: <<: *cortex diff --git a/helm/library/cortex/templates/rbac/hypervisor_role.yaml b/helm/library/cortex/templates/rbac/hypervisor_role.yaml index 14b61e5de..0a2fefa00 100644 --- a/helm/library/cortex/templates/rbac/hypervisor_role.yaml +++ b/helm/library/cortex/templates/rbac/hypervisor_role.yaml @@ -1,5 +1,6 @@ {{- if .Values.rbac.hypervisor.enable }} --- +# TODO: Check if this role can be part of the nova bundle, not the core library apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: @@ -14,6 +15,8 @@ rules: verbs: - get - list + - patch + - update - watch - apiGroups: - kvm.cloud.sap diff --git a/internal/scheduling/nova/hypervisor_overcommit_controller.go b/internal/scheduling/nova/hypervisor_overcommit_controller.go new file mode 100644 index 000000000..8d54475f0 --- /dev/null +++ b/internal/scheduling/nova/hypervisor_overcommit_controller.go @@ -0,0 +1,242 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package nova + +import ( + "context" + "errors" + "fmt" + "maps" + "slices" + + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/cobaltcore-dev/cortex/pkg/multicluster" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/client-go/util/workqueue" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +// HypervisorOvercommitMapping maps hypervisor types to their desired +// overcommit ratios. This mapping will be loaded from a configmap +// that is mounted into the controller pod. +type HypervisorOvercommitMapping struct { + // Overcommit is the overcommit ratio to set for hypervisors by resource name. + // Values must be set to something >= 1.0, otherwise the controller will + // ignore them. + Overcommit map[hv1.ResourceName]float64 `json:"overcommit"` + + // HasTrait specifies a trait that a hypervisor may have, and that, if present, + // triggers the controller to set the overcommit ratio specified in the + // overcommit field for that hypervisor. + HasTrait *string `json:"hasTrait,omitempty"` + + // HasntTrait specifies a trait that a hypervisor may have, and that, if + // NOT present, triggers the controller to set the overcommit ratio + // specified in the overcommit field for that hypervisor. + HasntTrait *string `json:"hasntTrait,omitempty"` +} + +// Validate the provided HypervisorOvercommitMapping, returning an error if the +// mapping is invalid. +func (m *HypervisorOvercommitMapping) Validate() error { + for resource, overcommit := range m.Overcommit { + if overcommit < 1.0 { + return errors.New("invalid overcommit ratio in config, must be >= 1.0. " + + "Invalid value for resource " + string(resource) + ": " + + fmt.Sprintf("%f", overcommit)) + } + // Has trait and hasn't trait are mutually exclusive, so if both are set + // we return an error. + if m.HasTrait != nil && m.HasntTrait != nil { + return errors.New("invalid overcommit mapping, hasTrait and hasntTrait are mutually exclusive") + } + // At least one of has trait and hasn't trait must be set, + // otherwise we don't know when to apply this mapping. + if m.HasTrait == nil && m.HasntTrait == nil { + return errors.New("invalid overcommit mapping, at least one of hasTrait and hasntTrait must be set") + } + } + return nil +} + +// HypervisorOvercommitConfig holds the configuration for the +// HypervisorOvercommitController and is loaded from a configmap that is mounted +// into the controller pod. +type HypervisorOvercommitConfig struct { + // OvercommitMappings is a list of mappings that map hypervisor traits to + // overcommit ratios. Note that this list is applied in order, so if there + // are multiple mappings applying to the same hypervisors, the last mapping + // in this list will override the previous ones. + OvercommitMappings []HypervisorOvercommitMapping `json:"overcommitMappings"` +} + +// Validate the provided HypervisorOvercommitConfig, returning an error if the +// config is invalid. +func (c *HypervisorOvercommitConfig) Validate() error { + // Check that all the individual mappings are valid. + for _, mapping := range c.OvercommitMappings { + if err := mapping.Validate(); err != nil { + return err + } + } + return nil +} + +// HypervisorOvercommitController is a controller that reconciles on the +// hypervisor crd and sets desired overcommit ratios based on the hypervisor +// type. +type HypervisorOvercommitController struct { + client.Client + + // config holds the configuration for the controller, which is loaded from a + // configmap that is mounted into the controller pod. + config HypervisorOvercommitConfig +} + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. +// +// For more details, check Reconcile and its Result here: +// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.20.2/pkg/reconcile +// +// For more details about the method shape, read up here: +// - https://ahmet.im/blog/controller-pitfalls/#reconcile-method-shape +func (c *HypervisorOvercommitController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := ctrl.LoggerFrom(ctx) + log.Info("Reconciling resource") + + obj := new(hv1.Hypervisor) + if err := c.Get(ctx, req.NamespacedName, obj); err != nil { + if apierrors.IsNotFound(err) { + // If the custom resource is not found then it usually means + // that it was deleted or not created. + log.Info("Resource not found. Ignoring since object must be deleted") + return ctrl.Result{}, nil + } + // Error reading the object - requeue the request. + log.Error(err, "Failed to get resource") + return ctrl.Result{}, err + } + + // Build desired overcommit ratios by iterating mappings in order. + // Later mappings override earlier ones for the same resource, preserving + // non-overlapping resources from previous mappings. + desiredOvercommit := make(map[hv1.ResourceName]float64) + for _, mapping := range c.config.OvercommitMappings { + var applyMapping bool + switch { + // These are mutually exclusive. + case mapping.HasTrait != nil: + applyMapping = slices.Contains(obj.Status.Traits, *mapping.HasTrait) + case mapping.HasntTrait != nil: + applyMapping = !slices.Contains(obj.Status.Traits, *mapping.HasntTrait) + default: + // This should never happen due to validation, but we check it just in case. + log.Info("Skipping overcommit mapping with no trait specified", + "overcommit", mapping.Overcommit) + continue + } + if !applyMapping { + continue + } + log.Info("Applying overcommit mapping on hypervisor", + "overcommit", mapping.Overcommit) + maps.Copy(desiredOvercommit, mapping.Overcommit) + } + log.Info("Desired overcommit ratios based on traits", + "desiredOvercommit", desiredOvercommit) + if maps.Equal(desiredOvercommit, obj.Spec.Overcommit) { + log.Info("Overcommit ratios are up to date, no update needed") + return ctrl.Result{}, nil + } + + // Update the desired overcommit ratios on the hypervisor spec. + orig := obj.DeepCopy() + obj.Spec.Overcommit = desiredOvercommit + if err := c.Patch(ctx, obj, client.MergeFrom(orig)); err != nil { + log.Error(err, "Failed to update hypervisor overcommit ratios") + return ctrl.Result{}, err + } + log.Info("Updated hypervisor with new overcommit ratios", + "overcommit", desiredOvercommit) + + return ctrl.Result{}, nil +} + +// handleRemoteHypervisor is called by watches in remote clusters and triggers +// a reconcile on the hypervisor resource that was changed in the remote cluster. +func (c *HypervisorOvercommitController) handleRemoteHypervisor() handler.EventHandler { + handler := handler.Funcs{} + handler.CreateFunc = func(ctx context.Context, evt event.CreateEvent, + queue workqueue.TypedRateLimitingInterface[reconcile.Request]) { + + queue.Add(ctrl.Request{NamespacedName: client.ObjectKey{ + Name: evt.Object.(*hv1.Hypervisor).Name, // cluster-scoped crd + }}) + } + handler.UpdateFunc = func(ctx context.Context, evt event.UpdateEvent, + queue workqueue.TypedRateLimitingInterface[reconcile.Request]) { + + queue.Add(ctrl.Request{NamespacedName: client.ObjectKey{ + Name: evt.ObjectOld.(*hv1.Hypervisor).Name, // cluster-scoped crd + }}) + } + handler.DeleteFunc = func(ctx context.Context, evt event.DeleteEvent, + queue workqueue.TypedRateLimitingInterface[reconcile.Request]) { + + queue.Add(ctrl.Request{NamespacedName: client.ObjectKey{ + Name: evt.Object.(*hv1.Hypervisor).Name, // cluster-scoped crd + }}) + } + return handler +} + +// predicateRemoteHypervisor is used to filter events from remote clusters, +// so that only events for hypervisors that should be processed by this +// controller will trigger reconciliations. +func (c *HypervisorOvercommitController) predicateRemoteHypervisor() predicate.Predicate { + // Currently we're watching all hypervisors. In this way, if a trait + // gets removed from the hypervisor, we'll still reconcile this + // hypervisor and update the overcommit ratios accordingly. + return predicate.NewPredicateFuncs(func(object client.Object) bool { + _, ok := object.(*hv1.Hypervisor) + return ok + }) +} + +// SetupWithManager sets up the controller with the Manager and a multicluster +// client. The multicluster client is used to watch for changes in the +// Hypervisor CRD across all clusters and trigger reconciliations accordingly. +func (c *HypervisorOvercommitController) SetupWithManager(mgr ctrl.Manager) (err error) { + // This will load the config in a safe way and gracefully handle errors. + c.config, err = conf.GetConfig[HypervisorOvercommitConfig]() + if err != nil { + return err + } + // Validate we don't have any weird values in the config. + if err := c.config.Validate(); err != nil { + return err + } + // Check that the provided client is a multicluster client, since we need + // that to watch for hypervisors across clusters. + mcl, ok := c.Client.(*multicluster.Client) + if !ok { + return errors.New("provided client must be a multicluster client") + } + return multicluster. + BuildController(mcl, mgr). + // The hypervisor crd may be distributed across multiple remote clusters. + WatchesMulticluster(&hv1.Hypervisor{}, + c.handleRemoteHypervisor(), + c.predicateRemoteHypervisor(), + ). + Named("hypervisor-overcommit-controller"). + Complete(c) +} diff --git a/internal/scheduling/nova/hypervisor_overcommit_controller_test.go b/internal/scheduling/nova/hypervisor_overcommit_controller_test.go new file mode 100644 index 000000000..391fe356c --- /dev/null +++ b/internal/scheduling/nova/hypervisor_overcommit_controller_test.go @@ -0,0 +1,936 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package nova + +import ( + "context" + "errors" + "strings" + "testing" + + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/workqueue" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +func TestHypervisorOvercommitMapping_Validate(t *testing.T) { + gpuTrait := "CUSTOM_GPU" + standardTrait := "CUSTOM_STANDARD" + + tests := []struct { + name string + mapping HypervisorOvercommitMapping + expectError bool + }{ + { + name: "valid overcommit ratios with HasTrait", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + hv1.ResourceMemory: 1.5, + }, + HasTrait: &gpuTrait, + }, + expectError: false, + }, + { + name: "valid minimum overcommit ratio of 1.0 with HasntTrait", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 1.0, + }, + HasntTrait: &gpuTrait, + }, + expectError: false, + }, + { + name: "invalid overcommit ratio less than 1.0", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 0.5, + }, + HasTrait: &gpuTrait, + }, + expectError: true, + }, + { + name: "invalid overcommit ratio of zero", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 0.0, + }, + HasTrait: &gpuTrait, + }, + expectError: true, + }, + { + name: "invalid negative overcommit ratio", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: -1.0, + }, + HasTrait: &gpuTrait, + }, + expectError: true, + }, + { + name: "empty overcommit map is valid", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + expectError: false, + }, + { + name: "nil overcommit map is valid", + mapping: HypervisorOvercommitMapping{ + Overcommit: nil, + }, + expectError: false, + }, + { + name: "mixed valid and invalid overcommit ratios", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + hv1.ResourceMemory: 0.5, // invalid + }, + HasTrait: &gpuTrait, + }, + expectError: true, + }, + { + name: "invalid: both HasTrait and HasntTrait set", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &gpuTrait, + HasntTrait: &standardTrait, + }, + expectError: true, + }, + { + name: "invalid: neither HasTrait nor HasntTrait set with non-empty overcommit", + mapping: HypervisorOvercommitMapping{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + }, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.mapping.Validate() + if tt.expectError && err == nil { + t.Error("expected error but got nil") + } + if !tt.expectError && err != nil { + t.Errorf("expected no error but got: %v", err) + } + }) + } +} + +func TestHypervisorOvercommitConfig_Validate(t *testing.T) { + gpuTrait := "CUSTOM_GPU" + standardTrait := "CUSTOM_STANDARD" + tests := []struct { + name string + config HypervisorOvercommitConfig + expectError bool + }{ + { + name: "valid config with single mapping", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectError: false, + }, + { + name: "valid config with multiple mappings", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &gpuTrait, + }, + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.5, + }, + HasntTrait: &standardTrait, + }, + }, + }, + expectError: false, + }, + { + name: "invalid config with bad overcommit ratio", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 0.5, // invalid ratio + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectError: true, + }, + { + name: "invalid config with both HasTrait and HasntTrait", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &gpuTrait, + HasntTrait: &standardTrait, + }, + }, + }, + expectError: true, + }, + { + name: "invalid config with neither HasTrait nor HasntTrait", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + }, + }, + }, + expectError: true, + }, + { + name: "empty config is valid", + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{}, + }, + expectError: false, + }, + { + name: "nil mappings is valid", + config: HypervisorOvercommitConfig{ + OvercommitMappings: nil, + }, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.config.Validate() + if tt.expectError && err == nil { + t.Error("expected error but got nil") + } + if !tt.expectError && err != nil { + t.Errorf("expected no error but got: %v", err) + } + }) + } +} + +func newTestHypervisorScheme(t *testing.T) *runtime.Scheme { + t.Helper() + scheme := runtime.NewScheme() + if err := hv1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add hv1 to scheme: %v", err) + } + return scheme +} + +func TestHypervisorOvercommitController_Reconcile(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + gpuTrait := "CUSTOM_GPU" + standardTrait := "CUSTOM_STANDARD" + missingTrait := "CUSTOM_MISSING" + + tests := []struct { + name string + hypervisor *hv1.Hypervisor + config HypervisorOvercommitConfig + expectedOvercommit map[hv1.ResourceName]float64 + expectNoUpdate bool + expectNotFoundError bool + }{ + { + name: "apply overcommit for matching HasTrait", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + }, + { + name: "apply overcommit for matching HasntTrait", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{}, // missing trait + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasntTrait: &missingTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + }, + { + name: "skip mapping when HasTrait not present", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_OTHER"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{}, + }, + { + name: "skip mapping when HasntTrait is present", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, // trait is present + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasntTrait: &gpuTrait, // should skip because GPU trait IS present + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{}, + }, + { + name: "later mappings override earlier ones", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU", "CUSTOM_STANDARD"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + HasTrait: &standardTrait, + }, + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, // should override the first + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + }, + { + name: "no update when overcommit already matches", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + expectNoUpdate: true, + }, + { + name: "skip mapping without trait specified", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 2.0, + }, + // No HasTrait or HasntTrait specified + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{}, + }, + { + name: "combine HasTrait and HasntTrait mappings", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, // has GPU, doesn't have STANDARD + }, + }, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceMemory: 1.5, + }, + HasntTrait: &standardTrait, // STANDARD not present + }, + }, + }, + expectedOvercommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + hv1.ResourceMemory: 1.5, + }, + }, + { + name: "hypervisor not found", + hypervisor: &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nonexistent", + }, + }, + config: HypervisorOvercommitConfig{}, + expectNotFoundError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var fakeClient client.Client + if tt.expectNotFoundError { + // Don't add the hypervisor to the fake client + fakeClient = fake.NewClientBuilder(). + WithScheme(scheme). + Build() + } else { + fakeClient = fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(tt.hypervisor). + Build() + } + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + config: tt.config, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: tt.hypervisor.Name, + }, + } + + ctx := context.Background() + result, err := controller.Reconcile(ctx, req) + + if err != nil { + t.Errorf("unexpected error: %v", err) + return + } + + if result.RequeueAfter > 0 { + t.Error("expected no requeue") + } + + if tt.expectNotFoundError { + // For not found case, we expect no error and no requeue + return + } + + // Get the updated hypervisor + updated := &hv1.Hypervisor{} + if err := fakeClient.Get(ctx, req.NamespacedName, updated); err != nil { + t.Fatalf("failed to get updated hypervisor: %v", err) + } + + // Check overcommit ratios + if len(updated.Spec.Overcommit) != len(tt.expectedOvercommit) { + t.Errorf("expected %d overcommit entries, got %d", + len(tt.expectedOvercommit), len(updated.Spec.Overcommit)) + } + + for resource, expected := range tt.expectedOvercommit { + actual, ok := updated.Spec.Overcommit[resource] + if !ok { + t.Errorf("expected overcommit for resource %s, but not found", resource) + continue + } + if actual != expected { + t.Errorf("expected overcommit %f for resource %s, got %f", + expected, resource, actual) + } + } + }) + } +} + +func TestHypervisorOvercommitController_ReconcileNotFound(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + Build() + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + config: HypervisorOvercommitConfig{}, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "nonexistent-hypervisor", + }, + } + + ctx := context.Background() + result, err := controller.Reconcile(ctx, req) + + if err != nil { + t.Errorf("expected no error for not found resource, got: %v", err) + } + + if result.RequeueAfter > 0 { + t.Error("expected no requeue for not found resource") + } +} + +// mockWorkQueue implements workqueue.TypedRateLimitingInterface for testing +type mockWorkQueue struct { + workqueue.TypedRateLimitingInterface[reconcile.Request] + items []reconcile.Request +} + +func (m *mockWorkQueue) Add(item reconcile.Request) { + m.items = append(m.items, item) +} + +func TestHypervisorOvercommitController_HandleRemoteHypervisor(t *testing.T) { + controller := &HypervisorOvercommitController{} + handler := controller.handleRemoteHypervisor() + + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + } + + ctx := context.Background() + + t.Run("CreateFunc", func(t *testing.T) { + queue := &mockWorkQueue{} + handler.Create(ctx, event.CreateEvent{Object: hypervisor}, queue) + + if len(queue.items) != 1 { + t.Errorf("expected 1 item in queue, got %d", len(queue.items)) + } + if queue.items[0].Name != "test-hypervisor" { + t.Errorf("expected hypervisor name 'test-hypervisor', got %s", queue.items[0].Name) + } + }) + + t.Run("UpdateFunc", func(t *testing.T) { + queue := &mockWorkQueue{} + handler.Update(ctx, event.UpdateEvent{ + ObjectOld: hypervisor, + ObjectNew: hypervisor, + }, queue) + + if len(queue.items) != 1 { + t.Errorf("expected 1 item in queue, got %d", len(queue.items)) + } + if queue.items[0].Name != "test-hypervisor" { + t.Errorf("expected hypervisor name 'test-hypervisor', got %s", queue.items[0].Name) + } + }) + + t.Run("DeleteFunc", func(t *testing.T) { + queue := &mockWorkQueue{} + handler.Delete(ctx, event.DeleteEvent{Object: hypervisor}, queue) + + if len(queue.items) != 1 { + t.Errorf("expected 1 item in queue, got %d", len(queue.items)) + } + if queue.items[0].Name != "test-hypervisor" { + t.Errorf("expected hypervisor name 'test-hypervisor', got %s", queue.items[0].Name) + } + }) +} + +func TestHypervisorOvercommitController_PredicateRemoteHypervisor(t *testing.T) { + controller := &HypervisorOvercommitController{} + predicate := controller.predicateRemoteHypervisor() + + t.Run("accepts Hypervisor objects", func(t *testing.T) { + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + } + + if !predicate.Generic(event.GenericEvent{Object: hypervisor}) { + t.Error("expected predicate to accept Hypervisor object") + } + }) + + t.Run("rejects non-Hypervisor objects", func(t *testing.T) { + // Create a non-Hypervisor object by using a different type + // We'll test with a nil object which should return false + type nonHypervisor struct { + client.Object + } + + if predicate.Generic(event.GenericEvent{Object: &nonHypervisor{}}) { + t.Error("expected predicate to reject non-Hypervisor object") + } + }) +} + +func TestHypervisorOvercommitController_SetupWithManager_InvalidClient(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + // Create a regular fake client (not a multicluster client) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + Build() + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + } + + // Create a minimal mock manager for testing + mgr := &mockManager{scheme: scheme} + + // SetupWithManager should fail - either because config loading fails + // (in test environment without config files) or because the client + // is not a multicluster client. + err := controller.SetupWithManager(mgr) + if err == nil { + t.Error("expected error when calling SetupWithManager, got nil") + } + // The error could be either about missing config or about multicluster client + // depending on the test environment. We just verify an error is returned. +} + +// mockManager implements ctrl.Manager for testing SetupWithManager +type mockManager struct { + ctrl.Manager + scheme *runtime.Scheme +} + +func (m *mockManager) GetScheme() *runtime.Scheme { + return m.scheme +} + +// patchFailingClient wraps a client.Client and returns an error on Patch calls +type patchFailingClient struct { + client.Client + patchErr error +} + +func (c *patchFailingClient) Patch(ctx context.Context, obj client.Object, patch client.Patch, opts ...client.PatchOption) error { + return c.patchErr +} + +func TestHypervisorOvercommitController_Reconcile_PatchError(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + gpuTrait := "CUSTOM_GPU" + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + } + + // Create a fake client with the hypervisor, then wrap it to fail on Patch + baseClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(hypervisor). + Build() + + patchErr := errors.New("patch failed") + failingClient := &patchFailingClient{ + Client: baseClient, + patchErr: patchErr, + } + + controller := &HypervisorOvercommitController{ + Client: failingClient, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + }, + HasTrait: &gpuTrait, + }, + }, + }, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: hypervisor.Name, + }, + } + + ctx := context.Background() + _, err := controller.Reconcile(ctx, req) + + // Reconcile should return an error when Patch fails + if err == nil { + t.Error("expected error when Patch fails, got nil") + } + if !strings.Contains(err.Error(), "patch failed") { + t.Errorf("expected error message to contain 'patch failed', got: %v", err) + } +} + +func TestHypervisorOvercommitController_Reconcile_EmptyConfig(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(hypervisor). + Build() + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{}, + }, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: hypervisor.Name, + }, + } + + ctx := context.Background() + result, err := controller.Reconcile(ctx, req) + + if err != nil { + t.Errorf("unexpected error: %v", err) + } + + if result.RequeueAfter > 0 { + t.Error("expected no requeue") + } + + // Verify no changes were made + updated := &hv1.Hypervisor{} + if err := fakeClient.Get(ctx, req.NamespacedName, updated); err != nil { + t.Fatalf("failed to get updated hypervisor: %v", err) + } + + if len(updated.Spec.Overcommit) != 0 { + t.Errorf("expected empty overcommit, got %v", updated.Spec.Overcommit) + } +} + +func TestHypervisorOvercommitController_Reconcile_MultipleResources(t *testing.T) { + scheme := newTestHypervisorScheme(t) + + gpuTrait := "CUSTOM_GPU" + hypervisor := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hypervisor", + }, + Spec: hv1.HypervisorSpec{ + Overcommit: map[hv1.ResourceName]float64{}, + }, + Status: hv1.HypervisorStatus{ + Traits: []string{"CUSTOM_GPU"}, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(hypervisor). + Build() + + controller := &HypervisorOvercommitController{ + Client: fakeClient, + config: HypervisorOvercommitConfig{ + OvercommitMappings: []HypervisorOvercommitMapping{ + { + Overcommit: map[hv1.ResourceName]float64{ + hv1.ResourceCPU: 4.0, + hv1.ResourceMemory: 1.5, + }, + HasTrait: &gpuTrait, + }, + }, + }, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: hypervisor.Name, + }, + } + + ctx := context.Background() + _, err := controller.Reconcile(ctx, req) + + if err != nil { + t.Errorf("unexpected error: %v", err) + } + + updated := &hv1.Hypervisor{} + if err := fakeClient.Get(ctx, req.NamespacedName, updated); err != nil { + t.Fatalf("failed to get updated hypervisor: %v", err) + } + + if len(updated.Spec.Overcommit) != 2 { + t.Errorf("expected 2 overcommit entries, got %d", len(updated.Spec.Overcommit)) + } + + if updated.Spec.Overcommit[hv1.ResourceCPU] != 4.0 { + t.Errorf("expected CPU overcommit 4.0, got %f", updated.Spec.Overcommit[hv1.ResourceCPU]) + } + + if updated.Spec.Overcommit[hv1.ResourceMemory] != 1.5 { + t.Errorf("expected Memory overcommit 1.5, got %f", updated.Spec.Overcommit[hv1.ResourceMemory]) + } +} diff --git a/pkg/conf/conf.go b/pkg/conf/conf.go index 595b33bf2..b0feb02c2 100644 --- a/pkg/conf/conf.go +++ b/pkg/conf/conf.go @@ -17,35 +17,51 @@ import ( // // The values read from secrets.json will override the values in conf.json func GetConfigOrDie[C any]() C { + c, err := GetConfig[C]() + if err != nil { + panic(err) + } + return c +} + +// Create a new configuration from the default config json file. +// Return an error if the config cannot be read or parsed. +// +// This will read two files: +// - /etc/config/conf.json +// - /etc/secrets/secrets.json +// +// The values read from secrets.json will override the values in conf.json +func GetConfig[C any]() (C, error) { // Note: We need to read the config as a raw map first, to avoid golang // unmarshalling default values for the fields. // Read the base config from the configmap (not including secrets). cmConf, err := readRawConfig("/etc/config/conf.json") if err != nil { - panic(err) + return *new(C), err } // Read the secrets config from the kubernetes secret. secretConf, err := readRawConfig("/etc/secrets/secrets.json") if err != nil { - panic(err) + return *new(C), err } return newConfigFromMaps[C](cmConf, secretConf) } -func newConfigFromMaps[C any](base, override map[string]any) C { +func newConfigFromMaps[C any](base, override map[string]any) (C, error) { // Merge the base config with the override config. mergedConf := mergeMaps(base, override) // Marshal again, and then unmarshal into the config struct. mergedBytes, err := json.Marshal(mergedConf) if err != nil { - panic(err) + return *new(C), err } var c C if err := json.Unmarshal(mergedBytes, &c); err != nil { - panic(err) + return *new(C), err } - return c + return c, nil } // Read the json as a map from the given file path.