Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
8b31812
Document initial ideas to route multicluster resources
PhilippMatthes Mar 11, 2026
099e208
Add 1:n mapping in clusters
SoWieMarkus Mar 12, 2026
9e0cacc
Remove example multicluster configuration from cortex-scheduling-cont…
SoWieMarkus Mar 12, 2026
b8d967d
Resolve linter issues
SoWieMarkus Mar 12, 2026
204cf2c
Add Hypervisor resource router to multicluster client
SoWieMarkus Mar 12, 2026
7a8a8fb
Feedback
SoWieMarkus Mar 13, 2026
0f75530
Lint fix
SoWieMarkus Mar 13, 2026
eb4b793
Fix copy paste error
SoWieMarkus Mar 17, 2026
a1763a5
Initial draft of multi-az multicluster guide [skip ci]
PhilippMatthes Mar 17, 2026
0d3b9e9
CodeRabbit feedback
SoWieMarkus Mar 17, 2026
1b6d05c
Feedback
SoWieMarkus Mar 17, 2026
2c0caa3
Feedback
SoWieMarkus Mar 17, 2026
b10b092
Replace .For with .MultiCluster
SoWieMarkus Mar 17, 2026
88b96de
Merge branch 'main' into multicluster-routing
SoWieMarkus Mar 17, 2026
11b7dfc
Fix hypervisor overcommit manager
SoWieMarkus Mar 17, 2026
fd4aef5
Advance guide [skip ci]
PhilippMatthes Mar 17, 2026
02558b2
PR feedback
PhilippMatthes Mar 17, 2026
0691d4e
Add outcome
PhilippMatthes Mar 17, 2026
6dda060
Refactor clusterForWrite logic and enhance tests for router matching
SoWieMarkus Mar 17, 2026
8450a16
Fix error message casing in clusterForWrite function
SoWieMarkus Mar 18, 2026
34fe33d
Enhance error handling for duplicate resources in multi-cluster Get a…
SoWieMarkus Mar 18, 2026
f93f8cb
Implement soft fail if cluster is not available
SoWieMarkus Mar 18, 2026
b46676a
Enhance Get and List methods to log non-NotFound errors and prevent s…
SoWieMarkus Mar 18, 2026
fa723fe
Use corev1.LabelTopologyZone
PhilippMatthes Mar 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Tiltfile
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ local('kubectl wait --namespace cert-manager --for=condition=available deploymen

########### Dependency CRDs
# Make sure the local cluster is running if you are running into startup issues here.
url = 'https://raw.githubusercontent.com/cobaltcore-dev/openstack-hypervisor-operator/refs/heads/main/charts/openstack-hypervisor-operator/crds/hypervisor-crd.yaml'
url = 'https://raw.githubusercontent.com/cobaltcore-dev/openstack-hypervisor-operator/refs/heads/main/charts/openstack-hypervisor-operator/crds/kvm.cloud.sap_hypervisors.yaml'
local('curl -L ' + url + ' | kubectl apply -f -')

########### Cortex Operator & CRDs
Expand Down
5 changes: 5 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
_ "k8s.io/client-go/plugin/pkg/client/auth"

"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
ctrl "sigs.k8s.io/controller-runtime"
Expand Down Expand Up @@ -264,10 +265,14 @@ func main() {
setupLog.Error(err, "unable to add home cluster")
os.Exit(1)
}
hvGVK := schema.GroupVersionKind{Group: "kvm.cloud.sap", Version: "v1", Kind: "Hypervisor"}
multiclusterClient := &multicluster.Client{
HomeCluster: homeCluster,
HomeRestConfig: restConfig,
HomeScheme: scheme,
ResourceRouters: map[schema.GroupVersionKind]multicluster.ResourceRouter{
hvGVK: multicluster.HypervisorResourceRouter{},
},
}
multiclusterClientConfig := conf.GetConfigOrDie[multicluster.ClientConfig]()
if err := multiclusterClient.InitFromConf(ctx, mgr, multiclusterClientConfig); err != nil {
Expand Down
17 changes: 17 additions & 0 deletions docs/guides/multicluster/cleanup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash

# Tears down the local multicluster guide environment: deletes the home,
# az-a and az-b kind clusters and removes the temporary files under /tmp
# (cluster CA certificates, tilt value overrides, downloaded hypervisor CRD).

set -e

echo "Deleting home cluster"
kind delete cluster --name cortex-home

echo "Deleting az-a and az-b clusters"
for az in az-a az-b; do
  kind delete cluster --name "cortex-remote-${az}"
done

echo "Cleaning up temporary files"
tmp_files=(
  /tmp/root-ca-home.pem
  /tmp/root-ca-remote-az-a.pem
  /tmp/root-ca-remote-az-b.pem
  /tmp/cortex-values.yaml
  /tmp/hypervisor-crd.yaml
)
# -f keeps the cleanup quiet when a file was never created.
rm -f "${tmp_files[@]}"
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
name: cortex-remote
name: cortex-remote-az-a
nodes:
- role: control-plane
extraPortMappings:
Expand Down
27 changes: 27 additions & 0 deletions docs/guides/multicluster/cortex-remote-az-b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# kind cluster definition for the az-b remote cluster of the multicluster guide.
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
name: cortex-remote-az-b
nodes:
  - role: control-plane
    extraPortMappings:
      # Expose this cluster's API server on host port 8445.
      - containerPort: 6443
        hostPort: 8445
    extraMounts:
      # CA certificate of the home cluster, referenced below as oidc-ca-file.
      - hostPath: /tmp/root-ca-home.pem
        containerPath: /etc/ca-certificates/root-ca.pem
    kubeadmConfigPatches:
      - |
        kind: ClusterConfiguration
        apiServer:
          extraArgs:
            # NOTE(review): the issuer on port 8443 is presumably the home
            # cluster's API server - confirm against cortex-home.yaml.
            oidc-client-id: "https://host.docker.internal:8443" # = audience
            oidc-issuer-url: "https://host.docker.internal:8443"
            oidc-username-claim: sub
            oidc-ca-file: /etc/ca-certificates/root-ca.pem
          certSANs:
            - api-proxy
            - api-proxy.default.svc
            - api-proxy.default.svc.cluster.local
            - localhost
            - 127.0.0.1
            - host.docker.internal
13 changes: 13 additions & 0 deletions docs/guides/multicluster/hypervisors-az-a.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Example Hypervisor resources for the az-a cluster of the multicluster guide.
# Both carry the topology zone label with the value cortex-remote-az-a.
apiVersion: kvm.cloud.sap/v1
kind: Hypervisor
metadata:
  name: hypervisor-1-az-a
  labels:
    topology.kubernetes.io/zone: cortex-remote-az-a
---
apiVersion: kvm.cloud.sap/v1
kind: Hypervisor
metadata:
  name: hypervisor-2-az-a
  labels:
    topology.kubernetes.io/zone: cortex-remote-az-a
13 changes: 13 additions & 0 deletions docs/guides/multicluster/hypervisors-az-b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Example Hypervisor resources for the az-b cluster of the multicluster guide.
# Both carry the topology zone label with the value cortex-remote-az-b.
apiVersion: kvm.cloud.sap/v1
kind: Hypervisor
metadata:
  name: hypervisor-1-az-b
  labels:
    topology.kubernetes.io/zone: cortex-remote-az-b
---
apiVersion: kvm.cloud.sap/v1
kind: Hypervisor
metadata:
  name: hypervisor-2-az-b
  labels:
    topology.kubernetes.io/zone: cortex-remote-az-b
121 changes: 86 additions & 35 deletions docs/guides/multicluster/readme.md

Large diffs are not rendered by default.

77 changes: 77 additions & 0 deletions docs/guides/multicluster/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/bin/bash

# Sets up the local multicluster guide environment:
#   1. creates the home kind cluster and stores its CA certificate,
#   2. creates the az-a and az-b remote kind clusters,
#   3. installs the cortex CRDs and the hypervisor CRD,
#   4. writes tilt value overrides that register both remotes,
#   5. applies the example hypervisors and starts cortex with tilt.
#
# All paths are relative, so run this from the repository root.

# -u catches misspelled variables; pipefail surfaces failures inside pipes.
set -euo pipefail

guide_dir=docs/guides/multicluster
hypervisor_crd_url=https://raw.githubusercontent.com/cobaltcore-dev/openstack-hypervisor-operator/refs/heads/main/charts/openstack-hypervisor-operator/crds/kvm.cloud.sap_hypervisors.yaml

echo "Creating home cluster"
kind create cluster --config "$guide_dir/cortex-home.yaml"

echo "Applying cluster role binding for oidc endpoint access"
kubectl --context kind-cortex-home apply -f "$guide_dir/cortex-home-crb.yaml"

echo "Storing home cluster cert under /tmp/root-ca-home.pem"
# The remote clusters mount this file as their OIDC CA, so it has to exist
# before they are created.
kubectl --context kind-cortex-home --namespace kube-system \
  get configmap extension-apiserver-authentication \
  -o jsonpath="{.data['client-ca-file']}" > /tmp/root-ca-home.pem

echo "Creating az-a and az-b clusters"
kind create cluster --config "$guide_dir/cortex-remote-az-a.yaml"
kind create cluster --config "$guide_dir/cortex-remote-az-b.yaml"

echo "Granting cortex-home sa tokens access to az-a and az-b clusters"
kubectl --context kind-cortex-remote-az-a apply -f "$guide_dir/cortex-remote-crb.yaml"
kubectl --context kind-cortex-remote-az-b apply -f "$guide_dir/cortex-remote-crb.yaml"

echo "Installing cortex crds in az-a and az-b clusters"
# Target each cluster explicitly instead of switching the global kubectl
# context, so an aborted run does not leave the user on a remote context.
helm install helm/bundles/cortex-crds --generate-name --kube-context kind-cortex-remote-az-a
helm install helm/bundles/cortex-crds --generate-name --kube-context kind-cortex-remote-az-b

echo "Installing hypervisor crd as external dependency to all three clusters"
# --fail makes curl exit non-zero on HTTP errors instead of saving the error
# page as if it were the CRD.
curl --fail -L "$hypervisor_crd_url" -o /tmp/hypervisor-crd.yaml
kubectl --context kind-cortex-home apply -f /tmp/hypervisor-crd.yaml
kubectl --context kind-cortex-remote-az-a apply -f /tmp/hypervisor-crd.yaml
kubectl --context kind-cortex-remote-az-b apply -f /tmp/hypervisor-crd.yaml

echo "Storing az-a and az-b cluster certs under /tmp/root-ca-remote-az-a.pem and /tmp/root-ca-remote-az-b.pem"
kubectl --context kind-cortex-remote-az-a --namespace kube-system \
  get configmap extension-apiserver-authentication \
  -o jsonpath="{.data['client-ca-file']}" > /tmp/root-ca-remote-az-a.pem
kubectl --context kind-cortex-remote-az-b --namespace kube-system \
  get configmap extension-apiserver-authentication \
  -o jsonpath="{.data['client-ca-file']}" > /tmp/root-ca-remote-az-b.pem

echo "Setting up tilt overrides for cortex values"
export TILT_OVERRIDES_PATH=/tmp/cortex-values.yaml
# The sed prefix (10 spaces) indents every certificate line so that it sits
# inside the "caCert: |" block scalar of the list item above it.
tee "$TILT_OVERRIDES_PATH" <<EOF
global:
  conf:
    apiservers:
      remotes:
      - host: https://host.docker.internal:8444
        gvks:
        - kvm.cloud.sap/v1/Hypervisor
        - kvm.cloud.sap/v1/HypervisorList
        labels:
          az: cortex-remote-az-a
        caCert: |
$(sed 's/^/          /' /tmp/root-ca-remote-az-a.pem)
      - host: https://host.docker.internal:8445
        gvks:
        - kvm.cloud.sap/v1/Hypervisor
        - kvm.cloud.sap/v1/HypervisorList
        labels:
          az: cortex-remote-az-b
        caCert: |
$(sed 's/^/          /' /tmp/root-ca-remote-az-b.pem)
EOF

echo "Applying hypervisor resources in az-a and az-b clusters"
kubectl --context kind-cortex-remote-az-a apply \
  -f "$guide_dir/hypervisors-az-a.yaml"
kubectl --context kind-cortex-remote-az-b apply \
  -f "$guide_dir/hypervisors-az-b.yaml"

echo "Starting cortex in home cluster with tilt, using overrides from $TILT_OVERRIDES_PATH"
# tilt deploys to the current kubectl context, so select the home cluster.
kubectl config use-context kind-cortex-home
export ACTIVE_DEPLOYMENTS="nova"
tilt up
18 changes: 18 additions & 0 deletions helm/bundles/cortex-cinder/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,24 @@ cortex: &cortex
prometheus: {enable: false}
conf: &cortexConf
schedulingDomain: cinder
apiservers:
home:
gvks:
- cortex.cloud/v1alpha1/Decision
- cortex.cloud/v1alpha1/DecisionList
- cortex.cloud/v1alpha1/Descheduling
- cortex.cloud/v1alpha1/DeschedulingList
- cortex.cloud/v1alpha1/Pipeline
- cortex.cloud/v1alpha1/PipelineList
- cortex.cloud/v1alpha1/Knowledge
- cortex.cloud/v1alpha1/KnowledgeList
- cortex.cloud/v1alpha1/Datasource
- cortex.cloud/v1alpha1/DatasourceList
- cortex.cloud/v1alpha1/KPI
- cortex.cloud/v1alpha1/KPIList
- cortex.cloud/v1alpha1/Reservation
- cortex.cloud/v1alpha1/ReservationList
- v1/Secret
keystoneSecretRef:
name: cortex-cinder-openstack-keystone
namespace: default
Expand Down
24 changes: 24 additions & 0 deletions helm/bundles/cortex-ironcore/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,30 @@ cortex:
conf:
# The operator will only touch CRs with this scheduling domain name.
schedulingDomain: machines
apiservers:
home:
gvks:
- cortex.cloud/v1alpha1/Decision
- cortex.cloud/v1alpha1/DecisionList
- cortex.cloud/v1alpha1/Descheduling
- cortex.cloud/v1alpha1/DeschedulingList
- cortex.cloud/v1alpha1/Pipeline
- cortex.cloud/v1alpha1/PipelineList
- cortex.cloud/v1alpha1/Knowledge
- cortex.cloud/v1alpha1/KnowledgeList
- cortex.cloud/v1alpha1/Datasource
- cortex.cloud/v1alpha1/DatasourceList
- cortex.cloud/v1alpha1/KPI
- cortex.cloud/v1alpha1/KPIList
- cortex.cloud/v1alpha1/Reservation
- cortex.cloud/v1alpha1/ReservationList
- compute.ironcore.dev/v1alpha1/Machine
- compute.ironcore.dev/v1alpha1/MachineList
- compute.ironcore.dev/v1alpha1/MachinePool
- compute.ironcore.dev/v1alpha1/MachinePoolList
- compute.ironcore.dev/v1alpha1/MachineClass
- compute.ironcore.dev/v1alpha1/MachineClassList
- v1/Secret
enabledControllers:
- ironcore-decisions-pipeline-controller
- explanation-controller
Expand Down
18 changes: 18 additions & 0 deletions helm/bundles/cortex-manila/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,24 @@ cortex: &cortex
prometheus: {enable: false}
conf: &cortexConf
schedulingDomain: manila
apiservers:
home:
gvks:
- cortex.cloud/v1alpha1/Decision
- cortex.cloud/v1alpha1/DecisionList
- cortex.cloud/v1alpha1/Descheduling
- cortex.cloud/v1alpha1/DeschedulingList
- cortex.cloud/v1alpha1/Pipeline
- cortex.cloud/v1alpha1/PipelineList
- cortex.cloud/v1alpha1/Knowledge
- cortex.cloud/v1alpha1/KnowledgeList
- cortex.cloud/v1alpha1/Datasource
- cortex.cloud/v1alpha1/DatasourceList
- cortex.cloud/v1alpha1/KPI
- cortex.cloud/v1alpha1/KPIList
- cortex.cloud/v1alpha1/Reservation
- cortex.cloud/v1alpha1/ReservationList
- v1/Secret
keystoneSecretRef:
name: cortex-manila-openstack-keystone
namespace: default
Expand Down
20 changes: 20 additions & 0 deletions helm/bundles/cortex-nova/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,26 @@ cortex: &cortex
prometheus: {enable: false}
conf: &cortexConf
schedulingDomain: nova
apiservers:
home:
gvks:
- cortex.cloud/v1alpha1/Decision
- cortex.cloud/v1alpha1/DecisionList
- cortex.cloud/v1alpha1/Descheduling
- cortex.cloud/v1alpha1/DeschedulingList
- cortex.cloud/v1alpha1/Pipeline
- cortex.cloud/v1alpha1/PipelineList
- cortex.cloud/v1alpha1/Knowledge
- cortex.cloud/v1alpha1/KnowledgeList
- cortex.cloud/v1alpha1/Datasource
- cortex.cloud/v1alpha1/DatasourceList
- cortex.cloud/v1alpha1/KPI
- cortex.cloud/v1alpha1/KPIList
- cortex.cloud/v1alpha1/Reservation
- cortex.cloud/v1alpha1/ReservationList
- kvm.cloud.sap/v1/Hypervisor
- kvm.cloud.sap/v1/HypervisorList
- v1/Secret
keystoneSecretRef:
name: cortex-nova-openstack-keystone
namespace: default
Expand Down
21 changes: 21 additions & 0 deletions helm/bundles/cortex-pods/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,27 @@ cortex:
conf:
# The operator will only touch CRs with this scheduling domain name.
schedulingDomain: pods
apiservers:
home:
gvks:
- cortex.cloud/v1alpha1/Decision
- cortex.cloud/v1alpha1/DecisionList
- cortex.cloud/v1alpha1/Descheduling
- cortex.cloud/v1alpha1/DeschedulingList
- cortex.cloud/v1alpha1/Pipeline
- cortex.cloud/v1alpha1/PipelineList
- cortex.cloud/v1alpha1/Knowledge
- cortex.cloud/v1alpha1/KnowledgeList
- cortex.cloud/v1alpha1/Datasource
- cortex.cloud/v1alpha1/DatasourceList
- cortex.cloud/v1alpha1/KPI
- cortex.cloud/v1alpha1/KPIList
- cortex.cloud/v1alpha1/Reservation
- cortex.cloud/v1alpha1/ReservationList
- v1/Secret
- v1/Pod
- v1/NodeList
- v1/Binding
enabledControllers:
- pods-decisions-pipeline-controller
- explanation-controller
Expand Down
35 changes: 20 additions & 15 deletions internal/knowledge/datasources/plugins/openstack/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/handler"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/predicate"
Expand Down Expand Up @@ -237,19 +237,24 @@ func (r *OpenStackDatasourceReconciler) Reconcile(ctx context.Context, req ctrl.
}

func (r *OpenStackDatasourceReconciler) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error {
return multicluster.BuildController(mcl, mgr).
Named("cortex-openstack-datasource").
For(
&v1alpha1.Datasource{},
builder.WithPredicates(predicate.NewPredicateFuncs(func(obj client.Object) bool {
// Only react to datasources matching the operator.
ds := obj.(*v1alpha1.Datasource)
if ds.Spec.SchedulingDomain != r.Conf.SchedulingDomain {
return false
}
// Only react to openstack datasources.
return ds.Spec.Type == v1alpha1.DatasourceTypeOpenStack
})),
).
bldr := multicluster.BuildController(mcl, mgr)
// Watch datasource changes across all clusters.
bldr, err := bldr.WatchesMulticluster(
&v1alpha1.Datasource{},
&handler.EnqueueRequestForObject{},
predicate.NewPredicateFuncs(func(obj client.Object) bool {
// Only react to datasources matching the operator.
ds := obj.(*v1alpha1.Datasource)
if ds.Spec.SchedulingDomain != r.Conf.SchedulingDomain {
return false
}
// Only react to openstack datasources.
return ds.Spec.Type == v1alpha1.DatasourceTypeOpenStack
}),
)
if err != nil {
return err
}
return bldr.Named("cortex-openstack-datasource").
Complete(r)
}
Loading
Loading