@@ -9,9 +9,12 @@ import (
99 "strconv"
1010 "strings"
1111
12+ "k8s.io/apimachinery/pkg/api/meta"
1213 "k8s.io/apimachinery/pkg/api/resource"
14+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1315 "sigs.k8s.io/controller-runtime/pkg/client"
1416
17+ "github.com/cobaltcore-dev/cortex/api/v1alpha1"
1518 "github.com/cobaltcore-dev/cortex/internal/knowledge/db"
1619 "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins"
1720 "github.com/cobaltcore-dev/cortex/pkg/conf"
@@ -29,14 +32,17 @@ func getBuildingBlock(hostName string) string {
2932 return "unknown"
3033}
3134
35+ // hostReservationResources holds aggregated CPU and memory reservation quantities for a single hypervisor.
36+ type hostReservationResources struct {
37+ cpu resource.Quantity
38+ memory resource.Quantity
39+ }
40+
3241type KVMResourceCapacityKPI struct {
3342 // Common base for all KPIs that provides standard functionality.
3443 plugins.BaseKPI [struct {}] // No options passed through yaml config
35- utilizedCapacityPerHost * prometheus.Desc
36- paygCapacityPerHost * prometheus.Desc
37- failoverCapacityPerHost * prometheus.Desc
38- reservedCapacityPerHost * prometheus.Desc
3944 totalCapacityPerHost * prometheus.Desc
45+ capacityPerHost * prometheus.Desc
4046}
4147
4248func (KVMResourceCapacityKPI ) GetName () string {
@@ -47,60 +53,9 @@ func (k *KVMResourceCapacityKPI) Init(db *db.DB, client client.Client, opts conf
4753 if err := k .BaseKPI .Init (db , client , opts ); err != nil {
4854 return err
4955 }
50- k .utilizedCapacityPerHost = prometheus .NewDesc (
51- "cortex_kvm_host_capacity_utilized" ,
52- "Utilized resources on the KVM hosts (individually by host)." ,
53- []string {
54- "compute_host" ,
55- "resource" ,
56- "availability_zone" ,
57- "building_block" ,
58- "cpu_architecture" ,
59- "workload_type" ,
60- "enabled" ,
61- "decommissioned" ,
62- "external_customer" ,
63- "maintenance" ,
64- },
65- nil ,
66- )
67- k .paygCapacityPerHost = prometheus .NewDesc (
68- "cortex_kvm_host_capacity_payg" ,
69- "PAYG resources available on the KVM hosts (individually by host)." ,
70- []string {
71- "compute_host" ,
72- "resource" ,
73- "availability_zone" ,
74- "building_block" ,
75- "cpu_architecture" ,
76- "workload_type" ,
77- "enabled" ,
78- "decommissioned" ,
79- "external_customer" ,
80- "maintenance" ,
81- },
82- nil ,
83- )
84- k .reservedCapacityPerHost = prometheus .NewDesc (
85- "cortex_kvm_host_capacity_reserved" ,
86- "Reserved resources on the KVM hosts (individually by host)." ,
87- []string {
88- "compute_host" ,
89- "resource" ,
90- "availability_zone" ,
91- "building_block" ,
92- "cpu_architecture" ,
93- "workload_type" ,
94- "enabled" ,
95- "decommissioned" ,
96- "external_customer" ,
97- "maintenance" ,
98- },
99- nil ,
100- )
101- k .failoverCapacityPerHost = prometheus .NewDesc (
102- "cortex_kvm_host_capacity_failover" ,
103- "Failover resources on the KVM hosts (individually by host)." ,
56+ k .totalCapacityPerHost = prometheus .NewDesc (
57+ "cortex_kvm_host_capacity_total" ,
58+ "Total resource capacity on the KVM hosts (individually by host)." ,
10459 []string {
10560 "compute_host" ,
10661 "resource" ,
@@ -115,12 +70,13 @@ func (k *KVMResourceCapacityKPI) Init(db *db.DB, client client.Client, opts conf
11570 },
11671 nil ,
11772 )
118- k .totalCapacityPerHost = prometheus .NewDesc (
119- "cortex_kvm_host_capacity_total " ,
120- "Total resources on the KVM hosts (individually by host)." ,
73+ k .capacityPerHost = prometheus .NewDesc (
74+ "cortex_kvm_host_capacity_usage " ,
75+ "Resource capacity usage on the KVM hosts (individually by host)." ,
12176 []string {
12277 "compute_host" ,
12378 "resource" ,
79+ "type" ,
12480 "availability_zone" ,
12581 "building_block" ,
12682 "cpu_architecture" ,
@@ -136,23 +92,96 @@ func (k *KVMResourceCapacityKPI) Init(db *db.DB, client client.Client, opts conf
13692}
13793
13894func (k * KVMResourceCapacityKPI ) Describe (ch chan <- * prometheus.Desc ) {
139- ch <- k .utilizedCapacityPerHost
140- ch <- k .paygCapacityPerHost
141- ch <- k .reservedCapacityPerHost
142- ch <- k .failoverCapacityPerHost
14395 ch <- k .totalCapacityPerHost
96+ ch <- k .capacityPerHost
97+ }
98+
99+ // aggregateReservationsByHost groups Ready reservations by host, returning per-host
100+ // failover totals and committed-resource "not yet in use" totals.
101+ func aggregateReservationsByHost (reservations []v1alpha1.Reservation ) (
102+ failoverByHost map [string ]hostReservationResources ,
103+ committedNotInUseByHost map [string ]hostReservationResources ,
104+ ) {
105+
106+ failoverByHost = make (map [string ]hostReservationResources )
107+ committedNotInUseByHost = make (map [string ]hostReservationResources )
108+
109+ for _ , reservation := range reservations {
110+ if reservation .Spec .SchedulingDomain != v1alpha1 .SchedulingDomainNova {
111+ continue
112+ }
113+
114+ readyCondition := meta .FindStatusCondition (reservation .Status .Conditions , v1alpha1 .ReservationConditionReady )
115+ if readyCondition == nil || readyCondition .Status != metav1 .ConditionTrue {
116+ continue
117+ }
118+
119+ host := reservation .Status .Host
120+ if host == "" {
121+ continue
122+ }
123+
124+ switch reservation .Spec .Type {
125+ case v1alpha1 .ReservationTypeFailover :
126+ entry := failoverByHost [host ]
127+ cpuQty := reservation .Spec .Resources [hv1 .ResourceCPU ]
128+ entry .cpu .Add (cpuQty )
129+ memQty := reservation .Spec .Resources [hv1 .ResourceMemory ]
130+ entry .memory .Add (memQty )
131+ failoverByHost [host ] = entry
132+
133+ case v1alpha1 .ReservationTypeCommittedResource :
134+ // Total reserved resources for this reservation.
135+ cpuTotal := reservation .Spec .Resources [hv1 .ResourceCPU ]
136+ memTotal := reservation .Spec .Resources [hv1 .ResourceMemory ]
137+
138+ // Sum allocated resources across all workloads.
139+ var cpuAllocated , memAllocated resource.Quantity
140+ if reservation .Spec .CommittedResourceReservation != nil {
141+ for _ , alloc := range reservation .Spec .CommittedResourceReservation .Allocations {
142+ cpuAllocated .Add (alloc .Resources [hv1 .ResourceCPU ])
143+ memAllocated .Add (alloc .Resources [hv1 .ResourceMemory ])
144+ }
145+ }
146+
147+ // Not yet in use = total - allocated, clamped to zero.
148+ cpuNotInUse := cpuTotal .DeepCopy ()
149+ cpuNotInUse .Sub (cpuAllocated )
150+ if cpuNotInUse .Cmp (resource .MustParse ("0" )) < 0 {
151+ cpuNotInUse = resource .MustParse ("0" )
152+ }
153+
154+ memNotInUse := memTotal .DeepCopy ()
155+ memNotInUse .Sub (memAllocated )
156+ if memNotInUse .Cmp (resource .MustParse ("0" )) < 0 {
157+ memNotInUse = resource .MustParse ("0" )
158+ }
159+
160+ entry := committedNotInUseByHost [host ]
161+ entry .cpu .Add (cpuNotInUse )
162+ entry .memory .Add (memNotInUse )
163+ committedNotInUseByHost [host ] = entry
164+ }
165+ }
166+
167+ return failoverByHost , committedNotInUseByHost
144168}
145169
146170func (k * KVMResourceCapacityKPI ) Collect (ch chan <- prometheus.Metric ) {
147- // The hypervisor resource auto-discovers its current utilization.
148- // We can use the hypervisor status to calculate the total capacity
149- // and then subtract the actual resource allocation from virtual machines.
150171 hvs := & hv1.HypervisorList {}
151172 if err := k .Client .List (context .Background (), hvs ); err != nil {
152173 slog .Error ("failed to list hypervisors" , "error" , err )
153174 return
154175 }
155176
177+ reservations := & v1alpha1.ReservationList {}
178+ if err := k .Client .List (context .Background (), reservations ); err != nil {
179+ slog .Error ("failed to list reservations" , "error" , err )
180+ return
181+ }
182+
183+ failoverByHost , committedNotInUseByHost := aggregateReservationsByHost (reservations .Items )
184+
156185 for _ , hypervisor := range hvs .Items {
157186 if hypervisor .Status .EffectiveCapacity == nil {
158187 slog .Warn ("hypervisor with nil effective capacity, skipping" , "host" , hypervisor .Name )
@@ -182,27 +211,28 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) {
182211 ramUsed = resource .MustParse ("0" )
183212 }
184213
185- exportCapacityMetricKVM (ch , k .totalCapacityPerHost , "cpu" , cpuTotal .AsApproximateFloat64 (), hypervisor )
186- exportCapacityMetricKVM (ch , k .totalCapacityPerHost , "ram" , ramTotal .AsApproximateFloat64 (), hypervisor )
214+ // Get reservation data for this hypervisor (zero-value if absent).
215+ failoverRes := failoverByHost [hypervisor .Name ]
216+ committedRes := committedNotInUseByHost [hypervisor .Name ]
217+
218+ cpuReserved := committedRes .cpu
219+ ramReserved := committedRes .memory
220+ cpuFailover := failoverRes .cpu
221+ ramFailover := failoverRes .memory
187222
188- exportCapacityMetricKVM (ch , k .utilizedCapacityPerHost , "cpu" , cpuUsed .AsApproximateFloat64 (), hypervisor )
189- exportCapacityMetricKVM (ch , k .utilizedCapacityPerHost , "ram" , ramUsed .AsApproximateFloat64 (), hypervisor )
223+ labels := hostLabelsFromHypervisor (hypervisor )
190224
191- // WARNING: Using dummy data for now.
192- // TODO Replace with actual data from reservations capacity CRDs
193- cpuReserved := resource .MustParse ("100" )
194- ramReserved := resource .MustParse ("1Gi" )
225+ k .emitTotal (ch , "cpu" , cpuTotal .AsApproximateFloat64 (), labels )
226+ k .emitTotal (ch , "ram" , ramTotal .AsApproximateFloat64 (), labels )
195227
196- exportCapacityMetricKVM (ch , k . reservedCapacityPerHost , "cpu" , cpuReserved .AsApproximateFloat64 (), hypervisor )
197- exportCapacityMetricKVM (ch , k . reservedCapacityPerHost , "ram" , ramReserved .AsApproximateFloat64 (), hypervisor )
228+ k . emitUsage (ch , "cpu" , cpuUsed .AsApproximateFloat64 (), "utilized" , labels )
229+ k . emitUsage (ch , "ram" , ramUsed .AsApproximateFloat64 (), "utilized" , labels )
198230
199- // WARNING: Using dummy data for now.
200- // TODO Replace with actual data from failover capacity CRDs
201- cpuFailover := resource .MustParse ("100" )
202- ramFailover := resource .MustParse ("1Gi" )
231+ k .emitUsage (ch , "cpu" , cpuReserved .AsApproximateFloat64 (), "reserved" , labels )
232+ k .emitUsage (ch , "ram" , ramReserved .AsApproximateFloat64 (), "reserved" , labels )
203233
204- exportCapacityMetricKVM (ch , k . failoverCapacityPerHost , "cpu" , cpuFailover .AsApproximateFloat64 (), hypervisor )
205- exportCapacityMetricKVM (ch , k . failoverCapacityPerHost , "ram" , ramFailover .AsApproximateFloat64 (), hypervisor )
234+ k . emitUsage (ch , "cpu" , cpuFailover .AsApproximateFloat64 (), "failover" , labels )
235+ k . emitUsage (ch , "ram" , ramFailover .AsApproximateFloat64 (), "failover" , labels )
206236
207237 // Calculate PAYG capacity
208238 paygCPU := cpuTotal .DeepCopy ()
@@ -215,21 +245,27 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) {
215245 paygRAM .Sub (ramReserved )
216246 paygRAM .Sub (ramFailover )
217247
218- exportCapacityMetricKVM (ch , k . paygCapacityPerHost , "cpu" , paygCPU .AsApproximateFloat64 (), hypervisor )
219- exportCapacityMetricKVM (ch , k . paygCapacityPerHost , "ram" , paygRAM .AsApproximateFloat64 (), hypervisor )
248+ k . emitUsage (ch , "cpu" , paygCPU .AsApproximateFloat64 (), "payg" , labels )
249+ k . emitUsage (ch , "ram" , paygRAM .AsApproximateFloat64 (), "payg" , labels )
220250 }
221251}
222252
223- func exportCapacityMetricKVM (ch chan <- prometheus.Metric , metric * prometheus.Desc , resource string , value float64 , hypervisor hv1.Hypervisor ) {
224- bb := getBuildingBlock (hypervisor .Name )
225-
226- availabilityZone := hypervisor .Labels ["topology.kubernetes.io/zone" ]
// kvmHostLabels carries the label values that are identical for every metric
// sample of one hypervisor, so they only need to be derived once per host.
// All values are stored as strings, ready to be used as label values.
type kvmHostLabels struct {
	// computeHost is the name of the hypervisor.
	computeHost string
	// availabilityZone is the zone the hypervisor is located in.
	availabilityZone string
	// buildingBlock is the building block derived from the host name.
	buildingBlock string
	// cpuArchitecture is the CPU architecture of the host.
	cpuArchitecture string
	// workloadType is the kind of workload the host is intended for.
	workloadType string
	// enabled is a formatted boolean ("true"/"false").
	enabled string
	// decommissioned is a formatted boolean ("true"/"false").
	decommissioned string
	// externalCustomer is a formatted boolean ("true"/"false").
	externalCustomer string
	// maintenance is a formatted boolean ("true"/"false").
	maintenance string
}
227265
228- enabled := true
266+ func hostLabelsFromHypervisor ( hypervisor hv1. Hypervisor ) kvmHostLabels {
229267 decommissioned := false
230268 externalCustomer := false
231- maintenance := false
232-
233269 workloadType := "general-purpose"
234270 cpuArchitecture := "cascade-lake"
235271
@@ -246,19 +282,52 @@ func exportCapacityMetricKVM(ch chan<- prometheus.Metric, metric *prometheus.Des
246282 }
247283 }
248284
285+ return kvmHostLabels {
286+ computeHost : hypervisor .Name ,
287+ availabilityZone : hypervisor .Labels ["topology.kubernetes.io/zone" ],
288+ buildingBlock : getBuildingBlock (hypervisor .Name ),
289+ cpuArchitecture : cpuArchitecture ,
290+ workloadType : workloadType ,
291+ enabled : strconv .FormatBool (true ),
292+ decommissioned : strconv .FormatBool (decommissioned ),
293+ externalCustomer : strconv .FormatBool (externalCustomer ),
294+ maintenance : strconv .FormatBool (false ),
295+ }
296+ }
297+
298+ func (k * KVMResourceCapacityKPI ) emitTotal (ch chan <- prometheus.Metric , resourceName string , value float64 , l kvmHostLabels ) {
299+ ch <- prometheus .MustNewConstMetric (
300+ k .totalCapacityPerHost ,
301+ prometheus .GaugeValue ,
302+ value ,
303+ l .computeHost ,
304+ resourceName ,
305+ l .availabilityZone ,
306+ l .buildingBlock ,
307+ l .cpuArchitecture ,
308+ l .workloadType ,
309+ l .enabled ,
310+ l .decommissioned ,
311+ l .externalCustomer ,
312+ l .maintenance ,
313+ )
314+ }
315+
316+ func (k * KVMResourceCapacityKPI ) emitUsage (ch chan <- prometheus.Metric , resourceName string , value float64 , capacityType string , l kvmHostLabels ) {
249317 ch <- prometheus .MustNewConstMetric (
250- metric ,
318+ k . capacityPerHost ,
251319 prometheus .GaugeValue ,
252320 value ,
253- hypervisor .Name ,
254- resource ,
255- availabilityZone ,
256- bb ,
257- cpuArchitecture ,
258- workloadType ,
259- strconv .FormatBool (enabled ),
260- strconv .FormatBool (decommissioned ),
261- strconv .FormatBool (externalCustomer ),
262- strconv .FormatBool (maintenance ),
321+ l .computeHost ,
322+ resourceName ,
323+ capacityType ,
324+ l .availabilityZone ,
325+ l .buildingBlock ,
326+ l .cpuArchitecture ,
327+ l .workloadType ,
328+ l .enabled ,
329+ l .decommissioned ,
330+ l .externalCustomer ,
331+ l .maintenance ,
263332 )
264333}
0 commit comments