Skip to content

Commit 4eeefb9

Browse files
authored
Add reservation as source for kvm capacity metrics (#652)
## Changes

- Add reservation as source for kvm `reserved` and `failover` usage types
- Combined different usage types into the same metric to allow grouping in perses dashboards
- Adjusted tests
1 parent ae24224 commit 4eeefb9

2 files changed

Lines changed: 1182 additions & 352 deletions

File tree

internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go

Lines changed: 173 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,12 @@ import (
99
"strconv"
1010
"strings"
1111

12+
"k8s.io/apimachinery/pkg/api/meta"
1213
"k8s.io/apimachinery/pkg/api/resource"
14+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1315
"sigs.k8s.io/controller-runtime/pkg/client"
1416

17+
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
1518
"github.com/cobaltcore-dev/cortex/internal/knowledge/db"
1619
"github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins"
1720
"github.com/cobaltcore-dev/cortex/pkg/conf"
@@ -29,14 +32,17 @@ func getBuildingBlock(hostName string) string {
2932
return "unknown"
3033
}
3134

35+
// hostReservationResources holds aggregated CPU and memory reservation quantities for a single hypervisor.
36+
type hostReservationResources struct {
37+
cpu resource.Quantity
38+
memory resource.Quantity
39+
}
40+
3241
type KVMResourceCapacityKPI struct {
3342
// Common base for all KPIs that provides standard functionality.
3443
plugins.BaseKPI[struct{}] // No options passed through yaml config
35-
utilizedCapacityPerHost *prometheus.Desc
36-
paygCapacityPerHost *prometheus.Desc
37-
failoverCapacityPerHost *prometheus.Desc
38-
reservedCapacityPerHost *prometheus.Desc
3944
totalCapacityPerHost *prometheus.Desc
45+
capacityPerHost *prometheus.Desc
4046
}
4147

4248
func (KVMResourceCapacityKPI) GetName() string {
@@ -47,60 +53,9 @@ func (k *KVMResourceCapacityKPI) Init(db *db.DB, client client.Client, opts conf
4753
if err := k.BaseKPI.Init(db, client, opts); err != nil {
4854
return err
4955
}
50-
k.utilizedCapacityPerHost = prometheus.NewDesc(
51-
"cortex_kvm_host_capacity_utilized",
52-
"Utilized resources on the KVM hosts (individually by host).",
53-
[]string{
54-
"compute_host",
55-
"resource",
56-
"availability_zone",
57-
"building_block",
58-
"cpu_architecture",
59-
"workload_type",
60-
"enabled",
61-
"decommissioned",
62-
"external_customer",
63-
"maintenance",
64-
},
65-
nil,
66-
)
67-
k.paygCapacityPerHost = prometheus.NewDesc(
68-
"cortex_kvm_host_capacity_payg",
69-
"PAYG resources available on the KVM hosts (individually by host).",
70-
[]string{
71-
"compute_host",
72-
"resource",
73-
"availability_zone",
74-
"building_block",
75-
"cpu_architecture",
76-
"workload_type",
77-
"enabled",
78-
"decommissioned",
79-
"external_customer",
80-
"maintenance",
81-
},
82-
nil,
83-
)
84-
k.reservedCapacityPerHost = prometheus.NewDesc(
85-
"cortex_kvm_host_capacity_reserved",
86-
"Reserved resources on the KVM hosts (individually by host).",
87-
[]string{
88-
"compute_host",
89-
"resource",
90-
"availability_zone",
91-
"building_block",
92-
"cpu_architecture",
93-
"workload_type",
94-
"enabled",
95-
"decommissioned",
96-
"external_customer",
97-
"maintenance",
98-
},
99-
nil,
100-
)
101-
k.failoverCapacityPerHost = prometheus.NewDesc(
102-
"cortex_kvm_host_capacity_failover",
103-
"Failover resources on the KVM hosts (individually by host).",
56+
k.totalCapacityPerHost = prometheus.NewDesc(
57+
"cortex_kvm_host_capacity_total",
58+
"Total resource capacity on the KVM hosts (individually by host).",
10459
[]string{
10560
"compute_host",
10661
"resource",
@@ -115,12 +70,13 @@ func (k *KVMResourceCapacityKPI) Init(db *db.DB, client client.Client, opts conf
11570
},
11671
nil,
11772
)
118-
k.totalCapacityPerHost = prometheus.NewDesc(
119-
"cortex_kvm_host_capacity_total",
120-
"Total resources on the KVM hosts (individually by host).",
73+
k.capacityPerHost = prometheus.NewDesc(
74+
"cortex_kvm_host_capacity_usage",
75+
"Resource capacity usage on the KVM hosts (individually by host).",
12176
[]string{
12277
"compute_host",
12378
"resource",
79+
"type",
12480
"availability_zone",
12581
"building_block",
12682
"cpu_architecture",
@@ -136,23 +92,96 @@ func (k *KVMResourceCapacityKPI) Init(db *db.DB, client client.Client, opts conf
13692
}
13793

13894
func (k *KVMResourceCapacityKPI) Describe(ch chan<- *prometheus.Desc) {
139-
ch <- k.utilizedCapacityPerHost
140-
ch <- k.paygCapacityPerHost
141-
ch <- k.reservedCapacityPerHost
142-
ch <- k.failoverCapacityPerHost
14395
ch <- k.totalCapacityPerHost
96+
ch <- k.capacityPerHost
97+
}
98+
99+
// aggregateReservationsByHost groups Ready reservations by host, returning per-host
100+
// failover totals and committed-resource "not yet in use" totals.
101+
func aggregateReservationsByHost(reservations []v1alpha1.Reservation) (
102+
failoverByHost map[string]hostReservationResources,
103+
committedNotInUseByHost map[string]hostReservationResources,
104+
) {
105+
106+
failoverByHost = make(map[string]hostReservationResources)
107+
committedNotInUseByHost = make(map[string]hostReservationResources)
108+
109+
for _, reservation := range reservations {
110+
if reservation.Spec.SchedulingDomain != v1alpha1.SchedulingDomainNova {
111+
continue
112+
}
113+
114+
readyCondition := meta.FindStatusCondition(reservation.Status.Conditions, v1alpha1.ReservationConditionReady)
115+
if readyCondition == nil || readyCondition.Status != metav1.ConditionTrue {
116+
continue
117+
}
118+
119+
host := reservation.Status.Host
120+
if host == "" {
121+
continue
122+
}
123+
124+
switch reservation.Spec.Type {
125+
case v1alpha1.ReservationTypeFailover:
126+
entry := failoverByHost[host]
127+
cpuQty := reservation.Spec.Resources[hv1.ResourceCPU]
128+
entry.cpu.Add(cpuQty)
129+
memQty := reservation.Spec.Resources[hv1.ResourceMemory]
130+
entry.memory.Add(memQty)
131+
failoverByHost[host] = entry
132+
133+
case v1alpha1.ReservationTypeCommittedResource:
134+
// Total reserved resources for this reservation.
135+
cpuTotal := reservation.Spec.Resources[hv1.ResourceCPU]
136+
memTotal := reservation.Spec.Resources[hv1.ResourceMemory]
137+
138+
// Sum allocated resources across all workloads.
139+
var cpuAllocated, memAllocated resource.Quantity
140+
if reservation.Spec.CommittedResourceReservation != nil {
141+
for _, alloc := range reservation.Spec.CommittedResourceReservation.Allocations {
142+
cpuAllocated.Add(alloc.Resources[hv1.ResourceCPU])
143+
memAllocated.Add(alloc.Resources[hv1.ResourceMemory])
144+
}
145+
}
146+
147+
// Not yet in use = total - allocated, clamped to zero.
148+
cpuNotInUse := cpuTotal.DeepCopy()
149+
cpuNotInUse.Sub(cpuAllocated)
150+
if cpuNotInUse.Cmp(resource.MustParse("0")) < 0 {
151+
cpuNotInUse = resource.MustParse("0")
152+
}
153+
154+
memNotInUse := memTotal.DeepCopy()
155+
memNotInUse.Sub(memAllocated)
156+
if memNotInUse.Cmp(resource.MustParse("0")) < 0 {
157+
memNotInUse = resource.MustParse("0")
158+
}
159+
160+
entry := committedNotInUseByHost[host]
161+
entry.cpu.Add(cpuNotInUse)
162+
entry.memory.Add(memNotInUse)
163+
committedNotInUseByHost[host] = entry
164+
}
165+
}
166+
167+
return failoverByHost, committedNotInUseByHost
144168
}
145169

146170
func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) {
147-
// The hypervisor resource auto-discovers its current utilization.
148-
// We can use the hypervisor status to calculate the total capacity
149-
// and then subtract the actual resource allocation from virtual machines.
150171
hvs := &hv1.HypervisorList{}
151172
if err := k.Client.List(context.Background(), hvs); err != nil {
152173
slog.Error("failed to list hypervisors", "error", err)
153174
return
154175
}
155176

177+
reservations := &v1alpha1.ReservationList{}
178+
if err := k.Client.List(context.Background(), reservations); err != nil {
179+
slog.Error("failed to list reservations", "error", err)
180+
return
181+
}
182+
183+
failoverByHost, committedNotInUseByHost := aggregateReservationsByHost(reservations.Items)
184+
156185
for _, hypervisor := range hvs.Items {
157186
if hypervisor.Status.EffectiveCapacity == nil {
158187
slog.Warn("hypervisor with nil effective capacity, skipping", "host", hypervisor.Name)
@@ -182,27 +211,28 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) {
182211
ramUsed = resource.MustParse("0")
183212
}
184213

185-
exportCapacityMetricKVM(ch, k.totalCapacityPerHost, "cpu", cpuTotal.AsApproximateFloat64(), hypervisor)
186-
exportCapacityMetricKVM(ch, k.totalCapacityPerHost, "ram", ramTotal.AsApproximateFloat64(), hypervisor)
214+
// Get reservation data for this hypervisor (zero-value if absent).
215+
failoverRes := failoverByHost[hypervisor.Name]
216+
committedRes := committedNotInUseByHost[hypervisor.Name]
217+
218+
cpuReserved := committedRes.cpu
219+
ramReserved := committedRes.memory
220+
cpuFailover := failoverRes.cpu
221+
ramFailover := failoverRes.memory
187222

188-
exportCapacityMetricKVM(ch, k.utilizedCapacityPerHost, "cpu", cpuUsed.AsApproximateFloat64(), hypervisor)
189-
exportCapacityMetricKVM(ch, k.utilizedCapacityPerHost, "ram", ramUsed.AsApproximateFloat64(), hypervisor)
223+
labels := hostLabelsFromHypervisor(hypervisor)
190224

191-
// WARNING: Using dummy data for now.
192-
// TODO Replace with actual data from reservations capacity CRDs
193-
cpuReserved := resource.MustParse("100")
194-
ramReserved := resource.MustParse("1Gi")
225+
k.emitTotal(ch, "cpu", cpuTotal.AsApproximateFloat64(), labels)
226+
k.emitTotal(ch, "ram", ramTotal.AsApproximateFloat64(), labels)
195227

196-
exportCapacityMetricKVM(ch, k.reservedCapacityPerHost, "cpu", cpuReserved.AsApproximateFloat64(), hypervisor)
197-
exportCapacityMetricKVM(ch, k.reservedCapacityPerHost, "ram", ramReserved.AsApproximateFloat64(), hypervisor)
228+
k.emitUsage(ch, "cpu", cpuUsed.AsApproximateFloat64(), "utilized", labels)
229+
k.emitUsage(ch, "ram", ramUsed.AsApproximateFloat64(), "utilized", labels)
198230

199-
// WARNING: Using dummy data for now.
200-
// TODO Replace with actual data from failover capacity CRDs
201-
cpuFailover := resource.MustParse("100")
202-
ramFailover := resource.MustParse("1Gi")
231+
k.emitUsage(ch, "cpu", cpuReserved.AsApproximateFloat64(), "reserved", labels)
232+
k.emitUsage(ch, "ram", ramReserved.AsApproximateFloat64(), "reserved", labels)
203233

204-
exportCapacityMetricKVM(ch, k.failoverCapacityPerHost, "cpu", cpuFailover.AsApproximateFloat64(), hypervisor)
205-
exportCapacityMetricKVM(ch, k.failoverCapacityPerHost, "ram", ramFailover.AsApproximateFloat64(), hypervisor)
234+
k.emitUsage(ch, "cpu", cpuFailover.AsApproximateFloat64(), "failover", labels)
235+
k.emitUsage(ch, "ram", ramFailover.AsApproximateFloat64(), "failover", labels)
206236

207237
// Calculate PAYG capacity
208238
paygCPU := cpuTotal.DeepCopy()
@@ -215,21 +245,27 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) {
215245
paygRAM.Sub(ramReserved)
216246
paygRAM.Sub(ramFailover)
217247

218-
exportCapacityMetricKVM(ch, k.paygCapacityPerHost, "cpu", paygCPU.AsApproximateFloat64(), hypervisor)
219-
exportCapacityMetricKVM(ch, k.paygCapacityPerHost, "ram", paygRAM.AsApproximateFloat64(), hypervisor)
248+
k.emitUsage(ch, "cpu", paygCPU.AsApproximateFloat64(), "payg", labels)
249+
k.emitUsage(ch, "ram", paygRAM.AsApproximateFloat64(), "payg", labels)
220250
}
221251
}
222252

223-
func exportCapacityMetricKVM(ch chan<- prometheus.Metric, metric *prometheus.Desc, resource string, value float64, hypervisor hv1.Hypervisor) {
224-
bb := getBuildingBlock(hypervisor.Name)
225-
226-
availabilityZone := hypervisor.Labels["topology.kubernetes.io/zone"]
253+
// kvmHostLabels holds precomputed label values derived from a hypervisor.
254+
type kvmHostLabels struct {
255+
computeHost string
256+
availabilityZone string
257+
buildingBlock string
258+
cpuArchitecture string
259+
workloadType string
260+
enabled string
261+
decommissioned string
262+
externalCustomer string
263+
maintenance string
264+
}
227265

228-
enabled := true
266+
func hostLabelsFromHypervisor(hypervisor hv1.Hypervisor) kvmHostLabels {
229267
decommissioned := false
230268
externalCustomer := false
231-
maintenance := false
232-
233269
workloadType := "general-purpose"
234270
cpuArchitecture := "cascade-lake"
235271

@@ -246,19 +282,52 @@ func exportCapacityMetricKVM(ch chan<- prometheus.Metric, metric *prometheus.Des
246282
}
247283
}
248284

285+
return kvmHostLabels{
286+
computeHost: hypervisor.Name,
287+
availabilityZone: hypervisor.Labels["topology.kubernetes.io/zone"],
288+
buildingBlock: getBuildingBlock(hypervisor.Name),
289+
cpuArchitecture: cpuArchitecture,
290+
workloadType: workloadType,
291+
enabled: strconv.FormatBool(true),
292+
decommissioned: strconv.FormatBool(decommissioned),
293+
externalCustomer: strconv.FormatBool(externalCustomer),
294+
maintenance: strconv.FormatBool(false),
295+
}
296+
}
297+
298+
func (k *KVMResourceCapacityKPI) emitTotal(ch chan<- prometheus.Metric, resourceName string, value float64, l kvmHostLabels) {
299+
ch <- prometheus.MustNewConstMetric(
300+
k.totalCapacityPerHost,
301+
prometheus.GaugeValue,
302+
value,
303+
l.computeHost,
304+
resourceName,
305+
l.availabilityZone,
306+
l.buildingBlock,
307+
l.cpuArchitecture,
308+
l.workloadType,
309+
l.enabled,
310+
l.decommissioned,
311+
l.externalCustomer,
312+
l.maintenance,
313+
)
314+
}
315+
316+
func (k *KVMResourceCapacityKPI) emitUsage(ch chan<- prometheus.Metric, resourceName string, value float64, capacityType string, l kvmHostLabels) {
249317
ch <- prometheus.MustNewConstMetric(
250-
metric,
318+
k.capacityPerHost,
251319
prometheus.GaugeValue,
252320
value,
253-
hypervisor.Name,
254-
resource,
255-
availabilityZone,
256-
bb,
257-
cpuArchitecture,
258-
workloadType,
259-
strconv.FormatBool(enabled),
260-
strconv.FormatBool(decommissioned),
261-
strconv.FormatBool(externalCustomer),
262-
strconv.FormatBool(maintenance),
321+
l.computeHost,
322+
resourceName,
323+
capacityType,
324+
l.availabilityZone,
325+
l.buildingBlock,
326+
l.cpuArchitecture,
327+
l.workloadType,
328+
l.enabled,
329+
l.decommissioned,
330+
l.externalCustomer,
331+
l.maintenance,
263332
)
264333
}

0 commit comments

Comments (0)