diff --git a/NEXT_RELEASE_NOTES.md b/NEXT_RELEASE_NOTES.md index 0fdef6de6..57597bade 100644 --- a/NEXT_RELEASE_NOTES.md +++ b/NEXT_RELEASE_NOTES.md @@ -46,12 +46,15 @@ The release notes should contain at least the following sections: ## Optional migration tasks +* `enable_opentelemetry` has been split into two flags: `enable_metrics` and `enable_tracing` + ## Important information * Fixed a bug where the `evict` command ignored entries without a locality. If your DSS instance does not have a locality set, the next `evict` run may be slow while it processes the backlog of old entries. * Fixed a bug where Helm charts and Tanka files didn't actually perform any actions via the `evict` command when run via cron jobs (because no locality was set and no delete flag was specified). If you have a large number of entries, the next run may be slow while it processes the backlog of old entries. * AWS load balancer names are no longer enforced by Helm charts or Tanka files. Existing clusters will retain their current names, while new ones will use names automatically generated by AWS. * Server timeout flag has been renamed from `server timeout` to `server_timeout`. +* The Grafana version deployed by Helm charts or Tanka files has been upgraded to 13.0. Make sure to read the Grafana changelog relevant to your current version.
## Minimal database schema version diff --git a/cmds/core-service/main.go b/cmds/core-service/main.go index 7db8be387..5f1b5bdbe 100644 --- a/cmds/core-service/main.go +++ b/cmds/core-service/main.go @@ -55,8 +55,9 @@ var ( logLevel = flag.String("log_level", logging.DefaultLevel.String(), "The log level") dumpRequests = flag.Bool("dump_requests", false, "Log full HTTP request and response (note: will dump sensitive information to logs; intended only for debugging and/or development)") profServiceName = flag.String("gcp_prof_service_name", "", "Service name for the Go profiler") - enableOpenTelemetry = flag.Bool("enable_opentelemetry", false, "Enable OpenTelemetry, including traces and activation metric endpoint") - metricsListeningAddress = flag.String("metrics_addr", ":8079", "Address and port that the OpenTelemetry prometheus service binds to and listens on for incoming connections") + enableMetrics = flag.Bool("enable_metrics", false, "Enable metric endpoint") + enableTracing = flag.Bool("enable_tracing", false, "Enable tracing") + metricsListeningAddress = flag.String("metrics_addr", ":8079", "Address and port that the for the prometheus-compatible metric service binds to and listens on for incoming connections") pkFile = flag.String("public_key_files", "", "Path to public Keys to use for JWT decoding, separated by commas.") jwksEndpoint = flag.String("jwks_endpoint", "", "URL pointing to an endpoint serving JWKS") @@ -120,7 +121,7 @@ func createRIDServers(ctx context.Context, locality string, logger *zap.Logger) return nil, nil, stacktrace.Propagate(err, "Unable to interact with store") } - if *enableOpenTelemetry { + if *enableMetrics { err = registerRIDMetrics(ctx, ridStore) if err != nil { @@ -147,7 +148,7 @@ func createSCDServer(ctx context.Context, logger *zap.Logger) (*scd.Server, erro return nil, err } - if *enableOpenTelemetry { + if *enableMetrics { err = registerSCDMetrics(ctx, scdStore) if err != nil { @@ -339,7 +340,7 @@ func RunHTTPServer(ctx 
context.Context, ctxCanceler func(), address, locality st handler = authorizer.TokenMiddleware(handler) handler = timeoutMiddleware(*timeout, handler) - if *enableOpenTelemetry { + if *enableMetrics || *enableTracing { // We use the default settings; the APIRouter handler will override the span value accordingly, as it has more information. handler = otelhttp.NewHandler(handler, "http") } @@ -454,8 +455,8 @@ func main() { } // Set up OpenTelemetry. - if *enableOpenTelemetry { - otelShutdown, err := setupOTelSDK(ctx, *metricsListeningAddress) + if *enableMetrics || *enableTracing { + otelShutdown, err := setupOTelSDK(ctx, *enableMetrics, *enableTracing, *metricsListeningAddress) if err != nil { logger.Panic("Failed to initialize OpenTelemetry", zap.Error(err)) } diff --git a/cmds/core-service/otel.go b/cmds/core-service/otel.go index 811fe0258..f2269a1f0 100644 --- a/cmds/core-service/otel.go +++ b/cmds/core-service/otel.go @@ -23,28 +23,42 @@ import ( // setupOTelSDK bootstraps the OpenTelemetry pipeline. // If it does not return an error, make sure to call shutdown for proper cleanup. -func setupOTelSDK(ctx context.Context, metricsListeningAddress string) (func(context.Context) error, error) { +func setupOTelSDK(ctx context.Context, enableMetrics bool, enableTracing bool, metricsListeningAddress string) (func(context.Context) error, error) { // Set up propagator. prop := newPropagator() otel.SetTextMapPropagator(prop) - // Set up trace provider. - tracerProvider, err := newTracerProvider(ctx) - if err != nil { - return nil, err + var tracerProvider *trace.TracerProvider + var meterProvider *metric.MeterProvider + + if enableTracing { + // Set up trace provider. Assign with "=" (not ":=") so the outer variable captured by shutdown is populated.
+ var err error + if tracerProvider, err = newTracerProvider(ctx); err != nil { + return nil, err + } + otel.SetTracerProvider(tracerProvider) } - otel.SetTracerProvider(tracerProvider) - // Set up metrics exporter - meterProvider, err := newMeterProvider(ctx, metricsListeningAddress) - if err != nil { - return nil, err + if enableMetrics { + // Set up metrics exporter. Assign with "=" (not ":=") so the outer variable captured by shutdown is populated. + var err error + if meterProvider, err = newMeterProvider(ctx, metricsListeningAddress); err != nil { + return nil, err + } + otel.SetMeterProvider(meterProvider) } - otel.SetMeterProvider(meterProvider) shutdown := func(ctx context.Context) error { - return errors.Join(tracerProvider.Shutdown(ctx), meterProvider.Shutdown(ctx)) + var err error + if tracerProvider != nil { + err = errors.Join(err, tracerProvider.Shutdown(ctx)) + } + if meterProvider != nil { + err = errors.Join(err, meterProvider.Shutdown(ctx)) + } + return err } return shutdown, nil } diff --git a/deploy/infrastructure/dependencies/terraform-commons-dss/helm.tf b/deploy/infrastructure/dependencies/terraform-commons-dss/helm.tf index 4da77e55a..c185319fb 100644 --- a/deploy/infrastructure/dependencies/terraform-commons-dss/helm.tf +++ b/deploy/infrastructure/dependencies/terraform-commons-dss/helm.tf @@ -68,6 +68,7 @@ resource "local_file" "helm_chart_values" { publicEndpoint = "https://${var.app_hostname}" enableScd = var.enable_scd enableScdGlobalLock = var.enable_scd_global_lock + enableDssMetrics = var.enable_dss_metrics locality = "zone=${var.locality}" evict = { @@ -274,6 +275,7 @@ resource "local_file" "helm_chart_values" { publicEndpoint = "https://${var.app_hostname}" enableScd = var.enable_scd enableScdGlobalLock = var.enable_scd_global_lock + enableDssMetrics = var.enable_dss_metrics locality = "zone=${var.locality}" evict = { diff --git a/deploy/infrastructure/dependencies/terraform-commons-dss/tanka.tf b/deploy/infrastructure/dependencies/terraform-commons-dss/tanka.tf index d0ca3cd24..6fc846719 100644 ---
a/deploy/infrastructure/dependencies/terraform-commons-dss/tanka.tf +++ b/deploy/infrastructure/dependencies/terraform-commons-dss/tanka.tf @@ -9,6 +9,7 @@ resource "local_file" "tanka_config_main" { VAR_CLUSTER_CONTEXT = var.kubernetes_context_name VAR_ENABLE_SCD = var.enable_scd VAR_ENABLE_SCD_GLOBAL_LOCK = var.enable_scd_global_lock + VAR_ENABLE_DSS_METRICS = var.enable_dss_metrics VAR_DB_HOSTNAME_SUFFIX = var.db_hostname_suffix VAR_LOCALITY = var.locality VAR_DATASTORE = var.datastore_type diff --git a/deploy/infrastructure/dependencies/terraform-commons-dss/templates/main.jsonnet.tmp b/deploy/infrastructure/dependencies/terraform-commons-dss/templates/main.jsonnet.tmp index 73209a4bb..31200a25b 100644 --- a/deploy/infrastructure/dependencies/terraform-commons-dss/templates/main.jsonnet.tmp +++ b/deploy/infrastructure/dependencies/terraform-commons-dss/templates/main.jsonnet.tmp @@ -12,6 +12,7 @@ local metadata = metadataBase { single_cluster: false, enableScd: ${VAR_ENABLE_SCD}, // <-- This boolean value is VAR_ENABLE_SCD enableScdGlobalLock: ${VAR_ENABLE_SCD_GLOBAL_LOCK}, // <-- This boolean value is VAR_ENABLE_SCD_GLOBAL_LOCK + enableDssMetrics: ${VAR_ENABLE_DSS_METRICS}, // <-- This boolean value is VAR_ENABLE_DSS_METRICS datastore: '${VAR_DATASTORE}', locality: '${VAR_LOCALITY}', cockroach+: { diff --git a/deploy/infrastructure/dependencies/terraform-commons-dss/variables.gen.tf b/deploy/infrastructure/dependencies/terraform-commons-dss/variables.gen.tf index f8a878838..faa2ecd95 100644 --- a/deploy/infrastructure/dependencies/terraform-commons-dss/variables.gen.tf +++ b/deploy/infrastructure/dependencies/terraform-commons-dss/variables.gen.tf @@ -488,3 +488,16 @@ variable "enable_monitoring" { } +variable "enable_dss_metrics" { + type = bool + default = false + description = <<-EOT + Enable DSS's prometheus metric. + + Require DSS version to be at least 0.23.0. 
+ + Example: `true` + EOT +} + + diff --git a/deploy/infrastructure/modules/terraform-aws-dss/TFVARS.gen.md b/deploy/infrastructure/modules/terraform-aws-dss/TFVARS.gen.md index a04e9431f..435f3a219 100644 --- a/deploy/infrastructure/modules/terraform-aws-dss/TFVARS.gen.md +++ b/deploy/infrastructure/modules/terraform-aws-dss/TFVARS.gen.md @@ -164,6 +164,12 @@ Use latest to use the latest schema version.

Use latest to use the latest schema version.

Example: 3.1.0


Default value: "latest" + + enable_dss_metrics (bool) +

Enable DSS's prometheus metric.

+

Require DSS version to be at least 0.23.0.

+

Example: true

+
Default value: false enable_monitoring (bool)

Set to true to enable monitoring stack with prometheus / grafana.

diff --git a/deploy/infrastructure/modules/terraform-aws-dss/main.tf b/deploy/infrastructure/modules/terraform-aws-dss/main.tf index a7738d32a..c86df7f98 100644 --- a/deploy/infrastructure/modules/terraform-aws-dss/main.tf +++ b/deploy/infrastructure/modules/terraform-aws-dss/main.tf @@ -60,6 +60,7 @@ module "terraform-commons-dss" { enable_monitoring = var.enable_monitoring enable_scd = var.enable_scd enable_scd_global_lock = var.enable_scd_global_lock + enable_dss_metrics = var.enable_dss_metrics prometheus_hostname = var.prometheus_hostname ip_prometheus = module.terraform-aws-kubernetes.ip_prometheus diff --git a/deploy/infrastructure/modules/terraform-aws-dss/variables.gen.tf b/deploy/infrastructure/modules/terraform-aws-dss/variables.gen.tf index 8ceb75d0d..2006bd019 100644 --- a/deploy/infrastructure/modules/terraform-aws-dss/variables.gen.tf +++ b/deploy/infrastructure/modules/terraform-aws-dss/variables.gen.tf @@ -602,3 +602,16 @@ variable "enable_monitoring" { } +variable "enable_dss_metrics" { + type = bool + default = false + description = <<-EOT + Enable DSS's prometheus metric. + + Require DSS version to be at least 0.23.0. + + Example: `true` + EOT +} + + diff --git a/deploy/infrastructure/modules/terraform-google-dss/TFVARS.gen.md b/deploy/infrastructure/modules/terraform-google-dss/TFVARS.gen.md index c766ef271..8b25f4f08 100644 --- a/deploy/infrastructure/modules/terraform-google-dss/TFVARS.gen.md +++ b/deploy/infrastructure/modules/terraform-google-dss/TFVARS.gen.md @@ -126,6 +126,12 @@ Use latest to use the latest schema version.

Use latest to use the latest schema version.

Example: 3.1.0


Default value: "latest" + + enable_dss_metrics (bool) +

Enable DSS's prometheus metric.

+

Require DSS version to be at least 0.23.0.

+

Example: true

+
Default value: false enable_monitoring (bool)

Set to true to enable monitoring stack with prometheus / grafana.

diff --git a/deploy/infrastructure/modules/terraform-google-dss/main.tf b/deploy/infrastructure/modules/terraform-google-dss/main.tf index ff8dfcb66..3be59b50e 100644 --- a/deploy/infrastructure/modules/terraform-google-dss/main.tf +++ b/deploy/infrastructure/modules/terraform-google-dss/main.tf @@ -60,6 +60,7 @@ module "terraform-commons-dss" { enable_monitoring = var.enable_monitoring enable_scd = var.enable_scd enable_scd_global_lock = var.enable_scd_global_lock + enable_dss_metrics = var.enable_dss_metrics prometheus_hostname = var.prometheus_hostname ip_prometheus = module.terraform-google-kubernetes.ip_prometheus diff --git a/deploy/infrastructure/modules/terraform-google-dss/variables.gen.tf b/deploy/infrastructure/modules/terraform-google-dss/variables.gen.tf index 88cf4f4d9..796b5172c 100644 --- a/deploy/infrastructure/modules/terraform-google-dss/variables.gen.tf +++ b/deploy/infrastructure/modules/terraform-google-dss/variables.gen.tf @@ -604,3 +604,16 @@ variable "enable_monitoring" { } +variable "enable_dss_metrics" { + type = bool + default = false + description = <<-EOT + Enable DSS's prometheus metric. + + Require DSS version to be at least 0.23.0. + + Example: `true` + EOT +} + + diff --git a/deploy/infrastructure/utils/definitions/enable_dss_metrics.tf b/deploy/infrastructure/utils/definitions/enable_dss_metrics.tf new file mode 100644 index 000000000..6c851ac6e --- /dev/null +++ b/deploy/infrastructure/utils/definitions/enable_dss_metrics.tf @@ -0,0 +1,11 @@ +variable "enable_dss_metrics" { + type = bool + default = false + description = <<-EOT + Enable DSS's prometheus metric. + + Require DSS version to be at least 0.23.0. 
+ + Example: `true` + EOT +} diff --git a/deploy/infrastructure/utils/variables.py b/deploy/infrastructure/utils/variables.py index 496861155..de238bc23 100755 --- a/deploy/infrastructure/utils/variables.py +++ b/deploy/infrastructure/utils/variables.py @@ -64,6 +64,7 @@ "evict_rid_isas", "evict_rid_subscriptions", "enable_monitoring", + "enable_dss_metrics", ] # dependencies/terraform-*-kubernetes diff --git a/deploy/operations/ci/aws-1/variables.gen.tf b/deploy/operations/ci/aws-1/variables.gen.tf index 8ceb75d0d..2006bd019 100644 --- a/deploy/operations/ci/aws-1/variables.gen.tf +++ b/deploy/operations/ci/aws-1/variables.gen.tf @@ -602,3 +602,16 @@ variable "enable_monitoring" { } +variable "enable_dss_metrics" { + type = bool + default = false + description = <<-EOT + Enable DSS's prometheus metric. + + Require DSS version to be at least 0.23.0. + + Example: `true` + EOT +} + + diff --git a/deploy/services/helm-charts/dss/templates/dss-core-service.yaml b/deploy/services/helm-charts/dss/templates/dss-core-service.yaml index 9553454df..273a3b354 100644 --- a/deploy/services/helm-charts/dss/templates/dss-core-service.yaml +++ b/deploy/services/helm-charts/dss/templates/dss-core-service.yaml @@ -35,6 +35,12 @@ spec: metadata: labels: app: {{.Release.Name}}-core-service +{{ if $dss.conf.enableDssMetrics }} + annotations: + prometheus.io/path: metrics + prometheus.io/port: '8079' + prometheus.io/scrape: 'true' +{{ end }} spec: initContainers: {{- $waitForDatastore | nindent 8 }} @@ -61,6 +67,9 @@ spec: - --dump_requests=true - --enable_scd={{$dss.conf.enableScd}} - --enable_scd_global_lock={{$dss.conf.enableScdGlobalLock | default false}} +{{ if $dss.conf.enableDssMetrics }} + - --enable_metrics=true +{{ end }} - --gcp_prof_service_name= {{- if $dss.conf.jwksEndpoint }} - --jwks_endpoint={{ $dss.conf.jwksEndpoint }} @@ -79,6 +88,10 @@ spec: ports: - containerPort: 8080 name: http +{{ if $dss.conf.enableDssMetrics }} + - containerPort: 8079 + name: metrics +{{ end }} 
readinessProbe: httpGet: path: /healthy diff --git a/deploy/services/helm-charts/dss/values.example.yaml b/deploy/services/helm-charts/dss/values.example.yaml index 20b1cfc7e..674fa05ff 100644 --- a/deploy/services/helm-charts/dss/values.example.yaml +++ b/deploy/services/helm-charts/dss/values.example.yaml @@ -13,6 +13,7 @@ dss: publicEndpoint: https://dss.example.com enableScd: true enableScdGlobalLock: false + enableDssMetrics: false locality: zone=interuss-example-google-ew1 cockroachdb: diff --git a/deploy/services/helm-charts/dss/values.schema.json b/deploy/services/helm-charts/dss/values.schema.json index 08aa1e399..c47de1112 100644 --- a/deploy/services/helm-charts/dss/values.schema.json +++ b/deploy/services/helm-charts/dss/values.schema.json @@ -266,6 +266,9 @@ "enableScdGlobalLock": { "type": "boolean" }, + "enableDssMetrics": { + "type": "boolean" + }, "hostname": { "type": "string", "description": "Public hostname of the dss. Example: dss.example.com" diff --git a/deploy/services/helm-charts/dss/values.yaml b/deploy/services/helm-charts/dss/values.yaml index a7f67ffda..e8b39123b 100644 --- a/deploy/services/helm-charts/dss/values.yaml +++ b/deploy/services/helm-charts/dss/values.yaml @@ -225,6 +225,41 @@ prometheus: target_label: __name__ kubernetes_sd_configs: - role: endpoints + - job_name: K8s-Pods + kubernetes_sd_configs: + - role: pod + tls_config: + insecure_skip_verify: true + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + separator: ; + regex: "true" + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + separator: ; + regex: (.+) + target_label: __metrics_path__ + replacement: $1 + action: replace + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + separator: ; + regex: ([^:]+)(?::\d+)?;(\d+) + target_label: __address__ + replacement: $1:$2 + action: replace + - source_labels: [__meta_kubernetes_namespace] + 
separator: ; + regex: (.*) + target_label: kubernetes_namespace + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod_name + replacement: $1 + action: replace recording_rules.yml: "groups": @@ -319,6 +354,9 @@ prometheus: grafana: + image: + tag: "13.0-ubuntu-slim" + datasources: datasources.yaml: apiVersion: 1 diff --git a/deploy/services/tanka/core-service.libsonnet b/deploy/services/tanka/core-service.libsonnet index d760b558d..b6da80194 100644 --- a/deploy/services/tanka/core-service.libsonnet +++ b/deploy/services/tanka/core-service.libsonnet @@ -89,6 +89,13 @@ local awsLoadBalancer(metadata) = base.AWSLoadBalancerWithManagedCert(metadata, }, spec+: { template+: { + metadata+: if metadata.enableDssMetrics then { + annotations: { + 'prometheus.io/path': 'metrics', + 'prometheus.io/port': '8079', + 'prometheus.io/scrape': 'true', + } + } else {}, spec+: { volumes: volumes.all(metadata).backendVolumes, initContainers: [ @@ -104,7 +111,12 @@ local awsLoadBalancer(metadata) = base.AWSLoadBalancerWithManagedCert(metadata, containerPort: metadata.backend.port, name: 'http', }, - ], + ] + if metadata.enableDssMetrics then [ + { + containerPort: 8079, + name: 'metrics', + }, + ] else [], volumeMounts: volumes.all(metadata).backendMounts, command: ['core-service'], args_:: { @@ -119,9 +131,11 @@ local awsLoadBalancer(metadata) = base.AWSLoadBalancerWithManagedCert(metadata, enable_scd: metadata.enableScd, enable_scd_global_lock: metadata.enableScdGlobalLock, } + datastoreparameters.all(metadata) - + if metadata.backend.publicEndpoint != '' then { + + (if metadata.backend.publicEndpoint != '' then { public_endpoint: metadata.backend.publicEndpoint, - } else {}, + } else {}) + (if metadata.enableDssMetrics then { + enable_metrics: true, + } else {}), readinessProbe: { httpGet: { path: '/healthy', diff --git a/deploy/services/tanka/dashboard.libsonnet b/deploy/services/tanka/dashboard.libsonnet 
index 9a33da9da..d6132de69 100644 --- a/deploy/services/tanka/dashboard.libsonnet +++ b/deploy/services/tanka/dashboard.libsonnet @@ -5,6 +5,7 @@ local crdbSqlDash = import 'grafana_dashboards/crdb-sql-grafana.json'; local crdbStorageDash = import 'grafana_dashboards/crdb-storage-grafana.json'; local promOverview = import 'grafana_dashboards/prometheus-overview.json'; local kubeOverview = import 'grafana_dashboards/kubernetes-overview.json'; +local dssDash = import 'grafana_dashboards/dss.json'; local util = import 'util.libsonnet'; { all(metadata): { @@ -39,6 +40,11 @@ local util = import 'util.libsonnet'; 'kubernetes-overview.json': std.toString(kubeOverview), }, }, + grafDss: base.ConfigMap(metadata, 'grafana-dss') { + data: { + 'dss.json': std.toString(dssDash), + }, + }, }, volumeConfigs: { grafCrdbReplica: { @@ -83,6 +89,13 @@ local util = import 'util.libsonnet'; name: 'grafana-kube-overview', }, }, + grafDss: { + name: 'grafana-dss', + configMap: { + defaultMode: 420, + name: 'grafana-dss', + }, + }, }, volumes: util.mapToList(self.volumeConfigs), mountConfigs: { @@ -116,6 +129,11 @@ local util = import 'util.libsonnet'; readOnly: false, mountPath: '/var/lib/grafana/dashboards/grafana-kube-overview', }, + grafDss: { + name: 'grafana-dss', + readOnly: false, + mountPath: '/var/lib/grafana/dashboards/grafana-dss', + }, }, mount: util.mapToList(self.mountConfigs), }, diff --git a/deploy/services/tanka/examples/minikube/main.jsonnet b/deploy/services/tanka/examples/minikube/main.jsonnet index f571c8fa5..a3af495a0 100644 --- a/deploy/services/tanka/examples/minikube/main.jsonnet +++ b/deploy/services/tanka/examples/minikube/main.jsonnet @@ -10,6 +10,7 @@ local metadata = metadataBase { single_cluster: true, enableScd: true, enableScdGlobalLock: false, + enableDssMetrics: false, datastore: 'yugabyte', locality: 'minikube', cockroach+: { diff --git a/deploy/services/tanka/examples/minimum/main.jsonnet b/deploy/services/tanka/examples/minimum/main.jsonnet index 
7ad633dc7..8a195abb3 100644 --- a/deploy/services/tanka/examples/minimum/main.jsonnet +++ b/deploy/services/tanka/examples/minimum/main.jsonnet @@ -12,6 +12,7 @@ local metadata = metadataBase { single_cluster: false, enableScd: false, // <-- This boolean value is VAR_ENABLE_SCD enableScdGlobalLock: false, // <-- This boolean value is VAR_ENABLE_SCD_GLOBAL_LOCK + enableDssMetrics: false, // <-- This boolean value is VAR_ENABLE_DSS_METRICS datastore: 'VAR_DATASTORE', locality: 'VAR_LOCALITY', cockroach+: { diff --git a/deploy/services/tanka/grafana.libsonnet b/deploy/services/tanka/grafana.libsonnet index f7a43f08b..dcc366016 100644 --- a/deploy/services/tanka/grafana.libsonnet +++ b/deploy/services/tanka/grafana.libsonnet @@ -100,7 +100,7 @@ local notifierConfig(metadata) = { containers: [ { name: 'grafana', - image: 'grafana/grafana:latest', + image: 'grafana/grafana:13.0-ubuntu-slim', ports: [ { name: 'grafana', diff --git a/deploy/services/tanka/grafana_dashboards/dss.json b/deploy/services/tanka/grafana_dashboards/dss.json new file mode 100644 index 000000000..98bc0179f --- /dev/null +++ b/deploy/services/tanka/grafana_dashboards/dss.json @@ -0,0 +1,1724 @@ +{ + "apiVersion": "dashboard.grafana.app/v2", + "kind": "Dashboard", + "metadata": { + "name": "gmx6nv", + "namespace": "default", + "uid": "9f657869-25b1-43ad-ba93-612445924e11", + "resourceVersion": "1780401114738017", + "generation": 17, + "creationTimestamp": "2026-06-02T11:23:14Z", + "labels": { + "grafana.app/deprecatedInternalID": "3625584525881344" + }, + "annotations": { + "grafana.app/createdBy": "anonymous:0", + "grafana.app/folder": "", + "grafana.app/saved-from-ui": "Grafana v13.0.1 (a100054f)", + "grafana.app/updatedBy": "anonymous:0", + "grafana.app/updatedTimestamp": "2026-06-02T11:51:54Z" + } + }, + "spec": { + "annotations": [ + { + "kind": "AnnotationQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "grafana", + "version": "v0", + "datasource": { + "name": "-- Grafana --" 
+ }, + "spec": {} + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "builtIn": true + } + } + ], + "cursorSync": "Off", + "description": "Base dashboard, showing core DSS metrics", + "editable": true, + "elements": { + "panel-1": { + "kind": "Panel", + "spec": { + "id": 1, + "title": "RID: Identification Service Areas", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "prometheus" + }, + "spec": { + "editorMode": "builder", + "expr": "avg(rid_identification_service_areas_total)", + "legendFormat": "Count", + "range": true + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "13.0.1", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-10": { + "kind": "Panel", + "spec": { + "id": 10, + "title": "Latency p50/p95/p99", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "", + "version": "v0", + "spec": { + "expr": "histogram_quantile(0.50, sum by (le) 
(rate(http_server_request_duration_seconds_bucket[5m])))", + "legendFormat": "p50" + } + }, + "refId": "A", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "", + "version": "v0", + "spec": { + "expr": "histogram_quantile(0.95, sum by (le) (rate(http_server_request_duration_seconds_bucket[5m])))", + "legendFormat": "p95" + } + }, + "refId": "B", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "", + "version": "v0", + "spec": { + "expr": "histogram_quantile(0.99, sum by (le) (rate(http_server_request_duration_seconds_bucket[5m])))", + "legendFormat": "p99" + } + }, + "refId": "C", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "13.0.1", + "spec": { + "options": { + "annotations": { + "clustering": -1, + "multiLane": false + }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": 
"none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-11": { + "kind": "Panel", + "spec": { + "id": 11, + "title": "Operations/s by type", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "", + "version": "v0", + "spec": { + "expr": "sum by (pgx_operation_type) (rate(db_client_operation_duration_seconds_count[$__rate_interval]))", + "legendFormat": "{{pgx_operation_type}}" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "13.0.1", + "spec": { + "options": { + "annotations": { + "clustering": -1, + "multiLane": false + }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-12": { + 
"kind": "Panel", + "spec": { + "id": 12, + "title": "Operation latency p50/p95/p99", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "", + "version": "v0", + "spec": { + "expr": "histogram_quantile(0.50, sum by (le, pgx_operation_type) (rate(db_client_operation_duration_seconds_bucket[5m])))", + "legendFormat": "p50 {{pgx_operation_type}}" + } + }, + "refId": "A", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "", + "version": "v0", + "spec": { + "expr": "histogram_quantile(0.95, sum by (le, pgx_operation_type) (rate(db_client_operation_duration_seconds_bucket[5m])))", + "legendFormat": "p95 {{pgx_operation_type}}" + } + }, + "refId": "B", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "", + "version": "v0", + "spec": { + "expr": "histogram_quantile(0.99, sum by (le, pgx_operation_type) (rate(db_client_operation_duration_seconds_bucket[5m])))", + "legendFormat": "p99 {{pgx_operation_type}}" + } + }, + "refId": "C", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "13.0.1", + "spec": { + "options": { + "annotations": { + "clustering": -1, + "multiLane": false + }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-13": { + "kind": "Panel", + "spec": { + "id": 13, + "title": "Operation errors/s", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "", + "version": "v0", + "spec": { + "expr": "sum by (pgx_operation_type) (rate(db_client_operation_errors_total[$__rate_interval]))", + "legendFormat": "{{pgx_operation_type}}" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "13.0.1", + "spec": { + "options": { + "annotations": { + "clustering": -1, + "multiLane": false + }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": 
"none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-2": { + "kind": "Panel", + "spec": { + "id": 2, + "title": "RID: Subscriptions", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "prometheus" + }, + "spec": { + "editorMode": "builder", + "expr": "avg(rid_subscriptions_total)", + "legendFormat": "Count", + "range": true + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "13.0.1", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-3": { + "kind": "Panel", + "spec": { + "id": 3, + "title": "SCD: Constraints", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": 
"prometheus", + "version": "v0", + "datasource": { + "name": "prometheus" + }, + "spec": { + "editorMode": "builder", + "expr": "avg(scd_constraints_total)", + "legendFormat": "Count", + "range": true + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "13.0.1", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-4": { + "kind": "Panel", + "spec": { + "id": 4, + "title": "SCD: Operational Intents", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "prometheus" + }, + "spec": { + "editorMode": "builder", + "expr": "avg(scd_operational_intents_total)", + "legendFormat": "Count", + "range": true + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "13.0.1", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + 
"wideLayout": true + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-5": { + "kind": "Panel", + "spec": { + "id": 5, + "title": "SCD: Subscriptions", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "prometheus" + }, + "spec": { + "editorMode": "builder", + "expr": "avg(scd_subscriptions_total)", + "legendFormat": "Count", + "range": true + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "13.0.1", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-6": { + "kind": "Panel", + "spec": { + "id": 6, + "title": "Status code", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "prometheus" + }, + "spec": { + "editorMode": "code", + "expr": 
"sum(rate(http_server_request_body_size_bytes_count[$__rate_interval])) by (http_response_status_code)", + "legendFormat": "__auto", + "range": true + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "13.0.1", + "spec": { + "options": { + "annotations": { + "clustering": -1, + "multiLane": false + }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "eps", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-7": { + "kind": "Panel", + "spec": { + "id": 7, + "title": "Route", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "prometheus" + }, + "spec": { + "editorMode": "code", + "expr": 
"sum(rate(http_server_request_body_size_bytes_count[$__rate_interval])) by (http_route)", + "legendFormat": "{{http_route}}", + "range": true + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "13.0.1", + "spec": { + "options": { + "annotations": { + "clustering": -1, + "multiLane": false + }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "eps", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-8": { + "kind": "Panel", + "spec": { + "id": 8, + "title": "Method", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "prometheus" + }, + "spec": { + "editorMode": "code", + "expr": 
"sum(rate(http_server_request_body_size_bytes_count[$__rate_interval])) by (http_request_method)", + "legendFormat": "__auto", + "range": true + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "13.0.1", + "spec": { + "options": { + "annotations": { + "clustering": -1, + "multiLane": false + }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "eps", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-9": { + "kind": "Panel", + "spec": { + "id": 9, + "title": "Latency", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "prometheus" + }, + "spec": { + "expr": "sum by (le) (rate(http_server_request_duration_seconds_bucket{ 
}[$__rate_interval]))", + "format": "heatmap", + "fromExploreMetrics": false + } + }, + "refId": "http_server_request_duration_seconds_bucket-heatmap", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "prometheus" + }, + "spec": { + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket[$__rate_interval])) by (le))", + "instant": false, + "legendFormat": "__auto", + "range": true + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": { + "maxDataPoints": 500 + } + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "heatmap", + "version": "13.0.1", + "spec": { + "options": { + "annotations": { + "clustering": -1, + "multiLane": false + }, + "calculate": false, + "cellGap": 1, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 32 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "selectionMode": "x", + "showValue": "auto", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + } + } + } + } + } + }, + "layout": { + "kind": "RowsLayout", + "spec": { + "rows": [ + { + "kind": "RowsLayoutRow", + "spec": { + "title": "Objects in database", + "collapse": false, + "layout": { + "kind": "GridLayout", + "spec": { + "items": [ + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 
0, + "width": 3, + "height": 10, + "element": { + "kind": "ElementReference", + "name": "panel-1" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 3, + "y": 0, + "width": 3, + "height": 10, + "element": { + "kind": "ElementReference", + "name": "panel-2" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 6, + "y": 0, + "width": 3, + "height": 10, + "element": { + "kind": "ElementReference", + "name": "panel-3" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 9, + "y": 0, + "width": 3, + "height": 10, + "element": { + "kind": "ElementReference", + "name": "panel-4" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 12, + "y": 0, + "width": 3, + "height": 10, + "element": { + "kind": "ElementReference", + "name": "panel-5" + } + } + } + ] + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "title": "HTTP Statistics", + "collapse": false, + "layout": { + "kind": "GridLayout", + "spec": { + "items": [ + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 0, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-6" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 12, + "y": 0, + "width": 12, + "height": 11, + "element": { + "kind": "ElementReference", + "name": "panel-9" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 8, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-7" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 12, + "y": 11, + "width": 12, + "height": 13, + "element": { + "kind": "ElementReference", + "name": "panel-10" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 16, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-8" + } + } + } + ] + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "title": "Database Statistics", + "collapse": false, + "layout": { + "kind": "GridLayout", 
+ "spec": { + "items": [ + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 0, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-11" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 12, + "y": 0, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-13" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 8, + "width": 24, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-12" + } + } + } + ] + } + } + } + } + ] + } + }, + "links": [], + "liveNow": false, + "preload": false, + "tags": [], + "timeSettings": { + "timezone": "browser", + "from": "now-1h", + "to": "now", + "autoRefresh": "", + "autoRefreshIntervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "hideTimepicker": false, + "fiscalYearStartMonth": 0 + }, + "title": "DSS Metrics", + "variables": [], + "preferences": { + "layout": { + "kind": "GridLayout", + "spec": { + "items": [] + } + } + } + } +} diff --git a/deploy/services/tanka/metadata_base.libsonnet b/deploy/services/tanka/metadata_base.libsonnet index 1e96928f9..62bac6f03 100644 --- a/deploy/services/tanka/metadata_base.libsonnet +++ b/deploy/services/tanka/metadata_base.libsonnet @@ -9,6 +9,7 @@ single_cluster: false, enableScd: false, enableScdGlobalLock: false, + enableDssMetrics: false, datastore: 'cockroachdb', locality: error 'must supply locality', cockroach: { diff --git a/deploy/services/tanka/prometheus_configs/scrape-configs.libsonnet b/deploy/services/tanka/prometheus_configs/scrape-configs.libsonnet index 7fa50060e..aedb28bd2 100644 --- a/deploy/services/tanka/prometheus_configs/scrape-configs.libsonnet +++ b/deploy/services/tanka/prometheus_configs/scrape-configs.libsonnet @@ -187,5 +187,42 @@ insecure_skip_verify: true, }, }, + { + job_name: 'K8s-Pods', + kubernetes_sd_configs: [{ role: 'pod' }], + relabel_configs: [ + { + source_labels: 
['__meta_kubernetes_pod_annotation_prometheus_io_scrape'], + action: 'keep', + regex: true, + }, + { + source_labels: ['__meta_kubernetes_pod_annotation_prometheus_io_path'], + action: 'replace', + target_label: '__metrics_path__', + regex: '(.+)', + }, + { + source_labels: ['__address__', '__meta_kubernetes_pod_annotation_prometheus_io_port'], + action: 'replace', + target_label: '__address__', + regex: '([^:]+)(?::\\d+)?;(\\d+)', + replacement: '$1:$2', + }, + { + source_labels: ['__meta_kubernetes_namespace'], + action: 'replace', + target_label: 'kubernetes_namespace', + }, + { + source_labels: ['__meta_kubernetes_pod_name'], + action: 'replace', + target_label: 'pod_name', + }, + ], + tls_config: { + insecure_skip_verify: true, + }, + }, ], } diff --git a/docs/infrastructure/google-manual.md b/docs/infrastructure/google-manual.md index c7f98c3bb..12909313e 100644 --- a/docs/infrastructure/google-manual.md +++ b/docs/infrastructure/google-manual.md @@ -214,6 +214,9 @@ your chosen provider. e global throughput but improve throughput with lot of subscriptions in the same areas. + 1. `VAR_ENABLE_DSS_METRICS`: Set this boolean to true to enable the + Prometheus-compatible metrics endpoint. + 1. `VAR_LOCALITY`: Unique name for your DSS instance. Currently, we recommend "_", and the `=` character is not allowed. However, any unique (among all other participating DSS diff --git a/docs/operations/monitoring.md b/docs/operations/monitoring.md index 25a650b8c..5fe328d2d 100644 --- a/docs/operations/monitoring.md +++ b/docs/operations/monitoring.md @@ -18,6 +18,8 @@ This can be enabled via: - The `monitoring.enabled` option in helm - By using tanka, which always enables it +By default, DSS metrics are not enabled; see the OpenTelemetry section below to activate them.
+ ### Grafana access To access the Grafana interface, first ensure that the appropriate @@ -137,23 +139,30 @@ You can enable it on the DSS server to get: * Tracing for all queries * A Prometheus endpoint with some metrics -Currently, this setting is not yet available in Terraform, Helm or Tanka. +Currently, these settings are not yet available in Terraform, Helm or Tanka. !!! warning - By default, when OpenTelemetry is enabled, the metrics service listens on all addresses. + By default, when metrics are enabled, the metrics service listens on all addresses. ### Metrics +Use the `--enable_metrics` flag to enable metrics. + Point any Prometheus server to the endpoint (by default on port 8079). You can use the `--metrics_addr` flag to change the listening port and address. -No dashboard has been created yet, but one is planned. +A dashboard is automatically deployed by Helm and Tanka. If you use your own Grafana instance, the dashboard can be found [here](https://github.com/interuss/dss/blob/master/deploy/services/tanka/grafana_dashboards/dss.json). + +You can use the `enable_dss_metrics` option in Terraform, `dss.conf.enableDssMetrics` in Helm, or `enableDssMetrics` in Tanka to enable metrics when deploying with these tools. +This will also automatically enable collection by Prometheus if used. ### Tracing +Use the `--enable_tracing` flag to enable tracing. + Traces can be sent to any OpenTelemetry-compliant service. Self-hostable examples include [Jaeger](https://www.jaegertracing.io/), [OpenObserve](https://github.com/openobserve/openobserve), [Grafana Tempo](https://grafana.com/docs/tempo/latest/), and [SigNoz](https://github.com/SigNoz/signoz). Multiple SaaS solutions are also available (including some of the previously mentioned tools). You need to use the `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable to configure it. Point it toward your server by following its specific documentation.
@@ -202,6 +211,6 @@ index f09bda59..8c96cf3e 100755 -locality local_dev \ - -public_endpoint http://127.0.0.1:8082 + -public_endpoint http://127.0.0.1:8082 \ -+ -enable_opentelemetry ++ -enable_tracing fi ```