From bc76f33642a7485b41978c663f92f7dda5152053 Mon Sep 17 00:00:00 2001 From: "amazing.gao" Date: Mon, 25 May 2026 10:42:38 +0800 Subject: [PATCH] fix(metrics): align ginprom labels and sync Redis alert docs --- docs/metric.md | 14 ++- docs/prometheus_alerts_template.yaml | 14 +-- docs/wechat_portfolio_alerts.yaml | 97 +++++++++++++++++++-- pkg/server/ginserver/mid/ginprom/ginprom.go | 19 ++-- 4 files changed, 118 insertions(+), 26 deletions(-) diff --git a/docs/metric.md b/docs/metric.md index 839ec3a..5591984 100644 --- a/docs/metric.md +++ b/docs/metric.md @@ -65,9 +65,15 @@ | :------------------------------------- | :-------- | :----------------------------------- | :------------------------------------------ | | `http_server_requests_inflight` | Gauge | `method`, `url` | 当前正在处理的 HTTP 请求数 (饱和度) | | `http_server_requests_total` | Counter | `method`, `url`, `status`, `errcode` | 处理的 HTTP 请求总数 (流量 & 错误) | -| `http_server_request_duration_seconds` | Histogram | `method`, `url`, `status`, `errcode` | HTTP 请求耗时分布 (延迟),桶:.005s - 10s | +| `http_server_request_duration_seconds` | Histogram | `method`, `url`, `status` | HTTP 请求耗时分布 (延迟),桶:.005s - 10s | | `http_server_request_size_bytes` | Histogram | `method`, `url` | HTTP 请求体大小分布 (流量),桶:1KB - 100MB | -| `http_server_response_size_bytes` | Histogram | `method`, `url`, `status`, `errcode` | HTTP 响应体大小分布 (流量),桶:1KB - 100MB | +| `http_server_response_size_bytes` | Histogram | `method`, `url` | HTTP 响应体大小分布 (流量),桶:1KB - 100MB | + +> 兼容性说明(`box/box v1.3.0`): +> - `http_server_requests_total` 仍保留 `errcode`,用于业务错误统计与告警。 +> - `http_server_request_duration_seconds` 已移除 `errcode` 标签(仅保留 `method`,`url`,`status`),用于降低时序基数。 +> - `http_server_response_size_bytes` 仅保留 `method`,`url`。 +> - Prometheus 无需手工“清除旧指标”再上报新指标。升级后新实例会按新标签集上报,旧标签集时序会自然停止写入并按存储保留策略过期。 ### 1.2 HTTP Client (Wukong) @@ -101,8 +107,8 @@ | 指标名称 | 类型 | Labels | 说明 | | :-------------------------------------- | :-------- | :----------------------------------------------------- | :--------------------- | -| `redis_client_requests_total` | Counter | `address`, `db`, `masterName`, `pipe`, `cmd`, `result` | Redis 命令执行总数 | -| `redis_client_request_duration_seconds` | Histogram | `address`, `db`, `masterName`, `pipe`, `cmd`, `result` | Redis 命令执行耗时分布 | +| `redis_client_requests_total` | Counter | `pipe`, `cmd`, `result` | Redis 命令执行总数 | +| `redis_client_request_duration_seconds` | Histogram | `pipe`, `cmd`, `result` | Redis 命令执行耗时分布 | **错误分类 (`result` 标签值)**: diff --git a/docs/prometheus_alerts_template.yaml b/docs/prometheus_alerts_template.yaml index d1c9419..ab0ab63 100644 --- a/docs/prometheus_alerts_template.yaml +++ b/docs/prometheus_alerts_template.yaml @@ -355,14 +355,14 @@ groups: type: availability annotations: summary: "Redis command failure count: {{ $value }}" - description: "{{ $labels.cmd }} on {{ $labels.address }}. Error type: {{ $labels.result }}" + description: "{{ $labels.cmd }} error type: {{ $labels.result }}" # Redis Connection Error Rate - Critical - alert: RedisConnectionErrorRateCritical expr: | - (sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{result="connection_error"}[1m])) + (sum by(namespace, job, cmd) (rate(redis_client_requests_total{result="connection_error"}[1m])) / - sum by(namespace, job, cmd, address) (rate(redis_client_requests_total[1m]))) > 0.01 + sum by(namespace, job, cmd) (rate(redis_client_requests_total[1m]))) > 0.01 for: 1m labels: severity: critical @@ -371,14 +371,14 @@ groups: error_type: connection annotations: summary: "Redis connection error rate: {{ $value | humanizePercentage }}" - description: "Redis command {{ $labels.cmd }} on {{ $labels.address }} connection error rate is above 1%." + description: "Redis command {{ $labels.cmd }} connection error rate is above 1%." # Redis Timeout Error Rate - Critical - alert: RedisTimeoutErrorRateCritical expr: | - (sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{result="timeout_error"}[1m])) + (sum by(namespace, job, cmd) (rate(redis_client_requests_total{result="timeout_error"}[1m])) / - sum by(namespace, job, cmd, address) (rate(redis_client_requests_total[1m]))) > 0.01 + sum by(namespace, job, cmd) (rate(redis_client_requests_total[1m]))) > 0.01 for: 1m labels: severity: critical @@ -387,7 +387,7 @@ groups: error_type: timeout annotations: summary: "Redis timeout error rate: {{ $value | humanizePercentage }}" - description: "Redis command {{ $labels.cmd }} on {{ $labels.address }} timeout error rate is above 1%." + description: "Redis command {{ $labels.cmd }} timeout error rate is above 1%." - alert: RedisLatencyP99High expr: histogram_quantile(0.99, sum(rate(redis_client_request_duration_seconds_bucket[5m])) by (le, namespace, job)) > 0.1 diff --git a/docs/wechat_portfolio_alerts.yaml b/docs/wechat_portfolio_alerts.yaml index c9aaf02..ec53cb0 100644 --- a/docs/wechat_portfolio_alerts.yaml +++ b/docs/wechat_portfolio_alerts.yaml @@ -331,14 +331,14 @@ groups: type: availability annotations: summary: "Redis command failure count: {{ $value }}" - description: "{{ $labels.cmd }} on {{ $labels.address }}. Error type: {{ $labels.result }}" + description: "{{ $labels.cmd }} error type: {{ $labels.result }}" # Redis Connection Error Rate - Critical - alert: RedisConnectionErrorRateCritical expr: | - (sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{namespace="wechat",job="portfolio",result="connection_error"}[1m])) + (sum by(namespace, job, cmd) (rate(redis_client_requests_total{namespace="wechat",job="portfolio",result="connection_error"}[1m])) / - sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{namespace="wechat",job="portfolio"}[1m]))) > 0.01 + sum by(namespace, job, cmd) (rate(redis_client_requests_total{namespace="wechat",job="portfolio"}[1m]))) > 0.01 for: 1m labels: severity: critical @@ -347,14 +347,14 @@ groups: error_type: connection annotations: summary: "Redis connection error rate: {{ $value | humanizePercentage }}" - description: "Redis command {{ $labels.cmd }} on {{ $labels.address }} connection error rate is above 1%." + description: "Redis command {{ $labels.cmd }} connection error rate is above 1%." # Redis Timeout Error Rate - Critical - alert: RedisTimeoutErrorRateCritical expr: | - (sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{namespace="wechat",job="portfolio",result="timeout_error"}[1m])) + (sum by(namespace, job, cmd) (rate(redis_client_requests_total{namespace="wechat",job="portfolio",result="timeout_error"}[1m])) / - sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{namespace="wechat",job="portfolio"}[1m]))) > 0.01 + sum by(namespace, job, cmd) (rate(redis_client_requests_total{namespace="wechat",job="portfolio"}[1m]))) > 0.01 for: 1m labels: severity: critical @@ -363,7 +363,7 @@ groups: error_type: timeout annotations: summary: "Redis timeout error rate: {{ $value | humanizePercentage }}" - description: "Redis command {{ $labels.cmd }} on {{ $labels.address }} timeout error rate is above 1%." + description: "Redis command {{ $labels.cmd }} timeout error rate is above 1%." - alert: RedisLatencyP99High expr: histogram_quantile(0.99, sum(rate(redis_client_request_duration_seconds_bucket{namespace="wechat",job="portfolio"}[5m])) by (le, namespace, job)) > 0.1 @@ -378,6 +378,89 @@ groups: + # @box_module: kafka + # ========================================================== + # Kafka (EventBus) Alerts + # 应用侧 EventBus(kafka_producer_send_total / kafka_consumer_*)本地序列化、业务回调与消费限速 + # ========================================================== + + - alert: KafkaProducerLocalErrorRateCritical + expr: | + ( + sum by(namespace, job, topic) (rate(kafka_producer_send_total{namespace="wechat",job="portfolio",error!=""}[1m])) + / + sum by(namespace, job, topic) (rate(kafka_producer_send_total{namespace="wechat",job="portfolio"}[1m])) + ) > 0.05 + for: 1m + labels: + severity: critical + component: kafka + type: availability + annotations: + summary: "Kafka producer local error rate: {{ $value | humanizePercentage }}" + description: "Topic {{ $labels.topic }} local errors (e.g. marshal) exceed 5% of producer send counter increments." + + - alert: KafkaProducerLocalErrorRateHigh + expr: | + ( + sum by(namespace, job, topic) (rate(kafka_producer_send_total{namespace="wechat",job="portfolio",error!=""}[1m])) + / + sum by(namespace, job, topic) (rate(kafka_producer_send_total{namespace="wechat",job="portfolio"}[1m])) + ) > 0.01 + for: 2m + labels: + severity: warning + component: kafka + type: availability + annotations: + summary: "Kafka producer local error rate: {{ $value | humanizePercentage }}" + description: "Topic {{ $labels.topic }} local errors exceed 1% of producer send counter increments." + + - alert: KafkaConsumerHandlerErrorRateCritical + expr: | + ( + sum by(namespace, job, topic, group) (rate(kafka_consumer_receive_total{namespace="wechat",job="portfolio",error!=""}[1m])) + / + sum by(namespace, job, topic, group) (rate(kafka_consumer_receive_total{namespace="wechat",job="portfolio"}[1m])) + ) > 0.05 + for: 1m + labels: + severity: critical + component: kafka + type: availability + annotations: + summary: "Kafka consumer handler error rate: {{ $value | humanizePercentage }}" + description: "Topic {{ $labels.topic }} group {{ $labels.group }} handler errors exceed 5% of receive counter increments." + + - alert: KafkaConsumerHandlerErrorRateHigh + expr: | + ( + sum by(namespace, job, topic, group) (rate(kafka_consumer_receive_total{namespace="wechat",job="portfolio",error!=""}[1m])) + / + sum by(namespace, job, topic, group) (rate(kafka_consumer_receive_total{namespace="wechat",job="portfolio"}[1m])) + ) > 0.01 + for: 2m + labels: + severity: warning + component: kafka + type: availability + annotations: + summary: "Kafka consumer handler error rate: {{ $value | humanizePercentage }}" + description: "Topic {{ $labels.topic }} group {{ $labels.group }} handler errors exceed 1% of receive counter increments." + + - alert: KafkaConsumerRatelimitSustained + expr: sum by(namespace, job, topic, group) (rate(kafka_consumer_ratelimit_total{namespace="wechat",job="portfolio"}[5m])) > 0 + for: 10m + labels: + severity: warning + component: kafka + type: saturation + annotations: + summary: "Kafka consumer rate limit active" + description: "Topic {{ $labels.topic }} group {{ $labels.group }} is repeatedly hitting consumer-side rate limits." + + + # @box_module: schedule # ========================================================== # Schedule Job Alerts diff --git a/pkg/server/ginserver/mid/ginprom/ginprom.go b/pkg/server/ginserver/mid/ginprom/ginprom.go index fa1a555..66d4088 100644 --- a/pkg/server/ginserver/mid/ginprom/ginprom.go +++ b/pkg/server/ginserver/mid/ginprom/ginprom.go @@ -71,18 +71,18 @@ func newGinProm(c *Config) *GinProm { func (prom *GinProm) Handler() gin.HandlerFunc { return func(ctx *gin.Context) { start := time.Now() - labels := []string{ + baseLabels := []string{ ctx.Request.Method, prom.cfg.requestURLMappingFn(ctx), } // Saturation: +1 - reqInFlight.WithLabelValues(labels...).Inc() - defer reqInFlight.WithLabelValues(labels...).Dec() + reqInFlight.WithLabelValues(baseLabels...).Inc() + defer reqInFlight.WithLabelValues(baseLabels...).Dec() // Traffic: Request Size reqSz := computeApproximateRequestSize(ctx.Request) - reqSize.WithLabelValues(labels...).Observe(reqSz) + reqSize.WithLabelValues(baseLabels...).Observe(reqSz) ctx.Next() @@ -91,13 +91,16 @@ func (prom *GinProm) Handler() gin.HandlerFunc { resSz = 0 } - labels = append(labels, strconv.Itoa(ctx.Writer.Status()), strconv.Itoa(ctx.GetInt("errcode"))) + status := strconv.Itoa(ctx.Writer.Status()) + errCode := strconv.Itoa(ctx.GetInt("errcode")) + statusLabels := append(baseLabels, status) + totalLabels := append(statusLabels, errCode) // Traffic: Response Size & Total Count (implies Errors via labels) - resSize.WithLabelValues(labels...).Observe(float64(resSz)) - reqTotal.WithLabelValues(labels...).Inc() + resSize.WithLabelValues(baseLabels...).Observe(float64(resSz)) + reqTotal.WithLabelValues(totalLabels...).Inc() // Latency - reqDuration.WithLabelValues(labels...).Observe(time.Since(start).Seconds()) + reqDuration.WithLabelValues(statusLabels...).Observe(time.Since(start).Seconds()) } }