From bc76f33642a7485b41978c663f92f7dda5152053 Mon Sep 17 00:00:00 2001
From: "amazing.gao" <amazing.gao@qq.com>
Date: Mon, 25 May 2026 10:42:38 +0800
Subject: [PATCH] fix(metrics): align ginprom labels and sync Redis alert docs

---
 docs/metric.md                              | 14 ++-
 docs/prometheus_alerts_template.yaml        | 14 +--
 docs/wechat_portfolio_alerts.yaml           | 97 +++++++++++++++++++--
 pkg/server/ginserver/mid/ginprom/ginprom.go | 19 ++--
 4 files changed, 118 insertions(+), 26 deletions(-)

diff --git a/docs/metric.md b/docs/metric.md
index 839ec3a..5591984 100644
--- a/docs/metric.md
+++ b/docs/metric.md
@@ -65,9 +65,15 @@
 | :------------------------------------- | :-------- | :----------------------------------- | :------------------------------------------ |
 | `http_server_requests_inflight`        | Gauge     | `method`, `url`                      | 当前正在处理的 HTTP 请求数 (饱和度)         |
 | `http_server_requests_total`           | Counter   | `method`, `url`, `status`, `errcode` | 处理的 HTTP 请求总数 (流量 & 错误)          |
-| `http_server_request_duration_seconds` | Histogram | `method`, `url`, `status`, `errcode` | HTTP 请求耗时分布 (延迟)，桶：.005s - 10s   |
+| `http_server_request_duration_seconds` | Histogram | `method`, `url`, `status`            | HTTP 请求耗时分布 (延迟)，桶：.005s - 10s   |
 | `http_server_request_size_bytes`       | Histogram | `method`, `url`                      | HTTP 请求体大小分布 (流量)，桶：1KB - 100MB |
-| `http_server_response_size_bytes`      | Histogram | `method`, `url`, `status`, `errcode` | HTTP 响应体大小分布 (流量)，桶：1KB - 100MB |
+| `http_server_response_size_bytes`      | Histogram | `method`, `url`                      | HTTP 响应体大小分布 (流量)，桶：1KB - 100MB |
+
+> 兼容性说明（`box/box v1.3.0`）：
+> - `http_server_requests_total` 仍保留 `errcode`，用于业务错误统计与告警。
+> - `http_server_request_duration_seconds` 已移除 `errcode` 标签（仅保留 `method`, `url`, `status`），用于降低时序基数。
+> - `http_server_response_size_bytes` 已移除 `status` 与 `errcode` 标签（仅保留 `method`, `url`）。
+> - Prometheus 无需手工“清除旧指标”再上报新指标。升级后新实例会按新标签集上报，旧标签集时序会自然停止写入并按存储保留策略过期。
 
 ### 1.2 HTTP Client (Wukong)
 
@@ -101,8 +107,8 @@
 
 | 指标名称                                | 类型      | Labels                                                 | 说明                   |
 | :-------------------------------------- | :-------- | :----------------------------------------------------- | :--------------------- |
-| `redis_client_requests_total`           | Counter   | `address`, `db`, `masterName`, `pipe`, `cmd`, `result` | Redis 命令执行总数     |
-| `redis_client_request_duration_seconds` | Histogram | `address`, `db`, `masterName`, `pipe`, `cmd`, `result` | Redis 命令执行耗时分布 |
+| `redis_client_requests_total`           | Counter   | `pipe`, `cmd`, `result`                                 | Redis 命令执行总数     |
+| `redis_client_request_duration_seconds` | Histogram | `pipe`, `cmd`, `result`                                 | Redis 命令执行耗时分布 |
 
 **错误分类 (`result` 标签值)**:
 
diff --git a/docs/prometheus_alerts_template.yaml b/docs/prometheus_alerts_template.yaml
index d1c9419..ab0ab63 100644
--- a/docs/prometheus_alerts_template.yaml
+++ b/docs/prometheus_alerts_template.yaml
@@ -355,14 +355,14 @@ groups:
           type: availability
         annotations:
           summary: "Redis command failure count: {{ $value }}"
-          description: "{{ $labels.cmd }} on {{ $labels.address }}. Error type: {{ $labels.result }}"
+          description: "{{ $labels.cmd }} error type: {{ $labels.result }}"
 
       # Redis Connection Error Rate - Critical
       - alert: RedisConnectionErrorRateCritical
         expr: |
-          (sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{result="connection_error"}[1m]))
+          (sum by(namespace, job, cmd) (rate(redis_client_requests_total{result="connection_error"}[1m]))
           /
-          sum by(namespace, job, cmd, address) (rate(redis_client_requests_total[1m]))) > 0.01
+          sum by(namespace, job, cmd) (rate(redis_client_requests_total[1m]))) > 0.01
         for: 1m
         labels:
           severity: critical
@@ -371,14 +371,14 @@ groups:
           error_type: connection
         annotations:
           summary: "Redis connection error rate: {{ $value | humanizePercentage }}"
-          description: "Redis command {{ $labels.cmd }} on {{ $labels.address }} connection error rate is above 1%."
+          description: "Redis command {{ $labels.cmd }} connection error rate is above 1%."
 
       # Redis Timeout Error Rate - Critical
       - alert: RedisTimeoutErrorRateCritical
         expr: |
-          (sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{result="timeout_error"}[1m]))
+          (sum by(namespace, job, cmd) (rate(redis_client_requests_total{result="timeout_error"}[1m]))
           /
-          sum by(namespace, job, cmd, address) (rate(redis_client_requests_total[1m]))) > 0.01
+          sum by(namespace, job, cmd) (rate(redis_client_requests_total[1m]))) > 0.01
         for: 1m
         labels:
           severity: critical
@@ -387,7 +387,7 @@ groups:
           error_type: timeout
         annotations:
           summary: "Redis timeout error rate: {{ $value | humanizePercentage }}"
-          description: "Redis command {{ $labels.cmd }} on {{ $labels.address }} timeout error rate is above 1%."
+          description: "Redis command {{ $labels.cmd }} timeout error rate is above 1%."
 
       - alert: RedisLatencyP99High
         expr: histogram_quantile(0.99, sum(rate(redis_client_request_duration_seconds_bucket[5m])) by (le, namespace, job)) > 0.1
diff --git a/docs/wechat_portfolio_alerts.yaml b/docs/wechat_portfolio_alerts.yaml
index c9aaf02..ec53cb0 100644
--- a/docs/wechat_portfolio_alerts.yaml
+++ b/docs/wechat_portfolio_alerts.yaml
@@ -331,14 +331,14 @@ groups:
           type: availability
         annotations:
           summary: "Redis command failure count: {{ $value }}"
-          description: "{{ $labels.cmd }} on {{ $labels.address }}. Error type: {{ $labels.result }}"
+          description: "{{ $labels.cmd }} error type: {{ $labels.result }}"
 
       # Redis Connection Error Rate - Critical
       - alert: RedisConnectionErrorRateCritical
         expr: |
-          (sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{namespace="wechat",job="portfolio",result="connection_error"}[1m]))
+          (sum by(namespace, job, cmd) (rate(redis_client_requests_total{namespace="wechat",job="portfolio",result="connection_error"}[1m]))
           /
-          sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{namespace="wechat",job="portfolio"}[1m]))) > 0.01
+          sum by(namespace, job, cmd) (rate(redis_client_requests_total{namespace="wechat",job="portfolio"}[1m]))) > 0.01
         for: 1m
         labels:
           severity: critical
@@ -347,14 +347,14 @@ groups:
           error_type: connection
         annotations:
           summary: "Redis connection error rate: {{ $value | humanizePercentage }}"
-          description: "Redis command {{ $labels.cmd }} on {{ $labels.address }} connection error rate is above 1%."
+          description: "Redis command {{ $labels.cmd }} connection error rate is above 1%."
 
       # Redis Timeout Error Rate - Critical
       - alert: RedisTimeoutErrorRateCritical
         expr: |
-          (sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{namespace="wechat",job="portfolio",result="timeout_error"}[1m]))
+          (sum by(namespace, job, cmd) (rate(redis_client_requests_total{namespace="wechat",job="portfolio",result="timeout_error"}[1m]))
           /
-          sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{namespace="wechat",job="portfolio"}[1m]))) > 0.01
+          sum by(namespace, job, cmd) (rate(redis_client_requests_total{namespace="wechat",job="portfolio"}[1m]))) > 0.01
         for: 1m
         labels:
           severity: critical
@@ -363,7 +363,7 @@ groups:
           error_type: timeout
         annotations:
           summary: "Redis timeout error rate: {{ $value | humanizePercentage }}"
-          description: "Redis command {{ $labels.cmd }} on {{ $labels.address }} timeout error rate is above 1%."
+          description: "Redis command {{ $labels.cmd }} timeout error rate is above 1%."
 
       - alert: RedisLatencyP99High
         expr: histogram_quantile(0.99, sum(rate(redis_client_request_duration_seconds_bucket{namespace="wechat",job="portfolio"}[5m])) by (le, namespace, job)) > 0.1
@@ -378,6 +378,89 @@ groups:
 
 
 
+      # @box_module: kafka
+      # ==========================================================
+      # Kafka (EventBus) Alerts
+      # 覆盖应用侧 EventBus 指标（kafka_producer_send_total / kafka_consumer_*）：本地序列化错误、业务回调错误与消费限速
+      # ==========================================================
+
+      - alert: KafkaProducerLocalErrorRateCritical
+        expr: |
+          (
+            sum by(namespace, job, topic) (rate(kafka_producer_send_total{namespace="wechat",job="portfolio",error!=""}[1m]))
+            /
+            sum by(namespace, job, topic) (rate(kafka_producer_send_total{namespace="wechat",job="portfolio"}[1m]))
+          ) > 0.05
+        for: 1m
+        labels:
+          severity: critical
+          component: kafka
+          type: availability
+        annotations:
+          summary: "Kafka producer local error rate: {{ $value | humanizePercentage }}"
+          description: "Topic {{ $labels.topic }} local errors (e.g. marshal) exceed 5% of producer send counter increments."
+
+      - alert: KafkaProducerLocalErrorRateHigh
+        expr: |
+          (
+            sum by(namespace, job, topic) (rate(kafka_producer_send_total{namespace="wechat",job="portfolio",error!=""}[1m]))
+            /
+            sum by(namespace, job, topic) (rate(kafka_producer_send_total{namespace="wechat",job="portfolio"}[1m]))
+          ) > 0.01
+        for: 2m
+        labels:
+          severity: warning
+          component: kafka
+          type: availability
+        annotations:
+          summary: "Kafka producer local error rate: {{ $value | humanizePercentage }}"
+          description: "Topic {{ $labels.topic }} local errors exceed 1% of producer send counter increments."
+
+      - alert: KafkaConsumerHandlerErrorRateCritical
+        expr: |
+          (
+            sum by(namespace, job, topic, group) (rate(kafka_consumer_receive_total{namespace="wechat",job="portfolio",error!=""}[1m]))
+            /
+            sum by(namespace, job, topic, group) (rate(kafka_consumer_receive_total{namespace="wechat",job="portfolio"}[1m]))
+          ) > 0.05
+        for: 1m
+        labels:
+          severity: critical
+          component: kafka
+          type: availability
+        annotations:
+          summary: "Kafka consumer handler error rate: {{ $value | humanizePercentage }}"
+          description: "Topic {{ $labels.topic }} group {{ $labels.group }} handler errors exceed 5% of receive counter increments."
+
+      - alert: KafkaConsumerHandlerErrorRateHigh
+        expr: |
+          (
+            sum by(namespace, job, topic, group) (rate(kafka_consumer_receive_total{namespace="wechat",job="portfolio",error!=""}[1m]))
+            /
+            sum by(namespace, job, topic, group) (rate(kafka_consumer_receive_total{namespace="wechat",job="portfolio"}[1m]))
+          ) > 0.01
+        for: 2m
+        labels:
+          severity: warning
+          component: kafka
+          type: availability
+        annotations:
+          summary: "Kafka consumer handler error rate: {{ $value | humanizePercentage }}"
+          description: "Topic {{ $labels.topic }} group {{ $labels.group }} handler errors exceed 1% of receive counter increments."
+
+      - alert: KafkaConsumerRatelimitSustained
+        expr: sum by(namespace, job, topic, group) (rate(kafka_consumer_ratelimit_total{namespace="wechat",job="portfolio"}[5m])) > 0
+        for: 10m
+        labels:
+          severity: warning
+          component: kafka
+          type: saturation
+        annotations:
+          summary: "Kafka consumer rate limit active"
+          description: "Topic {{ $labels.topic }} group {{ $labels.group }} is repeatedly hitting consumer-side rate limits."
+
+
+
       # @box_module: schedule
       # ==========================================================
       # Schedule Job Alerts
diff --git a/pkg/server/ginserver/mid/ginprom/ginprom.go b/pkg/server/ginserver/mid/ginprom/ginprom.go
index fa1a555..66d4088 100644
--- a/pkg/server/ginserver/mid/ginprom/ginprom.go
+++ b/pkg/server/ginserver/mid/ginprom/ginprom.go
@@ -71,18 +71,18 @@ func newGinProm(c *Config) *GinProm {
 func (prom *GinProm) Handler() gin.HandlerFunc {
 	return func(ctx *gin.Context) {
 		start := time.Now()
-		labels := []string{
+		baseLabels := []string{
 			ctx.Request.Method,
 			prom.cfg.requestURLMappingFn(ctx),
 		}
 
 		// Saturation: +1
-		reqInFlight.WithLabelValues(labels...).Inc()
-		defer reqInFlight.WithLabelValues(labels...).Dec()
+		reqInFlight.WithLabelValues(baseLabels...).Inc()
+		defer reqInFlight.WithLabelValues(baseLabels...).Dec()
 
 		// Traffic: Request Size
 		reqSz := computeApproximateRequestSize(ctx.Request)
-		reqSize.WithLabelValues(labels...).Observe(reqSz)
+		reqSize.WithLabelValues(baseLabels...).Observe(reqSz)
 
 		ctx.Next()
 
@@ -91,13 +91,16 @@ func (prom *GinProm) Handler() gin.HandlerFunc {
 			resSz = 0
 		}
 
-		labels = append(labels, strconv.Itoa(ctx.Writer.Status()), strconv.Itoa(ctx.GetInt("errcode")))
+		status := strconv.Itoa(ctx.Writer.Status())
+		errCode := strconv.Itoa(ctx.GetInt("errcode"))
+		statusLabels := append(baseLabels, status)
+		totalLabels := append(statusLabels, errCode)
 
 		// Traffic: Response Size & Total Count (implies Errors via labels)
-		resSize.WithLabelValues(labels...).Observe(float64(resSz))
-		reqTotal.WithLabelValues(labels...).Inc()
+		resSize.WithLabelValues(baseLabels...).Observe(float64(resSz))
+		reqTotal.WithLabelValues(totalLabels...).Inc()
 
 		// Latency
-		reqDuration.WithLabelValues(labels...).Observe(time.Since(start).Seconds())
+		reqDuration.WithLabelValues(statusLabels...).Observe(time.Since(start).Seconds())
 	}
 }