Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions docs/metric.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,15 @@
| :------------------------------------- | :-------- | :----------------------------------- | :------------------------------------------ |
| `http_server_requests_inflight` | Gauge | `method`, `url` | 当前正在处理的 HTTP 请求数 (饱和度) |
| `http_server_requests_total` | Counter | `method`, `url`, `status`, `errcode` | 处理的 HTTP 请求总数 (流量 & 错误) |
| `http_server_request_duration_seconds` | Histogram | `method`, `url`, `status`, `errcode` | HTTP 请求耗时分布 (延迟),桶:.005s - 10s |
| `http_server_request_duration_seconds` | Histogram | `method`, `url`, `status` | HTTP 请求耗时分布 (延迟),桶:.005s - 10s |
| `http_server_request_size_bytes` | Histogram | `method`, `url` | HTTP 请求体大小分布 (流量),桶:1KB - 100MB |
| `http_server_response_size_bytes` | Histogram | `method`, `url`, `status`, `errcode` | HTTP 响应体大小分布 (流量),桶:1KB - 100MB |
| `http_server_response_size_bytes` | Histogram | `method`, `url` | HTTP 响应体大小分布 (流量),桶:1KB - 100MB |

> 兼容性说明(`box/box v1.3.0`):
> - `http_server_requests_total` 仍保留 `errcode`,用于业务错误统计与告警。
> - `http_server_request_duration_seconds` 已移除 `errcode` 标签(仅保留 `method`,`url`,`status`),用于降低时序基数。
> - `http_server_response_size_bytes` 仅保留 `method`,`url`。
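> - Prometheus 无需手工“清除旧指标”再上报新指标。升级后新实例会按新标签集上报,旧标签集时序会自然停止写入并按存储保留策略过期。
> - 仍按已移除标签(`errcode`,以及 `http_server_response_size_bytes` 上的 `status`)过滤或分组的看板与告警规则需要同步调整,否则在新时序上无法得到预期结果。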

### 1.2 HTTP Client (Wukong)

Expand Down Expand Up @@ -101,8 +107,8 @@

| 指标名称 | 类型 | Labels | 说明 |
| :-------------------------------------- | :-------- | :----------------------------------------------------- | :--------------------- |
| `redis_client_requests_total` | Counter | `address`, `db`, `masterName`, `pipe`, `cmd`, `result` | Redis 命令执行总数 |
| `redis_client_request_duration_seconds` | Histogram | `address`, `db`, `masterName`, `pipe`, `cmd`, `result` | Redis 命令执行耗时分布 |
| `redis_client_requests_total` | Counter | `pipe`, `cmd`, `result` | Redis 命令执行总数 |
| `redis_client_request_duration_seconds` | Histogram | `pipe`, `cmd`, `result` | Redis 命令执行耗时分布 |

**错误分类 (`result` 标签值)**:

Expand Down
14 changes: 7 additions & 7 deletions docs/prometheus_alerts_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -355,14 +355,14 @@ groups:
type: availability
annotations:
summary: "Redis command failure count: {{ $value }}"
description: "{{ $labels.cmd }} on {{ $labels.address }}. Error type: {{ $labels.result }}"
description: "{{ $labels.cmd }} error type: {{ $labels.result }}"

# Redis Connection Error Rate - Critical
- alert: RedisConnectionErrorRateCritical
expr: |
(sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{result="connection_error"}[1m]))
(sum by(namespace, job, cmd) (rate(redis_client_requests_total{result="connection_error"}[1m]))
/
sum by(namespace, job, cmd, address) (rate(redis_client_requests_total[1m]))) > 0.01
sum by(namespace, job, cmd) (rate(redis_client_requests_total[1m]))) > 0.01
for: 1m
labels:
severity: critical
Expand All @@ -371,14 +371,14 @@ groups:
error_type: connection
annotations:
summary: "Redis connection error rate: {{ $value | humanizePercentage }}"
description: "Redis command {{ $labels.cmd }} on {{ $labels.address }} connection error rate is above 1%."
description: "Redis command {{ $labels.cmd }} connection error rate is above 1%."

# Redis Timeout Error Rate - Critical
- alert: RedisTimeoutErrorRateCritical
expr: |
(sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{result="timeout_error"}[1m]))
(sum by(namespace, job, cmd) (rate(redis_client_requests_total{result="timeout_error"}[1m]))
/
sum by(namespace, job, cmd, address) (rate(redis_client_requests_total[1m]))) > 0.01
sum by(namespace, job, cmd) (rate(redis_client_requests_total[1m]))) > 0.01
for: 1m
labels:
severity: critical
Expand All @@ -387,7 +387,7 @@ groups:
error_type: timeout
annotations:
summary: "Redis timeout error rate: {{ $value | humanizePercentage }}"
description: "Redis command {{ $labels.cmd }} on {{ $labels.address }} timeout error rate is above 1%."
description: "Redis command {{ $labels.cmd }} timeout error rate is above 1%."

- alert: RedisLatencyP99High
expr: histogram_quantile(0.99, sum(rate(redis_client_request_duration_seconds_bucket[5m])) by (le, namespace, job)) > 0.1
Expand Down
97 changes: 90 additions & 7 deletions docs/wechat_portfolio_alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -331,14 +331,14 @@ groups:
type: availability
annotations:
summary: "Redis command failure count: {{ $value }}"
description: "{{ $labels.cmd }} on {{ $labels.address }}. Error type: {{ $labels.result }}"
description: "{{ $labels.cmd }} error type: {{ $labels.result }}"

# Redis Connection Error Rate - Critical
- alert: RedisConnectionErrorRateCritical
expr: |
(sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{namespace="wechat",job="portfolio",result="connection_error"}[1m]))
(sum by(namespace, job, cmd) (rate(redis_client_requests_total{namespace="wechat",job="portfolio",result="connection_error"}[1m]))
/
sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{namespace="wechat",job="portfolio"}[1m]))) > 0.01
sum by(namespace, job, cmd) (rate(redis_client_requests_total{namespace="wechat",job="portfolio"}[1m]))) > 0.01
for: 1m
labels:
severity: critical
Expand All @@ -347,14 +347,14 @@ groups:
error_type: connection
annotations:
summary: "Redis connection error rate: {{ $value | humanizePercentage }}"
description: "Redis command {{ $labels.cmd }} on {{ $labels.address }} connection error rate is above 1%."
description: "Redis command {{ $labels.cmd }} connection error rate is above 1%."

# Redis Timeout Error Rate - Critical
- alert: RedisTimeoutErrorRateCritical
expr: |
(sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{namespace="wechat",job="portfolio",result="timeout_error"}[1m]))
(sum by(namespace, job, cmd) (rate(redis_client_requests_total{namespace="wechat",job="portfolio",result="timeout_error"}[1m]))
/
sum by(namespace, job, cmd, address) (rate(redis_client_requests_total{namespace="wechat",job="portfolio"}[1m]))) > 0.01
sum by(namespace, job, cmd) (rate(redis_client_requests_total{namespace="wechat",job="portfolio"}[1m]))) > 0.01
for: 1m
labels:
severity: critical
Expand All @@ -363,7 +363,7 @@ groups:
error_type: timeout
annotations:
summary: "Redis timeout error rate: {{ $value | humanizePercentage }}"
description: "Redis command {{ $labels.cmd }} on {{ $labels.address }} timeout error rate is above 1%."
description: "Redis command {{ $labels.cmd }} timeout error rate is above 1%."

- alert: RedisLatencyP99High
expr: histogram_quantile(0.99, sum(rate(redis_client_request_duration_seconds_bucket{namespace="wechat",job="portfolio"}[5m])) by (le, namespace, job)) > 0.1
Expand All @@ -378,6 +378,89 @@ groups:



# @box_module: kafka
# ==========================================================
# Kafka (EventBus) Alerts
# 应用侧 EventBus(kafka_producer_send_total / kafka_consumer_*)本地序列化、业务回调与消费限速
# ==========================================================

- alert: KafkaProducerLocalErrorRateCritical
expr: |
(
sum by(namespace, job, topic) (rate(kafka_producer_send_total{namespace="wechat",job="portfolio",error!=""}[1m]))
/
sum by(namespace, job, topic) (rate(kafka_producer_send_total{namespace="wechat",job="portfolio"}[1m]))
) > 0.05
for: 1m
labels:
severity: critical
component: kafka
type: availability
annotations:
summary: "Kafka producer local error rate: {{ $value | humanizePercentage }}"
description: "Topic {{ $labels.topic }} local errors (e.g. marshal) exceed 5% of producer send counter increments."

- alert: KafkaProducerLocalErrorRateHigh
expr: |
(
sum by(namespace, job, topic) (rate(kafka_producer_send_total{namespace="wechat",job="portfolio",error!=""}[1m]))
/
sum by(namespace, job, topic) (rate(kafka_producer_send_total{namespace="wechat",job="portfolio"}[1m]))
) > 0.01
for: 2m
labels:
severity: warning
component: kafka
type: availability
annotations:
summary: "Kafka producer local error rate: {{ $value | humanizePercentage }}"
description: "Topic {{ $labels.topic }} local errors exceed 1% of producer send counter increments."

- alert: KafkaConsumerHandlerErrorRateCritical
expr: |
(
sum by(namespace, job, topic, group) (rate(kafka_consumer_receive_total{namespace="wechat",job="portfolio",error!=""}[1m]))
/
sum by(namespace, job, topic, group) (rate(kafka_consumer_receive_total{namespace="wechat",job="portfolio"}[1m]))
) > 0.05
for: 1m
labels:
severity: critical
component: kafka
type: availability
annotations:
summary: "Kafka consumer handler error rate: {{ $value | humanizePercentage }}"
description: "Topic {{ $labels.topic }} group {{ $labels.group }} handler errors exceed 5% of receive counter increments."

- alert: KafkaConsumerHandlerErrorRateHigh
expr: |
(
sum by(namespace, job, topic, group) (rate(kafka_consumer_receive_total{namespace="wechat",job="portfolio",error!=""}[1m]))
/
sum by(namespace, job, topic, group) (rate(kafka_consumer_receive_total{namespace="wechat",job="portfolio"}[1m]))
) > 0.01
for: 2m
labels:
severity: warning
component: kafka
type: availability
annotations:
summary: "Kafka consumer handler error rate: {{ $value | humanizePercentage }}"
description: "Topic {{ $labels.topic }} group {{ $labels.group }} handler errors exceed 1% of receive counter increments."

- alert: KafkaConsumerRatelimitSustained
expr: sum by(namespace, job, topic, group) (rate(kafka_consumer_ratelimit_total{namespace="wechat",job="portfolio"}[5m])) > 0
for: 10m
labels:
severity: warning
component: kafka
type: saturation
annotations:
summary: "Kafka consumer rate limit active"
description: "Topic {{ $labels.topic }} group {{ $labels.group }} is repeatedly hitting consumer-side rate limits."



# @box_module: schedule
# ==========================================================
# Schedule Job Alerts
Expand Down
19 changes: 11 additions & 8 deletions pkg/server/ginserver/mid/ginprom/ginprom.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,18 +71,18 @@ func newGinProm(c *Config) *GinProm {
func (prom *GinProm) Handler() gin.HandlerFunc {
return func(ctx *gin.Context) {
start := time.Now()
labels := []string{
baseLabels := []string{
ctx.Request.Method,
prom.cfg.requestURLMappingFn(ctx),
}

// Saturation: +1
reqInFlight.WithLabelValues(labels...).Inc()
defer reqInFlight.WithLabelValues(labels...).Dec()
reqInFlight.WithLabelValues(baseLabels...).Inc()
defer reqInFlight.WithLabelValues(baseLabels...).Dec()

// Traffic: Request Size
reqSz := computeApproximateRequestSize(ctx.Request)
reqSize.WithLabelValues(labels...).Observe(reqSz)
reqSize.WithLabelValues(baseLabels...).Observe(reqSz)

ctx.Next()

Expand All @@ -91,13 +91,16 @@ func (prom *GinProm) Handler() gin.HandlerFunc {
resSz = 0
}

labels = append(labels, strconv.Itoa(ctx.Writer.Status()), strconv.Itoa(ctx.GetInt("errcode")))
status := strconv.Itoa(ctx.Writer.Status())
errCode := strconv.Itoa(ctx.GetInt("errcode"))
statusLabels := append(baseLabels, status)
totalLabels := append(statusLabels, errCode)

// Traffic: Response Size & Total Count (implies Errors via labels)
resSize.WithLabelValues(labels...).Observe(float64(resSz))
reqTotal.WithLabelValues(labels...).Inc()
resSize.WithLabelValues(baseLabels...).Observe(float64(resSz))
reqTotal.WithLabelValues(totalLabels...).Inc()

// Latency
reqDuration.WithLabelValues(labels...).Observe(time.Since(start).Seconds())
reqDuration.WithLabelValues(statusLabels...).Observe(time.Since(start).Seconds())
}
}
Loading