04 - 指标类型
04 - 指标类型
4.1 概述
Prometheus 定义了四种核心指标类型,每种类型有不同的语义和使用场景。理解这些类型是正确设计监控指标的基础。
| 类型 | 英文 | 特征 | 典型场景 |
|---|---|---|---|
| 计数器 | Counter | 只增不减(单调递增) | 请求总数、错误总数 |
| 仪表盘 | Gauge | 可增可减 | 温度、内存使用量 |
| 直方图 | Histogram | 分桶统计 | 请求延迟分布 |
| 摘要 | Summary | 客户端分位数 | 请求延迟分位数 |
4.2 Counter(计数器)
Counter 是最简单也最常用的指标类型。它表示一个单调递增的值,只能增加或在重启时重置为零。
特性
- 只增不减:值永远不会减少
- 重启重置:进程重启后值归零
- 适用场景:累计量(请求数、错误数、字节数)
代码示例
// Go - 使用 client_golang
import "github.com/prometheus/client_golang/prometheus"
var httpRequestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "http_requests_total",
Help: "Total number of HTTP requests",
},
[]string{"method", "path", "status"},
)
func init() {
prometheus.MustRegister(httpRequestsTotal)
}
// handler serves one HTTP request and then increments httpRequestsTotal,
// labelled with the request method, the request URL path and a fixed "200"
// status.
//
// NOTE(review): the raw r.URL.Path is used as a label value; if paths embed
// IDs, every distinct path becomes its own time series. Consider passing a
// route template instead.
func handler(w http.ResponseWriter, r *http.Request) {
	// Request handling goes here.
	// ...

	// Count the request once it has been handled.
	requests := httpRequestsTotal.WithLabelValues(r.Method, r.URL.Path, "200")
	requests.Inc()
}
# Python - 使用 prometheus_client
from prometheus_client import Counter
http_requests_total = Counter(
'http_requests_total',
'Total number of HTTP requests',
['method', 'path', 'status']
)
def handle_request(method, path, status):
http_requests_total.labels(method=method, path=path, status=status).inc()
// Java - 使用 micrometer
Counter httpRequests = Counter.builder("http_requests_total")
.description("Total number of HTTP requests")
.tag("method", "GET")
.tag("path", "/api")
.register(meterRegistry);
httpRequests.increment();
原始指标输出
# HELP http_requests_total Total number of HTTP requests
# TYPE http_requests_total counter
http_requests_total{method="GET",path="/api/users",status="200"} 15234
http_requests_total{method="POST",path="/api/orders",status="201"} 3456
http_requests_total{method="GET",path="/api/users",status="500"} 12
查询技巧
Counter 通常需要配合 rate() 或 increase() 函数使用:
# 每秒请求速率(过去 5 分钟)
rate(http_requests_total[5m])
# 过去 1 小时的请求增量
increase(http_requests_total[1h])
# 按方法分组的请求速率
sum by (method) (rate(http_requests_total[5m]))
# 错误率(5xx 占比)
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m]))
注意:不要直接对裸 Counter 值做 avg() 或 sum() 聚合,应先用 rate() 转换为速率后再聚合。裸 Counter 值在进程重启后会重置,直接聚合得到的结果没有意义。
业务场景:电商订单监控
var ordersCreated = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "ecommerce_orders_created_total",
Help: "Total number of orders created",
},
[]string{"channel", "payment_method", "region"},
)
var orderAmount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "ecommerce_order_amount_total_cents",
Help: "Total order amount in cents",
},
[]string{"currency"},
)
# 每分钟订单创建速率
rate(ecommerce_orders_created_total[1m]) * 60
# 各渠道订单占比
sum by (channel) (rate(ecommerce_orders_created_total[5m]))
/
sum(rate(ecommerce_orders_created_total[5m]))
4.3 Gauge(仪表盘)
Gauge 表示一个可以任意增减的数值,反映当前状态。
特性
- 可增可减:值可以升高也可以降低
- 瞬时值:表示当前的测量值
- 适用场景:温度、内存、连接数、队列长度
代码示例
// Go
var inprogressRequests = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "http_requests_inprogress",
Help: "Number of HTTP requests currently in progress",
},
)
var cpuTemperature = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "node_cpu_temperature_celsius",
Help: "Current CPU temperature",
},
[]string{"core"},
)
// 操作
inprogressRequests.Inc() // 增加 1
inprogressRequests.Dec() // 减少 1
inprogressRequests.Set(42.0) // 设置为 42.0
inprogressRequests.Add(10.0) // 增加 10
inprogressRequests.Sub(3.0) // 减少 3
# Python
from prometheus_client import Gauge
inprogress = Gauge('http_requests_inprogress', 'In-progress requests')
connections = Gauge('db_connections_active', 'Active DB connections', ['pool'])
# 使用装饰器自动跟踪
@inprogress.track_inprogress()
def handle_request():
pass
# 手动操作
connections.labels(pool='primary').set(15)
connections.labels(pool='replica').inc()
原始指标输出
# HELP http_requests_inprogress Number of HTTP requests currently in progress
# TYPE http_requests_inprogress gauge
http_requests_inprogress 7
# HELP node_cpu_temperature_celsius Current CPU temperature
# TYPE node_cpu_temperature_celsius gauge
node_cpu_temperature_celsius{core="0"} 65.5
node_cpu_temperature_celsius{core="1"} 63.2
查询技巧
# 当前值
http_requests_inprogress
# 过去 1 小时的平均值
avg_over_time(http_requests_inprogress[1h])
# 过去 1 小时的最大值
max_over_time(http_requests_inprogress[1h])
# 过去 1 小时的变化量
delta(http_requests_inprogress[1h])
# 内存使用率百分比
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
/ node_memory_MemTotal_bytes * 100
业务场景:连接池监控
var dbPoolSize = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "db_pool_connections",
Help: "Current number of connections in the pool",
},
[]string{"pool", "state"}, // state: active, idle, waiting
)
// Poll the connection pool statistics every 5 seconds and mirror them into
// dbPoolSize. This is periodic polling, not a callback.
//
// The field names are those of database/sql's sql.DBStats (this assumes db is
// a *sql.DB): InUse is the number of connections currently in use and Idle is
// the number of idle connections.
go func() {
	for {
		stats := db.Stats()
		dbPoolSize.WithLabelValues("primary", "active").Set(float64(stats.InUse))
		dbPoolSize.WithLabelValues("primary", "idle").Set(float64(stats.Idle))
		// NOTE: WaitCount is the cumulative number of connections waited for,
		// not the number of callers waiting right now, so this series only
		// ever grows; a Counter would describe it more accurately.
		dbPoolSize.WithLabelValues("primary", "waiting").Set(float64(stats.WaitCount))
		time.Sleep(5 * time.Second)
	}
}()
4.4 Histogram(直方图)
Histogram 对观测值进行采样,将其放入可配置的桶(bucket)中,并记录观测值的总和与总数。
特性
- 分桶统计:将数据分布到预定义的桶中
- 服务端计算:分位数在 Prometheus 服务端计算
- 可聚合:多个实例的 Histogram 可以合并
- 适用场景:请求延迟、响应大小
工作原理
观测值: 0.1s, 0.2s, 0.3s, 0.5s, 0.8s, 1.2s, 2.5s
桶分布 (秒):
le=0.1 → 1 个
le=0.25 → 2 个 (包含 ≤0.1 的)
le=0.5 → 4 个
le=1.0 → 5 个
le=2.5 → 7 个 (le 表示 ≤,2.5s 本身也计入该桶)
le=5.0 → 7 个
le=+Inf → 7 个 (总是等于总数)
代码示例
// Go
var httpDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "http_request_duration_seconds",
Help: "HTTP request duration in seconds",
Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0},
},
[]string{"method", "path"},
)
// 记录耗时
timer := prometheus.NewTimer(prometheus.ObserverFunc(
httpDuration.WithLabelValues("GET", "/api/users").Observe))
defer timer.ObserveDuration()
// 或直接观察
httpDuration.WithLabelValues("GET", "/api/users").Observe(0.25)
# Python
from prometheus_client import Histogram
http_duration = Histogram(
'http_request_duration_seconds',
'HTTP request duration',
['method', 'path'],
buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]
)
# 使用装饰器
@http_duration.labels(method='GET', path='/api').time()
def handle_request():
pass
# 手动观察
http_duration.labels(method='POST', path='/api').observe(0.35)
原始指标输出
# HELP http_request_duration_seconds HTTP request duration in seconds
# TYPE http_request_duration_seconds histogram
http_request_duration_seconds_bucket{method="GET",path="/api",le="0.01"} 12
http_request_duration_seconds_bucket{method="GET",path="/api",le="0.05"} 245
http_request_duration_seconds_bucket{method="GET",path="/api",le="0.1"} 1856
http_request_duration_seconds_bucket{method="GET",path="/api",le="0.25"} 4523
http_request_duration_seconds_bucket{method="GET",path="/api",le="0.5"} 6234
http_request_duration_seconds_bucket{method="GET",path="/api",le="1.0"} 7102
http_request_duration_seconds_bucket{method="GET",path="/api",le="2.5"} 7234
http_request_duration_seconds_bucket{method="GET",path="/api",le="5.0"} 7240
http_request_duration_seconds_bucket{method="GET",path="/api",le="10.0"} 7241
http_request_duration_seconds_bucket{method="GET",path="/api",le="+Inf"} 7241
http_request_duration_seconds_sum{method="GET",path="/api"} 856.23
http_request_duration_seconds_count{method="GET",path="/api"} 7241
每个 Histogram 自动生成三个指标系列:
- _bucket{le="x"}:累积桶计数
- _sum:观测值总和
- _count:观测值总数
查询技巧
# P99 延迟(99分位数)
histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))
# P95 延迟,按 path 分组
histogram_quantile(0.95,
sum by (path, le) (rate(http_request_duration_seconds_bucket[5m]))
)
# 平均延迟
rate(http_request_duration_seconds_sum[5m])
/
rate(http_request_duration_seconds_count[5m])
# 请求速率
rate(http_request_duration_seconds_count[5m])
桶设计原则
| 桶范围 | 适用场景 | 示例 |
|---|---|---|
| 0.001 - 10 | API 响应延迟 | 0.01, 0.05, 0.1, 0.5, 1, 5 |
| 0.1 - 60 | 页面加载延迟 | 0.1, 0.5, 1, 5, 15, 30, 60 |
| 100 - 100000 | 请求体大小(bytes) | 100, 1000, 10000, 100000 |
| 自定义 | 业务特定 | 根据实际数据分布调整 |
注意:桶的数量直接影响存储和查询开销。建议 10-15 个桶,覆盖 99% 的数据范围。过多的桶会显著增加存储成本。
4.5 Summary(摘要)
Summary 与 Histogram 类似,也是对观测值进行采样,但分位数在客户端计算。
特性
- 客户端计算分位数:直接在应用内计算
- 不可聚合:不同实例的 Summary 不能合并
- 适用场景:单实例延迟监控
代码示例
// Go
var httpDuration = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "http_request_duration_seconds",
Help: "HTTP request duration in seconds",
Objectives: map[float64]float64{
0.5: 0.05, // P50,误差 5%
0.9: 0.01, // P90,误差 1%
0.99: 0.001, // P99,误差 0.1%
},
MaxAge: 10 * time.Minute, // 滑动窗口
AgeBuckets: 5,
},
[]string{"method", "path"},
)
原始指标输出
# HELP http_request_duration_seconds HTTP request duration in seconds
# TYPE http_request_duration_seconds summary
http_request_duration_seconds{method="GET",path="/api",quantile="0.5"} 0.045
http_request_duration_seconds{method="GET",path="/api",quantile="0.9"} 0.123
http_request_duration_seconds{method="GET",path="/api",quantile="0.99"} 0.876
http_request_duration_seconds_sum{method="GET",path="/api"} 856.23
http_request_duration_seconds_count{method="GET",path="/api"} 7241
Histogram vs Summary 对比
| 特性 | Histogram | Summary |
|---|---|---|
| 分位数计算 | 服务端(PromQL) | 客户端(应用内) |
| 可聚合 | ✅ 可以跨实例合并 | ❌ 不可合并 |
| 精度 | 取决于桶配置 | 可精确配置 |
| 存储开销 | 较高(多个桶系列) | 较低 |
| 推荐场景 | 生产环境首选 | 单实例或少量实例 |
最佳实践:优先使用 Histogram,只在需要精确分位数且实例数少的场景下使用 Summary。
4.6 指标命名规范
命名规则
<namespace>_<subsystem>_<name>_<unit>_<suffix>
示例:
http_requests_total # 命名空间_名称_后缀
node_cpu_seconds_total # 命名空间_名称_单位_后缀
mysql_global_status_threads # 命名空间_子系统_名称
命名最佳实践
| 规则 | 正确 | 错误 |
|---|---|---|
| 使用 snake_case | http_requests_total | httpRequestsTotal |
| Counter 加 _total 后缀 | errors_total | errors |
| 包含单位后缀 | _seconds, _bytes | _duration, _size |
| 使用基础单位 | _seconds | _milliseconds |
| 避免重复命名空间 | app_http_requests | app_http_app_requests |
单位后缀
| 单位 | 后缀 | 示例 |
|---|---|---|
| 秒 | _seconds | http_duration_seconds |
| 字节 | _bytes | response_size_bytes |
| 比率 | _ratio | cpu_usage_ratio |
| 百分比 | _percent(不推荐,改用 0-1 的 _ratio) | — |
4.7 标签设计原则
标签基数(Cardinality)
标签的基数 = 所有可能的唯一值的数量。
低基数(推荐) 高基数(危险)
───────────── ─────────────
method: GET/POST user_id: 0~10^6
status: 200/404/500 request_id: UUID
instance: ~100 台 ip_address: 所有 IP
注意:每个唯一的标签组合都会创建一个新的时间序列。高基数标签会导致内存和存储爆炸。避免使用 user_id、request_id、ip 等高基数标签。
标签建议
// ✅ Good label design
var requests = prometheus.NewCounterVec(
prometheus.CounterOpts{Name: "http_requests_total"},
[]string{"method", "path", "status"}, // low cardinality (assuming path is a route template, not the raw URL)
)
// ❌ Dangerous label design
var requests = prometheus.NewCounterVec(
prometheus.CounterOpts{Name: "http_requests_total"},
[]string{"method", "url", "user_id", "trace_id"}, // url, user_id and trace_id are all high cardinality!
)
4.8 Instrumentation 实战
Go 应用完整埋点示例
package main
import (
	"log"
	"math/rand"
	"net/http"
	"strconv"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)
// Metric definitions. The promauto constructors create each collector and
// register it with the default registry, which is why this program contains
// no explicit prometheus.MustRegister call.
var (
// httpRequestsTotal counts HTTP requests, labelled by method, path and status.
httpRequestsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: "myapp",
Name: "http_requests_total",
Help: "Total HTTP requests",
},
[]string{"method", "path", "status"},
)
// httpRequestDuration records request durations in seconds, labelled by
// method and path, using the client library's default buckets.
httpRequestDuration = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "myapp",
Name: "http_request_duration_seconds",
Help: "HTTP request duration",
Buckets: prometheus.DefBuckets,
},
[]string{"method", "path"},
)
// activeConnections is incremented when instrumentHandler starts serving a
// request and decremented when it returns.
activeConnections = promauto.NewGauge(
prometheus.GaugeOpts{
Namespace: "myapp",
Name: "active_connections",
Help: "Currently active connections",
},
)
)
// statusRecorder wraps an http.ResponseWriter and remembers the status code
// sent by the wrapped handler so it can be used as a metric label.
type statusRecorder struct {
	http.ResponseWriter
	status      int  // status code to report; starts as 200, the implicit default
	wroteHeader bool // true once a final (non-informational) status was sent
}

// WriteHeader records the first final status code and forwards the call.
func (sr *statusRecorder) WriteHeader(code int) {
	if !sr.wroteHeader {
		sr.status = code
		// 1xx responses other than 101 are informational and may still be
		// followed by the real status, so they do not lock the value in.
		sr.wroteHeader = code >= 200 || code == http.StatusSwitchingProtocols
	}
	sr.ResponseWriter.WriteHeader(code)
}

// Write forwards the body. Writing without a prior WriteHeader call sends an
// implicit 200, which is the value status already holds.
func (sr *statusRecorder) Write(b []byte) (int, error) {
	sr.wroteHeader = true
	return sr.ResponseWriter.Write(b)
}

// Unwrap exposes the underlying writer so http.ResponseController can still
// reach optional interfaces such as http.Flusher.
func (sr *statusRecorder) Unwrap() http.ResponseWriter {
	return sr.ResponseWriter
}

// instrumentHandler is middleware that wraps next with metrics:
//   - activeConnections is incremented for the lifetime of the request,
//   - httpRequestDuration observes the elapsed time in seconds,
//   - httpRequestsTotal is incremented with the status code the handler
//     actually sent (previously this label was always "200").
//
// path is the label value to use for the route; passing the route template
// rather than the raw URL keeps label cardinality bounded.
func instrumentHandler(path string, next http.HandlerFunc) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		start := time.Now()
		activeConnections.Inc()
		defer activeConnections.Dec()

		rec := &statusRecorder{ResponseWriter: w, status: http.StatusOK}
		next(rec, r)

		duration := time.Since(start).Seconds()
		httpRequestDuration.WithLabelValues(r.Method, path).Observe(duration)
		httpRequestsTotal.WithLabelValues(r.Method, path, strconv.Itoa(rec.status)).Inc()
	}
}
// main wires up the demo server: an instrumented /api/users endpoint that
// sleeps for a random delay below 100ms before answering, plus the /metrics
// endpoint served by promhttp. It listens on :8080 and exits via log.Fatal
// when the server stops.
func main() {
	const usersRoute = "/api/users"

	// Simulated business handler with a random latency.
	listUsers := func(w http.ResponseWriter, _ *http.Request) {
		delay := time.Duration(rand.Intn(100)) * time.Millisecond
		time.Sleep(delay)
		w.Write([]byte("users"))
	}

	http.HandleFunc(usersRoute, instrumentHandler(usersRoute, listUsers))
	http.Handle("/metrics", promhttp.Handler())

	log.Fatal(http.ListenAndServe(":8080", nil))
}
4.9 本章小结
| 类型 | 语义 | 查询函数 | 适用场景 |
|---|---|---|---|
| Counter | 只增不减 | rate() increase() | 请求/错误计数 |
| Gauge | 可增可减 | 直接查询 / _over_time() | 温度/内存/连接数 |
| Histogram | 分桶 + 总和 + 总数 | histogram_quantile() | 延迟分布(推荐) |
| Summary | 客户端分位数 | 直接查询 quantile | 单实例延迟 |
扩展阅读
上一章:03 - 架构与原理 下一章:05 - PromQL 基础