PostgreSQL 完全指南 / 15 - 监控诊断
第 15 章 · 监控诊断
有效的监控是预防故障的关键。本章覆盖 pg_stat 系统视图、pg_stat_statements、Prometheus 集成等。
15.1 核心统计视图
pg_stat_activity(活动连接)
-- 查看所有活动连接
SELECT
pid,
usename,
datname,
client_addr,
state,
wait_event_type,
wait_event,
query,
query_start,
now() - query_start AS duration
FROM pg_stat_activity
WHERE state IS NOT NULL
ORDER BY duration DESC;
-- 统计连接状态
SELECT state, count(*) FROM pg_stat_activity GROUP BY state;
-- 查找长查询(超过 5 分钟)
SELECT pid, now() - query_start AS duration, query
FROM pg_stat_activity
WHERE state = 'active' AND now() - query_start > INTERVAL '5 minutes';
-- 终止长查询
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE state = 'active' AND now() - query_start > INTERVAL '10 minutes';
-- 空闲事务(idle in transaction)
SELECT pid, now() - state_change AS idle_duration, query
FROM pg_stat_activity
WHERE state = 'idle in transaction'
AND now() - state_change > INTERVAL '5 minutes';
pg_stat_database(数据库统计)
SELECT
datname,
numbackends AS connections,
xact_commit AS commits,
xact_rollback AS rollbacks,
blks_read,
blks_hit,
round(blks_hit::numeric / NULLIF(blks_hit + blks_read, 0) * 100, 2) AS cache_hit_pct,
tup_returned,
tup_fetched,
tup_inserted,
tup_updated,
tup_deleted,
deadlocks,
temp_files,
pg_size_pretty(temp_bytes) AS temp_bytes
FROM pg_stat_database
WHERE datname = current_database();
pg_stat_user_tables(表统计)
SELECT
relname AS table_name,
seq_scan, -- 全表扫描次数
seq_tup_read, -- 全表扫描读取行数
idx_scan, -- 索引扫描次数
idx_tup_fetch, -- 索引扫描返回行数
n_tup_ins AS inserts,
n_tup_upd AS updates,
n_tup_del AS deletes,
n_live_tup AS live_rows,
n_dead_tup AS dead_rows,
round(n_dead_tup::numeric / NULLIF(n_live_tup + n_dead_tup, 0) * 100, 2) AS dead_pct,
last_vacuum,
last_autovacuum,
last_analyze,
last_autoanalyze
FROM pg_stat_user_tables
ORDER BY n_dead_tup DESC;
pg_stat_bgwriter(后台写入统计)
SELECT
checkpoints_timed,
checkpoints_req,
buffers_checkpoint,
buffers_backend,
buffers_alloc,
checkpoint_write_time,
checkpoint_sync_time
FROM pg_stat_bgwriter;
15.2 pg_stat_statements
pg_stat_statements 是最重要的性能监控扩展。
-- 安装
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
-- postgresql.conf
-- shared_preload_libraries = 'pg_stat_statements'
-- pg_stat_statements.max = 10000
-- pg_stat_statements.track = all
-- 最耗时的查询 Top 20
SELECT
queryid,
left(query, 100) AS query_short,
calls,
round(total_exec_time::numeric, 2) AS total_time_ms,
round(mean_exec_time::numeric, 2) AS avg_time_ms,
round(max_exec_time::numeric, 2) AS max_time_ms,
rows,
shared_blks_hit,
shared_blks_read
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 20;
-- 频率最高的查询
SELECT queryid, left(query, 100), calls, rows
FROM pg_stat_statements
ORDER BY calls DESC LIMIT 20;
-- 重置统计
SELECT pg_stat_statements_reset();
15.3 表膨胀监控
-- 表膨胀率估算
SELECT
schemaname,
tablename,
pg_size_pretty(pg_total_relation_size(schemaname || '.' || tablename)) AS total_size,
n_live_tup,
n_dead_tup,
round(n_dead_tup::numeric / NULLIF(n_live_tup, 0) * 100, 2) AS dead_pct,
last_autovacuum
FROM pg_stat_user_tables
WHERE n_dead_tup > 10000
ORDER BY n_dead_tup DESC;
-- 索引膨胀(需要 pgstattuple 扩展)
CREATE EXTENSION IF NOT EXISTS pgstattuple;
SELECT * FROM pgstatindex('idx_emp_email');
15.4 Prometheus + Grafana 集成
使用 postgres_exporter 采集指标:
# docker-compose.yml
version: "3.8"
services:
postgres:
image: postgres:17
environment:
POSTGRES_PASSWORD: password
ports:
- "5432:5432"
postgres-exporter:
image: prometheuscommunity/postgres-exporter
environment:
DATA_SOURCE_NAME: "postgresql://postgres:password@postgres:5432/postgres?sslmode=disable"
ports:
- "9187:9187"
prometheus:
image: prom/prometheus
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"
grafana:
image: grafana/grafana
ports:
- "3000:3000"
# prometheus.yml
scrape_configs:
- job_name: "postgres"
static_configs:
- targets: ["postgres-exporter:9187"]
15.5 关键监控指标
| 指标 | 严重阈值 | 查询方式 |
|---|---|---|
| 连接数使用率 | > 80% | pg_stat_activity |
| 缓存命中率 | < 99% | pg_stat_database |
| 事务回滚率 | > 5% | pg_stat_database |
| 死锁次数 | > 0 | pg_stat_database |
| 复制延迟 | > 10MB | pg_stat_replication |
| 表膨胀率 | > 30% | pg_stat_user_tables |
| 长事务 | > 5min | pg_stat_activity |
| 空闲事务 | > 1min | pg_stat_activity |
| WAL 文件数 | > 100 | pg_ls_waldir |
15.6 等待事件
-- 查看当前等待
SELECT
pid,
wait_event_type,
wait_event,
state,
query
FROM pg_stat_activity
WHERE wait_event IS NOT NULL AND state = 'active';
-- 等待事件类型
-- LWLock: 轻量锁(内部锁)
-- Lock: 重量锁(行锁、表锁)
-- BufferPin: 缓冲区固定
-- Activity: 后台进程等待
-- Client: 等待客户端
-- Extension: 扩展等待
-- IO: 磁盘 I/O
-- IPC: 进程间通信
-- Timeout: 超时等待
业务场景
| 场景 | 监控方案 |
|---|---|
| 日常巡检 | pg_stat_activity + pg_stat_database |
| 慢查询优化 | pg_stat_statements |
| 容量规划 | 表/索引大小 + 增长趋势 |
| 告警 | Prometheus + Grafana 告警规则 |
| 事故回溯 | pg_stat_statements + 日志 |