Varnish Cache 运维教程 / 第14章:运维最佳实践
第14章:运维最佳实践
14.1 运维规范
14.1.1 配置管理规范
# 配置文件版本控制
/etc/varnish/
├── default.vcl # 主配置文件
├── backends/
│ ├── web.vcl # 后端定义
│ ├── api.vcl # API 后端
│ └── static.vcl # 静态资源后端
├── directors/
│ ├── round-robin.vcl # 轮询配置
│ └── fallback.vcl # 故障转移配置
├── rules/
│ ├── cache-rules.vcl # 缓存规则
│ ├── security.vcl # 安全规则
│ └── routing.vcl # 路由规则
├── probes/
│ └── health.vcl # 健康检查配置
└── includes/
└── functions.vcl # 公共函数
# 主配置文件示例
vcl 4.1;

# VMODs made available to this file and to everything included below.
import std;
import directors;

# Each include is inlined at the point where it appears.
include "backends/web.vcl";
include "backends/api.vcl";
include "directors/round-robin.vcl";
include "rules/cache-rules.vcl";
include "rules/security.vcl";
# vcl_recv calls route_request; the directory layout lists rules/routing.vcl
# as the routing rules file, so it has to be included for that call to resolve.
# NOTE(review): confirm route_request is defined there and not in another file.
include "rules/routing.vcl";

# Entry point for client requests: run the shared subroutines in a fixed
# order (security first, then routing, then the caching decision).
sub vcl_recv {
    call security_check;
    call route_request;
    call cache_decision;
}
14.1.2 命名规范
# 后端命名:{环境}_{服务}_{序号}(VCL 标识符统一使用小写加下划线)
backend prod_web_01 { ... }
backend prod_web_02 { ... }
backend prod_api_01 { ... }
# Director 命名:{服务}_pool
# (在 VCL 中使用小写下划线)
# new web_pool = directors.round_robin();
# new api_pool = directors.fallback();
# 自定义头部命名
# X-{项目}-{功能}
set req.http.X-MyApp-Cache-TTL = "300";
set req.http.X-MyApp-Request-ID = req.xid;
# 探针命名:{服务}_probe
probe web_probe { ... }
probe api_probe { ... }
14.1.3 变更管理流程
1. 提交变更请求
│
2. 代码审查
│
3. 测试环境验证
│
4. 灰度发布(10% 流量)
│
5. 监控观察(30 分钟)
│
6. 全量发布
│
7. 验证确认
│
8. 更新文档
#!/bin/bash
# deploy-vcl.sh - validate, load and activate a VCL file, then prune old VCLs.
#
# Usage: deploy-vcl.sh <vcl-file>
# Exit status: 0 on success; 1 on a usage error or when validation, loading
# or activation fails. Cleanup failures are reported but do not fail the run.

VCL_FILE="$1"
VCL_NAME="deploy_$(date +%Y%m%d_%H%M%S)"

if [ -z "$VCL_FILE" ] || [ ! -r "$VCL_FILE" ]; then
    echo "Usage: $0 <vcl-file>" >&2
    exit 1
fi
# vcl.load is executed by varnishd, not by this shell, so pass an absolute path.
VCL_FILE=$(readlink -f "$VCL_FILE")

# Print the names of loaded VCLs whose status column equals $1
# (active | available). The position of the name column differs between
# Varnish releases (3rd, 4th or 5th field), so the name is taken as the field
# right after the integer "busy" counter. Label entries are skipped.
vcl_names() {
    varnishadm vcl.list | awk -v status="$1" '
        $1 == status && $2 !~ /^label/ {
            for (i = 2; i < NF; i++) {
                if ($i ~ /^[0-9]+$/) { print $(i + 1); break }
            }
        }'
}

echo "=== VCL Deployment ==="
echo "File: $VCL_FILE"
echo "Name: $VCL_NAME"

# 1. Validate the VCL. -C prints the generated C source on stdout; only the
#    exit status (and any compiler errors on stderr) matter here.
echo "1. Validating VCL..."
if ! varnishd -C -f "$VCL_FILE" > /dev/null; then
    echo "ERROR: VCL validation failed"
    exit 1
fi
echo " VCL validation passed"

# 2. Load the VCL under a timestamped name (it is not serving traffic yet).
echo "2. Loading VCL..."
if ! varnishadm vcl.load "$VCL_NAME" "$VCL_FILE"; then
    echo "ERROR: VCL loading failed"
    exit 1
fi
echo " VCL loaded as $VCL_NAME"

# 3. Switch traffic to the new VCL.
echo "3. Activating VCL..."
if ! varnishadm vcl.use "$VCL_NAME"; then
    echo "ERROR: VCL activation failed"
    exit 1
fi
echo " VCL activated"

# 4. Discard old VCLs, keeping the last three inactive ones for rollback.
#    Assumes vcl.list prints VCLs in load order (oldest first) - TODO confirm.
echo "4. Cleaning up old VCLs..."
OLD_VCLS=$(vcl_names available | head -n -3)
for vcl in $OLD_VCLS; do
    # Only report success when varnishd actually accepted the discard.
    if varnishadm vcl.discard "$vcl" > /dev/null 2>&1; then
        echo " Discarded: $vcl"
    else
        echo " WARNING: could not discard $vcl" >&2
    fi
done
echo "=== Deployment Complete ==="
14.2 监控体系
14.2.1 监控架构
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Varnish │────▶│ Exporter │────▶│ Prometheus │
│ 实例 │ │ (9131) │ │ │
└─────────────┘ └─────────────┘ └──────┬──────┘
│
▼
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ 告警 │◀────│ AlertManager│◀───│ Grafana │
│ (邮件/钉钉)│ │ │ │ (仪表盘) │
└─────────────┘ └─────────────┘ └─────────────┘
14.2.2 关键监控指标
| 指标 | 说明 | 告警阈值 |
|---|---|---|
| 缓存命中率 | HIT / (HIT + MISS) | < 70% |
| 请求速率 | 每秒请求数 | > 10000 或突增 50% |
| 后端健康状态 | 健康后端数量 | < 50% |
| 线程使用率 | 活跃线程 / 最大线程 | > 80% |
| 内存使用率 | 已用内存 / 总内存 | > 90% |
| 5xx 错误率 | 5xx / 总请求 | > 1% |
| 响应时间(P95) | P95 响应延迟 | > 100ms |
| 连接数 | 当前连接数 | > 最大连接数 80% |
| ban 数量 | 活跃 ban 规则数 | > 1000 |
| LRU 淘汰率 | 每秒淘汰对象数 | 持续 > 0 |
14.2.3 Prometheus 配置
# prometheus.yml
# Scrape configuration: one job for the Varnish exporter (port 9131) and one
# for node_exporter (port 9100) on the same three hosts.
scrape_configs:
  - job_name: 'varnish'
    static_configs:
      - targets:
          - 'varnish-01:9131'
          - 'varnish-02:9131'
          - 'varnish-03:9131'
    # Per-job overrides; the timeout must stay below the interval.
    scrape_interval: 15s
    scrape_timeout: 10s
    metrics_path: /metrics
  - job_name: 'node'
    static_configs:
      - targets:
          - 'varnish-01:9100'
          - 'varnish-02:9100'
          - 'varnish-03:9100'
14.2.4 告警规则
# alert-rules.yml
# Prometheus alerting rules for Varnish.
# NOTE(review): metric names depend on the exporter in use - confirm each one
# against the exporter's /metrics output before relying on these alerts. A rule
# that references a metric that does not exist never fires and raises no error.
groups:
  - name: varnish
    rules:
      # Hit ratio in percent over the last 5 minutes.
      - alert: VarnishLowHitRate
        expr: rate(varnish_main_cache_hit[5m]) / (rate(varnish_main_cache_hit[5m]) + rate(varnish_main_cache_miss[5m])) * 100 < 70
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Varnish 缓存命中率低于 70%"
          description: "{{ $labels.instance }} 缓存命中率: {{ $value }}%"
      # Share of 5xx responses in percent.
      # NOTE(review): verify that varnish_main_client_req_5xx is really exported;
      # it may have to come from a log-based exporter instead.
      - alert: VarnishHighErrorRate
        expr: rate(varnish_main_client_req_5xx[5m]) / rate(varnish_main_client_req[5m]) * 100 > 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Varnish 5xx 错误率超过 1%"
      # NOTE(review): confirm the backend health metric name for your exporter.
      - alert: VarnishBackendUnhealthy
        expr: varnish_backend_healthy == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Varnish 后端不健康: {{ $labels.backend }}"
      # Used bytes as a share of used + free bytes of the malloc storage.
      - alert: VarnishHighMemoryUsage
        expr: varnish_sma_g_bytes / (varnish_sma_g_bytes + varnish_sma_g_space) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Varnish 内存使用率超过 90%"
      # Absolute thread count; 400 is a fixed threshold, not a share of the
      # configured maximum.
      - alert: VarnishThreadSaturation
        expr: varnish_main_threads > 400
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Varnish 线程数超过 400"
14.2.5 Grafana 仪表盘
{
"dashboard": {
"title": "Varnish Cache Overview",
"panels": [
{
"title": "Cache Hit Rate",
"type": "stat",
"targets": [{
"expr": "rate(varnish_main_cache_hit[5m]) / (rate(varnish_main_cache_hit[5m]) + rate(varnish_main_cache_miss[5m])) * 100"
}],
"thresholds": {
"steps": [
{"value": 0, "color": "red"},
{"value": 70, "color": "yellow"},
{"value": 90, "color": "green"}
]
}
},
{
"title": "Request Rate",
"type": "graph",
"targets": [{
"expr": "rate(varnish_main_client_req[5m])"
}]
},
{
"title": "Backend Health",
"type": "stat",
"targets": [{
"expr": "count(varnish_backend_healthy == 1)"
}]
},
{
"title": "Response Time Distribution",
"type": "heatmap",
"targets": [{
"expr": "rate(varnish_main_n_vcl_response[5m])"
}]
}
]
}
}
14.3 安全加固
14.3.1 访问控制
# Access control lists used by vcl_recv below.

# Networks allowed to reach /admin.
acl management {
    "localhost";
    "192.168.1.0"/24;
    "10.0.0.0"/8;
}

# Clients allowed to send PURGE and BAN requests.
acl purge_allowed {
    "localhost";
    "192.168.1.0"/24;
}

# Individually blocked client addresses.
acl blocked_ips {
    "192.168.1.100";
    "10.0.0.50";
}

# Request-time access control: deny-list first, then per-feature allow-lists.
sub vcl_recv {
    # Deny-listed clients are rejected before anything else is evaluated.
    if (client.ip ~ blocked_ips) {
        return (synth(403, "Forbidden"));
    }

    # /admin is reachable only from the management networks.
    if (req.url ~ "^/admin" && client.ip !~ management) {
        return (synth(403, "Admin access denied"));
    }

    # PURGE: remove the object for this URL, for allowed clients only.
    if (req.method == "PURGE") {
        if (client.ip !~ purge_allowed) {
            return (synth(403, "Purge not allowed"));
        }
        return (purge);
    }

    # BAN: invalidate the objects matching this host and URL.
    if (req.method == "BAN") {
        if (client.ip !~ purge_allowed) {
            return (synth(403, "Ban not allowed"));
        }
        # The ban must be issued and the request answered here. Otherwise an
        # authorised BAN falls through to the built-in vcl_recv, which does
        # not know the method and hands the request on to the backend.
        ban("req.http.host == " + req.http.host + " && req.url == " + req.url);
        return (synth(200, "Ban added"));
    }
}
14.3.2 安全头部
# Response hardening: add browser security headers and strip headers that
# reveal details about the stack.
sub vcl_deliver {
    # Security headers.
    set resp.http.X-Content-Type-Options = "nosniff";
    set resp.http.X-Frame-Options = "SAMEORIGIN";
    # The legacy XSS auditor is deprecated and "1; mode=block" can itself be
    # abused in browsers that still honour it. Current guidance is to switch
    # it off explicitly and rely on a Content-Security-Policy instead.
    set resp.http.X-XSS-Protection = "0";
    set resp.http.Referrer-Policy = "strict-origin-when-cross-origin";
    # Browsers only honour HSTS on responses received over HTTPS.
    set resp.http.Strict-Transport-Security = "max-age=31536000; includeSubDomains";

    # Remove headers that identify backend software.
    unset resp.http.Server;
    unset resp.http.X-Powered-By;
    unset resp.http.X-AspNet-Version;

    # Remove debugging/tracing headers in production.
    unset resp.http.X-Varnish;
    unset resp.http.Via;
    unset resp.http.X-Debug;
}
14.3.3 速率限制
# 注意:VCL 原生不支持速率限制
# 需要使用外部工具或 VMOD
# 使用 Nginx 进行速率限制
# 或使用 VMOD 实现,例如 varnish-modules 中的 vmod_vsthrottle
# 基本的请求限制(基于头部)
# Reject requests that arrive already marked as over the rate limit.
# This block does no counting itself: it only reacts to an X-Rate-Limit
# request header whose value is "exceeded".
# NOTE(review): presumably a rate limiter in front of Varnish sets this
# header - confirm, and make sure client-supplied copies are stripped there.
sub vcl_recv {
# Answer 429 when the request is flagged as over the limit
if (req.http.X-Rate-Limit == "exceeded") {
return (synth(429, "Too Many Requests"));
}
}
14.3.4 DDoS 防护
# Basic request filtering against abusive traffic.
sub vcl_recv {
    # Limit the declared request body size for every method that carries a
    # body, not only POST. The anchored pattern matches a Content-Length of
    # 1,000,000 bytes or more (7+ significant digits) and no longer matches
    # on stray digits elsewhere in the header value or on zero padding.
    if (req.method ~ "^(POST|PUT|PATCH)$" &&
        req.http.Content-Length ~ "^0*[1-9][0-9]{6,}$") {
        return (synth(413, "Request Entity Too Large"));
    }

    # Block crawler-like User-Agents everywhere except robots.txt itself.
    # The dot is escaped and the path is anchored so only /robots.txt
    # (optionally with a query string) is exempt.
    # NOTE(review): this pattern also matches well-known search engine
    # crawlers - confirm that blocking them is intended.
    if (req.http.User-Agent ~ "(?i)(bot|crawler|spider|scraper)" &&
        req.url !~ "^/robots\.txt(\?.*)?$") {
        return (synth(403, "Forbidden"));
    }

    # Limiting concurrent connections needs an external tool.

    # Placeholder: the login endpoint needs dedicated protection (for example
    # rate limiting); nothing is enforced here yet.
    if (req.url ~ "^/api/login" && req.method == "POST") {
        # Login endpoint needs special protection.
    }
}
14.3.5 管理接口安全
# 1. Bind the management (CLI) interface to the loopback address only.
varnishd -T 127.0.0.1:6082

# 2. Protect the CLI with a shared secret stored in /etc/varnish/secret.
#    Generate a random secret rather than a fixed string, and create the file
#    with owner-only permissions from the start (umask) so it is never
#    readable by others, not even briefly before a later chmod.
(umask 077 && head -c 32 /dev/urandom | base64 > /etc/varnish/secret)
chmod 600 /etc/varnish/secret

# Start varnishd with the secret file.
varnishd -T 127.0.0.1:6082 -S /etc/varnish/secret

# 3. Reach the management interface from another machine through an SSH tunnel.
ssh -L 6082:localhost:6082 user@varnish-server
# varnishadm has to present the same secret that varnishd was started with,
# otherwise authentication fails. Copy the secret file over a secure channel
# and point -S at the local copy.
varnishadm -T localhost:6082 -S /etc/varnish/secret
14.4 CDN 架构
14.4.1 多级缓存架构
用户请求
│
▼
┌──────────────┐
│ CDN 边缘 │ ← 第一级缓存(全球分布)
│ (Cloudflare)│
└──────┬───────┘
│
▼
┌──────────────┐
│ 区域 Varnish │ ← 第二级缓存(区域部署)
│ 集群 │
└──────┬───────┘
│
▼
┌──────────────┐
│ 应用 Varnish │ ← 第三级缓存(应用层)
│ │
└──────┬───────┘
│
▼
┌──────────────┐
│ 后端应用 │ ← 源站
│ 服务器 │
└──────────────┘
14.4.2 多级缓存配置
# 区域 Varnish 配置
vcl 4.1;
# Origin backend for the regional Varnish tier.
backend cdn_origin {
    # Open-source Varnish Cache speaks plain HTTP to backends; the `.ssl`
    # attribute is not part of its backend definition and the VCL does not
    # compile with it. For an HTTPS origin, point the backend at a local
    # TLS-originating proxy (for example HAProxy or stunnel listening on
    # 127.0.0.1:8443 and forwarding to cdn.example.com:443).
    .host = "127.0.0.1";
    .port = "8443";
    # Keep sending the origin's name in the Host header.
    .host_header = "cdn.example.com";
    # Health probe: request /health every 10 seconds, 5 second timeout.
    .probe = {
        .url = "/health";
        .timeout = 5s;
        .interval = 10s;
    }
}
# Request preparation for the regional tier: pass on the client address
# reported by the CDN and record URL + Host in a request header.
sub vcl_recv {
# Copy the CDN-provided client address (CF-Connecting-IP) into X-Real-IP.
# NOTE(review): the header is copied without checking where the request came
# from - confirm that only the CDN can reach this tier.
if (req.http.CF-Connecting-IP) {
set req.http.X-Real-IP = req.http.CF-Connecting-IP;
}
# Store URL + Host in the X-Cache-Key request header.
# This only sets a header: nothing in this block changes how the cache
# hash is computed.
set req.http.X-Cache-Key = req.url + req.http.Host;
}
# TTL policy for fetched objects: honour the origin's s-maxage when it sends
# one, otherwise apply defaults by content type.
sub vcl_backend_response {
    # Varnish has already derived beresp.ttl from Cache-Control by the time
    # this subroutine runs, so an s-maxage from the origin is respected simply
    # by not overwriting the TTL. Previously the TTL was reset unconditionally
    # below, which discarded s-maxage.
    if (beresp.http.Cache-Control !~ "s-maxage=[0-9]+") {
        # Static assets, with or without a query string, are kept for an hour;
        # everything else for five minutes.
        if (bereq.url ~ "\.(css|js|jpg|png|gif)(\?.*)?$") {
            set beresp.ttl = 1h;
        } else {
            set beresp.ttl = 5m;
        }
    }
}
14.4.3 缓存清除协调
#!/bin/bash
# cdn-purge.sh - purge one URL path from every cache tier.
#
# Usage: cdn-purge.sh <url-path>        e.g. cdn-purge.sh /products/42
# Environment: ZONE_ID, CF_API_TOKEN (Cloudflare API), APP_TOKEN (application
#              cache API).
# Exit status: 0 when every tier was purged, 1 otherwise.
#
# Tiers are purged from the origin outwards (application -> Varnish -> CDN).
# Purging an outer tier first would let it refill immediately with stale
# content from a tier that has not been purged yet.

URL="$1"

if [ -z "$URL" ] || [ "${URL#/}" = "$URL" ]; then
    echo "Usage: $0 <url-path starting with />" >&2
    exit 1
fi
for var in ZONE_ID CF_API_TOKEN APP_TOKEN; do
    if [ -z "${!var}" ]; then
        echo "ERROR: environment variable $var is not set" >&2
        exit 1
    fi
done

# Escape backslashes and double quotes so the path cannot break out of the
# JSON string it is embedded in.
json_escape() {
    local s=${1//\\/\\\\}
    printf '%s' "${s//\"/\\\"}"
}
JSON_URL=$(json_escape "$URL")
FAILED=0

# 1. Application-level cache (closest to the origin).
echo "Purging application cache..."
if ! curl -sf -X POST "http://app-server/cache/purge" \
    -H "Authorization: Bearer ${APP_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "{\"url\": \"${JSON_URL}\"}" > /dev/null; then
    echo "ERROR: application cache purge failed" >&2
    FAILED=1
fi

# 2. Varnish.
echo "Purging Varnish cache..."
if ! curl -sf -X PURGE -H "Host: www.example.com" "http://varnish:6081${URL}" > /dev/null; then
    echo "ERROR: Varnish purge failed" >&2
    FAILED=1
fi

# 3. CDN edge (Cloudflare in this example).
echo "Purging Cloudflare cache..."
if ! curl -sf -X POST "https://api.cloudflare.com/client/v4/zones/${ZONE_ID}/purge_cache" \
    -H "Authorization: Bearer ${CF_API_TOKEN}" \
    -H "Content-Type: application/json" \
    --data "{\"files\":[\"https://www.example.com${JSON_URL}\"]}" > /dev/null; then
    echo "ERROR: Cloudflare purge failed" >&2
    FAILED=1
fi

# Report success only when every tier confirmed the purge.
if [ "$FAILED" -ne 0 ]; then
    echo "One or more caches could not be purged." >&2
    exit 1
fi
echo "All caches purged."
14.5 生产部署
14.5.1 部署清单
## 部署前检查
- [ ] VCL 语法验证通过
- [ ] 后端健康检查正常
- [ ] 防火墙规则配置
- [ ] TLS 证书有效
- [ ] 监控告警配置
- [ ] 回滚方案准备
- [ ] 备份当前配置
- [ ] 通知相关人员
## 部署步骤
1. 备份当前配置
2. 加载新 VCL(不激活)
3. 验证新 VCL
4. 灰度发布(10% 流量)
5. 监控关键指标
6. 全量发布
7. 验证确认
8. 更新文档
## 部署后检查
- [ ] 缓存命中率正常
- [ ] 后端负载正常
- [ ] 错误率无异常
- [ ] 响应时间正常
- [ ] 监控数据正常
- [ ] 用户反馈正常
14.5.2 回滚方案
#!/bin/bash
# rollback-vcl.sh - switch back to the most recently loaded inactive VCL.
#
# Exit status: 0 on success, 1 when there is nothing to roll back to or the
# switch fails.

# Print the names of loaded VCLs whose status column equals $1
# (active | available). The position of the name column differs between
# Varnish releases (3rd, 4th or 5th field), so the name is taken as the field
# right after the integer "busy" counter. Label entries are skipped because
# they are not rollback targets.
vcl_names() {
    varnishadm vcl.list | awk -v status="$1" '
        $1 == status && $2 !~ /^label/ {
            for (i = 2; i < NF; i++) {
                if ($i ~ /^[0-9]+$/) { print $(i + 1); break }
            }
        }'
}

echo "=== VCL Rollback ==="

# Currently active VCL.
CURRENT=$(vcl_names active)
echo "Current VCL: $CURRENT"

# Show what is available.
echo "Available VCLs:"
varnishadm vcl.list | grep "^available"

# Roll back to the last inactive VCL in the list.
# Assumes vcl.list prints VCLs in load order (oldest first) - TODO confirm.
PREVIOUS=$(vcl_names available | tail -1)
if [ -z "$PREVIOUS" ]; then
    echo "ERROR: No previous VCL available"
    exit 1
fi
echo "Rolling back to: $PREVIOUS"

# Switch, then drop the VCL that was rolled back from.
if varnishadm vcl.use "$PREVIOUS"; then
    echo "Rollback successful"
    if [ -n "$CURRENT" ]; then
        # Best effort: a VCL that is still referenced cannot be discarded yet.
        varnishadm vcl.discard "$CURRENT" 2>/dev/null
    fi
else
    echo "ERROR: Rollback failed"
    exit 1
fi
14.5.3 容量规划
# 容量计算公式
# 1. 缓存容量
# 缓存大小 = 平均对象大小 × 目标对象数 × 1.2(冗余)
# 示例:10KB × 100,000 × 1.2 = 1.2GB
# 2. 并发连接数
# 并发数 = 每秒请求数 × 平均响应时间
# 示例:1000 req/s × 0.1s = 100 并发
# 3. 带宽需求
# 带宽 = 每秒请求数 × 平均响应大小
# 示例:1000 req/s × 50KB = 50MB/s = 400Mbps
# 4. CPU 需求
# CPU 核数 = 并发数 / 500(每核约 500 并发)
# 示例:1000 并发 / 500 = 2 核
# 监控验证
varnishstat -1 | grep -E "MAIN.sess|MAIN.client_req"
14.6 更新策略
14.6.1 版本更新流程
#!/bin/bash
# update-varnish.sh - upgrade the varnish package (apt), reinstalling the
# previous package version if the existing VCL no longer compiles.
#
# Exit status: 0 on success, 1 on any failed step. Whenever the script exits
# after stopping the service it tries to start it again first.

echo "=== Varnish Update ==="

# 1. Record the current version. The varnishd banner is for display only;
#    apt needs the exact package version string for a rollback.
CURRENT_VERSION=$(varnishd -V 2>&1 | head -1)
CURRENT_PKG_VERSION=$(dpkg-query -W -f='${Version}' varnish 2>/dev/null)
echo "Current version: $CURRENT_VERSION"

# 2. Back up the configuration and stop if that fails. /etc/varnish can hold
#    root-only files (the CLI secret), hence sudo.
echo "Backing up configuration..."
BACKUP_FILE="/backup/varnish-config-$(date +%Y%m%d).tar.gz"
if ! { sudo mkdir -p /backup && sudo tar -czf "$BACKUP_FILE" -C / etc/varnish; }; then
    echo "ERROR: Backup failed, aborting update"
    exit 1
fi

# 3. Stop the service.
echo "Stopping Varnish..."
sudo systemctl stop varnish

# 4. Update the package.
echo "Updating Varnish..."
if ! sudo apt-get update || ! sudo apt-get install -y varnish; then
    echo "ERROR: Package update failed"
    sudo systemctl start varnish
    exit 1
fi

# 5. Show the new version.
NEW_VERSION=$(varnishd -V 2>&1 | head -1)
echo "New version: $NEW_VERSION"

# 6. Check that the existing VCL compiles with the new version.
#    -C prints the generated C source on stdout; only the status matters.
echo "Validating configuration..."
if ! varnishd -C -f /etc/varnish/default.vcl > /dev/null; then
    echo "ERROR: Configuration invalid with new version"
    echo "Rolling back..."
    if [ -n "$CURRENT_PKG_VERSION" ]; then
        # With -y, apt refuses to downgrade unless --allow-downgrades is given.
        sudo apt-get install -y --allow-downgrades "varnish=$CURRENT_PKG_VERSION"
    else
        echo "WARNING: previous package version unknown, cannot reinstall it"
    fi
    # Do not leave the cache down after a failed update.
    sudo systemctl start varnish
    exit 1
fi

# 7. Start the service.
echo "Starting Varnish..."
sudo systemctl start varnish

# 8. Verify that Varnish answers HTTP requests.
echo "Verifying service..."
sleep 5
if curl -sf http://localhost:6081/ > /dev/null; then
    echo "Update successful"
else
    echo "ERROR: Service not responding"
    exit 1
fi
echo "=== Update Complete ==="
14.6.2 零停机更新
#!/bin/bash
# zero-downtime-update.sh - zero-downtime update
# Blue-green style replacement of the Varnish container: a new container is
# started next to the old one on host port 6083 and is kept only if it
# answers an HTTP request.
# Step 1: start Varnish in a new container (configuration mounted read-only)
docker run -d --name varnish-new \
-p 6083:6081 \
-v /etc/varnish:/etc/varnish:ro \
varnish:7.5-new
# Step 2: verify the new container; remove it and abort if it does not answer
sleep 10
if curl -sf http://localhost:6083/ > /dev/null; then
echo "New container healthy"
else
echo "New container failed"
docker rm -f varnish-new
exit 1
fi
# Step 3: switch traffic (using HAProxy or Nginx)
# Update the load balancer configuration so that traffic goes to the new container
# NOTE(review): this step is only a comment. As written, the old container is
# stopped below without any traffic switch having happened - the load balancer
# update has to be implemented here before the script is zero-downtime.
# Step 4: stop and remove the old container
docker stop varnish-old
docker rm varnish-old
# Step 5: rename the new container so the next run finds it as varnish-old
# NOTE(review): the renamed container still publishes host port 6083, which
# the next run tries to publish again - confirm how the port is meant to rotate.
docker rename varnish-new varnish-old
echo "Zero-downtime update complete"
14.6.3 配置热更新
# VCL 热更新(不中断服务)
# 1. 加载新 VCL
varnishadm vcl.load new_config /etc/varnish/new.vcl
# 2. 验证新 VCL
varnishadm vcl.list
# 3. 激活新 VCL
varnishadm vcl.use new_config
# 4. 清理旧 VCL
varnishadm vcl.discard old_config
# 注意:VCL 热更新不会中断现有连接
# 切换后新请求使用新 VCL,正在处理中的请求继续使用旧 VCL 直至完成
14.7 备份与恢复
14.7.1 配置备份
#!/bin/bash
# backup-config.sh - archive the Varnish configuration and prune old archives.
#
# Archives are written to /backup/varnish with member paths relative to /
# (etc/varnish/...), so they can be unpacked with `tar -xzf <file> -C /`.
# Exit status: 0 when an archive was written, 1 otherwise.

BACKUP_DIR="/backup/varnish"
DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="${BACKUP_DIR}/varnish-config-${DATE}.tar.gz"

# Create the backup directory.
if ! mkdir -p "$BACKUP_DIR"; then
    echo "ERROR: Cannot create $BACKUP_DIR" >&2
    exit 1
fi

# Archive only the paths that exist on this host. Not every installation has
# /etc/default/varnish or a unit override, and a missing path would make tar
# fail. Leading slashes are stripped because tar runs with -C /.
SOURCES=()
for path in /etc/varnish /etc/default/varnish /etc/systemd/system/varnish.service; do
    if [ -e "$path" ]; then
        SOURCES+=("${path#/}")
    fi
done
if [ "${#SOURCES[@]}" -eq 0 ]; then
    echo "ERROR: No Varnish configuration found to back up" >&2
    exit 1
fi

# Errors stay visible, and a failed run does not leave a partial archive
# behind or report success.
if ! tar -czf "$BACKUP_FILE" -C / "${SOURCES[@]}"; then
    echo "ERROR: Backup failed" >&2
    rm -f "$BACKUP_FILE"
    exit 1
fi

# Keep 30 days of backups; only archives written by this script are pruned.
find "$BACKUP_DIR" -type f -name "varnish-config-*.tar.gz" -mtime +30 -delete
echo "Backup created: $BACKUP_FILE"
14.7.2 配置恢复
#!/bin/bash
# restore-config.sh - restore the Varnish configuration from a backup archive.
#
# Usage: restore-config.sh <backup-file>
# The archive is unpacked relative to /, so its members are expected to be
# paths such as etc/varnish/default.vcl.
# Exit status: 0 on success, 1 on any failed step. If the restored VCL does
# not compile, the previous /etc/varnish is put back and Varnish is restarted.

BACKUP_FILE="$1"
if [ -z "$BACKUP_FILE" ]; then
    echo "Usage: $0 <backup-file>"
    exit 1
fi

# Check the archive before touching the running service.
if [ ! -r "$BACKUP_FILE" ] || ! tar -tzf "$BACKUP_FILE" > /dev/null 2>&1; then
    echo "ERROR: $BACKUP_FILE is missing or is not a readable tar.gz archive"
    exit 1
fi

echo "=== Restoring Varnish Configuration ==="
echo "Backup: $BACKUP_FILE"

# Keep a copy of the current configuration so a bad restore can be undone.
# cp -a preserves ownership and modes (the CLI secret stays owner-only).
SNAPSHOT_DIR=""
if [ -d /etc/varnish ]; then
    SNAPSHOT_DIR="/etc/varnish.pre-restore.$(date +%Y%m%d_%H%M%S)"
    if ! sudo cp -a /etc/varnish "$SNAPSHOT_DIR"; then
        echo "ERROR: Could not snapshot current configuration"
        exit 1
    fi
fi

# 1. Stop Varnish.
echo "Stopping Varnish..."
sudo systemctl stop varnish

# 2. Unpack the archive over /.
echo "Restoring configuration..."
if ! sudo tar -xzf "$BACKUP_FILE" -C /; then
    echo "ERROR: Extraction failed"
    RESTORE_OK=0
else
    RESTORE_OK=1
fi

# 3. Validate the restored VCL. -C prints the generated C source on stdout;
#    only the exit status matters.
if [ "$RESTORE_OK" -eq 1 ]; then
    echo "Validating configuration..."
    if ! varnishd -C -f /etc/varnish/default.vcl > /dev/null; then
        echo "ERROR: Configuration validation failed"
        RESTORE_OK=0
    fi
fi

# On failure, put the previous /etc/varnish back and bring the service up
# again rather than leaving the cache stopped with a broken configuration.
# Files restored outside /etc/varnish are not reverted.
if [ "$RESTORE_OK" -ne 1 ]; then
    if [ -n "$SNAPSHOT_DIR" ]; then
        echo "Reverting /etc/varnish to its previous contents..."
        sudo rm -rf /etc/varnish
        sudo mv "$SNAPSHOT_DIR" /etc/varnish
    fi
    sudo systemctl start varnish
    exit 1
fi

# 4. Start Varnish.
echo "Starting Varnish..."
sudo systemctl start varnish

# 5. Verify that Varnish answers HTTP requests.
echo "Verifying service..."
sleep 5
if curl -sf http://localhost:6081/ > /dev/null; then
    echo "Restore successful"
else
    echo "ERROR: Service not responding"
    exit 1
fi
echo "=== Restore Complete ==="
14.8 文档规范
14.8.1 运维文档模板
# Varnish 运维文档
## 1. 系统概述
- 部署架构
- 服务器列表
- 网络拓扑
## 2. 配置说明
- VCL 配置说明
- 参数配置说明
- 后端配置说明
## 3. 运维操作
- 日常巡检
- 故障处理
- 性能调优
- 版本更新
## 4. 应急预案
- 缓存清除
- 服务降级
- 故障转移
- 回滚操作
## 5. 监控告警
- 监控指标
- 告警规则
- 告警处理
## 6. 联系方式
- 运维负责人
- 开发负责人
- 供应商联系
14.8.2 变更记录
## 变更记录
### 2026-05-10
- **类型**: 配置变更
- **内容**: 优化缓存策略,产品页 TTL 从 5 分钟调整为 10 分钟
- **原因**: 减少后端负载
- **影响**: 缓存命中率提升 5%
- **负责人**: 张三
### 2026-05-08
- **类型**: 版本升级
- **内容**: Varnish 7.4.3 → 7.5.1
- **原因**: 安全补丁
- **影响**: 无
- **负责人**: 李四
14.9 注意事项
重要
- 所有配置变更必须经过测试环境验证
- 生产环境变更必须有回滚方案
- 监控告警必须有人响应和处理
- 定期备份配置,备份必须可恢复
- 文档要及时更新,保持与实际配置一致
- 安全漏洞要及时修复,定期更新版本
- 容量规划要提前进行,避免资源不足