强曰为道

与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧。
文档目录

第08章:负载均衡与 Director

第08章:负载均衡与 Director

8.1 负载均衡概述

Varnish 通过 Director(导向器)机制实现后端服务器的负载均衡。Director 是一组后端服务器的集合,按照特定算法分配请求。

8.1.1 负载均衡的优势

| 优势 | 说明 |
| --- | --- |
| 高可用性 | 单个后端故障不影响整体服务 |
| 水平扩展 | 增加后端即可提升处理能力 |
| 负载分配 | 均匀分配请求,避免单点过载 |
| 故障隔离 | 自动剔除故障节点 |

8.1.2 Director 类型

| 类型 | 算法 | 适用场景 |
| --- | --- | --- |
| round_robin | 轮询 | 通用场景,后端性能一致 |
| fallback | 回退 | 主备架构,故障转移 |
| random | 随机 | 简单负载分配 |
| hash | 哈希 | 会话保持,缓存亲和 |

8.2 基本 Director 配置

8.2.1 单后端配置

vcl 4.1;

# Single origin server with an inline (anonymous) health probe.
backend web01 {
    # Where to connect.
    .host = "192.168.1.10";
    .port = "80";

    # Cap on simultaneous connections to this backend.
    .max_connections = 300;

    # Timeouts: TCP connect, first response byte, gap between bytes.
    .connect_timeout = 5s;
    .first_byte_timeout = 30s;
    .between_bytes_timeout = 10s;

    # Poll /health every 5s; the backend counts as healthy when at least
    # 3 of the last 5 polls succeeded.
    .probe = {
        .url = "/health";
        .interval = 5s;
        .timeout = 3s;
        .window = 5;
        .threshold = 3;
    }
}

# Send every request to the only backend.
sub vcl_recv {
    set req.backend_hint = web01;
}

8.2.2 多后端简单配置

vcl 4.1;

# vmod_std provides std.random(), used below for the random split.
import std;

# Shared health probe. It is declared before the backends because a
# `.probe = <name>` reference must point at an already-declared probe.
probe health_check {
    .url = "/health";
    .timeout = 2s;
    .interval = 5s;
    .window = 5;
    .threshold = 3;
    .expected_response = 200;
}

backend web01 {
    .host = "192.168.1.10";
    .port = "80";
    .probe = health_check;
}

backend web02 {
    .host = "192.168.1.11";
    .port = "80";
    .probe = health_check;
}

backend web03 {
    .host = "192.168.1.12";
    .port = "80";
    .probe = health_check;
}

sub vcl_recv {
    # Manual backend selection without a director.
    if (req.url ~ "^/api/") {
        # API traffic always goes to web01.
        set req.backend_hint = web01;
    } else {
        # Everything else: random 50/50 split between web02 and web03.
        # std.random(lo, hi) returns a REAL in the given range.
        if (std.random(0, 1) < 0.5) {
            set req.backend_hint = web02;
        } else {
            set req.backend_hint = web03;
        }
    }
}

8.3 Director 详解

8.3.1 Round-Robin Director

轮询是最简单的负载均衡算法,按顺序将请求分配给每个后端。

vcl 4.1;

import directors;

# Shared health probe. It is declared before the backends because a
# `.probe = <name>` reference must point at an already-declared probe.
probe health_check {
    .url = "/health";
    .timeout = 2s;
    .interval = 5s;
    .window = 5;
    .threshold = 3;
}

backend web01 {
    .host = "192.168.1.10";
    .port = "80";
    .probe = health_check;
}

backend web02 {
    .host = "192.168.1.11";
    .port = "80";
    .probe = health_check;
}

backend web03 {
    .host = "192.168.1.12";
    .port = "80";
    .probe = health_check;
}

sub vcl_init {
    # Create the round-robin director.
    new web_director = directors.round_robin();

    # Register its member backends.
    web_director.add_backend(web01);
    web_director.add_backend(web02);
    web_director.add_backend(web03);
}

sub vcl_recv {
    # Let the director pick the backend for this request.
    set req.backend_hint = web_director.backend();
}

8.3.2 Fallback Director

回退 Director 按优先级选择后端,第一个健康的后端会被选中。

vcl 4.1;

import directors;

# Shared health probe. It is declared before the backends because a
# `.probe = <name>` reference must point at an already-declared probe.
probe health_check {
    .url = "/health";
    .timeout = 2s;
    .interval = 5s;
    .window = 5;
    .threshold = 3;
}

backend primary {
    .host = "192.168.1.10";
    .port = "80";
    .probe = health_check;
}

backend secondary {
    .host = "192.168.1.11";
    .port = "80";
    .probe = health_check;
}

backend tertiary {
    .host = "192.168.1.12";
    .port = "80";
    .probe = health_check;
}

sub vcl_init {
    # Create the fallback director.
    new failover = directors.fallback();

    # Order matters: the first backend added has the highest priority.
    failover.add_backend(primary);
    failover.add_backend(secondary);
    failover.add_backend(tertiary);
}

sub vcl_recv {
    # The director hands out the first healthy backend in the list.
    set req.backend_hint = failover.backend();
}

8.3.3 Random Director

随机 Director 按权重随机选择后端。

vcl 4.1;

import directors;

# Shared health probe. It is declared before the backends because a
# `.probe = <name>` reference must point at an already-declared probe.
probe health_check {
    .url = "/health";
    .timeout = 2s;
    .interval = 5s;
    .window = 5;
    .threshold = 3;
}

backend web01 {
    .host = "192.168.1.10";
    .port = "80";
    .probe = health_check;
}

backend web02 {
    .host = "192.168.1.11";
    .port = "80";
    .probe = health_check;
}

backend web03 {
    .host = "192.168.1.12";
    .port = "80";
    .probe = health_check;
}

sub vcl_init {
    # Create the random director.
    new random_director = directors.random();

    # Register backends with their relative weights.
    random_director.add_backend(web01, 10);  # weight 10
    random_director.add_backend(web02, 5);   # weight 5
    random_director.add_backend(web03, 1);   # weight 1
}

sub vcl_recv {
    set req.backend_hint = random_director.backend();
}

8.3.4 Hash Director

哈希 Director 根据请求的特定属性(如 URL、Cookie)选择后端,确保同一请求总是到达同一后端。

vcl 4.1;

import directors;

# Shared health probe. It is declared before the backends because a
# `.probe = <name>` reference must point at an already-declared probe.
probe health_check {
    .url = "/health";
    .timeout = 2s;
    .interval = 5s;
    .window = 5;
    .threshold = 3;
}

backend cache01 {
    .host = "192.168.1.10";
    .port = "80";
    .probe = health_check;
}

backend cache02 {
    .host = "192.168.1.11";
    .port = "80";
    .probe = health_check;
}

backend cache03 {
    .host = "192.168.1.12";
    .port = "80";
    .probe = health_check;
}

sub vcl_init {
    # Create the hash director with three equally weighted members.
    new hash_director = directors.hash();

    hash_director.add_backend(cache01, 1.0);
    hash_director.add_backend(cache02, 1.0);
    hash_director.add_backend(cache03, 1.0);
}

sub vcl_recv {
    # Hash on the URL so the same URL always lands on the same backend
    # (cache affinity).
    set req.backend_hint = hash_director.backend(req.url);

    # Alternative: hash on the client IP.
    # set req.backend_hint = hash_director.backend(client.ip);

    # Alternative: hash on the session ID carried in the Cookie header.
    # if (req.http.Cookie ~ "session_id=") {
    #     set req.backend_hint = hash_director.backend(
    #         regsub(req.http.Cookie, ".*session_id=([^;]+).*", "\1")
    #     );
    # }
}

8.4 健康检查

8.4.1 健康检查配置

# Basic health check: GET /health, healthy when 3 of the last 5 polls pass.
probe basic_health {
    .url = "/health";
    .timeout = 2s;
    .interval = 5s;
    .window = 5;
    .threshold = 3;
}

# Advanced health check using a hand-written request. Each string is one
# request line; use .request instead of .url when custom headers are needed.
probe advanced_health {
    .request =
        "GET /health HTTP/1.1"
        "Host: backend.local"
        "Connection: close"
        "User-Agent: Varnish-Health-Check";

    .timeout = 3s;
    .interval = 10s;
    .window = 10;
    .threshold = 8;

    # Only the status code is checked. Varnish Cache probes have no
    # response-body matching attribute (there is no `.match`), so a body
    # check has to be implemented in the backend's /health endpoint itself.
    .expected_response = 200;
}

# Minimal probe. NOTE: despite the name this is not a TCP-only check. With
# neither .url nor .request set, the probe still issues an HTTP GET for "/"
# and expects a 200 response.
probe tcp_health {
    .timeout = 1s;
    .interval = 5s;
    .window = 5;
    .threshold = 3;
}

8.4.2 健康检查参数详解

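| 参数 | 说明 | 默认值 |
| --- | --- | --- |
| `.url` | 检查的 URL 路径 | `/` |
| `.request` | 自定义 HTTP 请求 | - |
| `.timeout` | 超时时间 | 2s |
| `.interval` | 检查间隔 | 5s |
| `.window` | 滑动窗口大小 | 8 |
| `.threshold` | 健康阈值 | 3 |
| `.expected_response` | 期望的状态码 | 200 |
| `.match` | 响应体匹配(注意:开源版 Varnish Cache 的 probe 不支持该属性,只能检查状态码) | - |
| `.initial` | 初始状态(启动时视为成功的探测次数) | threshold - 1 |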

8.4.3 健康检查端点实现

# Flask 示例:健康检查端点
from datetime import datetime, timezone

import psutil
from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/health')
def health_check():
    """Basic health check.

    Returns:
        A JSON response with a fixed ``"healthy"`` status and the current
        UTC time as a timezone-aware ISO-8601 string.
    """
    return jsonify({
        "status": "healthy",
        # Timezone-aware replacement for the deprecated datetime.utcnow().
        "timestamp": datetime.now(timezone.utc).isoformat()
    })

@app.route('/health/detailed')
def detailed_health():
    """Detailed health check covering CPU, memory and root-disk usage.

    Returns:
        A ``(response, status_code)`` tuple. The JSON body holds one entry
        per check plus an overall ``status``. The status code is 200 when
        every check is healthy and 503 as soon as any check is unhealthy.
    """
    # Sample each metric exactly once so the reported value is the same
    # number the status was derived from. Maps name -> (value, limit in %).
    readings = {
        "cpu": (psutil.cpu_percent(), 80),
        "memory": (psutil.virtual_memory().percent, 80),
        "disk": (psutil.disk_usage('/').percent, 90),
    }

    checks = {
        name: {
            "status": "healthy" if value < limit else "unhealthy",
            "value": value,
        }
        for name, (value, limit) in readings.items()
    }

    # The overall status aggregates the individual checks; without this the
    # endpoint would always answer 200 and a probe could never see a failure.
    all_ok = all(check["status"] == "healthy" for check in checks.values())
    health = {
        "status": "healthy" if all_ok else "unhealthy",
        "checks": checks,
    }

    status_code = 200 if all_ok else 503
    return jsonify(health), status_code

8.4.4 查看健康检查状态

# Show the health state of every backend
varnishadm backend.list

# Sample output:
# 200  Backend name                   Admin      Probe
# web01(192.168.1.10,,80)             probe      Success 5/5
# web02(192.168.1.11,,80)             probe      Success 3/5
# web03(192.168.1.12,,80)             probe      Sick 0/5

# Show detailed probe information
varnishadm backend.list -p

8.5 高级负载均衡策略

8.5.1 基于 URL 的路由

vcl 4.1;

import directors;

# Shared health probe. It is declared before the backends because a
# `.probe = <name>` reference must point at an already-declared probe.
probe health_check {
    .url = "/health";
    .timeout = 2s;
    .interval = 5s;
    .window = 5;
    .threshold = 3;
}

backend api01 {
    .host = "192.168.1.20";
    .port = "80";
    .probe = health_check;
}

backend api02 {
    .host = "192.168.1.21";
    .port = "80";
    .probe = health_check;
}

backend web01 {
    .host = "192.168.1.10";
    .port = "80";
    .probe = health_check;
}

backend web02 {
    .host = "192.168.1.11";
    .port = "80";
    .probe = health_check;
}

backend static01 {
    .host = "192.168.1.30";
    .port = "80";
    .probe = health_check;
}

sub vcl_init {
    # API server pool.
    new api_pool = directors.round_robin();
    api_pool.add_backend(api01);
    api_pool.add_backend(api02);

    # Web server pool.
    new web_pool = directors.round_robin();
    web_pool.add_backend(web01);
    web_pool.add_backend(web02);

    # Static asset server pool.
    new static_pool = directors.round_robin();
    static_pool.add_backend(static01);
}

sub vcl_recv {
    # Route by URL path: /api/ -> API pool, static file extensions ->
    # static pool, everything else -> web pool.
    if (req.url ~ "^/api/") {
        set req.backend_hint = api_pool.backend();
    } elseif (req.url ~ "\.(css|js|jpg|png|gif|webp|svg|ico|woff2)$") {
        set req.backend_hint = static_pool.backend();
    } else {
        set req.backend_hint = web_pool.backend();
    }
}

8.5.2 基于域名的路由

vcl 4.1;

import directors;

# Shared health probe. It is declared before the backends because a
# `.probe = <name>` reference must point at an already-declared probe.
probe health_check {
    .url = "/health";
    .timeout = 2s;
    .interval = 5s;
    .window = 5;
    .threshold = 3;
}

backend blog01 {
    .host = "192.168.1.40";
    .port = "80";
    .probe = health_check;
}

backend shop01 {
    .host = "192.168.1.50";
    .port = "80";
    .probe = health_check;
}

backend main01 {
    .host = "192.168.1.10";
    .port = "80";
    .probe = health_check;
}

sub vcl_recv {
    # Route by Host header. VCL has no switch/case statement, so the
    # dispatch is written as an if/elseif chain.
    if (req.http.Host == "blog.example.com") {
        set req.backend_hint = blog01;
    } elseif (req.http.Host == "shop.example.com") {
        set req.backend_hint = shop01;
    } elseif (req.http.Host == "www.example.com") {
        set req.backend_hint = main01;
    } elseif (req.http.Host == "example.com") {
        # Redirect the bare domain to www. 750 is an internal marker status
        # that vcl_synth below turns into a real 301.
        return (synth(750, "https://www.example.com" + req.url));
    } else {
        return (synth(404, "Unknown host"));
    }
}

sub vcl_synth {
    # Convert the internal 750 marker into a permanent redirect; the target
    # URL was passed in as the synth reason.
    if (resp.status == 750) {
        set resp.http.Location = resp.reason;
        set resp.status = 301;
        return (deliver);
    }
}

8.5.3 灰度发布

vcl 4.1;

import directors;
# vmod_std provides std.random(), used for the percentage split.
import std;

# Shared health probe. It is declared before the backends because a
# `.probe = <name>` reference must point at an already-declared probe.
probe health_check {
    .url = "/health";
    .timeout = 2s;
    .interval = 5s;
    .window = 5;
    .threshold = 3;
}

backend stable {
    .host = "192.168.1.10";
    .port = "80";
    .probe = health_check;
}

backend canary {
    .host = "192.168.1.20";
    .port = "80";
    .probe = health_check;
}

sub vcl_recv {
    # Canary release: send roughly 10% of traffic to the new version.
    # The three methods run in order; a later match overrides an earlier one.

    # Method 1: random split. std.random(0, 100) returns a REAL in that
    # range, so "< 10" is true for about 10% of requests.
    if (std.random(0, 100) < 10) {
        set req.backend_hint = canary;
        set req.http.X-Backend-Version = "canary";
    } else {
        set req.backend_hint = stable;
        set req.http.X-Backend-Version = "stable";
    }

    # Method 2: pin specific user IDs to the canary.
    if (req.http.X-User-ID ~ "^(1000|1001|1002|1003|1004|1005|1006|1007|1008|1009)$") {
        set req.backend_hint = canary;
        set req.http.X-Backend-Version = "canary";
    }

    # Method 3: opt-in via a cookie flag.
    if (req.http.Cookie ~ "canary=true") {
        set req.backend_hint = canary;
        set req.http.X-Backend-Version = "canary";
    }
}

8.6 故障转移

8.6.1 自动故障转移

vcl 4.1;

import directors;

# Shared health probe. It is declared before the backends because a
# `.probe = <name>` reference must point at an already-declared probe.
probe health_check {
    .url = "/health";
    .timeout = 2s;
    .interval = 5s;
    .window = 5;
    .threshold = 3;
}

backend primary {
    .host = "192.168.1.10";
    .port = "80";
    .probe = health_check;
}

backend secondary {
    .host = "192.168.1.11";
    .port = "80";
    .probe = health_check;
}

backend tertiary {
    .host = "192.168.1.12";
    .port = "80";
    .probe = health_check;
}

sub vcl_init {
    # Priority order: primary, then secondary, then tertiary.
    new failover = directors.fallback();
    failover.add_backend(primary);
    failover.add_backend(secondary);
    failover.add_backend(tertiary);
}

sub vcl_recv {
    set req.backend_hint = failover.backend();
}

sub vcl_backend_fetch {
    # On a retry, step to the next backend explicitly. Asking the fallback
    # director again would return the same backend for as long as its probe
    # still reports it healthy. Backend subroutines see bereq.retries;
    # req.* variables are not available here.
    if (bereq.retries == 1) {
        set bereq.backend = secondary;
    } elseif (bereq.retries >= 2) {
        set bereq.backend = tertiary;
    }
}

sub vcl_backend_response {
    # A 5xx from the backend triggers a retry, at most 3 times, which
    # bounds the loop.
    if (beresp.status >= 500 && bereq.retries < 3) {
        return (retry);
    }

    # Record how many retries this fetch needed. The counter only exists on
    # the backend side, so it is handed to the client side via a header.
    if (bereq.retries > 0) {
        set beresp.http.X-Failover-Count = bereq.retries;
    }
}

sub vcl_deliver {
    # The header is stored with the cached object; drop it on cache hits so
    # it is only reported for the request that actually failed over.
    if (obj.hits > 0) {
        unset resp.http.X-Failover-Count;
    }
}

8.6.2 Grace 模式故障转移

# std.healthy() comes from vmod_std, which must be imported.
import std;

sub vcl_recv {
    # Check the health of the backend selected for this request.
    # NOTE: set req.backend_hint before this point if the default backend
    # is not the one to be checked.
    if (!std.healthy(req.backend_hint)) {
        # Backend is sick: accept much staler objects.
        set req.grace = 1h;
    } else {
        set req.grace = 30s;
    }
}

sub vcl_hit {
    # Object is fresh or still inside its grace window: deliver it. When
    # the backend is sick this serves stale content; when it is healthy the
    # stale object is delivered while a background fetch refreshes it.
    if (obj.ttl + obj.grace > 0s) {
        return (deliver);
    }

    # Outside the grace window: fetch synchronously.
    # NOTE(review): return (miss) from vcl_hit may not be accepted by newer
    # Varnish releases - confirm against the version in use.
    return (miss);
}

8.7 Director 监控

8.7.1 查看 Director 状态

# List backends and their health state. varnishadm has no `directors.list`
# command; `backend.list` is the command for inspecting backend state.
varnishadm backend.list

# Show detailed probe information for each backend
varnishadm backend.list -p

8.7.2 监控脚本

#!/bin/bash
# monitor-backends.sh - backend health monitor
#
# Every 60 seconds: print an alert line for each sick backend reported by
# `varnishadm backend.list`, then print the backend/fetch counters.

VARNISHADM="varnishadm"
VARNISHSTAT="varnishstat"

while true; do
    echo "=== Backend Status $(date) ==="

    # Scan the backend list. IFS= and -r keep each line exactly as printed.
    "$VARNISHADM" backend.list | while IFS= read -r line; do
        # Match "Sick" or "sick": the capitalisation differs between
        # Varnish versions.
        case "$line" in
            *[Ss]ick*)
                echo "ALERT: Backend is sick - $line"
                # Hook an alerting command in here, e.g.:
                # send_alert "Backend sick: $line"
                ;;
        esac
    done

    # Counters come from varnishstat; varnishadm has no `stats` command.
    # -1 prints the counters once and exits.
    echo "--- Statistics ---"
    "$VARNISHSTAT" -1 | grep -E "backend|fetch"

    sleep 60
done

8.8 注意事项

重要

  1. Director 是在 vcl_init 中初始化的,不能在运行时修改
  2. 健康检查会消耗后端资源,合理设置检查间隔
  3. 使用 fallback Director 时,注意后端的添加顺序(优先级)
  4. hash Director 的 key 选择直接影响负载分布
  5. 故障转移时注意避免请求循环
  6. 灰度发布时确保 canary 版本的健康检查端点正确

8.9 业务场景

场景一:读写分离架构

vcl 4.1;

import directors;

# Shared health probe. It is declared before the backends because a
# `.probe = <name>` reference must point at an already-declared probe.
probe health_check {
    .url = "/health";
    .timeout = 2s;
    .interval = 5s;
    .window = 5;
    .threshold = 3;
}

backend master {
    .host = "192.168.1.10";
    .port = "80";
    .probe = health_check;
}

backend slave01 {
    .host = "192.168.1.11";
    .port = "80";
    .probe = health_check;
}

backend slave02 {
    .host = "192.168.1.12";
    .port = "80";
    .probe = health_check;
}

sub vcl_init {
    # Read traffic is balanced round-robin across the replicas.
    new read_pool = directors.round_robin();
    read_pool.add_backend(slave01);
    read_pool.add_backend(slave02);
}

sub vcl_recv {
    # Write methods go to the master and bypass the cache.
    if (req.method == "POST" || req.method == "PUT" ||
        req.method == "DELETE" || req.method == "PATCH") {
        set req.backend_hint = master;
        return (pass);
    }

    # All other (read) requests go to the replica pool.
    set req.backend_hint = read_pool.backend();
}

场景二:多数据中心

vcl 4.1;

import directors;

# Shared health probe. It is declared before the backends because a
# `.probe = <name>` reference must point at an already-declared probe.
probe health_check {
    .url = "/health";
    .timeout = 3s;
    .interval = 10s;
    .window = 5;
    .threshold = 3;
}

backend dc1_web01 {
    .host = "10.0.1.10";
    .port = "80";
    .probe = health_check;
}

backend dc1_web02 {
    .host = "10.0.1.11";
    .port = "80";
    .probe = health_check;
}

backend dc2_web01 {
    .host = "10.0.2.10";
    .port = "80";
    .probe = health_check;
}

backend dc2_web02 {
    .host = "10.0.2.11";
    .port = "80";
    .probe = health_check;
}

sub vcl_init {
    # Local datacenter pool.
    new local_dc = directors.round_robin();
    local_dc.add_backend(dc1_web01);
    local_dc.add_backend(dc1_web02);

    # Remote datacenter pool (fallback target).
    new remote_dc = directors.round_robin();
    remote_dc.add_backend(dc2_web01);
    remote_dc.add_backend(dc2_web02);

    # Global failover: stack the two pools so the whole local pool is
    # preferred and the remote pool takes over only when no local backend
    # is healthy.
    new global = directors.fallback();
    global.add_backend(local_dc.backend());
    global.add_backend(remote_dc.backend());
}

sub vcl_recv {
    # Go through the global director so the request prefers the local
    # datacenter but can fail over to the remote one.
    set req.backend_hint = global.backend();
}

8.10 扩展阅读