Nagios 监控运维完整教程 / 第10章：通知升级与依赖

第10章：通知升级与依赖

合理的通知升级策略和依赖关系配置是企业级监控的核心。本章详细讲解通知升级机制、服务/主机依赖、维护时段管理以及高级自定义逻辑。

一、通知升级（Escalation）

1.1 主机通知升级

# Host escalation chain for web-server-01.
# NOTE: Nagios only treats '#' as a comment at the start of a line; inline
# comments must start with ';', otherwise the text becomes part of the value.

# Stage 1: notifications 1-3 go to first-line operations.
define hostescalation {
    host_name               web-server-01
    first_notification      1
    last_notification       3
    notification_interval   5
    contact_groups          l1-ops
    escalation_period       24x7
    escalation_options      d,u     ; DOWN and UNREACHABLE
}

# Stage 2: notifications 4-8 escalate to senior operations.
define hostescalation {
    host_name               web-server-01
    first_notification      4
    last_notification       8
    notification_interval   10
    contact_groups          l2-ops
    escalation_period       24x7
    escalation_options      d,u
}

# Stage 3: notification 9 and every later one go to management.
define hostescalation {
    host_name               web-server-01
    first_notification      9
    last_notification       0       ; 0 = keep using this escalation indefinitely
    notification_interval   30
    contact_groups          management
    escalation_period       24x7
    escalation_options      d,u
}

1.2 服务通知升级

# Tier 1 - notifications 1 through 5, repeated every 5 minutes, sent to web-ops.
define serviceescalation {
    host_name               web-server-01
    service_description     HTTP
    contact_groups          web-ops
    first_notification      1
    last_notification       5
    notification_interval   5
    escalation_options      w,u,c,r
    escalation_period       24x7
}

# Tier 2 - notifications 6 through 15, repeated every 10 minutes, sent to senior-ops.
define serviceescalation {
    host_name               web-server-01
    service_description     HTTP
    contact_groups          senior-ops
    first_notification      6
    last_notification       15
    notification_interval   10
    escalation_options      c,r
    escalation_period       24x7
}

# Tier 3 - notification 16 onwards, repeated every 30 minutes, sent to management.
# Only CRITICAL is listed, so this tier is not used for recovery notifications.
define serviceescalation {
    host_name               web-server-01
    service_description     HTTP
    contact_groups          management
    first_notification      16
    last_notification       0
    notification_interval   30
    escalation_options      c
    escalation_period       24x7
}

1.3 批量通知升级

# Bulk configuration: apply the escalation to the HTTP service of every host
# in the "webservers" host group.
define serviceescalation {
    hostgroup_name          webservers
    service_description     HTTP
    first_notification      1
    last_notification       5
    notification_interval   5
    contact_groups          web-ops
    escalation_options      w,u,c,r
}

define serviceescalation {
    hostgroup_name          webservers
    service_description     HTTP
    first_notification      6
    last_notification       0
    notification_interval   15
    contact_groups          senior-ops,management
    escalation_options      c
}

# Wildcard: escalate every service on web-server-01.
# The inline comment must start with ';'. With '#', the comment text would be
# parsed as part of the value and the wildcard would no longer be a bare '*'.
define serviceescalation {
    host_name               web-server-01
    service_description     *       ; matches all services on the host
    first_notification      10
    last_notification       0
    notification_interval   60
    contact_groups          management
    escalation_options      c
}

1.4 通知升级选项

escalation_options	含义
`w`	WARNING 状态时触发
`u`	UNKNOWN 状态时触发（服务升级）；在主机升级中 `u` 表示 UNREACHABLE
`c`	CRITICAL 状态时触发
`r`	RECOVERY 时触发
`d`	主机 DOWN 时触发
`n`	不触发（禁用升级）

1.5 通知升级流程图

服务变为 CRITICAL
│
├── 第1次通知 (0分钟)  ──→  L1 运维 (web-ops)
│   │
│   ├── 恢复 ──→ 发送 RECOVERY 通知给 web-ops
│   │
│   └── 5分钟后仍未恢复
│
├── 第2次通知 (5分钟)  ──→  L1 运维
├── 第3次通知 (10分钟) ──→  L1 运维
├── 第4次通知 (15分钟) ──→  L1 运维
├── 第5次通知 (20分钟) ──→  L1 运维
│
│   升级阈值达到
│
├── 第6次通知 (25分钟) ──→  L2 运维 (senior-ops)
├── 第7次通知 (35分钟) ──→  L2 运维
│   ...
├── 第15次通知          ──→  L2 运维
│
│   再次升级
│
├── 第16次通知          ──→  管理层 (management)
│   ...（每30分钟通知一次，直到恢复）
│
└── 恢复通知            ──→  最近一次问题通知所匹配的升级联系人（对应升级的 escalation_options 需包含 r，否则发给服务的默认联系人）

二、服务依赖

2.1 基本依赖定义

# The web service (dependent) relies on the database service (master).
define servicedependency {
    dependent_host_name             web-server-01
    dependent_service_description   HTTP
    host_name                       db-server-01
    service_description             MySQL
    inherits_parent                 1
    execution_failure_criteria      w,u,c
    notification_failure_criteria   w,u,c
}

# How to read this:
# while MySQL on db-server-01 is WARNING, UNKNOWN or CRITICAL,
# HTTP on web-server-01 is neither actively checked nor notified about.

2.2 依赖失败标准

标志	含义	影响
`o`	OK	依赖服务 OK 时不执行检查/不通知
`w`	WARNING	依赖服务 WARNING 时不执行/不通知
`u`	UNKNOWN	依赖服务 UNKNOWN 时不执行/不通知
`c`	CRITICAL	依赖服务 CRITICAL 时不执行/不通知
`n`	None	任何状态都不影响（等同于无依赖）

2.3 多层依赖链

# Multi-layer dependency chain.
# Each layer must be linked to the one below it, and every link after the
# first needs inherits_parent = 1. A service whose checks are skipped keeps
# its last known state (usually OK), so without inheritance the failure of
# the bottom layer would not propagate upwards.

# Storage layer: the disk check on the database host depends on the SAN array.
define servicedependency {
    host_name                       san-array-01
    service_description             RAID Status
    dependent_host_name             db-server-01
    dependent_service_description   Disk
    execution_failure_criteria      c
    notification_failure_criteria   c
}

# Link storage -> database: MySQL depends on the local disk.
define servicedependency {
    host_name                       db-server-01
    service_description             Disk
    dependent_host_name             db-server-01
    dependent_service_description   MySQL
    execution_failure_criteria      c
    notification_failure_criteria   c
    inherits_parent                 1
}

# Database layer: the application's connection check depends on MySQL.
define servicedependency {
    host_name                       db-server-01
    service_description             MySQL
    dependent_host_name             app-server-01
    dependent_service_description   MySQL Connection
    execution_failure_criteria      w,u,c
    notification_failure_criteria   w,u,c
    inherits_parent                 1
}

# Link database -> application: the application depends on its DB connection.
define servicedependency {
    host_name                       app-server-01
    service_description             MySQL Connection
    dependent_host_name             app-server-01
    dependent_service_description   Application
    execution_failure_criteria      w,u,c
    notification_failure_criteria   w,u,c
    inherits_parent                 1
}

# Application layer: the web login check depends on the application.
define servicedependency {
    host_name                       app-server-01
    service_description             Application
    dependent_host_name             web-server-01
    dependent_service_description   HTTP Login
    execution_failure_criteria      w,u,c
    notification_failure_criteria   w,u,c
    inherits_parent                 1
}

# Resulting chain when the SAN array goes CRITICAL:
#   RAID Status CRITICAL -> Disk check skipped
#                        -> MySQL check skipped
#                        -> MySQL Connection check skipped
#                        -> Application check skipped
#                        -> HTTP Login check skipped
# Only the SAN failure is notified, which avoids a cascade of alerts.

2.4 主机依赖

# While the gateway is DOWN or UNREACHABLE, skip checks and notifications
# for the internal host behind it.
define hostdependency {
    dependent_host_name             web-server-01
    host_name                       gateway
    inherits_parent                 1
    execution_failure_criteria      d,u
    notification_failure_criteria   d,u
}

# Bulk form: several dependent hosts share one master host.
define hostdependency {
    dependent_host_name             web-server-01,web-server-02,db-server-01
    host_name                       core-switch
    execution_failure_criteria      d,u
    notification_failure_criteria   d,u
}

2.5 依赖继承

# With inherits_parent = 1 a dependency also honours the dependencies of its
# master service, so failures propagate along the chain.
# Chain: A -> B -> C (B depends on A, C depends on B).
# When A fails, checks of both B and C are skipped.

# B depends on A.
define servicedependency {
    dependent_host_name             host-b
    dependent_service_description   Service B
    host_name                       host-a
    service_description             Service A
    inherits_parent                 1
    execution_failure_criteria      c
    notification_failure_criteria   c
}

# C depends on B.
define servicedependency {
    dependent_host_name             host-c
    dependent_service_description   Service C
    host_name                       host-b
    service_description             Service B
    inherits_parent                 1
    execution_failure_criteria      c
    notification_failure_criteria   c
}

三、维护时段（Downtime）

3.1 计划维护定义

# Schedule downtime by writing external commands to the Nagios command file
# (a named pipe read by the running Nagios daemon).
#
# Syntax:
#   SCHEDULE_SVC_DOWNTIME;<host>;<service>;<start>;<end>;<fixed>;<trigger_id>;<duration>;<author>;<comment>
#   SCHEDULE_HOST_DOWNTIME;<host>;<start>;<end>;<fixed>;<trigger_id>;<duration>;<author>;<comment>
#   SCHEDULE_HOST_SVC_DOWNTIME;<host>;<start>;<end>;<fixed>;<trigger_id>;<duration>;<author>;<comment>
#
# <start>/<end> are Unix timestamps. <duration> is in seconds and is only
# used for flexible downtime (<fixed> = 0).

CMD_FILE="/var/log/nagios/rw/nagios.cmd"
START=$(date +%s)
# Derive END from START so the window is exactly 2 hours (no second date call).
END=$((START + 2 * 3600))

# Fixed downtime: in effect exactly from START to END (2 hours).
printf '[%s] SCHEDULE_SVC_DOWNTIME;web-server-01;HTTP;%s;%s;1;0;0;admin;Planned maintenance\n' \
    "$START" "$START" "$END" >> "$CMD_FILE"

# Flexible downtime: <start>/<end> define the window in which the downtime may
# begin. It actually starts when the service enters a problem state inside
# that window and then lasts <duration> seconds (here 3600 = 1 hour).
# The window must be valid and in the future; 0;0 would be rejected by Nagios.
printf '[%s] SCHEDULE_SVC_DOWNTIME;web-server-01;HTTP;%s;%s;0;0;3600;admin;Flexible maintenance - 1 hour\n' \
    "$START" "$START" "$END" >> "$CMD_FILE"

# Host downtime (the host object only).
printf '[%s] SCHEDULE_HOST_DOWNTIME;web-server-01;%s;%s;1;0;0;admin;Host maintenance\n' \
    "$START" "$START" "$END" >> "$CMD_FILE"

# Host plus all of its services: SCHEDULE_HOST_SVC_DOWNTIME covers only the
# services, so the host itself needs its own SCHEDULE_HOST_DOWNTIME command.
printf '[%s] SCHEDULE_HOST_DOWNTIME;web-server-01;%s;%s;1;0;0;admin;Full maintenance\n' \
    "$START" "$START" "$END" >> "$CMD_FILE"
printf '[%s] SCHEDULE_HOST_SVC_DOWNTIME;web-server-01;%s;%s;1;0;0;admin;Full maintenance\n' \
    "$START" "$START" "$END" >> "$CMD_FILE"

3.2 维护脚本

#!/bin/bash
# schedule_maintenance.sh - schedule a fixed downtime window for every host in
# a host group, covering both the hosts and all of their services.
#
# Usage: schedule_maintenance.sh <hostgroup> <duration_minutes> [user] [comment]
#   hostgroup         exact hostgroup_name to look up in objects.cache
#   duration_minutes  length of the window in minutes (positive integer)
#   user              author recorded on the downtime (default: admin)
#   comment           downtime comment (default: "Scheduled maintenance")
#
# Environment overrides (optional): NAGIOS_CMD_FILE, NAGIOS_OBJECTS_CACHE.
# Exit status: 0 on success, 1 on usage, validation or lookup errors.

HOSTGROUP=$1
DURATION_MINUTES=$2
# Deliberately not called USER: that would clobber the shell's own $USER.
AUTHOR=${3:-admin}
COMMENT=${4:-Scheduled maintenance}

if [ -z "$HOSTGROUP" ] || [ -z "$DURATION_MINUTES" ]; then
    echo "Usage: $0 <hostgroup> <duration_minutes> [user] [comment]"
    exit 1
fi

# The duration must be a plain positive integer.
case "$DURATION_MINUTES" in
    *[!0-9]*)
        echo "Error: duration_minutes must be a positive integer" >&2
        exit 1
        ;;
esac
# 10# forces base 10 so values such as "08" are not parsed as octal.
DURATION_SECONDS=$((10#$DURATION_MINUTES * 60))
if [ "$DURATION_SECONDS" -le 0 ]; then
    echo "Error: duration_minutes must be a positive integer" >&2
    exit 1
fi

# ';' separates external-command fields and a newline ends the command, so
# neither may appear in free-text fields.
case "${AUTHOR}${COMMENT}" in
    *';'*|*$'\n'*)
        echo "Error: user and comment must not contain ';' or newlines" >&2
        exit 1
        ;;
esac

CMD_FILE=${NAGIOS_CMD_FILE:-/var/log/nagios/rw/nagios.cmd}
OBJECTS_CACHE=${NAGIOS_OBJECTS_CACHE:-/var/log/nagios/objects.cache}

if [ ! -r "$OBJECTS_CACHE" ]; then
    echo "Error: cannot read object cache '$OBJECTS_CACHE'" >&2
    exit 1
fi

# The command file is a FIFO created by the running daemon. Appending to a
# missing path would silently create a regular file instead.
if [ ! -p "$CMD_FILE" ] || [ ! -w "$CMD_FILE" ]; then
    echo "Error: command file '$CMD_FILE' is not a writable named pipe (is Nagios running?)" >&2
    exit 1
fi

START=$(date +%s)
END=$((START + DURATION_SECONDS))

# Look up the group's members in objects.cache. The cache stores
# whitespace-separated "define hostgroup { ... }" blocks (not key=value
# pairs), so parse block by block and require an exact name match.
HOSTS=$(awk -v group="$HOSTGROUP" '
    /^[ \t]*define[ \t]+hostgroup[ \t]*\{/ { in_group = 1; name = ""; members = ""; next }
    in_group && /^[ \t]*hostgroup_name[ \t]/ { sub(/^[ \t]*hostgroup_name[ \t]+/, ""); sub(/[ \t\r]+$/, ""); name = $0; next }
    in_group && /^[ \t]*members[ \t]/        { sub(/^[ \t]*members[ \t]+/, "");        sub(/[ \t\r]+$/, ""); members = $0; next }
    in_group && /^[ \t]*\}/ {
        if (name == group) { gsub(/[ \t]*,[ \t]*/, " ", members); print members; exit }
        in_group = 0
    }
' "$OBJECTS_CACHE")

if [ -z "$HOSTS" ]; then
    echo "Error: No hosts found in hostgroup '$HOSTGROUP'"
    exit 1
fi

COUNT=0
for HOST in $HOSTS; do
    # SCHEDULE_HOST_SVC_DOWNTIME only covers the services, so the host itself
    # gets a separate SCHEDULE_HOST_DOWNTIME command.
    printf '[%s] SCHEDULE_HOST_DOWNTIME;%s;%s;%s;1;0;0;%s;%s\n' \
        "$START" "$HOST" "$START" "$END" "$AUTHOR" "$COMMENT" >> "$CMD_FILE"
    printf '[%s] SCHEDULE_HOST_SVC_DOWNTIME;%s;%s;%s;1;0;0;%s;%s\n' \
        "$START" "$HOST" "$START" "$END" "$AUTHOR" "$COMMENT" >> "$CMD_FILE"
    echo "Scheduled downtime for: $HOST ($DURATION_MINUTES minutes)"
    COUNT=$((COUNT + 1))
done

echo "Maintenance scheduled for $COUNT hosts."

3.3 维护对通知的影响

场景	通知行为	配置
维护期间故障	不发送问题通知（被抑制）	默认行为，无需额外配置
维护开始/结束/取消	发送 DOWNTIMESTART / DOWNTIMEEND / DOWNTIMECANCELLED 通知	主机/服务的 `notification_options` 以及联系人的通知选项中包含 `s`
维护结束后故障	通知（如果仍在故障）	自动触发
维护前已故障	维护期间暂停通知	默认行为

四、自定义逻辑

4.1 事件处理器

# An event handler is a command Nagios runs automatically when the state of
# the service changes.
define service {
    use                     generic-service
    host_name               web-server-01
    service_description     HTTP
    check_command           check_http
    event_handler_enabled   1
    event_handler           restart-httpd
}

# Command behind the event handler; the script receives the service state
# and the state type as its two arguments.
define command {
    command_name    restart-httpd
    command_line    /usr/local/nagios/libexec/event_handlers/restart_httpd.sh $SERVICESTATE$ $SERVICESTATETYPE$
}

4.2 事件处理脚本

#!/bin/bash
# restart_httpd.sh - event handler that restarts Apache automatically.
#
# Arguments:
#   $1  service state
#   $2  service state type

LOG="/var/log/nagios/event_handlers.log"
svc_state=$1
svc_state_type=$2

# Append one timestamped line to the event handler log.
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG"
}

log "HTTP state: $svc_state/$svc_state_type"

# Act only on a hard CRITICAL state; everything else is just logged.
if [ "$svc_state" != "CRITICAL" ] || [ "$svc_state_type" != "HARD" ]; then
    exit 0
fi

log "Attempting httpd restart"
sudo systemctl restart httpd
# Give the service a moment to come up before checking it.
sleep 5

if systemctl is-active --quiet httpd; then
    outcome="httpd restarted successfully"
else
    outcome="httpd restart FAILED"
fi
log "$outcome"

4.3 检查新鲜度

# Freshness checking makes sure passive check results do not go stale.
# When no passive result has arrived within freshness_threshold seconds,
# Nagios runs check_command itself. Here that is check_dummy returning
# state 2 (CRITICAL) with the text "No heartbeat received".
# Inline comments must start with ';' ('#' is only a comment at line start).
define service {
    use                     generic-service
    host_name               external-system
    service_description     Heartbeat
    passive_checks_enabled  1
    active_checks_enabled   0
    check_freshness         1
    freshness_threshold     300    ; seconds (5 minutes) without a passive result
    check_command           check_dummy!2!"No heartbeat received"
    max_check_attempts      1
}

五、业务场景

5.1 电商平台通知策略

# Core trading path: fast escalation.
# Timeline with these intervals (first notification at minute 0, the default
# interval_length of 60 seconds):
#   L1  trade-ops        notifications 1-3  at minutes 0, 2, 4
#   L2  trade-senior     notifications 4-8  at minutes 6, 9, 12, 15, 18
#   L3  trade-lead, cto  notification 9 at minute 21, then every 5 minutes

define serviceescalation {
    hostgroup_name          trade-system
    service_description     *
    first_notification      1
    last_notification       3
    notification_interval   2
    contact_groups          trade-ops
    escalation_options      c,r
}

define serviceescalation {
    hostgroup_name          trade-system
    service_description     *
    first_notification      4
    last_notification       8
    notification_interval   3
    contact_groups          trade-senior
    escalation_options      c,r
}

# Only CRITICAL is listed, so this tier is not used for recovery notifications.
define serviceescalation {
    hostgroup_name          trade-system
    service_description     *
    first_notification      9
    last_notification       0
    notification_interval   5
    contact_groups          trade-lead,cto
    escalation_options      c
}

5.2 分时段通知策略

# Working hours: notify web-ops every 5 minutes for all problem states
# and recoveries.
define serviceescalation {
    host_name               web-server-01
    service_description     HTTP
    escalation_period       workhours
    contact_groups          web-ops
    first_notification      1
    last_notification       0
    notification_interval   5
    escalation_options      w,u,c,r
}

# Outside working hours: notify the on-call group every 30 minutes, and only
# for CRITICAL states and recoveries.
define serviceescalation {
    host_name               web-server-01
    service_description     HTTP
    escalation_period       nonworkhours
    contact_groups          on-call-ops
    first_notification      1
    last_notification       0
    notification_interval   30
    escalation_options      c,r
}

六、注意事项

注意事项	说明
升级范围	避免升级到不必要的人员
通知频率	过于频繁的通知会导致告警疲劳
依赖关系	正确配置避免假阳性告警
维护窗口	与变更管理流程配合
RECOVERY 通知	确保恢复通知也升级到相关人员
测试	升级策略需要定期测试验证

七、本章小结

通知升级实现分层响应，确保问题得到及时处理
服务依赖避免级联告警，减少告警噪音
维护时段管理计划停机，避免误告警
事件处理器实现自动故障修复
业务场景决定通知策略设计

下一章：第11章：性能数据与可视化 - 学习性能数据采集和图表展示。