强曰为道

与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧。
文档目录

第 11 章:监控与日志分析

第 11 章:监控与日志分析

你无法管理你看不到的东西——监控是邮件服务器运维的眼睛。


11.1 邮件队列管理

11.1.1 查看队列状态

# Show the mail queue
mailq

# List the queue through postqueue (Postfix documents this as the same
# listing that mailq prints)
postqueue -p

# Queue statistics by domain and age
# NOTE(review): with no argument qshape is documented to examine the
# incoming and active queues — confirm for your Postfix version
qshape

# Disk usage of each queue directory
du -sh /var/spool/postfix/{active,deferred,corrupt,hold,maildrop,incoming}

# Count the queue files in one specific queue (here: deferred)
find /var/spool/postfix/deferred -type f | wc -l

11.1.2 队列操作命令

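| 命令 | 作用 |
| --- | --- |
| mailq | 显示队列中的邮件 |
| postqueue -p | 显示队列中的邮件(与 mailq 输出相同) |
| postqueue -f | 刷新队列(尝试立即发送) |
| postqueue -s domain | 刷新特定域名的队列 |
| postsuper -d ID | 删除特定邮件 |
| postsuper -d ALL | 删除所有邮件(谨慎使用) |
| postsuper -h ID | 暂停特定邮件 |
| postsuper -H ID | 恢复特定邮件 |
| postcat -q ID | 查看邮件内容 |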

11.1.3 队列管理脚本

#!/bin/bash
# mail-queue-status.sh — Postfix mail queue status report
#
# Prints per-queue message counts, the size of the queue directories, the five
# oldest deferred queue files and the qshape breakdown of the deferred queue.
# Errors from find/du/qshape are discarded, so an unreadable spool shows up as
# zero counts — run it as a user that can read the spool (presumably root).
#
# Environment (optional):
#   QUEUE_DIR           Postfix spool directory (default: the value of
#                       `postconf -h queue_directory`, else /var/spool/postfix)
#   DEFERRED_THRESHOLD  deferred-queue size that triggers the warning
#                       (default: 1000)

QUEUE_DIR="${QUEUE_DIR:-$(postconf -h queue_directory 2>/dev/null)}"
QUEUE_DIR="${QUEUE_DIR:-/var/spool/postfix}"
DEFERRED_THRESHOLD="${DEFERRED_THRESHOLD:-1000}"

# Print the number of queue files below queue $1 (e.g. "deferred").
# A missing or unreadable queue directory counts as 0.
count_queue() {
    find "$QUEUE_DIR/$1" -type f 2>/dev/null | wc -l
}

echo "=== 邮件队列状态 $(date) ==="
echo ""

# Queue counters. The total is active + deferred, derived from the two counts
# instead of scanning both directories a second time; hold is listed separately.
ACTIVE=$(count_queue active)
DEFERRED=$(count_queue deferred)
HOLD=$(count_queue hold)
TOTAL=$((ACTIVE + DEFERRED))

echo "队列统计:"
echo "  总计: $TOTAL"
echo "  活跃: $ACTIVE"
echo "  延迟: $DEFERRED"
echo "  暂停: $HOLD"
echo ""

# Alert threshold
if (( DEFERRED > DEFERRED_THRESHOLD )); then
    echo "⚠️ 警告:延迟队列超过 ${DEFERRED_THRESHOLD} 封!"
fi

# Disk usage of the queue directories
echo "队列目录大小:"
du -sh "$QUEUE_DIR"/{active,deferred,hold} 2>/dev/null
echo ""

# Oldest deferred queue files, sorted by modification time
echo "最旧的延迟邮件:"
find "$QUEUE_DIR/deferred" -type f -printf '%T+ %p\n' 2>/dev/null | sort | head -5
echo ""

# Destination-domain distribution of the deferred queue
echo "延迟邮件目标域名 TOP 10:"
qshape deferred 2>/dev/null | head -12

11.1.4 队列清理策略

# /etc/postfix/main.cf — queue settings

# How often the queue manager scans the deferred queue
queue_run_delay = 300s

# Minimum time between delivery attempts for a deferred message
# (a retry back-off, not a bounce interval)
minimal_backoff_time = 300s

# Maximum time between delivery attempts for a deferred message
maximal_backoff_time = 4000s

# How long a message may stay queued before it is returned as undeliverable
maximal_queue_lifetime = 5d

# How long a bounce message may stay queued before it is considered
# undeliverable
bounce_queue_lifetime = 2d

11.2 日志分析

11.2.1 Postfix 日志格式

# 日志文件位置
/var/log/mail.log          # Debian/Ubuntu
/var/log/maillog           # RHEL/CentOS

# 日志示例
May 10 10:15:23 mail postfix/smtpd[12345]: connect from unknown[203.0.113.50]
May 10 10:15:24 mail postfix/smtpd[12345]: 1A2B3C4D5E: client=unknown[203.0.113.50]
May 10 10:15:24 mail postfix/cleanup[12346]: 1A2B3C4D5E: message-id=<20240510101524.1A2B3C4D5E@mail.example.org>
May 10 10:15:24 mail postfix/qmgr[12347]: 1A2B3C4D5E: from=<sender@example.org>, size=1234, nrcpt=1
May 10 10:15:25 mail postfix/smtp[12348]: 1A2B3C4D5E: to=<user@example.com>, relay=mail.example.com[203.0.113.10]:25, delay=2, status=sent (250 OK)
May 10 10:15:25 mail postfix/qmgr[12347]: 1A2B3C4D5E: removed

11.2.2 常用日志分析命令

# Today's log entries. Traditional syslog timestamps pad single-digit days
# with a space ("May  5"), which is what %e prints; %d would print "May 05"
# and match nothing. LC_ALL=C keeps the month abbreviation in English.
grep "^$(LC_ALL=C date '+%b %e') " /var/log/mail.log

# Delivery status counts. The status is extracted explicitly: the last field
# of such a line is the tail of the remote server's reply, not the status.
grep -o 'status=[a-z]*' /var/log/mail.log | sort | uniq -c | sort -rn

# Failed deliveries (rejections are not logged as a status= value; they are
# covered by the "reject:" search further down)
grep -E 'status=(bounced|deferred|expired)' /var/log/mail.log

# Full history of one queue ID
grep "1A2B3C4D5E" /var/log/mail.log

# Top 10 connecting clients. The leading ": " keeps "disconnect from" lines
# out of the result.
grep ": connect from" /var/log/mail.log | awk '{print $NF}' | sort | uniq -c | sort -rn | head -10

# Rejected connections / messages
grep "reject:" /var/log/mail.log

# Recent TLS activity
grep "TLS" /var/log/mail.log | tail -20

# Authentication failures
grep "auth failed\|authentication failed" /var/log/mail.log

# Messages logged per hour. Field 3 is the time of day (field 2 is the day of
# the month); month and day are kept so hours of different days stay apart.
awk '{print $1, $2, substr($3, 1, 2)}' /var/log/mail.log | uniq -c

11.2.3 日志分析脚本

#!/bin/bash
# mail-log-stats.sh — Postfix log statistics for the current day
#
# Usage: mail-log-stats.sh
#
# Environment (optional):
#   LOG_FILE  mail log to analyse (default: /var/log/mail.log)
#
# Every figure, including the TOP 5 lists, is restricted to today's entries so
# that it matches the date printed in the report header.
# Exit status: 0 on success, 1 when the log file cannot be read.

LOG_FILE="${LOG_FILE:-/var/log/mail.log}"
# Traditional syslog timestamps pad single-digit days with a space ("May  5"),
# which is what %e prints; %d would give "May 05" and match nothing during the
# first nine days of a month. LC_ALL=C keeps the month abbreviation English.
DATE=$(LC_ALL=C date '+%b %e')

# Print today's log lines (those whose timestamp starts with $DATE).
today_lines() {
    grep -- "^${DATE} " "$LOG_FILE"
}

# Print how many of today's lines contain the fixed string $1.
count_today() {
    today_lines | grep -cF -- "$1"
}

# stdin: arbitrary values, one per line; stdout: the five most frequent ones.
top5() {
    sort | uniq -c | sort -rn | head -5
}

# stdin: log lines; stdout: the domain of each recipient address.
# The leading space keeps orig_to=<...> from being counted as a recipient.
rcpt_domains() {
    grep -oE ' to=<[^>]+>' | sed -n 's/^ to=<[^>]*@\([^>]*\)>$/\1/p'
}

# stdin: log lines; stdout: the DSN code of every bounced delivery.
# The pattern expects the code unquoted, e.g. "dsn=5.1.1".
bounce_dsn_codes() {
    grep -F 'status=bounced' | grep -oE 'dsn=[0-9]+\.[0-9]+\.[0-9]+'
}

# Print the report to stdout.
main() {
    if [[ ! -r "$LOG_FILE" ]]; then
        echo "错误:无法读取日志文件 $LOG_FILE" >&2
        return 1
    fi

    echo "=== 邮件日志统计 ($DATE) ==="
    echo ""

    # Connections; the leading ": " excludes "disconnect from" lines
    echo "连接数: $(count_today ': connect from')"

    # Delivery results. Rejections are counted from the "reject:" lines because
    # no delivery line carries "status=reject".
    echo "发送成功: $(count_today 'status=sent')"
    echo "退信: $(count_today 'status=bounced')"
    echo "延迟: $(count_today 'status=deferred')"
    echo "拒绝: $(count_today 'reject:')"
    echo ""

    echo "退信原因 TOP 5:"
    today_lines | bounce_dsn_codes | top5
    echo ""

    echo "被拒绝的域名 TOP 5:"
    today_lines | grep -F 'reject:' | rcpt_domains | top5
    echo ""

    echo "发送量最大的域名 TOP 5:"
    today_lines | grep -F 'status=sent' | rcpt_domains | top5
}

main "$@"

11.3 系统监控

11.3.1 Postfix 服务监控

# Postfix service status
sudo systemctl status postfix

# Core Postfix daemons. Selecting by exact command name avoids listing the
# grep process itself or unrelated commands that merely contain "master".
ps -C master,smtpd,qmgr,cleanup -o pid,user,etime,args

# Listening SMTP ports. The trailing whitespace anchor stops ":25" from also
# matching ports such as :2525.
sudo ss -tlnp | grep -E ':(25|587|465)[[:space:]]'

# Validate the Postfix configuration and file permissions
sudo postfix check

11.3.2 健康检查脚本

#!/bin/bash
# mail-health-check.sh — mail server health check
#
# Runs eight checks (Postfix, Dovecot, listening ports, disk, deferred queue,
# TLS certificate, today's log errors, memory) and prints one line per result.
#
# Environment (optional):
#   CERT_FILE  certificate to check (default: the Let's Encrypt path below)
#   LOG_FILE   mail log to scan     (default: /var/log/mail.log)
#
# Exit status: 0 = all checks passed, 1 = warnings only, 2 = at least one error.
#
# NOTE(review): the spool, the certificate and the mail log are presumably
# readable by root only — run the script with sufficient privileges.

CERT_FILE="${CERT_FILE:-/etc/letsencrypt/live/mail.example.com/cert.pem}"
LOG_FILE="${LOG_FILE:-/var/log/mail.log}"

ERRORS=0
WARNINGS=0

# Result reporters: print one result line and bump the matching counter.
ok()   { echo "  ✅ $1"; }
warn() { echo "  ⚠️ $1"; WARNINGS=$((WARNINGS + 1)); }
fail() { echo "  ❌ $1"; ERRORS=$((ERRORS + 1)); }

# Succeed when $1 is a non-negative integer; guards the numeric comparisons
# against empty or unparsable tool output.
is_uint() { [[ $1 =~ ^[0-9]+$ ]]; }

echo "=== 邮件服务器健康检查 $(date) ==="
echo ""

# 1. Postfix service
echo "[1/8] 检查 Postfix 服务..."
if systemctl is-active --quiet postfix; then
    ok "Postfix 运行正常"
else
    fail "Postfix 未运行!"
fi

# 2. Dovecot service
echo "[2/8] 检查 Dovecot 服务..."
if systemctl is-active --quiet dovecot; then
    ok "Dovecot 运行正常"
else
    fail "Dovecot 未运行!"
fi

# 3. Listening ports (socket list fetched once, then searched per port)
echo "[3/8] 检查端口监听..."
LISTENING=$(ss -tln 2>/dev/null)
for port in 25 587 993; do
    if grep -q ":$port " <<<"$LISTENING"; then
        ok "端口 $port 监听中"
    else
        fail "端口 $port 未监听!"
    fi
done

# 4. Disk usage of /. df -P keeps each filesystem on a single line even when
# the device name is long.
echo "[4/8] 检查磁盘空间..."
DISK_USAGE=$(df -P / 2>/dev/null | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
if ! is_uint "$DISK_USAGE"; then
    fail "无法获取磁盘使用率!"
elif (( DISK_USAGE < 80 )); then
    ok "磁盘使用率: ${DISK_USAGE}%"
elif (( DISK_USAGE < 90 )); then
    warn "磁盘使用率较高: ${DISK_USAGE}%"
else
    fail "磁盘空间严重不足: ${DISK_USAGE}%!"
fi

# 5. Deferred queue size
echo "[5/8] 检查邮件队列..."
QUEUE_SIZE=$(find /var/spool/postfix/deferred -type f 2>/dev/null | wc -l)
if (( QUEUE_SIZE < 100 )); then
    ok "延迟队列: $QUEUE_SIZE 封"
elif (( QUEUE_SIZE < 1000 )); then
    warn "延迟队列较大: $QUEUE_SIZE 封"
else
    fail "延迟队列过大: $QUEUE_SIZE 封!"
fi

# 6. TLS certificate expiry. A certificate that cannot be parsed is reported
# as such instead of being turned into a bogus day count.
echo "[6/8] 检查 TLS 证书..."
if [[ -f "$CERT_FILE" ]]; then
    EXPIRY=$(openssl x509 -enddate -noout -in "$CERT_FILE" 2>/dev/null | cut -d= -f2)
    if [[ -n "$EXPIRY" ]] && EXPIRY_EPOCH=$(date -d "$EXPIRY" +%s 2>/dev/null); then
        NOW_EPOCH=$(date +%s)
        DAYS_LEFT=$(( (EXPIRY_EPOCH - NOW_EPOCH) / 86400 ))

        if (( DAYS_LEFT > 30 )); then
            ok "证书剩余 $DAYS_LEFT 天"
        elif (( DAYS_LEFT > 7 )); then
            warn "证书即将过期: 剩余 $DAYS_LEFT 天"
        else
            fail "证书即将过期: 剩余 $DAYS_LEFT 天!"
        fi
    else
        fail "无法解析证书有效期!"
    fi
else
    fail "证书文件不存在!"
fi

# 7. Errors logged today. Traditional syslog timestamps pad single-digit days
# with a space ("May  5"), which is what %e prints; LC_ALL=C keeps the month
# abbreviation English. An unreadable log is a warning, not a silent pass.
echo "[7/8] 检查最近日志错误..."
TODAY=$(LC_ALL=C date '+%b %e')
if [[ -r "$LOG_FILE" ]]; then
    RECENT_ERRORS=$(grep -- "^${TODAY} " "$LOG_FILE" | grep -ciE 'error|fatal|panic')
    if (( RECENT_ERRORS == 0 )); then
        ok "今日无严重错误"
    elif (( RECENT_ERRORS < 10 )); then
        warn "今日有 $RECENT_ERRORS 条错误"
    else
        fail "今日有 $RECENT_ERRORS 条错误!"
    fi
else
    warn "无法读取日志文件: $LOG_FILE"
fi

# 8. Memory usage (used / total). LC_ALL=C because free translates the
# "Mem:" row label in other locales.
echo "[8/8] 检查内存使用..."
MEM_USAGE=$(LC_ALL=C free 2>/dev/null | awk '/^Mem:/ { printf "%.0f", $3 / $2 * 100 }')
if ! is_uint "$MEM_USAGE"; then
    fail "无法获取内存使用率!"
elif (( MEM_USAGE < 80 )); then
    ok "内存使用率: ${MEM_USAGE}%"
elif (( MEM_USAGE < 90 )); then
    warn "内存使用率较高: ${MEM_USAGE}%"
else
    fail "内存使用率过高: ${MEM_USAGE}%!"
fi

echo ""
echo "=== 检查完成 ==="
echo "错误: $ERRORS | 警告: $WARNINGS"

if (( ERRORS > 0 )); then
    exit 2
elif (( WARNINGS > 0 )); then
    exit 1
else
    exit 0
fi

11.4 Prometheus 监控集成

11.4.1 安装 Postfix Exporter

# Download postfix_exporter
wget https://github.com/kumina/postfix_exporter/releases/download/0.3.0/postfix_exporter-0.3.0.linux-amd64.tar.gz
tar xzf postfix_exporter-0.3.0.linux-amd64.tar.gz
sudo mv postfix_exporter-0.3.0.linux-amd64/postfix_exporter /usr/local/bin/

# Create the systemd service.
# The exporter is pointed at Postfix's showq socket and at the mail log
# instead of an HTTP URL.
# NOTE(review): flag names follow the upstream README — confirm them with
# `postfix_exporter --help` for the version you install.
# tee's copy of the unit file to stdout is discarded.
sudo tee /etc/systemd/system/postfix_exporter.service > /dev/null << 'EOF'
[Unit]
Description=Postfix Exporter
After=network.target

[Service]
Type=simple
User=root
ExecStart=/usr/local/bin/postfix_exporter \
    --postfix.showq_path=/var/spool/postfix/public/showq \
    --postfix.logfile_path=/var/log/mail.log \
    --web.listen-address=:9154
Restart=always

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable --now postfix_exporter

11.4.2 配置 Prometheus

# /etc/prometheus/prometheus.yml — add the Postfix scrape jobs

scrape_configs:
  # postfix_exporter; 9154 is the port passed to --web.listen-address
  - job_name: 'postfix'
    static_configs:
      - targets: ['localhost:9154']
  
  # Host metrics (presumably node_exporter on its usual port — confirm)
  - job_name: 'node'
    static_configs:
      - targets: ['localhost:9100']

11.4.3 Postfix Exporter 指标

| 指标 | 说明 |
| --- | --- |
| postfix_showq_messages | 队列中的邮件数量 |
| postfix_showq_message_size_bytes | 邮件大小分布 |
| postfix_smtpd_connects_total | SMTP 连接总数 |
| postfix_smtpd_disconnects_total | SMTP 断开总数 |
| postfix_smtp_sent_total | 发送邮件总数 |
| postfix_smtp_deferred_total | 延迟邮件总数 |
| postfix_cleanup_messages_total | cleanup 进程处理的邮件总数 |

11.4.4 Grafana 仪表板

{
  "dashboard": {
    "title": "Postfix Mail Server",
    "panels": [
      {
        "title": "Queue Size",
        "type": "stat",
        "targets": [{
          "expr": "postfix_showq_messages"
        }]
      },
      {
        "title": "SMTP Connections",
        "type": "graph",
        "targets": [{
          "expr": "rate(postfix_smtpd_connects_total[5m])"
        }]
      },
      {
        "title": "Delivery Rate",
        "type": "graph",
        "targets": [{
          "expr": "rate(postfix_smtp_sent_total[5m])"
        }]
      },
      {
        "title": "Deferred Rate",
        "type": "graph",
        "targets": [{
          "expr": "rate(postfix_smtp_deferred_total[5m])"
        }]
      }
    ]
  }
}

11.5 告警配置

11.5.1 告警规则示例

# /etc/prometheus/rules/postfix.yml

groups:
  - name: postfix_alerts
    rules:
      # Queue backlog: more than 1000 queued messages for 5 minutes
      - alert: PostfixQueueBacklog
        expr: postfix_showq_messages > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "邮件队列积压"
          description: "邮件队列中积压了 {{ $value }} 封邮件"
      
      # Scrape target down: the 'postfix' job has been unreachable for 1 minute
      # (this fires when the exporter cannot be scraped)
      - alert: PostfixDown
        expr: up{job="postfix"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Postfix 服务宕机"
      
      # Low disk space: less than 20% of the root filesystem available
      - alert: MailDiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 20
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘空间不足"
          description: "磁盘剩余空间: {{ $value }}%"
      
      # Certificate expiry: fewer than 30 days left
      # NOTE(review): probe_ssl_earliest_cert_expiry presumably comes from the
      # Blackbox Exporter, which needs its own scrape job — confirm
      - alert: CertificateExpiringSoon
        expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 30
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "TLS 证书即将过期"
          description: "证书将在 {{ $value }} 天后过期"

11.5.2 邮件告警通知

#!/bin/bash
# mail-alert.sh — send an alert e-mail
#
# Usage: mail-alert.sh "<alert text>"
#
# Environment (optional):
#   ALERT_TO       recipient address (default: admin@example.com — replace it
#                  with the real operator mailbox)
#   ALERT_SUBJECT  subject line
#
# Exit status: 2 when no alert text is given, otherwise the status of mail.
#
# NOTE(review): the alert is sent with the local `mail` command; if that relies
# on the mail server being monitored, an outage there may also block the alert.

ALERT_TO="${ALERT_TO:-admin@example.com}"
ALERT_SUBJECT="${ALERT_SUBJECT:-邮件服务器告警}"
ALERT_BODY="${1:-}"

# Send $1 as the body of an alert mail; refuse to send an empty alert.
send_alert() {
    local body=$1
    if [[ -z "$body" ]]; then
        echo "用法: ${0##*/} <告警内容>" >&2
        return 2
    fi
    printf '%s\n' "$body" | mail -s "$ALERT_SUBJECT" "$ALERT_TO"
}

send_alert "$ALERT_BODY"

11.5.3 日志轮转配置

# /etc/logrotate.d/postfix
#
# Rotates /var/log/mail.log daily, keeps 30 rotations, and compresses each
# rotated file one cycle late (delaycompress).
#
# NOTE(review): on Debian/Ubuntu the stock /etc/logrotate.d/rsyslog may already
# list /var/log/mail.log, and logrotate complains about a log file that is
# configured twice — confirm there is no existing entry before installing this.
# NOTE(review): "create 0640 root adm" sets owner and group of the new log;
# confirm they match what the syslog daemon on this system needs.

/var/log/mail.log {
    daily
    rotate 30
    compress
    delaycompress
    missingok
    notifempty
    create 0640 root adm
    sharedscripts
    # After rotation, run the rsyslog helper (presumably so rsyslog reopens
    # its log files — confirm the path exists on this distribution)
    postrotate
        /usr/lib/rsyslog/rsyslog-rotate
    endscript
}

11.6 业务场景:生产环境监控方案

场景描述

一家中型企业需要全面的邮件服务器监控:

  • 实时监控队列状态
  • 告警通知(邮件 + 企业微信/钉钉)
  • 历史趋势分析
  • 自动化运维

监控架构

┌─────────────────────────────────────────┐
│              监控面板 (Grafana)            │
└─────────────┬───────────────────────────┘
              │
┌─────────────▼───────────────────────────┐
│           Prometheus + Alertmanager       │
└───────┬─────────┬─────────┬─────────────┘
        │         │         │
┌───────▼──┐ ┌────▼────┐ ┌─▼───────────┐
│Postfix   │ │Node     │ │Blackbox     │
│Exporter  │ │Exporter │ │Exporter     │
└──────────┘ └─────────┘ └─────────────┘

11.7 注意事项

⚠️ 日志安全

  • 邮件日志可能包含敏感信息(发件人、收件人)
  • 限制日志文件权限
  • 定期清理旧日志

⚠️ 监控性能

  • Prometheus 指标采集可能影响性能
  • 合理设置采集间隔
  • 避免过多的告警规则

💡 日志分析建议

  • 使用 ELK (Elasticsearch + Logstash + Kibana) 进行大规模日志分析
  • 定期生成日志报告
  • 建立基线,便于异常检测

11.8 扩展阅读


上一章:← 第 10 章:Webmail 与邮件客户端集成 | 下一章:第 12 章:安全加固 →