Chapter 11: Log Analysis
Logs are a system's "black box." Master log analysis and you can quickly pin down the root cause when something goes wrong.
11.1 Common Log Formats
Nginx/Apache access logs
192.168.1.1 - - [15/Jan/2024:10:23:45 +0800] "GET /index.html HTTP/1.1" 200 1234 "https://example.com" "Mozilla/5.0"
| Field | Position | Description |
|---|---|---|
| Client IP | $1 | Client address |
| Identity | $2 | Usually - |
| Authenticated user | $3 | Usually - |
| Timestamp | $4 $5 | [15/Jan/2024:10:23:45 +0800] |
| Request method | $6 | "GET (opening quote attached) |
| Request path | $7 | /index.html |
| Protocol version | $8 | HTTP/1.1" (closing quote attached) |
| Status code | $9 | 200 |
| Response size | $10 | 1234 |
| Referrer | $11 | "https://example.com" |
| User agent | $12+ | "Mozilla/5.0" |
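Because awk splits on whitespace by default, the quotes around the request stay attached to $6 and $8, as the table shows. A minimal sketch of stripping them before printing, assuming the combined format above:
# Print client IP, request method (quote stripped), path, and status code
$ awk '{ gsub(/"/, "", $6); print $1, $6, $7, $9 }' access.log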
Syslog format
Jan 15 10:23:45 hostname sshd[12345]: Accepted password for user from 192.168.1.1 port 22 ssh2
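Under this traditional syslog layout, the timestamp occupies $1-$3, the host is $4, and the program name plus PID arrive combined in $5. A minimal sketch of counting entries per program (the path /var/log/syslog is an assumption; some systems use /var/log/messages instead):
# Count entries per program; "sshd[12345]:" is reduced to "sshd"
$ awk '{ prog = $5; sub(/\[.*/, "", prog); sub(/:$/, "", prog); count[prog]++ }
       END { for (p in count) print count[p], p }' /var/log/syslog | sort -rn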
Application log format
2024-01-15 10:23:45 [ERROR] module=auth message="Login failed" user=alice ip=192.168.1.1
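Since the remaining fields are key=value pairs, a short loop over the fields can pull out whichever keys matter. A minimal sketch that prints the user and source IP of every ERROR entry, assuming the layout above and a hypothetical file app.log (quoted values containing spaces, such as message="Login failed", would need extra handling):
# For each ERROR line, extract the user= and ip= values
$ awk '$3 == "[ERROR]" {
    for (i = 4; i <= NF; i++) {
      if ($i ~ /^user=/) u = substr($i, 6)
      if ($i ~ /^ip=/)   a = substr($i, 4)
    }
    print u, a
  }' app.log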
11.2 Access Log Analysis
Basic statistics
# Create a sample log
cat > access.log << 'EOF'
192.168.1.1 - - [15/Jan/2024:10:00:00 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.2 - - [15/Jan/2024:10:00:01 +0800] "GET /style.css HTTP/1.1" 200 5678
192.168.1.1 - - [15/Jan/2024:10:00:02 +0800] "GET /api/users HTTP/1.1" 200 890
192.168.1.3 - - [15/Jan/2024:10:00:03 +0800] "POST /api/login HTTP/1.1" 401 123
192.168.1.1 - - [15/Jan/2024:10:00:04 +0800] "GET /missing.html HTTP/1.1" 404 0
192.168.1.2 - - [15/Jan/2024:10:00:05 +0800] "GET /admin HTTP/1.1" 403 0
192.168.1.4 - - [15/Jan/2024:10:00:06 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.1 - - [15/Jan/2024:10:00:07 +0800] "GET /api/data HTTP/1.1" 500 0
192.168.1.3 - - [15/Jan/2024:10:00:08 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.5 - - [15/Jan/2024:10:00:09 +0800] "GET /robots.txt HTTP/1.1" 200 50
192.168.1.1 - - [15/Jan/2024:10:01:00 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.6 - - [15/Jan/2024:10:01:01 +0800] "POST /api/login HTTP/1.1" 200 456
192.168.1.7 - - [15/Jan/2024:10:01:02 +0800] "GET /dashboard HTTP/1.1" 301 0
192.168.1.8 - - [15/Jan/2024:10:01:03 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.1 - - [15/Jan/2024:10:01:04 +0800] "GET /api/data HTTP/1.1" 200 5678
EOF
# Count total requests
$ wc -l < access.log
→ 15
# Count unique client IPs
$ awk '{print $1}' access.log | sort -u | wc -l
→ 8
# Status code distribution
$ awk '{count[$9]++} END {
for (s in count) printf "%-6s %4d\n", s, count[s]
}' access.log | sort
→ 200 10
→ 301 1
→ 401 1
→ 403 1
→ 404 1
→ 500 1
Traffic statistics
# Total bytes transferred
$ awk '{sum+=$10} END {printf "Total transferred: %.2f KB (%.2f MB)\n", sum/1024, sum/1048576}' access.log
# Bytes transferred per IP
$ awk '{ip_bytes[$1]+=$10} END {
for (ip in ip_bytes) printf "%12.2f KB %s\n", ip_bytes[ip]/1024, ip
}' access.log | sort -rn
# Average response size per request path
$ awk '{
path_count[$7]++
path_bytes[$7]+=$10
} END {
for (p in path_count)
printf "%8.0f B %s\n", path_bytes[p]/path_count[p], p
}' access.log | sort -rn
11.3 Error Analysis
Classifying error requests
# List all error requests
$ awk '$9 >= 400 {print $9, $6, $7, $1}' access.log
# Count errors by status code
$ awk '$9 >= 400 {count[$9]++} END {
for (s in count) printf "%s: %d times\n", s, count[s]
}' access.log | sort
# Count errors by request path
$ awk '$9 >= 400 {count[$7]++} END {
for (p in count) printf "%4d %s\n", count[p], p
}' access.log | sort -rn
# Find the IPs that triggered 500 errors
$ awk '$9 == 500 {print $1}' access.log | sort | uniq -c | sort -rn
🏢 Scenario: error trend analysis
# Errors per minute
$ awk '$9 >= 400 {
# Extract the hour:minute part of the timestamp
split($4, t, ":")
minute = t[2]":"t[3]
count[minute]++
} END {
for (m in count) printf "%s %d\n", m, count[m]
}' access.log | sort
# Error-rate trend over time
$ awk '{
split($4, t, ":")
minute = t[2]":"t[3]
total[minute]++
if ($9 >= 400) errors[minute]++
} END {
for (m in total) {
e = (m in errors) ? errors[m] : 0
printf "%s 总请求: %3d 错误: %3d 错误率: %.1f%%\n", m, total[m], e, e/total[m]*100
}
}' access.log | sort
11.4 Trend Analysis
Time-based analysis
# Requests per hour
$ awk -F'[/: ]' '{
hour = $7
count[hour]++
} END {
for (h in count) {
printf "%s:00 %4d ", h, count[h]
for (i=0; i<count[h]; i++) printf "█"
printf "\n"
}
}' access.log | sort
# Requests per day (useful for multi-day logs)
$ awk '{
split($4, d, ":")
date = substr(d[1], 2) # strip the leading [
count[date]++
} END {
for (d in count) printf "%s %d\n", d, count[d]
}' access.log | sort
Peak detection
# Find the busiest hour
$ awk '{
split($4, t, ":")
hour = t[2]
count[hour]++
} END {
max_count = 0
for (h in count) {
if (count[h] > max_count) {
max_count = count[h]
peak_hour = h
}
}
printf "峰值时段: %s:00 (%d 次请求)\n", peak_hour, max_count
}' access.log
# Find the busiest minute
$ awk '{
split($4, t, ":")
minute = t[2]":"t[3]
count[minute]++
} END {
for (m in count) {
if (count[m] > max) {
max = count[m]
peak = m
}
}
printf "峰值分钟: %s (%d 次请求)\n", peak, max
}' access.log
11.5 Alerting
🏢 Scenario: real-time log alerting
#!/bin/bash
# log_alert.sh: real-time log alerting
LOG_FILE="/var/log/nginx/access.log"
ALERT_THRESHOLD=10 # errors-per-minute threshold
SLACK_WEBHOOK="https://hooks.slack.com/services/xxx"
tail -f "$LOG_FILE" | awk -v threshold="$ALERT_THRESHOLD" '
BEGIN {
minute = ""
errors = 0
}
{
# Extract the current minute (hour:minute)
split($4, t, ":")
current_minute = t[2]":"t[3]
# When the minute rolls over, check the threshold
if (current_minute != minute) {
if (errors >= threshold) {
printf "🚨 告警: %s 分钟内错误数 %d 超过阈值 %d\n", minute, errors, threshold
# Send a notification to Slack/DingTalk/etc. here, for example:
# system("curl -X POST ...")
}
minute = current_minute
errors = 0
}
# Count errors
if ($9 >= 400) errors++
}'
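The commented-out system() call above is where a real notification would go. A minimal sketch of posting to the Slack webhook from inside awk, assuming the URL is passed in with -v webhook="$SLACK_WEBHOOK" and keeping the quoting deliberately simple:
# Build a small JSON payload and POST it with curl (awk's system() runs a shell command)
msg = sprintf("{\"text\": \"🚨 %d errors in minute %s\"}", errors, minute)
system("curl -s -X POST -H 'Content-Type: application/json' -d '" msg "' " webhook)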
🏢 Scenario: anomalous IP detection
#!/bin/bash
# detect_anomaly.sh: detect anomalous IPs
LOG_FILE="/var/log/nginx/access.log"
THRESHOLD=100 # requests-per-hour threshold
awk -v threshold="$THRESHOLD" '
{
ip = $1
split($4, t, ":")
hour = t[2]
key = ip":"hour
count[key]++
}
END {
for (k in count) {
if (count[k] >= threshold) {
split(k, parts, ":")
printf "⚠️ 异常 IP: %s 在 %s:00 时段请求 %d 次\n", parts[1], parts[2], count[k]
}
}
}' "$LOG_FILE"
🏢 Scenario: slow request detection
# Assumes the log format includes a response-time field, e.g.
# 192.168.1.1 - - [15/Jan/2024:10:00:00 +0800] "GET /api/data HTTP/1.1" 200 1234 1.234
cat > slow_access.log << 'EOF'
192.168.1.1 GET /api/users 200 0.045
192.168.1.2 GET /api/data 200 2.345
192.168.1.1 POST /api/upload 200 5.678
192.168.1.3 GET /index.html 200 0.012
192.168.1.1 GET /api/report 200 10.234
192.168.1.4 GET /api/users 200 0.089
EOF
# Find requests that took longer than 1 second
$ awk '$5 > 1.0 {
printf "⚠️ 慢请求: %s %s %s 响应时间 %.3fs\n", $1, $2, $3, $5
}' slow_access.log
# Average response time per endpoint
$ awk '{
api = $2 " " $3
time_sum[api] += $5
time_count[api]++
if ($5 > time_max[api]) time_max[api] = $5
} END {
printf "%-30s %10s %10s %10s\n", "接口", "请求数", "平均(ms)", "最大(ms)"
printf "%-30s %10s %10s %10s\n", "------------------------------", "----------", "----------", "----------"
for (a in time_sum)
printf "%-30s %10d %10.3f %10.3f\n", a, time_count[a], time_sum[a]/time_count[a], time_max[a]
}' slow_access.log
11.6 Advanced Log Analysis
User behavior analysis
# Trace each user's sequence of requests
$ awk '{
ip = $1
path = $7
time = substr($4, 2)  # timestamp only; appending $5 here would shift the fields in the second awk
printf "%s %s %s\n", ip, time, path
}' access.log | sort | awk '
BEGIN { prev_ip = "" }
{
if ($1 != prev_ip) {
if (prev_ip != "") print ""
printf "用户 %s:\n", $1
prev_ip = $1
}
printf " %s → %s\n", $2, $3
}'
# Find the most common page-to-page navigation patterns
$ awk '{print $7}' access.log | awk '
BEGIN { prev = "START" }
{
pattern = prev " → " $0
count[pattern]++
prev = $0
}
END {
for (p in count) printf "%4d %s\n", count[p], p
}' | sort -rn | head -10
Session analysis
# IP-based session analysis (simplified)
$ awk '{
ip = $1
split($4, t, ":")
hour = t[2]
minute = t[3]
seconds = t[4]
gsub(/\]/, "", seconds)
time_in_seconds = hour*3600 + minute*60 + seconds
if (ip in last_time) {
gap = time_in_seconds - last_time[ip]
if (gap > 1800) { # a gap of more than 30 minutes starts a new session
sessions[ip]++
}
} else {
sessions[ip] = 1
}
last_time[ip] = time_in_seconds
requests[ip]++
} END {
printf "%-16s %8s %10s\n", "IP", "请求数", "会话数"
for (ip in requests)
printf "%-16s %8d %10d\n", ip, requests[ip], sessions[ip]
}' access.log
Crawler detection
# Detect likely crawlers
$ awk '{
ip = $1
path = $7
count[ip]++
# Count this IP's requests for static assets (css/js/images)
if (path ~ /\.(css|js|png|jpg|gif|ico)$/) {
static[ip]++
}
} END {
for (ip in count) {
total = count[ip]
static_count = (ip in static) ? static[ip] : 0
if (total > 50 || (static_count > 0 && static_count/total > 0.8)) {
printf "🤖 疑似爬虫: %-16s 总请求: %d 静态资源: %d (%.0f%%)\n",
ip, total, static_count, static_count/total*100
}
}
}' access.log
11.7 Log Analysis Reports
A comprehensive analysis report
#!/bin/bash
# log_report.sh: generate a log analysis report
LOG_FILE="${1:-/var/log/nginx/access.log}"
REPORT_FILE="log_report_$(date +%Y%m%d_%H%M%S).txt"
{
echo "╔══════════════════════════════════════════════════════╗"
echo "║ Nginx 日志分析报告 ║"
echo "╠══════════════════════════════════════════════════════╣"
echo "║ 生成时间: $(date '+%Y-%m-%d %H:%M:%S')"
echo "║ 日志文件: ${LOG_FILE}"
echo "╚══════════════════════════════════════════════════════╝"
echo ""
echo "=== 总体统计 ==="
awk '{
total++
bytes += $10
if ($9 >= 400) errors++
} END {
printf "总请求数: %d\n", total
printf "错误请求数: %d (%.2f%%)\n", errors, errors/total*100
printf "总传输量: %.2f MB\n", bytes/1048576
printf "平均响应大小: %.0f B\n", bytes/total
}' "$LOG_FILE"
echo ""
echo "=== 状态码分布 ==="
awk '{count[$9]++} END {
for (s in count) printf " %s: %d\n", s, count[s]
}' "$LOG_FILE" | sort
echo ""
echo "=== Top 10 IP ==="
awk '{count[$1]++} END {
for (ip in count) printf "%6d %s\n", count[ip], ip
}' "$LOG_FILE" | sort -rn | head -10
echo ""
echo "=== Top 10 请求路径 ==="
awk '{count[$7]++} END {
for (p in count) printf "%6d %s\n", count[p], p
}' "$LOG_FILE" | sort -rn | head -10
echo ""
echo "=== 错误请求详情 ==="
awk '$9 >= 400 {
printf " %s %s %s %s\n", $9, $1, $6, $7
}' "$LOG_FILE" | sort | uniq -c | sort -rn | head -10
} > "$REPORT_FILE"
echo "报告已生成: ${REPORT_FILE}"
11.8 Log Analysis Quick Reference
# Basic statistics
wc -l < logfile # total lines
awk '{print $1}' log | sort -u | wc -l # unique IPs
awk '{print $9}' log | sort | uniq -c # status code distribution
# Error analysis
awk '$9 >= 400' log # all errors
awk '$9 == 500' log # server errors
awk '$9 == 404' log # not found
# Traffic analysis
awk '{sum+=$10} END{print sum}' log # total bytes
awk '{ip[$1]+=$10} END{for(i in ip) print ip[i], i}' log | sort -rn # bytes per IP
# Time analysis
awk -F'[/: ]' '{print $7}' log | sort | uniq -c # requests per hour
awk '{print $7}' log | sort | uniq -c | sort -rn # top paths
# Security analysis
awk '{print $1}' log | sort | uniq -c | sort -rn | head -20 # high-frequency IPs
awk '$9 == 401 || $9 == 403' log # auth/permission failures
Further Reading
- GoAccess: real-time log analysis tool
- AWStats: advanced web statistics
- Elastic Stack: enterprise-grade log analysis
Next up, Chapter 12: Report Generation covers data aggregation, formatted output, and HTML/CSV reports.