第 7 章:文本处理实战
理论是银,实践是金。这一章,我们用 AWK 和 SED 解决真实的文本处理问题。
7.1 日志文件分析
Nginx 访问日志格式
192.168.1.1 - - [15/Jan/2024:10:23:45 +0800] "GET /index.html HTTP/1.1" 200 1234 "https://example.com" "Mozilla/5.0"
字段编号(AWK 默认空格分隔):
| 字段 | 内容 | 示例 |
|---|---|---|
| $1 | 客户端 IP | 192.168.1.1 |
| $4 | 时间戳(含方括号) | [15/Jan/2024:10:23:45 |
| $6 | 请求方法(含前引号) | "GET |
| $7 | 请求路径 | /index.html |
| $9 | 状态码 | 200 |
| $10 | 响应大小 | 1234 |
常用日志分析命令
# 创建示例日志
cat > access.log << 'EOF'
192.168.1.1 - - [15/Jan/2024:10:00:00 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.2 - - [15/Jan/2024:10:00:01 +0800] "GET /style.css HTTP/1.1" 200 5678
192.168.1.1 - - [15/Jan/2024:10:00:02 +0800] "GET /api/users HTTP/1.1" 200 890
192.168.1.3 - - [15/Jan/2024:10:00:03 +0800] "POST /api/login HTTP/1.1" 401 123
192.168.1.1 - - [15/Jan/2024:10:00:04 +0800] "GET /missing.html HTTP/1.1" 404 0
192.168.1.2 - - [15/Jan/2024:10:00:05 +0800] "GET /admin HTTP/1.1" 403 0
192.168.1.4 - - [15/Jan/2024:10:00:06 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.1 - - [15/Jan/2024:10:00:07 +0800] "GET /api/data HTTP/1.1" 500 0
192.168.1.3 - - [15/Jan/2024:10:00:08 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.5 - - [15/Jan/2024:10:00:09 +0800] "GET /robots.txt HTTP/1.1" 200 50
EOF
# 统计各状态码数量和占比
$ awk '{
count[$9]++
total++
} END {
for (s in count)
printf "%-6s %4d (%5.1f%%)\n", s, count[s], count[s]/total*100
}' access.log | sort
→ 200 6 ( 60.0%)
→ 401 1 ( 10.0%)
→ 403 1 ( 10.0%)
→ 404 1 ( 10.0%)
→ 500 1 ( 10.0%)
# 统计各 IP 的请求数
$ awk '{count[$1]++} END {for (ip in count) printf "%-16s %d\n", ip, count[ip]}' access.log | sort -k2 -rn
→ 192.168.1.1 4
→ 192.168.1.2 2
→ 192.168.1.3 2
→ 192.168.1.4 1
→ 192.168.1.5 1
# 统计各请求路径的访问量
$ awk '{path[$7]++} END {for (p in path) printf "%4d %s\n", path[p], p}' access.log | sort -rn
→ 3 /index.html
→ 1 /style.css
→ 1 /api/users
→ 1 /api/login
→ 1 /missing.html
→ 1 /admin
→ 1 /api/data
→ 1 /robots.txt
# 找出 4xx/5xx 错误请求
$ awk '$9 >= 400 {print $1, $6, $7, $9}' access.log
→ 192.168.1.3 "POST /api/login 401
→ 192.168.1.1 "GET /missing.html 404
→ 192.168.1.2 "GET /admin 403
→ 192.168.1.1 "GET /api/data 500
# 计算每小时请求数
$ awk -F'[/: ]' '{
hour = $7
count[hour]++
} END {
for (h in count) printf "%s:00 %d 次请求\n", h, count[h]
}' access.log | sort
🏢 场景:异常 IP 检测
# 找出请求超过 3 次的 IP(可能是爬虫或攻击)
$ awk '{count[$1]++} END {
for (ip in count)
if (count[ip] > 3)
printf "⚠️ 异常 IP: %-16s 请求数: %d\n", ip, count[ip]
}' access.log
→ ⚠️ 异常 IP: 192.168.1.1 请求数: 4
7.2 CSV 文件处理
基本 CSV 处理
cat > data.csv << 'EOF'
Name,Department,Salary,Bonus
Alice,Engineering,15000,2000
Bob,Marketing,12000,1500
Carol,Engineering,16000,2500
Dave,Sales,11000,1000
Eve,Engineering,14000,1800
Frank,Marketing,13000,1600
EOF
# 提取特定列
$ awk -F, '{print $1, $3}' data.csv
→ Name Salary
→ Alice 15000
→ Bob 12000
...
# 跳过表头
$ awk -F, 'NR>1 {print $1, $3}' data.csv
# 计算总工资和平均工资
$ awk -F, 'NR>1 {sum+=$3; n++} END {printf "总工资: %d, 平均: %.0f\n", sum, sum/n}' data.csv
→ 总工资: 81000, 平均: 13500
CSV 转换
# CSV 转 TSV
$ awk -F, '{OFS="\t"; $1=$1; print}' data.csv
# CSV 转 Markdown 表格
$ awk -F, '
NR==1 {
printf "| "
for (i=1; i<=NF; i++) printf "%s | ", $i
printf "\n|"
for (i=1; i<=NF; i++) printf "---|"
printf "\n"
}
NR>1 {
printf "| "
for (i=1; i<=NF; i++) printf "%s | ", $i
printf "\n"
}' data.csv
带引号的 CSV
⚠️ 注意:标准 CSV 中字段可以包含逗号和引号。AWK 无法正确处理这种复杂情况,建议使用 csvtool 或 Python。
# 简单的去引号
$ sed 's/"//g' quoted.csv
# 使用 csvtool(如果安装了的话)
$ csvtool col 1-3 data.csv
🏢 场景:部门统计报告
$ awk -F, '
NR > 1 {
dept_count[$2]++
dept_sum[$2] += $3
dept_bonus[$2] += $4
}
END {
printf "%-15s %5s %12s %12s %12s\n", "部门", "人数", "总薪资", "总奖金", "平均薪资"
printf "%-15s %5s %12s %12s %12s\n", "-----", "----", "------", "------", "------"
for (d in dept_count)
printf "%-15s %5d %12d %12d %12.0f\n", d, dept_count[d], dept_sum[d], dept_bonus[d], dept_sum[d]/dept_count[d]
}' data.csv
7.3 JSON 提取
简单 JSON 处理
⚠️ 重要:AWK/SED 不适合处理复杂的嵌套 JSON。简单的 key-value 提取可以,复杂场景请用
jq。
# 简单 JSON 的 key-value 提取
cat > config.json << 'EOF'
{
"name": "MyApp",
"version": "2.1.0",
"port": 8080,
"debug": true
}
EOF
# 提取所有 key: value 对
$ grep -oP '"(\w+)"\s*:\s*"?\K[^",]+' config.json
→ MyApp
→ 2.1.0
→ 8080
→ true
# 用 AWK 提取
$ awk -F'[":,]+' '/".*"/ {
for (i=1; i<=NF; i++) {
if ($i ~ /"/) {
gsub(/"/, "", $i)
gsub(/^[ \t]+|[ \t]+$/, "", $i)
if (prev) { print prev "=" $i; prev="" }
else prev = $i
}
}
}' config.json
日志中的 JSON 行
# JSON Lines 格式(每行一个 JSON)
cat > jsonl.log << 'EOF'
{"timestamp":"2024-01-15T10:00:00","level":"info","message":"Server started","port":8080}
{"timestamp":"2024-01-15T10:00:01","level":"error","message":"Connection failed","host":"db.example.com"}
{"timestamp":"2024-01-15T10:00:02","level":"warn","message":"High memory usage","percent":85}
{"timestamp":"2024-01-15T10:00:03","level":"info","message":"Request processed","duration":45}
EOF
# 提取 level 和 message
$ grep -oP '"level"\s*:\s*"\K[^"]+' jsonl.log
→ info
→ error
→ warn
→ info
# 用 sed 提取(简化)
$ sed -E 's/.*"level":"([^"]+)".*"message":"([^"]+)".*/\1: \2/' jsonl.log
→ info: Server started
→ error: Connection failed
→ warn: High memory usage
→ info: Request processed
# 找出所有 error 级别的日志
$ awk '/"level":"error"/' jsonl.log
💡 推荐:对于复杂的 JSON 处理,请使用
jq:jq -r 'select(.level == "error") | .message' jsonl.log
7.4 表格格式化
对齐输出
cat > report_data.txt << 'EOF'
Alice Engineering 15000
Bob Marketing 12000
Carol Engineering 16000
Dave Sales 11000
EOF
# 左对齐表格
$ awk '{
printf "%-12s %-15s %10d\n", $1, $2, $3
}' report_data.txt
→ Alice Engineering 15000
→ Bob Marketing 12000
→ Carol Engineering 16000
→ Dave Sales 11000
# 带分隔线的表格
$ awk '
BEGIN {
sep = sprintf("%-12s %-15s %10s", "------------", "---------------", "----------")
}
NR==1 { print sep }
{
printf "%-12s %-15s %10d\n", $1, $2, $3
}
END { print sep }
' report_data.txt
统计汇总表
$ awk '
{
dept_sum[$2] += $3
dept_count[$2]++
total_sum += $3
total_count++
}
END {
printf "%-15s %8s %12s %12s\n", "部门", "人数", "总薪资", "平均薪资"
printf "%-15s %8s %12s %12s\n", "==============", "========", "============", "============"
for (d in dept_count) {
printf "%-15s %8d %12d %12.0f\n", d, dept_count[d], dept_sum[d], dept_sum[d]/dept_count[d]
}
printf "%-15s %8s %12s %12s\n", "==============", "========", "============", "============"
printf "%-15s %8d %12d %12.0f\n", "合计", total_count, total_sum, total_sum/total_count
}' report_data.txt
7.5 文本转换
大小写转换
# 转大写
$ awk '{print toupper($0)}' file
$ tr '[:lower:]' '[:upper:]' < file
# 转小写
$ awk '{print tolower($0)}' file
# 首字母大写
$ awk '{
for (i=1; i<=NF; i++) {
$i = toupper(substr($i,1,1)) tolower(substr($i,2))
}
print
}' file
行列转换
# 转置矩阵
$ printf "1 2 3\n4 5 6\n7 8 9\n" | awk '{
for (i=1; i<=NF; i++) {
a[NR,i] = $i
if (NR > maxrow) maxrow = NR
if (i > maxcol) maxcol = i
}
} END {
for (i=1; i<=maxcol; i++) {
for (j=1; j<=maxrow; j++) {
printf "%s%s", a[j,i], (j<maxrow ? " " : "\n")
}
}
}'
→ 1 4 7
→ 2 5 8
→ 3 6 9
多行转单行
# 每 3 行合并为一行
$ seq 9 | paste - - -
→ 1 2 3
→ 4 5 6
→ 7 8 9
# 或用 AWK
$ seq 9 | awk 'ORS = NR%3 ? " " : "\n"'
→ 1 2 3
→ 4 5 6
→ 7 8 9
单行转多行
# 逗号分隔的值变成每行一个
$ echo "Alice,Bob,Carol,Dave" | awk -F, '{for(i=1;i<=NF;i++) print $i}'
→ Alice
→ Bob
→ Carol
→ Dave
7.6 文本清洗
去除多余空白
# 去掉行首行尾空格
$ sed 's/^[[:space:]]*//; s/[[:space:]]*$//' file
$ awk '{gsub(/^[[:space:]]+|[[:space:]]+$/, ""); print}' file
# 压缩连续空格为单个空格
$ sed -E 's/[[:space:]]+/ /g' file
# 去掉空行
$ sed '/^$/d' file
$ awk 'NF > 0' file
去除重复行
# 去重(保持顺序)
$ awk '!seen[$0]++' file
# 只保留重复的行
$ awk 'seen[$0]++' file
# 去除连续重复行(类似 uniq)
$ awk 'prev != $0 {print; prev=$0}' file
编码转换相关
# 去掉 BOM(UTF-8 文件头)
$ sed '1s/^\xEF\xBB\xBF//' file
# 去掉 Windows 回车符 \r
$ sed 's/\r$//' file
$ tr -d '\r' < file
# 添加 BOM
$ printf '\xEF\xBB\xBF' | cat - file > file_with_bom
7.7 数据格式转换
INI 格式解析
cat > config.ini << 'EOF'
[database]
host = localhost
port = 5432
name = myapp
[server]
host = 0.0.0.0
port = 8080
debug = true
EOF
# 提取指定 section 的所有键值
$ awk -F'[[:space:]]*=[[:space:]]*' '
/^\[/ { section = substr($0, 2, length($0)-2) }
section == "database" && /=/ { print $1 "=" $2 }
' config.ini
→ host=localhost
→ port=5432
→ name=myapp
属性文件转 Shell 变量
$ awk -F= '
/^[[:space:]]*#/ { next }
/^[[:space:]]*$/ { next }
/=/ {
gsub(/^[[:space:]]+|[[:space:]]+$/, "", $1)
gsub(/^[[:space:]]+|[[:space:]]+$/, "", $2)
printf "export %s=\"%s\"\n", toupper($1), $2
}
' config.ini
→ export HOST="localhost"
→ export PORT="5432"
→ export NAME="myapp"
→ export HOST="0.0.0.0"
→ export PORT="8080"
→ export DEBUG="true"
(注意:config.ini 中两个 section 的同名键都会被输出,后面的 export 会覆盖前面的值)
YAML 简单解析
cat > simple.yaml << 'EOF'
name: MyApp
version: "2.1"
port: 8080
features:
- auth
- logging
EOF
# 提取顶级键值对(简单 YAML)
$ awk -F': ' '
NF == 2 && !/^[[:space:]]/ {
gsub(/^[[:space:]]+|[[:space:]]+$/, "", $1)
gsub(/^[[:space:]]+|[[:space:]]+$/, "", $2)
printf "%s=%s\n", $1, $2
}
' simple.yaml
→ name=MyApp
→ version="2.1"
→ port=8080
💡 提示:复杂的 YAML/JSON 处理请使用专门的工具(
yq,jq),AWK/SED 适合简单的扁平格式。
7.8 综合实战
🏢 场景:系统配置审计
# 审计 /etc/ssh/sshd_config
$ awk '
/^[[:space:]]*#/ { next } # 跳过注释
/^[[:space:]]*$/ { next } # 跳过空行
{
gsub(/^[[:space:]]+|[[:space:]]+$/, "")
split($0, a, /[[:space:]]+/)
key = a[1]
val = a[2]
config[key] = val
}
END {
print "=== SSH 配置审计 ==="
# 检查危险配置
if (config["PermitRootLogin"] == "yes")
print "⚠️ PermitRootLogin = yes (建议改为 no)"
if (config["PasswordAuthentication"] == "yes")
print "⚠️ PasswordAuthentication = yes (建议使用密钥)"
if (config["PermitEmptyPasswords"] == "yes")
print "🔴 PermitEmptyPasswords = yes (高风险!)"
if (config["X11Forwarding"] == "yes")
print "⚠️ X11Forwarding = yes (非必要可关闭)"
# 正常配置
print "\n--- 当前配置 ---"
for (k in config)
printf "%-30s = %s\n", k, config[k]
}
' /etc/ssh/sshd_config 2>/dev/null || echo "需要 root 权限读取配置"
🏢 场景:自动格式化代码中的日志语句
# 将 printf 语句统一格式化
cat > code.c << 'EOF'
printf("debug: user %s logged in\n",name);
printf("error: connection failed from %s\n",addr);
printf("info: request processed %d ms\n",duration);
EOF
# 在日志级别前添加时间戳
$ sed -E 's/printf\("(debug|error|info|warn):/printf("[%s] \1: ", get_timestamp()); printf("/' code.c
7.9 速查命令集
# 文本处理常用组合
# 去掉空行和注释
sed '/^#/d; /^$/d' file
# 提取第 2 列并排序去重
awk '{print $2}' file | sort -u
# 统计行数(排除空行)
awk 'NF>0 {count++} END{print count}' file
# 合并两个文件(按行)
paste file1 file2
# 将多行合并为逗号分隔的一行
paste -sd, file
# 将逗号分隔的值拆成每行一个
tr ',' '\n' < file
# 排序后取前 10
sort -k2 -rn file | head -10
# 统计词频
tr -s ' ' '\n' < file | sort | uniq -c | sort -rn
# 将 Windows 换行转 Unix
sed -i 's/\r$//' file
扩展阅读
- jq Manual — JSON 处理推荐工具
- GNU Coreutils — cut、paste、tr 等
- csvkit — CSV 处理工具集
下一章:第 8 章:数据提取 — 日志解析、指标采集、报告生成。