第 7 章:文本处理实战
理论是银,实践是金。这一章,我们用 AWK 和 SED 解决真实的文本处理问题。
7.1 日志文件分析
Nginx 访问日志格式
192.168.1.1 - - [15/Jan/2024:10:23:45 +0800] "GET /index.html HTTP/1.1" 200 1234 "https://example.com" "Mozilla/5.0"
字段编号(AWK 默认空格分隔):
| 字段 | 内容 | 示例 |
|---|---|---|
| $1 | 客户端 IP | 192.168.1.1 |
| $4 | 时间戳(含方括号) | [15/Jan/2024:10:23:45 |
| $6 | 请求方法(含前引号) | "GET |
| $7 | 请求路径 | /index.html |
| $9 | 状态码 | 200 |
| $10 | 响应大小 | 1234 |
常用日志分析命令
# 创建示例日志
cat > access.log << 'EOF'
192.168.1.1 - - [15/Jan/2024:10:00:00 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.2 - - [15/Jan/2024:10:00:01 +0800] "GET /style.css HTTP/1.1" 200 5678
192.168.1.1 - - [15/Jan/2024:10:00:02 +0800] "GET /api/users HTTP/1.1" 200 890
192.168.1.3 - - [15/Jan/2024:10:00:03 +0800] "POST /api/login HTTP/1.1" 401 123
192.168.1.1 - - [15/Jan/2024:10:00:04 +0800] "GET /missing.html HTTP/1.1" 404 0
192.168.1.2 - - [15/Jan/2024:10:00:05 +0800] "GET /admin HTTP/1.1" 403 0
192.168.1.4 - - [15/Jan/2024:10:00:06 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.1 - - [15/Jan/2024:10:00:07 +0800] "GET /api/data HTTP/1.1" 500 0
192.168.1.3 - - [15/Jan/2024:10:00:08 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.5 - - [15/Jan/2024:10:00:09 +0800] "GET /robots.txt HTTP/1.1" 200 50
EOF
# 统计各状态码数量和占比
$ awk '{
count[$9]++
total++
} END {
for (s in count)
printf "%-6s %4d (%5.1f%%)\n", s, count[s], count[s]/total*100
}' access.log | sort
→ 200 6 ( 60.0%)
→ 401 1 ( 10.0%)
→ 403 1 ( 10.0%)
→ 404 1 ( 10.0%)
→ 500 1 ( 10.0%)
# 统计各 IP 的请求数
$ awk '{count[$1]++} END {for (ip in count) printf "%-16s %d\n", ip, count[ip]}' access.log | sort -k2 -rn
→ 192.168.1.1 4
→ 192.168.1.2 2
→ 192.168.1.3 2
→ 192.168.1.4 1
→ 192.168.1.5 1
# 统计各请求路径的访问量
$ awk '{path[$7]++} END {for (p in path) printf "%4d %s\n", path[p], p}' access.log | sort -rn
→ 3 /index.html
→ 1 /style.css
→ 1 /api/users
→ 1 /api/login
→ 1 /missing.html
→ 1 /admin
→ 1 /api/data
→ 1 /robots.txt
# 找出 4xx/5xx 错误请求
$ awk '$9 >= 400 {print $1, $6, $7, $9}' access.log
→ 192.168.1.3 "POST /api/login 401
→ 192.168.1.1 "GET /missing.html 404
→ 192.168.1.2 "GET /admin 403
→ 192.168.1.1 "GET /api/data 500
# 计算每小时请求数
$ awk -F'[/: ]' '{
hour = $7
count[hour]++
} END {
for (h in count) printf "%s:00 %d 次请求\n", h, count[h]
}' access.log | sort
🏢 场景:异常 IP 检测
# 找出请求超过 3 次的 IP(可能是爬虫或攻击)
$ awk '{count[$1]++} END {
for (ip in count)
if (count[ip] > 3)
printf "⚠️ 异常 IP: %-16s 请求数: %d\n", ip, count[ip]
}' access.log
→ ⚠️ 异常 IP: 192.168.1.1 请求数: 4
7.2 CSV 文件处理
基本 CSV 处理
cat > data.csv << 'EOF'
Name,Department,Salary,Bonus
Alice,Engineering,15000,2000
Bob,Marketing,12000,1500
Carol,Engineering,16000,2500
Dave,Sales,11000,1000
Eve,Engineering,14000,1800
Frank,Marketing,13000,1600
EOF
# 提取特定列
$ awk -F, '{print $1, $3}' data.csv
→ Name Salary
→ Alice 15000
→ Bob 12000
...
# 跳过表头
$ awk -F, 'NR>1 {print $1, $3}' data.csv
# 计算总工资和平均工资
$ awk -F, 'NR>1 {sum+=$3; n++} END {printf "总工资: %d, 平均: %.0f\n", sum, sum/n}' data.csv
→ 总工资: 81000, 平均: 13500
CSV 转换
# CSV 转 TSV
$ awk -F, '{OFS="\t"; $1=$1; print}' data.csv
# CSV 转 Markdown 表格
$ awk -F, '
NR==1 {
printf "| "
for (i=1; i<=NF; i++) printf "%s | ", $i
printf "\n|"
for (i=1; i<=NF; i++) printf "---|"
printf "\n"
}
NR>1 {
printf "| "
for (i=1; i<=NF; i++) printf "%s | ", $i
printf "\n"
}' data.csv
带引号的 CSV
⚠️ 注意:标准 CSV 中字段可以包含逗号和引号。AWK 无法正确处理这种复杂情况,建议使用 csvtool 或 Python。
# 简单的去引号
$ sed 's/"//g' quoted.csv
# 使用 csvtool(如果安装了的话)
$ csvtool col 1-3 data.csv
🏢 场景:部门统计报告
$ awk -F, '
NR > 1 {
dept_count[$2]++
dept_sum[$2] += $3
dept_bonus[$2] += $4
}
END {
printf "%-15s %5s %12s %12s %12s\n", "部门", "人数", "总薪资", "总奖金", "平均薪资"
printf "%-15s %5s %12s %12s %12s\n", "-----", "----", "------", "------", "------"
for (d in dept_count)
printf "%-15s %5d %12d %12d %12.0f\n", d, dept_count[d], dept_sum[d], dept_bonus[d], dept_sum[d]/dept_count[d]
}' data.csv
7.3 JSON 提取
简单 JSON 处理
⚠️ 重要:AWK/SED 不适合处理复杂的嵌套 JSON。简单的 key-value 提取可以,复杂场景请用
jq。
# 简单 JSON 的 key-value 提取
cat > config.json << 'EOF'
{
"name": "MyApp",
"version": "2.1.0",
"port": 8080,
"debug": true
}
EOF
# 提取所有 key: value 对
$ grep -oP '"(\w+)"\s*:\s*"?\K[^",]+' config.json
→ MyApp
→ 2.1.0
→ 8080
→ true
# 用 AWK 提取
$ awk -F'[":,]+' '/".*"/ {
for (i=1; i<=NF; i++) {
if ($i ~ /"/) {
gsub(/"/, "", $i)
gsub(/^[ \t]+|[ \t]+$/, "", $i)
if (prev) { print prev "=" $i; prev="" }
else prev = $i
}
}
}' config.json
日志中的 JSON 行
# JSON Lines 格式(每行一个 JSON)
cat > jsonl.log << 'EOF'
{"timestamp":"2024-01-15T10:00:00","level":"info","message":"Server started","port":8080}
{"timestamp":"2024-01-15T10:00:01","level":"error","message":"Connection failed","host":"db.example.com"}
{"timestamp":"2024-01-15T10:00:02","level":"warn","message":"High memory usage","percent":85}
{"timestamp":"2024-01-15T10:00:03","level":"info","message":"Request processed","duration":45}
EOF
# 提取 level 和 message
$ grep -oP '"level"\s*:\s*"\K[^"]+' jsonl.log
→ info
→ error
→ warn
→ info
# 用 sed 提取(简化)
$ sed -E 's/.*"level":"([^"]+)".*"message":"([^"]+)".*/\1: \2/' jsonl.log
→ info: Server started
→ error: Connection failed
→ warn: High memory usage
→ info: Request processed
# 找出所有 error 级别的日志
$ awk '/"level":"error"/' jsonl.log
💡 推荐:对于复杂的 JSON 处理,请使用
jq:jq -r 'select(.level == "error") | .message' jsonl.log
7.4 表格格式化
对齐输出
cat > report_data.txt << 'EOF'
Alice Engineering 15000
Bob Marketing 12000
Carol Engineering 16000
Dave Sales 11000
EOF
# 左对齐表格
$ awk '{
printf "%-12s %-15s %10d\n", $1, $2, $3
}' report_data.txt
→ Alice Engineering 15000
→ Bob Marketing 12000
→ Carol Engineering 16000
→ Dave Sales 11000
# 带分隔线的表格
$ awk '
BEGIN {
sep = sprintf("%-12s %-15s %10s", "------------", "---------------", "----------")
}
NR==1 { print sep }
{
printf "%-12s %-15s %10d\n", $1, $2, $3
}
END { print sep }
' report_data.txt
统计汇总表
$ awk '
{
dept_sum[$2] += $3
dept_count[$2]++
total_sum += $3
total_count++
}
END {
printf "%-15s %8s %12s %12s\n", "部门", "人数", "总薪资", "平均薪资"
printf "%-15s %8s %12s %12s\n", "==============", "========", "============", "============"
for (d in dept_count) {
printf "%-15s %8d %12d %12.0f\n", d, dept_count[d], dept_sum[d], dept_sum[d]/dept_count[d]
}
printf "%-15s %8s %12s %12s\n", "==============", "========", "============", "============"
printf "%-15s %8d %12d %12.0f\n", "合计", total_count, total_sum, total_sum/total_count
}' report_data.txt
7.5 文本转换
大小写转换
# 转大写
$ awk '{print toupper($0)}' file
$ tr '[:lower:]' '[:upper:]' < file
# 转小写
$ awk '{print tolower($0)}' file
# 首字母大写
$ awk '{
for (i=1; i<=NF; i++) {
$i = toupper(substr($i,1,1)) tolower(substr($i,2))
}
print
}' file
行列转换
# 转置矩阵
$ printf "1 2 3\n4 5 6\n7 8 9\n" | awk '{
for (i=1; i<=NF; i++) {
a[NR,i] = $i
if (NR > maxrow) maxrow = NR
if (i > maxcol) maxcol = i
}
} END {
for (i=1; i<=maxcol; i++) {
for (j=1; j<=maxrow; j++) {
printf "%s%s", a[j,i], (j<maxrow ? " " : "\n")
}
}
}'
→ 1 4 7
→ 2 5 8
→ 3 6 9
多行转单行
# 每 3 行合并为一行
$ seq 9 | paste - - -
→ 1 2 3
→ 4 5 6
→ 7 8 9
# 或用 AWK
$ seq 9 | awk 'ORS = NR%3 ? " " : "\n"'
→ 1 2 3
→ 4 5 6
→ 7 8 9
单行转多行
# 逗号分隔的值变成每行一个
$ echo "Alice,Bob,Carol,Dave" | awk -F, '{for(i=1;i<=NF;i++) print $i}'
→ Alice
→ Bob
→ Carol
→ Dave
7.6 文本清洗
去除多余空白
# 去掉行首行尾空格
$ sed 's/^[[:space:]]*//; s/[[:space:]]*$//' file
$ awk '{gsub(/^[[:space:]]+|[[:space:]]+$/, ""); print}' file
# 压缩连续空格为单个空格
$ sed -E 's/[[:space:]]+/ /g' file
# 去掉空行
$ sed '/^$/d' file
$ awk 'NF > 0' file
去除重复行
# 去重(保持顺序)
$ awk '!seen[$0]++' file
# 只保留重复的行
$ awk 'seen[$0]++' file
# 去除连续重复行(类似 uniq)
$ awk 'prev != $0 {print; prev=$0}' file
编码转换相关
# 去掉 BOM(UTF-8 文件头)
$ sed '1s/^\xEF\xBB\xBF//' file
# 去掉 Windows 回车符 \r
$ sed 's/\r$//' file
$ tr -d '\r' < file
# 添加 BOM
$ printf '\xEF\xBB\xBF' | cat - file > file_with_bom
7.7 数据格式转换
INI 格式解析
cat > config.ini << 'EOF'
[database]
host = localhost
port = 5432
name = myapp
[server]
host = 0.0.0.0
port = 8080
debug = true
EOF
# 提取指定 section 的所有键值
$ awk -F'[[:space:]]*=[[:space:]]*' '
/^\[/ { section = substr($0, 2, length($0)-2) }
section == "database" && /=/ { print $1 "=" $2 }
' config.ini
→ host=localhost
→ port=5432
→ name=myapp
属性文件转 Shell 变量
$ awk -F= '
/^[[:space:]]*#/ { next }
/^[[:space:]]*$/ { next }
/=/ {
gsub(/^[[:space:]]+|[[:space:]]+$/, "", $1)
gsub(/^[[:space:]]+|[[:space:]]+$/, "", $2)
printf "export %s=\"%s\"\n", toupper($1), $2
}
' config.ini
→ export HOST="localhost"
→ export PORT="5432"
→ export NAME="myapp"
→ export HOST="0.0.0.0"
→ export PORT="8080"
→ export DEBUG="true"
(注意:config.ini 中两个 section 的同名键都会被输出,后面的 export 会覆盖前面的值)
YAML 简单解析
cat > simple.yaml << 'EOF'
name: MyApp
version: "2.1"
port: 8080
features:
- auth
- logging
EOF
# 提取顶级键值对(简单 YAML)
$ awk -F': ' '
NF == 2 && !/^[[:space:]]/ {
gsub(/^[[:space:]]+|[[:space:]]+$/, "", $1)
gsub(/^[[:space:]]+|[[:space:]]+$/, "", $2)
printf "%s=%s\n", $1, $2
}
' simple.yaml
→ name=MyApp
→ version="2.1"
→ port=8080
💡 提示:复杂的 YAML/JSON 处理请使用专门的工具(
yq,jq),AWK/SED 适合简单的扁平格式。
7.8 综合实战
🏢 场景:系统配置审计
# 审计 /etc/ssh/sshd_config
$ awk '
/^[[:space:]]*#/ { next } # 跳过注释
/^[[:space:]]*$/ { next } # 跳过空行
{
gsub(/^[[:space:]]+|[[:space:]]+$/, "")
split($0, a, /[[:space:]]+/)
key = a[1]
val = a[2]
config[key] = val
}
END {
print "=== SSH 配置审计 ==="
# 检查危险配置
if (config["PermitRootLogin"] == "yes")
print "⚠️ PermitRootLogin = yes (建议改为 no)"
if (config["PasswordAuthentication"] == "yes")
print "⚠️ PasswordAuthentication = yes (建议使用密钥)"
if (config["PermitEmptyPasswords"] == "yes")
print "🔴 PermitEmptyPasswords = yes (高风险!)"
if (config["X11Forwarding"] == "yes")
print "⚠️ X11Forwarding = yes (非必要可关闭)"
# 正常配置
print "\n--- 当前配置 ---"
for (k in config)
printf "%-30s = %s\n", k, config[k]
}
' /etc/ssh/sshd_config 2>/dev/null || echo "需要 root 权限读取配置"
🏢 场景:自动格式化代码中的日志语句
# 将 printf 语句统一格式化
cat > code.c << 'EOF'
printf("debug: user %s logged in\n",name);
printf("error: connection failed from %s\n",addr);
printf("info: request processed %d ms\n",duration);
EOF
# 在日志级别前添加时间戳
$ sed -E 's/printf\("(debug|error|info|warn):/printf("[%s] \1: ", get_timestamp()); printf("/' code.c
7.9 速查命令集
# 文本处理常用组合
# 去掉空行和注释
sed '/^#/d; /^$/d' file
# 提取第 2 列并排序去重
awk '{print $2}' file | sort -u
# 统计行数(排除空行)
awk 'NF>0 {count++} END{print count}' file
# 合并两个文件(按行)
paste file1 file2
# 将多行合并为逗号分隔的一行
paste -sd, file
# 将逗号分隔的值拆成每行一个
tr ',' '\n' < file
# 排序后取前 10
sort -k2 -rn file | head -10
# 统计词频
tr -s ' ' '\n' < file | sort | uniq -c | sort -rn
# 将 Windows 换行转 Unix
sed -i 's/\r$//' file
扩展阅读
- jq Manual — JSON 处理推荐工具
- GNU Coreutils — cut、paste、tr 等
- csvkit — CSV 处理工具集
下一章:第 8 章:数据提取 — 日志解析、指标采集、报告生成。