Erlang/OTP 完全指南 / 20 - 性能优化

第 20 章：性能优化 — profiling、fprof、内存分析

本章学习如何定位 Erlang 程序的性能瓶颈，使用 profiling 工具和内存分析找到优化点。

20.1 性能分析工具概览

工具	用途	侵入性
`timer:tc/3`	测量函数执行时间	极低
`fprof`	函数级 profiling	中
`eprof`	函数执行时间统计	低
`cprof`	函数调用计数	低
`observer`	实时监控	无
`recon`	生产级诊断	低
`percept`	并发分析	中

20.2 基本计时

%% timer:tc/3 calls Module:Function(Args) once and returns
%% {ElapsedMicroseconds, ReturnValue}.
{Micros, Seq} = timer:tc(lists, seq, [1, 1000000]),
io:format("Time: ~p μs, Result length: ~p~n", [Micros, length(Seq)]).

%% Times lists:reverse/1 and lists:sort/1 on the same 100000-element list
%% and prints each duration in microseconds. Both measurements are taken
%% before anything is printed. Returns ok.
benchmark() ->
    Input = lists:seq(1, 100000),
    Cases = [{"reverse: ~p μs~n", reverse},
             {"sort:    ~p μs~n", sort}],
    %% element(1, ...) keeps only the elapsed time from {Micros, Value}.
    Timings = [{Format, element(1, timer:tc(lists, Function, [Input]))}
               || {Format, Function} <- Cases],
    lists:foreach(fun({Format, Micros}) -> io:format(Format, [Micros]) end,
                  Timings).

%% Counter-based approach: compare the process's reduction counter before
%% and after the work. Reductions are a unit of scheduled work, so the
%% difference is less sensitive to machine load than wall-clock time.
{reductions, Before} = process_info(self(), reductions),
_ = lists:seq(1, 1000000),
{reductions, After} = process_info(self(), reductions),
io:format("Reductions: ~p~n", [After - Before]).

20.3 fprof — 函数级 Profiling

%% Step 1: trace a call. fprof:apply/3 starts fprof if needed, runs
%% Module:Function(Args) under tracing and writes the trace to
%% "fprof.trace" in the current directory.
fprof:apply(my_module, my_function, [Args]).

%% ...or trace a fun. fprof:apply/2 takes the fun AND its argument list
%% (there is no fprof:apply/1).
fprof:apply(fun() ->
    lists:map(fun(X) -> X * 2 end, lists:seq(1, 10000))
end, []).

%% Step 2: turn the trace file into profile data. analyse has nothing to
%% work on until this has run.
fprof:profile().

%% Step 3: print the analysis
fprof:analyse().                              %% prints to standard output
fprof:analyse([{dest, "fprof.analysis"}]).    %% writes to a file

%% Stop the fprof server and discard its data
fprof:stop().

20.3.1 fprof 输出解读

%% 简化示意：实际输出为 Erlang 项式，ACC/OWN 的单位是毫秒而不是百分比
%% CNT      ACC     OWN     FUNCTION
%% 100000   85.0    15.0    lists:seq/2
%% 100000   60.0    40.0    'my_fun/1'
%% 1          5.0    5.0    my_module:start/0

CNT: 函数被调用次数
ACC: 累计时间（毫秒，包含被调用函数的时间）
OWN: 自身时间（毫秒，不包含被调用函数的时间）

20.4 eprof — 执行时间统计

%% Start the eprof server
eprof:start().

%% Begin profiling the listed processes (here: only the calling process)
eprof:start_profiling([self()]).

%% Run the code to be measured
lists:map(fun(X) -> X * 2 end, lists:seq(1, 100000)).

%% Stop collecting data, then print the per-function time statistics
eprof:stop_profiling().
eprof:analyze().

20.5 cprof — 调用计数

%% Start call counting for all functions in all modules
%% (cprof has the lowest overhead of the profilers)
cprof:start().

%% Run the code to be measured
lists:seq(1, 1000).

%% Freeze the counters, then collect the result.
%% cprof:analyse/0 returns {AllCallCount, ModAnalysisList}.
cprof:pause().
{AllCallCount, ModAnalysisList} = cprof:analyse().

%% Result for one module: {Module, ModCallCount, FuncAnalysisList}
cprof:analyse(my_module).

%% Remove the call-count breakpoints when done
cprof:stop().

20.6 内存分析

20.6.1 进程内存

%% Memory of the calling process, in bytes: {memory, Bytes}
process_info(self(), memory).

%% Memory allocated by the whole emulator, broken down by category (bytes).
%% This is a VM-wide summary, not a per-process list.
erlang:memory().

%% [{total, 12345678},
%%  {processes, 8000000},
%%  {processes_used, 7500000},
%%  {system, 4345678},
%%  {atom, 500000},
%%  {atom_used, 450000},
%%  {binary, 1000000},
%%  {ets, 500000},
%%  {code, 3000000}]

%% Several items for one process in a single call
process_info(self(), [memory, heap_size, stack_size, message_queue_len]).

%% Total bytes allocated by the emulator. erlang:system_info/1 has no
%% total_memory item (it raises badarg), so ask erlang:memory/1 instead.
%% For the host's physical memory use memsup:get_system_memory_data()
%% from the os_mon application.
erlang:memory(total).

20.6.2 内存泄漏排查

%% Returns up to 10 entries {Pid, {memory, Bytes}} for the processes using
%% the most memory, largest first.
%%
%% A process may exit between erlang:processes/0 and process_info/2, in
%% which case process_info/2 returns undefined. Those processes are
%% skipped; otherwise the sort fun would crash with function_clause.
top_processes() ->
    Procs = [{P, Info} || P <- erlang:processes(),
                          {memory, _} = Info <- [process_info(P, memory)]],
    Sorted = lists:sort(fun({_, {_, M1}}, {_, {_, M2}}) -> M1 >= M2 end,
                        Procs),
    lists:sublist(Sorted, 10).

%% Returns {Pid, QueueLength} for every process whose mailbox holds more
%% than 100 messages. Processes for which process_info/2 does not return
%% a {message_queue_len, N} tuple are left out.
check_mailboxes() ->
    Infos = [{Pid, process_info(Pid, message_queue_len)}
             || Pid <- erlang:processes()],
    lists:filtermap(
        fun({Pid, {_, QueueLen}}) when QueueLen > 100 ->
                {true, {Pid, QueueLen}};
           (_) ->
                false
        end,
        Infos).

20.6.3 garbage_collect

%% Trigger garbage collection manually (debugging aid)
erlang:garbage_collect().               %% the calling process
erlang:garbage_collect(Pid).           %% the process identified by Pid
erlang:garbage_collect(self(), [{type, major}]). %% request a major (full) collection

%% Per-process GC info. The list has no number_of_gcs key; the counter it
%% carries is minor_gcs. NOTE(review): the contents of this list are
%% documented as subject to change between releases.
{_, GcStats} = process_info(self(), garbage_collection).
io:format("Minor GC count: ~p~n", [proplists:get_value(minor_gcs, GcStats)]).

%% VM-wide GC totals: {NumberOfGCs, WordsReclaimed, 0}
{NumberOfGCs, _WordsReclaimed, _} = erlang:statistics(garbage_collection).

20.7 recon — 生产级诊断库

%% Add the dependency in rebar.config:
%% {deps, [{recon, "2.5.3"}]}.

%% Top 10 processes by memory
recon:proc_count(memory, 10).

%% Top 10 processes by message queue length
recon:proc_count(message_queue_len, 10).

%% Trace calls to my_module:my_function/1. This prints trace messages
%% rather than a counter; tracing stops after 10 messages, which keeps it
%% safe on a production node. The recon module has no call_count function.
recon_trace:calls({my_module, my_function, 1}, 10).
recon_trace:clear().    %% stop tracing early

%% Allocator-level memory figures (the recon module has no memory/0)
recon_alloc:memory(allocated).    %% bytes reserved from the OS
recon_alloc:memory(used).         %% bytes actually in use

%% Top 10 ports by bytes received
recon:inet_count(recv_oct, 10).

20.8 常见优化技巧

20.8.1 数据结构选择

场景	推荐	避免
Key-Value 查询	Map	元组列表
有序数据	`ordered_set` ETS	`lists:sort`
字符串拼接	IO List	`++` 操作
大文本	Binary	String (list)
高频读写	ETS	GenServer call
计数器	`ets:update_counter`	GenServer 状态

20.8.2 代码优化

%% ❌ Appending at the tail: `++` copies its whole left operand, so doing
%%    this once per element makes a loop O(n²)
Acc ++ [H].

%% ✅ Prepending with cons is O(1); call lists:reverse/1 once at the end
[H | Acc].

%% ❌ Repeated ++ builds a new flat list by copying the left-hand lists
Result = List1 ++ List2 ++ List3.

%% ✅ IO list: nest the parts instead of copying them.
%%    The result is a nested list, not a flat one, so it is not equal to
%%    the ++ version; use it only where iodata is accepted.
Result = [List1, List2, List3].

%% ❌ Body-recursive (not a tail call): the cons is built only after the
%%    recursive call returns, so one stack frame is kept per element.
%%    The [] clause ends the recursion; without it every call would fail
%%    with function_clause on reaching the end of the list.
map(_F, []) -> [];
map(F, [H|T]) -> [F(H) | map(F, T)].

%% ✅ Tail-recursive variant: results are consed onto an accumulator in
%%    reverse order and the accumulator is reversed once at the end.
map(Fun, List) ->
    map(Fun, List, []).

%% map/3 carries the reversed results collected so far.
map(Fun, [Head | Rest], Done) ->
    Mapped = Fun(Head),
    map(Fun, Rest, [Mapped | Done]);
map(_Fun, [], Done) ->
    lists:reverse(Done).

%% ❌ Concatenates lists with ++ first, then converts the result to a binary
%%    (here Data is a list)
Binary = list_to_binary("prefix" ++ Data ++ "suffix").

%% ✅ Build the binary directly.
%%    The Data/binary segment requires Data to already be a binary.
Binary = <<"prefix", Data/binary, "suffix">>.

20.8.3 ETS 优化

%% Read-heavy table: enable read_concurrency
ets:new(cache, [set, public, named_table, {read_concurrency, true}]).

%% Table with many concurrent writers: enable write_concurrency
ets:new(counters, [set, public, named_table, {write_concurrency, true}]).

%% compressed stores the table data in a more compact form to save memory
ets:new(big_table, [set, named_table, compressed]).

20.9 Benchmark 模板

%% benchmark.erl
-module(benchmark).
-export([run/3, compare/2]).

%% Runs Fun Times times and prints the average, minimum and maximum
%% duration in microseconds, prefixed with Label. Returns ok.
%%
%% Times must be at least 1: with no samples the average would divide by
%% zero and lists:min/1 would fail.
%% Label is printed with ~ts rather than ~s so that labels containing
%% characters above Latin-1 (e.g. Chinese text) do not make io:format/2
%% fail with badarg.
-spec run(fun(() -> term()), pos_integer(), string()) -> ok.
run(Fun, Times, Label) ->
    %% timer:tc/1 returns {Microseconds, Value}; only the time is kept.
    Results = [element(1, timer:tc(Fun)) || _ <- lists:seq(1, Times)],

    Avg = lists:sum(Results) / length(Results),
    Min = lists:min(Results),
    Max = lists:max(Results),

    io:format("[~ts] Avg: ~.2f μs, Min: ~p μs, Max: ~p μs~n",
              [Label, Avg, Min, Max]).

%% Times 10000 back-to-back calls of each fun and prints both totals in
%% microseconds together with the ratio Fun1 / Fun2. Returns ok.
compare(Fun1, Fun2) ->
    Iterations = 10000,
    Elapsed1 = elapsed_micros(Fun1, Iterations),
    Elapsed2 = elapsed_micros(Fun2, Iterations),
    io:format("Fun1: ~p μs, Fun2: ~p μs, Ratio: ~.2f~n",
              [Elapsed1, Elapsed2, Elapsed1 / Elapsed2]).

%% Total microseconds for Iterations calls of Fun. Building the index
%% list and collecting the results both happen inside the timed section.
elapsed_micros(Fun, Iterations) ->
    Batch = fun() ->
        lists:map(fun(_) -> Fun() end, lists:seq(1, Iterations))
    end,
    element(1, timer:tc(Batch)).

20.10 注意事项

⚠️ 性能陷阱

过早优化是万恶之源——先 profile 再优化
不要在生产环境使用 fprof（开销太大）
erlang:memory() 返回的是估算值
Binary 引用计数可能导致意外的内存占用（大 binary 只要仍被某个进程或子 binary 引用就无法释放）
timer 模块的定时器都经由同一个 timer 服务进程处理，本身有开销；大量定时器应改用 erlang:send_after/3 或 erlang:start_timer/3

💡 最佳实践

使用 timer:tc/3 做初步测量
生产环境使用 recon 库
监控进程消息队列长度
使用 Observer 图形化监控
优化热点代码，不优化冷路径

20.11 扩展阅读

📖 Efficiency Guide
📖 recon — 生产级诊断工具
📖 fprof module

上一章：19 - 发布与部署 下一章：21 - Docker 容器化