OCaml 教程 / 性能优化与 Benchmark
性能优化与 Benchmark
性能优化是 OCaml 的强项——原生编译、低 GC 压力、尾调用优化。本节介绍如何测量和优化 OCaml 程序的性能。
core_bench 基准测试
opam install core_bench
open Core
open Core_bench
(* 基本基准测试 *)
let bench_list_operations =
Bench.Test.create ~name:"List.map" (fun () ->
ignore (List.map ~f:(fun x -> x + 1) (List.init 1000 ~f:Fn.id))
)
let bench_array_operations =
Bench.Test.create ~name:"Array.map" (fun () ->
ignore (Array.map ~f:(fun x -> x + 1) (Array.init 1000 ~f:Fn.id))
)
let bench_seq_operations =
Bench.Test.create ~name:"Seq.map" (fun () ->
ignore (Seq.map (fun x -> x + 1) (Seq.init 1000 (fun i -> i)) |> List.of_seq)
)
let () =
Bench.make_command [
bench_list_operations;
bench_array_operations;
bench_seq_operations;
] |> Command_unix.run
# 运行基准测试
dune exec ./bench.exe -- -quota 1 -ascii
# 输出示例
# Estimated testing time 3s (3 benchmarks x 1s). Change using -quota SECS.
#
# Name Time/Run mWd/Run Percentage
# ---------------- ---------- --------- ------------
# List.map 12.34μs 3.00kw 98.23%
# Array.map 8.56μs 1.00kw 67.89%
# Seq.map 12.56μs 5.00kw 100.00%
| 指标 | 说明 |
|---|---|
| Time/Run | 每次执行平均时间 |
| mWd/Run | 每次执行分配的 minor words |
| Percentage | 相对于最慢基准的百分比 |
| Alloc/Run | 总分配量(字节) |
性能分析工具
# gprof 分析
ocamlopt -p my_program.ml -o my_program
./my_program
gprof my_program gmon.out > analysis.txt
# perf 分析(Linux)
ocamlopt -g my_program.ml -o my_program
perf record ./my_program
perf report
# OCaml 内置统计
let () =
Gc.set { (Gc.get ()) with Gc.verbose = 0x01 };
(* ... 运行代码 ... *)
Gc.print_stat stdout
(* 手动计时 *)
let time_it label f =
let start = Unix.gettimeofday () in
let result = f () in
let elapsed = Unix.gettimeofday () -. start in
Printf.printf "%s: %.6f seconds\n" label elapsed;
result
let () =
time_it "排序" (fun () ->
let arr = Array.init 1000000 (fun _ -> Random.int 1000000) in
Array.sort compare arr
)
💡 提示:先测量,再优化。不要猜测瓶颈在哪里——让数据说话。
内联优化
(* 使用 [@inline] 提示编译器 *)
let[@inline] add a b = a + b
let[@inline] mul a b = a *. b
(* 高阶函数内联 *)
let[@inline] apply f x = f x
(* 热点函数标记 *)
let[@inline always] hot_function x =
(* 编译器会尽可能内联此函数 *)
x * 2 + 1
let[@inline never] cold_function x =
(* 此函数不会被内联 *)
Printf.printf "debug: %d\n" x
(* 模块签名中暴露内联 *)
module type MATH = sig
val add : int -> int -> int [@@inline]
val mul : int -> int -> int [@@inline]
end
| 属性 | 说明 | 使用场景 |
|---|---|---|
[@inline] | 建议内联 | 小型热点函数 |
[@inline always] | 强制内联 | 极热点函数 |
[@inline never] | 禁止内联 | 调试函数、冷路径 |
[@unboxed] | 拆箱 | 避免装箱开销 |
[@unrolled n] | 展开循环 | 固定次数循环 |
尾调用优化
(* 非尾递归 — 栈溢出 *)
let rec sum_bad acc = function
| [] -> acc
| x :: xs -> sum_bad (acc + x) xs (* 不是尾调用 *)
(* 尾递归版本 *)
let rec sum_good acc = function
| [] -> acc
| x :: xs -> sum_good (acc + x) xs (* 尾调用 *)
(* 使用 [@tail_mod_cons] OCaml 5.1+ *)
let[@tail_mod_cons] rec map f = function
| [] -> []
| x :: xs -> f x :: map f xs
(* 复杂尾递归:树遍历 *)
type tree = Leaf | Node of tree * int * tree
let sum_tree t =
let rec aux acc = function
| [] -> acc
| Leaf :: rest -> aux acc rest
| Node (l, v, r) :: rest -> aux (acc + v) (l :: r :: rest)
in
aux 0 [t]
(* CPS 变换实现尾调用 *)
let rec map_cps f xs k =
match xs with
| [] -> k []
| x :: xs' ->
map_cps f xs' (fun ys ->
k (f x :: ys))
let map f xs = map_cps f xs Fun.id
⚠️ 注意:OCaml 的尾调用优化仅对直接尾递归有效。match、if、try 的尾位置中的递归调用会被优化。
内存分配优化
(* 避免不必要的分配 *)
(* 不好:每次创建新列表 *)
let filter_bad pred xs =
List.fold_left (fun acc x ->
if pred x then x :: acc else acc
) [] xs
|> List.rev
(* 好:使用 Seq 延迟计算 *)
let filter_good pred xs =
Seq.filter pred (List.to_seq xs)
(* 使用 Buffer 避免字符串拼接 *)
let concat_strings_bad strings =
String.concat "" strings (* 多次分配 *)
let concat_strings_good strings =
let buf = Buffer.create 256 in
List.iter (Buffer.add_string buf) strings;
Buffer.contents buf (* 单次分配 *)
(* 使用 Array 替代 List *)
let sum_array arr =
let sum = ref 0 in
Array.iter (fun x -> sum := !sum + x) arr;
!sum
(* 复用缓冲区 *)
let process_items items =
let buf = Buffer.create 1024 in
List.iter (fun item ->
Buffer.clear buf; (* 重用缓冲区 *)
Buffer.add_string buf item;
(* ... 处理 ... *)
) items
(* 避免闭包捕获大对象 *)
let process data =
let len = Array.length data in (* 提取需要的值 *)
let rec aux i acc =
if i >= len then acc
else aux (i + 1) (acc + data.(i))
in
aux 0 0
| 技巧 | 说明 | 效果 |
|---|---|---|
Buffer | 字符串拼接 | 避免 O(n²) 分配 |
Array 替代 List | 随机访问 | 减少内存开销 |
Seq | 延迟计算 | 避免中间列表 |
| 复用缓冲区 | 重用 Buffer.t | 减少 GC 压力 |
[@unboxed] | 拆箱类型 | 避免装箱开销 |
数据结构选择
(* 不同数据结构的性能特征 *)
(* List — 有序集合,头部操作 O(1) *)
let list_ops =
let xs = List.init 10000 Fun.id in
let xs' = 0 :: xs in (* O(1) *)
let _ = List.nth xs 9999 in (* O(n) *)
List.length xs (* O(n) *)
(* Array — 随机访问 O(1) *)
let array_ops =
let arr = Array.init 10000 Fun.id in
arr.(5000) <- 42; (* O(1) *)
let _ = arr.(9999) in (* O(1) *)
Array.sort compare arr; (* O(n log n) *)
Array.length arr (* O(1) *)
(* Hashtbl — 哈希表 O(1) 平均 *)
let hashtbl_ops =
let tbl = Hashtbl.create 10000 in
Hashtbl.add tbl "key" 42; (* O(1) *)
Hashtbl.find_opt tbl "key"; (* O(1) *)
Hashtbl.remove tbl "key"; (* O(1) *)
Hashtbl.length tbl (* O(1) *)
(* Map (平衡树) — O(log n) *)
let map_ops =
let m = ref Map.empty in
m := Map.add "key" 42 !m; (* O(log n) *)
Map.find_opt "key" !m; (* O(log n) *)
m := Map.remove "key" !m; (* O(log n) *)
Map.cardinal !m (* O(n) *)
(* Set — O(log n) *)
let set_ops =
let s = ref Set.empty in
for i = 0 to 9999 do
s := Set.add i !s
done;
Set.mem 5000 !s; (* O(log n) *)
Set.cardinal !s (* O(n) *)
| 数据结构 | 查找 | 插入 | 删除 | 遍历 | 适用场景 |
|---|---|---|---|---|---|
| List | O(n) | O(1) | O(n) | O(n) | 小集合、栈 |
| Array | O(1) | O(n) | O(n) | O(n) | 随机访问、数值计算 |
| Hashtbl | O(1)* | O(1)* | O(1)* | O(n) | 键值存储 |
| Map | O(log n) | O(log n) | O(log n) | O(n) | 有序键值 |
| Set | O(log n) | O(log n) | O(log n) | O(n) | 集合运算 |
Gc 调优
(* 查看 GC 统计 *)
let print_gc_stats () =
let stat = Gc.stat () in
Printf.printf "Minor collections: %d\n" stat.Gc.minor_collections;
Printf.printf "Major collections: %d\n" stat.Gc.major_collections;
Printf.printf "Minor words: %d\n" stat.Gc.minor_words;
Printf.printf "Major words: %d\n" stat.Gc.major_words;
Printf.printf "Heap words: %d\n" stat.Gc.heap_words;
Printf.printf "Live words: %d\n" stat.Gc.live_words;
Printf.printf "Free words: %d\n" stat.Gc.free_words
(* GC 调优 *)
let tune_gc () =
Gc.set { (Gc.get ()) with
Gc.minor_heap_size = 256 * 1024; (* 256K words *)
Gc.major_heap_increment = 1024 * 1024; (* 1M words *)
Gc.space_overhead = 120; (* 默认 80 *)
Gc.max_overhead = 500;
Gc.stack_limit = 1024 * 1024; (* 1M words *)
}
(* 手动 GC 控制 *)
let with_gc_control f =
let old_settings = Gc.get () in
Gc.set { old_settings with Gc.space_overhead = 200 };
Fun.protect ~finally:(fun () -> Gc.set old_settings) f
(* 避免触发 major GC *)
let avoid_major_gc () =
(* 减少分配可以避免触发 major GC *)
let arr = Array.make 1000 0 in
for i = 0 to 999 do
arr.(i) <- i * 2 (* 不分配新内存 *)
done;
arr
(* 批量处理 *)
let process_large_dataset data =
let chunk_size = 10000 in
let total = Array.length data in
for i = 0 to total - 1 do
data.(i) <- transform data.(i);
if i mod chunk_size = 0 then
Gc.minor () (* 强制 minor GC *)
done
💡 提示:GC 调优应在实际负载下进行。先用默认设置运行,如果 GC 成为瓶颈再调优。
热点代码识别
(* 使用 flambda 优化 *)
(* dune 文件 *)
(*
(executable
(name my_program)
(ocamlopt_flags (:standard -O3))
(libraries ...))
*)
(* 编译标志 *)
(* -O0: 无优化(调试) *)
(* -O1: 基本优化 *)
(* -O2: 标准优化 *)
(* -O3: 激进优化 *)
(* 手动优化识别 *)
let profile_function name f =
let iterations = 1000 in
let start = Unix.gettimeofday () in
for _ = 1 to iterations do
ignore (f ())
done;
let elapsed = Unix.gettimeofday () -. start in
Printf.printf "%s: %.6f us/call\n" name (elapsed /. float iterations *. 1e6)
let () =
profile_function "快速路径" fast_path;
profile_function "慢速路径" slow_path
实际优化案例
序列化优化
(* 优化前:使用 Yojson *)
let serialize_before users =
List.map user_to_yojson users
|> (fun l -> `List l)
|> Yojson.Safe.to_string
(* 优化后:使用 Buffer 手动构建 *)
let serialize_after users =
let buf = Buffer.create 4096 in
Buffer.add_char buf '[';
List.iteri (fun i user ->
if i > 0 then Buffer.add_char buf ',';
Buffer.add_char buf '{';
Buffer.add_string buf "\"id\":";
Buffer.add_string buf (string_of_int user.id);
Buffer.add_string buf ",\"name\":\"";
Buffer.add_string buf user.name;
Buffer.add_string buf "\"";
Buffer.add_char buf '}';
) users;
Buffer.add_char buf ']';
Buffer.contents buf
JSON 解析优化
(* 优化前:完整解析 *)
let parse_full json_str =
let json = Yojson.Safe.from_string json_str in
Yojson.Util.(json |> member "data" |> to_list |> List.map (member "id" |> to_int))
(* 优化后:只解析需要的字段 *)
let parse_partial json_str =
(* 手动扫描 JSON 找到 "data" 数组中的 "id" 字段 *)
let len = String.length json_str in
let ids = ref [] in
let i = ref 0 in
(* ... 简化的流式解析 ... *)
List.rev !ids
IO 优化
(* 逐行读取 *)
let read_lines_slow filename =
let ic = open_in filename in
let lines = ref [] in
(try while true do
lines := input_line ic :: !lines
done with End_of_file -> ());
close_in ic;
List.rev !lines
(* 批量读取 *)
let read_file_fast filename =
let ic = open_in_bin filename in
let size = in_channel_length ic in
let content = really_input_string ic size in
close_in ic;
content
(* 缓冲写入 *)
let write_lines_fast filename lines =
let oc = open_out filename in
let buf = Buffer.create 4096 in
List.iter (fun line ->
Buffer.add_string buf line;
Buffer.add_char buf '\n';
if Buffer.length buf > 4096 then begin
output_string oc (Buffer.contents buf);
Buffer.clear buf
end
) lines;
if Buffer.length buf > 0 then
output_string oc (Buffer.contents buf);
close_out oc
| 优化点 | 原始 | 优化后 | 提升 |
|---|---|---|---|
| 序列化 | Yojson | Buffer 手动 | ~3x |
| 文件读取 | 逐行 | 批量 | ~5x |
| 字符串拼接 | ^ 连接 | Buffer | ~10x |
| 列表操作 | 递归 | 尾递归 | 栈安全 |
扩展阅读
上一节:测试框架 Alcotest/OUnit 下一节:分布式系统基础