强曰为道
与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧.
文档目录

OCaml 教程 / 性能优化与 Benchmark

性能优化与 Benchmark

性能优化是 OCaml 的强项——原生编译、低 GC 压力、尾调用优化。本节介绍如何测量和优化 OCaml 程序的性能。

core_bench 基准测试

opam install core_bench
open Core
open Core_bench

(* 基本基准测试 *)
let bench_list_operations =
  Bench.Test.create ~name:"List.map" (fun () ->
    ignore (List.map ~f:(fun x -> x + 1) (List.init 1000 ~f:Fn.id))
  )

let bench_array_operations =
  Bench.Test.create ~name:"Array.map" (fun () ->
    ignore (Array.map ~f:(fun x -> x + 1) (Array.init 1000 ~f:Fn.id))
  )

let bench_seq_operations =
  Bench.Test.create ~name:"Seq.map" (fun () ->
    ignore (Seq.map (fun x -> x + 1) (Seq.init 1000 (fun i -> i)) |> List.of_seq)
  )

let () =
  Bench.make_command [
    bench_list_operations;
    bench_array_operations;
    bench_seq_operations;
  ] |> Command_unix.run
# 运行基准测试
dune exec ./bench.exe -- -quota 1 -ascii

# 输出示例
# Estimated testing time 3s (3 benchmarks x 1s). Change using -quota SECS.
#
# Name             Time/Run   mWd/Run   Percentage
# ---------------- ---------- --------- ------------
# List.map           12.34μs   3.00kw       98.23%
# Array.map           8.56μs   1.00kw       67.89%
# Seq.map            12.56μs   5.00kw      100.00%
指标说明
Time/Run每次执行平均时间
mWd/Run每次执行分配的 minor words
Percentage相对于最慢基准的百分比
Alloc/Run总分配量(字节)

性能分析工具

# gprof 分析
ocamlopt -p my_program.ml -o my_program
./my_program
gprof my_program gmon.out > analysis.txt

# perf 分析(Linux)
ocamlopt -g my_program.ml -o my_program
perf record ./my_program
perf report

# OCaml 内置统计
let () =
  Gc.set { (Gc.get ()) with Gc.verbose = 0x01 };
  (* ... 运行代码 ... *)
  Gc.print_stat stdout
(* 手动计时 *)
let time_it label f =
  let start = Unix.gettimeofday () in
  let result = f () in
  let elapsed = Unix.gettimeofday () -. start in
  Printf.printf "%s: %.6f seconds\n" label elapsed;
  result

let () =
  time_it "排序" (fun () ->
    let arr = Array.init 1000000 (fun _ -> Random.int 1000000) in
    Array.sort compare arr
  )

💡 提示:先测量,再优化。不要猜测瓶颈在哪里——让数据说话。

内联优化

(* 使用 [@inline] 提示编译器 *)
let[@inline] add a b = a + b
let[@inline] mul a b = a *. b

(* 高阶函数内联 *)
let[@inline] apply f x = f x

(* 热点函数标记 *)
let[@inline always] hot_function x =
  (* 编译器会尽可能内联此函数 *)
  x * 2 + 1

let[@inline never] cold_function x =
  (* 此函数不会被内联 *)
  Printf.printf "debug: %d\n" x

(* 模块签名中暴露内联 *)
module type MATH = sig
  val add : int -> int -> int [@@inline]
  val mul : int -> int -> int [@@inline]
end
属性说明使用场景
[@inline]建议内联小型热点函数
[@inline always]强制内联极热点函数
[@inline never]禁止内联调试函数、冷路径
[@unboxed]拆箱避免装箱开销
[@unrolled n]展开循环固定次数循环

尾调用优化

(* 非尾递归 — 栈溢出 *)
let rec sum_bad acc = function
  | [] -> acc
  | x :: xs -> sum_bad (acc + x) xs  (* 不是尾调用 *)

(* 尾递归版本 *)
let rec sum_good acc = function
  | [] -> acc
  | x :: xs -> sum_good (acc + x) xs  (* 尾调用 *)

(* 使用 [@tail_mod_cons] OCaml 5.1+ *)
let[@tail_mod_cons] rec map f = function
  | [] -> []
  | x :: xs -> f x :: map f xs

(* 复杂尾递归:树遍历 *)
type tree = Leaf | Node of tree * int * tree

let sum_tree t =
  let rec aux acc = function
    | [] -> acc
    | Leaf :: rest -> aux acc rest
    | Node (l, v, r) :: rest -> aux (acc + v) (l :: r :: rest)
  in
  aux 0 [t]

(* CPS 变换实现尾调用 *)
let rec map_cps f xs k =
  match xs with
  | [] -> k []
  | x :: xs' ->
    map_cps f xs' (fun ys ->
      k (f x :: ys))

let map f xs = map_cps f xs Fun.id

⚠️ 注意:OCaml 的尾调用优化仅对直接尾递归有效。matchiftry 的尾位置中的递归调用会被优化。

内存分配优化

(* 避免不必要的分配 *)

(* 不好:每次创建新列表 *)
let filter_bad pred xs =
  List.fold_left (fun acc x ->
    if pred x then x :: acc else acc
  ) [] xs
  |> List.rev

(* 好:使用 Seq 延迟计算 *)
let filter_good pred xs =
  Seq.filter pred (List.to_seq xs)

(* 使用 Buffer 避免字符串拼接 *)
let concat_strings_bad strings =
  String.concat "" strings  (* 多次分配 *)

let concat_strings_good strings =
  let buf = Buffer.create 256 in
  List.iter (Buffer.add_string buf) strings;
  Buffer.contents buf  (* 单次分配 *)

(* 使用 Array 替代 List *)
let sum_array arr =
  let sum = ref 0 in
  Array.iter (fun x -> sum := !sum + x) arr;
  !sum

(* 复用缓冲区 *)
let process_items items =
  let buf = Buffer.create 1024 in
  List.iter (fun item ->
    Buffer.clear buf;  (* 重用缓冲区 *)
    Buffer.add_string buf item;
    (* ... 处理 ... *)
  ) items

(* 避免闭包捕获大对象 *)
let process data =
  let len = Array.length data in  (* 提取需要的值 *)
  let rec aux i acc =
    if i >= len then acc
    else aux (i + 1) (acc + data.(i))
  in
  aux 0 0
技巧说明效果
Buffer字符串拼接避免 O(n²) 分配
Array 替代 List随机访问减少内存开销
Seq延迟计算避免中间列表
复用缓冲区重用 Buffer.t减少 GC 压力
[@unboxed]拆箱类型避免装箱开销

数据结构选择

(* 不同数据结构的性能特征 *)

(* List — 有序集合,头部操作 O(1) *)
let list_ops =
  let xs = List.init 10000 Fun.id in
  let xs' = 0 :: xs in        (* O(1) *)
  let _ = List.nth xs 9999 in  (* O(n) *)
  List.length xs               (* O(n) *)

(* Array — 随机访问 O(1) *)
let array_ops =
  let arr = Array.init 10000 Fun.id in
  arr.(5000) <- 42;            (* O(1) *)
  let _ = arr.(9999) in        (* O(1) *)
  Array.sort compare arr;      (* O(n log n) *)
  Array.length arr             (* O(1) *)

(* Hashtbl — 哈希表 O(1) 平均 *)
let hashtbl_ops =
  let tbl = Hashtbl.create 10000 in
  Hashtbl.add tbl "key" 42;          (* O(1) *)
  Hashtbl.find_opt tbl "key";        (* O(1) *)
  Hashtbl.remove tbl "key";          (* O(1) *)
  Hashtbl.length tbl                 (* O(1) *)

(* Map (平衡树) — O(log n) *)
let map_ops =
  let m = ref Map.empty in
  m := Map.add "key" 42 !m;          (* O(log n) *)
  Map.find_opt "key" !m;             (* O(log n) *)
  m := Map.remove "key" !m;          (* O(log n) *)
  Map.cardinal !m                    (* O(n) *)

(* Set — O(log n) *)
let set_ops =
  let s = ref Set.empty in
  for i = 0 to 9999 do
    s := Set.add i !s
  done;
  Set.mem 5000 !s;                   (* O(log n) *)
  Set.cardinal !s                    (* O(n) *)
数据结构查找插入删除遍历适用场景
ListO(n)O(1)O(n)O(n)小集合、栈
ArrayO(1)O(n)O(n)O(n)随机访问、数值计算
HashtblO(1)*O(1)*O(1)*O(n)键值存储
MapO(log n)O(log n)O(log n)O(n)有序键值
SetO(log n)O(log n)O(log n)O(n)集合运算

Gc 调优

(* 查看 GC 统计 *)
let print_gc_stats () =
  let stat = Gc.stat () in
  Printf.printf "Minor collections: %d\n" stat.Gc.minor_collections;
  Printf.printf "Major collections: %d\n" stat.Gc.major_collections;
  Printf.printf "Minor words: %d\n" stat.Gc.minor_words;
  Printf.printf "Major words: %d\n" stat.Gc.major_words;
  Printf.printf "Heap words: %d\n" stat.Gc.heap_words;
  Printf.printf "Live words: %d\n" stat.Gc.live_words;
  Printf.printf "Free words: %d\n" stat.Gc.free_words

(* GC 调优 *)
let tune_gc () =
  Gc.set { (Gc.get ()) with
    Gc.minor_heap_size = 256 * 1024;  (* 256K words *)
    Gc.major_heap_increment = 1024 * 1024;  (* 1M words *)
    Gc.space_overhead = 120;  (* 默认 80 *)
    Gc.max_overhead = 500;
    Gc.stack_limit = 1024 * 1024;  (* 1M words *)
  }

(* 手动 GC 控制 *)
let with_gc_control f =
  let old_settings = Gc.get () in
  Gc.set { old_settings with Gc.space_overhead = 200 };
  Fun.protect ~finally:(fun () -> Gc.set old_settings) f

(* 避免触发 major GC *)
let avoid_major_gc () =
  (* 减少分配可以避免触发 major GC *)
  let arr = Array.make 1000 0 in
  for i = 0 to 999 do
    arr.(i) <- i * 2  (* 不分配新内存 *)
  done;
  arr

(* 批量处理 *)
let process_large_dataset data =
  let chunk_size = 10000 in
  let total = Array.length data in
  for i = 0 to total - 1 do
    data.(i) <- transform data.(i);
    if i mod chunk_size = 0 then
      Gc.minor ()  (* 强制 minor GC *)
  done

💡 提示:GC 调优应在实际负载下进行。先用默认设置运行,如果 GC 成为瓶颈再调优。

热点代码识别

(* 使用 flambda 优化 *)
(* dune 文件 *)
(*
(executable
 (name my_program)
 (ocamlopt_flags (:standard -O3))
 (libraries ...))
*)

(* 编译标志 *)
(* -O0: 无优化(调试) *)
(* -O1: 基本优化 *)
(* -O2: 标准优化 *)
(* -O3: 激进优化 *)

(* 手动优化识别 *)
let profile_function name f =
  let iterations = 1000 in
  let start = Unix.gettimeofday () in
  for _ = 1 to iterations do
    ignore (f ())
  done;
  let elapsed = Unix.gettimeofday () -. start in
  Printf.printf "%s: %.6f us/call\n" name (elapsed /. float iterations *. 1e6)

let () =
  profile_function "快速路径" fast_path;
  profile_function "慢速路径" slow_path

实际优化案例

序列化优化

(* 优化前:使用 Yojson *)
let serialize_before users =
  List.map user_to_yojson users
  |> (fun l -> `List l)
  |> Yojson.Safe.to_string

(* 优化后:使用 Buffer 手动构建 *)
let serialize_after users =
  let buf = Buffer.create 4096 in
  Buffer.add_char buf '[';
  List.iteri (fun i user ->
    if i > 0 then Buffer.add_char buf ',';
    Buffer.add_char buf '{';
    Buffer.add_string buf "\"id\":";
    Buffer.add_string buf (string_of_int user.id);
    Buffer.add_string buf ",\"name\":\"";
    Buffer.add_string buf user.name;
    Buffer.add_string buf "\"";
    Buffer.add_char buf '}';
  ) users;
  Buffer.add_char buf ']';
  Buffer.contents buf

JSON 解析优化

(* 优化前:完整解析 *)
let parse_full json_str =
  let json = Yojson.Safe.from_string json_str in
  Yojson.Util.(json |> member "data" |> to_list |> List.map (member "id" |> to_int))

(* 优化后:只解析需要的字段 *)
let parse_partial json_str =
  (* 手动扫描 JSON 找到 "data" 数组中的 "id" 字段 *)
  let len = String.length json_str in
  let ids = ref [] in
  let i = ref 0 in
  (* ... 简化的流式解析 ... *)
  List.rev !ids

IO 优化

(* 逐行读取 *)
let read_lines_slow filename =
  let ic = open_in filename in
  let lines = ref [] in
  (try while true do
    lines := input_line ic :: !lines
  done with End_of_file -> ());
  close_in ic;
  List.rev !lines

(* 批量读取 *)
let read_file_fast filename =
  let ic = open_in_bin filename in
  let size = in_channel_length ic in
  let content = really_input_string ic size in
  close_in ic;
  content

(* 缓冲写入 *)
let write_lines_fast filename lines =
  let oc = open_out filename in
  let buf = Buffer.create 4096 in
  List.iter (fun line ->
    Buffer.add_string buf line;
    Buffer.add_char buf '\n';
    if Buffer.length buf > 4096 then begin
      output_string oc (Buffer.contents buf);
      Buffer.clear buf
    end
  ) lines;
  if Buffer.length buf > 0 then
    output_string oc (Buffer.contents buf);
  close_out oc
优化点原始优化后提升
序列化YojsonBuffer 手动~3x
文件读取逐行批量~5x
字符串拼接^ 连接Buffer~10x
列表操作递归尾递归栈安全

扩展阅读


上一节测试框架 Alcotest/OUnit 下一节分布式系统基础