OCaml 教程 / 编译器前端实践

编译器前端实践

编译器前端负责将源代码转换为中间表示。本节介绍词法分析、语法分析、AST 设计和类型检查的实践。

词法分析（ocamllex）

ocamllex 是 OCaml 的词法分析器生成器。

# 安装（通常随 OCaml 一起安装）
# ocamllex 生成 .ml 文件

(* lexer.mll — 词法分析器定义 *)
{
  open Parser  (* 引用语法分析器的 token 定义 *)
}

let digit = ['0'-'9']
let alpha = ['a'-'z' 'A'-'Z']
let ident = alpha (alpha | digit | '_')*
let whitespace = [' ' '\t' '\n' '\r']

rule token = parse
  | whitespace+ { token lexbuf }  (* 跳过空白 *)
  | "//" [^ '\n']* { token lexbuf }  (* 单行注释 *)
  | "/*" { comment 0 lexbuf }  (* 多行注释 *)
  | '\n' { Lexing.new_line lexbuf; token lexbuf }
  | digit+ as num { NUM (float_of_string num) }
  | ident as id {
    match id with
    | "let" -> LET
    | "in" -> IN
    | "if" -> IF
    | "then" -> THEN
    | "else" -> ELSE
    | "fun" -> FUN
    | "true" -> TRUE
    | "false" -> FALSE
    | "and" -> AND
    | "or" -> OR
    | "not" -> NOT
    | _ -> IDENT id
  }
  | '+' { PLUS }
  | '-' { MINUS }
  | '*' { STAR }
  | '/' { SLASH }
  | '=' { EQ }
  | '<' { LT }
  | '>' { GT }
  | "<=" { LE }
  | ">=" { GE }
  | "!=" { NEQ }
  | '(' { LPAREN }
  | ')' { RPAREN }
  | "->" { ARROW }
  | ',' { COMMA }
  | eof { EOF }
  | _ as c { failwith (Printf.sprintf "意外字符: %c" c) }

and comment depth = parse
  | "*/" { if depth = 0 then token lexbuf else comment (depth - 1) lexbuf }
  | "/*" { comment (depth + 1) lexbuf }
  | '\n' { Lexing.new_line lexbuf; comment depth lexbuf }
  | eof { failwith "注释未闭合" }
  | _ { comment depth lexbuf }

规则	说明
`token`	主词法规则
`comment`	处理嵌套注释
`as num`	绑定匹配的文本
`Lexing.new_line`	更新行号（用于错误报告）

⚠️ 注意：ocamllex 规则按顺序匹配，第一个匹配的规则优先。将关键字规则放在标识符规则之前。

语法分析（menhir）

Menhir 是 OCaml 的 LR(1) 语法分析器生成器。

opam install menhir

(* parser.mly — 语法分析器定义 *)
%{
  open Ast
%}

%token <float> NUM
%token <string> IDENT
%token LET IN IF THEN ELSE FUN
%token TRUE FALSE AND OR NOT
%token PLUS MINUS STAR SLASH
%token EQ LT GT LE GE NEQ
%token LPAREN RPAREN ARROW COMMA
%token EOF

%left OR
%left AND
%left EQ LT GT LE GE NEQ
%left PLUS MINUS
%left STAR SLASH
%nonassoc UNARY

%start <expr> program

%%

program:
  | e = expr; EOF { e }

expr:
  | e = let_expr { e }
  | e = if_expr { e }
  | e = fun_expr { e }
  | e = or_expr { e }

let_expr:
  | LET; id = IDENT; EQ; value = expr; IN; body = expr
    { ELet (id, value, body) }
  | LET; REC; id = IDENT; EQ; value = expr; IN; body = expr
    { ELetRec (id, value, body) }

if_expr:
  | IF; cond = expr; THEN; then_e = expr; ELSE; else_e = expr
    { EIf (cond, then_e, else_e) }

fun_expr:
  | FUN; params = nonempty_list(IDENT); ARROW; body = expr
    { EFun (params, body) }

or_expr:
  | left = and_expr; OR; right = or_expr { EOr (left, right) }
  | e = and_expr { e }

and_expr:
  | left = comp_expr; AND; right = and_expr { EAnd (left, right) }
  | e = comp_expr { e }

comp_expr:
  | left = add_expr; op = comp_op; right = add_expr { op left right }
  | e = add_expr { e }

%inline comp_op:
  | EQ { fun l r -> EEq (l, r) }
  | LT { fun l r -> ELt (l, r) }
  | GT { fun l r -> EGt (l, r) }
  | LE { fun l r -> EAnd (ELt (l, r), EEq (l, r)) }
  | GE { fun l r -> EOr (EGt (l, r), EEq (l, r)) }
  | NEQ { fun l r -> ENot (EEq (l, r)) }

add_expr:
  | left = add_expr; PLUS; right = mul_expr { EAdd (left, right) }
  | left = add_expr; MINUS; right = mul_expr { ESub (left, right) }
  | e = mul_expr { e }

mul_expr:
  | left = mul_expr; STAR; right = unary_expr { EMul (left, right) }
  | left = mul_expr; SLASH; right = unary_expr { EDiv (left, right) }
  | e = unary_expr { e }

unary_expr:
  | MINUS; e = unary_expr %prec UNARY { ENeg e }
  | NOT; e = unary_expr %prec UNARY { ENot e }
  | e = app_expr { e }

app_expr:
  | func = primary_expr; LPAREN; args = separated_list(COMMA, expr); RPAREN
    { EApp (func, args) }
  | e = primary_expr { e }

primary_expr:
  | n = NUM { ENum n }
  | TRUE { EBool true }
  | FALSE { EBool false }
  | id = IDENT { EVar id }
  | LPAREN; e = expr; RPAREN { e }

优先级	运算符	结合性
最高	`not`, `-`(一元)	右
	`*`, `/`	左
	`+`, `-`	左
	`=`, `<`, `>`	无
	`and`	左
最低	`or`	左

💡 提示：%left、%right、%nonassoc 声明运算符的优先级和结合性。越早声明优先级越低。

AST 设计

(* ast.ml — 抽象语法树定义 *)

(* 表达式 *)
type expr =
  | ENum of float
  | EBool of bool
  | EVar of string
  | EAdd of expr * expr
  | ESub of expr * expr
  | EMul of expr * expr
  | EDiv of expr * expr
  | EEq of expr * expr
  | ELt of expr * expr
  | EGt of expr * expr
  | EAnd of expr * expr
  | EOr of expr * expr
  | ENeg of expr
  | ENot of expr
  | EIf of expr * expr * expr
  | ELet of string * expr * expr
  | ELetRec of string * expr * expr
  | EFun of string list * expr
  | EApp of expr * expr list

(* 类型 *)
type typ =
  | TFloat
  | TBool
  | TArrow of typ list * typ
  | TVar of string

(* 类型环境 *)
type type_env = (string * typ) list

(* AST 辅助函数 *)
let rec expr_to_string = function
  | ENum n -> Printf.sprintf "%.2f" n
  | EBool b -> string_of_bool b
  | EVar id -> id
  | EAdd (l, r) -> Printf.sprintf "(%s + %s)" (expr_to_string l) (expr_to_string r)
  | ESub (l, r) -> Printf.sprintf "(%s - %s)" (expr_to_string l) (expr_to_string r)
  | EMul (l, r) -> Printf.sprintf "(%s * %s)" (expr_to_string l) (expr_to_string r)
  | EDiv (l, r) -> Printf.sprintf "(%s / %s)" (expr_to_string l) (expr_to_string r)
  | EEq (l, r) -> Printf.sprintf "(%s = %s)" (expr_to_string l) (expr_to_string r)
  | ELt (l, r) -> Printf.sprintf "(%s < %s)" (expr_to_string l) (expr_to_string r)
  | EGt (l, r) -> Printf.sprintf "(%s > %s)" (expr_to_string l) (expr_to_string r)
  | EAnd (l, r) -> Printf.sprintf "(%s and %s)" (expr_to_string l) (expr_to_string r)
  | EOr (l, r) -> Printf.sprintf "(%s or %s)" (expr_to_string l) (expr_to_string r)
  | ENeg e -> Printf.sprintf "(-%s)" (expr_to_string e)
  | ENot e -> Printf.sprintf "(not %s)" (expr_to_string e)
  | EIf (c, t, e) -> Printf.sprintf "if %s then %s else %s" (expr_to_string c) (expr_to_string t) (expr_to_string e)
  | ELet (id, v, b) -> Printf.sprintf "let %s = %s in %s" id (expr_to_string v) (expr_to_string b)
  | ELetRec (id, v, b) -> Printf.sprintf "let rec %s = %s in %s" id (expr_to_string v) (expr_to_string b)
  | EFun (ps, b) -> Printf.sprintf "fun %s -> %s" (String.concat " " ps) (expr_to_string b)
  | EApp (f, args) -> Printf.sprintf "%s(%s)" (expr_to_string f) (String.concat ", " (List.map expr_to_string args))

(* 位置信息 *)
type location = {
  file: string;
  line: int;
  col_start: int;
  col_end: int;
}

type 'a located = {
  value: 'a;
  loc: location;
}

(* 带位置的 AST *)
type expr_loc = expr located

AST 节点	说明	示例
`ENum`	数字字面量	`42`
`EVar`	变量引用	`x`
`ELet`	let 绑定	`let x = 1 in x`
`EFun`	lambda	`fun x -> x`
`EApp`	函数应用	`f(x)`
`EIf`	条件	`if x > 0 then x else -x`

类型检查器实现

(* typecheck.ml *)

open Ast

exception TypeError of string

let error msg = raise (TypeError msg)

let rec type_of env = function
  | ENum _ -> TFloat
  | EBool _ -> TBool
  | EVar id ->
    (try List.assoc id env
     with Not_found -> error (Printf.sprintf "未绑定变量: %s" id))
  | EAdd (l, r) | ESub (l, r) | EMul (l, r) | EDiv (l, r) ->
    expect_type env TFloat l;
    expect_type env TFloat r;
    TFloat
  | EEq (l, r) ->
    let t = type_of env l in
    expect_type env t r;
    TBool
  | ELt (l, r) | EGt (l, r) ->
    expect_type env TFloat l;
    expect_type env TFloat r;
    TBool
  | EAnd (l, r) | EOr (l, r) ->
    expect_type env TBool l;
    expect_type env TBool r;
    TBool
  | ENeg e ->
    expect_type env TFloat e;
    TFloat
  | ENot e ->
    expect_type env TBool e;
    TBool
  | EIf (cond, then_e, else_e) ->
    expect_type env TBool cond;
    let t = type_of env then_e in
    expect_type env t else_e;
    t
  | ELet (id, value, body) ->
    let t = type_of env value in
    type_of ((id, t) :: env) body
  | ELetRec (id, value, body) ->
    let t = TVar "'a" in
    let env' = (id, t) :: env in
    let t' = type_of env' value in
    type_of ((id, t') :: env) body
  | EFun (params, body) ->
    let param_types = List.map (fun _ -> TVar "'a") params in
    let env' = List.fold_left2 (fun e p t -> (p, t) :: e) env params param_types in
    let body_type = type_of env' body in
    TArrow (param_types, body_type)
  | EApp (func, args) ->
    (match type_of env func with
     | TArrow (param_types, return_type) ->
       if List.length param_types <> List.length args then
         error "参数数量不匹配";
       List.iter2 (expect_type env) param_types args;
       return_type
     | _ -> error "不可调用非函数类型")

and expect_type env expected expr =
  let actual = type_of env expr in
  if not (types_equal expected actual) then
    error (Printf.sprintf "类型不匹配: 期望 %s, 得到 %s"
      (typ_to_string expected) (typ_to_string actual))

and types_equal t1 t2 =
  match t1, t2 with
  | TFloat, TFloat -> true
  | TBool, TBool -> true
  | TVar _, _ | _, TVar _ -> true  (* 类型变量匹配任何类型 *)
  | TArrow (p1, r1), TArrow (p2, r2) ->
    List.length p1 = List.length p2 &&
    List.for_all2 types_equal p1 p2 &&
    types_equal r1 r2
  | _ -> false

and typ_to_string = function
  | TFloat -> "float"
  | TBool -> "bool"
  | TVar s -> s
  | TArrow (params, ret) ->
    Printf.sprintf "(%s) -> %s"
      (String.concat ", " (List.map typ_to_string params))
      (typ_to_string ret)

(* 类型检查入口 *)
let check expr =
  try
    let t = type_of [] expr in
    Ok t
  with TypeError msg ->
    Error msg

💡 提示：完整的类型推导需要统一化算法（unification），上面是简化的类型检查实现。

类型推导实现

(* 完整的 Hindley-Milner 类型推导 *)
type typ =
  | TFloat
  | TBool
  | TVar of string
  | TArrow of typ list * typ

(* 替换 *)
type substitution = (string * typ) list

let rec apply_subst subst = function
  | TFloat -> TFloat
  | TBool -> TBool
  | TVar id as t ->
    (try List.assoc id subst with Not_found -> t)
  | TArrow (params, ret) ->
    TArrow (List.map (apply_subst subst) params, apply_subst subst ret)

let compose_subst s1 s2 =
  let s2' = List.map (fun (id, t) -> (id, apply_subst s1 t)) s2 in
  s1 @ s2'

(* 出现检查 *)
let rec occurs id = function
  | TFloat | TBool -> false
  | TVar id' -> id = id'
  | TArrow (params, ret) ->
    List.exists (occurs id) params || occurs id ret

(* 统一化 *)
let rec unify t1 t2 =
  match t1, t2 with
  | TFloat, TFloat | TBool, TBool -> []
  | TVar id, t | t, TVar id ->
    if t = TVar id then []
    else if occurs id t then
      raise (TypeError (Printf.sprintf "循环类型: %s 出现在 %s 中" id (typ_to_string t)))
    else [(id, t)]
  | TArrow (p1, r1), TArrow (p2, r2) ->
    if List.length p1 <> List.length p2 then
      raise (TypeError "函数参数数量不匹配");
    let s = List.fold_left2 (fun s a b ->
      let s' = unify (apply_subst s a) (apply_subst s b) in
      compose_subst s' s
    ) [] p1 p2 in
    let s' = unify (apply_subst s r1) (apply_subst s r2) in
    compose_subst s' s
  | _ ->
    raise (TypeError (Printf.sprintf "无法统一 %s 和 %s" (typ_to_string t1) (typ_to_string t2)))

(* 类型推导 *)
let next_var = ref 0
let fresh_var () =
  incr next_var;
  TVar (Printf.sprintf "'%d" !next_var)

let rec infer env = function
  | ENum _ -> ([], TFloat)
  | EBool _ -> ([], TBool)
  | EVar id ->
    (match List.assoc_opt id env with
     | Some t -> ([], instantiate t)
     | None -> raise (TypeError (Printf.sprintf "未绑定变量: %s" id)))
  | EAdd (l, r) | ESub (l, r) | EMul (l, r) | EDiv (l, r) ->
    let (s1, t1) = infer env l in
    let (s2, t2) = infer (apply_env s1 env) r in
    let s3 = unify (apply_subst s2 t1) TFloat in
    let s4 = unify (apply_subst s3 t2) TFloat in
    (compose_subst s4 (compose_subst s3 (compose_subst s2 s1)), TFloat)
  | EIf (cond, then_e, else_e) ->
    let (s1, t1) = infer env cond in
    let s1' = unify t1 TBool in
    let s2 = compose_subst s1' s1 in
    let (s3, t2) = infer (apply_env s2 env) then_e in
    let (s4, t3) = infer (apply_env (compose_subst s3 s2) env) else_e in
    let s5 = unify (apply_subst s4 t2) t3 in
    (compose_subst s5 (compose_subst s4 (compose_subst s3 s2)), apply_subst s5 t3)
  | ELet (id, value, body) ->
    let (s1, t1) = infer env value in
    let env' = (id, generalize (apply_env s1 env) t1) :: apply_env s1 env in
    let (s2, t2) = infer env' body in
    (compose_subst s2 s1, t2)
  | EFun (params, body) ->
    let param_types = List.map (fun _ -> fresh_var ()) params in
    let env' = List.fold_left2 (fun e p t -> (p, Scheme ([], t)) :: e) env params param_types in
    let (s, t) = infer env' body in
    (s, TArrow (List.map (apply_subst s) param_types, t))
  | EApp (func, args) ->
    let (s1, t1) = infer env func in
    let (s2, arg_types) = List.fold_left_map (fun s arg ->
      let (s', t) = infer (apply_env s env) arg in
      (compose_subst s' s, t)
    ) s1 args in
    let ret_type = fresh_var () in
    let s3 = unify (apply_subst s2 t1) (TArrow (arg_types, ret_type)) in
    (compose_subst s3 s2, apply_subst s3 ret_type)

错误恢复

(* 带错误恢复的解析器 *)
type parse_error = {
  message: string;
  location: Lexing.position;
  expected: string list;
  got: string;
}

let parse_with_recovery lexbuf =
  let errors = ref [] in
  let rec parse_expr () =
    try
      Parser.expr Lexer.token lexbuf
    with
    | Parser.Error ->
      let pos = Lexing.lexeme_start_p lexbuf in
      let token = Lexing.lexeme lexbuf in
      errors := {
        message = "语法错误";
        location = pos;
        expected = ["表达式"];
        got = token;
      } :: !errors;
      (* 跳过到下一个同步点 *)
      skip_to_sync lexbuf;
      parse_expr ()
  and skip_to_sync lexbuf =
    let token = Lexer.token lexbuf in
    match token with
    | SEMICOLON | RPAREN | EOF -> ()
    | _ -> skip_to_sync lexbuf
  in
  let result = parse_expr () in
  (result, List.rev !errors)

(* 错误格式化 *)
let format_error err =
  Printf.sprintf "%s:%d:%d: %s, 期望 %s, 得到 '%s'"
    err.location.pos_fname
    err.location.pos_lnum
    (err.location.pos_cnum - err.location.pos_bol)
    err.message
    (String.concat " | " err.expected)
    err.got

⚠️ 注意：错误恢复要小心避免级联错误。同步点（如分号、括号）是好的恢复位置。

语义分析

(* 语义分析：变量作用域、未使用变量、死代码 *)
type semantic_error =
  | UnboundVariable of string
  | UnusedVariable of string
  | DuplicateBinding of string
  | DeadCode of location

let analyze expr =
  let errors = ref [] in
  let used = Hashtbl.create 16 in
  
  let rec check_expr env = function
    | EVar id ->
      (match List.assoc_opt id env with
       | Some _ -> Hashtbl.replace used id true
       | None -> errors := UnboundVariable id :: !errors)
    | ELet (id, value, body) ->
      check_expr env value;
      if List.mem_assoc id env then
        errors := DuplicateBinding id :: !errors;
      Hashtbl.replace used id false;
      check_expr ((id, ()) :: env) body;
      if not (Hashtbl.find used id) then
        errors := UnusedVariable id :: !errors
    | EAdd (l, r) | ESub (l, r) | EMul (l, r) | EDiv (l, r) ->
      check_expr env l; check_expr env r
    | EIf (c, t, e) ->
      check_expr env c; check_expr env t; check_expr env e
    | EFun (params, body) ->
      let env' = List.fold_left (fun e p -> (p, ()) :: e) env params in
      check_expr env' body
    | EApp (func, args) ->
      check_expr env func;
      List.iter (check_expr env) args
    | _ -> ()
  in
  
  check_expr [] expr;
  List.rev !errors

中间代码生成

(* 简单的三地址码 *)
type ir_value =
  | IRNum of float
  | IRBool of bool
  | IRVar of string
  | IRTemp of int

type ir_instr =
  | IRAssign of ir_value * ir_value
  | IRAdd of ir_value * ir_value * ir_value
  | IRSub of ir_value * ir_value * ir_value
  | IRMul of ir_value * ir_value * ir_value
  | IRDiv of ir_value * ir_value * ir_value
  | IRLabel of string
  | IRJump of string
  | IRCondJump of ir_value * string * string
  | IRCall of ir_value * string * ir_value list
  | IRReturn of ir_value

type ir_block = {
  label: string;
  instructions: ir_instr list;
}

let next_temp = ref 0
let fresh_temp () =
  incr next_temp;
  IRTemp !next_temp

let compile_expr expr =
  let instrs = ref [] in
  let emit instr = instrs := instr :: !instrs in
  
  let rec compile = function
    | ENum n -> IRNum n
    | EBool b -> IRBool b
    | EVar id -> IRVar id
    | EAdd (l, r) ->
      let lv = compile l in
      let rv = compile r in
      let t = fresh_temp () in
      emit (IRAdd (t, lv, rv));
      t
    | ESub (l, r) ->
      let lv = compile l in
      let rv = compile r in
      let t = fresh_temp () in
      emit (IRSub (t, lv, rv));
      t
    | EMul (l, r) ->
      let lv = compile l in
      let rv = compile r in
      let t = fresh_temp () in
      emit (IRMul (t, lv, rv));
      t
    | EDiv (l, r) ->
      let lv = compile l in
      let rv = compile r in
      let t = fresh_temp () in
      emit (IRDiv (t, lv, rv));
      t
    | ELet (id, value, body) ->
      let v = compile value in
      emit (IRAssign (IRVar id, v));
      compile body
    | EIf (cond, then_e, else_e) ->
      let cv = compile cond in
      let then_label = Printf.sprintf "then_%d" !next_temp in
      let else_label = Printf.sprintf "else_%d" !next_temp in
      let end_label = Printf.sprintf "end_%d" !next_temp in
      emit (IRCondJump (cv, then_label, else_label));
      emit (IRLabel then_label);
      let tv = compile then_e in
      emit (IRAssign (IRVar "result", tv));
      emit (IRJump end_label);
      emit (IRLabel else_label);
      let ev = compile else_e in
      emit (IRAssign (IRVar "result", ev));
      emit (IRLabel end_label);
      IRVar "result"
    | EApp (func, args) ->
      let func_name = match func with EVar id -> id | _ -> "lambda" in
      let arg_values = List.map compile args in
      let t = fresh_temp () in
      emit (IRCall (t, func_name, arg_values));
      t
    | _ -> failwith "未实现"
  in
  
  let result = compile expr in
  List.rev !instrs

构建一个简单语言编译器

(* 编译器管线 *)
type compile_result = {
  ast: Ast.expr;
  ir: ir_instr list;
  errors: string list;
}

let compile source =
  (* 1. 词法分析 *)
  let lexbuf = Lexing.from_string source in
  Lexing.set_filename lexbuf "input";
  
  (* 2. 语法分析 *)
  let ast =
    try Parser.program Lexer.token lexbuf
    with Parser.Error ->
      let pos = Lexing.lexeme_start_p lexbuf in
      failwith (Printf.sprintf "语法错误: %s:%d" pos.pos_fname pos.pos_lnum)
  in
  
  (* 3. 语义分析 *)
  let semantic_errors = analyze ast in
  if semantic_errors <> [] then
    List.iter (fun e ->
      Printf.eprintf "警告: %s\n" (match e with
        | UnboundVariable id -> Printf.sprintf "未绑定变量: %s" id
        | UnusedVariable id -> Printf.sprintf "未使用变量: %s" id
        | DuplicateBinding id -> Printf.sprintf "重复绑定: %s" id
        | DeadCode _ -> "死代码")
    ) semantic_errors;
  
  (* 4. 类型检查 *)
  (match check ast with
   | Ok t -> Printf.printf "类型: %s\n" (typ_to_string t)
   | Error msg -> Printf.eprintf "类型错误: %s\n" msg);
  
  (* 5. 中间代码生成 *)
  let ir = compile_expr ast in
  
  { ast; ir; errors = [] }

(* 入口 *)
let () =
  let source = {|
    let x = 10 in
    let y = 20 in
    let f = fun a b -> a + b in
    f(x, y)
  |} in
  let result = compile source in
  Printf.printf "生成 %d 条指令\n" (List.length result.ir)

扩展阅读

上一节：形式验证与属性测试 下一节：完整项目：构建一个小型数据库