OCaml 教程 / 编译器前端实践
编译器前端实践
编译器前端负责将源代码转换为中间表示。本节介绍词法分析、语法分析、AST 设计和类型检查的实践。
词法分析(ocamllex)
ocamllex 是 OCaml 的词法分析器生成器。
# 安装(通常随 OCaml 一起安装)
# ocamllex 生成 .ml 文件
(* lexer.mll — 词法分析器定义 *)
{
open Parser (* 引用语法分析器的 token 定义 *)
}
let digit = ['0'-'9']
let alpha = ['a'-'z' 'A'-'Z']
let ident = alpha (alpha | digit | '_')*
let whitespace = [' ' '\t' '\n' '\r']
rule token = parse
| whitespace+ { token lexbuf } (* 跳过空白 *)
| "//" [^ '\n']* { token lexbuf } (* 单行注释 *)
| "/*" { comment 0 lexbuf } (* 多行注释 *)
| '\n' { Lexing.new_line lexbuf; token lexbuf }
| digit+ as num { NUM (float_of_string num) }
| ident as id {
match id with
| "let" -> LET
| "in" -> IN
| "if" -> IF
| "then" -> THEN
| "else" -> ELSE
| "fun" -> FUN
| "true" -> TRUE
| "false" -> FALSE
| "and" -> AND
| "or" -> OR
| "not" -> NOT
| _ -> IDENT id
}
| '+' { PLUS }
| '-' { MINUS }
| '*' { STAR }
| '/' { SLASH }
| '=' { EQ }
| '<' { LT }
| '>' { GT }
| "<=" { LE }
| ">=" { GE }
| "!=" { NEQ }
| '(' { LPAREN }
| ')' { RPAREN }
| "->" { ARROW }
| ',' { COMMA }
| eof { EOF }
| _ as c { failwith (Printf.sprintf "意外字符: %c" c) }
and comment depth = parse
| "*/" { if depth = 0 then token lexbuf else comment (depth - 1) lexbuf }
| "/*" { comment (depth + 1) lexbuf }
| '\n' { Lexing.new_line lexbuf; comment depth lexbuf }
| eof { failwith "注释未闭合" }
| _ { comment depth lexbuf }
| 规则 | 说明 |
|---|---|
token | 主词法规则 |
comment | 处理嵌套注释 |
as num | 绑定匹配的文本 |
Lexing.new_line | 更新行号(用于错误报告) |
⚠️ 注意:ocamllex 规则按顺序匹配,第一个匹配的规则优先。将关键字规则放在标识符规则之前。
语法分析(menhir)
Menhir 是 OCaml 的 LR(1) 语法分析器生成器。
opam install menhir
(* parser.mly — 语法分析器定义 *)
%{
open Ast
%}
%token <float> NUM
%token <string> IDENT
%token LET IN IF THEN ELSE FUN
%token TRUE FALSE AND OR NOT
%token PLUS MINUS STAR SLASH
%token EQ LT GT LE GE NEQ
%token LPAREN RPAREN ARROW COMMA
%token EOF
%left OR
%left AND
%left EQ LT GT LE GE NEQ
%left PLUS MINUS
%left STAR SLASH
%nonassoc UNARY
%start <expr> program
%%
program:
| e = expr; EOF { e }
expr:
| e = let_expr { e }
| e = if_expr { e }
| e = fun_expr { e }
| e = or_expr { e }
let_expr:
| LET; id = IDENT; EQ; value = expr; IN; body = expr
{ ELet (id, value, body) }
| LET; REC; id = IDENT; EQ; value = expr; IN; body = expr
{ ELetRec (id, value, body) }
if_expr:
| IF; cond = expr; THEN; then_e = expr; ELSE; else_e = expr
{ EIf (cond, then_e, else_e) }
fun_expr:
| FUN; params = nonempty_list(IDENT); ARROW; body = expr
{ EFun (params, body) }
or_expr:
| left = and_expr; OR; right = or_expr { EOr (left, right) }
| e = and_expr { e }
and_expr:
| left = comp_expr; AND; right = and_expr { EAnd (left, right) }
| e = comp_expr { e }
comp_expr:
| left = add_expr; op = comp_op; right = add_expr { op left right }
| e = add_expr { e }
%inline comp_op:
| EQ { fun l r -> EEq (l, r) }
| LT { fun l r -> ELt (l, r) }
| GT { fun l r -> EGt (l, r) }
| LE { fun l r -> EAnd (ELt (l, r), EEq (l, r)) }
| GE { fun l r -> EOr (EGt (l, r), EEq (l, r)) }
| NEQ { fun l r -> ENot (EEq (l, r)) }
add_expr:
| left = add_expr; PLUS; right = mul_expr { EAdd (left, right) }
| left = add_expr; MINUS; right = mul_expr { ESub (left, right) }
| e = mul_expr { e }
mul_expr:
| left = mul_expr; STAR; right = unary_expr { EMul (left, right) }
| left = mul_expr; SLASH; right = unary_expr { EDiv (left, right) }
| e = unary_expr { e }
unary_expr:
| MINUS; e = unary_expr %prec UNARY { ENeg e }
| NOT; e = unary_expr %prec UNARY { ENot e }
| e = app_expr { e }
app_expr:
| func = primary_expr; LPAREN; args = separated_list(COMMA, expr); RPAREN
{ EApp (func, args) }
| e = primary_expr { e }
primary_expr:
| n = NUM { ENum n }
| TRUE { EBool true }
| FALSE { EBool false }
| id = IDENT { EVar id }
| LPAREN; e = expr; RPAREN { e }
| 优先级 | 运算符 | 结合性 |
|---|---|---|
| 最高 | not, -(一元) | 右 |
*, / | 左 | |
+, - | 左 | |
=, <, > | 无 | |
and | 左 | |
| 最低 | or | 左 |
💡 提示:%left、%right、%nonassoc 声明运算符的优先级和结合性。越早声明优先级越低。
AST 设计
(* ast.ml — 抽象语法树定义 *)
(* 表达式 *)
type expr =
| ENum of float
| EBool of bool
| EVar of string
| EAdd of expr * expr
| ESub of expr * expr
| EMul of expr * expr
| EDiv of expr * expr
| EEq of expr * expr
| ELt of expr * expr
| EGt of expr * expr
| EAnd of expr * expr
| EOr of expr * expr
| ENeg of expr
| ENot of expr
| EIf of expr * expr * expr
| ELet of string * expr * expr
| ELetRec of string * expr * expr
| EFun of string list * expr
| EApp of expr * expr list
(* 类型 *)
type typ =
| TFloat
| TBool
| TArrow of typ list * typ
| TVar of string
(* 类型环境 *)
type type_env = (string * typ) list
(* AST 辅助函数 *)
let rec expr_to_string = function
| ENum n -> Printf.sprintf "%.2f" n
| EBool b -> string_of_bool b
| EVar id -> id
| EAdd (l, r) -> Printf.sprintf "(%s + %s)" (expr_to_string l) (expr_to_string r)
| ESub (l, r) -> Printf.sprintf "(%s - %s)" (expr_to_string l) (expr_to_string r)
| EMul (l, r) -> Printf.sprintf "(%s * %s)" (expr_to_string l) (expr_to_string r)
| EDiv (l, r) -> Printf.sprintf "(%s / %s)" (expr_to_string l) (expr_to_string r)
| EEq (l, r) -> Printf.sprintf "(%s = %s)" (expr_to_string l) (expr_to_string r)
| ELt (l, r) -> Printf.sprintf "(%s < %s)" (expr_to_string l) (expr_to_string r)
| EGt (l, r) -> Printf.sprintf "(%s > %s)" (expr_to_string l) (expr_to_string r)
| EAnd (l, r) -> Printf.sprintf "(%s and %s)" (expr_to_string l) (expr_to_string r)
| EOr (l, r) -> Printf.sprintf "(%s or %s)" (expr_to_string l) (expr_to_string r)
| ENeg e -> Printf.sprintf "(-%s)" (expr_to_string e)
| ENot e -> Printf.sprintf "(not %s)" (expr_to_string e)
| EIf (c, t, e) -> Printf.sprintf "if %s then %s else %s" (expr_to_string c) (expr_to_string t) (expr_to_string e)
| ELet (id, v, b) -> Printf.sprintf "let %s = %s in %s" id (expr_to_string v) (expr_to_string b)
| ELetRec (id, v, b) -> Printf.sprintf "let rec %s = %s in %s" id (expr_to_string v) (expr_to_string b)
| EFun (ps, b) -> Printf.sprintf "fun %s -> %s" (String.concat " " ps) (expr_to_string b)
| EApp (f, args) -> Printf.sprintf "%s(%s)" (expr_to_string f) (String.concat ", " (List.map expr_to_string args))
(* 位置信息 *)
type location = {
file: string;
line: int;
col_start: int;
col_end: int;
}
type 'a located = {
value: 'a;
loc: location;
}
(* 带位置的 AST *)
type expr_loc = expr located
| AST 节点 | 说明 | 示例 |
|---|---|---|
ENum | 数字字面量 | 42 |
EVar | 变量引用 | x |
ELet | let 绑定 | let x = 1 in x |
EFun | lambda | fun x -> x |
EApp | 函数应用 | f(x) |
EIf | 条件 | if x > 0 then x else -x |
类型检查器实现
(* typecheck.ml *)
open Ast
exception TypeError of string
let error msg = raise (TypeError msg)
let rec type_of env = function
| ENum _ -> TFloat
| EBool _ -> TBool
| EVar id ->
(try List.assoc id env
with Not_found -> error (Printf.sprintf "未绑定变量: %s" id))
| EAdd (l, r) | ESub (l, r) | EMul (l, r) | EDiv (l, r) ->
expect_type env TFloat l;
expect_type env TFloat r;
TFloat
| EEq (l, r) ->
let t = type_of env l in
expect_type env t r;
TBool
| ELt (l, r) | EGt (l, r) ->
expect_type env TFloat l;
expect_type env TFloat r;
TBool
| EAnd (l, r) | EOr (l, r) ->
expect_type env TBool l;
expect_type env TBool r;
TBool
| ENeg e ->
expect_type env TFloat e;
TFloat
| ENot e ->
expect_type env TBool e;
TBool
| EIf (cond, then_e, else_e) ->
expect_type env TBool cond;
let t = type_of env then_e in
expect_type env t else_e;
t
| ELet (id, value, body) ->
let t = type_of env value in
type_of ((id, t) :: env) body
| ELetRec (id, value, body) ->
let t = TVar "'a" in
let env' = (id, t) :: env in
let t' = type_of env' value in
type_of ((id, t') :: env) body
| EFun (params, body) ->
let param_types = List.map (fun _ -> TVar "'a") params in
let env' = List.fold_left2 (fun e p t -> (p, t) :: e) env params param_types in
let body_type = type_of env' body in
TArrow (param_types, body_type)
| EApp (func, args) ->
(match type_of env func with
| TArrow (param_types, return_type) ->
if List.length param_types <> List.length args then
error "参数数量不匹配";
List.iter2 (expect_type env) param_types args;
return_type
| _ -> error "不可调用非函数类型")
and expect_type env expected expr =
let actual = type_of env expr in
if not (types_equal expected actual) then
error (Printf.sprintf "类型不匹配: 期望 %s, 得到 %s"
(typ_to_string expected) (typ_to_string actual))
and types_equal t1 t2 =
match t1, t2 with
| TFloat, TFloat -> true
| TBool, TBool -> true
| TVar _, _ | _, TVar _ -> true (* 类型变量匹配任何类型 *)
| TArrow (p1, r1), TArrow (p2, r2) ->
List.length p1 = List.length p2 &&
List.for_all2 types_equal p1 p2 &&
types_equal r1 r2
| _ -> false
and typ_to_string = function
| TFloat -> "float"
| TBool -> "bool"
| TVar s -> s
| TArrow (params, ret) ->
Printf.sprintf "(%s) -> %s"
(String.concat ", " (List.map typ_to_string params))
(typ_to_string ret)
(* 类型检查入口 *)
let check expr =
try
let t = type_of [] expr in
Ok t
with TypeError msg ->
Error msg
💡 提示:完整的类型推导需要统一化算法(unification),上面是简化的类型检查实现。
类型推导实现
(* 完整的 Hindley-Milner 类型推导 *)
type typ =
| TFloat
| TBool
| TVar of string
| TArrow of typ list * typ
(* 替换 *)
type substitution = (string * typ) list
let rec apply_subst subst = function
| TFloat -> TFloat
| TBool -> TBool
| TVar id as t ->
(try List.assoc id subst with Not_found -> t)
| TArrow (params, ret) ->
TArrow (List.map (apply_subst subst) params, apply_subst subst ret)
let compose_subst s1 s2 =
let s2' = List.map (fun (id, t) -> (id, apply_subst s1 t)) s2 in
s1 @ s2'
(* 出现检查 *)
let rec occurs id = function
| TFloat | TBool -> false
| TVar id' -> id = id'
| TArrow (params, ret) ->
List.exists (occurs id) params || occurs id ret
(* 统一化 *)
let rec unify t1 t2 =
match t1, t2 with
| TFloat, TFloat | TBool, TBool -> []
| TVar id, t | t, TVar id ->
if t = TVar id then []
else if occurs id t then
raise (TypeError (Printf.sprintf "循环类型: %s 出现在 %s 中" id (typ_to_string t)))
else [(id, t)]
| TArrow (p1, r1), TArrow (p2, r2) ->
if List.length p1 <> List.length p2 then
raise (TypeError "函数参数数量不匹配");
let s = List.fold_left2 (fun s a b ->
let s' = unify (apply_subst s a) (apply_subst s b) in
compose_subst s' s
) [] p1 p2 in
let s' = unify (apply_subst s r1) (apply_subst s r2) in
compose_subst s' s
| _ ->
raise (TypeError (Printf.sprintf "无法统一 %s 和 %s" (typ_to_string t1) (typ_to_string t2)))
(* 类型推导 *)
let next_var = ref 0
let fresh_var () =
incr next_var;
TVar (Printf.sprintf "'%d" !next_var)
let rec infer env = function
| ENum _ -> ([], TFloat)
| EBool _ -> ([], TBool)
| EVar id ->
(match List.assoc_opt id env with
| Some t -> ([], instantiate t)
| None -> raise (TypeError (Printf.sprintf "未绑定变量: %s" id)))
| EAdd (l, r) | ESub (l, r) | EMul (l, r) | EDiv (l, r) ->
let (s1, t1) = infer env l in
let (s2, t2) = infer (apply_env s1 env) r in
let s3 = unify (apply_subst s2 t1) TFloat in
let s4 = unify (apply_subst s3 t2) TFloat in
(compose_subst s4 (compose_subst s3 (compose_subst s2 s1)), TFloat)
| EIf (cond, then_e, else_e) ->
let (s1, t1) = infer env cond in
let s1' = unify t1 TBool in
let s2 = compose_subst s1' s1 in
let (s3, t2) = infer (apply_env s2 env) then_e in
let (s4, t3) = infer (apply_env (compose_subst s3 s2) env) else_e in
let s5 = unify (apply_subst s4 t2) t3 in
(compose_subst s5 (compose_subst s4 (compose_subst s3 s2)), apply_subst s5 t3)
| ELet (id, value, body) ->
let (s1, t1) = infer env value in
let env' = (id, generalize (apply_env s1 env) t1) :: apply_env s1 env in
let (s2, t2) = infer env' body in
(compose_subst s2 s1, t2)
| EFun (params, body) ->
let param_types = List.map (fun _ -> fresh_var ()) params in
let env' = List.fold_left2 (fun e p t -> (p, Scheme ([], t)) :: e) env params param_types in
let (s, t) = infer env' body in
(s, TArrow (List.map (apply_subst s) param_types, t))
| EApp (func, args) ->
let (s1, t1) = infer env func in
let (s2, arg_types) = List.fold_left_map (fun s arg ->
let (s', t) = infer (apply_env s env) arg in
(compose_subst s' s, t)
) s1 args in
let ret_type = fresh_var () in
let s3 = unify (apply_subst s2 t1) (TArrow (arg_types, ret_type)) in
(compose_subst s3 s2, apply_subst s3 ret_type)
错误恢复
(* 带错误恢复的解析器 *)
type parse_error = {
message: string;
location: Lexing.position;
expected: string list;
got: string;
}
let parse_with_recovery lexbuf =
let errors = ref [] in
let rec parse_expr () =
try
Parser.expr Lexer.token lexbuf
with
| Parser.Error ->
let pos = Lexing.lexeme_start_p lexbuf in
let token = Lexing.lexeme lexbuf in
errors := {
message = "语法错误";
location = pos;
expected = ["表达式"];
got = token;
} :: !errors;
(* 跳过到下一个同步点 *)
skip_to_sync lexbuf;
parse_expr ()
and skip_to_sync lexbuf =
let token = Lexer.token lexbuf in
match token with
| SEMICOLON | RPAREN | EOF -> ()
| _ -> skip_to_sync lexbuf
in
let result = parse_expr () in
(result, List.rev !errors)
(* 错误格式化 *)
let format_error err =
Printf.sprintf "%s:%d:%d: %s, 期望 %s, 得到 '%s'"
err.location.pos_fname
err.location.pos_lnum
(err.location.pos_cnum - err.location.pos_bol)
err.message
(String.concat " | " err.expected)
err.got
⚠️ 注意:错误恢复要小心避免级联错误。同步点(如分号、括号)是好的恢复位置。
语义分析
(* 语义分析:变量作用域、未使用变量、死代码 *)
type semantic_error =
| UnboundVariable of string
| UnusedVariable of string
| DuplicateBinding of string
| DeadCode of location
let analyze expr =
let errors = ref [] in
let used = Hashtbl.create 16 in
let rec check_expr env = function
| EVar id ->
(match List.assoc_opt id env with
| Some _ -> Hashtbl.replace used id true
| None -> errors := UnboundVariable id :: !errors)
| ELet (id, value, body) ->
check_expr env value;
if List.mem_assoc id env then
errors := DuplicateBinding id :: !errors;
Hashtbl.replace used id false;
check_expr ((id, ()) :: env) body;
if not (Hashtbl.find used id) then
errors := UnusedVariable id :: !errors
| EAdd (l, r) | ESub (l, r) | EMul (l, r) | EDiv (l, r) ->
check_expr env l; check_expr env r
| EIf (c, t, e) ->
check_expr env c; check_expr env t; check_expr env e
| EFun (params, body) ->
let env' = List.fold_left (fun e p -> (p, ()) :: e) env params in
check_expr env' body
| EApp (func, args) ->
check_expr env func;
List.iter (check_expr env) args
| _ -> ()
in
check_expr [] expr;
List.rev !errors
中间代码生成
(* 简单的三地址码 *)
type ir_value =
| IRNum of float
| IRBool of bool
| IRVar of string
| IRTemp of int
type ir_instr =
| IRAssign of ir_value * ir_value
| IRAdd of ir_value * ir_value * ir_value
| IRSub of ir_value * ir_value * ir_value
| IRMul of ir_value * ir_value * ir_value
| IRDiv of ir_value * ir_value * ir_value
| IRLabel of string
| IRJump of string
| IRCondJump of ir_value * string * string
| IRCall of ir_value * string * ir_value list
| IRReturn of ir_value
type ir_block = {
label: string;
instructions: ir_instr list;
}
let next_temp = ref 0
let fresh_temp () =
incr next_temp;
IRTemp !next_temp
let compile_expr expr =
let instrs = ref [] in
let emit instr = instrs := instr :: !instrs in
let rec compile = function
| ENum n -> IRNum n
| EBool b -> IRBool b
| EVar id -> IRVar id
| EAdd (l, r) ->
let lv = compile l in
let rv = compile r in
let t = fresh_temp () in
emit (IRAdd (t, lv, rv));
t
| ESub (l, r) ->
let lv = compile l in
let rv = compile r in
let t = fresh_temp () in
emit (IRSub (t, lv, rv));
t
| EMul (l, r) ->
let lv = compile l in
let rv = compile r in
let t = fresh_temp () in
emit (IRMul (t, lv, rv));
t
| EDiv (l, r) ->
let lv = compile l in
let rv = compile r in
let t = fresh_temp () in
emit (IRDiv (t, lv, rv));
t
| ELet (id, value, body) ->
let v = compile value in
emit (IRAssign (IRVar id, v));
compile body
| EIf (cond, then_e, else_e) ->
let cv = compile cond in
let then_label = Printf.sprintf "then_%d" !next_temp in
let else_label = Printf.sprintf "else_%d" !next_temp in
let end_label = Printf.sprintf "end_%d" !next_temp in
emit (IRCondJump (cv, then_label, else_label));
emit (IRLabel then_label);
let tv = compile then_e in
emit (IRAssign (IRVar "result", tv));
emit (IRJump end_label);
emit (IRLabel else_label);
let ev = compile else_e in
emit (IRAssign (IRVar "result", ev));
emit (IRLabel end_label);
IRVar "result"
| EApp (func, args) ->
let func_name = match func with EVar id -> id | _ -> "lambda" in
let arg_values = List.map compile args in
let t = fresh_temp () in
emit (IRCall (t, func_name, arg_values));
t
| _ -> failwith "未实现"
in
let result = compile expr in
List.rev !instrs
构建一个简单语言编译器
(* 编译器管线 *)
type compile_result = {
ast: Ast.expr;
ir: ir_instr list;
errors: string list;
}
let compile source =
(* 1. 词法分析 *)
let lexbuf = Lexing.from_string source in
Lexing.set_filename lexbuf "input";
(* 2. 语法分析 *)
let ast =
try Parser.program Lexer.token lexbuf
with Parser.Error ->
let pos = Lexing.lexeme_start_p lexbuf in
failwith (Printf.sprintf "语法错误: %s:%d" pos.pos_fname pos.pos_lnum)
in
(* 3. 语义分析 *)
let semantic_errors = analyze ast in
if semantic_errors <> [] then
List.iter (fun e ->
Printf.eprintf "警告: %s\n" (match e with
| UnboundVariable id -> Printf.sprintf "未绑定变量: %s" id
| UnusedVariable id -> Printf.sprintf "未使用变量: %s" id
| DuplicateBinding id -> Printf.sprintf "重复绑定: %s" id
| DeadCode _ -> "死代码")
) semantic_errors;
(* 4. 类型检查 *)
(match check ast with
| Ok t -> Printf.printf "类型: %s\n" (typ_to_string t)
| Error msg -> Printf.eprintf "类型错误: %s\n" msg);
(* 5. 中间代码生成 *)
let ir = compile_expr ast in
{ ast; ir; errors = [] }
(* 入口 *)
let () =
let source = {|
let x = 10 in
let y = 20 in
let f = fun a b -> a + b in
f(x, y)
|} in
let result = compile source in
Printf.printf "生成 %d 条指令\n" (List.length result.ir)
扩展阅读
上一节:形式验证与属性测试 下一节:完整项目:构建一个小型数据库