OCaml 教程 / 编写一个解释器
编写一个解释器
本节将从零开始构建一个简单的解释器,涵盖语言定义、词法分析、语法分析、求值和 REPL 实现。
语言定义
我们实现一个小型函数式语言,支持:
- 数字和布尔值
- 算术和比较运算
- 变量绑定(let)
- 条件表达式(if-else)
- 函数定义和调用
- 闭包
| 特性 | 语法示例 | 说明 |
|---|---|---|
| 数字 | 42, 3.14 | 浮点数 |
| 布尔 | true, false | 逻辑值 |
| 算术 | 1 + 2 * 3 | 支持优先级 |
| 比较 | x > 0 | 返回布尔值 |
| 绑定 | let x = 1 in x + 1 | let-in 表达式 |
| 函数 | fun x -> x + 1 | lambda 表达式 |
| 调用 | f(1, 2) | 函数应用 |
| 条件 | if x > 0 then x else -x | if-then-else |
AST 定义
(* 表达式 AST *)
type expr =
| ENum of float
| EBool of bool
| EVar of string
| EAdd of expr * expr
| ESub of expr * expr
| EMul of expr * expr
| EDiv of expr * expr
| EEq of expr * expr
| ELt of expr * expr
| EGt of expr * expr
| EAnd of expr * expr
| EOr of expr * expr
| ENeg of expr
| ENot of expr
| EIf of expr * expr * expr
| ELet of string * expr * expr
| EFun of string list * expr
| EApp of expr * expr list
| ELetRec of string * expr * expr
(* 值类型 *)
type value =
| VNum of float
| VBool of bool
| VClosure of string list * expr * env
| VBuiltin of (value list -> value)
and env = (string * value) list
💡 提示:AST 设计应尽量简单,保持每个节点的语义清晰。后续可以扩展,但简化比复杂化容易得多。
词法分析
(* Token 类型 *)
type token =
| TNum of float
| TBool of bool
| TIdent of string
| TPlus | TMinus | TStar | TSlash
| TEq | TLt | TGt | TLe | TGe | TNeq
| TAnd | TOr | TNot
| TLParen | TRParen
| TArrow | TComma
| TLet | TIn | TIf | TThen | TElse | TFun | TRec
| TEOF
(* 词法分析器 *)
let tokenize input =
let len = String.length input in
let pos = ref 0 in
let tokens = ref [] in
let peek () =
if !pos < len then Some input.[!pos] else None
in
let advance () =
let c = input.[!pos] in
incr pos;
c
in
let skip_whitespace () =
while !pos < len && (let c = input.[!pos] in c = ' ' || c = '\t' || c = '\n' || c = '\r') do
incr pos
done
in
let read_number () =
let start = !pos in
while !pos < len && (let c = input.[!pos] in c >= '0' && c <= '9' || c = '.') do
incr pos
done;
float_of_string (String.sub input start (!pos - start))
in
let read_identifier () =
let start = !pos in
while !pos < len && (let c = input.[!pos] in
(c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
(c >= '0' && c <= '9') || c = '_') do
incr pos
done;
String.sub input start (!pos - start)
in
while !pos < len do
skip_whitespace ();
if !pos >= len then ()
else match peek () with
| Some c when c >= '0' && c <= '9' ->
tokens := TNum (read_number ()) :: !tokens
| Some c when (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ->
let id = read_identifier () in
let tok = match id with
| "true" -> TBool true
| "false" -> TBool false
| "let" -> TLet
| "in" -> TIn
| "if" -> TIf
| "then" -> TThen
| "else" -> TElse
| "fun" -> TFun
| "rec" -> TRec
| "and" -> TAnd
| "or" -> TOr
| "not" -> TNot
| _ -> TIdent id
in
tokens := tok :: !tokens
| Some '+' -> incr pos; tokens := TPlus :: !tokens
| Some '-' ->
incr pos;
if !pos < len && input.[!pos] = '>' then begin
incr pos;
tokens := TArrow :: !tokens
end else
tokens := TMinus :: !tokens
| Some '*' -> incr pos; tokens := TStar :: !tokens
| Some '/' -> incr pos; tokens := TSlash :: !tokens
| Some '=' -> incr pos; tokens := TEq :: !tokens
| Some '<' ->
incr pos;
if !pos < len && input.[!pos] = '=' then begin
incr pos;
tokens := TLe :: !tokens
end else
tokens := TLt :: !tokens
| Some '>' ->
incr pos;
if !pos < len && input.[!pos] = '=' then begin
incr pos;
tokens := TGe :: !tokens
end else
tokens := TGt :: !tokens
| Some '!' ->
incr pos;
if !pos < len && input.[!pos] = '=' then begin
incr pos;
tokens := TNeq :: !tokens
end else
tokens := TNot :: !tokens
| Some '(' -> incr pos; tokens := TLParen :: !tokens
| Some ')' -> incr pos; tokens := TRParen :: !tokens
| Some ',' -> incr pos; tokens := TComma :: !tokens
| Some c -> failwith (Printf.sprintf "意外字符: %c" c)
| None -> ()
done;
List.rev (TEOF :: !tokens)
⚠️ 注意:词法分析器需要正确处理多字符 token(如 ->、<=、!=),避免误识别。
语法分析
(* 递归下降解析器 *)
let parse tokens =
let pos = ref 0 in
let peek () = List.nth tokens !pos in
let advance () =
let t = List.nth tokens !pos in
incr pos;
t
in
let expect tok =
let t = advance () in
if t <> tok then
failwith (Printf.sprintf "期望 %s" (token_to_string tok))
in
let rec parse_expr () = parse_or ()
and parse_or () =
let left = parse_and () in
match peek () with
| TOr ->
ignore (advance ());
let right = parse_and () in
EOr (left, right)
| _ -> left
and parse_and () =
let left = parse_comparison () in
match peek () with
| TAnd ->
ignore (advance ());
let right = parse_comparison () in
EAnd (left, right)
| _ -> left
and parse_comparison () =
let left = parse_additive () in
match peek () with
| TEq -> ignore (advance ()); EEq (left, parse_additive ())
| TLt -> ignore (advance ()); ELt (left, parse_additive ())
| TGt -> ignore (advance ()); EGt (left, parse_additive ())
| TLe -> ignore (advance ()); EAnd (ELt (left, parse_additive ()), EEq (left, parse_additive ()))
| TGe -> ignore (advance ()); EOr (EGt (left, parse_additive ()), EEq (left, parse_additive ()))
| TNeq -> ignore (advance ()); ENot (EEq (left, parse_additive ()))
| _ -> left
and parse_additive () =
let left = parse_multiplicative () in
match peek () with
| TPlus -> ignore (advance ()); EAdd (left, parse_multiplicative ())
| TMinus -> ignore (advance ()); ESub (left, parse_multiplicative ())
| _ -> left
and parse_multiplicative () =
let left = parse_unary () in
match peek () with
| TStar -> ignore (advance ()); EMul (left, parse_unary ())
| TSlash -> ignore (advance ()); EDiv (left, parse_unary ())
| _ -> left
and parse_unary () =
match peek () with
| TMinus -> ignore (advance ()); ENeg (parse_unary ())
| TNot -> ignore (advance ()); ENot (parse_unary ())
| _ -> parse_primary ()
and parse_primary () =
match peek () with
| TNum n -> ignore (advance ()); ENum n
| TBool b -> ignore (advance ()); EBool b
| TIdent id ->
ignore (advance ());
if peek () = TLParen then begin
ignore (advance ());
let args = parse_args () in
expect TRParen;
EApp (EVar id, args)
end else
EVar id
| TLParen ->
ignore (advance ());
let e = parse_expr () in
expect TRParen;
e
| TLet ->
ignore (advance ());
let recursive = peek () = TRec in
if recursive then ignore (advance ());
let id = match advance () with TIdent s -> s | _ -> failwith "期望标识符" in
expect TEq;
let value = parse_expr () in
expect TIn;
let body = parse_expr () in
if recursive then ELetRec (id, value, body)
else ELet (id, value, body)
| TIf ->
ignore (advance ());
let cond = parse_expr () in
expect TThen;
let then_expr = parse_expr () in
expect TElse;
let else_expr = parse_expr () in
EIf (cond, then_expr, else_expr)
| TFun ->
ignore (advance ());
let params = parse_params () in
expect TArrow;
let body = parse_expr () in
EFun (params, body)
| _ -> failwith "意外的 token"
and parse_args () =
if peek () = TRParen then []
else
let arg = parse_expr () in
let rest = ref [] in
while peek () = TComma do
ignore (advance ());
rest := parse_expr () :: !rest
done;
arg :: List.rev !rest
and parse_params () =
match peek () with
| TIdent id ->
ignore (advance ());
let params = ref [id] in
while peek () = TComma do
ignore (advance ());
match advance () with
| TIdent p -> params := p :: !params
| _ -> failwith "期望参数名"
done;
List.rev !params
| _ -> failwith "期望参数列表"
in
let result = parse_expr () in
if peek () <> TEOF then failwith "解析未完成";
result
(* token 转字符串(用于错误消息) *)
let token_to_string = function
| TNum n -> Printf.sprintf "%.2f" n
| TBool b -> string_of_bool b
| TIdent s -> s
| TPlus -> "+" | TMinus -> "-" | TStar -> "*" | TSlash -> "/"
| TEq -> "=" | TLt -> "<" | TGt -> ">"
| TLParen -> "(" | TRParen -> ")"
| TLet -> "let" | TIn -> "in" | TIf -> "if"
| TThen -> "then" | TElse -> "else" | TFun -> "fun"
| _ -> "???"
求值器
(* 环境操作 *)
let env_empty = []
let env_extend env name value = (name, value) :: env
let env_extend_multi env names values =
List.fold_left2 (fun e n v -> env_extend e n v) env names values
let env_lookup env name =
try List.assoc name env
with Not_found -> failwith (Printf.sprintf "未绑定变量: %s" name)
(* 求值函数 *)
let rec eval env = function
| ENum n -> VNum n
| EBool b -> VBool b
| EVar id -> env_lookup env id
| EAdd (l, r) -> eval_binop env ( +. ) l r
| ESub (l, r) -> eval_binop env ( -. ) l r
| EMul (l, r) -> eval_binop env ( *. ) l r
| EDiv (l, r) -> eval_binop env ( /. ) l r
| EEq (l, r) ->
let lv = eval env l and rv = eval env r in
VBool (lv = rv)
| ELt (l, r) ->
(match eval env l, eval env r with
| VNum a, VNum b -> VBool (a < b)
| _ -> failwith "< 运算需要数字")
| EGt (l, r) ->
(match eval env l, eval env r with
| VNum a, VNum b -> VBool (a > b)
| _ -> failwith "> 运算需要数字")
| EAnd (l, r) ->
(match eval env l with
| VBool false -> VBool false
| VBool true -> eval env r
| _ -> failwith "and 运算需要布尔值")
| EOr (l, r) ->
(match eval env l with
| VBool true -> VBool true
| VBool false -> eval env r
| _ -> failwith "or 运算需要布尔值")
| ENeg e ->
(match eval env e with
| VNum n -> VNum (-.n)
| _ -> failwith "- 运算需要数字")
| ENot e ->
(match eval env e with
| VBool b -> VBool (not b)
| _ -> failwith "not 运算需要布尔值")
| EIf (cond, then_e, else_e) ->
(match eval env cond with
| VBool true -> eval env then_e
| VBool false -> eval env else_e
| _ -> failwith "条件必须是布尔值")
| ELet (name, value, body) ->
let v = eval env value in
eval (env_extend env name v) body
| EFun (params, body) ->
VClosure (params, body, env)
| EApp (func, args) ->
let fv = eval env func in
let argvs = List.map (eval env) args in
apply fv argvs
| ELetRec (name, value, body) ->
let rec_env = ref env in
let v = eval !rec_env value in
rec_env := env_extend !rec_env name v;
eval !rec_env body
and eval_binop env op l r =
match eval env l, eval env r with
| VNum a, VNum b -> VNum (op a b)
| _ -> failwith "算术运算需要数字"
and apply func args =
match func with
| VClosure (params, body, closure_env) ->
if List.length params <> List.length args then
failwith "参数数量不匹配";
let new_env = env_extend_multi closure_env params args in
eval new_env body
| VBuiltin f -> f args
| _ -> failwith "不可调用的值"
💡 提示:使用 VClosure 捕获定义时的环境,这是实现闭包的关键。函数在定义时"记住"了它所在的环境。
环境与闭包
(* 预定义环境 *)
let default_env = [
("pi", VNum 3.14159265358979);
("print", VBuiltin (fun args ->
List.iter (fun v ->
match v with
| VNum n -> Printf.printf "%.6g\n" n
| VBool b -> Printf.printf "%s\n" (string_of_bool b)
| VClosure _ -> Printf.printf "<fun>\n"
| VBuiltin _ -> Printf.printf "<builtin>\n"
) args;
VNum 0.0 (* 返回 unit *)
));
("sqrt", VBuiltin (fun args ->
match args with
| [VNum n] -> VNum (sqrt n)
| _ -> failwith "sqrt 需要一个数字参数"
));
("int_of_float", VBuiltin (fun args ->
match args with
| [VNum n] -> VNum (floor n)
| _ -> failwith "int_of_float 需要一个数字参数"
));
]
(* 闭包示例 *)
let closure_example = {|
let make_adder = fun x ->
fun y -> x + y in
let add5 = make_adder(5) in
let add10 = make_adder(10) in
print(add5(3)); (* 输出 8 *)
print(add10(3)) (* 输出 13 *)
|}
(* 递归函数示例 *)
let rec_example = {|
let rec factorial = fun n ->
if n = 0 then 1
else n * factorial(n - 1) in
print(factorial(5)); (* 输出 120 *)
print(factorial(10)) (* 输出 3628800 *)
|}
| 概念 | 说明 | 示例 |
|---|---|---|
| 环境 | 变量名到值的映射 | [("x", VNum 1)] |
| 闭包 | 函数 + 定义时的环境 | VClosure(params, body, env) |
| 递归 | let rec 绑定自身 | let rec f = fun n -> ... f(n-1) ... |
REPL 实现
(* 主 REPL 循环 *)
let run_repl () =
let env = ref default_env in
Printf.printf "MiniML 解释器 v0.1\n";
Printf.printf "输入 'quit' 退出\n\n";
let continue = ref true in
while !continue do
Printf.printf "> ";
flush stdout;
try
let line = input_line stdin in
if String.trim line = "quit" then
continue := false
else begin
let tokens = tokenize line in
let ast = parse tokens in
let result = eval !env ast in
match result with
| VNum n -> Printf.printf "= %.6g\n" n
| VBool b -> Printf.printf "= %s\n" (string_of_bool b)
| VClosure _ -> Printf.printf "= <fun>\n"
| VBuiltin _ -> Printf.printf "= <builtin>\n"
end
with
| End_of_file -> continue := false
| Failure msg -> Printf.printf "错误: %s\n" msg
| exn -> Printf.printf "未知错误: %s\n" (Printexc.to_string exn)
done;
Printf.printf "再见!\n"
(* 从文件执行 *)
let run_file filename =
let ic = open_in filename in
let content = really_input_string ic (in_channel_length ic) in
close_in ic;
let env = ref default_env in
let tokens = tokenize content in
let ast = parse tokens in
ignore (eval !env ast)
(* 入口 *)
let () =
if Array.length Sys.argv > 1 then
run_file Sys.argv.(1)
else
run_repl ()
⚠️ 注意:REPL 应该优雅地处理所有异常,避免因单个错误导致解释器崩溃。
错误处理
(* 位置感知的错误类型 *)
type location = {
line: int;
col: int;
filename: string;
}
type error =
| LexError of location * string
| ParseError of location * string
| RuntimeError of location * string
| TypeError of location * string
let format_error = function
| LexError (loc, msg) ->
Printf.sprintf "%s:%d:%d 词法错误: %s" loc.filename loc.line loc.col msg
| ParseError (loc, msg) ->
Printf.sprintf "%s:%d:%d 语法错误: %s" loc.filename loc.line loc.col msg
| RuntimeError (loc, msg) ->
Printf.sprintf "%s:%d:%d 运行时错误: %s" loc.filename loc.line loc.col msg
| TypeError (loc, msg) ->
Printf.sprintf "%s:%d:%d 类型错误: %s" loc.filename loc.line loc.col msg
(* 异常版本 *)
exception Error of error
let error loc kind msg = raise (Error (kind (loc, msg)))
(* 恢复模式解析 *)
let parse_with_recovery tokens =
try Ok (parse tokens)
with Failure msg -> Error msg
类型检查器
(* 简单类型系统 *)
type typ =
| TInt
| TFloat
| TBool
| TArrow of typ list * typ (* 函数类型 *)
| TVar of string (* 类型变量 *)
type type_env = (string * typ) list
(* 类型推导 *)
let rec infer_type env = function
| ENum _ -> TFloat
| EBool _ -> TBool
| EVar id ->
(try List.assoc id env
with Not_found -> failwith (Printf.sprintf "未绑定变量: %s" id))
| EAdd (l, r) | ESub (l, r) | EMul (l, r) | EDiv (l, r) ->
check_type env TFloat l;
check_type env TFloat r;
TFloat
| EEq (l, r) ->
let lt = infer_type env l in
check_type env lt r;
TBool
| ELt (l, r) | EGt (l, r) ->
check_type env TFloat l;
check_type env TFloat r;
TBool
| EAnd (l, r) | EOr (l, r) ->
check_type env TBool l;
check_type env TBool r;
TBool
| ENeg e ->
check_type env TFloat e;
TFloat
| ENot e ->
check_type env TBool e;
TBool
| EIf (cond, then_e, else_e) ->
check_type env TBool cond;
let t = infer_type env then_e in
check_type env t else_e;
t
| ELet (name, value, body) ->
let vt = infer_type env value in
infer_type ((name, vt) :: env) body
| EFun (params, body) ->
let param_types = List.map (fun _ -> TVar "a") params in
let body_env = List.fold_left2 (fun e p t -> (p, t) :: e) env params param_types in
let body_type = infer_type body_env body in
TArrow (param_types, body_type)
| EApp (func, args) ->
(match infer_type env func with
| TArrow (param_types, return_type) ->
if List.length param_types <> List.length args then
failwith "参数数量不匹配";
List.iter2 (check_type env) param_types args;
return_type
| _ -> failwith "不可调用非函数类型")
| ELetRec (name, value, body) ->
let tv = TVar "a" in
let vt = infer_type ((name, tv) :: env) value in
infer_type ((name, vt) :: env) body
and check_type env expected expr =
let actual = infer_type env expr in
if actual <> expected then
failwith (Printf.sprintf "类型不匹配: 期望 %s, 得到 %s"
(type_to_string expected) (type_to_string actual))
and type_to_string = function
| TFloat -> "float"
| TInt -> "int"
| TBool -> "bool"
| TArrow (params, ret) ->
Printf.sprintf "(%s) -> %s"
(String.concat ", " (List.map type_to_string params))
(type_to_string ret)
| TVar s -> "'" ^ s
💡 提示:完整的类型推导需要统一化算法(unification),这里演示的是简化的类型检查。生产环境建议使用 Hindley-Milner 类型系统。
扩展:模式匹配
(* 扩展 AST 支持模式匹配 *)
type pattern =
| PVar of string
| PNum of float
| PBool of bool
| PWild
| PTuple of pattern list
type expr_extended = expr =
(* ... 原有表达式 ... *)
| EMatch of expr * (pattern * expr) list
(* 模式匹配求值 *)
let rec match_pattern env value pattern =
match pattern with
| PVar id -> Some (env_extend env id value)
| PNum n ->
(match value with
| VNum m when m = n -> Some env
| _ -> None)
| PBool b ->
(match value with
| VBool bv when bv = b -> Some env
| _ -> None)
| PWild -> Some env
| PTuple patterns ->
(match value with
| VTuple values when List.length values = List.length patterns ->
List.fold_left2 (fun acc p v ->
match acc with
| None -> None
| Some env -> match_pattern env v p
) (Some env) patterns values
| _ -> None)
let eval_match env value cases =
let rec try_cases = function
| [] -> failwith "模式匹配不完整"
| (pattern, body) :: rest ->
match match_pattern env value pattern with
| Some new_env -> eval new_env body
| None -> try_cases rest
in
try_cases cases
扩展阅读
- Crafting Interpreters — 经典编译器教程
- OCaml 实现的 Scheme 解释器
- PL Zoo — 编程语言动物园
- Real World OCaml - 编译器前端
上一节:解析器组合子 下一节:Web 开发(Dream/Opium)