强曰为道
与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧.
文档目录

OCaml 教程 / 编写一个解释器

编写一个解释器

本节将从零开始构建一个简单的解释器,涵盖语言定义、词法分析、语法分析、求值和 REPL 实现。

语言定义

我们实现一个小型函数式语言,支持:

  • 数字和布尔值
  • 算术和比较运算
  • 变量绑定(let)
  • 条件表达式(if-else)
  • 函数定义和调用
  • 闭包
特性语法示例说明
数字42, 3.14浮点数
布尔true, false逻辑值
算术1 + 2 * 3支持优先级
比较x > 0返回布尔值
绑定let x = 1 in x + 1let-in 表达式
函数fun x -> x + 1lambda 表达式
调用f(1, 2)函数应用
条件if x > 0 then x else -xif-then-else

AST 定义

(* 表达式 AST *)
type expr =
  | ENum of float
  | EBool of bool
  | EVar of string
  | EAdd of expr * expr
  | ESub of expr * expr
  | EMul of expr * expr
  | EDiv of expr * expr
  | EEq of expr * expr
  | ELt of expr * expr
  | EGt of expr * expr
  | EAnd of expr * expr
  | EOr of expr * expr
  | ENeg of expr
  | ENot of expr
  | EIf of expr * expr * expr
  | ELet of string * expr * expr
  | EFun of string list * expr
  | EApp of expr * expr list
  | ELetRec of string * expr * expr

(* 值类型 *)
type value =
  | VNum of float
  | VBool of bool
  | VClosure of string list * expr * env
  | VBuiltin of (value list -> value)

and env = (string * value) list

💡 提示:AST 设计应尽量简单,保持每个节点的语义清晰。后续可以扩展,但简化比复杂化容易得多。

词法分析

(* Token 类型 *)
type token =
  | TNum of float
  | TBool of bool
  | TIdent of string
  | TPlus | TMinus | TStar | TSlash
  | TEq | TLt | TGt | TLe | TGe | TNeq
  | TAnd | TOr | TNot
  | TLParen | TRParen
  | TArrow | TComma
  | TLet | TIn | TIf | TThen | TElse | TFun | TRec
  | TEOF

(* 词法分析器 *)
let tokenize input =
  let len = String.length input in
  let pos = ref 0 in
  let tokens = ref [] in
  
  let peek () = 
    if !pos < len then Some input.[!pos] else None 
  in
  let advance () = 
    let c = input.[!pos] in 
    incr pos; 
    c 
  in
  let skip_whitespace () =
    while !pos < len && (let c = input.[!pos] in c = ' ' || c = '\t' || c = '\n' || c = '\r') do
      incr pos
    done
  in
  let read_number () =
    let start = !pos in
    while !pos < len && (let c = input.[!pos] in c >= '0' && c <= '9' || c = '.') do
      incr pos
    done;
    float_of_string (String.sub input start (!pos - start))
  in
  let read_identifier () =
    let start = !pos in
    while !pos < len && (let c = input.[!pos] in 
      (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || 
      (c >= '0' && c <= '9') || c = '_') do
      incr pos
    done;
    String.sub input start (!pos - start)
  in
  
  while !pos < len do
    skip_whitespace ();
    if !pos >= len then ()
    else match peek () with
    | Some c when c >= '0' && c <= '9' ->
      tokens := TNum (read_number ()) :: !tokens
    | Some c when (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ->
      let id = read_identifier () in
      let tok = match id with
        | "true" -> TBool true
        | "false" -> TBool false
        | "let" -> TLet
        | "in" -> TIn
        | "if" -> TIf
        | "then" -> TThen
        | "else" -> TElse
        | "fun" -> TFun
        | "rec" -> TRec
        | "and" -> TAnd
        | "or" -> TOr
        | "not" -> TNot
        | _ -> TIdent id
      in
      tokens := tok :: !tokens
    | Some '+' -> incr pos; tokens := TPlus :: !tokens
    | Some '-' -> 
      incr pos;
      if !pos < len && input.[!pos] = '>' then begin
        incr pos;
        tokens := TArrow :: !tokens
      end else
        tokens := TMinus :: !tokens
    | Some '*' -> incr pos; tokens := TStar :: !tokens
    | Some '/' -> incr pos; tokens := TSlash :: !tokens
    | Some '=' -> incr pos; tokens := TEq :: !tokens
    | Some '<' ->
      incr pos;
      if !pos < len && input.[!pos] = '=' then begin
        incr pos;
        tokens := TLe :: !tokens
      end else
        tokens := TLt :: !tokens
    | Some '>' ->
      incr pos;
      if !pos < len && input.[!pos] = '=' then begin
        incr pos;
        tokens := TGe :: !tokens
      end else
        tokens := TGt :: !tokens
    | Some '!' ->
      incr pos;
      if !pos < len && input.[!pos] = '=' then begin
        incr pos;
        tokens := TNeq :: !tokens
      end else
        tokens := TNot :: !tokens
    | Some '(' -> incr pos; tokens := TLParen :: !tokens
    | Some ')' -> incr pos; tokens := TRParen :: !tokens
    | Some ',' -> incr pos; tokens := TComma :: !tokens
    | Some c -> failwith (Printf.sprintf "意外字符: %c" c)
    | None -> ()
  done;
  
  List.rev (TEOF :: !tokens)

⚠️ 注意:词法分析器需要正确处理多字符 token(如 -><=!=),避免误识别。

语法分析

(* 递归下降解析器 *)
let parse tokens =
  let pos = ref 0 in
  let peek () = List.nth tokens !pos in
  let advance () =
    let t = List.nth tokens !pos in
    incr pos;
    t
  in
  let expect tok =
    let t = advance () in
    if t <> tok then
      failwith (Printf.sprintf "期望 %s" (token_to_string tok))
  in
  
  let rec parse_expr () = parse_or ()
  
  and parse_or () =
    let left = parse_and () in
    match peek () with
    | TOr ->
      ignore (advance ());
      let right = parse_and () in
      EOr (left, right)
    | _ -> left
  
  and parse_and () =
    let left = parse_comparison () in
    match peek () with
    | TAnd ->
      ignore (advance ());
      let right = parse_comparison () in
      EAnd (left, right)
    | _ -> left
  
  and parse_comparison () =
    let left = parse_additive () in
    match peek () with
    | TEq -> ignore (advance ()); EEq (left, parse_additive ())
    | TLt -> ignore (advance ()); ELt (left, parse_additive ())
    | TGt -> ignore (advance ()); EGt (left, parse_additive ())
    | TLe -> ignore (advance ()); EAnd (ELt (left, parse_additive ()), EEq (left, parse_additive ()))
    | TGe -> ignore (advance ()); EOr (EGt (left, parse_additive ()), EEq (left, parse_additive ()))
    | TNeq -> ignore (advance ()); ENot (EEq (left, parse_additive ()))
    | _ -> left
  
  and parse_additive () =
    let left = parse_multiplicative () in
    match peek () with
    | TPlus -> ignore (advance ()); EAdd (left, parse_multiplicative ())
    | TMinus -> ignore (advance ()); ESub (left, parse_multiplicative ())
    | _ -> left
  
  and parse_multiplicative () =
    let left = parse_unary () in
    match peek () with
    | TStar -> ignore (advance ()); EMul (left, parse_unary ())
    | TSlash -> ignore (advance ()); EDiv (left, parse_unary ())
    | _ -> left
  
  and parse_unary () =
    match peek () with
    | TMinus -> ignore (advance ()); ENeg (parse_unary ())
    | TNot -> ignore (advance ()); ENot (parse_unary ())
    | _ -> parse_primary ()
  
  and parse_primary () =
    match peek () with
    | TNum n -> ignore (advance ()); ENum n
    | TBool b -> ignore (advance ()); EBool b
    | TIdent id ->
      ignore (advance ());
      if peek () = TLParen then begin
        ignore (advance ());
        let args = parse_args () in
        expect TRParen;
        EApp (EVar id, args)
      end else
        EVar id
    | TLParen ->
      ignore (advance ());
      let e = parse_expr () in
      expect TRParen;
      e
    | TLet ->
      ignore (advance ());
      let recursive = peek () = TRec in
      if recursive then ignore (advance ());
      let id = match advance () with TIdent s -> s | _ -> failwith "期望标识符" in
      expect TEq;
      let value = parse_expr () in
      expect TIn;
      let body = parse_expr () in
      if recursive then ELetRec (id, value, body)
      else ELet (id, value, body)
    | TIf ->
      ignore (advance ());
      let cond = parse_expr () in
      expect TThen;
      let then_expr = parse_expr () in
      expect TElse;
      let else_expr = parse_expr () in
      EIf (cond, then_expr, else_expr)
    | TFun ->
      ignore (advance ());
      let params = parse_params () in
      expect TArrow;
      let body = parse_expr () in
      EFun (params, body)
    | _ -> failwith "意外的 token"
  
  and parse_args () =
    if peek () = TRParen then []
    else
      let arg = parse_expr () in
      let rest = ref [] in
      while peek () = TComma do
        ignore (advance ());
        rest := parse_expr () :: !rest
      done;
      arg :: List.rev !rest
  
  and parse_params () =
    match peek () with
    | TIdent id ->
      ignore (advance ());
      let params = ref [id] in
      while peek () = TComma do
        ignore (advance ());
        match advance () with
        | TIdent p -> params := p :: !params
        | _ -> failwith "期望参数名"
      done;
      List.rev !params
    | _ -> failwith "期望参数列表"
  in
  
  let result = parse_expr () in
  if peek () <> TEOF then failwith "解析未完成";
  result

(* token 转字符串(用于错误消息) *)
let token_to_string = function
  | TNum n -> Printf.sprintf "%.2f" n
  | TBool b -> string_of_bool b
  | TIdent s -> s
  | TPlus -> "+" | TMinus -> "-" | TStar -> "*" | TSlash -> "/"
  | TEq -> "=" | TLt -> "<" | TGt -> ">"
  | TLParen -> "(" | TRParen -> ")"
  | TLet -> "let" | TIn -> "in" | TIf -> "if"
  | TThen -> "then" | TElse -> "else" | TFun -> "fun"
  | _ -> "???"

求值器

(* 环境操作 *)
let env_empty = []
let env_extend env name value = (name, value) :: env
let env_extend_multi env names values =
  List.fold_left2 (fun e n v -> env_extend e n v) env names values
let env_lookup env name =
  try List.assoc name env
  with Not_found -> failwith (Printf.sprintf "未绑定变量: %s" name)

(* 求值函数 *)
let rec eval env = function
  | ENum n -> VNum n
  | EBool b -> VBool b
  | EVar id -> env_lookup env id
  | EAdd (l, r) -> eval_binop env ( +. ) l r
  | ESub (l, r) -> eval_binop env ( -. ) l r
  | EMul (l, r) -> eval_binop env ( *. ) l r
  | EDiv (l, r) -> eval_binop env ( /. ) l r
  | EEq (l, r) ->
    let lv = eval env l and rv = eval env r in
    VBool (lv = rv)
  | ELt (l, r) ->
    (match eval env l, eval env r with
     | VNum a, VNum b -> VBool (a < b)
     | _ -> failwith "< 运算需要数字")
  | EGt (l, r) ->
    (match eval env l, eval env r with
     | VNum a, VNum b -> VBool (a > b)
     | _ -> failwith "> 运算需要数字")
  | EAnd (l, r) ->
    (match eval env l with
     | VBool false -> VBool false
     | VBool true -> eval env r
     | _ -> failwith "and 运算需要布尔值")
  | EOr (l, r) ->
    (match eval env l with
     | VBool true -> VBool true
     | VBool false -> eval env r
     | _ -> failwith "or 运算需要布尔值")
  | ENeg e ->
    (match eval env e with
     | VNum n -> VNum (-.n)
     | _ -> failwith "- 运算需要数字")
  | ENot e ->
    (match eval env e with
     | VBool b -> VBool (not b)
     | _ -> failwith "not 运算需要布尔值")
  | EIf (cond, then_e, else_e) ->
    (match eval env cond with
     | VBool true -> eval env then_e
     | VBool false -> eval env else_e
     | _ -> failwith "条件必须是布尔值")
  | ELet (name, value, body) ->
    let v = eval env value in
    eval (env_extend env name v) body
  | EFun (params, body) ->
    VClosure (params, body, env)
  | EApp (func, args) ->
    let fv = eval env func in
    let argvs = List.map (eval env) args in
    apply fv argvs
  | ELetRec (name, value, body) ->
    let rec_env = ref env in
    let v = eval !rec_env value in
    rec_env := env_extend !rec_env name v;
    eval !rec_env body

and eval_binop env op l r =
  match eval env l, eval env r with
  | VNum a, VNum b -> VNum (op a b)
  | _ -> failwith "算术运算需要数字"

and apply func args =
  match func with
  | VClosure (params, body, closure_env) ->
    if List.length params <> List.length args then
      failwith "参数数量不匹配";
    let new_env = env_extend_multi closure_env params args in
    eval new_env body
  | VBuiltin f -> f args
  | _ -> failwith "不可调用的值"

💡 提示:使用 VClosure 捕获定义时的环境,这是实现闭包的关键。函数在定义时"记住"了它所在的环境。

环境与闭包

(* 预定义环境 *)
let default_env = [
  ("pi", VNum 3.14159265358979);
  ("print", VBuiltin (fun args ->
    List.iter (fun v ->
      match v with
      | VNum n -> Printf.printf "%.6g\n" n
      | VBool b -> Printf.printf "%s\n" (string_of_bool b)
      | VClosure _ -> Printf.printf "<fun>\n"
      | VBuiltin _ -> Printf.printf "<builtin>\n"
    ) args;
    VNum 0.0  (* 返回 unit *)
  ));
  ("sqrt", VBuiltin (fun args ->
    match args with
    | [VNum n] -> VNum (sqrt n)
    | _ -> failwith "sqrt 需要一个数字参数"
  ));
  ("int_of_float", VBuiltin (fun args ->
    match args with
    | [VNum n] -> VNum (floor n)
    | _ -> failwith "int_of_float 需要一个数字参数"
  ));
]

(* 闭包示例 *)
let closure_example = {|
let make_adder = fun x ->
  fun y -> x + y in

let add5 = make_adder(5) in
let add10 = make_adder(10) in

print(add5(3));   (* 输出 8 *)
print(add10(3))   (* 输出 13 *)
|}

(* 递归函数示例 *)
let rec_example = {|
let rec factorial = fun n ->
  if n = 0 then 1
  else n * factorial(n - 1) in

print(factorial(5));  (* 输出 120 *)
print(factorial(10))  (* 输出 3628800 *)
|}
概念说明示例
环境变量名到值的映射[("x", VNum 1)]
闭包函数 + 定义时的环境VClosure(params, body, env)
递归let rec 绑定自身let rec f = fun n -> ... f(n-1) ...

REPL 实现

(* 主 REPL 循环 *)
let run_repl () =
  let env = ref default_env in
  Printf.printf "MiniML 解释器 v0.1\n";
  Printf.printf "输入 'quit' 退出\n\n";
  
  let continue = ref true in
  while !continue do
    Printf.printf "> ";
    flush stdout;
    
    try
      let line = input_line stdin in
      if String.trim line = "quit" then
        continue := false
      else begin
        let tokens = tokenize line in
        let ast = parse tokens in
        let result = eval !env ast in
        match result with
        | VNum n -> Printf.printf "= %.6g\n" n
        | VBool b -> Printf.printf "= %s\n" (string_of_bool b)
        | VClosure _ -> Printf.printf "= <fun>\n"
        | VBuiltin _ -> Printf.printf "= <builtin>\n"
      end
    with
    | End_of_file -> continue := false
    | Failure msg -> Printf.printf "错误: %s\n" msg
    | exn -> Printf.printf "未知错误: %s\n" (Printexc.to_string exn)
  done;
  Printf.printf "再见!\n"

(* 从文件执行 *)
let run_file filename =
  let ic = open_in filename in
  let content = really_input_string ic (in_channel_length ic) in
  close_in ic;
  
  let env = ref default_env in
  let tokens = tokenize content in
  let ast = parse tokens in
  ignore (eval !env ast)

(* 入口 *)
let () =
  if Array.length Sys.argv > 1 then
    run_file Sys.argv.(1)
  else
    run_repl ()

⚠️ 注意:REPL 应该优雅地处理所有异常,避免因单个错误导致解释器崩溃。

错误处理

(* 位置感知的错误类型 *)
type location = {
  line: int;
  col: int;
  filename: string;
}

type error =
  | LexError of location * string
  | ParseError of location * string
  | RuntimeError of location * string
  | TypeError of location * string

let format_error = function
  | LexError (loc, msg) ->
    Printf.sprintf "%s:%d:%d 词法错误: %s" loc.filename loc.line loc.col msg
  | ParseError (loc, msg) ->
    Printf.sprintf "%s:%d:%d 语法错误: %s" loc.filename loc.line loc.col msg
  | RuntimeError (loc, msg) ->
    Printf.sprintf "%s:%d:%d 运行时错误: %s" loc.filename loc.line loc.col msg
  | TypeError (loc, msg) ->
    Printf.sprintf "%s:%d:%d 类型错误: %s" loc.filename loc.line loc.col msg

(* 异常版本 *)
exception Error of error

let error loc kind msg = raise (Error (kind (loc, msg)))

(* 恢复模式解析 *)
let parse_with_recovery tokens =
  try Ok (parse tokens)
  with Failure msg -> Error msg

类型检查器

(* 简单类型系统 *)
type typ =
  | TInt
  | TFloat
  | TBool
  | TArrow of typ list * typ  (* 函数类型 *)
  | TVar of string            (* 类型变量 *)

type type_env = (string * typ) list

(* 类型推导 *)
let rec infer_type env = function
  | ENum _ -> TFloat
  | EBool _ -> TBool
  | EVar id ->
    (try List.assoc id env
     with Not_found -> failwith (Printf.sprintf "未绑定变量: %s" id))
  | EAdd (l, r) | ESub (l, r) | EMul (l, r) | EDiv (l, r) ->
    check_type env TFloat l;
    check_type env TFloat r;
    TFloat
  | EEq (l, r) ->
    let lt = infer_type env l in
    check_type env lt r;
    TBool
  | ELt (l, r) | EGt (l, r) ->
    check_type env TFloat l;
    check_type env TFloat r;
    TBool
  | EAnd (l, r) | EOr (l, r) ->
    check_type env TBool l;
    check_type env TBool r;
    TBool
  | ENeg e ->
    check_type env TFloat e;
    TFloat
  | ENot e ->
    check_type env TBool e;
    TBool
  | EIf (cond, then_e, else_e) ->
    check_type env TBool cond;
    let t = infer_type env then_e in
    check_type env t else_e;
    t
  | ELet (name, value, body) ->
    let vt = infer_type env value in
    infer_type ((name, vt) :: env) body
  | EFun (params, body) ->
    let param_types = List.map (fun _ -> TVar "a") params in
    let body_env = List.fold_left2 (fun e p t -> (p, t) :: e) env params param_types in
    let body_type = infer_type body_env body in
    TArrow (param_types, body_type)
  | EApp (func, args) ->
    (match infer_type env func with
     | TArrow (param_types, return_type) ->
       if List.length param_types <> List.length args then
         failwith "参数数量不匹配";
       List.iter2 (check_type env) param_types args;
       return_type
     | _ -> failwith "不可调用非函数类型")
  | ELetRec (name, value, body) ->
    let tv = TVar "a" in
    let vt = infer_type ((name, tv) :: env) value in
    infer_type ((name, vt) :: env) body

and check_type env expected expr =
  let actual = infer_type env expr in
  if actual <> expected then
    failwith (Printf.sprintf "类型不匹配: 期望 %s, 得到 %s"
      (type_to_string expected) (type_to_string actual))

and type_to_string = function
  | TFloat -> "float"
  | TInt -> "int"
  | TBool -> "bool"
  | TArrow (params, ret) ->
    Printf.sprintf "(%s) -> %s"
      (String.concat ", " (List.map type_to_string params))
      (type_to_string ret)
  | TVar s -> "'" ^ s

💡 提示:完整的类型推导需要统一化算法(unification),这里演示的是简化的类型检查。生产环境建议使用 Hindley-Milner 类型系统。

扩展:模式匹配

(* 扩展 AST 支持模式匹配 *)
type pattern =
  | PVar of string
  | PNum of float
  | PBool of bool
  | PWild
  | PTuple of pattern list

type expr_extended = expr =
  (* ... 原有表达式 ... *)
  | EMatch of expr * (pattern * expr) list

(* 模式匹配求值 *)
let rec match_pattern env value pattern =
  match pattern with
  | PVar id -> Some (env_extend env id value)
  | PNum n ->
    (match value with
     | VNum m when m = n -> Some env
     | _ -> None)
  | PBool b ->
    (match value with
     | VBool bv when bv = b -> Some env
     | _ -> None)
  | PWild -> Some env
  | PTuple patterns ->
    (match value with
     | VTuple values when List.length values = List.length patterns ->
       List.fold_left2 (fun acc p v ->
         match acc with
         | None -> None
         | Some env -> match_pattern env v p
       ) (Some env) patterns values
     | _ -> None)

let eval_match env value cases =
  let rec try_cases = function
    | [] -> failwith "模式匹配不完整"
    | (pattern, body) :: rest ->
      match match_pattern env value pattern with
      | Some new_env -> eval new_env body
      | None -> try_cases rest
  in
  try_cases cases

扩展阅读


上一节解析器组合子 下一节Web 开发(Dream/Opium)