强曰为道
与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧.
文档目录

OCaml 教程 / 解析器组合子

解析器组合子 (Parser Combinators)

解析器组合子是一种函数式编程技术,通过将小型解析器组合成更复杂的解析器来构建完整的解析器。OCaml 生态系统中有多个优秀的解析器组合子库。

解析器类型定义

核心思想是将解析器定义为一个函数:接受输入流,返回解析结果和剩余输入。

(* 基本解析器类型 *)
type 'a parser = {
  run : string -> int -> ('a * int, string) result
}

(* 解析成功 *)
let ok value pos = Ok (value, pos)

(* 解析失败 *)
let err msg = Error msg

(* 运行解析器 *)
let parse p input =
  match p.run input 0 with
  | Ok (value, pos) ->
    if pos = String.length input then Ok value
    else Error (Printf.sprintf "未消耗完输入,剩余 %d 字符" (String.length input - pos))
  | Error msg -> Error msg
组件说明
parser 类型核心类型,封装解析逻辑
run 函数接受输入字符串和当前位置,返回结果
result 类型成功返回值和新位置,失败返回错误消息

基本解析器

字符解析器

(* 匹配特定字符 *)
let char expected =
  { run = fun input pos ->
    if pos >= String.length input then
      err (Printf.sprintf "期望 '%c' 但到达输入末尾" expected)
    else if input.[pos] = expected then
      ok expected (pos + 1)
    else
      err (Printf.sprintf "期望 '%c' 但得到 '%c'" expected input.[pos])
  }

(* 匹配满足条件的字符 *)
let satisfy name predicate =
  { run = fun input pos ->
    if pos >= String.length input then
      err (Printf.sprintf "期望 %s 但到达输入末尾" name)
    else if predicate input.[pos] then
      ok input.[pos] (pos + 1)
    else
      err (Printf.sprintf "期望 %s 但得到 '%c'" name input.[pos])
  }

(* 预定义字符类 *)
let digit = satisfy "digit" (fun c -> c >= '0' && c <= '9')
let alpha = satisfy "alpha" (fun c -> (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
let whitespace = satisfy "whitespace" (fun c -> c = ' ' || c = '\t' || c = '\n' || c = '\r')

字符串解析器

(* 匹配特定字符串 *)
let string expected =
  let len = String.length expected in
  { run = fun input pos ->
    if pos + len > String.length input then
      err (Printf.sprintf "期望 \"%s\" 但输入不足" expected)
    else
      let actual = String.sub input pos len in
      if actual = expected then ok expected (pos + len)
      else err (Printf.sprintf "期望 \"%s\" 但得到 \"%s\"" expected actual)
  }

(* 示例 *)
let _ = parse (string "hello") "hello world"   (* Ok "hello" *)
let _ = parse (string "hello") "world"          (* Error ... *)

💡 提示satisfy 是最灵活的基本解析器,几乎所有字符匹配都可以通过它实现。

组合子

Functor 与 Applicative

(* map:转换解析结果 *)
let map f p =
  { run = fun input pos ->
    match p.run input pos with
    | Ok (value, pos') -> ok (f value) pos'
    | Error msg -> err msg
  }

(* pure:将值包装为解析器 *)
let pure value =
  { run = fun pos -> ok value pos }

(* apply:函数应用 *)
let apply fp xp =
  { run = fun input pos ->
    match fp.run input pos with
    | Ok (f, pos') ->
      (match xp.run input pos' with
       | Ok (x, pos'') -> ok (f x) pos''
       | Error msg -> err msg)
    | Error msg -> err msg
  }

(* 语法糖 *)
let ( <*> ) = apply
let ( <$> ) = map

Choice(选择)

(* 尝试第一个解析器,失败则尝试第二个 *)
let (<|>) p1 p2 =
  { run = fun input pos ->
    match p1.run input pos with
    | Ok _ as result -> result
    | Error _ -> p2.run input pos
  }

(* 从列表中选择 *)
let choice parsers =
  List.fold_left (<|>) (List.hd parsers) (List.tl parsers)

(* 示例:匹配字母或数字 *)
let alphanum = alpha <|> digit

⚠️ 注意<|> 会回溯到原始位置尝试下一个解析器。对于前缀相同的选项,可能需要 try_ 组合子来避免歧义。

Sequence(序列)

(* 顺序执行两个解析器,忽略第一个结果 *)
let (>>) p1 p2 =
  { run = fun input pos ->
    match p1.run input pos with
    | Ok (_, pos') -> p2.run input pos'
    | Error msg -> err msg
  }

(* 顺序执行两个解析器,忽略第二个结果 *)
let (<<) p1 p2 =
  { run = fun input pos ->
    match p1.run input pos with
    | Ok (v, pos') ->
      (match p2.run input pos' with
       | Ok (_, pos'') -> ok v pos''
       | Error msg -> err msg)
    | Error msg -> err msg
  }

(* 组合两个解析器的结果 *)
let ( .>>. ) p1 p2 =
  { run = fun input pos ->
    match p1.run input pos with
    | Ok (v1, pos') ->
      (match p2.run input pos' with
       | Ok (v2, pos'') -> ok (v1, v2) pos''
       | Error msg -> err msg)
    | Error msg -> err msg
  }

Many(重复)

(* 零次或多次 *)
let many p =
  { run = fun input pos ->
    let rec loop pos acc =
      match p.run input pos with
      | Ok (v, pos') -> loop pos' (v :: acc)
      | Error _ -> ok (List.rev acc) pos
    in
    loop pos []
  }

(* 一次或多次 *)
let many1 p =
  { run = fun input pos ->
    let rec loop pos acc =
      match p.run input pos with
      | Ok (v, pos') -> loop pos' (v :: acc)
      | Error _ ->
        if acc = [] then err "期望至少一个匹配"
        else ok (List.rev acc) pos
    in
    loop pos []
  }

(* 示例:解析多个空格 *)
let spaces = many whitespace
let spaces1 = many1 whitespace

Sep_by(分隔符)

(* 使用分隔符分隔的列表 *)
let sep_by p sep =
  { run = fun input pos ->
    match p.run input pos with
    | Error _ -> ok [] pos
    | Ok (first, pos') ->
      let rec loop pos acc =
        match sep.run input pos with
        | Error _ -> ok (List.rev acc) pos
        | Ok (_, pos') ->
          (match p.run input pos' with
           | Ok (v, pos'') -> loop pos'' (v :: acc)
           | Error _ -> ok (List.rev acc) pos)
      in
      loop pos' [first]
  }

(* 使用分隔符分隔的列表(至少一个) *)
let sep_by1 p sep =
  { run = fun input pos ->
    match p.run input pos with
    | Error msg -> err msg
    | Ok (first, pos') ->
      let rec loop pos acc =
        match sep.run input pos with
        | Error _ -> ok (List.rev acc) pos
        | Ok (_, pos') ->
          (match p.run input pos' with
           | Ok (v, pos'') -> loop pos'' (v :: acc)
           | Error _ -> ok (List.rev acc) pos)
      in
      loop pos' [first]
  }

(* 示例:逗号分隔的数字 *)
let comma = char ',' >> spaces
let number_list = sep_by (many1 digit) comma

递归解析

(* 延迟求值解决递归定义 *)
let lazy_parser f =
  { run = fun input pos -> (f ()).run input pos }

(* 示例:递归解析括号表达式 *)
let rec expr () =
  (number .>>. lazy_parser paren_expr)
  |> map (fun (n, e) -> `Paren (n, e))

and paren_expr () =
  (char '(' >> spaces >> lazy_parser expr << spaces << char ')')
  |> map (fun e -> e)

and number () =
  many1 digit
  |> map (fun ds -> `Num (int_of_string (String.concat "" (List.map (String.make 1) ds))))

(* 简化的递归表达式解析 *)
let expr_parser = lazy_parser expr

💡 提示:递归解析是解析器组合子的核心优势之一,使得解析嵌套结构(如 JSON、XML)变得非常自然。

AST 构建

(* 定义抽象语法树 *)
type expr =
  | Num of float
  | Add of expr * expr
  | Mul of expr * expr
  | Sub of expr * expr
  | Div of expr * expr

type stmt =
  | Let of string * expr
  | Print of expr
  | Block of stmt list

(* 将解析结果转换为 AST *)
let make_num ds =
  Num (float_of_string (String.concat "" (List.map (String.make 1) ds)))

let make_binop op left right =
  match op with
  | '+' -> Add (left, right)
  | '*' -> Mul (left, right)
  | '-' -> Sub (left, right)
  | '/' -> Div (left, right)
  | _ -> failwith "未知运算符"

(* 运算符解析器 *)
let add_op = char '+' >> spaces >> return make_add
let mul_op = char '*' >> spaces >> return make_mul
AST 节点说明示例
Num数字字面量42
Add加法1 + 2
Mul乘法3 * 4
Sub减法5 - 1
Div除法10 / 2

错误报告

(* 带位置信息的错误 *)
type 'a positioned = {
  value: 'a;
  line: int;
  col: int;
}

type parse_error = {
  message: string;
  line: int;
  col: int;
  context: string option;
}

(* 计算行号和列号 *)
let compute_position input pos =
  let line = ref 1 in
  let col = ref 1 in
  for i = 0 to pos - 1 do
    if input.[i] = '\n' then begin
      incr line;
      col := 1
    end else
      incr col
  done;
  (!line, !col)

(* 带位置的错误报告 *)
let with_position p =
  { run = fun input pos ->
    let (line, col) = compute_position input pos in
    match p.run input pos with
    | Ok (value, pos') ->
      let (line', col') = compute_position input pos' in
      ok { value; line = line'; col = col' } pos'
    | Error msg ->
      let error = {
        message = msg;
        line;
        col;
        context = Some (Printf.sprintf "在第 %d 行,第 %d 列" line col)
      } in
      Error (Printf.sprintf "%s: %s" 
        (Option.value error.context ~default:"")
        error.message)
  }

(* 期望错误消息 *)
let expected expected_msg p =
  { run = fun input pos ->
    match p.run input pos with
    | Ok _ as result -> result
    | Error _ -> err (Printf.sprintf "期望 %s" expected_msg)
  }

⚠️ 注意:良好的错误消息对于用户体验至关重要。始终包含位置信息和期望内容。

解析器优化

(* 记忆化:避免重复解析 *)
let memoize p =
  let cache = Hashtbl.create 64 in
  { run = fun input pos ->
    match Hashtbl.find_opt cache (input, pos) with
    | Some result -> result
    | None ->
      let result = p.run input pos in
      Hashtbl.add cache (input, pos) result;
      result
  }

(* 避免不必要的回溯:使用 try_ *)
let try_ p =
  { run = fun input pos ->
    match p.run input pos with
    | Ok _ as result -> result
    | Error _ -> err "" (* 重置错误位置 *)
  }

(* 预测性解析:使用 lookahead *)
let lookahead p =
  { run = fun input pos ->
    match p.run input pos with
    | Ok (value, _) -> ok value pos  (* 不消耗输入 *)
    | Error msg -> err msg
  }

(* 性能对比 *)
let inefficient = many (char 'a' <|> char 'b')  (* 每次都尝试 'a' *)
let efficient = many (satisfy "letter" (fun c -> c = 'a' || c = 'b'))  (* 单次检查 *)
优化技术适用场景效果
记忆化递归解析、重复模式避免指数级复杂度
try_前缀歧义减少不必要回溯
lookahead前瞻检查避免消耗输入
字符类多选项单次匹配多个选项

Angstrom 库使用

Angstrom 是 OCaml 生产级的解析器组合子库。

(* 安装:opam install angstrom *)

open Angstrom

(* 基本解析器 *)
let p_number =
  take_while1 (function '0' .. '9' -> true | _ -> false)
  >>| int_of_string

let p_identifier =
  take_while1 (function
    | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' -> true
    | _ -> false)

let p_whitespace = skip_while (function ' ' | '\t' | '\n' | '\r' -> true | _ -> false)

(* JSON 解析器示例 *)
type json =
  | JObject of (string * json) list
  | JArray of json list
  | JString of string
  | JNumber of float
  | JBool of bool
  | JNull

let rec p_json () =
  p_whitespace *> 
  (choice [
    p_object;
    p_array;
    p_string >>| (fun s -> JString s);
    p_number_float >>| (fun f -> JNumber f);
    string "true" *> return (JBool true);
    string "false" *> return (JBool false);
    string "null" *> return JNull;
  ])

and p_object =
  char '{' *> p_whitespace *>
  sep_by (p_key_value) (char ',' *> p_whitespace) <*
  p_whitespace <* char '}'
  >>| (fun kvs -> JObject kvs)

and p_key_value =
  p_string <* (p_whitespace *> char ':' *> p_whitespace) .>>. p_json ()

and p_array =
  char '[' *> p_whitespace *>
  sep_by (p_json ()) (char ',' *> p_whitespace) <*
  p_whitespace <* char ']'
  >>| (fun items -> JArray items)

and p_string =
  char '"' *> take_while (function '"' -> false | _ -> true) <* char '"'

and p_number_float =
  take_while1 (function '0' .. '9' | '.' -> true | _ -> false)
  >>| float_of_string

(* 运行解析 *)
let parse_json input =
  parse_string ~consume:All (p_json ()) input

let _ = parse_json {|{"name": "John", "age": 30, "scores": [90, 85, 95]}|}

💡 提示:Angstrom 使用 >>= (bind) 和 >>| (map) 操作符,比自定义库更高效且经过充分测试。

实际案例

数学表达式解析器

open Angstrom

type expr =
  | Num of float
  | Add of expr * expr
  | Sub of expr * expr
  | Mul of expr * expr
  | Div of expr * expr
  | Neg of expr

let ws = skip_while (function ' ' | '\t' -> true | _ -> false)

let p_number =
  take_while1 (function '0' .. '9' | '.' -> true | _ -> false)
  >>| (fun s -> Num (float_of_string s))

let p_parens p = char '(' *> ws *> p <* ws <* char ')'

let rec p_expr () =
  let expr = p_expr () in
  ws *> (choice [p_term; p_parens expr]) <* ws

and p_term () =
  let factor = p_factor () in
  ws *> lift2 (fun left rest ->
    List.fold_left (fun acc (op, right) -> op acc right) left rest
  ) (p_product ()) (many (ws *> (choice [
    char '+' *> ws *> return (fun a b -> Add (a, b));
    char '-' *> ws *> return (fun a b -> Sub (a, b));
  ]) .>>. p_product ()))

and p_product () =
  ws *> lift2 (fun left rest ->
    List.fold_left (fun acc (op, right) -> op acc right) left rest
  ) (p_factor ()) (many (ws *> (choice [
    char '*' *> ws *> return (fun a b -> Mul (a, b));
    char '/' *> ws *> return (fun a b -> Div (a, b));
  ]) .>>. p_factor ()))

and p_factor () =
  ws *> (choice [
    char '-' *> ws *> p_factor () >>| (fun e -> Neg e);
    p_number;
    p_parens (p_expr ());
  ])

let parse_expr input =
  parse_string ~consume:All (p_expr ()) input

(* 测试 *)
let _ = parse_expr "1 + 2 * (3 - 4)"
(* Ok (Add (Num 1., Mul (Num 2., Sub (Num 3., Num 4.)))) *)

INI 文件解析器

type ini_value = string
type ini_section = (string * ini_value) list
type ini_file = ini_section * (string * ini_section) list  (* (全局, 各节) *)

let ws = skip_while (function ' ' | '\t' -> true | _ -> false)
let eol = char '\n' <|> (string "\r\n" *> return '\n')

let p_comment =
  char ';' *> skip_while (fun c -> c <> '\n') *> optional eol

let p_identifier =
  take_while1 (function
    | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' | '-' -> true
    | _ -> false)

let p_value =
  take_while (fun c -> c <> '\n' && c <> ';')
  >>| String.trim

let p_key_value =
  lift3 (fun key _ value -> (key, value))
    p_identifier
    (ws *> char '=' *> ws)
    p_value

let p_section_header =
  char '[' *> take_while (fun c -> c <> ']') <* char ']'

let p_section =
  lift2 (fun name kvs -> (name, kvs))
    (p_section_header <* optional eol)
    (many (ws *> p_key_value <* optional eol <* many p_comment))

let p_ini =
  lift2 (fun global sections -> (global, sections))
    (many (ws *> p_key_value <* optional eol))
    (many p_section)

let parse_ini input =
  parse_string ~consume:All p_ini input

(* 测试 *)
let test_ini = {|
; 配置文件
name = MyApp
version = 1.0

[database]
host = localhost
port = 5432

[server]
address = 0.0.0.0
port = 8080
|}

let _ = parse_ini test_ini

JSON 解析器(完整)

(* 使用 Angstrom 实现完整的 JSON 解析器 *)
open Angstrom

type json =
  | JObject of (string * json) list
  | JArray of json list
  | JString of string
  | JNumber of float
  | JBool of bool
  | JNull

let is_space = function ' ' | '\t' | '\n' | '\r' -> true | _ -> false
let ws = skip_while is_space

let is_ident_char = function
  | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' -> true
  | _ -> false

let p_string_literal =
  char '"' *> 
  (fix (fun _ ->
    let p_char = 
      satisfy (function '"' | '\\' -> false | _ -> true)
    in
    let p_escape = 
      char '\\' *> (choice [
        char '"' *> return '"';
        char '\\' *> return '\\';
        char '/' *> return '/';
        char 'n' *> return '\n';
        char 'r' *> return '\r';
        char 't' *> return '\t';
        char 'b' *> return '\b';
        char 'f' *> return '\f';
      ])
    in
    many (p_char <|> p_escape) >>| (fun cs ->
      let buf = Buffer.create 16 in
      List.iter (Buffer.add_char buf) cs;
      Buffer.contents buf)
  )) <* char '"'

let p_number =
  take_while1 (function
    | '0' .. '9' | '.' | '-' | 'e' | 'E' | '+' -> true
    | _ -> false)
  >>| float_of_string

let rec p_json () =
  ws *> (choice [
    p_object;
    p_array;
    p_string_literal >>| (fun s -> JString s);
    p_number >>| (fun f -> JNumber f);
    string "true" *> return (JBool true);
    string "false" *> return (JBool false);
    string "null" *> return JNull;
  ])

and p_key_value =
  ws *> p_string_literal <* ws <* char ':' <* ws .>>. p_json ()

and p_object =
  char '{' *> ws *>
  sep_by p_key_value (char ',' *> ws) <*
  ws <* char '}'
  >>| (fun kvs -> JObject kvs)

and p_array =
  char '[' *> ws *>
  sep_by (p_json ()) (char ',' *> ws) <*
  ws <* char ']'
  >>| (fun items -> JArray items)

let parse_json input =
  parse_string ~consume:All (p_json ()) input

(* 测试复杂 JSON *)
let test_json = {|{
  "name": "Alice",
  "age": 30,
  "hobbies": ["reading", "coding"],
  "address": {
    "city": "Beijing",
    "zip": "100000"
  },
  "active": true,
  "score": 95.5,
  "notes": null
}|}

let _ = parse_json test_json

⚠️ 注意:JSON 字符串中的转义字符(如 \n, \")必须正确处理,否则会导致解析失败或数据损坏。

扩展阅读


下一节编写一个解释器 — 学习如何基于解析器构建完整的解释器