OCaml 教程 / 解析器组合子
解析器组合子 (Parser Combinators)
解析器组合子是一种函数式编程技术,通过将小型解析器组合成更复杂的解析器来构建完整的解析器。OCaml 生态系统中有多个优秀的解析器组合子库。
解析器类型定义
核心思想是将解析器定义为一个函数:接受输入流,返回解析结果和剩余输入。
(* 基本解析器类型 *)
type 'a parser = {
run : string -> int -> ('a * int, string) result
}
(* 解析成功 *)
let ok value pos = Ok (value, pos)
(* 解析失败 *)
let err msg = Error msg
(* 运行解析器 *)
let parse p input =
match p.run input 0 with
| Ok (value, pos) ->
if pos = String.length input then Ok value
else Error (Printf.sprintf "未消耗完输入,剩余 %d 字符" (String.length input - pos))
| Error msg -> Error msg
| 组件 | 说明 |
|---|---|
parser 类型 | 核心类型,封装解析逻辑 |
run 函数 | 接受输入字符串和当前位置,返回结果 |
result 类型 | 成功返回值和新位置,失败返回错误消息 |
基本解析器
字符解析器
(* 匹配特定字符 *)
let char expected =
{ run = fun input pos ->
if pos >= String.length input then
err (Printf.sprintf "期望 '%c' 但到达输入末尾" expected)
else if input.[pos] = expected then
ok expected (pos + 1)
else
err (Printf.sprintf "期望 '%c' 但得到 '%c'" expected input.[pos])
}
(* 匹配满足条件的字符 *)
let satisfy name predicate =
{ run = fun input pos ->
if pos >= String.length input then
err (Printf.sprintf "期望 %s 但到达输入末尾" name)
else if predicate input.[pos] then
ok input.[pos] (pos + 1)
else
err (Printf.sprintf "期望 %s 但得到 '%c'" name input.[pos])
}
(* 预定义字符类 *)
let digit = satisfy "digit" (fun c -> c >= '0' && c <= '9')
let alpha = satisfy "alpha" (fun c -> (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
let whitespace = satisfy "whitespace" (fun c -> c = ' ' || c = '\t' || c = '\n' || c = '\r')
字符串解析器
(* 匹配特定字符串 *)
let string expected =
let len = String.length expected in
{ run = fun input pos ->
if pos + len > String.length input then
err (Printf.sprintf "期望 \"%s\" 但输入不足" expected)
else
let actual = String.sub input pos len in
if actual = expected then ok expected (pos + len)
else err (Printf.sprintf "期望 \"%s\" 但得到 \"%s\"" expected actual)
}
(* 示例 *)
let _ = parse (string "hello") "hello world" (* Ok "hello" *)
let _ = parse (string "hello") "world" (* Error ... *)
💡 提示:satisfy 是最灵活的基本解析器,几乎所有字符匹配都可以通过它实现。
组合子
Functor 与 Applicative
(* map:转换解析结果 *)
let map f p =
{ run = fun input pos ->
match p.run input pos with
| Ok (value, pos') -> ok (f value) pos'
| Error msg -> err msg
}
(* pure:将值包装为解析器 *)
let pure value =
{ run = fun pos -> ok value pos }
(* apply:函数应用 *)
let apply fp xp =
{ run = fun input pos ->
match fp.run input pos with
| Ok (f, pos') ->
(match xp.run input pos' with
| Ok (x, pos'') -> ok (f x) pos''
| Error msg -> err msg)
| Error msg -> err msg
}
(* 语法糖 *)
let ( <*> ) = apply
let ( <$> ) = map
Choice(选择)
(* 尝试第一个解析器,失败则尝试第二个 *)
let (<|>) p1 p2 =
{ run = fun input pos ->
match p1.run input pos with
| Ok _ as result -> result
| Error _ -> p2.run input pos
}
(* 从列表中选择 *)
let choice parsers =
List.fold_left (<|>) (List.hd parsers) (List.tl parsers)
(* 示例:匹配字母或数字 *)
let alphanum = alpha <|> digit
⚠️ 注意:<|> 会回溯到原始位置尝试下一个解析器。对于前缀相同的选项,可能需要 try_ 组合子来避免歧义。
Sequence(序列)
(* 顺序执行两个解析器,忽略第一个结果 *)
let (>>) p1 p2 =
{ run = fun input pos ->
match p1.run input pos with
| Ok (_, pos') -> p2.run input pos'
| Error msg -> err msg
}
(* 顺序执行两个解析器,忽略第二个结果 *)
let (<<) p1 p2 =
{ run = fun input pos ->
match p1.run input pos with
| Ok (v, pos') ->
(match p2.run input pos' with
| Ok (_, pos'') -> ok v pos''
| Error msg -> err msg)
| Error msg -> err msg
}
(* 组合两个解析器的结果 *)
let ( .>>. ) p1 p2 =
{ run = fun input pos ->
match p1.run input pos with
| Ok (v1, pos') ->
(match p2.run input pos' with
| Ok (v2, pos'') -> ok (v1, v2) pos''
| Error msg -> err msg)
| Error msg -> err msg
}
Many(重复)
(* 零次或多次 *)
let many p =
{ run = fun input pos ->
let rec loop pos acc =
match p.run input pos with
| Ok (v, pos') -> loop pos' (v :: acc)
| Error _ -> ok (List.rev acc) pos
in
loop pos []
}
(* 一次或多次 *)
let many1 p =
{ run = fun input pos ->
let rec loop pos acc =
match p.run input pos with
| Ok (v, pos') -> loop pos' (v :: acc)
| Error _ ->
if acc = [] then err "期望至少一个匹配"
else ok (List.rev acc) pos
in
loop pos []
}
(* 示例:解析多个空格 *)
let spaces = many whitespace
let spaces1 = many1 whitespace
Sep_by(分隔符)
(* 使用分隔符分隔的列表 *)
let sep_by p sep =
{ run = fun input pos ->
match p.run input pos with
| Error _ -> ok [] pos
| Ok (first, pos') ->
let rec loop pos acc =
match sep.run input pos with
| Error _ -> ok (List.rev acc) pos
| Ok (_, pos') ->
(match p.run input pos' with
| Ok (v, pos'') -> loop pos'' (v :: acc)
| Error _ -> ok (List.rev acc) pos)
in
loop pos' [first]
}
(* 使用分隔符分隔的列表(至少一个) *)
let sep_by1 p sep =
{ run = fun input pos ->
match p.run input pos with
| Error msg -> err msg
| Ok (first, pos') ->
let rec loop pos acc =
match sep.run input pos with
| Error _ -> ok (List.rev acc) pos
| Ok (_, pos') ->
(match p.run input pos' with
| Ok (v, pos'') -> loop pos'' (v :: acc)
| Error _ -> ok (List.rev acc) pos)
in
loop pos' [first]
}
(* 示例:逗号分隔的数字 *)
let comma = char ',' >> spaces
let number_list = sep_by (many1 digit) comma
递归解析
(* 延迟求值解决递归定义 *)
let lazy_parser f =
{ run = fun input pos -> (f ()).run input pos }
(* 示例:递归解析括号表达式 *)
let rec expr () =
(number .>>. lazy_parser paren_expr)
|> map (fun (n, e) -> `Paren (n, e))
and paren_expr () =
(char '(' >> spaces >> lazy_parser expr << spaces << char ')')
|> map (fun e -> e)
and number () =
many1 digit
|> map (fun ds -> `Num (int_of_string (String.concat "" (List.map (String.make 1) ds))))
(* 简化的递归表达式解析 *)
let expr_parser = lazy_parser expr
💡 提示:递归解析是解析器组合子的核心优势之一,使得解析嵌套结构(如 JSON、XML)变得非常自然。
AST 构建
(* 定义抽象语法树 *)
type expr =
| Num of float
| Add of expr * expr
| Mul of expr * expr
| Sub of expr * expr
| Div of expr * expr
type stmt =
| Let of string * expr
| Print of expr
| Block of stmt list
(* 将解析结果转换为 AST *)
let make_num ds =
Num (float_of_string (String.concat "" (List.map (String.make 1) ds)))
let make_binop op left right =
match op with
| '+' -> Add (left, right)
| '*' -> Mul (left, right)
| '-' -> Sub (left, right)
| '/' -> Div (left, right)
| _ -> failwith "未知运算符"
(* 运算符解析器 *)
let add_op = char '+' >> spaces >> return make_add
let mul_op = char '*' >> spaces >> return make_mul
| AST 节点 | 说明 | 示例 |
|---|---|---|
Num | 数字字面量 | 42 |
Add | 加法 | 1 + 2 |
Mul | 乘法 | 3 * 4 |
Sub | 减法 | 5 - 1 |
Div | 除法 | 10 / 2 |
错误报告
(* 带位置信息的错误 *)
type 'a positioned = {
value: 'a;
line: int;
col: int;
}
type parse_error = {
message: string;
line: int;
col: int;
context: string option;
}
(* 计算行号和列号 *)
let compute_position input pos =
let line = ref 1 in
let col = ref 1 in
for i = 0 to pos - 1 do
if input.[i] = '\n' then begin
incr line;
col := 1
end else
incr col
done;
(!line, !col)
(* 带位置的错误报告 *)
let with_position p =
{ run = fun input pos ->
let (line, col) = compute_position input pos in
match p.run input pos with
| Ok (value, pos') ->
let (line', col') = compute_position input pos' in
ok { value; line = line'; col = col' } pos'
| Error msg ->
let error = {
message = msg;
line;
col;
context = Some (Printf.sprintf "在第 %d 行,第 %d 列" line col)
} in
Error (Printf.sprintf "%s: %s"
(Option.value error.context ~default:"")
error.message)
}
(* 期望错误消息 *)
let expected expected_msg p =
{ run = fun input pos ->
match p.run input pos with
| Ok _ as result -> result
| Error _ -> err (Printf.sprintf "期望 %s" expected_msg)
}
⚠️ 注意:良好的错误消息对于用户体验至关重要。始终包含位置信息和期望内容。
解析器优化
(* 记忆化:避免重复解析 *)
let memoize p =
let cache = Hashtbl.create 64 in
{ run = fun input pos ->
match Hashtbl.find_opt cache (input, pos) with
| Some result -> result
| None ->
let result = p.run input pos in
Hashtbl.add cache (input, pos) result;
result
}
(* 避免不必要的回溯:使用 try_ *)
let try_ p =
{ run = fun input pos ->
match p.run input pos with
| Ok _ as result -> result
| Error _ -> err "" (* 重置错误位置 *)
}
(* 预测性解析:使用 lookahead *)
let lookahead p =
{ run = fun input pos ->
match p.run input pos with
| Ok (value, _) -> ok value pos (* 不消耗输入 *)
| Error msg -> err msg
}
(* 性能对比 *)
let inefficient = many (char 'a' <|> char 'b') (* 每次都尝试 'a' *)
let efficient = many (satisfy "letter" (fun c -> c = 'a' || c = 'b')) (* 单次检查 *)
| 优化技术 | 适用场景 | 效果 |
|---|---|---|
| 记忆化 | 递归解析、重复模式 | 避免指数级复杂度 |
try_ | 前缀歧义 | 减少不必要回溯 |
lookahead | 前瞻检查 | 避免消耗输入 |
| 字符类 | 多选项 | 单次匹配多个选项 |
Angstrom 库使用
Angstrom 是 OCaml 生产级的解析器组合子库。
(* 安装:opam install angstrom *)
open Angstrom
(* 基本解析器 *)
let p_number =
take_while1 (function '0' .. '9' -> true | _ -> false)
>>| int_of_string
let p_identifier =
take_while1 (function
| 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' -> true
| _ -> false)
let p_whitespace = skip_while (function ' ' | '\t' | '\n' | '\r' -> true | _ -> false)
(* JSON 解析器示例 *)
type json =
| JObject of (string * json) list
| JArray of json list
| JString of string
| JNumber of float
| JBool of bool
| JNull
let rec p_json () =
p_whitespace *>
(choice [
p_object;
p_array;
p_string >>| (fun s -> JString s);
p_number_float >>| (fun f -> JNumber f);
string "true" *> return (JBool true);
string "false" *> return (JBool false);
string "null" *> return JNull;
])
and p_object =
char '{' *> p_whitespace *>
sep_by (p_key_value) (char ',' *> p_whitespace) <*
p_whitespace <* char '}'
>>| (fun kvs -> JObject kvs)
and p_key_value =
p_string <* (p_whitespace *> char ':' *> p_whitespace) .>>. p_json ()
and p_array =
char '[' *> p_whitespace *>
sep_by (p_json ()) (char ',' *> p_whitespace) <*
p_whitespace <* char ']'
>>| (fun items -> JArray items)
and p_string =
char '"' *> take_while (function '"' -> false | _ -> true) <* char '"'
and p_number_float =
take_while1 (function '0' .. '9' | '.' -> true | _ -> false)
>>| float_of_string
(* 运行解析 *)
let parse_json input =
parse_string ~consume:All (p_json ()) input
let _ = parse_json {|{"name": "John", "age": 30, "scores": [90, 85, 95]}|}
💡 提示:Angstrom 使用 >>= (bind) 和 >>| (map) 操作符,比自定义库更高效且经过充分测试。
实际案例
数学表达式解析器
open Angstrom
type expr =
| Num of float
| Add of expr * expr
| Sub of expr * expr
| Mul of expr * expr
| Div of expr * expr
| Neg of expr
let ws = skip_while (function ' ' | '\t' -> true | _ -> false)
let p_number =
take_while1 (function '0' .. '9' | '.' -> true | _ -> false)
>>| (fun s -> Num (float_of_string s))
let p_parens p = char '(' *> ws *> p <* ws <* char ')'
let rec p_expr () =
let expr = p_expr () in
ws *> (choice [p_term; p_parens expr]) <* ws
and p_term () =
let factor = p_factor () in
ws *> lift2 (fun left rest ->
List.fold_left (fun acc (op, right) -> op acc right) left rest
) (p_product ()) (many (ws *> (choice [
char '+' *> ws *> return (fun a b -> Add (a, b));
char '-' *> ws *> return (fun a b -> Sub (a, b));
]) .>>. p_product ()))
and p_product () =
ws *> lift2 (fun left rest ->
List.fold_left (fun acc (op, right) -> op acc right) left rest
) (p_factor ()) (many (ws *> (choice [
char '*' *> ws *> return (fun a b -> Mul (a, b));
char '/' *> ws *> return (fun a b -> Div (a, b));
]) .>>. p_factor ()))
and p_factor () =
ws *> (choice [
char '-' *> ws *> p_factor () >>| (fun e -> Neg e);
p_number;
p_parens (p_expr ());
])
let parse_expr input =
parse_string ~consume:All (p_expr ()) input
(* 测试 *)
let _ = parse_expr "1 + 2 * (3 - 4)"
(* Ok (Add (Num 1., Mul (Num 2., Sub (Num 3., Num 4.)))) *)
INI 文件解析器
type ini_value = string
type ini_section = (string * ini_value) list
type ini_file = ini_section * (string * ini_section) list (* (全局, 各节) *)
let ws = skip_while (function ' ' | '\t' -> true | _ -> false)
let eol = char '\n' <|> (string "\r\n" *> return '\n')
let p_comment =
char ';' *> skip_while (fun c -> c <> '\n') *> optional eol
let p_identifier =
take_while1 (function
| 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' | '-' -> true
| _ -> false)
let p_value =
take_while (fun c -> c <> '\n' && c <> ';')
>>| String.trim
let p_key_value =
lift3 (fun key _ value -> (key, value))
p_identifier
(ws *> char '=' *> ws)
p_value
let p_section_header =
char '[' *> take_while (fun c -> c <> ']') <* char ']'
let p_section =
lift2 (fun name kvs -> (name, kvs))
(p_section_header <* optional eol)
(many (ws *> p_key_value <* optional eol <* many p_comment))
let p_ini =
lift2 (fun global sections -> (global, sections))
(many (ws *> p_key_value <* optional eol))
(many p_section)
let parse_ini input =
parse_string ~consume:All p_ini input
(* 测试 *)
let test_ini = {|
; 配置文件
name = MyApp
version = 1.0
[database]
host = localhost
port = 5432
[server]
address = 0.0.0.0
port = 8080
|}
let _ = parse_ini test_ini
JSON 解析器(完整)
(* 使用 Angstrom 实现完整的 JSON 解析器 *)
open Angstrom
type json =
| JObject of (string * json) list
| JArray of json list
| JString of string
| JNumber of float
| JBool of bool
| JNull
let is_space = function ' ' | '\t' | '\n' | '\r' -> true | _ -> false
let ws = skip_while is_space
let is_ident_char = function
| 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' -> true
| _ -> false
let p_string_literal =
char '"' *>
(fix (fun _ ->
let p_char =
satisfy (function '"' | '\\' -> false | _ -> true)
in
let p_escape =
char '\\' *> (choice [
char '"' *> return '"';
char '\\' *> return '\\';
char '/' *> return '/';
char 'n' *> return '\n';
char 'r' *> return '\r';
char 't' *> return '\t';
char 'b' *> return '\b';
char 'f' *> return '\f';
])
in
many (p_char <|> p_escape) >>| (fun cs ->
let buf = Buffer.create 16 in
List.iter (Buffer.add_char buf) cs;
Buffer.contents buf)
)) <* char '"'
let p_number =
take_while1 (function
| '0' .. '9' | '.' | '-' | 'e' | 'E' | '+' -> true
| _ -> false)
>>| float_of_string
let rec p_json () =
ws *> (choice [
p_object;
p_array;
p_string_literal >>| (fun s -> JString s);
p_number >>| (fun f -> JNumber f);
string "true" *> return (JBool true);
string "false" *> return (JBool false);
string "null" *> return JNull;
])
and p_key_value =
ws *> p_string_literal <* ws <* char ':' <* ws .>>. p_json ()
and p_object =
char '{' *> ws *>
sep_by p_key_value (char ',' *> ws) <*
ws <* char '}'
>>| (fun kvs -> JObject kvs)
and p_array =
char '[' *> ws *>
sep_by (p_json ()) (char ',' *> ws) <*
ws <* char ']'
>>| (fun items -> JArray items)
let parse_json input =
parse_string ~consume:All (p_json ()) input
(* 测试复杂 JSON *)
let test_json = {|{
"name": "Alice",
"age": 30,
"hobbies": ["reading", "coding"],
"address": {
"city": "Beijing",
"zip": "100000"
},
"active": true,
"score": 95.5,
"notes": null
}|}
let _ = parse_json test_json
⚠️ 注意:JSON 字符串中的转义字符(如 \n, \")必须正确处理,否则会导致解析失败或数据损坏。
扩展阅读
下一节:编写一个解释器 — 学习如何基于解析器构建完整的解释器