OCaml 教程 / 字符串与字符处理
字符串与字符处理
概述
OCaml 的字符串是字节序列(byte sequence),默认不可变(从 4.06 开始)。char 类型表示单个字节(Latin-1 编码),不直接支持 Unicode。处理中文等多字节字符需要特殊库(如 Uutf)。
String 模块
创建字符串
(* String literals: plain, empty, and with embedded newline escapes. *)
let s = "Hello, OCaml!"
let empty = ""
let multiline = "line1\nline2\nline3"

(* [String.length] counts bytes, not user-perceived characters. *)
let len = "Hello" |> String.length (* => 5 *)
let cn_len = "你好" |> String.length (* => 6: each character is 3 bytes in UTF-8 *)

(* One character repeated a fixed number of times. *)
let dashes = String.make 20 '-' (* "--------------------" *)

(* Build a string from a list of characters by way of a sequence. *)
let from_chars = [ 'O'; 'C'; 'a'; 'm'; 'l' ] |> List.to_seq |> String.of_seq
(* => "OCaml" *)
字符串索引与切片
let s = "Hello, World!"

(* Reading one byte yields a [char]; [s.[i]] and [String.get s i] are the
   same operation. *)
let first = s.[0] (* => 'H' *)
let first' = String.get s 0 (* => 'H' *)

(* [String.sub s pos len] copies [len] bytes starting at byte offset [pos]. *)
let sub = String.sub s 0 5 (* => "Hello" *)
let world = String.sub s 7 5 (* => "World" *)

(* Byte offset of a character, searching from the left or from the right. *)
let pos = String.index s 'W' (* => 7, first occurrence *)
let rpos = String.rindex s 'l' (* => 10, last occurrence *)

(* The [_opt] variants return an option instead of failing when the
   character is absent. *)
let opt_pos = String.index_opt s 'z' (* => None *)
let opt_pos' = String.index_opt s 'W' (* => Some 7 *)

(* Membership test for a single character. *)
let has_o = String.contains s 'o' (* => true *)

(* Prefix and suffix tests. *)
let starts_hello = s |> String.starts_with ~prefix:"Hello" (* => true *)
let ends_bang = s |> String.ends_with ~suffix:"!" (* => true *)
⚠️ 注意:String.sub s pos len 中 pos 和 len 都不能为负,且 pos + len 不能超过 String.length s,否则会抛出 Invalid_argument 异常。
字符串变换
let s = " Hello, World! "

(* Strip leading and trailing whitespace. *)
let trimmed = s |> String.trim (* => "Hello, World!" *)

(* ASCII-only case conversion. *)
let upper = "hello" |> String.uppercase_ascii (* => "HELLO" *)
let lower = "HELLO" |> String.lowercase_ascii (* => "hello" *)
let cap = "hello" |> String.capitalize_ascii (* => "Hello" *)

(* Map every character through a function: lowercase vowels become '-'. *)
let no_vowels =
  let mask_vowel = function
    | 'a' | 'e' | 'i' | 'o' | 'u' -> '-'
    | other -> other
  in
  String.map mask_vowel "Hello, World!"
(* => "H-ll-, W-rld!" *)
字符串分割与连接
(* Split on a single separator character. *)
let words = "hello world ocaml" |> String.split_on_char ' '
(* => ["hello"; "world"; "ocaml"] *)
let csv = "a,b,c,d" |> String.split_on_char ','
(* => ["a"; "b"; "c"; "d"] *)

(* Join a list of strings with a separator. *)
let joined = [ "Alice"; "Bob"; "Charlie" ] |> String.concat ", "
(* => "Alice, Bob, Charlie" *)

(* The leading empty element produces the leading separator. *)
let path = [ ""; "home"; "user"; "file.txt" ] |> String.concat "/"
(* => "/home/user/file.txt" *)
(* Visit every character of a string in order. *)
let () =
  let print_char ch = Printf.printf "%c " ch in
  String.iter print_char "Hello"
(* prints: H e l l o *)

(* Same traversal, with the byte index passed alongside each character. *)
let () =
  let print_indexed idx ch = Printf.printf "[%d]=%c " idx ch in
  String.iteri print_indexed "ABC"
(* prints: [0]=A [1]=B [2]=C *)
Bytes 模块(可变字符串)
Bytes 提供了可变的字节序列:
(* A mutable byte sequence initialised from a string. *)
let b = "Hello" |> Bytes.of_string

(* In-place update of one byte. *)
let () = Bytes.set b 0 'h' (* b now holds "hello" *)

(* Read one byte back. *)
let c = Bytes.get b 0 (* => 'h' *)

(* Snapshot the current contents as an immutable string. *)
let s = b |> Bytes.to_string (* => "hello" *)

(* Independent copy of the whole buffer. *)
let b2 = b |> Bytes.copy

(* A fresh byte sequence holding the first three bytes. *)
let sub = Bytes.sub b 0 3 (* => "hel" *)
(* "Modifying" an immutable string means building a new one.
   [replace_char s old_c new_c] returns a copy of [s] in which every
   occurrence of [old_c] has been replaced by [new_c]. *)
let replace_char s old_c new_c =
  let out = Bytes.of_string s in
  (* Scan the source string and patch the matching positions of the copy. *)
  String.iteri (fun idx ch -> if ch = old_c then Bytes.set out idx new_c) s;
  Bytes.to_string out

let _ = replace_char "hello" 'l' 'r' (* => "herro" *)
⚠️ 注意:Bytes.of_string 和 Bytes.to_string 都会复制数据,修改得到的 bytes 不会影响原始 string。只有 Bytes.unsafe_of_string / Bytes.unsafe_to_string 不复制、与原数据共享内存;除非能保证之后不再修改,否则不要使用这两个 unsafe 版本。
💡 提示:String 是不可变的,Bytes 是可变的。从 4.06 开始,标准库明确区分两者。
Char 模块
(* Conversion between a character and its byte code. *)
let _ : int = Char.code 'A' (* => 65 *)
let _ : char = Char.chr 97 (* => 'a' *)

(* ASCII-only case conversion. *)
let _ : char = Char.uppercase_ascii 'a' (* => 'A' *)
let _ : char = Char.lowercase_ascii 'A' (* => 'a' *)
(* Hand-written ASCII character classes, expressed as character-range
   patterns. *)
let is_digit = function
  | '0' .. '9' -> true
  | _ -> false

let is_alpha = function
  | 'a' .. 'z' | 'A' .. 'Z' -> true
  | _ -> false

let is_alphanum c = is_digit c || is_alpha c

let is_whitespace = function
  | ' ' | '\t' | '\n' | '\r' -> true
  | _ -> false

let _ = is_digit '5' (* => true *)
let _ = is_alpha 'z' (* => true *)
let _ = is_whitespace ' ' (* => true *)
(* A one-character string holding the given character. *)
let char_to_string = String.make 1
字符串格式化:Printf.sprintf
(* Plain substitution. *)
let msg = "World" |> Printf.sprintf "Hello, %s!"

(* Numeric formatting: zero-padded hex, zero-padded decimal, fixed
   precision. *)
let hex = 255 |> Printf.sprintf "0x%08X" (* => "0x000000FF" *)
let padded = 42 |> Printf.sprintf "%05d" (* => "00042" *)
let float_f = 3.14159 |> Printf.sprintf "%.2f" (* => "3.14" *)

(* Field width: a '-' flag pads on the right, the default pads on the
   left. *)
let left = "hello" |> Printf.sprintf "%-10s|" (* => "hello     |" *)
let right = "hello" |> Printf.sprintf "%10s|" (* => "     hello|" *)

(* Formatting straight into a [Buffer.t]. *)
let buf = Buffer.create 64
let () = Printf.bprintf buf "Name: %s, Age: %d" "Alice" 30
let result = buf |> Buffer.contents
格式说明符速查
| 说明符 | 类型 | 示例 |
|---|---|---|
| %d | int | "%d" 42 => "42" |
| %f | float | "%.2f" 3.14 => "3.14" |
| %s | string | "%s" "hi" => "hi" |
| %c | char | "%c" 'A' => "A" |
| %b | bool | "%b" true => "true" |
| %x / %X | int (hex) | "%x" 255 => "ff" |
| %o | int (oct) | "%o" 8 => "10" |
| %Ld | int64 | "%Ld" n |
| %! | flush | "done%!" |
正则表达式
标准库有基础的 Str 模块,推荐使用更现代的 Re 库:
opam install re
Str 模块(标准库)
(* Build: ocamlfind ocamlc -package str -linkpkg file.ml *)
(* Extracts the year, month and day from the first YYYY-MM-DD style date
   found in a string and prints them. Prints nothing when no date is found. *)
let () =
  let re = Str.regexp "\\([0-9]+\\)-\\([0-9]+\\)-\\([0-9]+\\)" in
  let s = "日期: 2024-01-15" in
  (* [Str.string_match] only succeeds at the exact byte offset it is given.
     The UTF-8 prefix "日期: " occupies 8 bytes (not 4), so a hard-coded
     character count misses the date. Searching forward avoids counting
     bytes by hand. *)
  match Str.search_forward re s 0 with
  | _start ->
    let year = Str.matched_group 1 s in
    let month = Str.matched_group 2 s in
    let day = Str.matched_group 3 s in
    Printf.printf "年=%s, 月=%s, 日=%s\n" year month day
  | exception Not_found -> ()
(* Replace every match of a pattern within a string. *)
let replaced =
  let foo_re = Str.regexp "foo" in
  "foo and foo" |> Str.global_replace foo_re "bar"
(* => "bar and bar" *)
Re 库(推荐)
(* 编译: ocamlfind ocamlopt -package re -linkpkg file.ml *)
open Re
(* Compiled pattern for a simple [user@label.label] address.

   The character classes are restricted to ASCII address characters. A
   "anything but '@'" / "anything but '.'" class would also swallow spaces
   and the bytes of surrounding non-ASCII text, so the groups would include
   the text around the address instead of just the address parts.

   Group 1 is the user part, group 2 the first domain label, group 3 the
   label after the first dot. Further labels (as in "mail.example.com") are
   not captured. *)
let email_re =
  let ascii_alnum = alt [ rg 'a' 'z'; rg 'A' 'Z'; rg '0' '9' ] in
  let user_char = alt [ ascii_alnum; set "._%+-" ] in
  let label_char = alt [ ascii_alnum; char '-' ] in
  compile
    (seq
       [
         group (rep1 user_char);
         char '@';
         group (rep1 label_char);
         char '.';
         group (rep1 label_char);
       ])

(* [extract_email s] finds the first address in [s] and returns
   [Some (user, domain, tld)], or [None] when [s] contains no match. *)
let extract_email s =
  match exec_opt email_re s with
  | Some g ->
    let user = Group.get g 1 in
    let domain = Group.get g 2 in
    let tld = Group.get g 3 in
    Some (user, domain, tld)
  | None -> None

let _ = extract_email "联系: alice@example.com 谢谢"
(* => Some ("alice", "example", "com") *)
(* [find_all_numbers text] returns every maximal run of digits in [text],
   in order of appearance. *)
let find_all_numbers text =
  let digits_re = compile (rep1 digit) in
  let whole_match groups = Group.get groups 0 in
  all digits_re text |> List.map whole_match

let _ = find_all_numbers "价格: 100元, 数量: 5个"
(* => ["100"; "5"] *)
⚠️ 注意:Str 模块使用全局状态存储匹配结果,不是线程安全的。多线程环境应使用 Re 库。
UTF-8 处理
OCaml 的 string 是字节序列,处理 UTF-8 需要 Uutf 库:
opam install uutf
(* [utf8_length s] counts the decoded items in [s] when read as UTF-8.
   Every successfully decoded character counts as one, and so does every
   malformed sequence reported by the decoder. *)
let utf8_length s =
  let decoder = Uutf.decoder ~encoding:`UTF_8 (`String s) in
  (* Thread the running count through the recursion instead of a [ref]. *)
  let rec count seen =
    match Uutf.decode decoder with
    | `Uchar _ | `Malformed _ -> count (seen + 1)
    | `End | `Await -> seen
  in
  count 0

let _ = String.length "你好" (* => 6, number of bytes *)
let _ = utf8_length "你好" (* => 2, number of characters *)
(* [is_cjk c] is true when the code point of [c] lies in U+3400..U+4DBF or
   in U+4E00..U+9FFF. *)
let is_cjk c =
  let code = Uchar.to_int c in
  let within lo hi = lo <= code && code <= hi in
  within 0x4E00 0x9FFF || within 0x3400 0x4DBF
💡 提示:简单 ASCII 操作用 String 和 Char 即可。处理中文等多字节文本务必使用 Uutf。
字符串拼接性能
| 方式 | 时间复杂度 | 推荐度 |
|---|---|---|
| ^ 循环拼接 | O(n²) | ❌ 避免 |
| Buffer | O(n) | ✅ 推荐 |
| String.concat | O(n) | ✅ 推荐 |
(* Anti-pattern, kept for comparison: folding with [^] re-copies the whole
   accumulated string at every step. *)
let bad_concat lst = List.fold_left ( ^ ) "" lst
(* Preferred: append each piece to a [Buffer.t] and extract the result
   once at the end. *)
let good_concat lst =
  let out = Buffer.create 256 in
  List.iter (fun piece -> Buffer.add_string out piece) lst;
  Buffer.contents out
(* Preferred: [String.concat] with an empty separator. *)
let concat = String.concat ""
Buffer 使用示例
(* Incrementally build a string in a [Buffer.t]. *)
let buf = Buffer.create 64
let () =
  Buffer.add_string buf "Hello";
  Buffer.add_char buf ' ';
  Buffer.add_string buf "World!";
  (* The standard [Buffer] module has no [add_newline]; append the newline
     character explicitly. *)
  Buffer.add_char buf '\n'
let result = Buffer.contents buf
(* => "Hello World!\n" *)
(* [build_csv headers rows] renders a header line followed by one line per
   row. Fields are joined with ',' and every line, including the last, ends
   with '\n'. Fields are written as-is: no quoting or escaping is applied. *)
let build_csv headers rows =
  let buf = Buffer.create 256 in
  let add_line fields =
    Buffer.add_string buf (String.concat "," fields);
    (* The standard [Buffer] module has no [add_newline]. *)
    Buffer.add_char buf '\n'
  in
  add_line headers;
  List.iter add_line rows;
  Buffer.contents buf
实际业务示例:日志解析
(* One parsed log line. The parser in this section fills the first three
   fields from the three leading bracketed sections of the line, in order,
   and [message] from the text after them. *)
type log_entry = {
timestamp : string; (* text of the first bracketed section *)
level : string; (* text of the second bracketed section, e.g. "error" *)
source : string; (* text of the third bracketed section *)
message : string; (* remainder of the line *)
}
(* [parse_log line] parses a line shaped like
   "[timestamp] [level] [source] message". Returns [None] when [line] does
   not match that shape from its first byte. *)
let parse_log line =
  let line_re =
    Str.regexp
      "\\[\\([^]]+\\)\\] \\[\\([^]]+\\)\\] \\[\\([^]]+\\)\\] \\(.*\\)"
  in
  if not (Str.string_match line_re line 0) then None
  else
    (* Groups are read from Str's record of the most recent match. *)
    let field n = Str.matched_group n line in
    Some
      {
        timestamp = field 1;
        level = field 2;
        source = field 3;
        message = field 4;
      }
(* [format_entry e] renders an entry as "[timestamp] LEVEL [source] message".
   The level is upper-cased and left-aligned in a 5-character column. *)
let format_entry { timestamp; level; source; message } =
  let shown_level = String.uppercase_ascii level in
  Printf.sprintf "[%s] %-5s [%s] %s" timestamp shown_level source message
(* Demo: parse a few sample lines, report the totals, then print the error
   entries. *)
let () =
  let entries =
    [
      "[11/May/2026:10:00:00 +0800] [info] [server] 启动完成";
      "[11/May/2026:10:01:00 +0800] [error] [db] 连接超时";
      "[11/May/2026:10:03:00 +0800] [error] [auth] 认证失败";
    ]
    |> List.filter_map parse_log
  in
  let is_error { level; _ } = String.equal level "error" in
  let errors = List.filter is_error entries in
  Printf.printf "共 %d 条日志,%d 条错误\n"
    (List.length entries) (List.length errors);
  errors |> List.map format_entry |> List.iter print_endline
实际业务示例:URL 处理
(* A URL split into its components, as produced by [parse_url]. *)
type url = {
  scheme : string; (* text before "://" *)
  host : string; (* host name without the port *)
  port : int option; (* explicit port, when one was written *)
  path : string; (* path, starting at the first '/' *)
  query : (string * string) list; (* key/value pairs in original order *)
}

(* [parse_url s] splits [s] into scheme, host, optional port, path and query
   parameters.

   - The scheme is recognised only when the first ':' is followed by "//";
     otherwise the scheme defaults to "http" and the whole input is treated
     as host[:port][/path]. This keeps inputs such as "example.com:8080/x"
     from having their host taken for a scheme.
   - A missing path becomes "/".
   - Each query parameter is split at its first '=', so values may contain
     '='. A parameter without '=' gets the value "". Empty segments (as in
     "a=1&&b=2" or a trailing '?') are skipped.
   - No percent-decoding is performed.

   Raises [Failure] (from [int_of_string]) if the text after the host's ':'
   is not an integer. *)
let parse_url s =
  (* Split [str] around index [i], dropping [skip] bytes starting at [i]. *)
  let split_at str i ~skip =
    ( String.sub str 0 i,
      String.sub str (i + skip) (String.length str - i - skip) )
  in
  let scheme, rest =
    match String.index_opt s ':' with
    | Some i
      when i + 3 <= String.length s && String.sub s (i + 1) 2 = "//" ->
      split_at s i ~skip:3
    | Some _ | None -> ("http", s)
  in
  let host_part, path_part =
    match String.index_opt rest '/' with
    | Some i -> split_at rest i ~skip:0
    | None -> (rest, "/")
  in
  let host, port =
    match String.index_opt host_part ':' with
    | Some i ->
      let host, port_text = split_at host_part i ~skip:1 in
      (host, Some (int_of_string port_text))
    | None -> (host_part, None)
  in
  let path, query_str =
    match String.index_opt path_part '?' with
    | Some i ->
      let path, qs = split_at path_part i ~skip:1 in
      (path, Some qs)
    | None -> (path_part, None)
  in
  let parse_param param =
    match String.index_opt param '=' with
    | Some i -> Some (split_at param i ~skip:1)
    | None -> if param = "" then None else Some (param, "")
  in
  let query =
    match query_str with
    | Some qs -> String.split_on_char '&' qs |> List.filter_map parse_param
    | None -> []
  in
  { scheme; host; port; path; query }
(* [build_url u] renders [u] as "scheme://host[:port]path[?k=v&k=v...]".
   The port and the query string are omitted when absent or empty. Keys and
   values are written as-is. *)
let build_url u =
  let port_suffix =
    match u.port with
    | None -> ""
    | Some p -> Printf.sprintf ":%d" p
  in
  let query_suffix =
    match u.query with
    | [] -> ""
    | pairs ->
      let render (k, v) = k ^ "=" ^ v in
      "?" ^ String.concat "&" (List.map render pairs)
  in
  Printf.sprintf "%s://%s%s%s%s" u.scheme u.host port_suffix u.path
    query_suffix
(* Demo: parse a URL, print two of its parts, then print it rebuilt. *)
let () =
  let parsed =
    parse_url "https://api.example.com:8080/users?page=1&limit=10"
  in
  let rebuilt = build_url parsed in
  Printf.printf "Host: %s, Path: %s\n" parsed.host parsed.path;
  Printf.printf "Rebuilt: %s\n" rebuilt
业务场景
| 场景 | 推荐工具 |
|---|---|
| 日志格式化 | Printf.sprintf |
| CSV 解析 | String.split_on_char |
| URL 解析 | String.index_opt + String.sub |
| 模板渲染 | Buffer + 替换 |
| 数据验证 | Re 正则表达式 |
| 中文处理 | Uutf |
| 大量拼接 | Buffer |