OCaml 教程 / 字符串与字符处理

字符串与字符处理

概述

OCaml 的字符串是字节序列（byte sequence），默认不可变（从 4.06 开始）。char 类型表示单个字节（Latin-1 编码），不直接支持 Unicode。处理中文等多字节字符需要特殊库（如 Uutf）。

String 模块

创建字符串

(* String literals *)
let s = "Hello, OCaml!"
let empty = ""
let multiline = "line1\nline2\nline3"

(* String.length returns the number of bytes, not the number of characters *)
let len = String.length "Hello"    (* => 5 *)
let cn_len = String.length "你好"  (* => 6: each of these characters takes 3 bytes in UTF-8 *)

(* String.make n c builds a string made of n copies of the character c
   (it repeats a single character, not a string) *)
let dashes = String.make 20 '-'    (* "--------------------" *)

(* Build a string from a list of characters by going through a Seq *)
let from_chars = String.of_seq (List.to_seq ['O'; 'C'; 'a'; 'm'; 'l'])
(* => "OCaml" *)

字符串索引与切片

let s = "Hello, World!"

(* Single-character access; the result is a char *)
let first = String.get s 0        (* => 'H' *)
let first' = s.[0]               (* => 'H'; s.[i] is syntactic sugar for String.get s i *)

(* Substring extraction: String.sub s pos len *)
let sub = String.sub s 0 5       (* => "Hello": 5 bytes starting at index 0 *)
let world = String.sub s 7 5     (* => "World" *)

(* Locate a character; String.index and String.rindex raise Not_found
   when the character does not occur *)
let pos = String.index s 'W'     (* => 7, first occurrence *)
let rpos = String.rindex s 'l'   (* => 10, last occurrence *)

(* Safe lookup: the _opt variants return an option instead of raising *)
let opt_pos = String.index_opt s 'z'  (* => None *)
let opt_pos' = String.index_opt s 'W' (* => Some 7 *)

(* Test whether a character occurs in the string *)
let has_o = String.contains s 'o'   (* => true *)

(* Prefix / suffix tests (String.starts_with and String.ends_with
   require OCaml 4.13 or later) *)
let starts_hello = String.starts_with ~prefix:"Hello" s  (* => true *)
let ends_bang = String.ends_with ~suffix:"!" s           (* => true *)

⚠️ 注意：String.sub s pos len 中 pos + len 不能超过 String.length s，否则会抛出 Invalid_argument 异常。

字符串变换

let s = "  Hello, World!  "

(* Strip leading and trailing whitespace *)
let trimmed = String.trim s              (* => "Hello, World!" *)

(* Case conversion; the _ascii functions only affect ASCII letters *)
let upper = String.uppercase_ascii "hello"   (* => "HELLO" *)
let lower = String.lowercase_ascii "HELLO"   (* => "hello" *)
let cap = String.capitalize_ascii "hello"    (* => "Hello" *)

(* Apply a function to every character. Only lowercase vowels are listed
   in the pattern, so uppercase vowels would pass through unchanged. *)
let no_vowels = String.map (fun c ->
  match c with
  | 'a' | 'e' | 'i' | 'o' | 'u' -> '-'
  | c -> c
) "Hello, World!"
(* => "H-ll-, W-rld!" *)

字符串分割与连接

(* Split a string on a separator character *)
let words = String.split_on_char ' ' "hello world ocaml"
(* => ["hello"; "world"; "ocaml"] *)

let csv = String.split_on_char ',' "a,b,c,d"
(* => ["a"; "b"; "c"; "d"] *)

(* Join a list of strings with a separator *)
let joined = String.concat ", " ["Alice"; "Bob"; "Charlie"]
(* => "Alice, Bob, Charlie" *)

(* The leading "" element is what produces the leading "/" in the result *)
let path = String.concat "/" [""; "home"; "user"; "file.txt"]
(* => "/home/user/file.txt" *)

(* Visit every character (byte) of a string *)
let () =
  String.iter (fun c -> Printf.printf "%c " c) "Hello"
(* prints: H e l l o  (every character is followed by a space) *)

(* Same traversal, with the index passed alongside each character *)
let () =
  String.iteri (fun i c -> Printf.printf "[%d]=%c " i c) "ABC"
(* prints: [0]=A [1]=B [2]=C *)

Bytes 模块（可变字符串）

Bytes 提供了可变的字节序列：

(* Create a Bytes value from a string *)
let b = Bytes.of_string "Hello"

(* Overwrite one byte in place *)
let () = Bytes.set b 0 'h'      (* b now holds "hello" *)

(* Read one byte *)
let c = Bytes.get b 0            (* => 'h' *)

(* Convert to an immutable string *)
let s = Bytes.to_string b        (* => "hello" *)

(* Make a copy of the byte sequence *)
let b2 = Bytes.copy b

(* Extract a sub-range; the result is a bytes value, not a string *)
let sub = Bytes.sub b 0 3        (* => bytes holding "hel" *)

(* "Modifying" an immutable string really means building a new one.
   [replace_char s old_c new_c] returns a string equal to [s] in which every
   occurrence of [old_c] has been replaced by [new_c]; [s] itself is left
   untouched. The edits are made on a mutable Bytes value that is converted
   back to a string at the end. *)
let replace_char s old_c new_c =
  let scratch = Bytes.of_string s in
  for i = 0 to Bytes.length scratch - 1 do
    if Bytes.get scratch i = old_c then Bytes.set scratch i new_c
  done;
  Bytes.to_string scratch

let _ = replace_char "hello" 'l' 'r'  (* => "herro" *)

⚠️ 注意：Bytes.of_string 会复制数据，返回一份全新的 bytes，修改它不会影响原始 string，因此上面的 replace_char 是安全的。真正不复制、与原字符串共享内存的是 Bytes.unsafe_of_string（以及反方向的 Bytes.unsafe_to_string）；除非能保证数据之后不会再被修改，否则不要使用这两个 unsafe 函数。

💡 提示：String 是不可变的，Bytes 是可变的。从 4.06 开始，标准库明确区分两者。

Char 模块

(* Conversions between a char and its integer code *)
let _ = Char.code 'A'            (* => 65 *)
let _ = Char.chr 97              (* => 'a' *)

(* Case conversion for ASCII letters *)
let _ = Char.uppercase_ascii 'a' (* => 'A' *)
let _ = Char.lowercase_ascii 'A' (* => 'a' *)

(* Hand-written ASCII character classifiers, expressed with range patterns *)

(* True for the decimal digits '0' through '9' *)
let is_digit = function
  | '0' .. '9' -> true
  | _ -> false

(* True for the ASCII letters a-z and A-Z *)
let is_alpha = function
  | 'a' .. 'z' | 'A' .. 'Z' -> true
  | _ -> false

(* True for ASCII letters and decimal digits *)
let is_alphanum c = is_digit c || is_alpha c

(* True for space, tab, line feed and carriage return *)
let is_whitespace = function
  | ' ' | '\t' | '\n' | '\r' -> true
  | _ -> false

let _ = is_digit '5'       (* => true *)
let _ = is_alpha 'z'       (* => true *)
let _ = is_whitespace ' '  (* => true *)

(* [char_to_string c] is the one-character string containing only [c] *)
let char_to_string c = String.init 1 (fun _ -> c)

字符串格式化：Printf.sprintf

(* Basic formatting: %s is replaced by the string argument *)
let msg = Printf.sprintf "Hello, %s!" "World"   (* => "Hello, World!" *)

(* Number formatting *)
let hex = Printf.sprintf "0x%08X" 255       (* => "0x000000FF": uppercase hex, zero-padded to 8 digits *)
let padded = Printf.sprintf "%05d" 42       (* => "00042": zero-padded to width 5 *)
let float_f = Printf.sprintf "%.2f" 3.14159 (* => "3.14": two digits after the decimal point *)

(* Field width: '-' left-aligns, the default is right-aligned *)
let left = Printf.sprintf "%-10s|" "hello"   (* => "hello     |" *)
let right = Printf.sprintf "%10s|" "hello"   (* => "     hello|" *)

(* Format directly into a Buffer instead of allocating an intermediate string *)
let buf = Buffer.create 64
let () = Printf.bprintf buf "Name: %s, Age: %d" "Alice" 30
let result = Buffer.contents buf            (* => "Name: Alice, Age: 30" *)

格式说明符速查

说明符	类型	示例
`%d`	int	`"%d" 42` => `"42"`
`%f`	float	`"%.2f" 3.14` => `"3.14"`
`%s`	string	`"%s" "hi"` => `"hi"`
`%c`	char	`"%c" 'A'` => `"A"`
`%b`	bool	`"%b" true` => `"true"`
`%x` / `%X`	int (hex)	`"%x" 255` => `"ff"`
`%o`	int (oct)	`"%o" 8` => `"10"`
`%Ld`	int64	`"%Ld" n`
`%!`	flush	`"done%!"`

正则表达式

OCaml 发行版自带基础的 Str 库（它不属于 Stdlib，使用时需要单独链接 str），更推荐使用更现代的第三方 Re 库：

opam install re

Str 模块（标准库）

(* Build with: ocamlfind ocamlc -package str -linkpkg file.ml *)

(* Extract the year, month and day from a date embedded in a longer string.
   Str.string_match only succeeds when the pattern matches exactly at the
   byte offset it is given, and that offset depends on how many bytes the
   UTF-8 prefix occupies ("日期: " is 8 bytes, not 4 characters' worth).
   Str.search_forward scans for the first match instead, so no byte offset
   has to be hard-coded. It raises Not_found when there is no match. *)
let () =
  let re = Str.regexp "\\([0-9]+\\)-\\([0-9]+\\)-\\([0-9]+\\)" in
  let s = "日期: 2024-01-15" in
  match Str.search_forward re s 0 with
  | _match_start ->
    (* matched_group reads the result of the most recent match on [s] *)
    let year = Str.matched_group 1 s in
    let month = Str.matched_group 2 s in
    let day = Str.matched_group 3 s in
    Printf.printf "年=%s, 月=%s, 日=%s\n" year month day
  | exception Not_found -> ()

(* Replace every match of the pattern in the input string *)
let replaced = Str.global_replace (Str.regexp "foo") "bar" "foo and foo"
(* => "bar and bar" *)

Re 库（推荐）

(* 编译: ocamlfind ocamlopt -package re -linkpkg file.ml *)

open Re

(* Compiled pattern for a simple "user@domain.tld" address found inside
   free text.
   Whitespace is excluded from every part so that the match cannot absorb
   the words surrounding the address, and '@' is excluded from the parts
   after the separator.
   Groups: 1 = user, 2 = first label after '@', 3 = label after the first dot.
   Limitation: only two labels are captured, so for "a@mail.example.com"
   the groups are "a", "mail" and "example". *)
let email_re =
  compile (seq [
    group (rep1 (compl [char '@'; space]));
    char '@';
    group (rep1 (compl [char '@'; char '.'; space]));
    char '.';
    group (rep1 (compl [char '@'; char '.'; space]));
  ])

(* [extract_email s] searches [s] for the first address matching [email_re]
   and returns [Some (user, domain, tld)], or [None] when nothing matches. *)
let extract_email s =
  match exec_opt email_re s with
  | None -> None
  | Some g -> Some (Group.get g 1, Group.get g 2, Group.get g 3)

let _ = extract_email "联系: alice@example.com 谢谢"
(* => Some ("alice", "example", "com") *)

(* [find_all_numbers text] returns, in order of appearance, every maximal
   run of digits found in [text]. Group 0 of a match is the whole matched
   substring. *)
let find_all_numbers text =
  let digits_re = compile (rep1 digit) in
  all digits_re text
  |> List.map (fun m -> Group.get m 0)

let _ = find_all_numbers "价格: 100元, 数量: 5个"
(* => ["100"; "5"] *)

⚠️ 注意：Str 模块使用全局状态存储匹配结果，不是线程安全的。多线程环境应使用 Re 库。

UTF-8 处理

OCaml 的 string 是字节序列，处理 UTF-8 需要 Uutf 库：

opam install uutf

(* [utf8_length s] counts the decoding results produced by Uutf for [s]
   when it is read as UTF-8. Every decoded character counts as one, and so
   does every malformed sequence reported by the decoder. Counting stops
   when the decoder reports `End (or `Await). *)
let utf8_length s =
  let decoder = Uutf.decoder ~encoding:`UTF_8 (`String s) in
  let rec count_from seen =
    match Uutf.decode decoder with
    | `Uchar _ | `Malformed _ -> count_from (seen + 1)
    | `End | `Await -> seen
  in
  count_from 0

let _ = String.length "你好"    (* => 6, number of bytes *)
let _ = utf8_length "你好"      (* => 2, number of characters *)

(* [is_cjk c] is true when the code point of [c] lies in one of the two
   ranges checked here: U+3400..U+4DBF or U+4E00..U+9FFF. Code points
   outside these two ranges are reported as not CJK. *)
let is_cjk c =
  let within lo hi code = lo <= code && code <= hi in
  let code = Uchar.to_int c in
  within 0x4E00 0x9FFF code || within 0x3400 0x4DBF code

💡 提示：简单 ASCII 操作用 String 和 Char 即可。处理中文等多字节文本务必使用 Uutf。

字符串拼接性能

方式	时间复杂度	推荐度
`^` 循环拼接	O(n²)	❌ 避免
`Buffer`	O(n)	✅ 推荐
`String.concat`	O(n)	✅ 推荐

(* ❌ Worst option, kept as a counter-example: joining with ^ one piece at
   a time. Every ^ builds a new string from the accumulated result, so the
   total work grows quadratically with the output size. *)
let bad_concat lst =
  let rec append_all acc = function
    | [] -> acc
    | piece :: rest -> append_all (acc ^ piece) rest
  in
  append_all "" lst

(* ✅ Good: append every piece to a single Buffer and read the result once *)
let good_concat lst =
  let out = Buffer.create 256 in
  List.iter (fun piece -> Buffer.add_string out piece) lst;
  Buffer.contents out

(* ✅ Good: String.concat with an empty separator joins the whole list *)
let concat lst = lst |> String.concat ""

Buffer 使用示例

(* A growable buffer; 64 is only the initial capacity *)
let buf = Buffer.create 64

(* The standard Buffer module has no add_newline function, so the line
   break is appended with add_char. *)
let () =
  Buffer.add_string buf "Hello";
  Buffer.add_char buf ' ';
  Buffer.add_string buf "World!";
  Buffer.add_char buf '\n'

let result = Buffer.contents buf
(* => "Hello World!\n" *)

(* [build_csv headers rows] renders a header line followed by one line per
   row. Fields are joined with ',' and every line, including the last one,
   ends with '\n'.
   Fields are written verbatim: a field that itself contains a comma, a
   quote or a line break is NOT escaped.
   The line break is appended with Buffer.add_char because the standard
   Buffer module has no add_newline function. *)
let build_csv headers rows =
  let buf = Buffer.create 256 in
  let add_line fields =
    Buffer.add_string buf (String.concat "," fields);
    Buffer.add_char buf '\n'
  in
  add_line headers;
  List.iter add_line rows;
  Buffer.contents buf

实际业务示例：日志解析

(* One parsed log line. Every field holds the raw text captured from the
   line; nothing is converted or validated. *)
type log_entry = {
  timestamp : string;  (* contents of the first bracketed field *)
  level : string;      (* contents of the second bracketed field, e.g. "info" or "error" *)
  source : string;     (* contents of the third bracketed field, e.g. "server" *)
  message : string;    (* the rest of the line after the bracketed fields *)
}

(* Pattern for "[timestamp] [level] [source] message". It is compiled once
   here instead of on every call to parse_log.
   Groups 1-3 capture the text inside each pair of brackets (anything up to
   the next ']'); group 4 captures the rest of the line. *)
let log_line_re =
  Str.regexp
    "\\[\\([^]]+\\)\\] \\[\\([^]]+\\)\\] \\[\\([^]]+\\)\\] \\(.*\\)"

(* [parse_log line] returns [Some entry] when [line] starts with three
   bracketed fields separated by single spaces and followed by a space, and
   [None] otherwise. The match is anchored at offset 0 of [line]. *)
let parse_log line =
  if Str.string_match log_line_re line 0 then
    Some {
      timestamp = Str.matched_group 1 line;
      level = Str.matched_group 2 line;
      source = Str.matched_group 3 line;
      message = Str.matched_group 4 line;
    }
  else
    None

(* [format_entry e] renders an entry as "[timestamp] LEVEL [source] message".
   The level is upper-cased and left-aligned in a field at least 5
   characters wide. *)
let format_entry { timestamp; level; source; message } =
  let shown_level = String.uppercase_ascii level in
  Printf.sprintf "[%s] %-5s [%s] %s" timestamp shown_level source message

(* Demo driver: parse three sample lines, report how many parsed and how
   many have level "error", then print the error entries. Lines that do not
   parse are dropped by filter_map. *)
let () =
  let entries =
    [
      "[11/May/2026:10:00:00 +0800] [info] [server] 启动完成";
      "[11/May/2026:10:01:00 +0800] [error] [db] 连接超时";
      "[11/May/2026:10:03:00 +0800] [error] [auth] 认证失败";
    ]
    |> List.filter_map parse_log
  in
  let errors =
    List.filter (fun { level; _ } -> String.equal level "error") entries
  in
  Printf.printf "共 %d 条日志，%d 条错误\n"
    (List.length entries) (List.length errors);
  errors |> List.iter (fun entry -> entry |> format_entry |> print_endline)

实际业务示例：URL 处理

(* The components of a URL as produced by parse_url and consumed by
   build_url. All text is stored as it appears in the URL. *)
type url = {
  scheme : string;                 (* text before "://", e.g. "https" *)
  host : string;                   (* host name, without the port *)
  port : int option;               (* explicit port number, if one was given *)
  path : string;                   (* path component, e.g. "/users" *)
  query : (string * string) list;  (* query parameters as (key, value) pairs, in order *)
}

(* [url_drop n s] is [s] without its first [n] bytes.
   Callers guarantee 0 <= n <= String.length s. *)
let url_drop n s = String.sub s n (String.length s - n)

(* Characters allowed in a scheme name *)
let url_is_scheme_char = function
  | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '+' | '-' | '.' -> true
  | _ -> false

(* Split "scheme://rest" into (scheme, rest). The input is treated as having
   a scheme only when its first ':' is immediately followed by "//" and is
   preceded by a non-empty run of scheme characters. Otherwise the ':' is
   left alone (it is most likely a port separator, as in "host:8080/x") and
   the scheme defaults to "http". *)
let url_split_scheme s =
  let without_scheme = ("http", s) in
  match String.index_opt s ':' with
  | None -> without_scheme
  | Some i ->
    let candidate = String.sub s 0 i in
    let tail = url_drop i s in
    if i > 0
       && String.for_all url_is_scheme_char candidate
       && String.starts_with ~prefix:"://" tail
    then (candidate, url_drop 3 tail)
    else without_scheme

(* Index of the first '/' or '?', i.e. where the host[:port] part ends *)
let url_authority_end s =
  let n = String.length s in
  let rec scan i =
    if i >= n then None
    else
      match s.[i] with
      | '/' | '?' -> Some i
      | _ -> scan (i + 1)
  in
  scan 0

(* Split "host:port" into (host, Some port). A missing or empty port gives
   (host, None).
   @raise Failure if the port text is not an integer. *)
let url_split_host_port authority =
  match String.index_opt authority ':' with
  | None -> (authority, None)
  | Some i ->
    let host = String.sub authority 0 i in
    let port_text = url_drop (i + 1) authority in
    if port_text = "" then (host, None)
    else begin
      match int_of_string_opt port_text with
      | Some port -> (host, Some port)
      | None -> failwith ("parse_url: invalid port: " ^ port_text)
    end

(* Parse "k1=v1&k2=v2" into key/value pairs. Each parameter is split at its
   FIRST '=', so a value may itself contain '='. A parameter without '='
   gets the empty value. Empty parameters (from "&&" or a trailing '&') are
   skipped. No percent-decoding is performed. *)
let url_parse_query qs =
  String.split_on_char '&' qs
  |> List.filter_map (fun param ->
    if param = "" then None
    else
      match String.index_opt param '=' with
      | Some i -> Some (String.sub param 0 i, url_drop (i + 1) param)
      | None -> Some (param, ""))

(* [parse_url s] splits [s] into scheme, host, optional port, path and query
   parameters.
   - The scheme defaults to "http" when [s] has no "scheme://" prefix.
   - The path defaults to "/" when [s] has none.
   - The query starts at the first '?' after the host, whether or not a path
     precedes it.
   - Fragments ("#...") are not recognised and stay part of whatever
     component they appear in.
   @raise Failure if a port is present but is not an integer. *)
let parse_url s =
  let (scheme, rest) = url_split_scheme s in
  let (authority, tail) =
    match url_authority_end rest with
    | Some i -> (String.sub rest 0 i, url_drop i rest)
    | None -> (rest, "")
  in
  let (host, port) = url_split_host_port authority in
  let (raw_path, query) =
    match String.index_opt tail '?' with
    | Some i -> (String.sub tail 0 i, url_parse_query (url_drop (i + 1) tail))
    | None -> (tail, [])
  in
  let path = if raw_path = "" then "/" else raw_path in
  { scheme; host; port; path; query }

(* [build_url u] renders [u] as "scheme://host[:port]path[?k=v&k=v...]".
   The port part is omitted when [u.port] is [None], and the '?' part is
   omitted when the query list is empty. Keys and values are written as
   they are, without any escaping. *)
let build_url { scheme; host; port; path; query } =
  let port_part =
    match port with
    | None -> ""
    | Some p -> Printf.sprintf ":%d" p
  in
  let query_part =
    match query with
    | [] -> ""
    | pairs ->
      let rendered = List.map (fun (k, v) -> k ^ "=" ^ v) pairs in
      "?" ^ String.concat "&" rendered
  in
  Printf.sprintf "%s://%s%s%s%s" scheme host port_part path query_part

(* Demo driver: parse a sample URL, print its host and path, then print the
   URL rebuilt from the parsed components. *)
let () =
  let parsed =
    parse_url "https://api.example.com:8080/users?page=1&limit=10"
  in
  Printf.printf "Host: %s, Path: %s\n" parsed.host parsed.path;
  parsed |> build_url |> Printf.printf "Rebuilt: %s\n"

业务场景

场景	推荐工具
日志格式化	`Printf.sprintf`
CSV 解析	`String.split_on_char`
URL 解析	`String.index_opt` + `String.sub`
模板渲染	`Buffer` + 替换
数据验证	`Re` 正则表达式
中文处理	`Uutf`
大量拼接	`Buffer`