强曰为道

与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧.
文档目录

第 08 章:字符串与正则

第 08 章:字符串与正则

“文本处理是脚本语言的灵魂。”


8.1 字符串深入

8.1.1 字符串内部表示

# Ruby 字符串是可变的字节序列
str = "Hello, 世界!"
str.encoding          # => #<Encoding:UTF-8>
str.bytesize          # => 14(字节数:8 个 ASCII 字符各 1 字节,2 个汉字各 3 字节)
str.length            # => 10(字符数)
str.size              # => 10(同 length)

# 字节、字符、码点
"Hello".bytes         # => [72, 101, 108, 108, 111]
"Hello".chars         # => ["H", "e", "l", "l", "o"]
"Hello".codepoints    # => [72, 101, 108, 108, 111]
"你好".bytes           # => [228, 189, 160, 229, 165, 189]
"你好".chars           # => ["你", "好"]
"你好".codepoints      # => [20320, 22909]

# 字符串是可变的
str = "hello"
str << " world"    # 原地修改
str.concat("!")    # 原地修改
str.replace("new") # 完全替换

# frozen_string_literal: true(推荐)
# 该魔法注释会冻结文件中的所有字符串字面量
str = "hello"
str << " world"  # => FrozenError(如果启用了 frozen_string_literal)

8.1.2 高级字符串操作

# 扫描(scan)- 提取所有匹配
"hello world hello".scan(/hello/)        # => ["hello", "hello"]
"abc123def456".scan(/\d+/)              # => ["123", "456"]
"2024-01-15 2023-12-25".scan(/\d{4}-\d{2}-\d{2}/)  # => ["2024-01-15", "2023-12-25"]

# 带捕获组的 scan
"hello:world foo:bar".scan(/(\w+):(\w+)/)
# => [["hello", "world"], ["foo", "bar"]]

# 替换(sub / gsub)
"hello world".sub("world", "Ruby")       # => "hello Ruby"
"hello world".gsub("l", "L")             # => "heLLo worLd"
"hello world".gsub(/\w+/, &:upcase)      # => "HELLO WORLD"
"hello world".gsub(/(\w+)/) { $1.capitalize }  # => "Hello World"

# gsub 带命名捕获
"2024-01-15".gsub(/(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2})/) do
  "#{Regexp.last_match[:month]}/#{Regexp.last_match[:day]}/#{Regexp.last_match[:year]}"
end
# => "01/15/2024"

# 删除(delete)
"hello world".delete("l")                # => "heo word"
"hello world".delete("a-z")              # => " "
"hello world".delete("^a-z")             # => "helloworld"

# squeeze(压缩连续字符)
"hellooo  woorlld".squeeze               # => "helo world"
"hellooo  woorlld".squeeze("l")          # => "helooo  woorld"
"aaa   bbb".squeeze(" ")                 # => "aaa bbb"

# count(计数)
"hello".count("l")                       # => 2
"hello".count("a-z")                     # => 5
"Hello".count("a-z")                     # => 4
"Hello".count("a-z", "A-Z")             # => 0(多个参数取交集;统计全部字母请用 count("a-zA-Z"),结果为 5)

# 缩写词(abbrev)
require "abbrev"
%w[cat car cart carton].abbrev
# => {"cat"=>"cat", "car"=>"car", "cart"=>"cart",
#     "carto"=>"carton", "carton"=>"carton"}
# 注:"c"、"ca" 对应多个单词,有歧义,因此不会出现在结果中。
# abbrev 自 Ruby 3.4 起由默认 gem 改为 bundled gem,使用 Bundler 时需在 Gemfile 中声明。

8.1.3 字符串插值

name = "Ruby"
version = 3.3

# 基本插值
"Hello, #{name} #{version}!"  # => "Hello, Ruby 3.3!"

# 表达式插值
"2 + 3 = #{2 + 3}"            # => "2 + 3 = 5"
"数组长度: #{[1,2,3].length}"  # => "数组长度: 3"

# 方法调用插值
"hello".upcase                 # => "HELLO"
"结果: #{[1,2,3].map { |n| n * 2 }}"  # => "结果: [2, 4, 6]"

# 格式化插值
"圆周率: %.4f" % Math::PI     # => "圆周率: 3.1416"
"%s 有 %d 个字符" % ["hello", 5]  # => "hello 有 5 个字符"

# heredoc 插值
name = "World"
greeting = <<~TEXT
  Hello, #{name}!
  Today is #{Date.today}.
  Ruby version: #{RUBY_VERSION}
TEXT

8.1.4 多行字符串

# heredoc 语法
sql = <<~SQL
  SELECT u.name, u.email, o.total
  FROM users u
  JOIN orders o ON u.id = o.user_id
  WHERE o.created_at > '2024-01-01'
  ORDER BY o.total DESC
SQL

# 单引号 heredoc(不插值)
code = <<~'RUBY'
  def hello(name)
    "Hello, #{name}!"
  end
RUBY

# Indented ("squiggly") heredoc: <<~ strips the common leading indentation.
#
# @return [String] a small YAML document with server and database settings
def generate_config
  yaml_text = <<~YAML
    server:
      host: localhost
      port: 3000
    database:
      adapter: postgresql
  YAML
  yaml_text
end

8.2 正则表达式

8.2.1 正则基础

# 创建正则表达式
regex = /hello/               # 字面量
regex = Regexp.new("hello")   # 构造函数
regex = %r{path/to/file}      # %r 语法(路径中有 / 时方便)

# 匹配操作
"hello world" =~ /hello/      # => 0(返回索引)
"hello world" !~ /hello/      # => false
"hello world".match(/hello/)  # => #<MatchData "hello">
"hello world".match?(/hello/) # => true(Ruby 2.4+,不设置 $~)

# 特殊变量
"hello world" =~ /(\w+) (\w+)/
$1          # => "hello"(第一个捕获组)
$2          # => "world"(第二个捕获组)
$~          # => MatchData 对象
$&          # => "hello world"(整个匹配)
$`          # => ""(匹配前的内容)
$'          # => ""(匹配后的内容)

8.2.2 正则元字符

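| 元字符 | 说明 | 示例 |
| --- | --- | --- |
| `.` | 任意字符(除换行) | `/a.c/` 匹配 “abc”, “a1c” |
| `^` | 行首 | `/^hello/` 匹配以 hello 开头 |
| `$` | 行尾 | `/world$/` 匹配以 world 结尾 |
| `\b` | 单词边界 | `/\bword\b/` 匹配完整单词 |
| `\d` | 数字 | `/\d+/` 匹配一个或多个数字 |
| `\D` | 非数字 | `/\D+/` 匹配非数字 |
| `\w` | 单词字符 | `/\w+/` 匹配字母数字下划线 |
| `\W` | 非单词字符 | `/\W+/` 匹配非单词字符 |
| `\s` | 空白字符 | `/\s+/` 匹配空格、制表符等 |
| `\S` | 非空白字符 | `/\S+/` 匹配非空白字符 |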

8.2.3 量词

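| 量词 | 说明 | 示例 |
| --- | --- | --- |
| `*` | 零次或多次 | `/a*/` 匹配 “”, “a”, “aa”… |
| `+` | 一次或多次 | `/a+/` 匹配 “a”, “aa”… |
| `?` | 零次或一次 | `/a?/` 匹配 “”, “a” |
| `{n}` | 恰好 n 次 | `/\d{3}/` 匹配 3 位数字 |
| `{n,m}` | n 到 m 次 | `/\d{2,4}/` 匹配 2-4 位数字 |
| `{n,}` | 至少 n 次 | `/\d{2,}/` 匹配至少 2 位数字 |
| `*?` | 惰性零次或多次 | `/a.*?b/` 最短匹配 |
| `+?` | 惰性一次或多次 | `/a.+?b/` 最短匹配 |
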
# 贪婪 vs 惰性
html = "<b>hello</b><b>world</b>"
html.scan(/<b>.*<\/b>/)     # => ["<b>hello</b><b>world</b>"](贪婪)
html.scan(/<b>.*?<\/b>/)    # => ["<b>hello</b>", "<b>world</b>"](惰性)

8.2.4 字符类和分组

# 字符类
/[aeiou]/          # 匹配任一元音字母
/[a-z]/            # 匹配小写字母
/[A-Z]/            # 匹配大写字母
/[0-9]/            # 匹配数字
/[^aeiou]/         # 匹配非元音字母(取反)
/[a-zA-Z0-9]/      # 匹配字母数字

# 分组
/(hello) (world)/           # 捕获组
/(?:hello) (world)/         # 非捕获组
/(?<first>\w+) (?<last>\w+)/ # 命名捕获组

# 命名捕获使用
"John Doe" =~ /(?<first>\w+) (?<last>\w+)/
Regexp.last_match[:first]   # => "John"
Regexp.last_match[:last]    # => "Doe"

# 反向引用
/(\w+) \1/                  # 匹配重复单词
"hello hello".match(/(\w+) \1/)  # => MatchData
"hello world".match(/(\w+) \1/)  # => nil

# 选择(alternation)
/cat|dog/          # 匹配 "cat" 或 "dog"
/red|green|blue/   # 匹配颜色

8.2.5 锚点

# 锚点
/^Hello/           # 行首
/world$/           # 行尾
/\AHello/          # 字符串开头(不受多行影响)
/world\z/          # 字符串结尾
/\bword\b/         # 单词边界
/(?=pattern)/      # 正向前瞻
/(?!pattern)/      # 负向前瞻
/(?<=pattern)/     # 正向后顾
/(?<!pattern)/     # 负向后顾

# 示例
"hello world" =~ /\Ahello/   # => 0
"hello world\nhello" =~ /^hello/  # => 0(Ruby 中 ^ 总是匹配每一行的行首,无需额外选项)
"hello world\nhello" =~ /\Ahello/ # => 0

# 正向前瞻
"100px".scan(/\d+(?=px)/)    # => ["100"]
"100em".scan(/\d+(?=px)/)    # => []

# 负向前瞻
"100px".scan(/\d+(?!px)/)    # => ["10"](\d+ 先匹配 "100",但其后是 "px",于是回溯为 "10";剩下的 "0" 后面紧跟 "px",不匹配)
"100em".scan(/\d+(?!px)/)    # => ["100"]

# 正向后顾
"$100".scan(/(?<=\$)\d+/)    # => ["100"]

8.2.6 正则选项

# 选项
/hello/i              # 忽略大小写
/hello/m              # 多行模式(. 匹配换行)
/hello/x              # 扩展模式(忽略空白和注释)
/hello/o              # 只编译一次

# Regexp 构造
Regexp.new("hello", Regexp::IGNORECASE)
Regexp.new("hello", Regexp::IGNORECASE | Regexp::MULTILINE)

# 选项内联
/(?i)hello/           # 忽略大小写
/(?im)hello/          # 忽略大小写 + 多行

# 扩展模式(带注释)
regex = /
  \A                    # 字符串开头
  (?<year>\d{4})        # 年
  -                     # 分隔符
  (?<month>\d{2})       # 月
  -                     # 分隔符
  (?<day>\d{2})         # 日
  \z                    # 字符串结尾
/x

"2024-01-15".match(regex)

8.2.7 MatchData 对象

str = "2024-01-15: Release of Ruby 3.3"
match = str.match(/(?<date>\d{4}-\d{2}-\d{2}): (?<event>.+)/)

match[0]              # => "2024-01-15: Release of Ruby 3.3"(完整匹配)
match[1]              # => "2024-01-15"(第一个捕获组)
match[2]              # => "Release of Ruby 3.3"(第二个捕获组)
match[:date]          # => "2024-01-15"(命名捕获)
match[:event]         # => "Release of Ruby 3.3"
match.captures        # => ["2024-01-15", "Release of Ruby 3.3"]
match.begin(0)        # => 0(匹配开始位置)
match.end(0)          # => 31(匹配结束位置)
match.pre_match       # => ""(匹配前的内容)
match.post_match      # => ""(匹配后的内容)
match.length          # => 3(匹配数量)
match.names           # => ["date", "event"]

8.3 正则实战

8.3.1 常用正则模式

# 邮箱验证(简化版)
EMAIL_REGEX = /\A[\w+\-.]+@[a-z\d\-]+(\.[a-z\d\-]+)*\.[a-z]+\z/i
"user@example.com".match?(EMAIL_REGEX)  # => true

# 手机号(中国)
PHONE_REGEX = /\A1[3-9]\d{9}\z/
"13812345678".match?(PHONE_REGEX)       # => true

# URL
URL_REGEX = /\Ahttps?:\/\/[\w\-]+(\.[\w\-]+)+([\w\-.,@?^=%&:\/~+#]*[\w\-@?^=%&\/~+#])?\z/

# IP 地址
IP_REGEX = /\A(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})\z/

# 身份证号(简化)
ID_CARD_REGEX = /\A\d{17}[\dXx]\z/

# HTML 标签
/<(\w+)([^>]*)>(.*?)<\/\1>/m

# 提取引号内内容
/"([^"]+)"/

# Converts a CamelCase identifier to snake_case.
#
# Works in three steps: split an acronym run from the capitalized word that
# follows it ("HTMLParser" -> "HTML_Parser"), split a lowercase letter or
# digit from the uppercase letter that follows it, then lowercase everything.
#
# @param str [String] identifier in camel case
# @return [String] a new snake_case string
def camel_to_snake(str)
  acronyms_split = str.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
  words_split = acronyms_split.gsub(/([a-z\d])([A-Z])/, '\1_\2')
  words_split.downcase
end

camel_to_snake("CamelCase")     # => "camel_case"
camel_to_snake("HTMLParser")    # => "html_parser"
camel_to_snake("myXMLParser")   # => "my_xml_parser"

# Converts a snake_case identifier to CamelCase.
#
# Each underscore-separated segment is capitalized (first letter upper case,
# the rest lower case) and the segments are concatenated.
#
# @param str [String] identifier in snake case
# @return [String] a new CamelCase string
def snake_to_camel(str)
  segments = str.split('_')
  segments.map { |segment| segment.capitalize }.join
end

snake_to_camel("camel_case")    # => "CamelCase"

8.3.2 日志解析

# 解析 Nginx 访问日志
log_line = '192.168.1.1 - - [15/Jan/2024:10:30:00 +0800] "GET /api/users HTTP/1.1" 200 1234'

pattern = /
  (?<ip>\d+\.\d+\.\d+\.\d+)
  \s-\s
  (?<user>\S+)
  \s\[
  (?<time>[^\]]+)
  \]\s"
  (?<method>\w+)
  \s
  (?<path>\S+)
  \s
  (?<protocol>[^"]+)
  "\s
  (?<status>\d+)
  \s
  (?<size>\d+)
/x

match = log_line.match(pattern)
if match
  puts "IP: #{match[:ip]}"
  puts "时间: #{match[:time]}"
  puts "请求: #{match[:method]} #{match[:path]}"
  puts "状态: #{match[:status]}"
  puts "大小: #{match[:size]} bytes"
end

8.3.3 文本替换

# Rewrites Markdown links of the form [label](url) as HTML anchor tags.
#
# The label and URL are inserted verbatim; no HTML escaping is performed.
#
# @param text [String] text that may contain Markdown links
# @return [String] a new string with every link replaced by an <a> element
def md_links_to_html(text)
  link_pattern = /\[([^\]]+)\]\(([^)]+)\)/
  text.gsub(link_pattern) do
    label = Regexp.last_match(1)
    url = Regexp.last_match(2)
    "<a href=\"#{url}\">#{label}</a>"
  end
end

md_links_to_html("Visit [Ruby](https://ruby-lang.org) for more info.")
# => "Visit <a href=\"https://ruby-lang.org\">Ruby</a> for more info."

# Wraps every case-insensitive occurrence of the given keywords in ** markers.
#
# All keywords are combined into one alternation and applied in a single pass,
# so text inserted for one keyword is never re-scanned for another. With one
# gsub per keyword, overlapping keywords such as "Ruby" and "Rub" would nest
# their markers. Empty keywords are ignored, because an empty pattern matches
# at every position.
#
# @param text [String] the text to highlight
# @param keywords [Array<String>] literal keywords (regex metacharacters are escaped)
# @return [String] a new string with matches wrapped in **...**; +text+ itself
#   when there is nothing to search for
def highlight(text, keywords)
  # Longest first, so that "Ruby" wins over "Rub" at the same position.
  terms = keywords.reject(&:empty?).uniq.sort_by { |term| -term.length }
  return text if terms.empty?

  alternation = terms.map { |term| Regexp.escape(term) }.join('|')
  pattern = Regexp.new(alternation, Regexp::IGNORECASE)
  text.gsub(pattern) { |hit| "**#{hit}**" }
end

highlight("Ruby is great", ["Ruby", "great"])
# => "**Ruby** is **great**"

8.4 编码处理

8.4.1 Ruby 编码基础

# 查看默认编码
Encoding.default_external   # => #<Encoding:UTF-8>
Encoding.default_internal   # => nil(通常为 nil)

# 字符串编码
str = "Hello, 世界!"
str.encoding               # => #<Encoding:UTF-8>
str.valid_encoding?        # => true

# 转换编码
ascii_str = str.encode("ASCII", invalid: :replace, undef: :replace)
# => "Hello, ??!"

# 强制编码(不转换字节,只改变编码标记)
binary = "\xC0\xC1".force_encoding("UTF-8")
binary.valid_encoding?     # => false

# 设置编码
# -*- coding: utf-8 -*-          # 文件顶部(旧方式)
# encoding: utf-8                 # 文件顶部
# frozen_string_literal: true     # 推荐(同时冻结字符串)

8.4.2 编码转换

# Converts a text file from one encoding to another.
#
# Reads +input_path+ as +from_enc+, transcodes the content to +to_enc+
# (invalid byte sequences and characters undefined in the target encoding
# are replaced rather than raising) and writes the result to +output_path+.
#
# @param input_path [String] path of the file to read
# @param output_path [String] path of the file to write
# @param from_enc [String, Encoding] encoding of the input file
# @param to_enc [String, Encoding] encoding of the output file
# @return [Integer] whatever File.write returns (the number of bytes written)
def convert_file(input_path, output_path, from_enc, to_enc)
  source_text = File.read(input_path, encoding: from_enc)
  File.write(
    output_path,
    source_text.encode(to_enc, invalid: :replace, undef: :replace),
    encoding: to_enc
  )
end

# 常见编码
encodings = Encoding.list.map(&:name)
puts encodings.sort.join(", ")
# ASCII, ISO-8859-1, UTF-8, UTF-16, ...

# 编码检测(需要 rchardet gem)
# require "rchardet"
# detector = CharDet.detect("未知编码的字符串")
# puts detector["encoding"]

8.4.3 特殊字符处理

# Unicode 转义
"\u{1F600}"          # => "😀"
"\u{4F60 597D}"      # => "你好"

# 获取 Unicode 码点
"😀".unpack("U*")    # => [128512]

# Strips a leading UTF-8 byte order mark (bytes EF BB BF) from a string.
#
# Only a BOM at the very start of the string is removed; the input is not
# modified.
#
# @param str [String] text that may start with a BOM
# @return [String] a new string without the leading BOM
def remove_bom(str)
  leading_bom = /\A\xEF\xBB\xBF/
  str.sub(leading_bom, '')
end

# Removes zero-width characters from a string.
#
# Deletes every U+200B, U+200C, U+200D and U+FEFF wherever it occurs; the
# input is not modified.
#
# @param str [String] text that may contain invisible characters
# @return [String] a new string without those characters
def clean_invisible(str)
  zero_width = /[\u200B\u200C\u200D\uFEFF]/
  str.gsub(zero_width, '')
end

8.5 符号化技巧

8.5.1 字符串与符号互转

# 字符串 → 符号
"hello".to_sym        # => :hello
"hello".intern        # => :hello
"hello world".to_sym  # => :"hello world"

# 符号 → 字符串
:hello.to_s           # => "hello"
:hello.inspect        # => ":hello"

# 批量转换
%w[name age city].map(&:to_sym)   # => [:name, :age, :city]
[:name, :age, :city].map(&:to_s)  # => ["name", "age", "city"]

8.5.2 哈希键符号化

# Rails 的 symbolize_keys
hash = { "name" => "Alice", "age" => 25 }

# 手动实现
hash.transform_keys(&:to_sym)
# => { name: "Alice", age: 25 }

# Deep symbolization: recursively converts String hash keys to Symbols.
#
# Hashes are rebuilt with every String key turned into a Symbol (keys of any
# other type are kept as they are) and their values processed recursively;
# arrays are mapped element by element; any other object is returned as is.
#
# @param obj [Object] a Hash, an Array, or any other value
# @return [Object] a new Hash/Array with symbolized keys, or +obj+ itself
def deep_symbolize_keys(obj)
  case obj
  when Array
    obj.map { |element| deep_symbolize_keys(element) }
  when Hash
    pairs = obj.map do |key, value|
      symbolized = key.is_a?(String) ? key.to_sym : key
      [symbolized, deep_symbolize_keys(value)]
    end
    pairs.to_h
  else
    obj
  end
end

data = {
  "user" => {
    "name" => "Alice",
    "address" => {
      "city" => "Beijing"
    }
  }
}

deep_symbolize_keys(data)
# => { user: { name: "Alice", address: { city: "Beijing" } } }

8.6 性能优化

8.6.1 字符串拼接

# ❌ 低效:多次创建新字符串
result = ""
[1, 2, 3, 4, 5].each { |n| result += n.to_s }

# ✅ 高效:原地修改
result = ""
[1, 2, 3, 4, 5].each { |n| result << n.to_s }

# ✅ 最佳:使用 join
result = [1, 2, 3, 4, 5].map(&:to_s).join

# ✅ 或使用 StringIO
require "stringio"
io = StringIO.new
[1, 2, 3, 4, 5].each { |n| io << n.to_s }
result = io.string

8.6.2 正则预编译

# ❌ 正则直接写在方法体内,无法在别处复用(注:不含插值的正则字面量只在解析时编译一次,并不会每次调用都重新编译)
# Checks +email+ against a simplified address pattern written inline.
#
# @param email [String] candidate address
# @return [Boolean] true when the whole string matches the pattern
def validate_email(email)
  inline_pattern = /\A[\w+\-.]+@[a-z\d\-]+(\.[a-z\d\-]+)*\.[a-z]+\z/i
  email.match?(inline_pattern)
end

# Simplified e-mail pattern kept in a constant so it can be shared and reused.
EMAIL_REGEX = /\A[\w+\-.]+@[a-z\d\-]+(\.[a-z\d\-]+)*\.[a-z]+\z/i

# Checks +email+ against EMAIL_REGEX.
#
# @param email [String] candidate address
# @return [Boolean] true when the whole string matches EMAIL_REGEX
def validate_email(email)
  shared_pattern = EMAIL_REGEX
  email.match?(shared_pattern)
end

# 使用 freeze 冻结(Ruby 3.0 起正则字面量本身已是冻结的,此处的 freeze 仅对旧版本有意义)
EMAIL_REGEX = /\A[\w+\-.]+@[a-z\d\-]+(\.[a-z\d\-]+)*\.[a-z]+\z/i.freeze

8.7 动手练习

  1. CSV 解析器(不使用标准库)
# Exercise stub: implement a simple CSV parser without the standard library.
# parse_csv("a,b,c\n1,2,3") => [["a","b","c"], ["1","2","3"]]
def parse_csv(text)
  # your code here...
end
  2. Markdown 转 HTML(基础版)
# Exercise stub: convert "#" headings, **bold**, *italic* and [link](url).
def md_to_html(text)
  # your code here...
end
  3. 敏感信息脱敏
# Exercise stub: mask phone numbers, e-mail addresses and ID-card numbers.
# mask_info("手机: 13812345678, 邮箱: test@example.com")
# => "手机: 138****5678, 邮箱: t***@example.com"
def mask_info(text)
  # your code here...
end

8.8 本章小结

| 要点 | 说明 |
| --- | --- |
| 字符串可变性 | Ruby 字符串默认可变,推荐使用 frozen_string_literal |
| 正则表达式 | 支持贪婪/惰性量词、命名捕获、前瞻后顾 |
| 编码 | Ruby 默认 UTF-8,注意编码转换和验证 |
| 性能 | 预编译正则、使用 << 拼接、避免创建过多临时对象 |

📖 扩展阅读


上一章← 第 07 章:数组与哈希 下一章第 09 章:面向对象编程 →