Python 编程教程 / 08 - 字符串与文本处理

第 08 章：字符串与文本处理

深入掌握字符串操作、格式化、正则表达式和文本处理技巧。

8.1 字符串基础

8.1.1 创建方式

s1 = 'single quotes'
s2 = "double quotes"
s3 = '''triple single quotes
        multiline'''
s4 = """triple double quotes
        multiline"""

# 原始字符串
raw = r"C:\Users\new\docs"
raw_pattern = r"\d{3}-\d{4}"

# bytes
data = b"hello"

8.1.2 字符串方法

s = "Hello, Python World!"

# Searching
print(s.find("Python"))      # 7
print(s.find("Java"))        # -1 (not found)
print(s.count("l"))          # 3 (two in "Hello", one in "World")
print(s.index("Python"))     # 7 (raises ValueError when not found)

# 判断
print(s.startswith("Hello"))  # True
print(s.endswith("!"))        # True
print("abc".isalnum())        # True
print("123".isdigit())        # True
print("hello".islower())      # True

# 大小写
print(s.upper())    # HELLO, PYTHON WORLD!
print(s.lower())    # hello, python world!
print(s.title())    # Hello, Python World!
print(s.swapcase()) # hELLO, pYTHON wORLD!

# 去除空白
s = "  hello  "
print(s.strip())    # hello
print(s.lstrip())   # hello  
print(s.rstrip())   #   hello
print("---hello---".strip("-"))  # hello

# 替换
s = "Hello World"
print(s.replace("World", "Python"))  # Hello Python

# 分割与拼接
csv = "apple,banana,cherry"
parts = csv.split(",")
print(parts)        # ['apple', 'banana', 'cherry']
print("-".join(parts))  # apple-banana-cherry

# 分割行
text = "line1\nline2\nline3"
lines = text.splitlines()

8.2 f-string 格式化

8.2.1 基本用法

name = "Alice"
age = 30
print(f"Hello, {name}! You are {age} years old.")

# 表达式
print(f"2 + 3 = {2 + 3}")
print(f"{'hello':>20}")    # 右对齐
print(f"{'hello':<20}")    # 左对齐
print(f"{'hello':^20}")    # 居中
print(f"{'hello':*^20}")   # 用*填充: *******hello********

8.2.2 数字格式化

pi = 3.141592653589793

# 小数位数
print(f"{pi:.2f}")      # 3.14
print(f"{pi:.4f}")      # 3.1416

# 千位分隔符
big_number = 1234567890
print(f"{big_number:,}")        # 1,234,567,890
print(f"{big_number:,.2f}")     # 1,234,567,890.00

# 百分比
ratio = 0.8567
print(f"{ratio:.1%}")   # 85.7%

# 科学计数法
print(f"{pi:.2e}")      # 3.14e+00

# 进制
num = 255
print(f"{num:b}")       # 11111111（二进制）
print(f"{num:o}")       # 377（八进制）
print(f"{num:x}")       # ff（十六进制）
print(f"{num:#x}")      # 0xff
print(f"{num:X}")       # FF

# 填充和对齐
print(f"{42:05d}")      # 00042
print(f"{42:>10}")      #         42
print(f"{42:<10}")      # 42        
print(f"{42:^10}")      #     42

8.2.3 Python 3.12+ 改进

# The "=" specifier (available since 3.8) prints the expression text followed by repr() of its value
items = ["apple", "banana", "cherry"]
print(f"{', '.join(items) = }")  # ', '.join(items) = 'apple, banana, cherry'

# Nested quotes
name = "world"
print(f"{'hello'} {name}")  # hello world -- 3.12+ (PEP 701) also allows reusing the same quote type: f"{"hello"} {name}"

8.3 其他格式化方式

# format() 方法
print("Hello, {}!".format("World"))
print("{name} is {age} old".format(name="Alice", age=30))

# % 格式化（旧式）
print("Hello, %s! Age: %d" % ("Alice", 30))
print("Pi: %.2f" % 3.14159)

# Template（安全的用户输入格式化）
from string import Template
t = Template("Hello, $name! You are $age years old.")
print(t.substitute(name="Alice", age=30))

方式	推荐度	适用场景
f-string	⭐⭐⭐⭐⭐	日常开发首选
format()	⭐⭐⭐⭐	模板场景
%	⭐⭐	旧代码维护
Template	⭐⭐⭐	用户输入模板（防注入）

8.4 正则表达式

8.4.1 基本匹配

import re

# Sample text containing one mobile number and one e-mail address
text = "我的手机号是 13812345678，邮箱是 user@example.com"

# Find the first mobile number (11 digits, starting with 1 and a second digit of 3-9)
phone = re.search(r"1[3-9]\d{9}", text)
if phone:
    print(phone.group())  # 13812345678

# Find every match; findall returns a list of the matched strings
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.]+", text)
print(emails)  # ['user@example.com']

# Substitute: mask every digit with "*"
result = re.sub(r"\d", "*", "电话: 13812345678")
print(result)  # 电话: ***********

8.4.2 常用模式

模式	说明	示例
`.`	任意字符（除换行）	`a.c` 匹配 “abc”
`\d`	数字	`\d+` 匹配 “123”
`\D`	非数字	`\D+` 匹配 “abc”
`\w`	字母数字下划线	`\w+` 匹配 “hello_123”
`\s`	空白字符	`\s+` 匹配 " \t\n"
`*`	0 次或多次	`ab*c` 匹配 “ac”, “abc”
`+`	1 次或多次	`ab+c` 匹配 “abc”
`?`	0 次或 1 次	`ab?c` 匹配 “ac”, “abc”
`{n,m}`	n 到 m 次	`\d{3,5}` 匹配 3-5 位数字
`^`	行首	`^Hello`
`$`	行尾	`World$`
`[]`	字符集	`[aeiou]` 元音
`|`	或	`cat|dog` 匹配 “cat” 或 “dog”
`()`	捕获组	`(\d+)-(\d+)`

8.4.3 实用模式

# IPv4 address: each octet is restricted to 0-255 without leading zeros
# (a bare \d{1,3} would also accept impossible values such as 999)
ipv4 = r"\b(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)\b"

# Date in YYYY-MM-DD form (month 01-12, day 01-31; per-month day counts are not checked)
date_pattern = r"\b\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])\b"

# URL (scheme plus word characters, dots, slashes and hyphens)
url_pattern = r"https?://[\w./-]+"

# Chinese characters (code points U+4E00 through U+9FFF)
chinese = r"[\u4e00-\u9fff]+"

8.4.4 分组与捕获

import re

# 命名分组
pattern = r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})"
match = re.search(pattern, "生日: 1990-05-15")
if match:
    print(match.group("year"))   # 1990
    print(match.group("month"))  # 05
    print(match.group("day"))    # 15

# 非捕获组
pattern = r"(?:http|https)://(\S+)"

# findall 返回组内容
text = "价格: ¥100, $200, €300"
prices = re.findall(r"[¥$€](\d+)", text)
print(prices)  # ['100', '200', '300']

8.4.5 编译正则

# Compile each pattern once and reuse the compiled object
PHONE_RE = re.compile(r"1[3-9]\d{9}")
EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.]+")

text = "联系方式: 13812345678, user@example.com"
print(PHONE_RE.findall(text))  # ['13812345678']
print(EMAIL_RE.findall(text))  # ['user@example.com']

8.4.6 re 模块函数

函数	说明
`re.search()`	搜索第一个匹配
`re.match()`	从字符串开头匹配
`re.fullmatch()`	完全匹配整个字符串
`re.findall()`	查找所有匹配
`re.finditer()`	返回匹配迭代器
`re.sub()`	替换匹配内容
`re.split()`	用正则分割字符串
`re.compile()`	预编译正则

8.5 编码与 Unicode

8.5.1 字符编码基础

# Encode: str -> bytes
text = "你好世界"
encoded = text.encode("utf-8")
print(encoded)  # b'\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c'

# Decode: bytes -> str
decoded = encoded.decode("utf-8")
print(decoded)  # 你好世界

# Length in characters vs. length in bytes
print(len(text))        # 4 (characters)
print(len(encoded))     # 12 (bytes; each of these characters takes 3 bytes in UTF-8)

# Handling decode errors
# b'\xe4\xbd\xa0' would be the complete encoding of "你"; dropping the last
# byte leaves a truncated sequence that cannot be decoded strictly.
text = b'\xe4\xbd'
print(text.decode("utf-8", errors="ignore"))   # empty string (undecodable bytes are dropped)
print(text.decode("utf-8", errors="replace"))  # � (U+FFFD replacement character)

8.5.2 Unicode 操作

# 字符与码点
print(ord("你"))    # 20320
print(chr(20320))   # "你"
print(hex(ord("你")))  # 0x4f60

# Unicode 类别
import unicodedata
print(unicodedata.category("A"))   # Lu (Letter, uppercase)
print(unicodedata.name("你"))      # CJK UNIFIED IDEOGRAPH-4F60

# Unicode 正规化
s1 = "\u0041\u0301"   # A + ́（两个字符）
s2 = "\u00C1"         # Á（一个字符）
print(s1 == s2)                     # False
print(unicodedata.normalize("NFC", s1) == s2)  # True

8.6 文本处理技巧

8.6.1 多行文本

# textwrap 模块
import textwrap

text = """Python 是一门优雅且强大的编程语言。
它支持多种编程范式，包括面向对象、函数式和过程式编程。
Python 的设计哲学强调代码的可读性和简洁性。"""

# 缩减宽度
print(textwrap.fill(text, width=30))

# 去除缩进
code = """
    def hello():
        print("Hello!")
"""
print(textwrap.dedent(code))

# 缩进
print(textwrap.indent(text, ">> "))

8.6.2 字符串模板

from string import Template

# 安全的模板（防止用户输入中的 $ 注入）
t = Template("Dear $name, your order #$order_id is confirmed.")

# safe_substitute 不会因缺少变量而报错
print(t.safe_substitute(name="Alice", order_id="12345"))
# Dear Alice, your order #12345 is confirmed.

8.6.3 difflib 文本对比

import difflib

text1 = "Hello World"
text2 = "Hello Python"

# splitlines() strips the line endings, so lineterm="" keeps the generated
# header lines newline-free as well; every line is then joined with "\n".
# (With the default lineterm="\n" and "".join, the "-" and "+" lines would
# run together on a single line.)
diff = difflib.unified_diff(
    text1.splitlines(),
    text2.splitlines(),
    fromfile="original",
    tofile="modified",
    lineterm="",
)
diff_text = "\n".join(diff)
print(diff_text)
# --- original
# +++ modified
# @@ -1 +1 @@
# -Hello World
# +Hello Python

8.7 路径处理

from pathlib import Path

# 创建路径
p = Path("/home/user/documents/file.txt")
p = Path("documents") / "file.txt"  # 路径拼接

# 常用属性
print(p.name)       # file.txt
print(p.stem)       # file
print(p.suffix)     # .txt
print(p.parent)     # documents
print(p.parts)      # ('documents', 'file.txt')

# Path operations
p.exists()          # True if the path exists
p.is_file()         # True if it is a regular file
p.is_dir()          # True if it is a directory
# p points at a file, so create its containing directory; calling p.mkdir()
# would create a directory named "file.txt" and break the read/write below.
p.parent.mkdir(parents=True, exist_ok=True)
# Write first so the file exists before it is read back.
p.write_text("data", encoding="utf-8")  # write text (creates or overwrites the file)
p.read_text(encoding="utf-8")           # read the text back

# 遍历目录
for f in Path(".").glob("*.py"):
    print(f)

for f in Path(".").rglob("*.py"):  # 递归
    print(f)

8.8 注意事项

🔴 注意：

Python 字符串是不可变的，所有方法都返回新字符串
正则表达式中使用原始字符串 r"..." 避免转义问题
解码来源不可靠的字节时，可指定 errors 参数（如 "replace"）避免 UnicodeDecodeError；注意 "ignore" 会静默丢弃无法解码的字节
re.match() 只从开头匹配，re.search() 搜索整个字符串

💡 提示：

优先使用 f-string，它更快更可读
正则表达式复杂时使用 re.VERBOSE 标志添加注释
路径操作使用 pathlib 而非 os.path
处理中文文本注意编码一致性

📌 业务场景：

import re
from dataclasses import dataclass
from itertools import zip_longest

# Patterns are compiled once at import time and reused for every line.
_EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.]+")
# The digit guards stop an 11-digit match from being cut out of a longer number.
_PHONE_RE = re.compile(r"(?<!\d)1[3-9]\d{9}(?!\d)")
# A name is 2-4 Chinese characters following a "联系人" / "姓名" label and a colon.
_NAME_RE = re.compile(r"(?:联系人|姓名)[：:]\s*([\u4e00-\u9fff]{2,4})")


@dataclass
class Contact:
    """One contact extracted from free text."""

    name: str   # "未知" when no labelled name was found
    email: str  # "" when no e-mail address was found
    phone: str  # "" when no mobile number was found


def extract_contacts(text: str) -> list[Contact]:
    """Extract contact records from *text*.

    Each line is treated as its own record, so a field missing from one
    contact cannot shift the e-mail or phone of a later contact onto it.
    When a single line holds several names, e-mails or phone numbers, they
    are paired by their order within that line.

    Args:
        text: Free text; the fields of one contact are expected on one line.

    Returns:
        The contacts in order of appearance. Lines without any name, e-mail
        or phone number contribute nothing.
    """
    contacts = []
    for line in text.splitlines():
        names = _NAME_RE.findall(line)
        emails = _EMAIL_RE.findall(line)
        phones = _PHONE_RE.findall(line)
        # zip_longest pads the shorter lists with None; empty lists yield nothing.
        for name, email, phone in zip_longest(names, emails, phones):
            contacts.append(Contact(
                name="未知" if name is None else name,
                email=email or "",
                phone=phone or "",
            ))
    return contacts


text = """
联系人：张三，电话：13812345678，邮箱：zhangsan@example.com
联系人：李四，电话：13987654321，邮箱：lisi@example.com
"""

for contact in extract_contacts(text):
    print(f"{contact.name}: {contact.email}, {contact.phone}")
# 张三: zhangsan@example.com, 13812345678
# 李四: lisi@example.com, 13987654321

Python 编程教程 / 08 - 字符串与文本处理

第 08 章：字符串与文本处理

8.1 字符串基础

8.1.1 创建方式

8.1.2 字符串方法

8.2 f-string 格式化

8.2.1 基本用法

8.2.2 数字格式化

8.2.3 Python 3.12+ 改进

8.3 其他格式化方式

8.4 正则表达式

8.4.1 基本匹配

8.4.2 常用模式

8.4.3 实用模式

8.4.4 分组与捕获

8.4.5 编译正则

8.4.6 re 模块函数

8.5 编码与 Unicode

8.5.1 字符编码基础

8.5.2 Unicode 操作

8.6 文本处理技巧

8.6.1 多行文本

8.6.2 字符串模板

8.6.3 difflib 文本对比

8.7 路径处理

8.8 注意事项

8.9 扩展阅读