Tesseract OCR 完整教程 / 第 10 章：精度优化

第 10 章：精度优化

全面的精度优化策略，让 OCR 结果更准确。

10.1 精度优化总览

精度优化层次
├── 1. 输入优化（效果最大）
│   ├── 分辨率：300 DPI
│   ├── 清晰度：去模糊
│   ├── 对比度：黑白分明
│   └── 倾斜：校正
├── 2. 预处理优化
│   ├── 二值化
│   ├── 去噪
│   └── 缩放
├── 3. 识别参数优化
│   ├── PSM 模式
│   ├── OEM 模式
│   └── 语言选择
├── 4. 后处理优化
│   ├── 自定义词典
│   ├── 字符白名单
│   └── 拼写检查
└── 5. 模型优化（效果持久）
    ├── 微调训练
    └── 领域适配

10.1.1 各因素影响程度

因素	影响度	实施难度	见效速度
分辨率	⭐⭐⭐⭐⭐	低	立即
倾斜校正	⭐⭐⭐⭐	低	立即
二值化	⭐⭐⭐⭐	中	立即
PSM 选择	⭐⭐⭐	低	立即
语言配置	⭐⭐⭐	低	立即
字符白名单	⭐⭐⭐	低	立即
自定义词典	⭐⭐⭐	中	立即
模型微调	⭐⭐⭐⭐⭐	高	慢

10.2 分辨率与图片质量

10.2.1 分辨率检查与调整

from PIL import Image
import cv2

def check_and_fix_dpi(image_path, target_dpi=300):
    """Report an image's DPI and pixel size, enlarging it when it is small.

    Args:
        image_path: Path of the image to inspect.
        target_dpi: Accepted for API symmetry; this function never reads it.

    Returns:
        The opened PIL image, or an enlarged copy when the width is below
        1000 px or the height is below 100 px.
    """
    image = Image.open(image_path)

    # Images without DPI metadata are reported as 72 x 72.
    current_dpi = image.info.get('dpi', (72, 72))
    print(f"当前 DPI: {current_dpi}")

    width, height = image.size
    print(f"尺寸: {width} x {height}")

    # Rule of thumb from the tutorial: 12pt text is about 50 px tall at 300 DPI.
    large_enough = width >= 1000 and height >= 100
    if large_enough:
        return image

    print("⚠️ 图片太小，建议放大 2-3 倍")
    # Enlarge by at least 2x, or more if needed to reach 1000 px wide / 300 px tall.
    factor = max(1000 / width, 300 / height, 2.0)
    target_size = (int(width * factor), int(height * factor))
    enlarged = image.resize(target_size, Image.LANCZOS)
    print(f"已放大到: {target_size[0]} x {target_size[1]}")
    return enlarged

# 调整 DPI
from PIL import Image

def set_dpi(input_path, output_path, dpi=300):
    """Re-save an image with its DPI metadata set to ``dpi`` on both axes.

    Args:
        input_path: Image to read.
        output_path: Where the re-saved image is written.
        dpi: Value stored for both horizontal and vertical resolution.
    """
    resolution = (dpi, dpi)
    Image.open(input_path).save(output_path, dpi=resolution)
    print(f"已设置 DPI: {dpi}")

10.2.2 图片质量检测

import cv2
import numpy as np

def assess_image_quality(image_path):
    """Print and return simple quality metrics for an image.

    Four metrics are computed on the grayscale image: sharpness (variance of
    the Laplacian), contrast (standard deviation), brightness (mean) and a
    noise estimate (mean absolute Laplacian). Each metric that passes its
    threshold contributes 25 points to the overall score.

    Note that the printed labels and the score use different thresholds for
    contrast (60/30 vs. 40) and noise (5/15 vs. 10).

    Args:
        image_path: Path of the image to assess.

    Returns:
        dict with keys 'sharpness', 'contrast', 'brightness', 'noise', 'score'.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"无法读取图片: {image_path}")

    # The Laplacian feeds both the sharpness and the noise metric; compute it once.
    laplacian = cv2.Laplacian(img, cv2.CV_64F)

    # 1. Sharpness (variance of the Laplacian)
    laplacian_var = laplacian.var()
    sharpness = "清晰" if laplacian_var > 100 else "模糊"

    # 2. Contrast
    contrast = np.std(img)
    contrast_level = "高" if contrast > 60 else "中" if contrast > 30 else "低"

    # 3. Brightness
    brightness = np.mean(img)
    brightness_level = "正常" if 80 < brightness < 200 else "偏暗" if brightness <= 80 else "偏亮"

    # 4. Noise estimate
    noise = np.mean(np.abs(laplacian))
    noise_level = "低" if noise < 5 else "中" if noise < 15 else "高"

    print(f"清晰度: {sharpness} ({laplacian_var:.1f})")
    print(f"对比度: {contrast_level} ({contrast:.1f})")
    print(f"亮度: {brightness_level} ({brightness:.1f})")
    print(f"噪声: {noise_level} ({noise:.1f})")

    # Overall score: 25 points per passed check.
    checks = (
        laplacian_var > 100,
        contrast > 40,
        80 < brightness < 200,
        noise < 10,
    )
    score = 25 * sum(bool(passed) for passed in checks)

    print(f"\n综合质量评分: {score}/100")

    return {
        'sharpness': laplacian_var,
        'contrast': contrast,
        'brightness': brightness,
        'noise': noise,
        'score': score
    }

10.3 字符白名单与黑名单

10.3.1 基本用法

import pytesseract
from PIL import Image

# Load the sample image once and reuse it for every call below.
img = Image.open('test.png')

# Recognize digits only
numbers = pytesseract.image_to_string(
    img, config='-c tessedit_char_whitelist=0123456789'
)

# Recognize digits and ASCII letters only
alphanumeric = pytesseract.image_to_string(
    img, config='-c tessedit_char_whitelist=0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
)

# Chinese numerals and currency units, using the Simplified Chinese model
cn_numbers = pytesseract.image_to_string(
    img, lang='chi_sim',
    config='-c tessedit_char_whitelist=零一二三四五六七八九十百千万亿元角分'
)

# Exclude specific characters via a blacklist
exclude = pytesseract.image_to_string(
    img, config='-c tessedit_char_blacklist=|[]{}'
)

10.3.2 场景化白名单

# Predefined character whitelists keyed by scenario name.
# Each value is the literal set of characters passed to tessedit_char_whitelist.
WHITELISTS = {
    # ID card number: digits plus 'X'
    '身份证号': '0123456789X',
    # Phone number: digits plus '+'
    '手机号': '0123456789+',
    # Licence plate: digits, Latin capitals (the list omits I and O) and province characters
    '车牌号': '0123456789ABCDEFGHJKLMNPQRSTUVWXYZ京津沪渝冀豫云辽黑湘皖鲁新苏浙赣鄂桂甘晋蒙陕吉闽贵粤川青藏琼宁',
    # Bank card number: digits only
    '银行卡号': '0123456789',
    # Amount: digits, decimal point and comma
    '金额': '0123456789.,',
    # Date: digits with '-' and '/' separators
    '日期': '0123456789-/',
    # E-mail: digits, ASCII letters and '@', '.', '_', '-'
    '邮箱': '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz@._-',
    # NOTE(review): this entry lists only 20 characters, so it looks like a sample
    # rather than a usable "Chinese only" whitelist - confirm intent.
    '纯中文': '的一是不了人我在有他这为之大来以个中上们',
    # Chinese numerals
    '中文数字': '零一二三四五六七八九十百千万亿',
}

def ocr_with_whitelist(image_path, whitelist_name, lang='chi_sim+eng'):
    """Run single-line OCR restricted to one of the predefined whitelists.

    Args:
        image_path: Path of the image to recognize.
        whitelist_name: Key into WHITELISTS; an unknown key yields an empty
            whitelist value in the config string.
        lang: Tesseract language string.

    Returns:
        The recognized text with surrounding whitespace removed.
    """
    allowed_chars = WHITELISTS.get(whitelist_name, '')
    # PSM 7 treats the image as a single text line.
    tess_config = f'--psm 7 -c tessedit_char_whitelist={allowed_chars}'

    recognized = pytesseract.image_to_string(
        Image.open(image_path), lang=lang, config=tess_config
    )
    return recognized.strip()

10.4 自定义词典

10.4.1 创建用户词典

# Create the user word list file (one word per line)
cat > user-words.txt << 'EOF'
张三
李四
北京大学
人工智能
机器学习
深度学习
EOF

10.4.2 使用用户词典

# Run OCR with the user word list and the user pattern file
tesseract image.png output -l chi_sim+eng \
    --user-words user-words.txt \
    --user-patterns user-patterns.txt

import pytesseract
from PIL import Image

def ocr_with_custom_dict(image_path, words_file, lang='chi_sim+eng'):
    """Run OCR with a user-supplied word list.

    Args:
        image_path: Path of the image to recognize.
        words_file: Path of the word list file (one word per line).
        lang: Tesseract language string.

    Returns:
        The recognized text as returned by pytesseract.
    """
    import shlex  # local import keeps this snippet self-contained

    # pytesseract splits the config string shell-style, so an unquoted path
    # containing spaces would be broken into several arguments.
    config = f'--user-words {shlex.quote(str(words_file))}'

    # The context manager closes the image file once OCR is done.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img, lang=lang, config=config)

10.4.3 用户模式文件

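# user-patterns.txt - 每行一个模式
# 注意：Tesseract 的模式文件不支持正则表达式，而是使用专用占位符：
#   \d 数字，\c 字母，\a 小写字母，\A 大写字母，\n 数字或字母，\p 标点，\* 重复前一项
# 下面三行依次为：日期格式、手机号、编号格式
# （以上说明仅供阅读，实际文件中只保留模式行，不要写注释）
\d\d\d\d-\d\d-\d\d
\d\d\d\d\d\d\d\d\d\d\d
\A\A\d\d\d\d\d\d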

10.5 页面分割模式优化

10.5.1 自动 PSM 选择

import pytesseract
from PIL import Image

def auto_select_psm(image_path, lang='chi_sim+eng'):
    """Pick a page segmentation mode (PSM) for an image.

    Simple shape heuristics decide the obvious cases. Otherwise several
    modes are tried and the one that yields the most text wins.

    Args:
        image_path: Path of the image to inspect.
        lang: Tesseract language string used for the trial runs.

    Returns:
        The chosen PSM number. Falls back to 3 when no trial produces text.
    """
    # The context manager closes the image file on every return path.
    with Image.open(image_path) as img:
        w, h = img.size
        aspect_ratio = w / h

        if aspect_ratio > 5:
            # Very wide: probably a single line
            return 7
        if aspect_ratio < 0.2:
            # Very tall: probably vertical text
            return 5
        if w < 100 and h < 100:
            # Tiny image: probably a single character
            return 10
        if w < 200:
            # Narrow image: probably a single word
            return 8

        # No clear shape hint: try several modes and keep the one with most text.
        best_psm = 3
        best_len = 0
        for psm in (3, 4, 6, 11):
            try:
                text = pytesseract.image_to_string(
                    img, lang=lang, config=f'--psm {psm}'
                )
            except pytesseract.TesseractError:
                # A mode that fails on this image is simply not a candidate.
                # Other errors (e.g. Tesseract not installed) propagate
                # instead of silently returning the default.
                continue

            text_len = len(text.strip())
            if text_len > best_len:
                best_len = text_len
                best_psm = psm

        return best_psm

# Example: choose a PSM for a sample image and print the recommendation
psm = auto_select_psm('test.png')
print(f"推荐 PSM: {psm}")

10.5.2 PSM 对比测试

def compare_psm(image_path, lang='chi_sim+eng'):
    """Run OCR under several PSM modes and print a comparison table.

    Each successful mode is scored by the number of CJK characters plus
    ASCII letters it recognized. The highest-scoring mode is printed as the
    recommendation.

    Args:
        image_path: Path of the image to recognize.
        lang: Tesseract language string.

    Returns:
        dict mapping PSM number to either
        {'text', 'length', 'cn_chars', 'en_chars', 'score'} or {'error'}.
    """
    results = {}

    # The context manager closes the image file once all modes have run.
    with Image.open(image_path) as img:
        for psm in [3, 4, 5, 6, 7, 11]:
            try:
                text = pytesseract.image_to_string(
                    img, lang=lang, config=f'--psm {psm}'
                ).strip()
            except Exception as e:
                # Record the failure so it shows up in the table.
                results[psm] = {'error': str(e)}
                continue

            # Quality indicators
            cn_chars = sum(1 for c in text if '\u4e00' <= c <= '\u9fff')
            en_chars = sum(1 for c in text if c.isascii() and c.isalpha())

            results[psm] = {
                'text': text[:100],
                'length': len(text),
                'cn_chars': cn_chars,
                'en_chars': en_chars,
                'score': cn_chars + en_chars  # simple score
            }

    # Print the table
    print("PSM 模式对比:")
    print("-" * 60)
    for psm, info in sorted(results.items()):
        if 'error' in info:
            print(f"PSM {psm}: 错误 - {info['error']}")
        else:
            # Flatten newlines so each mode stays on one table row.
            preview = info['text'][:40].replace('\n', ' ')
            print(f"PSM {psm}: 长度={info['length']:4d}, "
                  f"中文={info['cn_chars']:3d}, 英文={info['en_chars']:3d} "
                  f"| {preview}...")

    # Recommend only among modes that actually produced a result.
    succeeded = {psm: info for psm, info in results.items() if 'error' not in info}
    if succeeded:
        best_psm = max(succeeded, key=lambda p: succeeded[p]['score'])
        print(f"\n推荐 PSM: {best_psm}")
    else:
        print("\n所有 PSM 模式均识别失败，无法推荐")

    return results

10.6 语言模型优化

10.6.1 语言组合策略

import pytesseract
from PIL import Image

def optimized_language_config(image_path):
    """Choose a Tesseract language string from the script detected in an image.

    Args:
        image_path: Path of the image to inspect.

    Returns:
        'eng' for Latin or unmapped scripts, otherwise '<language>+eng'.
    """
    # Strategy 1: detect the dominant script with Tesseract's OSD.
    osd_info = pytesseract.image_to_osd(
        Image.open(image_path), output_type=pytesseract.Output.DICT
    )
    detected_script = osd_info['script']

    lang_by_script = {
        'Han': 'chi_sim',
        'Latin': 'eng',
        'Japanese': 'jpn',
        'Korean': 'kor',
        'Arabic': 'ara',
        'Cyrillic': 'rus',
    }
    # Scripts without a mapping fall back to English.
    base_lang = lang_by_script.get(detected_script, 'eng')

    # Strategy 2: pair every non-English language with English, because
    # mixed text is common. English on its own needs no suffix.
    chosen = 'eng' if base_lang == 'eng' else f'{base_lang}+eng'

    print(f"检测脚本: {detected_script}, 使用语言: {chosen}")
    return chosen

10.6.2 使用 best 模型

# Download the high-accuracy ("best") models.
# Files in the tessdata_best repository are named <lang>.traineddata, so save
# them under a *_best name locally with -O.
cd /tmp
wget -O chi_sim_best.traineddata https://github.com/tesseract-ocr/tessdata_best/raw/main/chi_sim.traineddata
wget -O eng_best.traineddata https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata

# Back up both installed models before overwriting them
sudo cp /usr/share/tesseract-ocr/5/tessdata/chi_sim.traineddata \
        /usr/share/tesseract-ocr/5/tessdata/chi_sim.traineddata.bak
sudo cp /usr/share/tesseract-ocr/5/tessdata/eng.traineddata \
        /usr/share/tesseract-ocr/5/tessdata/eng.traineddata.bak

# Replace the installed models with the best models
sudo mv chi_sim_best.traineddata /usr/share/tesseract-ocr/5/tessdata/chi_sim.traineddata
sudo mv eng_best.traineddata /usr/share/tesseract-ocr/5/tessdata/eng.traineddata

10.7 后处理优化

10.7.1 拼写检查

# pip install pyspellchecker
from spellchecker import SpellChecker

def spell_check_text(text, language='en'):
    """Spell-check the ASCII words of OCR output and leave everything else alone.

    Only purely alphabetic ASCII tokens are corrected. Chinese text and
    tokens containing digits or punctuation pass through unchanged.

    Args:
        text: OCR output; it is split on whitespace.
        language: Dictionary language for pyspellchecker. The default is
            'en' because only ASCII words are checked, and pyspellchecker
            ships no 'zh' dictionary.

    Returns:
        The tokens re-joined with single spaces.
    """
    spell = SpellChecker(language=language)

    corrected = []
    for word in text.split():
        if word.isascii() and word.isalpha():
            # correction() may return None when it has no candidate.
            corrected.append(spell.correction(word) or word)
        else:
            # Not a plain ASCII word: keep as is.
            corrected.append(word)

    return ' '.join(corrected)

10.7.2 正则表达式后处理

import re

def post_process_ocr(text):
    """Clean up raw OCR text with a few regex-based repairs.

    Steps, in order: re-join amounts split around the decimal point,
    normalize dates to YYYY-M-D, collapse whitespace, and collapse repeated
    Chinese full stops and commas.

    Look-alike characters ('0'/'O', 'l'/'1', '|'/'I') are deliberately not
    substituted here. Which one is right depends on context, and a global
    replacement would corrupt correct text.

    Args:
        text: Raw OCR output.

    Returns:
        The cleaned text with surrounding whitespace removed.
    """
    # 1. Amounts: "12 . 50" -> "12.50"
    text = re.sub(r'(\d+)\s*[.]\s*(\d{2})', r'\1.\2', text)

    # 2. Dates: "2024年1月5日", "2024 / 01 / 05" -> "2024-1-5", "2024-01-05"
    def _normalize_date(match):
        year, month, month_sep, day, day_suffix = match.groups()
        # Drop a trailing 日 only for the Chinese form, so it does not dangle
        # after the normalized date. Otherwise keep what follows untouched.
        tail = '' if month_sep == '月' else (day_suffix or '')
        return f'{year}-{month}-{day}{tail}'

    text = re.sub(
        r'(\d{4})\s*[-/年]\s*(\d{1,2})\s*([-/月])\s*(\d{1,2})(\s*日)?',
        _normalize_date,
        text,
    )

    # 3. Collapse runs of whitespace (including newlines) to one space
    text = re.sub(r'\s+', ' ', text)

    # 4. Collapse runs of duplicated Chinese punctuation to a single mark
    text = re.sub(r'。{2,}', '。', text)
    text = re.sub(r'，{2,}', '，', text)

    return text.strip()

10.7.3 上下文纠正

def context_correction(text, field_type='general'):
    """Extract the value expected for a known field type from OCR text.

    Args:
        text: Raw OCR output for one field.
        field_type: One of 'phone', 'id_card', 'date', 'amount'. Any other
            value leaves the text untouched.

    Returns:
        The extracted value, or ``text`` unchanged when nothing matches.
    """
    if field_type == 'phone':
        # Phone number: a run of exactly 11 digits
        for digits in re.findall(r'\d+', text):
            if len(digits) == 11:
                return digits

    elif field_type == 'id_card':
        # ID card: 17 digits followed by a digit or X
        match = re.search(r'\d{17}[\dXx]', text)
        if match:
            return match.group().upper()

    elif field_type == 'date':
        # Date with -, / or 年/月 separators
        match = re.search(r'\d{4}[-/年]\d{1,2}[-/月]\d{1,2}', text)
        if match:
            return match.group()

    elif field_type == 'amount':
        # Amount: must start with a digit, so a stray comma earlier in the
        # text cannot match on its own and produce an empty result.
        match = re.search(r'\d[\d,]*(?:\.\d+)?', text.replace('，', ','))
        if match:
            return match.group().replace(',', '')

    return text

10.8 高级参数调优

10.8.1 Tesseract 内部参数

import pytesseract
from PIL import Image

def ocr_with_tuned_params(image_path, lang='chi_sim+eng'):
    """Run OCR with a fixed set of tuned Tesseract options.

    Args:
        image_path: Path of the image to recognize.
        lang: Tesseract language string.

    Returns:
        The recognized text as returned by pytesseract.
    """
    options = (
        '--oem 1',                                # LSTM engine
        '--psm 6',                                # single uniform text block
        '-c tessedit_char_blacklist=|[]{}()',     # characters to suppress
        # NOTE(review): confirm tessedit_min_confidence exists in the
        # installed version (check `tesseract --print-parameters`).
        '-c tessedit_min_confidence=50',          # minimum confidence
        '-c preserve_interword_spaces=1',         # keep spacing between words
        '-c textord_heavy_nr=1',                  # noise handling
        # NOTE(review): this repeats --psm 6 above - presumably redundant.
        '-c tessedit_pageseg_mode=6',             # page segmentation
        '-c tessedit_do_invert=0',                # do not invert the image
        '-c language_model_ngram_on=0',           # disable n-gram model (faster)
    )
    tess_config = ' '.join(options)

    return pytesseract.image_to_string(
        Image.open(image_path), lang=lang, config=tess_config
    )

10.8.2 参数列表

参数	说明	默认值	调优建议
`tessedit_min_confidence`	最小置信度	-1	50-80
`preserve_interword_spaces`	保留空格	0	设为 1
`textord_heavy_nr`	噪声处理	0	噪声大时设为 1
`tessedit_do_invert`	反转处理	1	白底黑字设为 0
`language_model_ngram_on`	N-gram 模型	1	速度优先设为 0
`numeric_punctuation`	数字标点	`.,`	根据语言调整
`tessedit_write_images`	输出调试图	0	调试时设为 1

10.9 A/B 测试框架

import pytesseract
from PIL import Image
import os

def ab_test(image_dir, gt_dir, configs):
    """Compare OCR character accuracy across Tesseract configurations.

    For every '.png' / '.tif' image in ``image_dir`` that has a matching
    '<name>.gt.txt' ground-truth file in ``gt_dir``, the image is recognized
    with each configuration and compared to the ground truth.

    Accuracy is matched characters divided by the longer of the two texts.
    Matching uses sequence alignment, so one inserted or dropped character
    does not shift every later comparison, and missing or extra characters
    count as errors.

    Args:
        image_dir: Directory containing the test images.
        gt_dir: Directory containing the ground-truth text files.
        configs: dict mapping a configuration name to a Tesseract config string.

    Returns:
        dict mapping configuration name to accuracy in percent
        (0 when nothing could be evaluated).
    """
    import difflib  # local import keeps this snippet self-contained

    results = {}

    for config_name, config in configs.items():
        total_chars = 0
        correct_chars = 0

        # Sorted for a deterministic processing order.
        for filename in sorted(os.listdir(image_dir)):
            if not filename.endswith(('.png', '.tif')):
                continue

            # Read the ground truth; images without one are skipped.
            gt_path = os.path.join(gt_dir, filename.rsplit('.', 1)[0] + '.gt.txt')
            if not os.path.exists(gt_path):
                continue

            with open(gt_path, 'r', encoding='utf-8') as f:
                gt_text = f.read().strip()

            # OCR; the context manager closes the image file afterwards.
            img_path = os.path.join(image_dir, filename)
            with Image.open(img_path) as img:
                try:
                    pred_text = pytesseract.image_to_string(img, config=config).strip()
                except pytesseract.TesseractError as exc:
                    # Best effort: report the failure and move on.
                    print(f"跳过 {filename}: {exc}")
                    continue

            # Count aligned matching characters instead of comparing by position.
            matcher = difflib.SequenceMatcher(None, gt_text, pred_text, autojunk=False)
            correct_chars += sum(block.size for block in matcher.get_matching_blocks())
            total_chars += max(len(gt_text), len(pred_text))

        accuracy = correct_chars / total_chars * 100 if total_chars > 0 else 0
        results[config_name] = accuracy
        print(f"{config_name}: {accuracy:.1f}%")

    return results

# Compare several PSM modes on the same labelled image set
configs = {
    'psm3': '--psm 3',
    'psm4': '--psm 4',
    'psm6': '--psm 6',
    # NOTE(review): 'psm6_best' only adds tessedit_min_confidence=50 to PSM 6;
    # the name does not refer to the tessdata_best models.
    'psm6_best': '--psm 6 -c tessedit_min_confidence=50',
}

results = ab_test('./images', './ground_truth', configs)
# Report the configuration with the highest accuracy
best = max(results.items(), key=lambda x: x[1])
print(f"\n最佳配置: {best[0]} ({best[1]:.1f}%)")

10.10 精度优化检查清单

输入检查
□ 分辨率 ≥ 300 DPI
□ 图片清晰（拉普拉斯方差 > 100）
□ 对比度足够（标准差 > 40）
□ 亮度正常（均值 80-200）
□ 已校正倾斜

预处理检查
□ 已灰度化
□ 已二值化（Otsu 或自适应）
□ 已去噪
□ 已去除边框

参数检查
□ PSM 模式正确
□ 语言配置正确
□ 使用 best 模型（如需高精度）
□ 字符白名单/黑名单已设置

后处理检查
□ 已过滤低置信度结果
□ 已纠正常见 OCR 错误
□ 已进行格式化处理

10.11 本章小结

策略	效果	推荐度
提高分辨率	⭐⭐⭐⭐⭐	必做
倾斜校正	⭐⭐⭐⭐	强烈推荐
字符白名单	⭐⭐⭐	特定场景
best 模型	⭐⭐⭐⭐	高精度场景
参数调优	⭐⭐⭐	需要测试
后处理	⭐⭐⭐	推荐

10.12 扩展阅读

上一章: 版面分析 | 下一章: Docker 部署