Tesseract OCR 完整教程 / 第 6 章：模型训练

第 6 章：模型训练

从零开始训练或微调 Tesseract LSTM 模型。

6.1 训练概述

训练流程总览
├── 1. 数据准备
│   ├── 收集图片
│   ├── 生成 Ground Truth
│   └── 数据增强
├── 2. 训练配置
│   ├── 设置参数
│   ├── 准备语言文件
│   └── 创建训练脚本
├── 3. 模型训练
│   ├── LSTM 训练
│   ├── 旧引擎训练（可选）
│   └── 合并模型
├── 4. 评估迭代
│   ├── 计算准确率
│   ├── 分析错误
│   └── 调整参数
└── 5. 部署使用
    └── 安装到 tessdata

6.1.1 何时需要训练

场景	是否需要训练	替代方案
标准印刷体英文	❌	使用默认模型
标准印刷体中文	❌	使用默认模型
特殊字体（如等宽、手写）	✅	微调模型
古籍/历史文档	✅	专用训练
特定领域（医学、法律）	✅	领域微调
新语言/方言	✅	从零训练

6.1.2 训练类型

类型	数据量	时间	精度	适用场景
从零训练	大（10万+）	天级	需迭代	新语言
微调	中（1千-1万）	小时级	较好	特定字体/领域
合并训练	中	小时级	好	多种场景混合

6.2 环境准备

6.2.1 安装训练工具

# Ubuntu/Debian: runtime, development headers and Leptonica tools
sudo apt install tesseract-ocr libtesseract-dev libleptonica-dev leptonica-progs

# Extra libraries needed only to build the training tools
sudo apt install libicu-dev libpango1.0-dev libcairo2-dev

# Build Tesseract and the training tools from source
git clone https://github.com/tesseract-ocr/tesseract.git
cd tesseract
./autogen.sh
mkdir build && cd build
../configure
make -j"$(nproc)"
sudo make install
# The training tools are a separate target: they have to be built with
# "make training" before "make training-install" can install them.
make training -j"$(nproc)"
sudo make training-install
sudo ldconfig

# Verify that the training tools are on PATH
lstmtraining --version
combine_tessdata --version
unicharset_extractor --version

6.2.2 辅助工具

# 安装 Python 依赖
pip install pytesseract Pillow opencv-python numpy tqdm

# 安装 jTessBoxEditor（GUI 标注工具）
# 下载地址：https://github.com/nguyenq/jTessBoxEditorFX

6.2.3 训练数据目录结构

training/
├── ground-truth/          # Ground Truth 文件
│   ├── 0001.tif           # 训练图片
│   ├── 0001.gt.txt        # 对应文本
│   ├── 0002.tif
│   ├── 0002.gt.txt
│   └── ...
├── eval/                  # 评估数据
│   ├── eval_0001.tif
│   ├── eval_0001.gt.txt
│   └── ...
├── output/                # 输出目录
└── langdata/              # 语言数据
    ├── mylang.config
    ├── mylang.punc
    ├── mylang.numbers
    └── mylang.wordlist

6.3 数据准备

6.3.1 收集训练图片

图片要求：

要求	推荐值	最小值
分辨率	300 DPI	200 DPI
图片数量	1000+	100
文字大小	20-40px	12px
格式	TIFF/PNG	PNG
颜色	灰度/二值	灰度

# 图片批量转换为 TIFF
import os
from PIL import Image

def convert_to_tiff(input_dir, output_dir):
    """Convert every raster image in *input_dir* to a grayscale LZW TIFF.

    Output files are numbered consecutively (``0001.tif``, ``0002.tif``, ...)
    following the sorted order of the source file names.

    Args:
        input_dir: Directory scanned (non-recursively) for .png, .jpg, .jpeg
            and .bmp files.
        output_dir: Directory that receives the TIFF files; created if it
            does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Compare extensions case-insensitively so names such as ``IMG_01.JPG``
    # are not silently skipped.
    sources = [
        name for name in sorted(os.listdir(input_dir))
        if name.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp'))
    ]

    # Number only the files that are really converted; counting skipped
    # directory entries would leave gaps in the 0001, 0002, ... sequence.
    for index, filename in enumerate(sources, start=1):
        output_path = os.path.join(output_dir, f'{index:04d}.tif')
        # The context manager releases the source file handle right away.
        with Image.open(os.path.join(input_dir, filename)) as img:
            img.convert('L').save(output_path, compression='tiff_lzw')
        print(f"转换: {filename} → {output_path}")

convert_to_tiff('./raw_images', './ground-truth')

6.3.2 生成 Ground Truth 文本

Ground Truth 文件是每张图片的正确文本，命名规则：{image_name}.gt.txt

# 示例：0001.tif 对应 0001.gt.txt
echo "Hello World" > ground-truth/0001.gt.txt
echo "这是测试文本" > ground-truth/0002.gt.txt

# 批量生成 Ground Truth 模板
import os

def create_gt_template(image_dir):
    """Create an empty ``<name>.gt.txt`` next to every ``.tif`` in *image_dir*.

    Ground-truth files that already exist are left untouched, so the function
    can be re-run after some transcriptions have been filled in.

    Args:
        image_dir: Directory that contains the training images.
    """
    suffix = '.tif'
    for filename in sorted(os.listdir(image_dir)):
        if not filename.endswith(suffix):
            continue

        # Swap only the trailing extension. str.replace() would also rewrite
        # a ".tif" that happens to occur in the middle of the file name.
        gt_filename = filename[:-len(suffix)] + '.gt.txt'
        gt_path = os.path.join(image_dir, gt_filename)

        try:
            # Mode 'x' fails instead of truncating when the file exists,
            # which protects transcriptions that were already written.
            with open(gt_path, 'x', encoding='utf-8'):
                pass  # empty file, to be filled in by hand
        except FileExistsError:
            continue
        print(f"创建模板: {gt_filename}")

create_gt_template('./ground-truth')

6.3.3 数据增强

import cv2
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter
import random

def augment_image(image_path, output_dir, num_augments=5):
    """Write *num_augments* randomly perturbed copies of one training image.

    Each copy gets exactly one randomly chosen perturbation (small rotation,
    Gaussian noise, blur, contrast change or rescale) and is saved as
    ``<name>_aug<k>.tif`` in *output_dir*. If ``<name>.gt.txt`` exists next to
    the source image it is copied alongside every variant.

    Args:
        image_path: Path of the source image.
        output_dir: Directory for the augmented files; created if missing.
        num_augments: Number of variants to generate.
    """
    # Local imports keep this snippet usable when copied on its own.
    import os
    import shutil

    os.makedirs(output_dir, exist_ok=True)

    # Derive the ground-truth path from the extension only; str.replace()
    # would also rewrite a ".tif" occurring inside a directory name.
    stem = os.path.splitext(image_path)[0]
    base_name = os.path.basename(stem)
    gt_path = stem + '.gt.txt'
    has_gt = os.path.exists(gt_path)

    # copy() forces the pixel data to load so the file can be closed early.
    with Image.open(image_path) as source:
        img = source.copy()

    for i in range(num_augments):
        aug_img = img.copy()

        # Pick one perturbation per variant.
        choice = random.choice(['rotate', 'noise', 'blur', 'contrast', 'scale'])

        if choice == 'rotate':
            angle = random.uniform(-5, 5)
            aug_img = aug_img.rotate(angle, fillcolor='white')

        elif choice == 'noise':
            # Add the noise in floating point and clip afterwards. Casting
            # the noise to uint8 first turns negative values into large
            # positive ones and lets the sum wrap around modulo 256.
            # NOTE(review): assumes an 8-bit image (mode L or RGB) — confirm.
            pixels = np.asarray(aug_img, dtype=np.float32)
            noise = np.random.normal(0, 15, pixels.shape)
            noisy = np.clip(pixels + noise, 0, 255).astype(np.uint8)
            aug_img = Image.fromarray(noisy)

        elif choice == 'blur':
            radius = random.uniform(0.5, 1.5)
            aug_img = aug_img.filter(ImageFilter.GaussianBlur(radius))

        elif choice == 'contrast':
            factor = random.uniform(0.8, 1.3)
            aug_img = ImageEnhance.Contrast(aug_img).enhance(factor)

        elif choice == 'scale':
            scale = random.uniform(0.9, 1.1)
            w, h = aug_img.size
            # Never let a tiny image collapse to a zero-sized one.
            aug_img = aug_img.resize((max(1, int(w * scale)),
                                      max(1, int(h * scale))))

        output_path = os.path.join(output_dir, f'{base_name}_aug{i+1}.tif')
        aug_img.save(output_path, compression='tiff_lzw')

        # The text content is unchanged by the perturbations, so the
        # transcription is copied verbatim.
        if has_gt:
            shutil.copy(gt_path, os.path.splitext(output_path)[0] + '.gt.txt')

6.3.4 使用现有文本生成训练图片

from PIL import Image, ImageDraw, ImageFont
import os

def text_to_image(text, font_path, output_path, font_size=32):
    """Render one line of text to a grayscale TIFF for training.

    The canvas is at least 800x60 pixels and grows when the rendered text
    would not fit, so long lines or large fonts are no longer cut off.

    Args:
        text: The line to render.
        font_path: Path of a TrueType/OpenType font file.
        output_path: Destination image path.
        font_size: Font size passed to ``ImageFont.truetype``.
    """
    margin = 10  # offset of the text origin, also kept free on the far side

    font = ImageFont.truetype(font_path, font_size)

    # getbbox() gives (left, top, right, bottom) relative to the drawing
    # origin; right/bottom therefore tell how far the ink extends.
    _, _, right, bottom = font.getbbox(text)
    width = max(800, int(right) + 2 * margin)
    height = max(60, int(bottom) + 2 * margin)

    # White background, black text.
    img = Image.new('L', (width, height), color=255)
    ImageDraw.Draw(img).text((margin, margin), text, fill=0, font=font)

    img.save(output_path, compression='tiff_lzw')

def generate_training_data(text_file, font_dir, output_dir):
    """Render every non-empty corpus line with every font found in *font_dir*.

    For each (line, font) pair a numbered image ``NNNN.tif`` and a matching
    ``NNNN.gt.txt`` containing the line are written to *output_dir*.

    Args:
        text_file: UTF-8 text file with one training line per row.
        font_dir: Directory searched recursively for .ttf/.otf files.
        output_dir: Destination directory; created if missing.

    Raises:
        FileNotFoundError: If no font file is found under *font_dir*.
    """
    os.makedirs(output_dir, exist_ok=True)

    with open(text_file, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]

    # System font directories keep the fonts in per-family sub-directories,
    # so a flat os.listdir() finds nothing there. Walk the tree instead and
    # sort the result so the numbering is reproducible between runs.
    fonts = sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(font_dir)
        for name in names
        if name.lower().endswith(('.ttf', '.otf'))
    )
    if not fonts:
        # Without fonts the loop below would silently produce no data at all.
        raise FileNotFoundError(f"no .ttf/.otf fonts found under {font_dir!r}")

    idx = 1
    for line in lines:
        for font_path in fonts:
            output_path = os.path.join(output_dir, f'{idx:04d}.tif')
            text_to_image(line, font_path, output_path)

            # Replace the extension only; a ".tif" inside the directory part
            # of the path must stay as it is.
            gt_path = os.path.splitext(output_path)[0] + '.gt.txt'
            with open(gt_path, 'w', encoding='utf-8') as gt_file:
                gt_file.write(line)

            idx += 1

# 使用示例
generate_training_data(
    text_file='corpus.txt',
    font_dir='/usr/share/fonts/truetype/',
    output_dir='./ground-truth'
)

6.4 语言配置文件

6.4.1 创建语言配置

# mylang.config holds optional Tesseract parameters for the language,
# written as one "name value" pair per line.
mkdir -p langdata

cat > langdata/mylang.config << 'EOF'
preserve_interword_spaces 1
EOF

6.4.2 创建标点符号文件

cat > langdata/mylang.punc << 'EOF'
.
,
!
?
;
:
'
"
(
)
[
]
{
}
-
_
/
\
@
#
$
%
&
*
+
=
<
>
|
~
^
EOF

6.4.3 创建数字文件

cat > langdata/mylang.numbers << 'EOF'
0
1
2
3
4
5
6
7
8
9
EOF

6.4.4 创建词表

# Build a frequency-ranked word list from the corpus.
# The word list must contain one bare word per line, so the counts that
# "uniq -c" prepends for sorting are stripped again before writing the file.
tr -s '[:space:]' '\n' < corpus.txt | sort | uniq -c | sort -rn \
    | awk '$2 != "" {print $2}' | head -10000 > langdata/mylang.wordlist

6.5 LSTM 训练

6.5.1 从现有模型微调

#!/bin/bash
# finetune.sh - fine-tune an existing LSTM model on line images + ground truth
set -euo pipefail

# Deliberately not called LANG: that name is the POSIX locale variable, and
# overwriting it changes the locale seen by every command below.
MODEL_NAME="mylang"
# Must contain the float model (tessdata_best). The integer "fast" models
# cannot be used as a --continue_from base.
TESSDATA="/usr/share/tesseract-ocr/5/tessdata"
TRAINING_DIR="./ground-truth"
OUTPUT_DIR="./output"
START_MODEL="eng"  # base model to fine-tune

mkdir -p "$OUTPUT_DIR"

# 1. Extract the LSTM network from the base model
combine_tessdata -e "$TESSDATA/$START_MODEL.traineddata" \
    "$OUTPUT_DIR/$START_MODEL.lstm"

# 2. Turn every line image into an .lstmf training sample.
#    lstm.train reads <name>.box next to <name>.tif, so the box files have to
#    be created from the .gt.txt transcriptions beforehand (for example with
#    generate_line_box.py from the tesstrain repository).
for img in "$TRAINING_DIR"/*.tif; do
    base="${img%.tif}"
    if [ ! -f "$base.box" ]; then
        echo "missing box file: $base.box" >&2
        exit 1
    fi
    tesseract "$img" "$base" --psm 13 lstm.train
done

# 3. lstmtraining expects a list of .lstmf files, not a list of images
ls "$TRAINING_DIR"/*.lstmf > "$OUTPUT_DIR/training_files.txt"

# 4. Train (fine-tune)
lstmtraining \
    --model_output "$OUTPUT_DIR/$MODEL_NAME" \
    --continue_from "$OUTPUT_DIR/$START_MODEL.lstm" \
    --traineddata "$TESSDATA/$START_MODEL.traineddata" \
    --train_listfile "$OUTPUT_DIR/training_files.txt" \
    --max_iterations 1000 \
    --learning_rate 0.001

# 5. Convert the final checkpoint into a .traineddata file.
#    The braces are required: without them the shell would expand a
#    different, unset variable whose name ends in "_checkpoint".
lstmtraining \
    --stop_training \
    --continue_from "$OUTPUT_DIR/${MODEL_NAME}_checkpoint" \
    --traineddata "$TESSDATA/$START_MODEL.traineddata" \
    --model_output "$OUTPUT_DIR/$MODEL_NAME.traineddata"

echo "训练完成: $OUTPUT_DIR/$MODEL_NAME.traineddata"

6.5.2 使用 tesstrain 脚本

# 克隆 tesstrain 工具
git clone https://github.com/tesseract-ocr/tesstrain.git
cd tesstrain

# 准备 Ground Truth
# 将训练数据放在 data/mylang-ground-truth/

# 开始训练
make training \
    MODEL_NAME=mylang \
    START_MODEL=eng \
    TESSDATA=/usr/share/tesseract-ocr/5/tessdata \
    GROUND_TRUTH_DIR=./data/mylang-ground-truth \
    MAX_ITERATIONS=5000

6.5.3 训练参数说明

参数	说明	推荐值
`--max_iterations`	最大迭代次数	1000-10000
`--learning_rate`	学习率	0.001-0.01
`--target_error_rate`	目标错误率（%），达到后停止训练（lstmtraining 没有 `--learning_rate_min` 参数）	0.01
`--momentum`	动量	0.5-0.9
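`--net_spec`	网络结构（VGSL 规格，须包含输入层和输出层；末尾 O1c 后的数字为 unicharset 的字符数）	`'[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c111]'`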
`--net_spec` 中的 `Lfx<N>`	隐藏层大小（在网络结构中指定；lstmtraining 没有单独的 `--num_hidden` 参数）	128-256

6.5.4 训练过程监控

# lstmtraining does not write a log file by itself; it reports progress on
# stdout/stderr. Capture that stream with tee while resuming training.
lstmtraining --model_output output/mylang \
    --continue_from output/mylang_checkpoint \
    --traineddata /usr/share/tesseract-ocr/5/tessdata/eng.traineddata \
    --train_listfile output/training_files.txt \
    --max_iterations 5000 \
    --learning_rate 0.001 \
    --debug_level 1 2>&1 | tee training.log

# Follow the captured log from a second terminal
tail -f training.log

# Typical progress line (the exact fields differ between Tesseract versions):
# At iteration 100/100/100, Mean rms=..., delta=..., BCER train=...%, BWER train=...%, skip ratio=0%, wrote checkpoint.

6.6 评估与迭代

6.6.1 计算准确率

import pytesseract
from PIL import Image
import os

def _count_matches(expected, actual):
    """Return ``(matched, total)`` for two sequences aligned with difflib."""
    from difflib import SequenceMatcher

    # autojunk=False: the popularity heuristic would drop frequent characters
    # from the alignment of longer texts and distort the score.
    matcher = SequenceMatcher(None, expected, actual, autojunk=False)
    matched = sum(block.size for block in matcher.get_matching_blocks())
    # Dividing by the longer sequence penalises both missing and extra items.
    return matched, max(len(expected), len(actual))


def evaluate_model(model_path, eval_dir, lang='mylang'):
    """Measure character and word accuracy of a model on an evaluation set.

    Every ``<name>.tif`` in *eval_dir* is recognised and compared with
    ``<name>.gt.txt``. Prediction and ground truth are aligned before
    counting, so one inserted or dropped character does not mark everything
    after it as wrong. Accuracy is ``matched / max(len(gt), len(pred))``
    summed over all files.

    Args:
        model_path: Path of the ``.traineddata`` file. When the file exists,
            its directory is passed to Tesseract as ``--tessdata-dir`` so the
            model can be evaluated before it is installed; otherwise the
            default tessdata location is used.
        eval_dir: Directory with the evaluation images and ground truth.
        lang: Language name handed to Tesseract.

    Returns:
        Tuple ``(char_accuracy, word_accuracy)`` in percent; both are 0 when
        there is nothing to compare.
    """
    config = ''
    if model_path and os.path.isfile(model_path):
        tessdata_dir = os.path.dirname(os.path.abspath(model_path))
        config = f'--tessdata-dir "{tessdata_dir}"'

    total_chars = 0
    correct_chars = 0
    total_words = 0
    correct_words = 0

    suffix = '.tif'
    for filename in sorted(os.listdir(eval_dir)):
        if not filename.endswith(suffix):
            continue

        # Replace the trailing extension only.
        gt_path = os.path.join(eval_dir, filename[:-len(suffix)] + '.gt.txt')
        with open(gt_path, 'r', encoding='utf-8') as f:
            gt_text = f.read().strip()

        # The context manager closes the image file after recognition.
        with Image.open(os.path.join(eval_dir, filename)) as img:
            pred_text = pytesseract.image_to_string(
                img, lang=lang, config=config).strip()

        matched, total = _count_matches(gt_text, pred_text)
        correct_chars += matched
        total_chars += total

        matched, total = _count_matches(gt_text.split(), pred_text.split())
        correct_words += matched
        total_words += total

    char_acc = correct_chars / total_chars * 100 if total_chars > 0 else 0
    word_acc = correct_words / total_words * 100 if total_words > 0 else 0

    print(f"字符准确率: {char_acc:.2f}%")
    print(f"词准确率: {word_acc:.2f}%")

    return char_acc, word_acc

# 使用
evaluate_model('output/mylang.traineddata', './eval')

6.6.2 错误分析

def error_analysis(eval_dir, lang='mylang'):
    """Collect every evaluation image whose OCR output differs from its text.

    Args:
        eval_dir: Directory holding ``<name>.tif`` images and the matching
            ``<name>.gt.txt`` ground-truth files.
        lang: Language name handed to Tesseract.

    Returns:
        A list of dicts with the keys ``file``, ``expected`` and ``actual``,
        one per mismatching image, in sorted file-name order.
    """
    image_names = [name for name in sorted(os.listdir(eval_dir))
                   if name.endswith('.tif')]

    mismatches = []
    for name in image_names:
        truth_file = os.path.join(eval_dir, name.replace('.tif', '.gt.txt'))
        with open(truth_file, 'r', encoding='utf-8') as handle:
            expected = handle.read().strip()

        picture = Image.open(os.path.join(eval_dir, name))
        recognized = pytesseract.image_to_string(picture, lang=lang).strip()

        if expected == recognized:
            continue
        mismatches.append({
            'file': name,
            'expected': expected,
            'actual': recognized
        })

    # Summary plus the first ten mismatches
    print(f"总错误数: {len(mismatches)}")
    print("\n常见错误示例:")
    for entry in mismatches[:10]:
        print(f"  {entry['file']}:")
        print(f"    期望: {entry['expected']}")
        print(f"    实际: {entry['actual']}")

    return mismatches

6.6.3 迭代优化策略

问题	解决方案
特定字符识别差	增加该字符的训练样本
某种字体识别差	增加该字体的训练样本
过拟合	增加数据量、降低迭代次数
欠拟合	增加迭代次数、调整学习率
混淆字符	增加相似字符的对比样本

6.7 模型部署

6.7.1 安装自定义模型

# 复制训练好的模型到 tessdata
sudo cp output/mylang.traineddata /usr/share/tesseract-ocr/5/tessdata/

# 验证
tesseract --list-langs | grep mylang

# 使用
tesseract image.png stdout -l mylang

6.7.2 模型打包

# 使用 combine_tessdata 合并组件
combine_tessdata output/mylang.

# 检查模型内容
combine_tessdata -d /usr/share/tesseract-ocr/5/tessdata/mylang.traineddata

6.8 常见问题

问题	原因	解决方案
训练不收敛	学习率过大	降低学习率
训练太慢	数据量太大	减少数据或降低迭代次数（lstmtraining 仅使用 CPU，不支持 GPU 加速）
模型文件过大	组件过多	只保留必要组件
识别精度下降	过拟合	减少迭代、增加数据
新字符不识别	unicharset 缺失	重建 unicharset

6.9 本章小结

要点	说明
何时训练	特殊字体、新语言、领域适配
数据要求	100+ 图片，300 DPI，高质量 GT
训练方式	微调（推荐）或从零训练
关键参数	learning_rate、max_iterations
评估方法	字符准确率、词准确率

6.10 扩展阅读

上一章: 多语言支持 | 下一章: PDF 处理