第 8 章:Python 集成
使用 Python 高效集成 Tesseract OCR。
8.1 环境配置
# Install the Python dependencies through the same interpreter that runs the
# check below, so the packages cannot land in a different Python installation.
python3 -m pip install pytesseract Pillow opencv-python numpy pandas
# Verify: prints the version of the Tesseract engine that pytesseract finds.
python3 -c "import pytesseract; print(pytesseract.get_tesseract_version())"
8.2 pytesseract 基础
8.2.1 核心 API
import pytesseract
from PIL import Image
# One language setting for every recognition call, so the text, data, hOCR
# and PDF outputs all describe the same recognition result.
LANG = 'chi_sim+eng'

# The context manager releases the image file handle once OCR is finished.
with Image.open('test.png') as img:
    # 1. Recognise as plain text
    text = pytesseract.image_to_string(img, lang=LANG)
    # 2. Recognise as structured data (positions, confidence)
    data = pytesseract.image_to_data(img, lang=LANG, output_type=pytesseract.Output.DICT)
    # 3. Recognise as hOCR
    hocr = pytesseract.image_to_pdf_or_hocr(img, lang=LANG, extension='hocr')
    # 4. Recognise as PDF
    pdf = pytesseract.image_to_pdf_or_hocr(img, lang=LANG, extension='pdf')
    # 5. Orientation and script detection (OSD); no language argument is passed here
    osd = pytesseract.image_to_osd(img, output_type=pytesseract.Output.DICT)
8.2.2 输出类型
| 函数 | 输出 | 用途 |
|---|---|---|
| image_to_string | 字符串 | 简单文本提取 |
| image_to_data | 字典/DataFrame | 位置、置信度 |
| image_to_boxes | 字符框 | 字符级位置 |
| image_to_pdf_or_hocr | PDF/hOCR | 生成文件 |
| image_to_osd | OSD 信息 | 方向、脚本检测 |
8.3 结果解析
8.3.1 image_to_data 详解
import pytesseract
from PIL import Image
# The context manager releases the image file handle once OCR is finished.
with Image.open('test.png') as img:
    # Fetch the detailed per-element data
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

# Field reference
# data['level']     - hierarchy level: 1=page, 2=block, 3=para, 4=line, 5=word
# data['page_num']  - page number
# data['block_num'] - block number
# data['par_num']   - paragraph number
# data['line_num']  - line number
# data['word_num']  - word number
# data['left']      - left offset
# data['top']       - top offset
# data['width']     - width
# data['height']    - height
# data['conf']      - confidence (-1 = failed)
# data['text']      - text

# Walk the results
for i, raw_text in enumerate(data['text']):
    # Depending on the pytesseract/Tesseract versions, conf may arrive as an
    # int, a float or a numeric string; float() accepts all of them and makes
    # the ".1f" format below safe.
    conf = float(data['conf'][i])
    if int(conf) <= 60:  # confidence filter
        continue
    text = raw_text.strip()
    if text:
        print(f"文本: {text:20s} 置信度: {conf:6.1f} "
              f"位置: ({data['left'][i]}, {data['top'][i]})")
8.3.2 结果转换为 DataFrame
import pytesseract
import pandas as pd
from PIL import Image
def ocr_to_dataframe(image_path, lang='chi_sim+eng'):
    """Run OCR on an image and return the recognised elements as a DataFrame.

    Args:
        image_path: Path of the image to recognise.
        lang: Tesseract language string passed to ``image_to_data``.

    Returns:
        A DataFrame with the ``image_to_data`` columns, restricted to rows
        whose confidence is above 0 and whose text is not blank. ``conf`` is
        an integer column and ``text`` a string column. If OCR produced no
        rows at all, the empty DataFrame is returned unchanged.
    """
    # Close the file handle as soon as OCR is done.
    with Image.open(image_path) as img:
        data = pytesseract.image_to_data(img, lang=lang, output_type=pytesseract.Output.DICT)

    df = pd.DataFrame(data)
    if df.empty:
        return df

    # conf may be an int, a float or a numeric string depending on the
    # pytesseract/Tesseract versions; to_numeric handles all of them, and
    # anything unparsable is treated as a failed row (-1).
    df['conf'] = pd.to_numeric(df['conf'], errors='coerce').fillna(-1).astype(int)
    df['text'] = df['text'].astype(str)

    # Drop failed rows and rows without visible text.
    keep = (df['conf'] > 0) & (df['text'].str.strip() != '')
    return df[keep]
# Example: OCR one image, print the key columns, then export every column.
df = ocr_to_dataframe('test.png')
print(df.loc[:, ['text', 'conf', 'left', 'top']].to_string())
# Save as CSV; the utf-8-sig codec prefixes the file with a BOM.
df.to_csv('ocr_results.csv', encoding='utf-8-sig', index=False)
8.3.3 按层级组织结果
def ocr_structured(image_path, lang='chi_sim+eng'):
    """Run OCR and group the recognised words by block and line.

    Tesseract emits block and line rows with ``conf == -1`` and empty text.
    Those rows are therefore read for their geometry *before* the
    confidence/text filter is applied; filtering first would discard every
    block and leave the result empty.

    Args:
        image_path: Path of the image to recognise.
        lang: Tesseract language string passed to ``image_to_data``.

    Returns:
        ``{'blocks': {block_num: {'bbox': (x1, y1, x2, y2) or None,
        'lines': {line_id: {'words': [...], 'text': str}}}}}``.
        Only blocks that contain at least one accepted word (confidence > 0,
        non-blank text) are present. ``line_id`` is a 1-based counter of the
        text lines within the block in output order; Tesseract's own
        ``line_num`` restarts in every paragraph and would make lines of
        different paragraphs collide. Each word is a dict with ``text``,
        ``conf`` and ``bbox``; a line's ``text`` is its words joined by
        spaces, with a trailing space.
    """
    import pytesseract
    from PIL import Image

    # Close the file handle as soon as OCR is done.
    with Image.open(image_path) as img:
        data = pytesseract.image_to_data(img, lang=lang, output_type=pytesseract.Output.DICT)

    def bbox(i):
        # Convert (left, top, width, height) into corner coordinates.
        left, top = data['left'][i], data['top'][i]
        return (left, top, left + data['width'][i], top + data['height'][i])

    result = {'blocks': {}}
    block_boxes = {}  # block_num -> bbox taken from the block-level row
    line_ids = {}     # (block_num, par_num, line_num) -> sequential line id

    for i, raw_text in enumerate(data.get('text', [])):
        level = data['level'][i]
        block_id = data['block_num'][i]

        if level == 2:  # Block row: remember its geometry for later
            block_boxes[block_id] = bbox(i)
            continue
        if level != 5:  # Only word rows carry text and a confidence
            continue

        # conf may be an int, a float or a numeric string depending on versions.
        conf = int(float(data['conf'][i]))
        text = str(raw_text).strip()
        if conf <= 0 or not text:
            continue

        block = result['blocks'].setdefault(
            block_id, {'lines': {}, 'bbox': block_boxes.get(block_id)})

        line_key = (block_id, data['par_num'][i], data['line_num'][i])
        if line_key not in line_ids:
            line_ids[line_key] = len(block['lines']) + 1
        line = block['lines'].setdefault(line_ids[line_key], {'words': [], 'text': ''})

        line['words'].append({'text': text, 'conf': conf, 'bbox': bbox(i)})
        line['text'] += text + ' '

    return result
# Example: print every recognised line, grouped under its block.
result = ocr_structured('test.png')
for block_id, block in result['blocks'].items():
    print(f"\nBlock {block_id}:")
    for line_id, line in block['lines'].items():
        line_text = line['text'].strip()
        print(f" Line {line_id}: {line_text}")
8.4 置信度过滤
8.4.1 基本过滤
import pytesseract
from PIL import Image
def ocr_with_confidence(image_path, min_conf=60, lang='chi_sim+eng'):
    """Run OCR and keep only elements recognised with enough confidence.

    Args:
        image_path: Path of the image to recognise.
        min_conf: Minimum confidence (inclusive) an element needs to be kept.
        lang: Tesseract language string passed to ``image_to_data``.

    Returns:
        A list of dicts with the keys ``text``, ``confidence``, ``bbox``
        (a dict with ``left``/``top``/``width``/``height``) and ``level``,
        one per element with non-blank text and confidence >= ``min_conf``.
    """
    # Close the file handle as soon as OCR is done.
    with Image.open(image_path) as img:
        data = pytesseract.image_to_data(img, lang=lang, output_type=pytesseract.Output.DICT)

    results = []
    for i, raw_text in enumerate(data.get('text', [])):
        # conf may be an int, a float or a numeric string depending on the
        # pytesseract/Tesseract versions; int(float(...)) accepts all of them.
        conf = int(float(data['conf'][i]))
        text = raw_text.strip()
        if conf < min_conf or not text:
            continue
        results.append({
            'text': text,
            'confidence': conf,
            'bbox': {
                'left': data['left'][i],
                'top': data['top'][i],
                'width': data['width'][i],
                'height': data['height'][i],
            },
            'level': data['level'][i],
        })
    return results
# Example: list everything recognised with at least 70% confidence.
results = ocr_with_confidence('test.png', min_conf=70)
for r in results:
    label, score = r['text'], r['confidence']
    print(f"{label:20s} conf={score:.0f}")
8.4.2 置信度统计分析
import numpy as np
def confidence_analysis(image_path, lang='chi_sim+eng'):
    """Print summary statistics of the OCR confidences of one image.

    Only confidences above 0 are analysed. The function prints the sample
    count, mean, median, standard deviation, minimum, maximum and a
    histogram over fixed ranges, and returns ``None``.

    Args:
        image_path: Path of the image to recognise.
        lang: Tesseract language string passed to ``image_to_data``.
    """
    import pytesseract
    from PIL import Image

    # Close the file handle as soon as OCR is done.
    with Image.open(image_path) as img:
        data = pytesseract.image_to_data(img, lang=lang, output_type=pytesseract.Output.DICT)

    # conf may be an int, a float or a numeric string depending on the
    # pytesseract/Tesseract versions; int(float(...)) accepts all of them.
    parsed = (int(float(c)) for c in data.get('conf', []))
    confs = [c for c in parsed if c > 0]
    if not confs:
        print("无有效结果")
        return

    print(f"样本数: {len(confs)}")
    print(f"平均置信度: {np.mean(confs):.1f}")
    print(f"中位数: {np.median(confs):.1f}")
    print(f"标准差: {np.std(confs):.1f}")
    print(f"最低: {min(confs)}")
    print(f"最高: {max(confs)}")

    # Distribution. Ranges are half-open [low, high), except that the top
    # range also includes its upper bound so a confidence of exactly 100 is
    # counted instead of falling outside every range.
    ranges = [(90, 100), (80, 90), (70, 80), (60, 70), (0, 60)]
    top = max(high for _, high in ranges)
    for low, high in ranges:
        count = sum(1 for c in confs if low <= c < high or (high == top and c == top))
        print(f" {low}-{high}: {count} ({count/len(confs)*100:.1f}%)")
8.4.3 自适应置信度阈值
def adaptive_confidence_threshold(image_path, lang='chi_sim+eng'):
    """Run OCR and filter elements with a threshold derived from the image.

    The threshold is ``mean - standard deviation`` of all confidences above
    0, but never lower than 50. The chosen threshold is printed.

    Args:
        image_path: Path of the image to recognise.
        lang: Tesseract language string passed to ``image_to_data``.

    Returns:
        A list of ``{'text': str, 'confidence': int}`` dicts for elements
        with non-blank text whose confidence reaches the threshold; an empty
        list when no element has a confidence above 0.
    """
    import pytesseract
    from PIL import Image
    import numpy as np

    # Close the file handle as soon as OCR is done.
    with Image.open(image_path) as img:
        data = pytesseract.image_to_data(img, lang=lang, output_type=pytesseract.Output.DICT)

    # Parse every confidence once. conf may be an int, a float or a numeric
    # string depending on the pytesseract/Tesseract versions.
    confs = [int(float(c)) for c in data.get('conf', [])]
    valid = [c for c in confs if c > 0]
    if not valid:
        return []

    # Mean minus standard deviation, floored at 50.
    threshold = max(np.mean(valid) - np.std(valid), 50)
    print(f"自适应阈值: {threshold:.1f}")

    # Filter
    results = []
    for raw_text, conf in zip(data['text'], confs):
        text = raw_text.strip()
        if conf >= threshold and text:
            results.append({'text': text, 'confidence': conf})
    return results
8.5 批量处理
8.5.1 简单批量处理
import os
import pytesseract
from PIL import Image
def batch_ocr(input_dir, output_dir, lang='chi_sim+eng'):
    """OCR every image in a directory and write one ``.txt`` file per image.

    Files are selected by extension, compared case-insensitively, so
    ``SCAN.PNG`` is processed just like ``scan.png``. A failure on one file
    is reported and does not stop the batch.

    Args:
        input_dir: Directory containing the images.
        output_dir: Directory for the text files; created if missing.
        lang: Tesseract language string passed to ``image_to_string``.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing order (and the log) is reproducible.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.lower().endswith(image_suffixes):
            continue
        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, os.path.splitext(filename)[0] + '.txt')
        try:
            # Close each image right after OCR so a large batch does not
            # accumulate open file handles.
            with Image.open(input_path) as img:
                text = pytesseract.image_to_string(img, lang=lang)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(text)
            print(f"✓ {filename}")
        except Exception as e:
            # Per-file boundary: report the problem and continue with the rest.
            print(f"✗ {filename}: {e}")
# Example: OCR everything under ./images and write the text files to ./texts.
batch_ocr(input_dir='./images', output_dir='./texts')
8.5.2 并行批量处理
import os
import pytesseract
from PIL import Image
from concurrent.futures import ProcessPoolExecutor, as_completed
def process_single(args):
    """OCR one image and write its text to a file.

    Args:
        args: A ``(input_path, output_path, lang)`` tuple, packed into one
            argument so the function can be submitted to an executor as-is.

    Returns:
        ``(True, input_path, number_of_characters)`` on success, or
        ``(False, input_path, error_message)`` if anything went wrong.
    """
    input_path, output_path, lang = args
    try:
        # Close the image right after OCR so a worker that handles many files
        # does not accumulate open file handles.
        with Image.open(input_path) as img:
            text = pytesseract.image_to_string(img, lang=lang)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)
        return True, input_path, len(text)
    except Exception as e:
        # Failures are returned rather than raised so the caller can tally them.
        return False, input_path, str(e)
def parallel_batch_ocr(input_dir, output_dir, lang='chi_sim+eng', workers=4):
    """OCR every image in a directory using a pool of worker processes.

    Files are selected by extension, compared case-insensitively. Each file
    is handled by ``process_single``; progress and a final success/failure
    tally are printed.

    Args:
        input_dir: Directory containing the images.
        output_dir: Directory for the text files; created if missing.
        lang: Tesseract language string forwarded to each task.
        workers: Maximum number of worker processes.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')
    os.makedirs(output_dir, exist_ok=True)

    # One (input, output, lang) task per image, in a reproducible order.
    tasks = [
        (
            os.path.join(input_dir, filename),
            os.path.join(output_dir, os.path.splitext(filename)[0] + '.txt'),
            lang,
        )
        for filename in sorted(os.listdir(input_dir))
        if filename.lower().endswith(image_suffixes)
    ]
    print(f"待处理: {len(tasks)} 个文件,使用 {workers} 个进程")

    success = 0
    fail = 0
    # Do not create a process pool when there is nothing to process.
    if tasks:
        with ProcessPoolExecutor(max_workers=workers) as executor:
            futures = [executor.submit(process_single, task) for task in tasks]
            # Report each file as soon as its worker finishes.
            for future in as_completed(futures):
                ok, path, info = future.result()
                if ok:
                    success += 1
                    print(f"✓ {os.path.basename(path)} ({info} 字符)")
                else:
                    fail += 1
                    print(f"✗ {os.path.basename(path)}: {info}")
    print(f"\n完成: 成功 {success}, 失败 {fail}")
# Guard the entry point: with the "spawn" start method each worker process
# re-imports this module, and an unguarded call here would try to start the
# batch again from inside every worker.
if __name__ == '__main__':
    parallel_batch_ocr('./images', './texts', workers=4)
8.6 OpenCV 集成
8.6.1 OpenCV 预处理 + pytesseract
import cv2
import numpy as np
import pytesseract
def ocr_with_preprocessing(image_path, lang='chi_sim+eng'):
    """Clean up an image with OpenCV, then recognise it with Tesseract.

    Pipeline: grayscale -> non-local-means denoising -> CLAHE contrast
    enhancement -> Otsu binarisation -> morphological closing -> OCR.

    Args:
        image_path: Path of the image to recognise.
        lang: Tesseract language string passed to ``image_to_string``.

    Returns:
        The recognised text.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    # Read. cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"无法读取图片: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 1. Denoise
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
    # 2. Enhance contrast
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(denoised)
    # 3. Binarise; with THRESH_OTSU the threshold argument (0) is ignored
    _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # 4. Morphological closing with a 2x2 rectangular kernel
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    processed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

    # OCR. pytesseract accepts the NumPy array directly, so no PIL conversion
    # is needed.
    return pytesseract.image_to_string(processed, lang=lang)
# Example: run the preprocessing pipeline on a scan and show the text.
text = ocr_with_preprocessing(image_path='scan.png')
print(text)
8.6.2 区域提取 OCR
import cv2
import pytesseract
def region_ocr(image_path, region, lang='chi_sim+eng'):
    """Recognise the text inside one rectangular region of an image.

    Args:
        image_path: Path of the image.
        region: ``(x, y, w, h)`` rectangle in pixels; ``x``/``y`` are the
            column/row of its top-left corner.
        lang: Tesseract language string passed to ``image_to_string``.

    Returns:
        The recognised text with surrounding whitespace removed.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
        ValueError: If the region has a negative origin, a non-positive size,
            or lies entirely outside the image.
    """
    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"无法读取图片: {image_path}")

    x, y, w, h = region
    # Negative indices would silently wrap around in NumPy slicing, so they
    # are rejected up front.
    if x < 0 or y < 0 or w <= 0 or h <= 0:
        raise ValueError(f"无效的区域: {region}")
    roi = img[y:y+h, x:x+w]
    if roi.size == 0:
        raise ValueError(f"区域超出图片范围: {region}")

    # Preprocess: grayscale, then Otsu binarisation
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # OCR
    text = pytesseract.image_to_string(binary, lang=lang, config='--psm 6')
    return text.strip()
# Example: recognise the top-right area of the page.
# The image is read once here only to learn its height and width.
h, w = cv2.imread('document.png').shape[:2]
# Region is (x, y, width, height): the right half of the top quarter.
top_right = region_ocr('document.png', region=(w // 2, 0, w // 2, h // 4))
print(f"右上角文字: {top_right}")
8.6.3 可视化结果
import cv2
import pytesseract
from PIL import Image
def visualize_ocr(image_path, output_path='ocr_result.png', lang=None):
    """Draw the OCR bounding boxes and confidences onto a copy of the image.

    Elements with confidence above 60 and non-blank text are outlined; boxes
    with confidence above 80 use one colour and the rest another. The
    annotated image is written to ``output_path``.

    Args:
        image_path: Path of the image to recognise.
        output_path: Where to save the annotated image.
        lang: Tesseract language string; ``None`` leaves the choice to
            pytesseract, as when no language is passed.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
        OSError: If the annotated image cannot be written.
    """
    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"无法读取图片: {image_path}")

    # Close the PIL file handle as soon as OCR is done.
    with Image.open(image_path) as pil_img:
        data = pytesseract.image_to_data(pil_img, lang=lang, output_type=pytesseract.Output.DICT)

    for i, raw_text in enumerate(data.get('text', [])):
        # conf may be an int, a float or a numeric string depending on versions.
        conf = int(float(data['conf'][i]))
        if conf <= 60 or not raw_text.strip():
            continue
        x, y = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]

        # Draw the box; colours are in OpenCV's BGR order.
        color = (0, 255, 0) if conf > 80 else (0, 255, 255)
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)

        # Draw the confidence above the box, or below it when the box is so
        # close to the top edge that the label would fall outside the image.
        label_y = y - 5 if y - 5 >= 10 else y + h + 12
        cv2.putText(img, f"{conf}%", (x, label_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1)

    # cv2.imwrite returns False on failure; do not report success in that case.
    if not cv2.imwrite(output_path, img):
        raise OSError(f"无法保存图片: {output_path}")
    print(f"可视化结果已保存: {output_path}")
# Example: annotate test.png and save the result under the default name.
visualize_ocr(image_path='test.png')
8.7 高级用法
8.7.1 自定义配置
import pytesseract
from PIL import Image
# Combine several Tesseract options in one config string.
custom_config = r'--oem 1 --psm 6 -c tessedit_char_whitelist=0123456789'

# The context manager releases the image file handle once OCR is finished.
with Image.open('test.png') as img:
    text = pytesseract.image_to_string(img, config=custom_config)

# Frequently used configurations, keyed by use case.
configs = {
    '数字': '--psm 7 -c tessedit_char_whitelist=0123456789',
    '单行英文': '--psm 7',
    '表格': '--psm 6',
    '稀疏文本': '--psm 11',
    '垂直文本': '--psm 5',
}
8.7.2 多引擎结果合并
import pytesseract
from PIL import Image
def multi_config_ocr(image_path, configs, lang='chi_sim+eng'):
    """Run OCR with several configurations and pick the longest result.

    Args:
        image_path: Path of the image to recognise.
        configs: Mapping of a display name to a Tesseract config string.
        lang: Tesseract language string passed to ``image_to_string``.

    Returns:
        ``(best_name, best_text, results)``. ``results`` maps every name to
        its stripped text, or to ``"Error: ..."`` if that run failed. The
        best entry is chosen among the successful runs only, so an error
        message can never win; when no run succeeded (or ``configs`` is
        empty) ``best_name`` is ``None`` and ``best_text`` is ``''``.
    """
    results = {}
    succeeded = {}  # name -> text, successful runs only

    # Open the image once for all runs and close it afterwards.
    with Image.open(image_path) as img:
        for name, config in configs.items():
            try:
                text = pytesseract.image_to_string(img, lang=lang, config=config).strip()
            except Exception as e:
                # One failing configuration must not abort the comparison.
                results[name] = f"Error: {e}"
            else:
                results[name] = succeeded[name] = text

    if not succeeded:
        return None, '', results

    # Pick the longest result (simple strategy); ties go to the first config.
    best_name = max(succeeded, key=lambda name: len(succeeded[name]))
    return best_name, succeeded[best_name], results
# Example: compare three page-segmentation modes on the same image.
configs = dict(psm3='--psm 3', psm4='--psm 4', psm6='--psm 6')
best_name, best_text, all_results = multi_config_ocr('test.png', configs)
print(f"最佳配置: {best_name}", f"识别结果:\n{best_text}", sep="\n")
8.8 异常处理
import pytesseract
from PIL import Image
def safe_ocr(image_path, lang='chi_sim+eng', retries=3):
    """Run OCR and report failures as a value instead of raising.

    A ``TesseractError`` is retried after a one-second pause until
    ``retries`` attempts have been made; every other failure is returned
    immediately.

    Args:
        image_path: Path of the image to recognise.
        lang: Tesseract language string passed to ``image_to_string``.
        retries: Maximum number of attempts.

    Returns:
        ``(text, None)`` on success, otherwise ``(None, error_message)``.
    """
    import time

    for attempt in range(retries):
        try:
            # The context manager closes the file on every attempt, including
            # the ones that fail, so retries do not leak file handles.
            with Image.open(image_path) as img:
                # Validate the image
                width, height = img.size
                if width < 10 or height < 10:
                    return None, "图片太小"
                # OCR
                text = pytesseract.image_to_string(img, lang=lang)
            return text.strip(), None
        except pytesseract.TesseractNotFoundError:
            return None, "Tesseract 未安装"
        except pytesseract.TesseractError as e:
            if attempt < retries - 1:
                time.sleep(1)
                continue
            return None, f"Tesseract 错误: {e}"
        except Exception as e:
            return None, f"未知错误: {e}"
    # Reached only when the loop body never ran (retries <= 0).
    return None, "重试次数用完"
# Example: print either the recognised text or the reported error.
text, error = safe_ocr('test.png')
print(f"错误: {error}" if error else f"结果: {text}")
8.9 实用工具函数
import pytesseract
from PIL import Image
import os
def ocr_info():
    """Print the Tesseract version, available languages and executable path."""
    print(f"版本: {pytesseract.get_tesseract_version()}")
    print(f"语言: {pytesseract.get_languages()}")
    # The path line must show the configured executable, not the version again.
    print(f"路径: {pytesseract.pytesseract.tesseract_cmd}")
def quick_ocr(image_path, lang='chi_sim+eng'):
    """Return the text recognised in ``image_path``.

    Args:
        image_path: Path of the image to recognise.
        lang: Tesseract language string passed to ``image_to_string``.
    """
    # Close the file handle as soon as OCR is done.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img, lang=lang)
def word_count(text):
    """Count Chinese characters and English words in ``text``.

    Chinese characters are code points in the CJK Unified Ideographs block
    (U+4E00-U+9FFF) or its Extension A (U+3400-U+4DBF). An English word is a
    run of ASCII letters, optionally joined by apostrophes or hyphens
    (``don't``, ``re-run``). Numbers and punctuation are not words, and a
    word written directly next to Chinese text without spaces is still
    counted.

    Args:
        text: The text to analyse.

    Returns:
        ``{'中文字符': <int>, '英文单词': <int>}``.
    """
    import re

    cn_chars = sum(
        1 for c in text
        if '\u4e00' <= c <= '\u9fff' or '\u3400' <= c <= '\u4dbf'
    )
    # Match words by pattern rather than splitting on whitespace: Chinese text
    # rarely separates embedded English words with spaces.
    en_words = len(re.findall(r"[A-Za-z]+(?:['\-][A-Za-z]+)*", text))
    return {'中文字符': cn_chars, '英文单词': en_words}
8.10 本章小结
| 要点 | 说明 |
|---|---|
| 核心库 | pytesseract + Pillow |
| 结果解析 | image_to_data 获取详细信息 |
| 置信度过滤 | conf > 60 一般可用 |
| 批量处理 | ProcessPoolExecutor 并行 |
| 预处理 | OpenCV + pytesseract 组合 |
8.11 扩展阅读
上一章: PDF 处理 | 下一章: 版面分析