第 9 章:版面分析
第 9 章:版面分析
掌握复杂文档布局的分析与处理技术。
9.1 版面分析概述
文档布局类型
├── 单栏文本
├── 多栏文本
├── 表格
├── 图文混排
├── 表单
└── 复杂布局(报纸、杂志)
9.1.1 Tesseract 版面分析能力
| 功能 | 支持程度 | 说明 |
|---|---|---|
| 单栏文本 | ⭐⭐⭐⭐⭐ | 最佳 |
| 多栏文本 | ⭐⭐⭐ | 一般 |
| 表格 | ⭐⭐ | 有限 |
| 图文混排 | ⭐⭐⭐ | 一般 |
| 表单 | ⭐⭐⭐ | 需配置 |
| 手写体 | ⭐ | 很有限 |
9.2 页面分割模式详解
import pytesseract
from PIL import Image
def test_psm_modes(image_path, lang='chi_sim+eng'):
    """Run OCR on one image under several page segmentation modes (PSM).

    Args:
        image_path: Path of the image to recognise.
        lang: Tesseract language string passed to every OCR call.

    Returns:
        A dict keyed by PSM number. On success the value holds ``desc``,
        ``text`` (first 100 characters of the stripped output) and ``length``
        (length of the full stripped output). When the OCR call raises, the
        value holds ``desc`` and ``error`` (the exception message) instead.
    """
    mode_labels = {
        0: 'OSD only',
        1: 'Auto + OSD',
        2: 'Auto',
        3: 'Fully auto',
        4: 'Single column',
        5: 'Vertical block',
        6: 'Uniform block',
        7: 'Single line',
        8: 'Single word',
        11: 'Sparse text',
        12: 'Sparse + OSD',
    }
    image = Image.open(image_path)

    def run_mode(mode, label):
        # A failing mode must not abort the comparison, so the error is
        # recorded in place of the text.
        try:
            raw = pytesseract.image_to_string(
                image, lang=lang, config=f'--psm {mode}'
            )
            cleaned = raw.strip()
            return {'desc': label, 'text': cleaned[:100], 'length': len(cleaned)}
        except Exception as exc:
            return {'desc': label, 'error': str(exc)}

    return {mode: run_mode(mode, label) for mode, label in mode_labels.items()}
# Try every mode and print a short preview so the best one can be picked by eye
results = test_psm_modes('document.png')
for psm in results:
    info = results[psm]
    # Show the recognised text, or the error message when the mode failed
    preview = info.get('text', info.get('error', ''))[:60]
    print(f"PSM {psm:2d} ({info['desc']:15s}): {preview}")
9.3 表格识别
9.3.1 基本表格识别
import pytesseract
from PIL import Image
def ocr_table(image_path, lang='chi_sim+eng'):
    """OCR a table image and group the recognised words by text line.

    Args:
        image_path: Path of the table image.
        lang: Tesseract language string.

    Returns:
        A dict mapping Tesseract's ``line_num`` to a list of word dicts
        (``text``, ``left``, ``top``, ``width``, ``conf``), each list sorted
        by ``left`` so that words read left to right.
    """
    img = Image.open(image_path)
    # PSM 6 treats the page as one uniform block of text, which suits tables
    data = pytesseract.image_to_data(img, lang=lang,
                                     config='--psm 6',
                                     output_type=pytesseract.Output.DICT)

    # NOTE(review): grouping uses line_num alone; presumably it restarts in
    # every block/paragraph, so multi-block pages could mix lines - confirm.
    lines = {}
    for i, raw_text in enumerate(data['text']):
        text = raw_text.strip()
        if not text:
            continue
        # Confidence may arrive as an int, a float or a decimal string such as
        # "96.06"; int() alone raises ValueError on the latter, so go via float.
        try:
            conf = int(float(data['conf'][i]))
        except (TypeError, ValueError):
            continue
        if conf <= 30:
            continue
        lines.setdefault(data['line_num'][i], []).append({
            'text': text,
            'left': data['left'][i],
            'top': data['top'][i],
            'width': data['width'][i],
            'conf': conf,
        })

    # Order the words of every line by their x coordinate
    for words in lines.values():
        words.sort(key=lambda word: word['left'])
    return lines
# Usage
lines = ocr_table('table.png')
for line_num in sorted(lines):
    # Join the words of one line into a pipe-separated row
    cells = [word['text'] for word in lines[line_num]]
    row = ' | '.join(cells)
    print(f"行 {line_num}: {row}")
9.3.2 表格结构检测
import cv2
import numpy as np
def detect_table_structure(image_path):
    """Locate the horizontal and vertical ruling lines of a table image.

    Args:
        image_path: Path of the table image.

    Returns:
        ``(rows, cols)``: two lists of ints holding the y positions of the
        detected horizontal lines and the x positions of the detected vertical
        lines. Either list is empty when no line of that direction is found.

    Raises:
        OSError: If the image cannot be read or decoded.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise OSError(f"cannot read image: {image_path}")
    _, binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Morphological opening with a long thin kernel keeps only strokes that
    # are at least 40 px long in that direction.
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    h_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, h_kernel)
    v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
    v_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, v_kernel)

    # Project the line masks and keep rows/columns with at least half the
    # ink of the strongest one.
    h_proj = np.sum(h_lines, axis=1)
    v_proj = np.sum(v_lines, axis=0)
    row_positions = np.where(h_proj > np.max(h_proj) * 0.5)[0]
    col_positions = np.where(v_proj > np.max(v_proj) * 0.5)[0]

    def merge_positions(positions, threshold=10):
        """Collapse runs of nearby positions to the first position of each run."""
        merged = []
        previous = None
        for pos in positions:
            # Compare against the previous pixel, not the start of the run, so
            # a single line thicker than `threshold` is not reported twice.
            if previous is None or pos - previous > threshold:
                merged.append(int(pos))
            previous = pos
        return merged

    rows = merge_positions(row_positions)
    cols = merge_positions(col_positions)
    return rows, cols
rows, cols = detect_table_structure('table.png')
# N boundary lines enclose N-1 rows/columns; clamp so that "no lines found"
# is reported as 0 instead of -1.
print(f"检测到 {max(len(rows) - 1, 0)} 行, {max(len(cols) - 1, 0)} 列")
9.3.3 完整表格 OCR
import cv2
import numpy as np
import pytesseract
from PIL import Image
def full_table_ocr(image_path, lang='chi_sim+eng', margin=5):
    """OCR a ruled table cell by cell.

    The grid is taken from ``detect_table_structure``; every cell between two
    neighbouring row lines and two neighbouring column lines is cropped and
    recognised as a single text line.

    Args:
        image_path: Path of the table image.
        lang: Tesseract language string.
        margin: Pixels trimmed from every side of a cell so the ruling lines
            themselves are not fed to the OCR engine.

    Returns:
        A list of rows, each a list of cell strings. When fewer than two row
        lines or two column lines are detected, the whole page is recognised
        instead and the result is a plain ``str``.

    Raises:
        OSError: If the image cannot be read or decoded.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise OSError(f"cannot read image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 1. Detect the grid
    rows, cols = detect_table_structure(image_path)
    if len(rows) < 2 or len(cols) < 2:
        print("未检测到表格结构,尝试普通 OCR")
        return pytesseract.image_to_string(gray, lang=lang)

    # 2. Crop and recognise every cell
    table_data = []
    for y1, y2 in zip(rows, rows[1:]):
        row_data = []
        for x1, x2 in zip(cols, cols[1:]):
            # A cell not larger than both margins has nothing left to read;
            # checking first also stops a negative end index from wrapping
            # around to the far side of the image.
            if y2 - y1 <= 2 * margin or x2 - x1 <= 2 * margin:
                row_data.append('')
                continue
            cell = gray[y1 + margin:y2 - margin, x1 + margin:x2 - margin]
            if cell.size == 0:
                row_data.append('')
                continue
            # PSM 7: treat the crop as a single text line
            text = pytesseract.image_to_string(
                Image.fromarray(cell), lang=lang, config='--psm 7'
            ).strip()
            row_data.append(text)
        table_data.append(row_data)
    return table_data
# Usage
table = full_table_ocr('table.png')
if isinstance(table, str):
    # No grid was detected and plain page text came back; iterating over it
    # would print one character per "row".
    print(table)
else:
    for i, row in enumerate(table, start=1):
        print(f"行 {i}: {' | '.join(row)}")
9.3.4 专业表格识别工具
# 使用其他库增强表格识别
# pip install tabula-py camelot-py[cv]
# Camelot(PDF 表格提取)
import camelot
def extract_pdf_tables(pdf_path):
    """Extract the tables of every page of a PDF with Camelot and print them.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The object returned by ``camelot.read_pdf``.
    """
    found = camelot.read_pdf(pdf_path, pages='all')
    for number, table in enumerate(found, start=1):
        print(f"表格 {number}: {table.shape}")
        # table.df is the DataFrame; the extra newline separates the tables
        print(table.df, end='\n\n')
    return found
# tabula-py(Java 依赖)
import tabula
def extract_tables_tabula(pdf_path):
    """Extract the tables of every page of a PDF with tabula-py.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        Whatever ``tabula.read_pdf`` returns for ``pages='all'``.
    """
    return tabula.read_pdf(pdf_path, pages='all')
9.4 多栏文档处理
9.4.1 栏检测
import cv2
import numpy as np
def detect_columns(image_path):
    """Estimate the number of text columns from the vertical ink projection.

    Args:
        image_path: Path of the page image.

    Returns:
        ``(num_columns, separators)`` where ``separators`` is a list of x
        positions (ints), one per detected gutter, and ``num_columns`` is
        ``len(separators) + 1``.

    Raises:
        OSError: If the image cannot be read or decoded.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise OSError(f"cannot read image: {image_path}")
    _, binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Vertical projection: amount of ink in every pixel column
    v_proj = np.sum(binary, axis=0)

    # Smooth with a moving average so gaps between characters do not count
    kernel_size = 20
    v_proj_smooth = np.convolve(v_proj, np.ones(kernel_size) / kernel_size, mode='same')

    # Pixel columns with little ink are candidate blank areas
    threshold = np.mean(v_proj_smooth) * 0.3
    blank_regions = np.where(v_proj_smooth < threshold)[0]

    columns = []
    if len(blank_regions) > 0:
        # Split the blank x positions into runs; a jump of more than 5 px
        # starts a new run.
        breaks = np.where(np.diff(blank_regions) > 5)[0] + 1
        groups = np.split(blank_regions, breaks)

        last_x = len(v_proj_smooth) - 1
        for group in groups:
            # A blank run touching the left or right edge is the page margin,
            # not a gutter between two columns.
            if group[0] == 0 or group[-1] == last_x:
                continue
            # Only a sufficiently wide blank run is a column gutter
            if len(group) > 20:
                columns.append(int(np.mean(group)))

    return len(columns) + 1, columns
# Detect the columns of a sample page and report the result
num_cols, separators = detect_columns('newspaper.png')
summary = f"检测到 {num_cols} 栏,分隔位置: {separators}"
print(summary)
9.4.2 多栏文档分割
import cv2
import pytesseract
from PIL import Image
def split_columns(image_path, separators):
    """Cut a page image into vertical strips at the given x positions.

    Args:
        image_path: Path of the page image.
        separators: List of x positions at which to cut.

    Returns:
        A list of image arrays, one per strip, ordered left to right.
    """
    page = cv2.imread(image_path)
    page_width = page.shape[1]
    # The left and right page edges close the first and the last strip
    edges = [0] + separators + [page_width]
    return [page[:, left:right] for left, right in zip(edges, edges[1:])]
def ocr_multicolumn(image_path, lang='chi_sim+eng'):
    """OCR a page column by column.

    Args:
        image_path: Path of the page image.
        lang: Tesseract language string.

    Returns:
        The recognised text. For a single-column page this is the plain OCR
        output; otherwise the text of every column is prefixed with a
        ``=== 栏 N ===`` header and the columns are joined by blank lines.
    """
    num_cols, separators = detect_columns(image_path)

    # Single column: recognise the page as it is
    if num_cols == 1:
        return pytesseract.image_to_string(Image.open(image_path), lang=lang)

    sections = []
    for index, strip in enumerate(split_columns(image_path, separators), start=1):
        # OpenCV arrays are BGR; PIL expects RGB
        rgb = cv2.cvtColor(strip, cv2.COLOR_BGR2RGB)
        recognised = pytesseract.image_to_string(
            Image.fromarray(rgb), lang=lang, config='--psm 4'
        )
        sections.append(f"=== 栏 {index} ===\n{recognised}")
    return '\n\n'.join(sections)
9.5 图文混排处理
9.5.1 区域类型检测
import cv2
import numpy as np
def detect_regions(image_path):
    """Classify the outer contours of a page as text, image or table regions.

    Args:
        image_path: Path of the page image.

    Returns:
        A dict with the keys ``'text'``, ``'image'`` and ``'table'``; each
        value is a list of ``(x, y, w, h)`` bounding boxes. Boxes with an area
        below 1000 px are dropped.
    """
    page = cv2.imread(image_path)
    gray = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    outlines, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    regions = {'text': [], 'image': [], 'table': []}
    for outline in outlines:
        x, y, w, h = cv2.boundingRect(outline)
        area = w * h
        if area >= 1000:
            # Share of foreground pixels inside the bounding box
            density = np.sum(binary[y:y+h, x:x+w] > 0) / area
            looks_like_text = density > 0.3 and 0.1 < w / h < 10
            looks_like_image = density < 0.1 and area > 10000
            if looks_like_text:
                kind = 'text'
            elif looks_like_image:
                kind = 'image'
            else:
                # Anything else is treated as a possible table
                kind = 'table'
            regions[kind].append((x, y, w, h))
    return regions
# Report how many regions of each kind were found
regions = detect_regions('mixed.png')
for label, kind in (('文本区域', 'text'), ('图像区域', 'image'), ('表格区域', 'table')):
    print(f"{label}: {len(regions[kind])}")
9.5.2 图文混排 OCR
import cv2
import pytesseract
from PIL import Image
def ocr_mixed_layout(image_path, lang='chi_sim+eng'):
    """OCR the text regions of a page that mixes text and graphics.

    Only the regions ``detect_regions`` classifies as text are recognised.

    Args:
        image_path: Path of the page image.
        lang: Tesseract language string.

    Returns:
        A list of dicts with ``type`` (always ``'text'``), ``position``
        (``(x, y)`` of the region) and ``content``, ordered top to bottom and
        then left to right. Regions whose OCR output is blank are omitted.
    """
    page = cv2.imread(image_path)
    text_boxes = detect_regions(image_path)['text']

    found = []
    for x, y, w, h in text_boxes:
        # OpenCV arrays are BGR; PIL expects RGB
        crop = cv2.cvtColor(page[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
        content = pytesseract.image_to_string(
            Image.fromarray(crop), lang=lang, config='--psm 6'
        ).strip()
        if not content:
            continue
        found.append({'type': 'text', 'position': (x, y), 'content': content})

    # position is (x, y); reversing it sorts by y first, then by x
    return sorted(found, key=lambda item: item['position'][::-1])
9.6 版面分析可视化
import cv2
import pytesseract
from PIL import Image
def visualize_layout(image_path, output_path='layout.png', lang=None):
    """Draw Tesseract's block and line boxes onto a copy of the page.

    Args:
        image_path: Path of the page image.
        output_path: Where the annotated image is written.
        lang: Tesseract language string; ``None`` lets pytesseract use its
            default.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise OSError(f"cannot read image: {image_path}")
    pil_img = Image.open(image_path)
    data = pytesseract.image_to_data(pil_img, lang=lang,
                                     output_type=pytesseract.Output.DICT)

    # OpenCV colours are (B, G, R), not (R, G, B)
    colors = {
        1: (0, 0, 255),      # page  - red
        2: (0, 255, 0),      # block - green
        3: (255, 0, 0),      # para  - blue
        4: (0, 255, 255),    # line  - yellow
        5: (255, 0, 255),    # word  - magenta
    }
    shown_levels = (2, 4)    # draw only blocks and lines

    for i, level in enumerate(data['level']):
        if level not in shown_levels:
            continue
        x = data['left'][i]
        y = data['top'][i]
        w = data['width'][i]
        h = data['height'][i]
        color = colors.get(level, (128, 128, 128))
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
        # Keep the label baseline inside the image for boxes at the top edge
        label_y = max(y - 5, 10)
        cv2.putText(img, f"L{level}", (x, label_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1)

    # cv2.imwrite reports failure through its return value
    if not cv2.imwrite(output_path, img):
        raise OSError(f"cannot write image: {output_path}")
    print(f"版面分析可视化: {output_path}")


visualize_layout('document.png')
9.7 表单识别
import pytesseract
from PIL import Image
import re
def ocr_form(image_path, fields, lang='chi_sim+eng'):
    """Extract labelled field values from a form image.

    For every field the first recognised word containing the field's label is
    located; every word to its right on roughly the same line (top within
    30 px), or below it and roughly left-aligned with it (left within 50 px),
    is collected as the value.

    Args:
        image_path: Path of the form image.
        fields: Dict mapping a field name to a config dict whose ``'label'``
            entry is the label text to search for.
        lang: Tesseract language string.

    Returns:
        A dict mapping every field name to the space-joined value words, or to
        ``None`` when the label is empty, missing or not found on the page.
    """
    img = Image.open(image_path)
    data = pytesseract.image_to_data(img, lang=lang,
                                     output_type=pytesseract.Output.DICT)

    # Collect the confidently recognised words together with their boxes
    words = []
    for i, raw_text in enumerate(data['text']):
        text = raw_text.strip()
        if not text:
            continue
        # Confidence may arrive as an int, a float or a decimal string such as
        # "96.06"; int() alone raises ValueError on the latter, so go via float.
        try:
            conf = int(float(data['conf'][i]))
        except (TypeError, ValueError):
            continue
        if conf <= 30:
            continue
        left, top = data['left'][i], data['top'][i]
        words.append({
            'text': text,
            'left': left,
            'top': top,
            'right': left + data['width'][i],
            'bottom': top + data['height'][i],
        })

    results = {}
    for field_name, field_config in fields.items():
        label_text = field_config.get('label', '')
        # An empty label is a substring of every word, so it must not be
        # searched for at all.
        label_pos = None
        if label_text:
            label_pos = next(
                (word for word in words if label_text in word['text']), None
            )
        if label_pos is None:
            results[field_name] = None
            continue

        # NOTE(review): a label may be split over several recognised words
        # (e.g. one per CJK character), in which case it is not found - confirm.
        value_words = []
        for word in words:
            if word is label_pos:
                continue
            right_of_label = (word['left'] > label_pos['right'] and
                              abs(word['top'] - label_pos['top']) < 30)
            below_label = (word['top'] > label_pos['bottom'] and
                           abs(word['left'] - label_pos['left']) < 50)
            if right_of_label or below_label:
                value_words.append(word)

        value = ' '.join(word['text'] for word in value_words)
        results[field_name] = value.strip()
    return results
# Usage: map every output field to the label printed on the form
fields = dict(
    name={'label': '姓名'},
    id={'label': '身份证'},
    phone={'label': '电话'},
)
result = ocr_form('form.png', fields)
for field in result:
    value = result[field]
    print(f"{field}: {value}")
9.8 复杂布局处理策略
9.8.1 分治策略
def process_complex_layout(image_path, lang='chi_sim+eng'):
    """Pick an OCR strategy from the detected layout and run it.

    The checks are made in this order: table regions, more than one column,
    image regions, and finally plain full-page OCR.

    Args:
        image_path: Path of the page image.
        lang: Tesseract language string.

    Returns:
        Whatever the selected strategy returns.
    """
    # 1. Inspect the layout
    column_count, _ = detect_columns(image_path)
    layout = detect_regions(image_path)

    # 2. Dispatch to the first strategy that applies
    if layout['table']:
        return full_table_ocr(image_path, lang)
    if column_count > 1:
        return ocr_multicolumn(image_path, lang)
    if layout['image']:
        return ocr_mixed_layout(image_path, lang)

    # Plain document
    return pytesseract.image_to_string(Image.open(image_path), lang=lang)
9.9 第三方版面分析工具
| 工具 | 特点 | 安装 |
|---|---|---|
| LayoutParser | 深度学习版面分析 | pip install layoutparser |
| Detectron2 | Facebook 检测框架 | 需编译 |
| PaddleDetection | 百度检测 | pip install paddledet |
| YOLOv8 | 通用目标检测 | pip install ultralytics |
# LayoutParser 示例
import layoutparser as lp
def layout_analysis_advanced(image_path, model=None):
    """Run LayoutParser layout detection on a page and print every block.

    Args:
        image_path: Path of the page image.
        model: An already constructed layout model exposing ``detect``. Pass
            one when analysing several pages so the model is not rebuilt on
            every call; when ``None`` the PubLayNet Faster R-CNN model is
            created with a 0.5 score threshold.

    Returns:
        The layout object returned by ``model.detect``.

    Raises:
        OSError: If the image cannot be read or decoded.
    """
    if model is None:
        model = lp.Detectron2LayoutModel(
            config_path='lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5]
        )

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising
        raise OSError(f"cannot read image: {image_path}")
    # OpenCV loads BGR; convert to RGB before detection
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    layout = model.detect(image)
    for block in layout:
        print(f"类型: {block.type}, 置信度: {block.score:.2f}")
        print(f" 位置: {block.block.x_1:.0f}, {block.block.y_1:.0f}, "
              f"{block.block.x_2:.0f}, {block.block.y_2:.0f}")
    return layout
9.10 本章小结
| 要点 | 说明 |
|---|---|
| PSM 选择 | 根据布局选择合适模式 |
| 表格识别 | 检测结构 + 单元格 OCR |
| 多栏处理 | 垂直投影 + 分割 + 逐栏 OCR |
| 图文混排 | 区域检测 + 分类处理 |
| 可视化 | image_to_data + OpenCV 绘制 |
9.11 扩展阅读
上一章: Python 集成 | 下一章: 精度优化