第 11 章:Docker 部署
第 11 章:Docker 部署
使用 Docker 容器化部署 Tesseract OCR 服务。
11.1 Docker 部署优势
| 优势 | 说明 |
|---|---|
| 环境一致 | 避免"在我机器上能跑"问题 |
| 快速部署 | 一条命令启动 |
| 易于扩展 | 容器编排、负载均衡 |
| 隔离性好 | 不影响宿主环境 |
| 版本管理 | 镜像版本控制 |
11.2 基础 Docker 镜像
11.2.1 使用现成的社区镜像
# Pull the image
docker pull tesseractshadow/tesseract4
# Basic usage (the quotes keep the bind mount intact when the current path contains spaces)
docker run --rm -v "$(pwd)":/data tesseractshadow/tesseract4 \
    tesseract /data/image.png /data/output -l chi_sim+eng
# List the available languages
docker run --rm tesseractshadow/tesseract4 tesseract --list-langs
11.2.2 自定义 Dockerfile
# Dockerfile
# Image for the Flask OCR service: Tesseract from apt plus the Python packages, served by gunicorn.
FROM python:3.11-slim
# Install system dependencies: the Tesseract engine, four language packs
# (Simplified/Traditional Chinese, English, Japanese) and the development headers
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    tesseract-ocr-chi-sim \
    tesseract-ocr-chi-tra \
    tesseract-ocr-eng \
    tesseract-ocr-jpn \
    libtesseract-dev \
    libleptonica-dev \
    && rm -rf /var/lib/apt/lists/*
# Set the TESSDATA_PREFIX environment variable
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
# Install Python dependencies
RUN pip install --no-cache-dir \
    pytesseract \
    Pillow \
    opencv-python-headless \
    flask \
    gunicorn
# Create the working directory
WORKDIR /app
# Copy the application code
COPY . /app/
# Expose the service port
EXPOSE 5000
# Start command: gunicorn with 4 workers serving the `app` object from app.py
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "4", "app:app"]
# Build the image
docker build -t tesseract-ocr .
# Run the container (detached, host port 5000 mapped to container port 5000)
docker run -d -p 5000:5000 --name ocr-service tesseract-ocr
11.2.3 多阶段构建(优化镜像大小)
# Multi-stage build.
# Stage 1 (builder): install the Python packages into a virtualenv so that only
# the finished environment is carried over to the runtime image.
FROM python:3.11-slim AS builder
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN pip install --no-cache-dir pytesseract Pillow opencv-python-headless flask gunicorn

# Stage 2 (runtime).
FROM python:3.11-slim
# Install Tesseract with apt in this stage instead of copying /usr/bin/tesseract
# and libtesseract*/liblept* out of another stage: the binary links against
# further shared libraries (image codecs and others) that apt resolves
# automatically, and a hard-coded /usr/lib/x86_64-linux-gnu path exists only on
# amd64 images. --no-install-recommends keeps the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr \
    tesseract-ocr-chi-sim \
    tesseract-ocr-eng \
    && rm -rf /var/lib/apt/lists/*
# Bring over the ready-made Python environment from the builder stage
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
WORKDIR /app
COPY . /app/
EXPOSE 5000
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "4", "app:app"]
11.3 REST API 服务
11.3.1 Flask API
# app.py
from flask import Flask, request, jsonify
import pytesseract
from PIL import Image
import io
import base64
app = Flask(__name__)
@app.route('/ocr', methods=['POST'])
def ocr():
    """Run OCR on a submitted image and return the recognised text as JSON.

    Form fields:
        lang: Tesseract language string (default ``chi_sim+eng``).
        psm: page segmentation mode, an integer from 0 to 13 (default ``6``).
        image: the picture as a multipart file upload, or
        base64: the picture as base64-encoded bytes.

    Returns:
        200 with ``text``, ``lang`` and ``psm``; 400 when no image is supplied
        or ``psm`` is not a valid mode; 500 with the error message on any
        other failure.
    """
    try:
        # Read the request parameters
        lang = request.form.get('lang', 'chi_sim+eng')
        psm = request.form.get('psm', '6')
        # psm is interpolated into Tesseract's option string below, so accept
        # nothing but a plain ASCII integer inside the documented 0-13 range.
        if not (psm.isascii() and psm.isdigit() and 0 <= int(psm) <= 13):
            return jsonify({'error': '无效的 psm 参数(应为 0-13 的整数)'}), 400

        # Load the image; uploads live in request.files (plural)
        if 'image' in request.files:
            img = Image.open(request.files['image'].stream)
        elif 'base64' in request.form:
            img_data = base64.b64decode(request.form['base64'])
            img = Image.open(io.BytesIO(img_data))
        else:
            return jsonify({'error': '请提供图片'}), 400

        # OCR
        config = f'--psm {psm}'
        text = pytesseract.image_to_string(img, lang=lang, config=config)
        return jsonify({
            'text': text.strip(),
            'lang': lang,
            'psm': psm
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/ocr/detailed', methods=['POST'])
def ocr_detailed():
    """Detailed OCR endpoint: return each recognised word with confidence and bounding box.

    Form fields:
        lang: Tesseract language string (default ``chi_sim+eng``).
        image: the picture as a multipart file upload (required).

    Returns:
        200 with ``results``, a list of ``{text, confidence, bbox}`` entries
        for every non-empty word whose confidence is above 0; 400 when no
        image is supplied; 500 with the error message on any other failure.
    """
    try:
        lang = request.form.get('lang', 'chi_sim+eng')
        if 'image' not in request.files:
            return jsonify({'error': '请提供图片'}), 400
        img = Image.open(request.files['image'].stream)
        data = pytesseract.image_to_data(img, lang=lang,
                                         output_type=pytesseract.Output.DICT)
        results = []
        for i, raw_text in enumerate(data['text']):
            # Depending on the Tesseract/pytesseract versions, conf can arrive
            # as a decimal string such as '96.06', which int() alone rejects.
            confidence = int(float(data['conf'][i]))
            word = raw_text.strip()
            if confidence > 0 and word:
                results.append({
                    'text': word,
                    'confidence': confidence,
                    'bbox': {
                        'left': data['left'][i],
                        'top': data['top'][i],
                        'width': data['width'][i],
                        'height': data['height'][i]
                    }
                })
        return jsonify({'results': results})
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/health', methods=['GET'])
def health():
    """Health check: report status, the Tesseract version and the languages pytesseract lists."""
    version_text = str(pytesseract.get_tesseract_version())
    language_list = pytesseract.get_languages()
    payload = {
        'status': 'ok',
        'version': version_text,
        'languages': language_list,
    }
    return jsonify(payload)
# When the file is executed directly, start Flask's built-in server on all interfaces, port 5000.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
11.3.2 FastAPI API(推荐)
# app_fastapi.py
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse
import pytesseract
from PIL import Image
import io
app = FastAPI(title="Tesseract OCR API")
@app.post("/ocr")
async def ocr(
    image: UploadFile = File(...),
    lang: str = Form("chi_sim+eng"),
    psm: int = Form(6)
):
    """OCR endpoint: recognise the text in one uploaded image.

    Args:
        image: the uploaded picture.
        lang: Tesseract language string.
        psm: page segmentation mode passed to Tesseract as ``--psm``.

    Returns:
        A dict with the stripped ``text`` and the ``lang`` that was used.
    """
    # Imported here so this snippet stays self-contained.
    from fastapi.concurrency import run_in_threadpool

    contents = await image.read()
    img = Image.open(io.BytesIO(contents))
    config = f'--psm {psm}'
    # image_to_string blocks until Tesseract finishes; run it in the thread
    # pool so the event loop can keep serving other requests meanwhile.
    text = await run_in_threadpool(
        pytesseract.image_to_string, img, lang=lang, config=config
    )
    return {"text": text.strip(), "lang": lang}
@app.post("/ocr/batch")
async def ocr_batch(
    images: list[UploadFile] = File(...),
    lang: str = Form("chi_sim+eng")
):
    """Batch OCR endpoint: recognise the text in several uploaded images.

    Args:
        images: the uploaded pictures, processed in the order received.
        lang: Tesseract language string applied to every image.

    Returns:
        A dict whose ``results`` list holds one ``{filename, text}`` entry per image.
    """
    # Imported here so this snippet stays self-contained.
    from fastapi.concurrency import run_in_threadpool

    results = []
    for img_file in images:
        contents = await img_file.read()
        img = Image.open(io.BytesIO(contents))
        # Each OCR call blocks; hand it to the thread pool so the event loop
        # stays responsive while a large batch is being processed.
        text = await run_in_threadpool(
            pytesseract.image_to_string, img, lang=lang
        )
        results.append({
            "filename": img_file.filename,
            "text": text.strip()
        })
    return {"results": results}
@app.get("/health")
async def health():
    """Health check: report status, the Tesseract version and the languages pytesseract lists."""
    version_text = str(pytesseract.get_tesseract_version())
    language_list = pytesseract.get_languages()
    return {"status": "ok", "version": version_text, "languages": language_list}
# FastAPI Dockerfile
FROM python:3.11-slim
# Tesseract engine plus the Simplified Chinese and English language packs
RUN apt-get update && apt-get install -y \
    tesseract-ocr tesseract-ocr-chi-sim tesseract-ocr-eng \
    && rm -rf /var/lib/apt/lists/*
# Python dependencies of app_fastapi.py
RUN pip install --no-cache-dir fastapi uvicorn pytesseract Pillow python-multipart
WORKDIR /app
COPY app_fastapi.py /app/
# Serve the `app` object from app_fastapi.py with uvicorn on port 8000
CMD ["uvicorn", "app_fastapi:app", "--host", "0.0.0.0", "--port", "8000"]
11.3.3 API 调用示例
# Basic OCR ("image=@test.png" uploads the local file test.png as the form field "image")
curl -X POST http://localhost:5000/ocr \
    -F "image=@test.png" \
    -F "lang=chi_sim+eng"
# Detailed result
curl -X POST http://localhost:5000/ocr/detailed \
    -F "image=@test.png" \
    -F "lang=chi_sim+eng"
# Calling the API from Python
import requests

with open('test.png', 'rb') as f:
    response = requests.post(
        'http://localhost:5000/ocr',
        files={'image': f},
        data={'lang': 'chi_sim+eng'},
        timeout=60,  # requests waits indefinitely unless a timeout is given
    )
# Error responses carry 'error' rather than 'text', so fail on 4xx/5xx
# instead of hitting a KeyError on the next lines.
response.raise_for_status()
result = response.json()
print(result['text'])
11.4 Docker Compose
11.4.1 完整配置
# docker-compose.yml
version: '3.8'
services:
  ocr:
    build: .
    ports:
      - "5000:5000"
    volumes:
      - ./data:/data
    environment:
      - TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
      - WORKERS=4
    restart: unless-stopped
    healthcheck:
      # The python:3.11-slim base image does not ship curl, so probe /health
      # with the Python interpreter that is already in the image. urlopen
      # raises on connection errors and HTTP error statuses, which makes the
      # command exit non-zero.
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health', timeout=5)"]
      interval: 30s
      timeout: 10s
      retries: 3
  # Redis cache (optional)
  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
  # Nginx reverse proxy (optional)
  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - ocr
volumes:
  redis_data:
# Start the services in the background
docker-compose up -d
# Follow the logs of the ocr service
docker-compose logs -f ocr
# Stop and remove the services
docker-compose down
11.5 批量处理容器
11.5.1 批量处理脚本
# batch_process.py
import os
import pytesseract
from PIL import Image
import json
def batch_ocr(input_dir, output_dir, lang='chi_sim+eng'):
    """OCR every image in *input_dir* and write the text files to *output_dir*.

    Files are visited in sorted name order. For each image a ``<stem>.txt``
    file with the recognised text is written, and a ``report.json`` listing
    the outcome of every image is saved next to them.

    Args:
        input_dir: directory that is scanned (non-recursively) for images.
        output_dir: directory for the text files and the report; created if missing.
        lang: Tesseract language string.

    Returns:
        A list with one dict per image: ``{'file', 'status': 'success', 'chars'}``
        or ``{'file', 'status': 'error', 'error'}``.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.tif', '.tiff', '.bmp')
    os.makedirs(output_dir, exist_ok=True)
    results = []
    for filename in sorted(os.listdir(input_dir)):
        # Compare in lower case so names such as SCAN.PNG are not skipped.
        if not filename.lower().endswith(image_suffixes):
            continue
        input_path = os.path.join(input_dir, filename)
        try:
            # The context manager releases the file handle even when OCR fails.
            with Image.open(input_path) as img:
                text = pytesseract.image_to_string(img, lang=lang)
            # Save the recognised text
            output_path = os.path.join(
                output_dir,
                filename.rsplit('.', 1)[0] + '.txt'
            )
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(text)
            results.append({
                'file': filename,
                'status': 'success',
                'chars': len(text.strip())
            })
            print(f"✓ {filename}: {len(text.strip())} 字符")
        except Exception as e:
            # Best effort: record the failure and continue with the next file.
            results.append({
                'file': filename,
                'status': 'error',
                'error': str(e)
            })
            print(f"✗ {filename}: {e}")
    # Save the processing report
    report_path = os.path.join(output_dir, 'report.json')
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    return results
if __name__ == '__main__':
    import sys
    # Optional command-line arguments: input directory first, output directory
    # second; /data/input and /data/output are used when they are omitted.
    input_dir = sys.argv[1] if len(sys.argv) > 1 else '/data/input'
    output_dir = sys.argv[2] if len(sys.argv) > 2 else '/data/output'
    batch_ocr(input_dir, output_dir)
11.5.2 批量处理 Dockerfile
# Image for the batch job: Tesseract, two language packs and batch_process.py
FROM python:3.11-slim
RUN apt-get update && apt-get install -y \
    tesseract-ocr tesseract-ocr-chi-sim tesseract-ocr-eng \
    && rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir pytesseract Pillow
WORKDIR /app
COPY batch_process.py /app/
# ENTRYPOINT form: arguments given to `docker run` are appended to this command
ENTRYPOINT ["python", "batch_process.py"]
# Build
docker build -t ocr-batch -f Dockerfile.batch .
# Run the batch job (the quotes keep the bind mounts intact when the current path contains spaces)
docker run --rm \
    -v "$(pwd)/input":/data/input \
    -v "$(pwd)/output":/data/output \
    ocr-batch /data/input /data/output
11.6 性能优化
11.6.1 资源限制
# docker-compose.yml
services:
  ocr:
    build: .
    deploy:
      resources:
        # Upper bounds for the ocr container
        limits:
          cpus: '2'
          memory: 2G
        # Amounts reserved for the ocr container
        reservations:
          cpus: '0.5'
          memory: 512M
11.6.2 Worker 配置
# Derive the number of workers from the CPU core count
import multiprocessing
workers = multiprocessing.cpu_count() * 2 + 1
# Launching gunicorn:
# gunicorn --bind 0.0.0.0:5000 --workers $workers app:app
11.6.3 缓存策略
import hashlib
import redis
import json
class OCRCache:
    """Redis-backed cache for OCR results, keyed by image content and language."""

    def __init__(self, redis_host='redis', redis_port=6379):
        """Connect to Redis and set the entry lifetime."""
        self.redis = redis.Redis(host=redis_host, port=redis_port)
        self.ttl = 3600 * 24  # 24 hours, in seconds

    def get_cache_key(self, image_bytes, lang):
        """Return ``ocr:<md5 hex>`` hashed over the image bytes followed by the encoded language."""
        digest = hashlib.md5(image_bytes + lang.encode())
        return "ocr:" + digest.hexdigest()

    def get(self, image_bytes, lang):
        """Return the cached result decoded from JSON, or None when nothing is stored."""
        cached = self.redis.get(self.get_cache_key(image_bytes, lang))
        return json.loads(cached) if cached else None

    def set(self, image_bytes, lang, result):
        """Store *result* as JSON under the image/language key with the configured TTL."""
        cache_key = self.get_cache_key(image_bytes, lang)
        payload = json.dumps(result)
        self.redis.setex(cache_key, self.ttl, payload)
11.7 监控与日志
11.7.1 日志配置
import logging
# Configure logging at INFO level with timestamp, logger name, level and message
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Logger used by the OCR endpoint below
logger = logging.getLogger('ocr')

@app.route('/ocr', methods=['POST'])
def ocr():
    # Log the requested language before processing
    logger.info(f"OCR 请求: lang={request.form.get('lang')}")
    # ... (OCR processing is elided in this excerpt; `text` is presumably produced here)
    # Log the length of the recognised text
    logger.info(f"OCR 完成: {len(text)} 字符")
11.7.2 Prometheus 指标
from prometheus_client import Counter, Histogram, generate_latest
import time
REQUEST_COUNT = Counter('ocr_requests_total', 'Total OCR requests')
REQUEST_LATENCY = Histogram('ocr_request_latency_seconds', 'OCR request latency')
@app.route('/metrics')
def metrics():
    """Expose the collected metrics for Prometheus to scrape."""
    # Imported here so this snippet stays self-contained.
    from prometheus_client import CONTENT_TYPE_LATEST

    # Without an explicit Content-Type Flask labels the payload text/html;
    # send the exposition-format type that scrapers expect.
    return generate_latest(), 200, {'Content-Type': CONTENT_TYPE_LATEST}
@app.route('/ocr', methods=['POST'])
def ocr():
    # Count every incoming OCR request
    REQUEST_COUNT.inc()
    start = time.time()
    # ... OCR processing (elided in this excerpt) ...
    # Record the elapsed wall-clock time in the latency histogram
    REQUEST_LATENCY.observe(time.time() - start)
11.8 安全考虑
11.8.1 文件大小限制
# Cap the size of incoming request bodies
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB

@app.errorhandler(413)
def too_large(e):
    """Answer HTTP 413 errors with a JSON error body."""
    return jsonify({'error': '文件太大'}), 413
11.8.2 文件类型验证
# File extensions (lower case, without the dot) accepted for upload
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'tif', 'tiff', 'bmp'}


def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The comparison is case-insensitive and looks only at the text after the
    last dot. A missing or empty filename is rejected rather than raising
    TypeError.
    """
    if not filename or '.' not in filename:
        return False
    return filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
11.9 本章小结
| 要点 | 说明 |
|---|---|
| 推荐框架 | FastAPI(异步、高性能) |
| 部署方式 | Docker + Docker Compose |
| 批量处理 | 独立容器 + 卷挂载 |
| 性能优化 | Worker 数、资源限制、缓存 |
| 监控 | 日志 + Prometheus 指标 |
11.10 扩展阅读
上一章: 精度优化 | 下一章: 最佳实践