第 11 章 — 目标检测
11.1 模板匹配(Template Matching)
模板匹配是最简单的目标检测方法,在大图中搜索小模板的位置。
import cv2
import numpy as np

img = cv2.imread("scene.jpg", cv2.IMREAD_GRAYSCALE)
template = cv2.imread("target.jpg", cv2.IMREAD_GRAYSCALE)
h, w = template.shape

# Available matching methods:
#   TM_SQDIFF        — squared difference (smaller is better)
#   TM_SQDIFF_NORMED — normalized squared difference
#   TM_CCORR         — cross-correlation (larger is better)
#   TM_CCORR_NORMED  — normalized cross-correlation
#   TM_CCOEFF        — correlation coefficient (larger is better)
#   TM_CCOEFF_NORMED — normalized correlation coefficient (recommended)
result = cv2.matchTemplate(img, template, cv2.TM_CCOEFF_NORMED)

# Best-match location. For TM_SQDIFF(_NORMED) use min_loc;
# for all other methods use max_loc.
min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)
top_left = max_loc
bottom_right = (top_left[0] + w, top_left[1] + h)

# Draw the single best match on a color copy of the scene.
output = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
cv2.rectangle(output, top_left, bottom_right, (0, 255, 0), 2)
print(f"匹配置信度: {max_val:.3f}")

# Multi-target matching: draw every location scoring above the threshold.
# np.where returns (rows, cols); reverse to get (x, y) point order.
threshold = 0.8
locations = np.where(result >= threshold)
for pt in zip(*locations[::-1]):
    cv2.rectangle(output, pt, (pt[0] + w, pt[1] + h), (0, 255, 0), 2)
匹配方法对比
| 方法 | 公式特点 | 值范围 | 最佳值 |
|---|---|---|---|
| TM_SQDIFF | 平方差 | 无界 | 最小值 |
| TM_CCORR | 乘积和 | 无界 | 最大值 |
| TM_CCOEFF | 减均值后乘积 | 无界 | 最大值 |
| TM_SQDIFF_NORMED | 归一化 | 0~1 | 最小值 |
| TM_CCORR_NORMED | 归一化 | 0~1 | 最大值 |
| TM_CCOEFF_NORMED | 归一化 | -1~1 | 最大值(推荐) |
注意: 模板匹配不支持旋转和缩放,模板必须与目标方向、大小一致。
11.2 HOG 行人检测
HOG(Histogram of Oriented Gradients)是经典的行人检测方法。
HOG 流程
图像 → 伽马校正 → 梯度计算 → 分块统计直方图 → 块归一化 → 特征向量 → SVM 分类
import cv2
import numpy as np

# HOG descriptor paired with OpenCV's bundled pedestrian-detection SVM.
hog = cv2.HOGDescriptor()
hog.setSVMDetector(cv2.HOGDescriptor_getDefaultPeopleDetector())

img = cv2.imread("street.jpg")

# Sliding-window detection over an image pyramid.
locations, weights = hog.detectMultiScale(
    img,
    winStride=(8, 8),    # step of the sliding window
    padding=(4, 4),      # padding added around each window
    scale=1.05,          # pyramid scale factor between levels
    hitThreshold=0,      # SVM decision-function threshold
    groupThreshold=2     # rectangle-grouping threshold
)
# 使用 NMS 去重
def non_max_suppression(boxes, scores, threshold=0.3):
    """Greedy non-maximum suppression.

    Args:
        boxes: sequence of corner-format boxes ``[x1, y1, x2, y2]``.
        scores: one confidence score per box.
        threshold: IoU above which a lower-scoring box is suppressed.

    Returns:
        Indices of the surviving boxes, ordered by descending score.
    """
    if len(boxes) == 0:
        return []

    boxes = np.array(boxes)
    scores = np.array(scores)
    x1, y1 = boxes[:, 0], boxes[:, 1]
    x2, y2 = boxes[:, 2], boxes[:, 3]
    # +1 because corner coordinates are treated as pixel-inclusive.
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)

    # Candidates sorted best-first; repeatedly keep the best and drop
    # everything that overlaps it too much.
    remaining = scores.argsort()[::-1]
    keep = []
    while remaining.size > 0:
        best = remaining[0]
        keep.append(best)
        rest = remaining[1:]

        # Intersection of the winner with each remaining candidate.
        ix1 = np.maximum(x1[best], x1[rest])
        iy1 = np.maximum(y1[best], y1[rest])
        ix2 = np.minimum(x2[best], x2[rest])
        iy2 = np.minimum(y2[best], y2[rest])
        inter = np.maximum(0, ix2 - ix1 + 1) * np.maximum(0, iy2 - iy1 + 1)

        iou = inter / (areas[best] + areas[rest] - inter)
        remaining = rest[iou <= threshold]
    return keep
# Draw every detection whose SVM weight clears 0.5.
result = img.copy()
for box, weight in zip(locations, weights):
    score = weight[0]
    if score <= 0.5:
        continue
    x, y, w, h = box
    cv2.rectangle(result, (x, y), (x + w, y + h), (0, 255, 0), 2)
    cv2.putText(result, f"{score:.2f}", (x, y - 5),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
11.3 Haar 级联分类器
Haar 级联分类器用于人脸、眼睛、车辆等物体检测。
import cv2

# Pre-trained cascade classifiers shipped with the OpenCV install.
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
)
eye_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_eye.xml"
)

img = cv2.imread("faces.jpg")
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Face detection on the grayscale image.
faces = face_cascade.detectMultiScale(
    gray,
    scaleFactor=1.1,     # scale step between pyramid levels
    minNeighbors=5,      # larger = stricter (fewer false positives)
    minSize=(30, 30),    # smallest face size to report
    flags=cv2.CASCADE_SCALE_IMAGE
)

result = img.copy()
for (x, y, w, h) in faces:
    cv2.rectangle(result, (x, y), (x + w, y + h), (0, 255, 0), 2)
    # Restrict the eye search to the face region; eye coordinates are
    # relative to that ROI, so draw into the matching color ROI view.
    roi_gray = gray[y:y + h, x:x + w]
    roi_color = result[y:y + h, x:x + w]
    for (ex, ey, ew, eh) in eye_cascade.detectMultiScale(roi_gray, 1.1, 5):
        cv2.rectangle(roi_color, (ex, ey), (ex + ew, ey + eh), (255, 0, 0), 2)

print(f"检测到 {len(faces)} 张人脸")
可用 Haar 模型
| 文件名 | 用途 |
|---|---|
| haarcascade_frontalface_default.xml | 正面人脸 |
| haarcascade_frontalface_alt2.xml | 正面人脸(备选) |
| haarcascade_profileface.xml | 侧脸 |
| haarcascade_eye.xml | 眼睛 |
| haarcascade_smile.xml | 微笑 |
| haarcascade_upperbody.xml | 上半身 |
| haarcascade_fullbody.xml | 全身 |
| haarcascade_car.xml | 车辆 |
11.4 YOLO 集成
通过 OpenCV DNN 模块加载 YOLO 模型进行目标检测。
"""
yolo_detector.py — OpenCV DNN 加载 YOLOv8
"""
import cv2
import numpy as np
class YOLODetector:
    """Object detector running a YOLOv8 ONNX model via OpenCV's DNN module.

    Expects a model exported from Ultralytics YOLOv8 (``yolo export
    format=onnx``). Its raw output has shape ``(1, 4 + num_classes,
    num_anchors)``: 4 box values (cx, cy, w, h in network-input pixels)
    followed directly by per-class scores — YOLOv8 has no separate
    objectness score, so class scores begin at index 4, not 5.
    """

    def __init__(self, model_path, conf_threshold=0.5, nms_threshold=0.4):
        """Load the ONNX model.

        Args:
            model_path: path to the .onnx model file.
            conf_threshold: minimum class score for a candidate box.
            nms_threshold: IoU threshold for non-maximum suppression.
        """
        self.net = cv2.dnn.readNetFromONNX(model_path)
        self.conf_threshold = conf_threshold
        self.nms_threshold = nms_threshold
        # COCO class names (80), index-aligned with the model's class scores.
        self.classes = [
            "person", "bicycle", "car", "motorcycle", "airplane", "bus",
            "train", "truck", "boat", "traffic light", "fire hydrant",
            "stop sign", "parking meter", "bench", "bird", "cat", "dog",
            "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
            "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
            "skis", "snowboard", "sports ball", "kite", "baseball bat",
            "baseball glove", "skateboard", "surfboard", "tennis racket",
            "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
            "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
            "hot dog", "pizza", "donut", "cake", "chair", "couch",
            "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
            "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
            "toaster", "sink", "refrigerator", "book", "clock", "vase",
            "scissors", "teddy bear", "hair drier", "toothbrush"
        ]

    def detect(self, image, input_size=640):
        """Detect objects in a BGR image.

        Args:
            image: BGR image array (as returned by ``cv2.imread``).
            input_size: square network input size in pixels.

        Returns:
            List of dicts with keys ``"bbox"`` ([x, y, w, h] in original
            image pixels), ``"confidence"``, ``"class"`` and ``"class_id"``.
        """
        h, w = image.shape[:2]

        # Preprocess: scale to [0, 1], stretch-resize to the network input,
        # and swap BGR -> RGB.
        blob = cv2.dnn.blobFromImage(
            image, 1 / 255.0, (input_size, input_size),
            swapRB=True, crop=False
        )
        self.net.setInput(blob)
        outputs = self.net.forward(self.net.getUnconnectedOutLayersNames())

        boxes, confidences, class_ids = [], [], []
        for output in outputs:
            # YOLOv8 emits (1, 4 + C, N); transpose so each row is one
            # candidate. The shape check keeps row-layout exports working
            # too (assumes num_candidates > 4 + num_classes — holds for
            # standard 640-input exports with 8400 anchors).
            preds = np.squeeze(output)
            if preds.ndim == 2 and preds.shape[0] < preds.shape[1]:
                preds = preds.T
            for detection in preds:
                scores = detection[4:]  # class scores follow the 4 box values
                class_id = int(np.argmax(scores))
                confidence = float(scores[class_id])
                if confidence > self.conf_threshold:
                    # Center-format box in network coordinates -> top-left
                    # corner box in original-image pixels. blobFromImage
                    # stretch-resizes, so x and y axes scale independently.
                    cx, cy, bw, bh = detection[:4]
                    x = int((cx - bw / 2) * w / input_size)
                    y = int((cy - bh / 2) * h / input_size)
                    w_box = int(bw * w / input_size)
                    h_box = int(bh * h / input_size)
                    boxes.append([x, y, w_box, h_box])
                    confidences.append(confidence)
                    class_ids.append(class_id)

        # Class-agnostic non-maximum suppression on [x, y, w, h] boxes.
        indices = cv2.dnn.NMSBoxes(
            boxes, confidences,
            self.conf_threshold, self.nms_threshold
        )

        results = []
        if len(indices) > 0:
            for i in indices.flatten():
                results.append({
                    "bbox": boxes[i],
                    "confidence": confidences[i],
                    "class": self.classes[class_ids[i]],
                    "class_id": class_ids[i]
                })
        return results
# 使用
# detector = YOLODetector("yolov8n.onnx")
# img = cv2.imread("street.jpg")
# detections = detector.detect(img)
# for det in detections:
# x, y, w, h = det["bbox"]
# cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2)
# label = f"{det['class']} {det['confidence']:.2f}"
# cv2.putText(img, label, (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
11.5 检测方法选型
| 方法 | 速度 | 精度 | 适用场景 |
|---|---|---|---|
| 模板匹配 | ★★★★★ | ★★☆ | 固定目标、工业定位 |
| Haar 级联 | ★★★★☆ | ★★★ | 人脸/眼睛快速检测 |
| HOG+SVM | ★★★★☆ | ★★★ | 行人检测 |
| YOLO/DNN | ★★★☆☆ | ★★★★★ | 通用目标检测 |
| SSD/MobileNet | ★★★★☆ | ★★★★ | 实时目标检测 |
11.6 扩展阅读
| 资源 | 链接 | 说明 |
|---|---|---|
| YOLOv8 官方 | github.com/ultralytics | 导出 ONNX |
| OpenCV DNN | docs.opencv.org/4.x/d2/d58/tutorial_table_of_content_dnn | DNN 教程 |
| 下一章 | 第 12 章 — 视频处理 | 读取/写入/追踪 |
本章小结: 掌握了模板匹配、HOG 行人检测、Haar 级联分类器和 YOLO 集成四种目标检测方法,能够根据不同场景选择合适的方案。