12 - Moderation API
Chapter 12 · Moderation API (Content Moderation)
The Moderation API detects harmful content in text and images, making it an essential building block for safe AI applications. This chapter covers the moderation categories, integration patterns, and custom policies.
12.1 API Overview
Available Models
| Model | Model ID | Supported Input | Pricing |
|---|---|---|---|
| Omni Moderation | omni-moderation-latest | Text + images | Free |
| Text Moderation | text-moderation-latest | Text only | Free |
Good news: the Moderation API is completely free, so you can use it at scale without worrying about cost. Note that rate limits still apply (see 12.9), and omni-moderation-latest is the current recommended model, with text-moderation-latest kept as the older text-only option.
Moderation Categories
| Category | Key | Description |
|---|---|---|
| Hate speech | hate | Discrimination based on race, gender, etc. |
| Hateful threats | hate/threatening | Hate speech that includes threats of violence |
| Self-harm | self-harm | Suicidal or self-harm intent |
| Sexual content | sexual | Sexually explicit descriptions |
| Minors | sexual/minors | Sexual content involving minors |
| Violence | violence | Descriptions of violent acts |
| Graphic violence | violence/graphic | Extremely graphic depictions of violence |
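For reference, this is roughly the shape of a single result once it is dumped to a dict, trimmed to a few categories; the omni model also reports categories not listed above, such as harassment and illicit. Note that the Python SDK exposes category names with underscores (for example self_harm, hate_threatening), while the raw JSON uses the slash/dash keys from the table.

```python
# Illustrative shape of one moderation result (values are made up, not real API output)
{
    "flagged": True,
    "categories": {                 # boolean flag per category
        "hate": False,
        "violence": True,
        "self_harm": False,
    },
    "category_scores": {            # confidence score per category, between 0 and 1
        "hate": 0.0012,
        "violence": 0.91,
        "self_harm": 0.0001,
    },
}
```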
12.2 Basic Usage
Text Moderation
```python
from openai import OpenAI

client = OpenAI()

def moderate_text(text: str) -> dict:
    """Moderate a piece of text."""
    response = client.moderations.create(
        model="omni-moderation-latest",
        input=text,
    )
    result = response.results[0]
    return {
        "flagged": result.flagged,                      # True if any category was flagged
        "categories": result.categories.model_dump(),   # per-category boolean flags
        "scores": result.category_scores.model_dump(),  # per-category scores in [0, 1]
    }

# Quick test
texts = [
    "The weather is lovely today, perfect for a walk.",
    "I absolutely hate you, you idiot.",
    "This product is fantastic, highly recommended!",
]
for text in texts:
    result = moderate_text(text)
    print(f"Text: {text[:30]}...")
    print(f"  Flagged: {result['flagged']}")
    if result['flagged']:
        flagged = [k for k, v in result['categories'].items() if v]
        print(f"  Categories: {flagged}")
    print()
```
Batch Moderation
```python
def moderate_batch(texts: list[str]) -> list[dict]:
    """Moderate a batch of texts (up to 1,000 items per request)."""
    response = client.moderations.create(
        model="omni-moderation-latest",
        input=texts,
    )
    return [
        {
            "flagged": result.flagged,
            "categories": result.categories.model_dump(),
            "scores": result.category_scores.model_dump(),
        }
        for result in response.results
    ]
```
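Results come back in the same order as the inputs, so they can be paired up directly:

```python
verdicts = moderate_batch(texts)  # reuses the texts list from the previous example
for text, verdict in zip(texts, verdicts):
    print(f"{text[:30]}... -> flagged={verdict['flagged']}")
```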
12.3 Image Moderation
```python
import base64

def moderate_image(image_path: str) -> dict:
    """Moderate an image by sending it as a base64 data URL."""
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    response = client.moderations.create(
        model="omni-moderation-latest",
        input=[{
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{b64}"
            }
        }],
    )
    result = response.results[0]
    return {
        "flagged": result.flagged,
        "categories": result.categories.model_dump(),
        "scores": result.category_scores.model_dump(),
    }
```
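The omni model also accepts mixed inputs, so a caption and its image can be checked in one request. A minimal sketch, where moderate_caption_and_image is a hypothetical helper and image_url points at any publicly reachable image:

```python
def moderate_caption_and_image(caption: str, image_url: str) -> bool:
    """Moderate a text caption and an image together in a single request."""
    response = client.moderations.create(
        model="omni-moderation-latest",
        input=[
            {"type": "text", "text": caption},
            {"type": "image_url", "image_url": {"url": image_url}},
        ],
    )
    return response.results[0].flagged
```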
12.4 Integrating Moderation into the Chat Flow
Pre-request Moderation
```python
def safe_chat(user_input: str) -> str:
    """Chat with content moderation on both the user input and the reply."""
    # Step 1: moderate the user input
    mod_result = moderate_text(user_input)
    if mod_result["flagged"]:
        flagged = [k for k, v in mod_result["categories"].items() if v]
        return f"Your message contains disallowed content ({', '.join(flagged)}). Please revise it and try again."

    # Step 2: make the normal Chat call
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a friendly assistant."},
            {"role": "user", "content": user_input},
        ],
    )
    reply = response.choices[0].message.content

    # Step 3: moderate the AI reply
    reply_mod = moderate_text(reply)
    if reply_mod["flagged"]:
        return "Sorry, I can't provide that response. Please try a different topic."
    return reply
```
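A quick sanity check (the exact wording of the assistant's reply will vary):

```python
print(safe_chat("Recommend a book about astronomy."))
print(safe_chat("I absolutely hate you, you idiot."))  # likely rejected by the input check
```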
12.5 Custom Classification Policies
Multi-layer Moderation Architecture
```python
class ContentModerator:
    """Multi-layer content moderation pipeline."""

    def __init__(self):
        self.client = OpenAI()

    def moderate(self, text: str) -> dict:
        """Run all moderation layers against a piece of text."""
        result = {
            "passed": True,
            "issues": [],
            "risk_level": "low",
        }

        # Layer 1: keyword filter (fast, local)
        keyword_issues = self._keyword_check(text)
        if keyword_issues:
            result["issues"].extend(keyword_issues)
            result["risk_level"] = "high"
            result["passed"] = False
            return result

        # Layer 2: OpenAI Moderation API
        mod_result = self._openai_moderation(text)
        if mod_result["flagged"]:
            result["issues"].extend(mod_result["flagged_categories"])
            result["risk_level"] = "high"
            result["passed"] = False
            return result

        # Layer 3: custom rules
        custom_issues = self._custom_rules(text)
        if custom_issues:
            result["issues"].extend(custom_issues)
            result["risk_level"] = "medium"
        return result

    def _keyword_check(self, text: str) -> list[str]:
        """Keyword filter (fast local check)."""
        blocked_words = ["blocked_word_1", "blocked_word_2"]  # replace with your own list
        issues = []
        for word in blocked_words:
            if word in text:
                issues.append(f"Contains blocked word: {word}")
        return issues

    def _openai_moderation(self, text: str) -> dict:
        """Call the OpenAI Moderation API."""
        response = self.client.moderations.create(
            model="omni-moderation-latest",
            input=text,
        )
        result = response.results[0]
        flagged = [
            cat for cat, is_flagged in result.categories.model_dump().items()
            if is_flagged
        ]
        return {"flagged": result.flagged, "flagged_categories": flagged}

    def _custom_rules(self, text: str) -> list[str]:
        """Custom rules specific to your application."""
        issues = []
        if len(text) > 5000:
            issues.append("Text too long")
        # Add more custom rules here...
        return issues
```
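A quick usage sketch of the multi-layer pipeline:

```python
moderator = ContentModerator()

for sample in ["Nice weather today.", "I am going to hurt you."]:
    verdict = moderator.moderate(sample)
    status = "passed" if verdict["passed"] else f"blocked ({', '.join(verdict['issues'])})"
    print(f"{sample!r}: {status}")
```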
12.6 Threshold Tuning
Custom Threshold Configuration
```python
MODERATION_THRESHOLDS = {
    "hate": 0.5,
    "hate/threatening": 0.3,   # lower threshold for threats
    "self-harm": 0.3,          # lower threshold for self-harm
    "sexual": 0.5,
    "sexual/minors": 0.1,      # extremely strict for anything involving minors
    "violence": 0.5,
    "violence/graphic": 0.3,
}

def moderate_with_thresholds(text: str, thresholds: dict = None) -> dict:
    """Moderate text using custom per-category score thresholds."""
    if thresholds is None:
        thresholds = MODERATION_THRESHOLDS
    response = client.moderations.create(
        model="omni-moderation-latest",
        input=text,
    )
    result = response.results[0]
    flagged_categories = {}
    for cat, threshold in thresholds.items():
        # SDK attribute names use underscores, e.g. "hate/threatening" -> hate_threatening
        # and "self-harm" -> self_harm
        attr = cat.replace("/", "_").replace("-", "_")
        score = getattr(result.category_scores, attr, 0)
        if score >= threshold:
            flagged_categories[cat] = {
                "score": score,
                "threshold": threshold,
            }
    return {
        "passed": len(flagged_categories) == 0,
        "flagged_categories": flagged_categories,
        "all_scores": result.category_scores.model_dump(),
    }
```
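Stricter products can simply override a few entries. For example, a kids-focused app might tighten the violence and sexual thresholds (the numbers below are illustrative, not recommendations):

```python
strict_thresholds = {**MODERATION_THRESHOLDS, "violence": 0.2, "sexual": 0.1}

check = moderate_with_thresholds("Some borderline user message", strict_thresholds)
if not check["passed"]:
    print("Blocked categories:", list(check["flagged_categories"]))
```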
12.7 Audit Logging
```python
import hashlib
import json
from datetime import datetime, timezone

class ModerationLogger:
    """Append-only log of moderation decisions."""

    def __init__(self, log_file: str = "moderation.log"):
        self.log_file = log_file

    def log(self, user_id: str, text: str, result: dict):
        """Record one moderation decision."""
        entry = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "user_id": user_id,
            # Store a stable hash instead of the original text to protect privacy
            # (the built-in hash() is randomized per process, so use SHA-256)
            "text_hash": hashlib.sha256(text.encode("utf-8")).hexdigest(),
            "text_length": len(text),
            "passed": result.get("passed", True),
            "risk_level": result.get("risk_level", "low"),
            "issues": result.get("issues", []),
        }
        with open(self.log_file, "a") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
```
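Wiring the logger into the multi-layer moderator from 12.5 might look like this; moderated_submit is a hypothetical helper and user_id is whatever identifier your application already tracks:

```python
logger = ModerationLogger()
moderator = ContentModerator()

def moderated_submit(user_id: str, text: str) -> bool:
    """Moderate a submission, log the outcome, and report whether it passed."""
    verdict = moderator.moderate(text)
    logger.log(user_id, text, verdict)
    return verdict["passed"]
```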
12.8 Business Scenarios
| Scenario | Strategy | Notes |
|---|---|---|
| User-generated content (UGC) | Moderate before the request | Filter content before user submissions are accepted |
| AI output review | Moderate after the response | Ensure the AI does not produce harmful content |
| API gateway | Global moderation | Filter uniformly as middleware |
| Comment systems | Asynchronous moderation | Review in the background without blocking submission |
| Sensitive scenarios | Human re-review | High-risk content requires manual confirmation |
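For the asynchronous strategies (comment systems, background review), the SDK's async client keeps moderation off the submission path. A minimal sketch, assuming the comment is stored first and hidden later if it gets flagged; store_comment and hide_comment are placeholders for your own persistence layer:

```python
import asyncio
from openai import AsyncOpenAI

async_client = AsyncOpenAI()

async def review_comment(comment_id: str, text: str) -> None:
    """Background task: moderate a stored comment and hide it if flagged."""
    response = await async_client.moderations.create(
        model="omni-moderation-latest",
        input=text,
    )
    if response.results[0].flagged:
        await hide_comment(comment_id)  # placeholder: your persistence call

async def submit_comment(comment_id: str, text: str) -> None:
    """Accept the comment immediately, then review it in the background."""
    await store_comment(comment_id, text)  # placeholder: your persistence call
    asyncio.create_task(review_comment(comment_id, text))
```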
12.9 Caveats
- Free, but rate-limited: the API costs nothing, yet rate limits still apply
- Not a silver bullet: it cannot replace human review; add a manual re-review step for high-risk scenarios
- False positives: allow users to appeal and build an appeals workflow
- Privacy: do not store the original text in moderation logs
- Multilingual support: coverage for Chinese is weaker than for English, so combine it with local keyword filtering
- Latency: moderation adds latency; in production, handle it asynchronously (as sketched in 12.8)
12.10 Further Reading
Next chapter: 13 - RAG System Implementation, covering document processing, chunking strategies, and retrieval optimization.