07 - Embeddings API
第 07 章 · Embeddings API(向量嵌入)
Embeddings 将文本转化为高维向量,是语义搜索、推荐系统和 RAG 的基础。本章详解 Embeddings API 的使用、向量搜索实现和 RAG 基础。
7.1 什么是 Embedding
Embedding 是文本的"数学表示"——将一段文字映射为一个高维浮点数向量,语义相似的文本在向量空间中距离更近。
"猫" → [0.12, -0.34, 0.56, ..., 0.78] (1536维)
"小猫" → [0.11, -0.33, 0.55, ..., 0.77] (非常接近)
"汽车" → [-0.45, 0.67, -0.12, ..., 0.23] (距离较远)
可用模型
| 模型 | 维度 | 单价 | 特点 |
|---|---|---|---|
text-embedding-3-small | 1536 | $0.02/1M tokens | 高性价比 |
text-embedding-3-large | 3072 | $0.13/1M tokens | 高精度 |
维度缩减
# The text-embedding-3 family supports a `dimensions` parameter to shrink the vector
response = client.embeddings.create(
    model="text-embedding-3-large",
    input="你好世界",
    dimensions=1024,  # reduced from 3072 to 1024 while keeping good quality
)
7.2 基础用法
7.2.1 单文本嵌入
from openai import OpenAI

client = OpenAI()

response = client.embeddings.create(
    model="text-embedding-3-small",
    input="人工智能正在改变世界",
)

# One input -> one item in response.data; the vector is a plain list of floats
embedding = response.data[0].embedding
print(f"维度: {len(embedding)}")  # 1536
print(f"前5个值: {embedding[:5]}")  # [0.0023, -0.0098, ...]
print(f"Token 用量: {response.usage.total_tokens}")
7.2.2 批量嵌入
texts = [
    "机器学习是人工智能的一个分支",
    "深度学习使用多层神经网络",
    "自然语言处理处理人类语言",
    "今天天气真不错",
]

response = client.embeddings.create(
    model="text-embedding-3-small",
    input=texts,  # at most 2048 items per request
)

# Results come back in the same order as the inputs
embeddings = [item.embedding for item in response.data]
print(f"生成了 {len(embeddings)} 个向量")
7.3 相似度计算
余弦相似度 (Cosine Similarity)
import numpy as np
def cosine_similarity(vec_a: list[float], vec_b: list[float]) -> float:
    """Return the cosine similarity between two vectors."""
    u, v = np.asarray(vec_a), np.asarray(vec_b)
    # dot(u, v) / (|u| * |v|)
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return np.dot(u, v) / denom
# Compare similarities between a few words
texts = ["猫", "小猫", "狗", "汽车"]
response = client.embeddings.create(
    model="text-embedding-3-small",
    input=texts,
)
vectors = {text: item.embedding for text, item in zip(texts, response.data)}

# Build and print the pairwise similarity matrix
print("相似度矩阵:")
for t1 in texts:
    for t2 in texts:
        sim = cosine_similarity(vectors[t1], vectors[t2])
        print(f" {t1} ↔ {t2}: {sim:.4f}")
输出示例:
猫 ↔ 小猫: 0.9321 # 非常相似
猫 ↔ 狗: 0.8567 # 较相似(都是动物)
猫 ↔ 汽车: 0.2134 # 不相似
批量相似度计算
def batch_similarity(query_vec: list[float], doc_vecs: list[list[float]]) -> list[float]:
    """Cosine similarity of one query vector against every document vector."""
    matrix = np.array(doc_vecs)
    query = np.array(query_vec)
    # One matrix-vector product instead of a Python loop over documents.
    scale = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    return matrix @ query / scale
7.4 语义搜索引擎
import numpy as np
from openai import OpenAI
from dataclasses import dataclass
@dataclass
class Document:
id: int
text: str
metadata: dict
embedding: list[float] = None
class SemanticSearchEngine:
    """In-memory semantic search engine backed by OpenAI embeddings.

    Documents are embedded once on insertion; each query is embedded per
    search and ranked by cosine similarity against all stored documents.
    """

    def __init__(self, model: str = "text-embedding-3-small"):
        self.client = OpenAI()
        self.model = model
        self.documents: list[Document] = []
        # Row i of the matrix is the embedding of self.documents[i];
        # None until the first document is added.
        self._embeddings_matrix: np.ndarray | None = None

    def add_documents(self, docs: list[Document]) -> None:
        """Embed *docs* in one batched API call and add them to the index."""
        if not docs:
            # Avoid sending an invalid empty request to the API.
            return
        texts = [doc.text for doc in docs]
        response = self.client.embeddings.create(
            model=self.model,
            input=texts,
        )
        # The API returns embeddings in the same order as the inputs.
        for doc, emb_data in zip(docs, response.data):
            doc.embedding = emb_data.embedding
            self.documents.append(doc)
        self._rebuild_index()

    def _rebuild_index(self) -> None:
        """Rebuild the dense matrix used for vectorized similarity search."""
        if self.documents:
            self._embeddings_matrix = np.array(
                [doc.embedding for doc in self.documents]
            )

    def search(self, query: str, top_k: int = 5) -> list[tuple[Document, float]]:
        """Return up to *top_k* (document, score) pairs, best match first."""
        if not self.documents:
            return []
        # Embed the query
        response = self.client.embeddings.create(
            model=self.model,
            input=query,
        )
        query_vec = np.array(response.data[0].embedding)
        # Vectorized cosine similarity against every stored document
        norms = np.linalg.norm(self._embeddings_matrix, axis=1)
        similarities = np.dot(self._embeddings_matrix, query_vec) / (
            norms * np.linalg.norm(query_vec)
        )
        # Rank descending and keep the best top_k
        top_indices = np.argsort(similarities)[::-1][:top_k]
        return [
            (self.documents[idx], float(similarities[idx]))
            for idx in top_indices
        ]
# Usage example
engine = SemanticSearchEngine()

documents = [
    Document(1, "Python是一种解释型、面向对象的高级编程语言", {"category": "编程"}),
    Document(2, "机器学习使用算法从数据中学习模式", {"category": "AI"}),
    Document(3, "深度学习是机器学习的一个子集,使用多层神经网络", {"category": "AI"}),
    Document(4, "今天北京的天气非常晴朗", {"category": "天气"}),
    Document(5, "RESTful API是一种Web服务架构风格", {"category": "编程"}),
    Document(6, "向量数据库用于存储和检索高维向量", {"category": "数据库"}),
]
engine.add_documents(documents)

# Search
results = engine.search("如何学习AI编程", top_k=3)
for doc, score in results:
    print(f"[{score:.4f}] {doc.text}")
7.5 FAISS 向量索引
对于大规模数据(>10万条),推荐使用 FAISS:
import faiss
import numpy as np
class FAISSSearchEngine:
    """High-performance vector search built on a FAISS inner-product index.

    Vectors are L2-normalized before insertion and querying, which makes
    the inner product equivalent to cosine similarity.
    """

    def __init__(self, dimension: int = 1536):
        self.dimension = dimension
        # Inner-product index; equals cosine similarity once vectors are normalized.
        self.index = faiss.IndexFlatIP(dimension)
        self.documents: list[Document] = []

    def add_documents(self, embeddings: list[list[float]], docs: list[Document]):
        """Add document vectors (and their documents) to the index."""
        matrix = np.array(embeddings, dtype='float32')
        faiss.normalize_L2(matrix)  # in-place normalization
        self.index.add(matrix)
        self.documents.extend(docs)

    def search(self, query_embedding: list[float], top_k: int = 5):
        """Return up to *top_k* (document, score) pairs for the query vector."""
        probe = np.array([query_embedding], dtype='float32')
        faiss.normalize_L2(probe)
        scores, indices = self.index.search(probe, top_k)
        # FAISS pads missing results with index -1; skip those slots.
        return [
            (self.documents[pos], float(hit))
            for hit, pos in zip(scores[0], indices[0])
            if pos >= 0
        ]
7.6 Embedding 缓存优化
import hashlib
import json
from pathlib import Path
class EmbeddingCache:
    """Local embedding cache that avoids repeated API calls for identical text.

    Each embedding is stored as a JSON file named by the MD5 of
    "<model>:<text>", so entries are model-specific and stable across runs.
    """

    def __init__(self, cache_dir: str = ".embedding_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        self.client = OpenAI()
        self.model = "text-embedding-3-small"

    def _cache_key(self, text: str) -> str:
        """Stable cache key; includes the model so models never share entries."""
        return hashlib.md5(f"{self.model}:{text}".encode()).hexdigest()

    def _cache_path(self, text: str) -> Path:
        """Path of the JSON cache file for *text*."""
        return self.cache_dir / f"{self._cache_key(text)}.json"

    def get_embedding(self, text: str) -> list[float]:
        """Return the embedding for *text*, reading from the cache when possible."""
        cache_file = self._cache_path(text)
        if cache_file.exists():
            return json.loads(cache_file.read_text(encoding="utf-8"))
        response = self.client.embeddings.create(model=self.model, input=text)
        embedding = response.data[0].embedding
        # Explicit UTF-8 keeps the cache portable across platforms.
        cache_file.write_text(json.dumps(embedding), encoding="utf-8")
        return embedding

    def batch_get(self, texts: list[str]) -> list[list[float]]:
        """Batch lookup: serve cached entries locally, request the rest in one call."""
        results: list[list[float] | None] = [None] * len(texts)
        uncached: list[tuple[int, str]] = []  # (original index, text)
        for i, text in enumerate(texts):
            cache_file = self._cache_path(text)
            if cache_file.exists():
                results[i] = json.loads(cache_file.read_text(encoding="utf-8"))
            else:
                uncached.append((i, text))
        if uncached:
            response = self.client.embeddings.create(
                model=self.model,
                input=[t for _, t in uncached],
            )
            # API results come back in input order, matching `uncached`.
            for (idx, text), emb_data in zip(uncached, response.data):
                results[idx] = emb_data.embedding
                self._cache_path(text).write_text(
                    json.dumps(emb_data.embedding), encoding="utf-8"
                )
        return results
7.7 混合搜索 (Hybrid Search)
结合关键词搜索和语义搜索的优势:
def hybrid_search(query: str, documents: list, alpha: float = 0.7, search_engine=None) -> list:
    """Hybrid search: alpha * semantic score + (1 - alpha) * keyword score.

    Args:
        query: Search text.
        documents: Candidate documents (must expose ``.id`` and ``.text``).
        alpha: Weight of the semantic signal, in [0, 1].
        search_engine: Object with a ``search(query, top_k)`` method; falls
            back to the module-level ``engine`` when omitted.

    Returns:
        (document, combined_score) pairs, best match first.
    """
    searcher = engine if search_engine is None else search_engine
    # Semantic scores from the embedding engine
    semantic_results = searcher.search(query, top_k=len(documents))
    semantic_scores = {doc.id: score for doc, score in semantic_results}
    # Keyword scores (heavily simplified BM25 substitute)
    keyword_scores = {}
    for doc in documents:
        # NOTE(review): iterates *characters* of the query -- plausible for
        # Chinese text, but word-level tokenization is needed for
        # space-delimited languages.
        overlap = sum(1 for word in query if word in doc.text)
        keyword_scores[doc.id] = overlap / max(len(query), 1)
    # Normalize each signal; `or 1` also guards against an all-zero maximum,
    # which previously caused a ZeroDivisionError.
    max_sem = max(semantic_scores.values(), default=0) or 1
    max_kw = max(keyword_scores.values(), default=0) or 1
    # Weighted merge of both signals
    combined = {}
    for doc_id in set(semantic_scores) | set(keyword_scores):
        sem = semantic_scores.get(doc_id, 0) / max_sem
        kw = keyword_scores.get(doc_id, 0) / max_kw
        combined[doc_id] = alpha * sem + (1 - alpha) * kw
    # Rank descending and resolve ids back to documents (O(n) via dict,
    # instead of the original O(n^2) linear scan per id)
    docs_by_id = {doc.id: doc for doc in documents}
    sorted_ids = sorted(combined, key=combined.get, reverse=True)
    return [(docs_by_id[did], combined[did]) for did in sorted_ids]
7.8 业务场景
| 场景 | 使用方式 | 模型推荐 |
|---|---|---|
| 知识库问答 (RAG) | 文档嵌入 + 检索 | text-embedding-3-small |
| 语义去重 | 高相似度 = 重复内容 | text-embedding-3-small |
| 商品推荐 | 用户/商品向量匹配 | text-embedding-3-large |
| 情感聚类 | 聚类分析相似文本 | text-embedding-3-small |
| 论文查重 | 段落级相似度比较 | text-embedding-3-large |
7.9 注意事项
- 输入限制:单次最多 2048 条文本,每条最大 8191 tokens
- 语言支持:多语言效果好,但跨语言语义对齐可能有偏差
- 维度选择:1536 维足够大多数场景,3072 维适合精度要求高的场景
- 批量处理:尽量批量请求,减少 API 调用次数
- 缓存策略:相同文本的 Embedding 不变,务必缓存
- 向量归一化:计算余弦相似度前需要归一化
7.10 扩展阅读
下一章:08 - Assistants API — 线程管理、文件检索、代码解释器。