强曰为道

与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧。
文档目录

07 - Embeddings API

第 07 章 · Embeddings API(向量嵌入)

Embeddings 将文本转化为高维向量,是语义搜索、推荐系统和 RAG 的基础。本章详解 Embeddings API 的使用、向量搜索实现和 RAG 基础。


7.1 什么是 Embedding

Embedding 是文本的"数学表示"——将一段文字映射为一个高维浮点数向量,语义相似的文本在向量空间中距离更近。

"猫"    → [0.12, -0.34, 0.56, ..., 0.78]  (1536维)
"小猫"  → [0.11, -0.33, 0.55, ..., 0.77]  (非常接近)
"汽车"  → [-0.45, 0.67, -0.12, ..., 0.23] (距离较远)

可用模型

| 模型 | 维度 | 单价 | 特点 |
| --- | --- | --- | --- |
| text-embedding-3-small | 1536 | $0.02/1M tokens | 高性价比 |
| text-embedding-3-large | 3072 | $0.13/1M tokens | 高精度 |

维度缩减

# The text-embedding-3 family accepts a `dimensions` parameter that shrinks
# the returned vector (assumes `client` is an OpenAI client created earlier).
response = client.embeddings.create(
    model="text-embedding-3-large",
    input="你好世界",
    dimensions=1024,  # reduce from the native 3072 down to 1024 while keeping good quality
)

7.2 基础用法

7.2.1 单文本嵌入

from openai import OpenAI

client = OpenAI()

# Embed a single text; the endpoint returns one vector per input item.
response = client.embeddings.create(
    model="text-embedding-3-small",
    input="人工智能正在改变世界",
)

embedding = response.data[0].embedding
print(f"维度: {len(embedding)}")       # 1536 for text-embedding-3-small
print(f"前5个值: {embedding[:5]}")     # e.g. [0.0023, -0.0098, ...]
print(f"Token 用量: {response.usage.total_tokens}")

7.2.2 批量嵌入

# Several texts can be embedded in one request; output order matches input order.
texts = [
    "机器学习是人工智能的一个分支",
    "深度学习使用多层神经网络",
    "自然语言处理处理人类语言",
    "今天天气真不错",
]

response = client.embeddings.create(
    model="text-embedding-3-small",
    input=texts,  # at most 2048 items per request
)

embeddings = [item.embedding for item in response.data]
print(f"生成了 {len(embeddings)} 个向量")

7.3 相似度计算

余弦相似度 (Cosine Similarity)

import numpy as np

def cosine_similarity(vec_a: list[float], vec_b: list[float]) -> float:
    """Return the cosine similarity between two vectors.

    Args:
        vec_a: First vector.
        vec_b: Second vector, same length as vec_a.

    Returns:
        Similarity in [-1.0, 1.0] as a plain Python float. Returns 0.0 when
        either vector has zero norm — the naive formula would divide by zero
        and yield NaN.
    """
    a = np.array(vec_a)
    b = np.array(vec_b)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)

# Embed all four texts in one request, then compare them pairwise.
texts = ["猫", "小猫", "狗", "汽车"]
response = client.embeddings.create(
    model="text-embedding-3-small",
    input=texts,
)
# Map each text to its embedding (response.data preserves input order).
vectors = {text: item.embedding for text, item in zip(texts, response.data)}

# Print the full pairwise similarity matrix.
print("相似度矩阵:")
for t1 in texts:
    for t2 in texts:
        sim = cosine_similarity(vectors[t1], vectors[t2])
        print(f"  {t1}{t2}: {sim:.4f}")

输出示例:

猫 ↔ 小猫: 0.9321    # 非常相似
猫 ↔ 狗:   0.8567    # 较相似(都是动物)
猫 ↔ 汽车: 0.2134    # 不相似

批量相似度计算

def batch_similarity(query_vec: list[float], doc_vecs: list[list[float]]) -> np.ndarray:
    """Compute cosine similarity between one query and many document vectors.

    Fully vectorized: a single matrix-vector product replaces a Python loop.

    Args:
        query_vec: The query embedding.
        doc_vecs: One embedding per document, each the same length as query_vec.

    Returns:
        1-D float array with one similarity per document (the original
        ``list[float]`` annotation was wrong — a NumPy array is returned).
        Entries are 0.0 where the query or that document has zero norm,
        avoiding a 0/0 NaN.
    """
    q = np.asarray(query_vec, dtype=float)
    docs = np.asarray(doc_vecs, dtype=float)
    denom = np.linalg.norm(docs, axis=1) * np.linalg.norm(q)
    # Swap zero denominators for 1.0: the matching numerators (dot products
    # with a zero vector) are already 0, so the result is a clean 0.0.
    safe_denom = np.where(denom == 0.0, 1.0, denom)
    return docs @ q / safe_denom

7.4 语义搜索引擎

import numpy as np
from openai import OpenAI
from dataclasses import dataclass

@dataclass
class Document:
    """A document to index: raw text, caller metadata, and its embedding."""

    id: int        # unique identifier used for score lookups
    text: str      # the text that gets embedded
    metadata: dict  # arbitrary caller-supplied metadata, e.g. {"category": ...}
    # None until the search engine fills it in after the embeddings API call.
    embedding: list[float] | None = None

class SemanticSearchEngine:
    """Semantic search over an in-memory corpus using OpenAI embeddings.

    Documents are embedded once when added; each query is embedded per call
    and ranked by cosine similarity against a cached matrix of doc vectors.
    """

    def __init__(self, model: str = "text-embedding-3-small"):
        self.client = OpenAI()
        self.model = model
        self.documents: list[Document] = []
        # One row per document; rebuilt after every add_documents() call.
        self._embeddings_matrix: np.ndarray | None = None

    def add_documents(self, docs: list[Document]):
        """Embed *docs* in one batched API call and add them to the index.

        No-op for an empty list — the embeddings endpoint rejects empty input,
        so guarding here avoids a pointless API error.
        """
        if not docs:
            return
        response = self.client.embeddings.create(
            model=self.model,
            input=[doc.text for doc in docs],
        )
        # response.data preserves input order, so zip pairs them correctly.
        for doc, emb_data in zip(docs, response.data):
            doc.embedding = emb_data.embedding
            self.documents.append(doc)

        self._rebuild_index()

    def _rebuild_index(self):
        """Cache all document embeddings as one matrix for vectorized scoring."""
        if self.documents:
            self._embeddings_matrix = np.array(
                [doc.embedding for doc in self.documents]
            )

    def search(self, query: str, top_k: int = 5) -> list[tuple[Document, float]]:
        """Return the *top_k* documents most similar to *query*.

        Args:
            query: Natural-language search string.
            top_k: Maximum number of results to return.

        Returns:
            (document, cosine similarity) pairs, best match first; empty list
            when no documents have been indexed yet.
        """
        if not self.documents:
            return []

        # Embed the query (one API call).
        response = self.client.embeddings.create(
            model=self.model,
            input=query,
        )
        query_vec = np.array(response.data[0].embedding)

        # Vectorized cosine similarity: one matrix-vector product for all docs.
        norms = np.linalg.norm(self._embeddings_matrix, axis=1)
        similarities = np.dot(self._embeddings_matrix, query_vec) / (
            norms * np.linalg.norm(query_vec)
        )

        # argsort ascending, reversed, truncated -> indices of the best top_k.
        top_indices = np.argsort(similarities)[::-1][:top_k]
        return [
            (self.documents[idx], float(similarities[idx]))
            for idx in top_indices
        ]


# Usage example: index a small corpus, then run one semantic query against it.
engine = SemanticSearchEngine()

documents = [
    Document(1, "Python是一种解释型、面向对象的高级编程语言", {"category": "编程"}),
    Document(2, "机器学习使用算法从数据中学习模式", {"category": "AI"}),
    Document(3, "深度学习是机器学习的一个子集,使用多层神经网络", {"category": "AI"}),
    Document(4, "今天北京的天气非常晴朗", {"category": "天气"}),
    Document(5, "RESTful API是一种Web服务架构风格", {"category": "编程"}),
    Document(6, "向量数据库用于存储和检索高维向量", {"category": "数据库"}),
]

engine.add_documents(documents)

# The query matches by meaning, not by shared keywords.
results = engine.search("如何学习AI编程", top_k=3)
for doc, score in results:
    print(f"[{score:.4f}] {doc.text}")

7.5 FAISS 向量索引

对于大规模数据(>10万条),推荐使用 FAISS:

import faiss
import numpy as np

class FAISSSearchEngine:
    """High-performance vector search backed by a FAISS inner-product index.

    Every vector is L2-normalized before being added or queried, which makes
    the inner product computed by the index equal to cosine similarity.
    """

    def __init__(self, dimension: int = 1536):
        self.dimension = dimension
        # Inner-product index; equivalent to cosine once vectors are normalized.
        self.index = faiss.IndexFlatIP(dimension)
        self.documents: list[Document] = []

    def add_documents(self, embeddings: list[list[float]], docs: list[Document]):
        """Normalize *embeddings* in place and append them (with *docs*) to the index."""
        matrix = np.array(embeddings, dtype='float32')
        faiss.normalize_L2(matrix)
        self.index.add(matrix)
        self.documents.extend(docs)

    def search(self, query_embedding: list[float], top_k: int = 5):
        """Return up to *top_k* (document, score) pairs, best match first."""
        probe = np.array([query_embedding], dtype='float32')
        faiss.normalize_L2(probe)
        scores, indices = self.index.search(probe, top_k)
        hits = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS pads missing results with index -1; skip those slots.
            if idx >= 0:
                hits.append((self.documents[idx], float(score)))
        return hits

7.6 Embedding 缓存优化

import hashlib
import json
from pathlib import Path

class EmbeddingCache:
    """Embedding 本地缓存,避免重复调用 API"""

    def __init__(self, cache_dir: str = ".embedding_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        self.client = OpenAI()
        self.model = "text-embedding-3-small"

    def _cache_key(self, text: str) -> str:
        return hashlib.md5(f"{self.model}:{text}".encode()).hexdigest()

    def get_embedding(self, text: str) -> list[float]:
        """获取 Embedding(优先从缓存读取)"""
        key = self._cache_key(text)
        cache_file = self.cache_dir / f"{key}.json"

        if cache_file.exists():
            with open(cache_file) as f:
                return json.load(f)

        response = self.client.embeddings.create(model=self.model, input=text)
        embedding = response.data[0].embedding

        with open(cache_file, 'w') as f:
            json.dump(embedding, f)

        return embedding

    def batch_get(self, texts: list[str]) -> list[list[float]]:
        """批量获取,只请求未缓存的"""
        results = [None] * len(texts)
        uncached = []  # (index, text)

        for i, text in enumerate(texts):
            key = self._cache_key(text)
            cache_file = self.cache_dir / f"{key}.json"
            if cache_file.exists():
                with open(cache_file) as f:
                    results[i] = json.load(f)
            else:
                uncached.append((i, text))

        if uncached:
            response = self.client.embeddings.create(
                model=self.model,
                input=[t for _, t in uncached],
            )
            for (idx, text), emb_data in zip(uncached, response.data):
                results[idx] = emb_data.embedding
                key = self._cache_key(text)
                cache_file = self.cache_dir / f"{key}.json"
                with open(cache_file, 'w') as f:
                    json.dump(emb_data.embedding, f)

        return results

7.7 混合搜索(Hybrid Search)

结合关键词搜索和语义搜索的优势:

def hybrid_search(query: str, documents: list[Document], alpha: float = 0.7) -> list:
    """Rank *documents* by a weighted blend of semantic and overlap scores.

    combined = alpha * normalized semantic score
             + (1 - alpha) * normalized character-overlap score

    Args:
        query: Search string.
        documents: Candidate documents; each must have a unique ``.id``.
        alpha: Weight of the semantic component, in [0, 1].

    Returns:
        (document, combined score) pairs, best first.

    NOTE(review): relies on the module-level ``engine`` already having these
    documents indexed — verify against the surrounding example. The overlap
    below counts matching *characters* (reasonable for Chinese text), not a
    true word-level BM25.
    """
    # Semantic scores from the shared engine (one API call for the query).
    semantic_results = engine.search(query, top_k=len(documents))
    semantic_scores = {doc.id: score for doc, score in semantic_results}

    # Character-overlap score, normalized by query length (guard against "").
    keyword_scores = {}
    for doc in documents:
        overlap = sum(1 for word in query if word in doc.text)
        keyword_scores[doc.id] = overlap / max(len(query), 1)

    # Normalize each component so alpha blends comparable [0, 1] scales.
    max_sem = max(semantic_scores.values()) if semantic_scores else 1
    max_kw = max(keyword_scores.values()) if keyword_scores else 1

    combined = {}
    for doc_id in set(semantic_scores) | set(keyword_scores):
        sem = semantic_scores.get(doc_id, 0) / max_sem
        kw = keyword_scores.get(doc_id, 0) / max_kw
        combined[doc_id] = alpha * sem + (1 - alpha) * kw

    # O(1) id lookup instead of a linear scan per result (was O(n^2) overall).
    docs_by_id = {doc.id: doc for doc in documents}
    sorted_ids = sorted(combined, key=combined.get, reverse=True)
    return [(docs_by_id[did], combined[did]) for did in sorted_ids]

7.8 业务场景

| 场景 | 使用方式 | 模型推荐 |
| --- | --- | --- |
| 知识库问答 (RAG) | 文档嵌入 + 检索 | text-embedding-3-small |
| 语义去重 | 高相似度 = 重复内容 | text-embedding-3-small |
| 商品推荐 | 用户/商品向量匹配 | text-embedding-3-large |
| 情感聚类 | 聚类分析相似文本 | text-embedding-3-small |
| 论文查重 | 段落级相似度比较 | text-embedding-3-large |

7.9 注意事项

  1. 输入限制:单次最多 2048 条文本,每条最大 8191 tokens
  2. 语言支持:多语言效果好,但跨语言语义对齐可能有偏差
  3. 维度选择:1536 维足够大多数场景,3072 维适合精度要求高的场景
  4. 批量处理:尽量批量请求,减少 API 调用次数
  5. 缓存策略:相同文本的 Embedding 不变,务必缓存
  6. 向量归一化:计算余弦相似度前需要归一化

7.10 扩展阅读


下一章:08 - Assistants API — 线程管理、文件检索、代码解释器。