
Chapter 10 · TTS Speech Synthesis API

The TTS (Text-to-Speech) API converts text into natural-sounding speech, supporting multiple voices and streaming synthesis. This chapter walks through speech generation, voice selection, and streaming TTS implementation.


10.1 API Overview

Available models

| Model | Model ID | Characteristics | Pricing |
| --- | --- | --- | --- |
| TTS-1 | tts-1 | Low latency, suited to real-time scenarios | $15 / 1M chars |
| TTS-1-HD | tts-1-hd | Higher audio quality, suited to finished output | $30 / 1M chars |
| GPT-4o Mini TTS | gpt-4o-mini-tts | Tone and style controllable via instructions | $12 / 1M chars (input) + $24 / 1M chars (output) |

Available voices

| Voice | ID | Characteristics |
| --- | --- | --- |
| Alloy | alloy | Neutral, steady and natural |
| Echo | echo | Low-pitched, professional |
| Fable | fable | Warm, narrative feel |
| Onyx | onyx | Deep, authoritative |
| Nova | nova | Lively, friendly |
| Shimmer | shimmer | Gentle, clear |
| Coral | coral | Natural, warm |
| Verse | verse | Neutral, newer voice |

Supported output formats

| Format | Description | Typical use |
| --- | --- | --- |
| mp3 | Default format, widely supported | General playback |
| opus | Efficient compression | Streaming |
| aac | Broad compatibility | Mobile |
| flac | Lossless compression | High-quality storage |
| wav | Uncompressed | Professional audio processing |
| pcm | Raw PCM | Real-time processing |
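
The choices above map directly onto the parameters of a single speech request. A minimal sketch (the sample sentence and file name are illustrative; the client setup matches the examples below):

from openai import OpenAI

client = OpenAI()

# model controls quality/latency, voice picks the speaker, response_format picks the container
response = client.audio.speech.create(
    model="tts-1-hd",          # tts-1 / tts-1-hd / gpt-4o-mini-tts
    voice="nova",              # any voice ID from the table above
    input="这是一段用于试听输出格式的示例文本。",
    response_format="flac",    # mp3 / opus / aac / flac / wav / pcm
)
response.stream_to_file("sample.flac")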

10.2 Basic Speech Synthesis

from openai import OpenAI
from pathlib import Path

client = OpenAI()

def text_to_speech(text: str, output_path: str = "speech.mp3",
                   model: str = "tts-1", voice: str = "alloy"):
    """文本转语音"""
    response = client.audio.speech.create(
        model=model,
        voice=voice,
        input=text,
    )

    response.stream_to_file(output_path)
    print(f"音频已保存: {output_path}")

# Usage
text_to_speech("你好,欢迎使用 OpenAI 语音合成服务。", "welcome.mp3")

Node.js example

import OpenAI from 'openai';
import fs from 'fs';

const client = new OpenAI();

async function textToSpeech(text, outputPath = 'speech.mp3') {
  const response = await client.audio.speech.create({
    model: 'tts-1',
    voice: 'alloy',
    input: text,
  });

  const buffer = Buffer.from(await response.arrayBuffer());
  fs.writeFileSync(outputPath, buffer);
  console.log(`Audio saved to: ${outputPath}`);
}

await textToSpeech('你好,欢迎使用语音合成服务。');

10.3 GPT-4o Mini TTS (Steerable Tone and Style)

GPT-4o Mini TTS supports controlling tone, emotion, and style through natural-language instructions:

def expressive_tts(text: str, instructions: str, output_path: str = "speech.mp3"):
    """可控语气的语音合成"""
    response = client.audio.speech.create(
        model="gpt-4o-mini-tts",
        voice="coral",
        input=text,
        instructions=instructions,
    )
    response.stream_to_file(output_path)

# Cheerful tone
expressive_tts(
    "恭喜你获得了大奖!",
    instructions="用非常兴奋和欢快的语气说话,像一个热情的主持人。"
)

# Professional newscast
expressive_tts(
    "今天的新闻主要内容如下...",
    instructions="用专业新闻主播的语气,语速适中,吐字清晰。"
)

# Gentle reassurance
expressive_tts(
    "不要担心,一切都会好起来的。",
    instructions="用温柔安慰的语气,声音柔和,语速稍慢。"
)

# Audiobook narration
expressive_tts(
    "很久很久以前,在一个遥远的王国里...",
    instructions="像讲故事一样,语气生动有趣,有起伏变化。"
)

Controllable dimensions

| Dimension | Example instructions |
| --- | --- |
| Emotion | "happy", "sad", "angry", "surprised" |
| Pace | "speak quickly", "speak slowly", "moderate pace" |
| Tone | "professional", "casual", "formal", "humorous" |
| Persona | "like a teacher", "like a TV host", "like a robot" |
| Accent | "with a British accent", "with an American accent" |
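
These dimensions can be combined freely in a single instructions string. A small sketch reusing the expressive_tts helper defined in 10.3 (the sentence and instruction wording are illustrative):

# Combine persona, tone, and pace in one instruction
expressive_tts(
    "各位旅客请注意,开往上海的列车即将发车。",
    instructions="像车站广播员一样,语气正式,吐字清晰,语速稍慢。",
    output_path="announcement.mp3",
)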

10.4 Comparing Voices

voices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer", "coral"]
sample_text = "人工智能正在改变我们的生活方式。"

for voice in voices:
    response = client.audio.speech.create(
        model="tts-1",
        voice=voice,
        input=sample_text,
    )
    response.stream_to_file(f"voice_{voice}.mp3")
    print(f"已生成: voice_{voice}.mp3")

10.5 Streaming TTS (Real-Time Audio)

Python streaming output

def stream_tts(text: str, output_path: str = "stream.mp3"):
    """流式生成音频"""
    with client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="alloy",
        input=text,
        response_format="opus",  # opus 格式更适合流式
    ) as response:
        response.stream_to_file(output_path)

FastAPI streaming audio endpoint

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from openai import OpenAI
from pydantic import BaseModel

app = FastAPI()
client = OpenAI()

class TTSRequest(BaseModel):
    text: str
    voice: str = "alloy"
    model: str = "tts-1"

@app.post("/api/tts")
async def text_to_speech(request: TTSRequest):
    """Stream the synthesized audio back to the client."""
    def audio_stream():
        # Forward audio chunks as they arrive instead of buffering the whole file
        with client.audio.speech.with_streaming_response.create(
            model=request.model,
            voice=request.voice,
            input=request.text,
            response_format="opus",
        ) as response:
            yield from response.iter_bytes()

    return StreamingResponse(
        audio_stream(),
        media_type="audio/ogg",
        headers={"Content-Disposition": "attachment; filename=speech.opus"},
    )

Front-end playback

async function playTTS(text) {
  const response = await fetch('/api/tts', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text, voice: 'nova' }),
  });

  const blob = await response.blob();
  const url = URL.createObjectURL(blob);
  const audio = new Audio(url);
  audio.play();
}

In-browser real-time TTS with streamed playback

async function streamingTTS(text) {
  const response = await fetch('https://api.openai.com/v1/audio/speech', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${API_KEY}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: 'tts-1',
      voice: 'alloy',
      input: text,
      response_format: 'opus',
    }),
  });

  // Stream playback via MediaSource
  const mediaSource = new MediaSource();
  const audio = new Audio();
  audio.src = URL.createObjectURL(mediaSource);

  mediaSource.addEventListener('sourceopen', async () => {
    const sourceBuffer = mediaSource.addSourceBuffer('audio/ogg; codecs="opus"');
    const reader = response.body.getReader();

    // appendBuffer is asynchronous: wait for 'updateend' before appending the next chunk
    const appendChunk = (chunk) =>
      new Promise((resolve) => {
        sourceBuffer.addEventListener('updateend', resolve, { once: true });
        sourceBuffer.appendBuffer(chunk);
      });

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      await appendChunk(value);
    }
    mediaSource.endOfStream();
  });

  audio.play();
}

10.6 Long-Text Chunked Synthesis

def long_text_to_speech(text: str, output_path: str = "long_speech.mp3",
                        max_chars: int = 4000):
    """长文本分段合成后合并"""
    from pydub import AudioSegment
    import tempfile

    # Split into sentences
    sentences = []
    current = ""
    for char in text:
        current += char
        if char in "。!?.!?\n" and len(current) > 10:
            sentences.append(current.strip())
            current = ""
    if current.strip():
        sentences.append(current.strip())

    # Group sentences into chunks under the size limit
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) > max_chars:
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            current_chunk += sentence
    if current_chunk:
        chunks.append(current_chunk)

    # Synthesize each chunk and append it
    combined = AudioSegment.empty()
    for i, chunk in enumerate(chunks):
        print(f"生成第 {i+1}/{len(chunks)} 段...")
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            response = client.audio.speech.create(
                model="tts-1-hd",
                voice="nova",
                input=chunk,
            )
            response.stream_to_file(tmp.name)
            segment = AudioSegment.from_mp3(tmp.name)
            combined += segment

    combined.export(output_path, format="mp3")
    print(f"已保存: {output_path}")

10.7 Multilingual Speech Synthesis

def multilingual_tts(text: str, lang: str, output_path: str):
    """多语言语音合成"""
    # 选择适合该语言的音色
    voice_map = {
        "zh": "nova",      # 中文
        "en": "alloy",     # 英文
        "ja": "shimmer",   # 日文
        "ko": "coral",     # 韩文
        "fr": "fable",     # 法文
        "de": "onyx",      # 德文
    }

    response = client.audio.speech.create(
        model="tts-1-hd",
        voice=voice_map.get(lang, "alloy"),
        input=text,
    )
    response.stream_to_file(output_path)

# Different languages
multilingual_tts("你好,世界!", "zh", "hello_zh.mp3")
multilingual_tts("Hello, World!", "en", "hello_en.mp3")
multilingual_tts("こんにちは世界!", "ja", "hello_ja.mp3")

10.8 Real-Time Conversational TTS

Combine Chat Completions with TTS to build an AI voice-reply pipeline:

def ai_voice_reply(user_text: str) -> str:
    """AI 回复并转为语音"""
    # 获取文本回复
    chat_response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "用简短友好的方式回复,不超过100字。"},
            {"role": "user", "content": user_text},
        ],
    )
    reply_text = chat_response.choices[0].message.content
    print(f"AI回复: {reply_text}")

    # Convert the reply to speech
    tts_response = client.audio.speech.create(
        model="tts-1",
        voice="nova",
        input=reply_text,
        response_format="mp3",
    )
    tts_response.stream_to_file("reply.mp3")
    return reply_text

10.9 Notes and Caveats

  1. Text length: keep a single request under 4096 characters; split longer texts into chunks (see 10.6)
  2. Model choice: use tts-1 (low latency) for real-time interaction and tts-1-hd (higher quality) for finished output
  3. Voice choice: A/B test candidate voices in your target scenario
  4. Cost: billing is per character, so watch costs on long texts (a rough estimator is sketched after this list)
  5. Latency: TTS-1 responds faster; TTS-1-HD needs more processing time
  6. Chinese pronunciation: GPT-4o Mini TTS has the best Chinese pronunciation quality
  7. Output format: prefer opus for streaming scenarios and mp3 for general use
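
For the cost point above, a back-of-the-envelope estimator is easy to write. A minimal sketch using the per-million-character prices listed in 10.1 (verify them against the current pricing page before relying on the numbers):

# Assumed per-1M-character prices taken from the table in 10.1
PRICE_PER_MILLION_CHARS = {"tts-1": 15.0, "tts-1-hd": 30.0}

def estimate_tts_cost(text: str, model: str = "tts-1") -> float:
    """Rough cost estimate in USD based on character count."""
    return len(text) / 1_000_000 * PRICE_PER_MILLION_CHARS[model]

print(f"{estimate_tts_cost('你好' * 2000, 'tts-1-hd'):.4f} USD")  # 4,000 chars -> $0.12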

10.10 Further Reading


Next chapter: 11 - DALL-E Image Generation: image creation, editing, and variations.