强曰为道

与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧.
文档目录

第 5 章:SQL 进阶

第 5 章:SQL 进阶

掌握子查询、JOIN、窗口函数和 CTE,从 SQL 初阶迈入中高阶。


5.1 子查询(Subquery)

子查询是嵌套在其他 SQL 语句中的 SELECT 查询。

5.1.1 子查询分类

分类位置示例
标量子查询SELECT/WHERE 中,返回单个值WHERE score > (SELECT AVG(score) ...)
列子查询WHERE IN/ANY/ALLWHERE id IN (SELECT user_id ...)
行子查询WHERE 中比较行WHERE (a, b) = (SELECT a, b ...)
表子查询FROM 子句中FROM (SELECT ...) AS t
EXISTS 子查询WHERE EXISTS/NOT EXISTSWHERE EXISTS (SELECT 1 ...)

5.1.2 标量子查询

-- 查询余额高于平均值的用户
SELECT username, balance
FROM users
WHERE balance > (SELECT AVG(balance) FROM users)
ORDER BY balance DESC;

-- 查询每个用户的订单数(关联子查询)
SELECT 
    u.username,
    u.email,
    (SELECT COUNT(*) FROM orders o WHERE o.user_id = u.id) AS order_count,
    (SELECT COALESCE(SUM(pay_amount), 0) FROM orders o WHERE o.user_id = u.id) AS total_spent
FROM users u
WHERE u.status = 1;

5.1.3 IN / NOT IN 子查询

-- 查询有订单的用户
SELECT * FROM users
WHERE id IN (SELECT DISTINCT user_id FROM orders);

-- 查询没有订单的用户
SELECT * FROM users
WHERE id NOT IN (SELECT DISTINCT user_id FROM orders WHERE user_id IS NOT NULL);
-- ⚠️ 注意:NOT IN 子查询结果中如果包含 NULL,会导致整个查询返回空集
-- 解决方案:排除 NULL
-- WHERE id NOT IN (SELECT user_id FROM orders WHERE user_id IS NOT NULL)

5.1.4 EXISTS / NOT EXISTS

-- EXISTS:有订单的用户(通常比 IN 高效)
SELECT u.*
FROM users u
WHERE EXISTS (
    SELECT 1 FROM orders o WHERE o.user_id = u.id
);

-- NOT EXISTS:没有订单的用户
SELECT u.*
FROM users u
WHERE NOT EXISTS (
    SELECT 1 FROM orders o WHERE o.user_id = u.id
);

💡 IN vs EXISTS 选择

场景推荐原因
外表小,子查询结果集大IN先查外表,再逐个匹配
外表大,子查询结果集小EXISTS遍历外表,子查询找到一个就返回
子查询结果集可能含 NULLEXISTSNOT IN + NULL = 空集

5.1.5 派生表(FROM 子查询)

-- 统计每月订单量和金额
SELECT 
    t.month,
    t.order_count,
    t.total_amount
FROM (
    SELECT 
        DATE_FORMAT(created_at, '%Y-%m') AS month,
        COUNT(*) AS order_count,
        SUM(pay_amount) AS total_amount
    FROM orders
    WHERE created_at >= '2026-01-01'
    GROUP BY DATE_FORMAT(created_at, '%Y-%m')
) t
ORDER BY t.month;

5.2 JOIN 连接查询

5.2.1 JOIN 类型

┌────────────────────────────────────────────────┐
│                  JOIN 类型                       │
│                                                │
│  INNER JOIN     两表交集                         │
│  LEFT JOIN      左表全集 + 右表匹配(NULL 填充) │
│  RIGHT JOIN     右表全集 + 左表匹配(NULL 填充) │
│  CROSS JOIN     笛卡尔积(M × N 行)            │
│  FULL OUTER JOIN 两表全集(MySQL 不直接支持)    │
│  SELF JOIN      表与自身连接                     │
└────────────────────────────────────────────────┘

5.2.2 内连接(INNER JOIN)

-- 查询有订单的用户及其订单
SELECT 
    u.username,
    o.order_no,
    o.total_amount,
    o.status
FROM users u
INNER JOIN orders o ON u.id = o.user_id
ORDER BY o.created_at DESC;

5.2.3 左连接(LEFT JOIN)

-- 查询所有用户及其订单数(包括没有订单的用户)
SELECT 
    u.username,
    u.email,
    COUNT(o.id) AS order_count,
    COALESCE(SUM(o.pay_amount), 0) AS total_spent
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.id, u.username, u.email
ORDER BY order_count DESC;

5.2.4 右连接(RIGHT JOIN)

-- 右连接用得较少,通常可以用 LEFT JOIN 重写
-- 以下两句话等价:
SELECT u.username, o.order_no
FROM users u RIGHT JOIN orders o ON u.id = o.user_id;

SELECT u.username, o.order_no
FROM orders o LEFT JOIN users u ON u.id = o.user_id;

5.2.5 交叉连接(CROSS JOIN)

-- 笛卡尔积:生成所有组合
SELECT a.color, b.size
FROM (SELECT '红' AS color UNION SELECT '蓝' UNION SELECT '绿') a
CROSS JOIN (SELECT 'S' AS size UNION SELECT 'M' UNION SELECT 'L') b;
-- 生成 3×3 = 9 种组合

5.2.6 自连接(SELF JOIN)

-- 查询员工及其上级(假设 employees 表有 manager_id 字段)
SELECT 
    e.name AS employee,
    m.name AS manager
FROM employees e
LEFT JOIN employees m ON e.manager_id = m.id;

-- 查询同一天注册的用户对
SELECT 
    a.username AS user1,
    b.username AS user2,
    DATE(a.created_at) AS register_date
FROM users a
INNER JOIN users b 
    ON a.id < b.id 
    AND DATE(a.created_at) = DATE(b.created_at);

5.2.7 MySQL 模拟 FULL OUTER JOIN

-- MySQL 不支持 FULL OUTER JOIN,用 UNION 模拟
SELECT u.username, o.order_no
FROM users u LEFT JOIN orders o ON u.id = o.user_id
UNION
SELECT u.username, o.order_no
FROM users u RIGHT JOIN orders o ON u.id = o.user_id;

5.2.8 JOIN 性能要点

-- ❌ 避免:在 ON 条件中使用函数
SELECT * FROM orders o
JOIN users u ON YEAR(o.created_at) = YEAR(u.created_at);

-- ✅ 正确:直接关联索引字段
SELECT * FROM orders o
JOIN users u ON o.user_id = u.id;

-- ❌ 避免:隐式连接(逗号连接)
SELECT * FROM users u, orders o WHERE u.id = o.user_id;

-- ✅ 正确:显式 JOIN
SELECT * FROM users u INNER JOIN orders o ON u.id = o.user_id;

⚠️ JOIN 索引原则:JOIN 条件的列必须有索引,否则会触发全表扫描(笛卡尔积级别)。


5.3 UNION / UNION ALL

-- UNION:合并结果集并去重
SELECT username, 'VIP' AS type FROM users WHERE balance >= 10000
UNION
SELECT username, 'active' AS type FROM users WHERE last_login > DATE_SUB(NOW(), INTERVAL 7 DAY);

-- UNION ALL:合并结果集不去重(性能更好)
SELECT username, 'VIP' AS type FROM users WHERE balance >= 10000
UNION ALL
SELECT username, 'active' AS type FROM users WHERE last_login > DATE_SUB(NOW(), INTERVAL 7 DAY);
-- 可能有重复用户同时标记为 VIP 和 active

💡 UNION vs UNION ALL

维度UNIONUNION ALL
去重✅ 自动去重❌ 不去重
性能较慢(需要排序去重)更快
适用场景确实需要去重明确没有重复或不在乎重复

5.4 窗口函数(Window Functions)

MySQL 8.0 引入了窗口函数,这是 SQL 进阶最重要的特性。

5.4.1 语法结构

函数名() OVER (
    [PARTITION BY 分组列]
    [ORDER BY 排序列 [ASC|DESC]]
    [ROWS|RANGE 帧范围]
)

5.4.2 排名函数

-- ===================== 准备测试数据 =====================
CREATE TABLE scores (
    id INT AUTO_INCREMENT PRIMARY KEY,
    student VARCHAR(50),
    subject VARCHAR(50),
    score INT
);

INSERT INTO scores (student, subject, score) VALUES
('张三', '语文', 90), ('张三', '数学', 85), ('张三', '英语', 92),
('李四', '语文', 88), ('李四', '数学', 95), ('李四', '英语', 80),
('王五', '语文', 92), ('王五', '数学', 88), ('王五', '英语', 85);

-- ===================== ROW_NUMBER:连续排名 =====================
-- 每个科目的成绩排名
SELECT 
    student, subject, score,
    ROW_NUMBER() OVER (PARTITION BY subject ORDER BY score DESC) AS rank_no
FROM scores;

-- ===================== RANK:有间隔排名 =====================
-- 相同分数排名相同,下一个排名跳过
-- 90, 90, 88 → 1, 1, 3
SELECT 
    student, subject, score,
    RANK() OVER (PARTITION BY subject ORDER BY score DESC) AS rank_no
FROM scores;

-- ===================== DENSE_RANK:无间隔排名 =====================
-- 相同分数排名相同,下一个排名不跳过
-- 90, 90, 88 → 1, 1, 2
SELECT 
    student, subject, score,
    DENSE_RANK() OVER (PARTITION BY subject ORDER BY score DESC) AS rank_no
FROM scores;

三种排名函数对比:

函数相同值排名下一个排名示例 90,90,88
ROW_NUMBER()不同连续1, 2, 3
RANK()相同跳过1, 1, 3
DENSE_RANK()相同不跳过1, 1, 2

5.4.3 聚合窗口函数

-- 每个学生的成绩和其总分、平均分
SELECT 
    student, subject, score,
    SUM(score) OVER (PARTITION BY student) AS total_score,
    AVG(score) OVER (PARTITION BY student) AS avg_score,
    MAX(score) OVER (PARTITION BY student) AS max_score,
    COUNT(*)  OVER (PARTITION BY student) AS subject_count
FROM scores;

-- 全局百分比
SELECT 
    student, subject, score,
    score / SUM(score) OVER () * 100 AS pct
FROM scores;

5.4.4 LAG / LEAD(前后行访问)

-- 查询每天的销售额及与前一天的差异
SELECT 
    order_date,
    daily_amount,
    LAG(daily_amount, 1, 0) OVER (ORDER BY order_date) AS prev_day_amount,
    daily_amount - LAG(daily_amount, 1, 0) OVER (ORDER BY order_date) AS diff
FROM (
    SELECT DATE(created_at) AS order_date, SUM(pay_amount) AS daily_amount
    FROM orders
    GROUP BY DATE(created_at)
) t;

-- LEAD:访问下一行
SELECT 
    order_date,
    daily_amount,
    LEAD(daily_amount) OVER (ORDER BY order_date) AS next_day_amount
FROM (
    SELECT DATE(created_at) AS order_date, SUM(pay_amount) AS daily_amount
    FROM orders
    GROUP BY DATE(created_at)
) t;

5.4.5 累计计算(RUNNING TOTAL)

-- 每日销售额及累计销售额
SELECT 
    DATE(created_at) AS order_date,
    SUM(pay_amount) AS daily_amount,
    SUM(SUM(pay_amount)) OVER (ORDER BY DATE(created_at)) AS running_total
FROM orders
GROUP BY DATE(created_at)
ORDER BY order_date;

-- 每个用户的累计消费
SELECT 
    u.username,
    o.order_no,
    o.pay_amount,
    o.created_at,
    SUM(o.pay_amount) OVER (
        PARTITION BY o.user_id 
        ORDER BY o.created_at
        ROWS UNBOUNDED PRECEDING
    ) AS cumulative_spent
FROM orders o
JOIN users u ON o.user_id = u.id
ORDER BY u.username, o.created_at;

5.4.6 NTILE(分桶)

-- 将用户按余额分为 4 档
SELECT 
    username, balance,
    NTILE(4) OVER (ORDER BY balance DESC) AS quartile
FROM users
WHERE status = 1;
-- 1 = 前 25%,4 = 后 25%

5.4.7 帧范围(Frame)

-- 滑动窗口:近 3 天的移动平均
SELECT 
    order_date,
    daily_amount,
    AVG(daily_amount) OVER (
        ORDER BY order_date
        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
    ) AS moving_avg_3d
FROM (
    SELECT DATE(created_at) AS order_date, SUM(pay_amount) AS daily_amount
    FROM orders
    GROUP BY DATE(created_at)
) t;
帧范围说明
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW从头到当前行(默认)
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW当前行和前 2 行
ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING前 1 行、当前行、后 1 行
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING当前行到末尾
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING整个分区

5.5 CTE(Common Table Expressions)

CTE 使用 WITH 关键字定义临时命名结果集,提高 SQL 可读性。

5.5.1 非递归 CTE

-- 基础 CTE:统计高消费用户
WITH high_spenders AS (
    SELECT 
        user_id,
        SUM(pay_amount) AS total_spent,
        COUNT(*) AS order_count
    FROM orders
    GROUP BY user_id
    HAVING total_spent > 1000
)
SELECT 
    u.username,
    u.email,
    hs.total_spent,
    hs.order_count
FROM high_spenders hs
JOIN users u ON hs.user_id = u.id
ORDER BY hs.total_spent DESC;

-- 多个 CTE
WITH monthly_stats AS (
    SELECT 
        DATE_FORMAT(created_at, '%Y-%m') AS month,
        COUNT(*) AS order_count,
        SUM(pay_amount) AS revenue
    FROM orders
    WHERE created_at >= '2026-01-01'
    GROUP BY DATE_FORMAT(created_at, '%Y-%m')
),
ranked_months AS (
    SELECT 
        month, order_count, revenue,
        RANK() OVER (ORDER BY revenue DESC) AS revenue_rank
    FROM monthly_stats
)
SELECT * FROM ranked_months WHERE revenue_rank <= 3;

5.5.2 递归 CTE

-- 组织架构树:查询某员工的所有上级
WITH RECURSIVE manager_chain AS (
    -- 锚点(Anchor):起始行
    SELECT id, name, manager_id, 1 AS level
    FROM employees
    WHERE id = 5  -- 从员工 ID=5 开始
    
    UNION ALL
    
    -- 递归部分
    SELECT e.id, e.name, e.manager_id, mc.level + 1
    FROM employees e
    INNER JOIN manager_chain mc ON e.id = mc.manager_id
)
SELECT * FROM manager_chain ORDER BY level;

-- 生成序列(1 到 100)
WITH RECURSIVE seq AS (
    SELECT 1 AS n
    UNION ALL
    SELECT n + 1 FROM seq WHERE n < 100
)
SELECT n FROM seq;

-- 计算日期序列
WITH RECURSIVE dates AS (
    SELECT DATE('2026-01-01') AS dt
    UNION ALL
    SELECT dt + INTERVAL 1 DAY FROM dates WHERE dt < '2026-01-31'
)
SELECT dt FROM dates;

⚠️ 递归 CTE 限制:默认最大递归深度为 1000,可以调整:

SET SESSION cte_max_recursion_depth = 10000;

5.6 集合操作与高级查询技巧

5.6.1 INTERSECT(8.0.31+)

-- 查询同时有"已支付"和"已发货"订单的用户
SELECT user_id FROM orders WHERE status = 1
INTERSECT
SELECT user_id FROM orders WHERE status = 2;

5.6.2 EXCEPT(8.0.31+)

-- 查询有"已支付"但没有"退款"的用户
SELECT user_id FROM orders WHERE status = 1
EXCEPT
SELECT user_id FROM orders WHERE status = 5;

5.6.3 JSON 聚合

-- JSON_ARRAYAGG:将多行聚合成 JSON 数组
SELECT 
    u.username,
    JSON_ARRAYAGG(o.order_no) AS order_list
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.id, u.username;

-- JSON_OBJECTAGG:将多行聚合成 JSON 对象
SELECT 
    JSON_OBJECTAGG(student, total_score) AS score_map
FROM (
    SELECT student, SUM(score) AS total_score
    FROM scores
    GROUP BY student
) t;

业务场景

场景 1:排行榜查询

-- 用户消费排行榜 TOP 10
WITH user_spending AS (
    SELECT 
        user_id,
        SUM(pay_amount) AS total_spent,
        COUNT(*) AS order_count,
        MAX(created_at) AS last_order_date
    FROM orders
    WHERE status IN (1, 2, 3)  -- 有效订单
    GROUP BY user_id
)
SELECT 
    ROW_NUMBER() OVER (ORDER BY us.total_spent DESC) AS ranking,
    u.username,
    us.total_spent,
    us.order_count,
    us.last_order_date
FROM user_spending us
JOIN users u ON us.user_id = u.id
ORDER BY us.total_spent DESC
LIMIT 10;

场景 2:留存率分析

-- 次日留存率
WITH first_visit AS (
    SELECT user_id, MIN(DATE(created_at)) AS first_date
    FROM user_logs
    GROUP BY user_id
),
retention AS (
    SELECT 
        fv.first_date,
        COUNT(DISTINCT fv.user_id) AS new_users,
        COUNT(DISTINCT ul.user_id) AS retained_users
    FROM first_visit fv
    LEFT JOIN user_logs ul 
        ON fv.user_id = ul.user_id
        AND DATE(ul.created_at) = fv.first_date + INTERVAL 1 DAY
    GROUP BY fv.first_date
)
SELECT 
    first_date,
    new_users,
    retained_users,
    ROUND(retained_users / new_users * 100, 2) AS retention_rate_pct
FROM retention
ORDER BY first_date;

扩展阅读