强曰为道
与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧.
文档目录

ClickHouse 教程 / 聚合函数与窗口函数

聚合函数与窗口函数

聚合函数和窗口函数是 ClickHouse 分析查询的核心能力,掌握它们是高效使用 ClickHouse 的关键。


1. 前置准备

CREATE DATABASE IF NOT EXISTS tutorial;

CREATE TABLE tutorial.events
(
    event_date Date,
    event_time DateTime,
    user_id UInt64,
    session_id String,
    event_type LowCardinality(String),
    page_url String,
    duration UInt32,
    amount Decimal128(2),
    device LowCardinality(String),
    city LowCardinality(String)
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(event_date)
ORDER BY (event_date, user_id, event_time);

-- 插入测试数据
INSERT INTO tutorial.events VALUES
    ('2026-05-01', '2026-05-01 08:00:00', 1, 's1', 'view',     '/home',     30, 0,      'mobile',  'Beijing'),
    ('2026-05-01', '2026-05-01 08:05:00', 1, 's1', 'click',    '/product/1', 0, 0,      'mobile',  'Beijing'),
    ('2026-05-01', '2026-05-01 08:10:00', 1, 's1', 'purchase', '/checkout',  0, 299.00, 'mobile',  'Beijing'),
    ('2026-05-01', '2026-05-01 09:00:00', 2, 's2', 'view',     '/home',     45, 0,      'desktop', 'Shanghai'),
    ('2026-05-01', '2026-05-01 09:05:00', 2, 's2', 'click',    '/product/2', 0, 0,      'desktop', 'Shanghai'),
    ('2026-05-01', '2026-05-01 09:10:00', 2, 's2', 'view',     '/about',    20, 0,      'desktop', 'Shanghai'),
    ('2026-05-02', '2026-05-02 10:00:00', 1, 's3', 'view',     '/home',     35, 0,      'mobile',  'Beijing'),
    ('2026-05-02', '2026-05-02 10:10:00', 1, 's3', 'purchase', '/checkout',  0, 599.00, 'mobile',  'Beijing'),
    ('2026-05-02', '2026-05-02 11:00:00', 3, 's4', 'view',     '/home',     25, 0,      'tablet',  'Guangzhou'),
    ('2026-05-02', '2026-05-02 11:05:00', 3, 's4', 'click',    '/product/1', 0, 0,      'tablet',  'Guangzhou'),
    ('2026-05-03', '2026-05-03 08:30:00', 2, 's5', 'view',     '/product/1', 40, 0,     'desktop', 'Shanghai'),
    ('2026-05-03', '2026-05-03 08:35:00', 2, 's5', 'purchase', '/checkout',  0, 199.00, 'desktop', 'Shanghai'),
    ('2026-05-03', '2026-05-03 14:00:00', 4, 's6', 'view',     '/home',     50, 0,      'mobile',  'Beijing'),
    ('2026-05-03', '2026-05-03 14:05:00', 4, 's6', 'click',    '/product/3', 0, 0,      'mobile',  'Beijing'),
    ('2026-05-03', '2026-05-03 14:15:00', 4, 's6', 'purchase', '/checkout',  0, 899.00, 'mobile',  'Beijing');

2. 常用聚合函数

2.1 基础聚合

SELECT
    count()                          AS total_events,       -- 总行数
    count(user_id)                   AS non_null_users,     -- 非 NULL 计数
    uniq(user_id)                    AS unique_users,       -- 去重用户数(近似)
    uniqExact(user_id)               AS exact_unique_users, -- 去重用户数(精确)
    sum(amount)                      AS total_revenue,      -- 总金额
    avg(amount)                      AS avg_revenue,        -- 平均金额
    min(amount)                      AS min_revenue,        -- 最小金额
    max(amount)                      AS max_revenue,        -- 最大金额
    median(amount)                   AS median_revenue,     -- 中位数
    stddevSamp(amount)               AS std_dev,            -- 标准差
    varSamp(amount)                  AS variance            -- 方差
FROM tutorial.events;

2.2 条件聚合

SELECT
    event_date,
    count()                                     AS total_events,
    countIf(event_type = 'purchase')            AS purchases,
    countIf(event_type = 'view')                AS views,
    sumIf(amount, event_type = 'purchase')      AS purchase_amount,
    avgIf(amount, event_type = 'purchase')      AS avg_purchase,
    uniqIf(user_id, event_type = 'purchase')    AS buying_users,
    round(countIf(event_type = 'purchase') / count() * 100, 2) AS conversion_rate
FROM tutorial.events
GROUP BY event_date
ORDER BY event_date;

预期输出:

┌─event_date─┬─total_events─┬─purchases─┬─views─┬─purchase_amount─┬─avg_purchase─┬─buying_users─┬─conversion_rate─┐
│ 2026-05-01 │            6 │         1 │     2 │          299.00 │       299.00 │            1 │           16.67 │
│ 2026-05-02 │            3 │         1 │     1 │          599.00 │       599.00 │            1 │           33.33 │
│ 2026-05-03 │            6 │         2 │     2 │         1098.00 │       549.00 │            2 │           33.33 │
└────────────┴──────────────┴───────────┴───────┴─────────────────┴──────────────┴──────────────┴─────────────────┘

2.3 数组聚合

SELECT
    user_id,
    groupArray(event_type)                AS event_list,           -- 所有值
    groupUniqArray(event_type)            AS unique_events,        -- 去重后的值
    groupArraySorted(5)(event_type)       AS top5_events,          -- 排序后取前5
    groupArrayLast(3)(page_url)           AS last3_pages,          -- 最后3个值
    groupArrayFirst(3)(page_url)          AS first3_pages          -- 最前3个值
FROM tutorial.events
GROUP BY user_id;

3. 高级聚合函数

3.1 argMin / argMax

-- argMin: 按某个值最小/最大时取对应的其他列
-- argMax: 按某个值最大时取对应的其他列

SELECT
    user_id,
    argMin(page_url, event_time)           AS first_page,      -- 第一次访问的页面
    argMax(page_url, event_time)           AS last_page,       -- 最后一次访问的页面
    argMax(amount, event_time)             AS last_amount,     -- 最后一次金额
    min(event_time)                        AS first_seen,
    max(event_time)                        AS last_seen
FROM tutorial.events
GROUP BY user_id;

3.2 any / anyHeavy / anyLast

SELECT
    user_id,
    any(device)         AS any_device,          -- 任意一个值(不确定)
    anyHeavy(city)      AS heavy_city,          -- 出现最频繁的值
    anyLast(event_type) AS last_event           -- 最后一个值
FROM tutorial.events
GROUP BY user_id;

3.3 topK / topKWeighted

-- topK: 频率最高的 K 个值(近似算法)
SELECT
    topK(3)(page_url)          AS top3_pages,
    topK(3)(event_type)        AS top3_events,
    topK(3)(city)              AS top3_cities
FROM tutorial.events;

-- topKWeighted: 带权重的 topK
SELECT
    topKWeighted(3)(city, toUInt64(amount)) AS top_cities_by_revenue
FROM tutorial.events
WHERE amount > 0;

3.4 quantile 系列

SELECT
    event_date,
    -- 分位数
    quantile(0.50)(duration)    AS p50_duration,
    quantile(0.90)(duration)    AS p90_duration,
    quantile(0.95)(duration)    AS p95_duration,
    quantile(0.99)(duration)    AS p99_duration,
    -- 精确分位数(较慢)
    quantileExact(0.95)(duration) AS p95_exact,
    -- TDigest 近似分位数
    quantileTDigest(0.95)(duration) AS p95_tdigest,
    -- 多分位数一次计算
    quantiles(0.25, 0.50, 0.75, 0.90, 0.95)(duration) AS percentiles
FROM tutorial.events
GROUP BY event_date
ORDER BY event_date;

3.5 统计函数

SELECT
    -- 相关性
    corr(duration, amount)              AS correlation,
    -- 协方差
    covarSamp(duration, amount)         AS covar_samp,
    covarPop(duration, amount)          AS covar_pop,
    -- 线性回归
    linReg(duration, amount)            AS linear_reg_slope,
    -- 信息熵
    entropy(event_type)                 AS event_entropy,
    -- 基尼系数
    gini(amount)                        AS gini_coeff
FROM tutorial.events;

4. GROUP BY 进阶

4.1 ROLLUP

-- ROLLUP: 从右到左逐级去除维度,生成小计和总计行
SELECT
    event_date,
    device,
    count()                     AS events,
    uniq(user_id)               AS users,
    sum(amount)                 AS revenue
FROM tutorial.events
GROUP BY ROLLUP(event_date, device)
ORDER BY event_date, device;

输出类似:

event_date  | device  | events | users | revenue
------------|---------|--------|-------|--------
2026-05-01  | mobile  |    3   |   1   |  299.00   ← 明细行
2026-05-01  | desktop |    3   |   1   |    0.00   ← 明细行
2026-05-01  |         |    6   |   2   |  299.00   ← 小计(按日期)
2026-05-02  | mobile  |    2   |   1   |  599.00
2026-05-02  | tablet  |    1   |   1   |    0.00
2026-05-02  |         |    3   |   2   |  599.00   ← 小计
...         |         |    ... |  ...  |   ...
            |         |   15   |   4   | 1996.00   ← 总计

4.2 CUBE

-- CUBE: 生成所有维度组合的聚合结果
SELECT
    event_date,
    device,
    city,
    count() AS events
FROM tutorial.events
GROUP BY CUBE(event_date, device, city)
ORDER BY event_date, device, city;
-- 输出: (event_date,device,city) / (event_date,device) / (event_date,city) / (device,city) / (event_date) / (device) / (city) / ()

4.3 GROUPING SETS

-- GROUPING SETS: 自定义分组组合
SELECT
    event_date,
    device,
    city,
    count() AS events,
    uniq(user_id) AS users
FROM tutorial.events
GROUP BY GROUPING SETS (
    (event_date, device),    -- 按日期 + 设备
    (event_date, city),      -- 按日期 + 城市
    (device, city)           -- 按设备 + 城市
)
ORDER BY event_date, device, city;

4.4 GROUP BY WITH TOTALS

-- WITH TOTALS: 在结果末尾添加总计行
SELECT
    device,
    city,
    count() AS events,
    sum(amount) AS revenue
FROM tutorial.events
GROUP BY device, city
WITH TOTALS
ORDER BY revenue DESC;

-- 访问总计行
SELECT
    device,
    city,
    count() AS events
FROM tutorial.events
GROUP BY device, city
WITH TOTALS
ORDER BY events DESC
LIMIT 10;

💡 提示ROLLUP 适合层次维度(如 年→月→日),CUBE 适合所有维度组合分析,GROUPING SETS 适合自定义聚合维度组合。


5. 窗口函数

窗口函数在不折叠行的情况下对数据进行计算,每行保留,同时可以访问"窗口"内的其他行。

5.1 ROW_NUMBER / RANK / DENSE_RANK

-- 每个用户的事件按时间排序编号
SELECT
    user_id,
    event_time,
    event_type,
    page_url,
    ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY event_time) AS row_num,
    RANK()       OVER (PARTITION BY user_id ORDER BY event_time) AS rank_num,
    DENSE_RANK() OVER (PARTITION BY user_id ORDER BY event_type) AS dense_rank_num
FROM tutorial.events
ORDER BY user_id, event_time;

-- 场景:每个用户最近一次购买事件
SELECT * FROM (
    SELECT
        user_id,
        event_time,
        amount,
        ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY event_time DESC) AS rn
    FROM tutorial.events
    WHERE event_type = 'purchase'
)
WHERE rn = 1;

5.2 LAG / LEAD

-- LAG: 向前取值(取前一行)
-- LEAD: 向后取值(取后一行)
SELECT
    user_id,
    event_time,
    event_type,
    page_url,
    -- 前一次事件的时间
    LAG(event_time) OVER (PARTITION BY user_id ORDER BY event_time) AS prev_event_time,
    -- 前一次事件的类型
    LAG(event_type) OVER (PARTITION BY user_id ORDER BY event_time) AS prev_event_type,
    -- 后一次事件的时间
    LEAD(event_time) OVER (PARTITION BY user_id ORDER BY event_time) AS next_event_time,
    -- 当前事件与前一次事件的时间间隔(秒)
    dateDiff('second',
        LAG(event_time) OVER (PARTITION BY user_id ORDER BY event_time),
        event_time
    ) AS seconds_since_prev
FROM tutorial.events
ORDER BY user_id, event_time;

5.3 FIRST_VALUE / LAST_VALUE / NTH_VALUE

SELECT
    user_id,
    event_time,
    event_type,
    amount,
    -- 窗口内第一行的值
    FIRST_VALUE(page_url) OVER w AS first_page,
    -- 窗口内最后一行的值(需要正确的帧定义)
    LAST_VALUE(page_url)  OVER w AS current_page,
    -- 窗口内第 N 行的值
    NTH_VALUE(page_url, 2) OVER w AS second_page
FROM tutorial.events
WINDOW w AS (PARTITION BY user_id ORDER BY event_time
             ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
ORDER BY user_id, event_time;

⚠️ 注意LAST_VALUE 默认帧是 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW,所以默认返回的是当前行的值。要获取真正的最后一行,需要显式指定帧范围。

5.4 SUM … OVER / AVG … OVER / COUNT … OVER

-- 累计求和
SELECT
    event_date,
    user_id,
    amount,
    -- 累计金额(按时间排序)
    sum(amount) OVER (PARTITION BY user_id ORDER BY event_time) AS cumulative_amount,
    -- 总金额(整个窗口)
    sum(amount) OVER (PARTITION BY user_id) AS user_total,
    -- 每日金额占比
    round(amount / sum(amount) OVER (PARTITION BY event_date) * 100, 2) AS daily_pct
FROM tutorial.events
WHERE amount > 0
ORDER BY user_id, event_time;

5.5 其他窗口函数

SELECT
    user_id,
    event_time,
    event_type,
    -- 分区内行号
    ROW_NUMBER() OVER w AS row_num,
    -- 分区总行数
    COUNT(*) OVER w AS total_rows,
    -- 当前行占分区的百分比
    round(ROW_NUMBER() OVER w / COUNT(*) OVER w * 100, 2) AS pct,
    -- NTILE: 将分区分为 N 组
    NTILE(4) OVER w AS quartile,
    -- 百分位排名
    round(PERCENT_RANK() OVER w * 100, 2) AS percent_rank,
    -- 累积分布
    round(CUME_DIST() OVER w * 100, 2) AS cum_dist
FROM tutorial.events
WINDOW w AS (PARTITION BY user_id ORDER BY event_time)
ORDER BY user_id, event_time;

6. 窗口帧(Window Frame)

6.1 ROWS 帧

-- ROWS: 按物理行数定义帧
SELECT
    event_date,
    user_id,
    amount,
    -- 滑动窗口:前 2 行 + 当前行 + 后 2 行
    sum(amount) OVER (
        PARTITION BY user_id
        ORDER BY event_time
        ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING
    ) AS moving_sum_5,
    -- 当前行及之前所有行
    avg(amount) OVER (
        PARTITION BY user_id
        ORDER BY event_time
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS running_avg
FROM tutorial.events
WHERE amount > 0
ORDER BY user_id, event_time;

6.2 RANGE 帧

-- RANGE: 按值范围定义帧
SELECT
    event_date,
    user_id,
    amount,
    -- 金额在当前值 ±100 范围内的行
    count(*) OVER (
        ORDER BY amount
        RANGE BETWEEN 100 PRECEDING AND 100 FOLLOWING
    ) AS similar_amount_count
FROM tutorial.events
WHERE amount > 0
ORDER BY amount;

6.3 GROUPS 帧

-- GROUPS: 按排序值的分组定义帧
SELECT
    event_date,
    user_id,
    event_type,
    count(*) OVER (
        ORDER BY event_type
        GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING
    ) AS related_events_count
FROM tutorial.events
ORDER BY event_type;

6.4 默认帧行为

-- 不指定帧时的默认行为:
-- 有 ORDER BY: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-- 无 ORDER BY: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING

-- 示例:累计和(隐式帧)
SELECT
    event_date,
    amount,
    sum(amount) OVER (ORDER BY event_date) AS implicit_frame,
    sum(amount) OVER (ORDER BY event_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS explicit_frame
FROM tutorial.events
WHERE amount > 0;

7. 窗口函数命名(WINDOW 子句)

-- 使用 WINDOW 子句复用窗口定义
SELECT
    user_id,
    event_time,
    amount,
    ROW_NUMBER() OVER w AS row_num,
    sum(amount) OVER w AS cumulative_amount,
    avg(amount) OVER w AS running_avg,
    max(amount) OVER w AS running_max,
    count(*) OVER w AS running_count
FROM tutorial.events
WINDOW w AS (PARTITION BY user_id ORDER BY event_time)
ORDER BY user_id, event_time;

8. 性能优化

8.1 聚合性能优化

-- ✅ 使用 uniq 代替 COUNT(DISTINCT)(近似计算,快 10~100 倍)
SELECT uniq(user_id) FROM tutorial.events;

-- ✅ 使用 quantile 代替排序取中位数
SELECT quantile(0.5)(amount) FROM tutorial.events;

-- ✅ 使用 -State/-Merge 函数配合物化视图预聚合
-- 聚合状态表
CREATE TABLE tutorial.events_agg
(
    event_date Date,
    event_type LowCardinality(String),
    count_state AggregateFunction(count, UInt64),
    sum_state AggregateFunction(sum, Decimal128(2)),
    uniq_state AggregateFunction(uniq, UInt64)
)
ENGINE = AggregatingMergeTree()
ORDER BY (event_date, event_type);

-- 物化视图
CREATE MATERIALIZED VIEW tutorial.events_agg_mv TO tutorial.events_agg AS
SELECT
    event_date,
    event_type,
    countState()       AS count_state,
    sumState(amount)   AS sum_state,
    uniqState(user_id) AS uniq_state
FROM tutorial.events
GROUP BY event_date, event_type;

-- 查询时使用合并函数
SELECT
    event_date,
    event_type,
    countMerge(count_state) AS events,
    sumMerge(sum_state) AS revenue,
    uniqMerge(uniq_state) AS unique_users
FROM tutorial.events_agg
GROUP BY event_date, event_type;

8.2 窗口函数性能优化

-- ✅ 减少窗口函数数量(合并窗口定义)
-- ❌ 低效
SELECT
    sum(a) OVER (PARTITION BY x ORDER BY y),
    avg(a) OVER (PARTITION BY x ORDER BY y),
    count() OVER (PARTITION BY x ORDER BY y)
FROM t;

-- ✅ 高效
SELECT
    sum(a) OVER w,
    avg(a) OVER w,
    count() OVER w
FROM t
WINDOW w AS (PARTITION BY x ORDER BY y);

-- ✅ 使用 LIMIT 减少窗口函数计算行数
SELECT *
FROM (
    SELECT
        user_id,
        amount,
        ROW_NUMBER() OVER (ORDER BY amount DESC) AS rn
    FROM tutorial.events
)
WHERE rn <= 10;

9. 实际案例

9.1 留存分析

-- 用户次日留存率
WITH
    first_visit AS (
        SELECT
            user_id,
            min(event_date) AS first_date
        FROM tutorial.events
        GROUP BY user_id
    ),
    retention AS (
        SELECT
            fv.first_date,
            count(DISTINCT fv.user_id) AS cohort_size,
            count(DISTINCT e.user_id) AS retained_users,
            round(count(DISTINCT e.user_id) / count(DISTINCT fv.user_id) * 100, 2) AS retention_rate
        FROM first_visit AS fv
        LEFT JOIN tutorial.events AS e
            ON fv.user_id = e.user_id
            AND e.event_date = fv.first_date + INTERVAL 1 DAY
        GROUP BY fv.first_date
    )
SELECT
    first_date AS cohort_date,
    cohort_size,
    retained_users,
    retention_rate
FROM retention
ORDER BY first_date;

9.2 漏斗分析

-- 漏斗: view → click → purchase
WITH
    funnel AS (
        SELECT
            user_id,
            -- 是否完成了 view
            countIf(event_type = 'view')     > 0 AS has_view,
            -- 是否完成了 click(在 view 之后)
            countIf(event_type = 'click')    > 0 AS has_click,
            -- 是否完成了 purchase(在 click 之后)
            countIf(event_type = 'purchase') > 0 AS has_purchase
        FROM tutorial.events
        GROUP BY user_id
    )
SELECT
    count()                                           AS total_users,
    countIf(has_view)                                 AS step1_view,
    countIf(has_view AND has_click)                   AS step2_click,
    countIf(has_view AND has_click AND has_purchase)  AS step3_purchase,
    round(countIf(has_view AND has_click) / countIf(has_view) * 100, 2) AS view_to_click,
    round(countIf(has_view AND has_click AND has_purchase) / countIf(has_view AND has_click) * 100, 2) AS click_to_purchase
FROM funnel;

9.3 移动平均

-- 7 天移动平均
SELECT
    event_date,
    daily_revenue,
    avg(daily_revenue) OVER (
        ORDER BY event_date
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) AS moving_avg_7d
FROM (
    SELECT
        event_date,
        sum(amount) AS daily_revenue
    FROM tutorial.events
    WHERE amount > 0
    GROUP BY event_date
    ORDER BY event_date
);

9.4 用户分层(RFM)

-- RFM 分析: Recency / Frequency / Monetary
WITH
    user_stats AS (
        SELECT
            user_id,
            dateDiff('day', max(event_date), toDate('2026-05-04')) AS recency,
            count()                         AS frequency,
            sum(amount)                     AS monetary
        FROM tutorial.events
        WHERE event_type = 'purchase'
        GROUP BY user_id
    )
SELECT
    user_id,
    recency,
    frequency,
    monetary,
    -- 分层
    CASE
        WHEN recency <= 1 AND frequency >= 2 AND monetary >= 500 THEN 'Champions'
        WHEN recency <= 3 AND frequency >= 1 THEN 'Loyal'
        WHEN recency <= 1 THEN 'New Customers'
        WHEN recency <= 7 THEN 'At Risk'
        ELSE 'Lost'
    END AS user_segment
FROM user_stats
ORDER BY monetary DESC;

9.5 同比环比分析

-- 月度收入同比环比
WITH
    monthly AS (
        SELECT
            toStartOfMonth(event_date) AS month,
            sum(amount) AS revenue
        FROM tutorial.events
        WHERE amount > 0
        GROUP BY month
    )
SELECT
    month,
    revenue,
    -- 环比(上月)
    lagInFrame(revenue) OVER (ORDER BY month) AS prev_month,
    round((revenue - lagInFrame(revenue) OVER (ORDER BY month)) /
          lagInFrame(revenue) OVER (ORDER BY month) * 100, 2) AS mom_growth_pct
FROM monthly
ORDER BY month;

扩展阅读

  1. ClickHouse 聚合函数
  2. 窗口函数
  3. GROUP BY 进阶
  4. quantile 函数
  5. uniq 近似算法