ClickHouse 教程 / 聚合函数与窗口函数
聚合函数与窗口函数
聚合函数和窗口函数是 ClickHouse 分析查询的核心能力,掌握它们是高效使用 ClickHouse 的关键。
1. 前置准备
CREATE DATABASE IF NOT EXISTS tutorial;
CREATE TABLE tutorial.events
(
event_date Date,
event_time DateTime,
user_id UInt64,
session_id String,
event_type LowCardinality(String),
page_url String,
duration UInt32,
amount Decimal128(2),
device LowCardinality(String),
city LowCardinality(String)
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(event_date)
ORDER BY (event_date, user_id, event_time);
-- 插入测试数据
INSERT INTO tutorial.events VALUES
('2026-05-01', '2026-05-01 08:00:00', 1, 's1', 'view', '/home', 30, 0, 'mobile', 'Beijing'),
('2026-05-01', '2026-05-01 08:05:00', 1, 's1', 'click', '/product/1', 0, 0, 'mobile', 'Beijing'),
('2026-05-01', '2026-05-01 08:10:00', 1, 's1', 'purchase', '/checkout', 0, 299.00, 'mobile', 'Beijing'),
('2026-05-01', '2026-05-01 09:00:00', 2, 's2', 'view', '/home', 45, 0, 'desktop', 'Shanghai'),
('2026-05-01', '2026-05-01 09:05:00', 2, 's2', 'click', '/product/2', 0, 0, 'desktop', 'Shanghai'),
('2026-05-01', '2026-05-01 09:10:00', 2, 's2', 'view', '/about', 20, 0, 'desktop', 'Shanghai'),
('2026-05-02', '2026-05-02 10:00:00', 1, 's3', 'view', '/home', 35, 0, 'mobile', 'Beijing'),
('2026-05-02', '2026-05-02 10:10:00', 1, 's3', 'purchase', '/checkout', 0, 599.00, 'mobile', 'Beijing'),
('2026-05-02', '2026-05-02 11:00:00', 3, 's4', 'view', '/home', 25, 0, 'tablet', 'Guangzhou'),
('2026-05-02', '2026-05-02 11:05:00', 3, 's4', 'click', '/product/1', 0, 0, 'tablet', 'Guangzhou'),
('2026-05-03', '2026-05-03 08:30:00', 2, 's5', 'view', '/product/1', 40, 0, 'desktop', 'Shanghai'),
('2026-05-03', '2026-05-03 08:35:00', 2, 's5', 'purchase', '/checkout', 0, 199.00, 'desktop', 'Shanghai'),
('2026-05-03', '2026-05-03 14:00:00', 4, 's6', 'view', '/home', 50, 0, 'mobile', 'Beijing'),
('2026-05-03', '2026-05-03 14:05:00', 4, 's6', 'click', '/product/3', 0, 0, 'mobile', 'Beijing'),
('2026-05-03', '2026-05-03 14:15:00', 4, 's6', 'purchase', '/checkout', 0, 899.00, 'mobile', 'Beijing');
2. 常用聚合函数
2.1 基础聚合
SELECT
count() AS total_events, -- 总行数
count(user_id) AS non_null_users, -- 非 NULL 计数
uniq(user_id) AS unique_users, -- 去重用户数(近似)
uniqExact(user_id) AS exact_unique_users, -- 去重用户数(精确)
sum(amount) AS total_revenue, -- 总金额
avg(amount) AS avg_revenue, -- 平均金额
min(amount) AS min_revenue, -- 最小金额
max(amount) AS max_revenue, -- 最大金额
median(amount) AS median_revenue, -- 中位数
stddevSamp(amount) AS std_dev, -- 标准差
varSamp(amount) AS variance -- 方差
FROM tutorial.events;
2.2 条件聚合
SELECT
event_date,
count() AS total_events,
countIf(event_type = 'purchase') AS purchases,
countIf(event_type = 'view') AS views,
sumIf(amount, event_type = 'purchase') AS purchase_amount,
avgIf(amount, event_type = 'purchase') AS avg_purchase,
uniqIf(user_id, event_type = 'purchase') AS buying_users,
round(countIf(event_type = 'purchase') / count() * 100, 2) AS conversion_rate
FROM tutorial.events
GROUP BY event_date
ORDER BY event_date;
预期输出:
┌─event_date─┬─total_events─┬─purchases─┬─views─┬─purchase_amount─┬─avg_purchase─┬─buying_users─┬─conversion_rate─┐
│ 2026-05-01 │ 6 │ 1 │ 2 │ 299.00 │ 299.00 │ 1 │ 16.67 │
│ 2026-05-02 │ 3 │ 1 │ 1 │ 599.00 │ 599.00 │ 1 │ 33.33 │
│ 2026-05-03 │ 6 │ 2 │ 2 │ 1098.00 │ 549.00 │ 2 │ 33.33 │
└────────────┴──────────────┴───────────┴───────┴─────────────────┴──────────────┴──────────────┴─────────────────┘
2.3 数组聚合
SELECT
user_id,
groupArray(event_type) AS event_list, -- 所有值
groupUniqArray(event_type) AS unique_events, -- 去重后的值
groupArraySorted(5)(event_type) AS top5_events, -- 排序后取前5
groupArrayLast(3)(page_url) AS last3_pages, -- 最后3个值
groupArrayFirst(3)(page_url) AS first3_pages -- 最前3个值
FROM tutorial.events
GROUP BY user_id;
3. 高级聚合函数
3.1 argMin / argMax
-- argMin: 按某个值最小/最大时取对应的其他列
-- argMax: 按某个值最大时取对应的其他列
SELECT
user_id,
argMin(page_url, event_time) AS first_page, -- 第一次访问的页面
argMax(page_url, event_time) AS last_page, -- 最后一次访问的页面
argMax(amount, event_time) AS last_amount, -- 最后一次金额
min(event_time) AS first_seen,
max(event_time) AS last_seen
FROM tutorial.events
GROUP BY user_id;
3.2 any / anyHeavy / anyLast
SELECT
user_id,
any(device) AS any_device, -- 任意一个值(不确定)
anyHeavy(city) AS heavy_city, -- 出现最频繁的值
anyLast(event_type) AS last_event -- 最后一个值
FROM tutorial.events
GROUP BY user_id;
3.3 topK / topKWeighted
-- topK: 频率最高的 K 个值(近似算法)
SELECT
topK(3)(page_url) AS top3_pages,
topK(3)(event_type) AS top3_events,
topK(3)(city) AS top3_cities
FROM tutorial.events;
-- topKWeighted: 带权重的 topK
SELECT
topKWeighted(3)(city, toUInt64(amount)) AS top_cities_by_revenue
FROM tutorial.events
WHERE amount > 0;
3.4 quantile 系列
SELECT
event_date,
-- 分位数
quantile(0.50)(duration) AS p50_duration,
quantile(0.90)(duration) AS p90_duration,
quantile(0.95)(duration) AS p95_duration,
quantile(0.99)(duration) AS p99_duration,
-- 精确分位数(较慢)
quantileExact(0.95)(duration) AS p95_exact,
-- TDigest 近似分位数
quantileTDigest(0.95)(duration) AS p95_tdigest,
-- 多分位数一次计算
quantiles(0.25, 0.50, 0.75, 0.90, 0.95)(duration) AS percentiles
FROM tutorial.events
GROUP BY event_date
ORDER BY event_date;
3.5 统计函数
SELECT
-- 相关性
corr(duration, amount) AS correlation,
-- 协方差
covarSamp(duration, amount) AS covar_samp,
covarPop(duration, amount) AS covar_pop,
-- 线性回归
linReg(duration, amount) AS linear_reg_slope,
-- 信息熵
entropy(event_type) AS event_entropy,
-- 基尼系数
gini(amount) AS gini_coeff
FROM tutorial.events;
4. GROUP BY 进阶
4.1 ROLLUP
-- ROLLUP: 从右到左逐级去除维度,生成小计和总计行
SELECT
event_date,
device,
count() AS events,
uniq(user_id) AS users,
sum(amount) AS revenue
FROM tutorial.events
GROUP BY ROLLUP(event_date, device)
ORDER BY event_date, device;
输出类似:
event_date | device | events | users | revenue
------------|---------|--------|-------|--------
2026-05-01 | mobile | 3 | 1 | 299.00 ← 明细行
2026-05-01 | desktop | 3 | 1 | 0.00 ← 明细行
2026-05-01 | | 6 | 2 | 299.00 ← 小计(按日期)
2026-05-02 | mobile | 2 | 1 | 599.00
2026-05-02 | tablet | 1 | 1 | 0.00
2026-05-02 | | 3 | 2 | 599.00 ← 小计
... | | ... | ... | ...
| | 15 | 4 | 1996.00 ← 总计
4.2 CUBE
-- CUBE: 生成所有维度组合的聚合结果
SELECT
event_date,
device,
city,
count() AS events
FROM tutorial.events
GROUP BY CUBE(event_date, device, city)
ORDER BY event_date, device, city;
-- 输出: (event_date,device,city) / (event_date,device) / (event_date,city) / (device,city) / (event_date) / (device) / (city) / ()
4.3 GROUPING SETS
-- GROUPING SETS: 自定义分组组合
SELECT
event_date,
device,
city,
count() AS events,
uniq(user_id) AS users
FROM tutorial.events
GROUP BY GROUPING SETS (
(event_date, device), -- 按日期 + 设备
(event_date, city), -- 按日期 + 城市
(device, city) -- 按设备 + 城市
)
ORDER BY event_date, device, city;
4.4 GROUP BY WITH TOTALS
-- WITH TOTALS: 在结果末尾添加总计行
SELECT
device,
city,
count() AS events,
sum(amount) AS revenue
FROM tutorial.events
GROUP BY device, city
WITH TOTALS
ORDER BY revenue DESC;
-- 访问总计行
SELECT
device,
city,
count() AS events
FROM tutorial.events
GROUP BY device, city
WITH TOTALS
ORDER BY events DESC
LIMIT 10;
💡 提示:ROLLUP 适合层次维度(如 年→月→日),CUBE 适合所有维度组合分析,GROUPING SETS 适合自定义聚合维度组合。
5. 窗口函数
窗口函数在不折叠行的情况下对数据进行计算,每行保留,同时可以访问"窗口"内的其他行。
5.1 ROW_NUMBER / RANK / DENSE_RANK
-- 每个用户的事件按时间排序编号
SELECT
user_id,
event_time,
event_type,
page_url,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY event_time) AS row_num,
RANK() OVER (PARTITION BY user_id ORDER BY event_time) AS rank_num,
DENSE_RANK() OVER (PARTITION BY user_id ORDER BY event_type) AS dense_rank_num
FROM tutorial.events
ORDER BY user_id, event_time;
-- 场景:每个用户最近一次购买事件
SELECT * FROM (
SELECT
user_id,
event_time,
amount,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY event_time DESC) AS rn
FROM tutorial.events
WHERE event_type = 'purchase'
)
WHERE rn = 1;
5.2 LAG / LEAD
-- LAG: 向前取值(取前一行)
-- LEAD: 向后取值(取后一行)
SELECT
user_id,
event_time,
event_type,
page_url,
-- 前一次事件的时间
LAG(event_time) OVER (PARTITION BY user_id ORDER BY event_time) AS prev_event_time,
-- 前一次事件的类型
LAG(event_type) OVER (PARTITION BY user_id ORDER BY event_time) AS prev_event_type,
-- 后一次事件的时间
LEAD(event_time) OVER (PARTITION BY user_id ORDER BY event_time) AS next_event_time,
-- 当前事件与前一次事件的时间间隔(秒)
dateDiff('second',
LAG(event_time) OVER (PARTITION BY user_id ORDER BY event_time),
event_time
) AS seconds_since_prev
FROM tutorial.events
ORDER BY user_id, event_time;
5.3 FIRST_VALUE / LAST_VALUE / NTH_VALUE
SELECT
user_id,
event_time,
event_type,
amount,
-- 窗口内第一行的值
FIRST_VALUE(page_url) OVER w AS first_page,
-- 窗口内最后一行的值(需要正确的帧定义)
LAST_VALUE(page_url) OVER w AS current_page,
-- 窗口内第 N 行的值
NTH_VALUE(page_url, 2) OVER w AS second_page
FROM tutorial.events
WINDOW w AS (PARTITION BY user_id ORDER BY event_time
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
ORDER BY user_id, event_time;
⚠️ 注意:LAST_VALUE 默认帧是 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW,所以默认返回的是当前行的值。要获取真正的最后一行,需要显式指定帧范围。
5.4 SUM … OVER / AVG … OVER / COUNT … OVER
-- 累计求和
SELECT
event_date,
user_id,
amount,
-- 累计金额(按时间排序)
sum(amount) OVER (PARTITION BY user_id ORDER BY event_time) AS cumulative_amount,
-- 总金额(整个窗口)
sum(amount) OVER (PARTITION BY user_id) AS user_total,
-- 每日金额占比
round(amount / sum(amount) OVER (PARTITION BY event_date) * 100, 2) AS daily_pct
FROM tutorial.events
WHERE amount > 0
ORDER BY user_id, event_time;
5.5 其他窗口函数
SELECT
user_id,
event_time,
event_type,
-- 分区内行号
ROW_NUMBER() OVER w AS row_num,
-- 分区总行数
COUNT(*) OVER w AS total_rows,
-- 当前行占分区的百分比
round(ROW_NUMBER() OVER w / COUNT(*) OVER w * 100, 2) AS pct,
-- NTILE: 将分区分为 N 组
NTILE(4) OVER w AS quartile,
-- 百分位排名
round(PERCENT_RANK() OVER w * 100, 2) AS percent_rank,
-- 累积分布
round(CUME_DIST() OVER w * 100, 2) AS cum_dist
FROM tutorial.events
WINDOW w AS (PARTITION BY user_id ORDER BY event_time)
ORDER BY user_id, event_time;
6. 窗口帧(Window Frame)
6.1 ROWS 帧
-- ROWS: 按物理行数定义帧
SELECT
event_date,
user_id,
amount,
-- 滑动窗口:前 2 行 + 当前行 + 后 2 行
sum(amount) OVER (
PARTITION BY user_id
ORDER BY event_time
ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING
) AS moving_sum_5,
-- 当前行及之前所有行
avg(amount) OVER (
PARTITION BY user_id
ORDER BY event_time
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS running_avg
FROM tutorial.events
WHERE amount > 0
ORDER BY user_id, event_time;
6.2 RANGE 帧
-- RANGE: 按值范围定义帧
SELECT
event_date,
user_id,
amount,
-- 金额在当前值 ±100 范围内的行
count(*) OVER (
ORDER BY amount
RANGE BETWEEN 100 PRECEDING AND 100 FOLLOWING
) AS similar_amount_count
FROM tutorial.events
WHERE amount > 0
ORDER BY amount;
6.3 GROUPS 帧
-- GROUPS: 按排序值的分组定义帧
SELECT
event_date,
user_id,
event_type,
count(*) OVER (
ORDER BY event_type
GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING
) AS related_events_count
FROM tutorial.events
ORDER BY event_type;
6.4 默认帧行为
-- 不指定帧时的默认行为:
-- 有 ORDER BY: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-- 无 ORDER BY: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
-- 示例:累计和(隐式帧)
SELECT
event_date,
amount,
sum(amount) OVER (ORDER BY event_date) AS implicit_frame,
sum(amount) OVER (ORDER BY event_date
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS explicit_frame
FROM tutorial.events
WHERE amount > 0;
7. 窗口函数命名(WINDOW 子句)
-- 使用 WINDOW 子句复用窗口定义
SELECT
user_id,
event_time,
amount,
ROW_NUMBER() OVER w AS row_num,
sum(amount) OVER w AS cumulative_amount,
avg(amount) OVER w AS running_avg,
max(amount) OVER w AS running_max,
count(*) OVER w AS running_count
FROM tutorial.events
WINDOW w AS (PARTITION BY user_id ORDER BY event_time)
ORDER BY user_id, event_time;
8. 性能优化
8.1 聚合性能优化
-- ✅ 使用 uniq 代替 COUNT(DISTINCT)(近似计算,快 10~100 倍)
SELECT uniq(user_id) FROM tutorial.events;
-- ✅ 使用 quantile 代替排序取中位数
SELECT quantile(0.5)(amount) FROM tutorial.events;
-- ✅ 使用 -State/-Merge 函数配合物化视图预聚合
-- 聚合状态表
CREATE TABLE tutorial.events_agg
(
event_date Date,
event_type LowCardinality(String),
count_state AggregateFunction(count, UInt64),
sum_state AggregateFunction(sum, Decimal128(2)),
uniq_state AggregateFunction(uniq, UInt64)
)
ENGINE = AggregatingMergeTree()
ORDER BY (event_date, event_type);
-- 物化视图
CREATE MATERIALIZED VIEW tutorial.events_agg_mv TO tutorial.events_agg AS
SELECT
event_date,
event_type,
countState() AS count_state,
sumState(amount) AS sum_state,
uniqState(user_id) AS uniq_state
FROM tutorial.events
GROUP BY event_date, event_type;
-- 查询时使用合并函数
SELECT
event_date,
event_type,
countMerge(count_state) AS events,
sumMerge(sum_state) AS revenue,
uniqMerge(uniq_state) AS unique_users
FROM tutorial.events_agg
GROUP BY event_date, event_type;
8.2 窗口函数性能优化
-- ✅ 减少窗口函数数量(合并窗口定义)
-- ❌ 低效
SELECT
sum(a) OVER (PARTITION BY x ORDER BY y),
avg(a) OVER (PARTITION BY x ORDER BY y),
count() OVER (PARTITION BY x ORDER BY y)
FROM t;
-- ✅ 高效
SELECT
sum(a) OVER w,
avg(a) OVER w,
count() OVER w
FROM t
WINDOW w AS (PARTITION BY x ORDER BY y);
-- ✅ 使用 LIMIT 减少窗口函数计算行数
SELECT *
FROM (
SELECT
user_id,
amount,
ROW_NUMBER() OVER (ORDER BY amount DESC) AS rn
FROM tutorial.events
)
WHERE rn <= 10;
9. 实际案例
9.1 留存分析
-- 用户次日留存率
WITH
first_visit AS (
SELECT
user_id,
min(event_date) AS first_date
FROM tutorial.events
GROUP BY user_id
),
retention AS (
SELECT
fv.first_date,
count(DISTINCT fv.user_id) AS cohort_size,
count(DISTINCT e.user_id) AS retained_users,
round(count(DISTINCT e.user_id) / count(DISTINCT fv.user_id) * 100, 2) AS retention_rate
FROM first_visit AS fv
LEFT JOIN tutorial.events AS e
ON fv.user_id = e.user_id
AND e.event_date = fv.first_date + INTERVAL 1 DAY
GROUP BY fv.first_date
)
SELECT
first_date AS cohort_date,
cohort_size,
retained_users,
retention_rate
FROM retention
ORDER BY first_date;
9.2 漏斗分析
-- 漏斗: view → click → purchase
WITH
funnel AS (
SELECT
user_id,
-- 是否完成了 view
countIf(event_type = 'view') > 0 AS has_view,
-- 是否完成了 click(在 view 之后)
countIf(event_type = 'click') > 0 AS has_click,
-- 是否完成了 purchase(在 click 之后)
countIf(event_type = 'purchase') > 0 AS has_purchase
FROM tutorial.events
GROUP BY user_id
)
SELECT
count() AS total_users,
countIf(has_view) AS step1_view,
countIf(has_view AND has_click) AS step2_click,
countIf(has_view AND has_click AND has_purchase) AS step3_purchase,
round(countIf(has_view AND has_click) / countIf(has_view) * 100, 2) AS view_to_click,
round(countIf(has_view AND has_click AND has_purchase) / countIf(has_view AND has_click) * 100, 2) AS click_to_purchase
FROM funnel;
9.3 移动平均
-- 7 天移动平均
SELECT
event_date,
daily_revenue,
avg(daily_revenue) OVER (
ORDER BY event_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS moving_avg_7d
FROM (
SELECT
event_date,
sum(amount) AS daily_revenue
FROM tutorial.events
WHERE amount > 0
GROUP BY event_date
ORDER BY event_date
);
9.4 用户分层(RFM)
-- RFM 分析: Recency / Frequency / Monetary
WITH
user_stats AS (
SELECT
user_id,
dateDiff('day', max(event_date), toDate('2026-05-04')) AS recency,
count() AS frequency,
sum(amount) AS monetary
FROM tutorial.events
WHERE event_type = 'purchase'
GROUP BY user_id
)
SELECT
user_id,
recency,
frequency,
monetary,
-- 分层
CASE
WHEN recency <= 1 AND frequency >= 2 AND monetary >= 500 THEN 'Champions'
WHEN recency <= 3 AND frequency >= 1 THEN 'Loyal'
WHEN recency <= 1 THEN 'New Customers'
WHEN recency <= 7 THEN 'At Risk'
ELSE 'Lost'
END AS user_segment
FROM user_stats
ORDER BY monetary DESC;
9.5 同比环比分析
-- 月度收入同比环比
WITH
monthly AS (
SELECT
toStartOfMonth(event_date) AS month,
sum(amount) AS revenue
FROM tutorial.events
WHERE amount > 0
GROUP BY month
)
SELECT
month,
revenue,
-- 环比(上月)
lagInFrame(revenue) OVER (ORDER BY month) AS prev_month,
round((revenue - lagInFrame(revenue) OVER (ORDER BY month)) /
lagInFrame(revenue) OVER (ORDER BY month) * 100, 2) AS mom_growth_pct
FROM monthly
ORDER BY month;