描述
MergeState 组合器
可用于 avg
函数,以合并类型为 AverageFunction(avg, T) 的部分聚合状态,并
返回一个新的中间聚合状态。
示例用法
MergeState 组合器在多级聚合
场景中特别有用:当你希望合并预聚合状态,并继续将其保留为
状态 (而不是将其最终化) 以便后续处理时,尤其如此。为了说明这一点,我们来看
一个示例:如何将单台服务器的性能指标
转换为跨多个层级的分层聚合:服务器级 → 区域级
→ 数据中心级。
首先,我们创建一个表来存储原始数据:
CREATE TABLE raw_server_metrics
(
timestamp DateTime DEFAULT now(),
server_id UInt32,
region String,
datacenter String,
response_time_ms UInt32
)
ENGINE = MergeTree()
ORDER BY (region, server_id, timestamp);
CREATE TABLE server_performance
(
server_id UInt32,
region String,
datacenter String,
avg_response_time AggregateFunction(avg, UInt32)
)
ENGINE = AggregatingMergeTree()
ORDER BY (region, server_id);
CREATE MATERIALIZED VIEW server_performance_mv
TO server_performance
AS SELECT
server_id,
region,
datacenter,
avgState(response_time_ms) AS avg_response_time
FROM raw_server_metrics
GROUP BY server_id, region, datacenter;
CREATE TABLE region_performance
(
region String,
datacenter String,
avg_response_time AggregateFunction(avg, UInt32)
)
ENGINE = AggregatingMergeTree()
ORDER BY (datacenter, region);
CREATE MATERIALIZED VIEW region_performance_mv
TO region_performance
AS SELECT
region,
datacenter,
avgMergeState(avg_response_time) AS avg_response_time
FROM server_performance
GROUP BY region, datacenter;
-- 数据中心级别的表和 materialized view
CREATE TABLE datacenter_performance
(
datacenter String,
avg_response_time AggregateFunction(avg, UInt32)
)
ENGINE = AggregatingMergeTree()
ORDER BY datacenter;
CREATE MATERIALIZED VIEW datacenter_performance_mv
TO datacenter_performance
AS SELECT
datacenter,
avgMergeState(avg_response_time) AS avg_response_time
FROM region_performance
GROUP BY datacenter;
INSERT INTO raw_server_metrics (timestamp, server_id, region, datacenter, response_time_ms) VALUES
(now(), 101, 'us-east', 'dc1', 120),
(now(), 101, 'us-east', 'dc1', 130),
(now(), 102, 'us-east', 'dc1', 115),
(now(), 201, 'us-west', 'dc1', 95),
(now(), 202, 'us-west', 'dc1', 105),
(now(), 301, 'eu-central', 'dc2', 145),
(now(), 302, 'eu-central', 'dc2', 155);
- 服务级别
- 区域级别
- 数据中心级别
SELECT
server_id,
region,
avgMerge(avg_response_time) AS avg_response_ms
FROM server_performance
GROUP BY server_id, region
ORDER BY region, server_id;
┌─server_id─┬─region─────┬─avg_response_ms─┐
│ 301 │ eu-central │ 145 │
│ 302 │ eu-central │ 155 │
│ 101 │ us-east │ 125 │
│ 102 │ us-east │ 115 │
│ 201 │ us-west │ 95 │
│ 202 │ us-west │ 105 │
└───────────┴────────────┴─────────────────┘
SELECT
region,
datacenter,
avgMerge(avg_response_time) AS avg_response_ms
FROM region_performance
GROUP BY region, datacenter
ORDER BY datacenter, region;
┌─region─────┬─datacenter─┬────avg_response_ms─┐
│ us-east │ dc1 │ 121.66666666666667 │
│ us-west │ dc1 │ 100 │
│ eu-central │ dc2 │ 150 │
└────────────┴────────────┴────────────────────┘
SELECT
datacenter,
avgMerge(avg_response_time) AS avg_response_ms
FROM datacenter_performance
GROUP BY datacenter
ORDER BY datacenter;
┌─datacenter─┬─avg_response_ms─┐
│ dc1 │ 113 │
│ dc2 │ 150 │
└────────────┴─────────────────┘
INSERT INTO raw_server_metrics (timestamp, server_id, region, datacenter, response_time_ms) VALUES
(now(), 101, 'us-east', 'dc1', 140),
(now(), 201, 'us-west', 'dc1', 85),
(now(), 301, 'eu-central', 'dc2', 135);
SELECT
datacenter,
avgMerge(avg_response_time) AS avg_response_ms
FROM datacenter_performance
GROUP BY datacenter
ORDER BY datacenter;
┌─datacenter─┬────avg_response_ms─┐
│ dc1 │ 112.85714285714286 │
│ dc2 │ 145 │
└────────────┴────────────────────┘