设置表
CREATE OR REPLACE TABLE atp_matches_1960s ORDER BY tourney_id AS
SELECT tourney_id, surface, winner_name, loser_name, winner_seed, loser_seed, score
FROM url('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/refs/heads/master/atp_matches_{1968..1969}.csv')
SETTINGS schema_inference_make_columns_nullable=0,
schema_inference_hints='winner_seed Nullable(String), loser_seed Nullable(UInt8)';
CREATE OR REPLACE TABLE atp_matches_1970s ORDER BY tourney_id AS
SELECT tourney_id, surface, winner_name, loser_name, winner_seed, loser_seed, splitByWhitespace(score) AS score
FROM url('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/refs/heads/master/atp_matches_{1970..1979}.csv')
SETTINGS schema_inference_make_columns_nullable=0,
schema_inference_hints='winner_seed Nullable(UInt8), loser_seed Nullable(UInt8)';
CREATE OR REPLACE TABLE atp_matches_1980s ORDER BY tourney_id AS
SELECT tourney_id, surface, winner_name, loser_name, winner_seed, loser_seed, splitByWhitespace(score) AS score
FROM url('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/refs/heads/master/atp_matches_{1980..1989}.csv')
SETTINGS schema_inference_make_columns_nullable=0,
schema_inference_hints='winner_seed Nullable(UInt16), loser_seed Nullable(UInt16)';
CREATE OR REPLACE TABLE atp_matches_1990s ORDER BY tourney_id AS
SELECT tourney_id, surface, winner_name, loser_name, winner_seed, loser_seed, splitByWhitespace(score) AS score,
toBool(arrayExists(x -> position(x, 'W/O') > 0, score))::Nullable(bool) AS walkover,
toBool(arrayExists(x -> position(x, 'RET') > 0, score))::Nullable(bool) AS retirement
FROM url('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/refs/heads/master/atp_matches_{1990..1999}.csv')
SETTINGS schema_inference_make_columns_nullable=0,
schema_inference_hints='winner_seed Nullable(UInt16), loser_seed Nullable(UInt16), surface Enum(\'Hard\', \'Grass\', \'Clay\', \'Carpet\')';
多个表的 schema
SELECT * EXCEPT(position) FROM (
SELECT position, name,
any(if(table = 'atp_matches_1960s', type, null)) AS 1960s,
any(if(table = 'atp_matches_1970s', type, null)) AS 1970s,
any(if(table = 'atp_matches_1980s', type, null)) AS 1980s,
any(if(table = 'atp_matches_1990s', type, null)) AS 1990s
FROM system.columns
WHERE database = currentDatabase() AND table LIKE 'atp_matches%'
GROUP BY ALL
ORDER BY position ASC
)
SETTINGS output_format_pretty_max_value_width=25;
┌─name────────┬─1960s────────────┬─1970s───────────┬─1980s────────────┬─1990s─────────────────────┐
│ tourney_id │ String │ String │ String │ String │
│ surface │ String │ String │ String │ Enum8('Hard' = 1, 'Grass'⋯│
│ winner_name │ String │ String │ String │ String │
│ loser_name │ String │ String │ String │ String │
│ winner_seed │ Nullable(String) │ Nullable(UInt8) │ Nullable(UInt16) │ Nullable(UInt16) │
│ loser_seed │ Nullable(UInt8) │ Nullable(UInt8) │ Nullable(UInt16) │ Nullable(UInt16) │
│ score │ String │ Array(String) │ Array(String) │ Array(String) │
│ walkover │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ Nullable(Bool) │
│ retirement │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ Nullable(Bool) │
└─────────────┴──────────────────┴─────────────────┴──────────────────┴───────────────────────────┘
- 1970s 将
winner_seed的类型从Nullable(String)改为Nullable(UInt8),并将score从String改为Array(String)。 - 1980s 将
winner_seed和loser_seed的类型从Nullable(UInt8)改为Nullable(UInt16)。 - 1990s 将
surface从String改为Enum('Hard', 'Grass', 'Clay', 'Carpet'),并新增了walkover和retirement列。
使用 merge 查询多个表
SELECT loser_name, score
FROM merge('atp_matches*')
WHERE winner_name = 'John McEnroe'
AND loser_seed = 1;
┌─loser_name────┬─score───────────────────────────┐
│ Bjorn Borg │ ['6-3','6-4'] │
│ Bjorn Borg │ ['7-6','6-1','6-7','5-7','6-4'] │
│ Bjorn Borg │ ['7-6','6-4'] │
│ Bjorn Borg │ ['4-6','7-6','7-6','6-4'] │
│ Jimmy Connors │ ['6-1','6-3'] │
│ Ivan Lendl │ ['6-2','4-6','6-3','6-7','7-6'] │
│ Ivan Lendl │ ['6-3','3-6','6-3','7-6'] │
│ Ivan Lendl │ ['6-1','6-3'] │
│ Stefan Edberg │ ['6-2','6-3'] │
│ Stefan Edberg │ ['7-6','6-2'] │
│ Stefan Edberg │ ['6-2','6-2'] │
│ Jakob Hlasek │ ['6-3','7-6'] │
└───────────────┴─────────────────────────────────┘
winner_seed 在不同的表中使用了不同的类型:
SELECT loser_name, score, winner_seed
FROM merge('atp_matches*')
WHERE winner_name = 'John McEnroe'
AND loser_seed = 1
AND multiIf(
variantType(winner_seed) = 'UInt8', variantElement(winner_seed, 'UInt8') >= 3,
variantType(winner_seed) = 'UInt16', variantElement(winner_seed, 'UInt16') >= 3,
variantElement(winner_seed, 'String')::UInt16 >= 3
);
variantType 函数检查每一行中 winner_seed 的类型,再使用 variantElement 提取其底层值。
当类型为 String 时,我们会先将其转换为数值,再进行比较。
运行该查询后的结果如下所示:
┌─loser_name────┬─score─────────┬─winner_seed─┐
│ Bjorn Borg │ ['6-3','6-4'] │ 3 │
│ Stefan Edberg │ ['6-2','6-3'] │ 6 │
│ Stefan Edberg │ ['7-6','6-2'] │ 4 │
│ Stefan Edberg │ ['6-2','6-2'] │ 7 │
└───────────────┴───────────────┴─────────────┘
使用 merge 时,这些行来自哪个表?
_table 虚拟列,如下查询所示:
SELECT _table, loser_name, score, winner_seed
FROM merge('atp_matches*')
WHERE winner_name = 'John McEnroe'
AND loser_seed = 1
AND multiIf(
variantType(winner_seed) = 'UInt8', variantElement(winner_seed, 'UInt8') >= 3,
variantType(winner_seed) = 'UInt16', variantElement(winner_seed, 'UInt16') >= 3,
variantElement(winner_seed, 'String')::UInt16 >= 3
);
┌─_table────────────┬─loser_name────┬─score─────────┬─winner_seed─┐
│ atp_matches_1970s │ Bjorn Borg │ ['6-3','6-4'] │ 3 │
│ atp_matches_1980s │ Stefan Edberg │ ['6-2','6-3'] │ 6 │
│ atp_matches_1980s │ Stefan Edberg │ ['7-6','6-2'] │ 4 │
│ atp_matches_1980s │ Stefan Edberg │ ['6-2','6-2'] │ 7 │
└───────────────────┴───────────────┴───────────────┴─────────────┘
walkover 列各个值的数量:
SELECT _table, walkover, count()
FROM merge('atp_matches*')
GROUP BY ALL
ORDER BY _table;
┌─_table────────────┬─walkover─┬─count()─┐
│ atp_matches_1960s │ ᴺᵁᴸᴸ │ 7542 │
│ atp_matches_1970s │ ᴺᵁᴸᴸ │ 39165 │
│ atp_matches_1980s │ ᴺᵁᴸᴸ │ 36233 │
│ atp_matches_1990s │ true │ 128 │
│ atp_matches_1990s │ false │ 37022 │
└───────────────────┴──────────┴─────────┘
atp_matches_1990s 外,其余所有数据的 walkover 列都为 NULL。
如果 walkover 列为 NULL,我们需要更新查询,检查 score 列是否包含字符串 W/O:
SELECT _table,
multiIf(
walkover IS NOT NULL,
walkover,
variantType(score) = 'Array(String)',
toBool(arrayExists(
x -> position(x, 'W/O') > 0,
variantElement(score, 'Array(String)')
)),
variantElement(score, 'String') LIKE '%W/O%'
),
count()
FROM merge('atp_matches*')
GROUP BY ALL
ORDER BY _table;
score 的底层类型是 Array(String),我们就必须遍历整个数组来查找 W/O;而如果它的类型是 String,则只需在字符串中搜索 W/O。
┌─_table────────────┬─multiIf(isNo⋯, '%W/O%'))─┬─count()─┐
│ atp_matches_1960s │ true │ 242 │
│ atp_matches_1960s │ false │ 7300 │
│ atp_matches_1970s │ true │ 422 │
│ atp_matches_1970s │ false │ 38743 │
│ atp_matches_1980s │ true │ 92 │
│ atp_matches_1980s │ false │ 36141 │
│ atp_matches_1990s │ true │ 128 │
│ atp_matches_1990s │ false │ 37022 │
└───────────────────┴──────────────────────────┴─────────┘