后台回复:1024,获取500G视频教程 推荐阅读 转行成为数据分析师 牛逼,OpenAI新模型 o1 国内直接连! 《SQL145题第2版》正式发布!
大家好,我是岳哥。
开窗函数(Window Functions)是SQL中强大的分析工具,允许我们在不改变结果集行数的情况下进行复杂的聚合和分析操作。本文将系统地介绍开窗函数的用法和实际应用场景。
一、基础语法
二、常用开窗函数详解
三、实际应用场景
四、性能优化建议
五、常见错误和解决方案
需要本文PDF版本的同学,可以在公众号后台回复:开窗函数
一、基础语法
-- 开窗函数基本语法
SELECT
column1,
column2,
window_function() OVER (
[PARTITION BY partition_expression]
[ORDER BY sort_expression [ASC|DESC]]
[frame_clause]
)
FROM table_name;
-- 基础示例:计算每个部门的员工工资排名
SELECT
employee_name,
department,
salary,
RANK() OVER (
PARTITION BY department
ORDER BY salary DESC
) as salary_rank
FROM employees;
-- 常用开窗函数列表:
-- 1. 排序函数
ROW_NUMBER() -- 返回唯一的序号
RANK() -- 返回排名(相同值排名相同,但会占用名次)
DENSE_RANK() -- 返回排名(相同值排名相同,不会占用名次)
NTILE(n) -- 将记录分为n个组
-- 2. 聚合函数
SUM() -- 求和
AVG() -- 平均值
COUNT() -- 计数
MAX() -- 最大值
MIN() -- 最小值
-- 3. 偏移函数
LAG() -- 返回前面的值
LEAD() -- 返回后面的值
FIRST_VALUE() -- 返回第一个值
LAST_VALUE() -- 返回最后一个值
-- 4. 其他函数
PERCENT_RANK() -- 返回百分比排名
CUME_DIST() -- 返回累积分布值
二、常用开窗函数详解
1. 排序函数
-- 创建示例表和数据
CREATE TABLE sales (
id INT PRIMARY KEY,
salesperson VARCHAR(50),
region VARCHAR(50),
amount DECIMAL(10,2),
sale_date DATE
);
INSERT INTO sales VALUES
(1, 'John', 'North', 10000, '2024-01-01'),
(2, 'Jane', 'North', 15000, '2024-01-01'),
(3, 'Bob', 'South', 12000, '2024-01-01'),
(4, 'Alice', 'South', 12000, '2024-01-01'),
(5, 'John', 'North', 8000, '2024-01-02');
-- 1. ROW_NUMBER() 示例
-- 为每个区域的销售额按照金额大小分配唯一序号
SELECT
salesperson,
region,
amount,
ROW_NUMBER() OVER (
PARTITION BY region
ORDER BY amount DESC
) as unique_rank
FROM sales;
-- 2. RANK() 示例
-- 为每个区域的销售额进行排名(相同值排名相同,会占用名次)
SELECT
salesperson,
region,
amount,
RANK() OVER (
PARTITION BY region
ORDER BY amount DESC
) as rank_with_gaps
FROM sales;
-- 3. DENSE_RANK() 示例
-- 为每个区域的销售额进行排名(相同值排名相同,不会占用名次)
SELECT
salesperson,
region,
amount,
DENSE_RANK() OVER (
PARTITION BY region
ORDER BY amount DESC
) as rank_without_gaps
FROM sales;
-- 4. NTILE(n) 示例
-- 将每个区域的销售额分为2组
SELECT
salesperson,
region,
amount,
NTILE(2) OVER (
PARTITION BY region
ORDER BY amount DESC
) as tile_number
FROM sales;
-- 5. 比较不同排序函数
SELECT
salesperson,
region,
amount,
ROW_NUMBER() OVER (ORDER BY amount DESC) as row_num,
RANK() OVER (ORDER BY amount DESC) as rank_num,
DENSE_RANK() OVER (ORDER BY amount DESC) as dense_rank_num
FROM sales;
2. 聚合函数
-- 1. SUM() 示例
-- 计算每个区域的累计销售额
SELECT
salesperson,
region,
amount,
sale_date,
SUM(amount) OVER (
PARTITION BY region
ORDER BY sale_date
) as running_total,
SUM(amount) OVER (PARTITION BY region) as region_total
FROM sales;
-- 2. AVG() 示例
-- 计算移动平均值
SELECT
salesperson,
region,
amount,
sale_date,
AVG(amount) OVER (
ORDER BY sale_date
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) as moving_average
FROM sales;
-- 3. COUNT() 示例
-- 计算累计销售笔数
SELECT
salesperson,
region,
amount,
sale_date,
COUNT(*) OVER (
PARTITION BY region
ORDER BY sale_date
) as running_count
FROM sales;
-- 4. MAX() & MIN() 示例
-- 查找每个区域的最高和最低销售额
SELECT
salesperson,
region,
amount,
MAX(amount) OVER (PARTITION BY region) as region_max,
MIN(amount) OVER (PARTITION BY region) as region_min,
amount - MIN(amount) OVER (PARTITION BY region) as amount_above_min
FROM sales;
-- 5. 组合使用示例
-- 计算每笔销售占区域总额的百分比
SELECT
salesperson,
region,
amount,
amount / SUM(amount) OVER (PARTITION BY region) * 100 as percentage_of_region,
amount / SUM(amount) OVER () * 100 as percentage_of_total
FROM sales;
-- 6. 滑动窗口示例
SELECT
salesperson,
region,
amount,
sale_date,
AVG(amount) OVER (
ORDER BY sale_date
ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
) as centered_average,
SUM(amount) OVER (
ORDER BY sale_date
RANGE BETWEEN INTERVAL '1' DAY PRECEDING AND CURRENT ROW
) as rolling_sum
FROM sales;
3. 偏移函数
-- 1. LAG() 示例
-- 比较当前销售额与上一次销售额
SELECT
salesperson,
region,
sale_date,
amount,
LAG(amount) OVER (
PARTITION BY salesperson
ORDER BY sale_date
) as previous_amount,
amount - LAG(amount) OVER (
PARTITION BY salesperson
ORDER BY sale_date
) as amount_change
FROM sales;
-- 2. LEAD() 示例
-- 比较当前销售额与下一次销售额
SELECT
salesperson,
region,
sale_date,
amount,
LEAD(amount) OVER (
PARTITION BY salesperson
ORDER BY sale_date
) as next_amount,
amount - LEAD(amount) OVER (
PARTITION BY salesperson
ORDER BY sale_date
) as amount_change
FROM sales;
-- 3. FIRST_VALUE() 示例
-- 比较每个区域的当前销售额与最高销售额
SELECT
salesperson,
region,
amount,
FIRST_VALUE(amount) OVER (
PARTITION BY region
ORDER BY amount DESC
) as highest_in_region,
amount - FIRST_VALUE(amount) OVER (
PARTITION BY region
ORDER BY amount DESC
) as diff_from_highest
FROM sales;
-- 4. LAST_VALUE() 示例
-- 比较当前销售额与区域最低销售额
SELECT
salesperson,
region,
amount,
LAST_VALUE(amount) OVER (
PARTITION BY region
ORDER BY amount DESC
RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) as lowest_in_region,
amount - LAST_VALUE(amount) OVER (
PARTITION BY region
ORDER BY amount DESC
RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) as diff_from_lowest
FROM sales;
-- 5. 多重偏移示例
SELECT
salesperson,
region,
sale_date,
amount,
LAG(amount, 1) OVER (ORDER BY sale_date) as prev_1,
LAG(amount, 2) OVER (ORDER BY sale_date) as prev_2,
LEAD(amount, 1) OVER (ORDER BY sale_date) as next_1,
LEAD(amount, 2) OVER (ORDER BY sale_date) as next_2
FROM sales;
-- 6. 带默认值的偏移
SELECT
salesperson,
region,
sale_date,
amount,
LAG(amount, 1, 0) OVER (
PARTITION BY region
ORDER BY sale_date
) as prev_amount_default_0,
LEAD(amount, 1, 0) OVER (
PARTITION BY region
ORDER BY sale_date
) as next_amount_default_0
FROM sales;
三、实际应用场景
1. 销售分析
-- 创建销售数据表
CREATE TABLE sales_data (
id INT PRIMARY KEY,
product_id INT,
category VARCHAR(50),
sale_date DATE,
amount DECIMAL(10,2),
quantity INT,
salesperson_id INT
);
-- 1. 计算产品销售趋势
SELECT
product_id,
sale_date,
amount,
-- 计算7天移动平均销售额
AVG(amount) OVER (
PARTITION BY product_id
ORDER BY sale_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) as seven_day_avg,
-- 计算同比增长
amount - LAG(amount, 365) OVER (
PARTITION BY product_id
ORDER BY sale_date
) as yoy_growth,
-- 计算环比增长
amount - LAG(amount) OVER (
PARTITION BY product_id
ORDER BY sale_date
) as mom_growth
FROM sales_data;
-- 2. 计算销售排名和占比
SELECT
category,
product_id,
SUM(amount) as total_sales,
-- 计算品类内排名
RANK() OVER (
PARTITION BY category
ORDER BY SUM(amount) DESC
) as category_rank,
-- 计算占品类销售额比例
SUM(amount) / SUM(SUM(amount)) OVER (
PARTITION BY category
) * 100 as category_percentage,
-- 计算占总销售额比例
SUM(amount) / SUM(SUM(amount)) OVER () * 100 as total_percentage
FROM sales_data
GROUP BY category, product_id;
-- 3. 销售目标达成分析
WITH monthly_sales AS (
SELECT
salesperson_id,
DATE_FORMAT(sale_date, '%Y-%m') as sale_month,
SUM(amount) as monthly_amount
FROM sales_data
GROUP BY salesperson_id, DATE_FORMAT(sale_date, '%Y-%m')
)
SELECT
salesperson_id,
sale_month,
monthly_amount,
-- 计算累计销售额
SUM(monthly_amount) OVER (
PARTITION BY salesperson_id
ORDER BY sale_month
) as ytd_amount,
-- 计算同比增长
monthly_amount - LAG(monthly_amount, 12) OVER (
PARTITION BY salesperson_id
ORDER BY sale_month
) as yoy_growth,
-- 计算3个月移动平均
AVG(monthly_amount) OVER (
PARTITION BY salesperson_id
ORDER BY sale_month
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) as three_month_avg
FROM monthly_sales;
-- 4. 销售异常检测
WITH daily_stats AS (
SELECT
sale_date,
SUM(amount) as daily_amount,
AVG(SUM(amount)) OVER (
ORDER BY sale_date
ROWS BETWEEN 30 PRECEDING AND 1 PRECEDING
) as avg_amount,
STDDEV(SUM(amount)) OVER (
ORDER BY sale_date
ROWS BETWEEN 30 PRECEDING AND 1 PRECEDING
) as stddev_amount
FROM sales_data
GROUP BY sale_date
)
SELECT
sale_date,
daily_amount,
avg_amount,
-- 检测异常值(超过2个标准差)
CASE
WHEN ABS(daily_amount - avg_amount) > 2 * stddev_amount
THEN 'Anomaly'
ELSE 'Normal'
END as status,
-- 计算Z分数
(daily_amount - avg_amount) / stddev_amount as z_score
FROM daily_stats;
2. 用户行为分析
-- 创建用户行为表
CREATE TABLE user_events (
id INT PRIMARY KEY,
user_id INT,
event_type VARCHAR(50),
event_time TIMESTAMP,
page_id VARCHAR(50),
session_id VARCHAR(50)
);
-- 1. 用户会话分析
WITH session_analysis AS (
SELECT
session_id,
user_id,
event_time,
event_type,
-- 计算会话持续时间
LEAD(event_time) OVER (
PARTITION BY session_id
ORDER BY event_time
) as next_event_time,
-- 计算上一个事件
LAG(event_type) OVER (
PARTITION BY session_id
ORDER BY event_time
) as previous_event
FROM user_events
)
SELECT
session_id,
COUNT(*) as events_count,
-- 计算会话时长(秒)
TIMESTAMPDIFF(
SECOND,
MIN(event_time),
MAX(event_time)
) as session_duration,
-- 计算平均事件间隔
AVG(TIMESTAMPDIFF(
SECOND,
event_time,
next_event_time
)) as avg_time_between_events
FROM session_analysis
GROUP BY session_id;
-- 2. 用户路径分析
WITH path_analysis AS (
SELECT
user_id,
session_id,
event_type,
event_time,
-- 构建用户路径
STRING_AGG(
event_type,
' -> '
) OVER (
PARTITION BY session_id
ORDER BY event_time
) as user_path,
-- 计算路径步骤
ROW_NUMBER() OVER (
PARTITION BY session_id
ORDER BY event_time
) as step_number
FROM user_events
)
SELECT
user_path,
COUNT(*) as path_count
FROM path_analysis
WHERE step_number = (
-- 选择完整路径
SELECT MAX(step_number)
FROM path_analysis p2
WHERE p2.session_id = path_analysis.session_id
)
GROUP BY user_path
ORDER BY path_count DESC;
-- 3. 用户粘性分析
WITH user_activity AS (
SELECT
user_id,
DATE(event_time) as activity_date,
COUNT(DISTINCT session_id) as daily_sessions,
COUNT(*) as daily_events
FROM user_events
GROUP BY user_id, DATE(event_time)
)
SELECT
user_id,
activity_date,
daily_sessions,
daily_events,
-- 计算连续活跃天数
COUNT(*) OVER (
PARTITION BY user_id
ORDER BY activity_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) as rolling_7day_active,
-- 计算7天平均会话数
AVG(daily_sessions) OVER (
PARTITION BY user_id
ORDER BY activity_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) as avg_7day_sessions
FROM user_activity;
-- 4. 用户分群分析
WITH user_metrics AS (
SELECT
user_id,
COUNT(DISTINCT DATE(event_time)) as active_days,
COUNT(DISTINCT session_id) as total_sessions,
COUNT(*) as total_events
FROM user_events
GROUP BY user_id
)
SELECT
user_id,
active_days,
total_sessions,
total_events,
-- 基于活跃度分组
NTILE(4) OVER (
ORDER BY active_days DESC
) as activity_quartile,
-- 计算用户分位数
PERCENT_RANK() OVER (
ORDER BY total_events
) as event_percentile
FROM user_metrics;
3. 财务分析
-- 创建财务数据表
CREATE TABLE financial_data (
id INT PRIMARY KEY,
account_id VARCHAR(50),
transaction_date DATE,
amount DECIMAL(10,2),
transaction_type VARCHAR(50),
category VARCHAR(50)
);
-- 1. 现金流分析
WITH cash_flow AS (
SELECT
account_id,
transaction_date,
SUM(CASE
WHEN transaction_type = 'income' THEN amount
ELSE -amount
END) as net_amount
FROM financial_data
GROUP BY account_id, transaction_date
)
SELECT
account_id,
transaction_date,
net_amount,
-- 计算累计现金流
SUM(net_amount) OVER (
PARTITION BY account_id
ORDER BY transaction_date
) as running_balance,
-- 计算30天移动平均
AVG(net_amount) OVER (
PARTITION BY account_id
ORDER BY transaction_date
ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
) as moving_avg_30d,
-- 计算最大回撤
MIN(SUM(net_amount) OVER (
PARTITION BY account_id
ORDER BY transaction_date
)) OVER (
PARTITION BY account_id
ORDER BY transaction_date
) as max_drawdown
FROM cash_flow;
-- 2. 支出分析
SELECT
category,
EXTRACT(YEAR_MONTH FROM transaction_date) as month,
SUM(amount) as monthly_spend,
-- 计算环比增长
(SUM(amount) - LAG(SUM(amount)) OVER (
PARTITION BY category
ORDER BY EXTRACT(YEAR_MONTH FROM transaction_date)
)) / LAG(SUM(amount)) OVER (
PARTITION BY category
ORDER BY EXTRACT(YEAR_MONTH FROM transaction_date)
) * 100 as mom_growth,
-- 计算占比
SUM(amount) / SUM(SUM(amount)) OVER (
PARTITION BY EXTRACT(YEAR_MONTH FROM transaction_date)
) * 100 as category_percentage,
-- 计算同环比
SUM(amount) / LAG(SUM(amount), 12) OVER (
PARTITION BY category
ORDER BY EXTRACT(YEAR_MONTH FROM transaction_date)
) * 100 - 100 as yoy_growth
FROM financial_data
WHERE transaction_type = 'expense'
GROUP BY category, EXTRACT(YEAR_MONTH FROM transaction_date);
-- 3. 预算分析
WITH monthly_budget AS (
SELECT
category,
EXTRACT(YEAR_MONTH FROM transaction_date) as month,
SUM(amount) as actual_spend,
5000 as budget_amount -- 示例预算金额
FROM financial_data
WHERE transaction_type = 'expense'
GROUP BY category, EXTRACT(YEAR_MONTH FROM transaction_date)
)
SELECT
category,
month,
actual_spend,
budget_amount,
-- 计算预算使用率
actual_spend / budget_amount * 100 as budget_usage_percentage,
-- 计算预算差异
budget_amount - actual_spend as budget_variance,
-- 计算预算状态
CASE
WHEN actual_spend > budget_amount THEN 'Over Budget'
WHEN actual_spend > budget_amount * 0.9 THEN 'Near Budget'
ELSE 'Under Budget'
END as budget_status
FROM monthly_budget;
-- 4. 异常交易检测
WITH transaction_stats AS (
SELECT
account_id,
transaction_date,
amount,
-- 计算历史平均值和标准差
AVG(amount) OVER (
PARTITION BY account_id, transaction_type
ORDER BY transaction_date
ROWS BETWEEN 90 PRECEDING AND 1 PRECEDING
) as avg_amount,
STDDEV(amount) OVER (
PARTITION BY account_id, transaction_type
ORDER BY transaction_date
ROWS BETWEEN 90 PRECEDING AND 1 PRECEDING
) as stddev_amount
FROM financial_data
)
SELECT
account_id,
transaction_date,
amount,
avg_amount,
-- 计算Z分数
(amount - avg_amount) / NULLIF(stddev_amount, 0) as z_score,
-- 标记异常交易
CASE
WHEN ABS((amount - avg_amount) / NULLIF(stddev_amount, 0)) > 3
THEN 'Anomaly'
ELSE 'Normal'
END as transaction_status
FROM transaction_stats
WHERE stddev_amount IS NOT NULL;
四、性能优化建议
1. 索引优化
-- 1. 为分区列创建索引
CREATE INDEX idx_sales_region_date ON sales(region, sale_date);
-- 2. 为排序列创建索引
CREATE INDEX idx_sales_amount ON sales(amount DESC);
-- 3. 复合索引优化
CREATE INDEX idx_sales_region_amount_date ON sales(region, amount DESC, sale_date);
-- 执行计划分析示例
EXPLAIN ANALYZE
SELECT
salesperson,
region,
amount,
RANK() OVER (
PARTITION BY region
ORDER BY amount DESC
) as amount_rank
FROM sales;
-- 优化前的查询
SELECT
s.*,
COUNT(*) OVER (PARTITION BY region) as region_count,
SUM(amount) OVER (PARTITION BY region) as region_total,
amount / SUM(amount) OVER (PARTITION BY region) as percentage
FROM sales s;
-- 优化后的查询(使用子查询预聚合)
WITH region_stats AS (
SELECT
region,
COUNT(*) as region_count,
SUM(amount) as region_total
FROM sales
GROUP BY region
)
SELECT
s.*,
rs.region_count,
rs.region_total,
s.amount / rs.region_total as percentage
FROM sales s
JOIN region_stats rs ON s.region = rs.region;
-- 分批处理大数据集
WITH RECURSIVE date_series AS (
SELECT MIN(sale_date) as date_point
FROM sales
UNION ALL
SELECT date_point + INTERVAL 1 MONTH
FROM date_series
WHERE date_point < (SELECT MAX(sale_date) FROM sales)
),
monthly_windows AS (
SELECT
s.*,
SUM(amount) OVER (
PARTITION BY region
ORDER BY sale_date
RANGE BETWEEN INTERVAL '1' MONTH PRECEDING
AND CURRENT ROW
) as monthly_sum
FROM sales s
JOIN date_series d
ON s.sale_date >= d.date_point
AND s.sale_date < d.date_point + INTERVAL 1 MONTH
)
SELECT *
FROM monthly_windows;
2. 最佳实践
-- 1. 使用适当的窗口范围
-- 不好的实践:使用无限制的窗口
SELECT
sale_date,
amount,
AVG(amount) OVER (
ORDER BY sale_date
RANGE BETWEEN UNBOUNDED PRECEDING
AND UNBOUNDED FOLLOWING
) as overall_avg
FROM sales;
-- 好的实践:限制窗口范围
SELECT
sale_date,
amount,
AVG(amount) OVER (
ORDER BY sale_date
ROWS BETWEEN 30 PRECEDING
AND CURRENT ROW
) as moving_avg_30d
FROM sales;
-- 2. 合理使用分区
-- 不好的实践:过度分区
SELECT
sale_date,
region,
salesperson,
product_id,
amount,
AVG(amount) OVER (
PARTITION BY region, salesperson, product_id
ORDER BY sale_date
) as avg_amount
FROM sales;
-- 好的实践:适当分区
SELECT
sale_date,
region,
amount,
AVG(amount) OVER (
PARTITION BY region
ORDER BY sale_date
) as region_avg_amount
FROM sales;
-- 3. 使用CTE优化复杂查询
-- 不好的实践:嵌套窗口函数
SELECT
sale_date,
amount,
amount / SUM(amount) OVER () as total_percentage,
amount / SUM(amount) OVER (PARTITION BY region) as region_percentage,
RANK() OVER (ORDER BY amount DESC) as overall_rank,
RANK() OVER (PARTITION BY region ORDER BY amount DESC) as region_rank
FROM sales;
-- 好的实践:使用CTE分解复杂查询
WITH sales_metrics AS (
SELECT
sale_date,
region,
amount,
SUM(amount) OVER () as total_amount,
SUM(amount) OVER (PARTITION BY region) as region_amount
FROM sales
)
SELECT
sale_date,
amount,
amount / total_amount as total_percentage,
amount / region_amount as region_percentage,
RANK() OVER (ORDER BY amount DESC) as overall_rank,
RANK() OVER (PARTITION BY region ORDER BY amount DESC) as region_rank
FROM sales_metrics;
-- 4. 适当使用FILTER子句(PostgreSQL支持)
-- 不好的实践:使用CASE WHEN
SELECT
product_id,
SUM(CASE WHEN transaction_type = 'sale' THEN amount ELSE 0 END) OVER w as sales,
SUM(CASE WHEN transaction_type = 'refund' THEN amount ELSE 0 END) OVER w as refunds
FROM transactions
WINDOW w AS (PARTITION BY product_id ORDER BY transaction_date);
-- 好的实践:使用FILTER
SELECT
product_id,
SUM(amount) FILTER (WHERE transaction_type = 'sale') OVER w as sales,
SUM(amount) FILTER (WHERE transaction_type = 'refund') OVER w as refunds
FROM transactions
WINDOW w AS (PARTITION BY product_id ORDER BY transaction_date);
-- 5. 适当处理NULL值
-- 不好的实践:忽略NULL处理
SELECT
sale_date,
amount,
LAG(amount) OVER (ORDER BY sale_date) as prev_amount
FROM sales;
-- 好的实践:合理处理NULL
SELECT
sale_date,
amount,
COALESCE(
LAG(amount) OVER (ORDER BY sale_date),
AVG(amount) OVER ()
) as prev_amount
FROM sales;
五、常见错误和解决方案
1. 错误示例
-- 1. 错误:在WHERE子句中使用窗口函数
-- 不正确的查询
SELECT
sale_date,
amount,
ROW_NUMBER() OVER (ORDER BY amount DESC) as rn
FROM sales
WHERE ROW_NUMBER() OVER (ORDER BY amount DESC) <= 10;
-- 正确的解决方案
WITH ranked_sales AS (
SELECT
sale_date,
amount,
ROW_NUMBER() OVER (ORDER BY amount DESC) as rn
FROM sales
)
SELECT * FROM ranked_sales WHERE rn <= 10;
-- 2. 错误:在聚合函数中使用窗口函数
-- 不正确的查询
SELECT
region,
SUM(
ROW_NUMBER() OVER (ORDER BY amount)
) as total
FROM sales
GROUP BY region;
-- 正确的解决方案
WITH numbered_sales AS (
SELECT
region,
ROW_NUMBER() OVER (ORDER BY amount) as row_num
FROM sales
)
SELECT
region,
SUM(row_num) as total
FROM numbered_sales
GROUP BY region;
-- 3. 错误:窗口函数的嵌套使用
-- 不正确的查询
SELECT
sale_date,
amount,
ROW_NUMBER() OVER (
ORDER BY
RANK() OVER (ORDER BY amount DESC)
) as nested_rank
FROM sales;
-- 正确的解决方案
WITH ranked_sales AS (
SELECT
sale_date,
amount,
RANK() OVER (ORDER BY amount DESC) as amount_rank
FROM sales
)
SELECT
sale_date,
amount,
ROW_NUMBER() OVER (ORDER BY amount_rank) as nested_rank
FROM ranked_sales;
-- 4. 错误:混淆ROWS和RANGE子句
-- 可能导致问题的查询
SELECT
sale_date,
amount,
AVG(amount) OVER (
ORDER BY sale_date
RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
) as future_avg
FROM sales;
-- 更精确的查询
SELECT
sale_date,
amount,
AVG(amount) OVER (
ORDER BY sale_date
ROWS BETWEEN CURRENT ROW AND 30 FOLLOWING
) as next_30_days_avg
FROM sales;
-- 5. 错误:未正确处理重复值
-- 可能导致问题的查询
SELECT
sale_date,
amount,
FIRST_VALUE(amount) OVER (
ORDER BY sale_date
) as first_amount
FROM sales;
-- 更健壮的查询
SELECT
sale_date,
amount,
FIRST_VALUE(amount) OVER (
ORDER BY sale_date, id -- 添加额外的排序键确保确定性
) as first_amount
FROM sales;
2. 调试技巧
-- 1. 使用EXPLAIN ANALYZE查看执行计划
EXPLAIN ANALYZE
SELECT
sale_date,
amount,
SUM(amount) OVER (
PARTITION BY region
ORDER BY sale_date
) as running_total
FROM sales;
-- 2. 分步调试复杂查询
-- 步骤1:检查基础数据
SELECT
sale_date,
amount,
region
FROM sales
ORDER BY sale_date
LIMIT 10;
-- 步骤2:添加单个窗口函数
SELECT
sale_date,
amount,
region,
SUM(amount) OVER (PARTITION BY region) as region_total
FROM sales
ORDER BY sale_date
LIMIT 10;
-- 步骤3:添加更复杂的窗口计算
SELECT
sale_date,
amount,
region,
SUM(amount) OVER (PARTITION BY region) as region_total,
amount / SUM(amount) OVER (PARTITION BY region) as percentage
FROM sales
ORDER BY sale_date
LIMIT 10;
-- 3. 验证窗口范围
-- 检查前后N行数据
SELECT
sale_date,
amount,
LAG(amount, 2) OVER w as prev_2,
LAG(amount, 1) OVER w as prev_1,
LEAD(amount, 1) OVER w as next_1,
LEAD(amount, 2) OVER w as next_2
FROM sales
WINDOW w AS (ORDER BY sale_date)
LIMIT 10;
-- 4. 使用临时表存储中间结果
CREATE TEMPORARY TABLE tmp_window_results AS
SELECT
sale_date,
amount,
region,
ROW_NUMBER() OVER (PARTITION BY region ORDER BY amount DESC) as rank_num
FROM sales;
-- 检查临时结果
SELECT * FROM tmp_window_results WHERE rank_num <= 5;
-- 5. 添加调试列
SELECT
sale_date,
amount,
region,
-- 调试信息
COUNT(*) OVER (PARTITION BY region) as partition_size,
ROW_NUMBER() OVER (PARTITION BY region ORDER BY sale_date) as row_in_partition,
FIRST_VALUE(sale_date) OVER (PARTITION BY region ORDER BY sale_date) as partition_start,
LAST_VALUE(sale_date) OVER (
PARTITION BY region
ORDER BY sale_date
RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) as partition_end
FROM sales;
-- 6. 性能监控
CREATE TABLE window_function_log (
query_id INT AUTO_INCREMENT PRIMARY KEY,
query_text TEXT,
execution_time DECIMAL(10,2),
rows_processed INT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
DELIMITER $$
CREATE PROCEDURE debug_window_function(IN p_query TEXT)
BEGIN
DECLARE v_start_time TIMESTAMP;
DECLARE v_end_time TIMESTAMP;
DECLARE v_rows INT;
SET v_start_time = CURRENT_TIMESTAMP;
-- 执行查询
SET @sql = p_query;
PREPARE stmt FROM @sql;
EXECUTE stmt;
SET v_end_time = CURRENT_TIMESTAMP;
-- 记录执行情况
INSERT INTO window_function_log (
query_text,
execution_time,
rows_processed,
created_at
)
SELECT
p_query,
TIMESTAMPDIFF(MICROSECOND, v_start_time, v_end_time) / 1000000,
ROW_COUNT(),
v_start_time;
DEALLOCATE PREPARE stmt;
END $$
DELIMITER ;