一、窗口函数说明
1.窗口函数的语句结构
函数名 ([expr]) over(子句)
函数名 ([expr]) over( partition by <要分列的组> order by <要排序的列> rows between <数据范围>)
其中over关键字后子句包含三个分析子句
分组子句partition by
排序子句order by
窗口子句rows
sum A over (partition by B order by C rows between D1 and D2)
avg A over(partition by B order by C rows between D1 and D2)
rows between 2 preceding and current row # 取当前行和前面两行
rows between unbounded preceding and current row # 包括本行和之前所有的行
rows between current row and unbounded following # 包括本行和之后所有的行
rows between 3 preceding and 1 following # 取前面三行和下面一行,包含当前行,总共五行
注意:
当order by后面缺少窗口从句条件,窗口规范默认是rows between unbounded preceding and current row.
当order by和窗口从句都缺失, 窗口规范默认是 rows between unbounded preceding and unbounded following
2.窗口函数的分类
专用窗口函数 |
聚合类窗口函数 |
rank() 并列排名会跳号 |
sum() |
dense_rank() 并列排名不会跳号 |
count() |
row_number() 生成行的编号 |
avg() |
|
max() |
|
min() |
二、案例分析(电商平台订单信息)
1.数据准备
表结构
列名 |
释义 |
user_name |
用户名 |
piece |
购买数量 |
price |
价格 |
pay_amount |
支付金额 |
goods_category |
商品品类 |
pay_time |
支付日期 |
建表语句
use lagou;
create table user_trade (
user_name varchar(20),
piece int,
price double,
pay_amount double,
goods_category varchar(20),
pay_time date
);
2.窗口函数之累计计算函数
需求1: 查询出2019年每月的支付总额和当年累积支付总额
-- step1 过滤出2019年数据
select * from user_trade where year(pay_time)=2019;
-- step2 在1的基础上,按照月份进行group by 分组,统计每个月份的支付总额
select month(pay_time),sum(pay_amount)
from user_trade
where year(pay_time) = 2019
group by month(pay_time);
-- step3 在2的基础上应用窗口函数实现需求
select a.month,a.pay_amount,
sum(a.pay_amount) over (order by a.month)
from(
select month(pay_time) month,sum(pay_amount) pay_amount
from user_trade
where year(pay_time) = 2019
group by month(pay_time)
) a;
需求2: 查询出2018-2019年每月的支付总额和当年累积支付总额
-- step1 过滤出2018-2019年数据
select * from user_trade where year(pay_time) in(2018,2019);
-- step2 在1的基础上,按照月份进行group by 分组,统计每个月份的支付总额
select year(pay_time),month(pay_time),sum(pay_amount)
from user_trade
where year(pay_time) in(2018,2019)
group by year(pay_time),month(pay_time);
-- step3 在2的基础上应用窗口函数实现需求
select a.year,a.month,a.pay_amount,
sum(a.pay_amount) over (partition by a.year order by a.month) #基于年份进行了分组
from(
select year(pay_time) year,month(pay_time) month,sum(pay_amount) pay_amount
from user_trade
where year(pay_time) in(2018,2019)
group by year(pay_time),month(pay_time)
) a;
需求3: 查询出2019年每月的近三个月的移动平均支付金额
#操作rows窗口范围
select a.month,a.pay_amount,
avg(a.pay_amount) over (order by a.month,rows between 2 preceding and current row) avg_pay_amount
from(
select month(pay_time) month,sum(pay_amount) pay_amount
from user_trade
where year(pay_time) = 2019
group by month(pay_time)
) a;
需求4: 查询出每4个月的最大月总支付金额
#操作rows窗口范围
select a.month,a.pay_amount,
max(a.pay_amount) over (order by a.month,rows between 3 preceding and current row) max_pay_amount
from(
select substring(pay_time,1,7) month,sum(pay_amount) pay_amount
from user_trade
group by substring(pay_time,1,7)
) a;
3.窗口函数之排序函数
需求1: 2020年1月,购买商品品类数的用户排名
select user_name,count(distinct(goods_category)) category_count,
dense_rank() over (order by count(distinct(goods_category))) order1
from user_trade
where substring(pay_time,1,7) = '2020-01'
group by user_name;
需求2: 2020年2月的支付用户,按照支付金额分为5组后的结果
select user_name,sum(pay_amount),
ntile(5) over (order by sum(pay_amount) desc) level
from user_trade
where substring(pay_time,1,7) = '2020-02'
group by user_name;
需求3: 查询出2020年支付金额排名前30%的所有用户
select a.user_name,a.pay_amount,a.level
from (
select user_name,sum(pay_amount),
ntile(10) over (order by sum(pay_amount) desc) level
from user_trade
where year(pay_time) = 2020
group by user_name
) a
where a.level in(1,2,3);
3.窗口函数之偏移分析函数
需求1:解释lag(),lead()用法
select user_name,pay_time,
lag(pay_time,1,pay_time) over (partition by user_name order by pay_time) lag1,
lag(pay_time) lag0,
lag(pay_time,1) lag2
from user_trade
where user_name in ('King','West');
注:lag(pay_time,1,pay_time)为向上偏移量,其中第一个pay_time为需要偏移的字段,1为偏移量,第二个pay_time为默认值,有默认值时,取不到偏移量,则显示为它本身;没有默认值时,取不到偏移量,则显示为null
lead()为向下偏移量,其中1为偏移量
需求2:查询出支付时间间隔超过100天的用户数
select count(distinct(user_name))
from(
select user_name,pay_time,
lead(pay_time,1) over (partition by user_name order by pay_time) lead_time,
from user_trade
where user_name in ('King','West')
) a
where datediff(a.lead_time,a.pay_time) > 100;
需求2:查询出每年支付时间间隔最长的用户
#step1,算出时间间隔
select a.years,a.user_name,
datediff(a.pay_time,a.lag_time) interval_days
from(
#把相邻订单的pay_time放置到一行,便于后期求两个订单之间的间隔(按年和用户名进行分组)
#相邻订单中的一个订单的pay_time移动了下来成为了新的一列(当前订单pay_time-lag_timeX)
select user_name,pay_time,year(pay_time) as years
lag(pay_time) over (partition by user_name,year(pay_time) order by pay_time asc) lag_time,
from user_trade
) a;
#step2,查询出每年支付时间间隔最长的用户
select b.years,b.user_name,b.interval_days
from(
select a.years,a.user_name,
datediff(a.pay_time,a.lag_time) interval_days,
rank() over(partition by a.years order by datediff(a.pay_time,a.lag_time) desc) rank
from(
#把相邻订单的pay_time放置到一行,便于后期求两个订单之间的间隔(按年和用户名进行分组)
#相邻订单中的一个订单的pay_time移动了下来成为了新的一列(当前订单pay_time-lag_timeX)
select user_name,pay_time,year(pay_time) as years
lag(pay_time) over (partition by user_name,year(pay_time) order by pay_time asc) lag_time,
from user_trade
) a
) b
where b.rank1=1;
附上源数据供大家练习
电商平台订单信息案例分析源数据