创建日活记录表
linux> vi user_dau.txt
1,zhansan,2023-02-10
2,lisi,2023-02-10
3,wangwu,2023-02-10
4,xiaohong,2023-02-10
5,aaa,2023-02-10
6,bbb,2023-02-10
7,ccc,2023-02-10
8,ddd,2023-02-10
9,eee,2023-02-10
hive>create table user_dau(
user_id string,
name string,
datestr string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
hive>load data local inpath '/root/tmp_data/user_dau.txt' into table user_dau;
创建历史记录表
linux>5,aaa,2023-01-26,2023-02-01
6,bbb,2023-01-27,2023-02-02
7,ccc,2023-01-27,2023-02-04
8,ddd,2023-02-07,2023-02-09
9,eee,2023-02-06,2023-02-09
10,lisi,2023-02-02,2023-02-08
11,zhangsan,2023-02-02,2023-02-09
12,wangwu,2023-02-04,2023-02-09
13,fff,2023-02-05,2023-02-09
14,cs,2023-02-03,2023-02-09
hive>create table user_hsu(
user_id string,
name string,
first_dt string,
last_dt string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
hive>load data local inpath '/root/tmp_data/user_hus.txt' into table user_hsu;
查看当日日活表,创建计算日活表
hive>select * from user_demo;
#创建日活表
hive>create table user_dau_rec(
user_id string,
name string
)
partitioned by (dt string)
stored as orc;
计算日活
hive>insert into user_dau_rec partition (dt='2023-02-10')
select
user_id,name
from user_dau
group by user_id,name;
#如果user_dau表有分区
hive>insert into user_dau_rec partition (dt='2023-02-10')
select
user_id,name
from user_dau where dt='2023-02-10'
group by user_id,name;
#计算
hive>select count(user_id) from user_dau_rec where dt='2023-02-10';
计算日新 当日表和历史表left join 筛出历史表是空的数据得出日新
hive>select
d.user_id,
d.name,
d.datestr
from user_dau d
left join user_hsu h
on (d.user_id=h.user_id)
where h.user_id is null;
查看当日与历史表对比
hive>select
dau.*,
hsu.*
from user_dau dau full join user_hsu hsu
on dau.user_id = hsu.user_id;
+--------------+-----------+--------------+--------------+-----------+---------------+--------------+--+
| dau.user_id | dau.name | dau.datestr | hsu.user_id | hsu.name | hsu.first_dt | hsu.last_dt |
+--------------+-----------+--------------+--------------+-----------+---------------+--------------+--+
| 1 | zhansan | 2023-02-10 | NULL | NULL | NULL | NULL |
| NULL | NULL | NULL | 10 | lisi | 2023-02-02 | 2023-02-08 |
| NULL | NULL | NULL | 11 | zhangsan | 2023-02-02 | 2023-02-09 |
| NULL | NULL | NULL | 12 | wangwu | 2023-02-04 | 2023-02-09 |
| NULL | NULL | NULL | 13 | fff | 2023-02-05 | 2023-02-09 |
| NULL | NULL | NULL | 14 | cs | 2023-02-03 | 2023-02-09 |
| 2 | lisi | 2023-02-10 | NULL | NULL | NULL | NULL |
| 3 | wangwu | 2023-02-10 | NULL | NULL | NULL | NULL |
| 4 | xiaohong | 2023-02-10 | NULL | NULL | NULL | NULL |
| 5 | aaa | 2023-02-10 | 5 | aaa | 2023-01-26 | 2023-02-01 |
| 6 | bbb | 2023-02-10 | 6 | bbb | 2023-01-27 | 2023-02-02 |
| 7 | ccc | 2023-02-10 | 7 | ccc | 2023-01-27 | 2023-02-04 |
| 8 | ddd | 2023-02-10 | 8 | ddd | 2023-02-07 | 2023-02-09 |
| 9 | eee | 2023-02-10 | 9 | eee | 2023-02-06 | 2023-02-09 |
+--------------+-----------+--------------+--------------+-----------+---------------+--------------+--+
14 rows selected (17.506 seconds)
计算10号数据合并更新历史登录记录表
hive>select
if(h.user_id is not null,h.user_id,d.user_id) as user_id,
if(h.user_id is not null,h.name,d.name) as name,
if(h.user_id is not null,h.first_dt,d.datestr) as first_dt,
if(d.user_id is not null,d.datestr,h.last_dt) as last_dt
from user_dau d
full join user_hsu h
on (d.user_id=h.user_id);
历史表不为空则值是h.USER_ID 否则是 d.USER_ID
如果要更新历史表在select前面加 insert into 表名 partition (dt='日期') 即可
计算用户留存(历史表计算(不含新增的2.10日))
hive>select
first_dt as dt,
datediff('2023-02-09',first_dt) as days,
count(if(last_dt=='2023-02-09',1,null)) as retail_counts
from user_hsu
group by first_dt;
+-------------+-------+----------------+--+
| dt | days | retail_counts |
+-------------+-------+----------------+--+
| 2023-01-26 | 14 | 0 |
| 2023-01-27 | 13 | 0 |
| 2023-02-02 | 7 | 1 |
| 2023-02-03 | 6 | 1 |
| 2023-02-04 | 5 | 1 |
| 2023-02-05 | 4 | 1 |
| 2023-02-06 | 3 | 1 |
| 2023-02-07 | 2 | 1 |
+-------------+-------+----------------+--+
8 rows selected (18.066 seconds)
没有增加条件,最多计算一个月的留存
hive>select
first_dt as dt,
count(1) as dau_counts,
datediff('2023-02-09',first_dt) as days,
count(if(last_dt='2023-02-09',1,null)) as retail_counts
from user_hsu
group by first_dt;
+-------------+-------------+-------+----------------+--+
| dt | dau_counts | days | retail_counts |
+-------------+-------------+-------+----------------+--+
| 2023-01-26 | 1 | 14 | 0 |
| 2023-01-27 | 2 | 13 | 0 |
| 2023-02-02 | 2 | 7 | 1 |
| 2023-02-03 | 1 | 6 | 1 |
| 2023-02-04 | 1 | 5 | 1 |
| 2023-02-05 | 1 | 4 | 1 |
| 2023-02-06 | 1 | 3 | 1 |
| 2023-02-07 | 1 | 2 | 1 |
+-------------+-------------+-------+----------------+--+
结果表述 最开始的数据 1.26日共1条,到2.9日依旧存留的共14天,共0人存留
表述 最开始的数据 2.7日共1条,到2.9日依旧存留的共2天,共1人存留