1. WINDOW子句
1.1 数据准备
CREATE EXTERNAL TABLE t1(
cookieid string,
createtime string, --day
pv INT
) ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',' ;
select * from t1;
cookieid createtime pv
-------------------------
cookie1 2015-04-10 1
cookie1 2015-04-11 5
cookie1 2015-04-12 7
cookie1 2015-04-13 3
cookie1 2015-04-14 2
cookie1 2015-04-15 4
cookie1 2015-04-16 4
1.2 window 子句简单解释
如果不指定 ROWS BETWEEN , 默认为从起点到当前行
如果不指定 ORDER BY , 则将分组内所有值累加
关键是理解 ROWS BETWEEN 含义, 也叫做 WINDOW子句;
关键字 | 说明 |
---|---|
preceding | 往前 |
following | 往后 |
current row | 当前行 |
unbounded | 起点 |
unbounded preceding | 表示从前面的起点 |
unbounded following | 表示到后面的终点 |
1.3 语法示例
avg,min,max,sum的用法相同
1.3.1 默认为从起点到当前行
SELECT
cookieid,
createtime,pv,
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime) AS sum1,
MIN(pv) OVER(PARTITION BY cookieid ORDER BY createtime) AS min1,
MAX(pv) OVER(PARTITION BY cookieid ORDER BY createtime) AS max1,
AVG(pv) OVER(PARTITION BY cookieid ORDER BY createtime) AS avg1
FROM t1;
//查询结果
cookieid createtime pv sum2 min2 max2 avg2
--------------------------------------------------
cookie1 2015-04-10 1 1 1 1 1.0
cookie1 2015-04-11 5 6 1 5 3.0
cookie1 2015-04-12 7 13 1 7 4.333
cookie1 2015-04-13 3 16 1 7 4.0
cookie1 2015-04-14 2 18 1 7 3.6
cookie1 2015-04-15 4 22 1 7 3.666
cookie1 2015-04-16 4 26 1 7 3.714
计算范围
pv1: 分组内从起点到当前行
如
11号= 10号+11号
12号= 10号+11号+12号
1.3.2 从起点到当前行
SELECT
cookieid,
createtime,pv,
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS sum2,
MIN(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS min2,
MAX(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS max2,
AVG(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS avg2
FROM t1;
//查询结果
cookieid createtime pv sum2 min2 max2 avg2
-------------------------------------------------
cookie1 2015-04-10 1 1 1 1 1.0
cookie1 2015-04-11 5 6 1 5 3.0
cookie1 2015-04-12 7 13 1 7 4.333
cookie1 2015-04-13 3 16 1 7 4.0
cookie1 2015-04-14 2 18 1 7 3.6
cookie1 2015-04-15 4 22 1 7 3.666
cookie1 2015-04-16 4 26 1 7 3.714
计算范围
pv2: 分组内从起点到当前行(计算逻辑如pv1)
如
11号= 10号+11号
12号= 10号+11号+12号
1.3.3 分组内所有行
SELECT
cookieid,
createtime,pv,
SUM(pv) OVER(PARTITION BY cookieid) AS sum3,
MIN(pv) OVER(PARTITION BY cookieid) AS min3,
MAX(pv) OVER(PARTITION BY cookieid) AS max3,
AVG(pv) OVER(PARTITION BY cookieid) AS avg3
FROM t1;
//查询结果
cookieid createtime pv sum3 min3 max3 avg3
--------------------------------------------------------------
cookie1 2015-04-10 1 26 1 7 3.7142857142857144
cookie1 2015-04-11 5 26 1 7 3.7142857142857144
cookie1 2015-04-12 7 26 1 7 3.7142857142857144
cookie1 2015-04-13 3 26 1 7 3.7142857142857144
cookie1 2015-04-14 2 26 1 7 3.7142857142857144
cookie1 2015-04-15 4 26 1 7 3.7142857142857144
cookie1 2015-04-16 4 26 1 7 3.7142857142857144
计算范围
pv3: 分组内(cookie1)所有行
1.3.4 当前行+往前3行
SELECT
cookieid,
createtime,pv,
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS sum4,
MIN(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS min4,
MAX(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS max4,
AVG(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS avg4
FROM t1;
//查询结果
cookieid createtime pv sum4 min4 max4 avg4
-------------------------------------------------
cookie1 2015-04-10 1 1 1 1 1.0
cookie1 2015-04-11 5 6 1 5 3.0
cookie1 2015-04-12 7 13 1 7 4.333
cookie1 2015-04-13 3 16 1 7 4.0
cookie1 2015-04-14 2 17 2 7 4.25
cookie1 2015-04-15 4 16 2 7 4.0
cookie1 2015-04-16 4 13 2 4 3.25
pv4: 分组内当前行+往前3行,
如
11号=10号+11号
12号=10号+11号+12号
13号=10号+11号+12号+13号
14号=11号+12号+13号+14号
1.3.5 当前行+往前3行+往后1行
SELECT
cookieid,
createtime,pv,
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS sum5,
MIN(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS min5,
MAX(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS max5,
AVG(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS avg5
FROM t1;
//查询结果
cookieid createtime pv sum5 min5 max5 avg5
-------------------------------------------------------------
cookie1 2015-04-10 1 6 1 5 3.0
cookie1 2015-04-11 5 13 1 7 4.333333333333333
cookie1 2015-04-12 7 16 1 7 4.0
cookie1 2015-04-13 3 18 1 7 3.6
cookie1 2015-04-14 2 21 2 7 4.2
cookie1 2015-04-15 4 20 2 7 4.0
cookie1 2015-04-16 4 13 2 4 3.25
计算范围
pv5: 分组内当前行+往前3行+往后1行,
如
sum14号=11号+12号+13号+14号+15号
=5+7+3+2+4=21
1.3.6 当前行+往后所有行
SELECT
cookieid,
createtime,pv,
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS sum6,
MIN(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS min6,
MAX(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS max6,
AVG(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS avg6
FROM t1;
//查询结果
cookieid createtime pv sum6 min6 max6 avg6
---------------------------------------------------------------
cookie1 2015-04-10 1 26 1 7 3.7142857142857144
cookie1 2015-04-11 5 25 2 7 4.166666666666667
cookie1 2015-04-12 7 20 2 7 4.0
cookie1 2015-04-13 3 13 2 4 3.25
cookie1 2015-04-14 2 10 2 4 3.3333333333333335
cookie1 2015-04-15 4 8 4 4 4.0
cookie1 2015-04-16 4 4 4 4 4.0
计算范围
pv6: 分组内当前行+往后所有行,
如
sum13号=13号+14号+15号+16号 =3+2+4+4 =13,
sum14号=14号+15号+16号 =2+4+4 =10
2. 序列函数
2.1 准备数据
向上面创建的t1表中添加如下数据
cookie2,2015-04-10,2
cookie2,2015-04-11,3
cookie2,2015-04-12,5
cookie2,2015-04-13,6
cookie2,2015-04-14,3
cookie2,2015-04-15,9
cookie2,2015-04-16,7
select * from t1;
cookieid createtime pv
-------------------------
cookie1 2015-04-10 1
cookie1 2015-04-11 5
cookie1 2015-04-12 7
cookie1 2015-04-13 3
cookie1 2015-04-14 2
cookie1 2015-04-15 4
cookie1 2015-04-16 4
cookie2 2015-04-10 2
cookie2 2015-04-11 3
cookie2 2015-04-12 5
cookie2 2015-04-13 6
cookie2 2015-04-14 3
cookie2 2015-04-15 9
cookie2 2015-04-16 7
2.2 ntile
ntile(n) 用于将分组数据按照顺序切分成n片, 返回当前切片值
ntile 不支持 rows between , 比如 ntile(2) over (partition by cookieid order by createtime rows between 3 preceding and current row)
如果切片不均匀, 默认增加第一个切片的分布
SELECT
cookieid,
createtime,
pv,
NTILE(2) OVER(PARTITION BY cookieid ORDER BY createtime) AS rn1, --分组内将数据分成2片
NTILE(3) OVER(PARTITION BY cookieid ORDER BY createtime) AS rn2, --分组内将数据分成3片
NTILE(4) OVER(ORDER BY createtime) AS rn3 --将所有数据分成4片
FROM t1
ORDER BY cookieid,createtime;
cookieid createtime pv rn1 rn2 rn3
----------------------------------------
cookie1 2015-04-10 1 1 1 1
cookie1 2015-04-11 5 1 1 1
cookie1 2015-04-12 7 1 1 2
cookie1 2015-04-13 3 1 2 2
cookie1 2015-04-14 2 2 2 3
cookie1 2015-04-15 4 2 3 3
cookie1 2015-04-16 4 2 3 4
cookie2 2015-04-10 2 1 1 1
cookie2 2015-04-11 3 1 1 1
cookie2 2015-04-12 5 1 1 2
cookie2 2015-04-13 6 1 2 2
cookie2 2015-04-14 3 2 2 3
cookie2 2015-04-15 9 2 3 4
cookie2 2015-04-16 7 2 3 4
--比如,统计一个cookie,pv数最多的前1/3的天
SELECT
cookieid,
createtime,
pv,
NTILE(3) OVER(PARTITION BY cookieid ORDER BY pv DESC) AS rn
FROM t1;
cookieid createtime pv rn
-----------------------------
cookie1 2015-04-12 7 1
cookie1 2015-04-11 5 1
cookie1 2015-04-15 4 1
cookie1 2015-04-16 4 2
cookie1 2015-04-13 3 2
cookie1 2015-04-14 2 3
cookie1 2015-04-10 1 3
cookie2 2015-04-15 9 1
cookie2 2015-04-16 7 1
cookie2 2015-04-13 6 1
cookie2 2015-04-12 5 2
cookie2 2015-04-11 3 2
cookie2 2015-04-14 3 3
cookie2 2015-04-10 2 3
--rn = 1 的记录,就是我们想要的结果
2.3 row_number
row_number() 从 1 开始, 按照顺序, 生成分组内记录的序列
比如, 按照pv 降序排列, 生成分组内每天的pv名次
row_number() 的应用场景非常多, 再比如, 获取分组内排序第一的记录, 获取一个session中的第一条refer等
--按照分组内pv 排序
SELECT
cookieid,
createtime,
pv,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY pv desc) AS rn
FROM t1;
cookieid createtime pv rn
-----------------------------
cookie1 2015-04-12 7 1
cookie1 2015-04-11 5 2
cookie1 2015-04-15 4 3
cookie1 2015-04-16 4 4
cookie1 2015-04-13 3 5
cookie1 2015-04-14 2 6
cookie1 2015-04-10 1 7
cookie2 2015-04-15 9 1
cookie2 2015-04-16 7 2
cookie2 2015-04-13 6 3
cookie2 2015-04-12 5 4
cookie2 2015-04-11 3 5
cookie2 2015-04-14 3 6
cookie2 2015-04-10 2 7
--所有数据按照pv排序
SELECT
cookieid,
createtime,
pv,
ROW_NUMBER() OVER( ORDER BY pv desc) AS rn
FROM t1;
cookieid createtime pv rn
-----------------------------
cookie2 2015-04-15 9 1
cookie1 2015-04-12 7 2
cookie2 2015-04-16 7 3
cookie2 2015-04-13 6 4
cookie1 2015-04-11 5 5
cookie2 2015-04-12 5 6
cookie1 2015-04-15 4 7
cookie1 2015-04-16 4 8
cookie1 2015-04-13 3 9
cookie2 2015-04-11 3 10
cookie2 2015-04-14 3 11
cookie1 2015-04-14 2 12
cookie2 2015-04-10 2 13
cookie1 2015-04-10 1 14
2.4 rank
rank() 生成数据项在分组中的排名, 若排名有相同的情况会在名次中留下空位
SELECT
cookieid,
createtime,
pv,
RANK() OVER(PARTITION BY cookieid ORDER BY pv desc) AS rn1
FROM t1
WHERE cookieid = 'cookie1';
cookieid createtime pv rn1
-----------------------------
cookie1 2015-04-12 7 1
cookie1 2015-04-11 5 2
cookie1 2015-04-15 4 3
cookie1 2015-04-16 4 3
cookie1 2015-04-13 3 5
cookie1 2015-04-14 2 6
cookie1 2015-04-10 1 7
2.5 dense_rank
dense_rank() 生成数据项在分组中的排名, 若排名相等不会在名次中留下空位
SELECT
cookieid,
createtime,
pv,
DENSE_RANK() OVER(PARTITION BY cookieid ORDER BY pv desc) AS rn2
FROM t1
WHERE cookieid = 'cookie1';
cookieid createtime pv rn2
-------------------------------
cookie1 2015-04-12 7 1
cookie1 2015-04-11 5 2
cookie1 2015-04-15 4 3
cookie1 2015-04-16 4 3
cookie1 2015-04-13 3 4
cookie1 2015-04-14 2 5
cookie1 2015-04-10 1 6
3. 序列分析函数
3.1 数据准备
CREATE TABLE t2(
dept STRING,
userid string,
sal INT
) ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',' ;
select * from t2;
dept userid sal
------------------
d1 user1 1000
d1 user2 2000
d1 user3 3000
d2 user4 4000
d2 user5 5000
3.2 CUME_DIST
cume_dist 小于等于当前值的行数/分组内总数
比如, 统计小于当前薪水的人数, 站总人数的比例
SELECT
dept,
userid,
sal,
CUME_DIST() OVER(ORDER BY sal) AS rn1,-- 小于当前值的数量占总数的占比
CUME_DIST() OVER(PARTITION BY dept ORDER BY sal) AS rn2 -- 分组内小于当前值的数量占总数的占比
FROM t2;
--查询结果
dept userid sal rn1 rn2
------------------------------
d1 user1 1000 0.2 0.333
d1 user2 2000 0.4 0.666
d1 user3 3000 0.6 1.0
d2 user4 4000 0.8 0.5
d2 user5 5000 1.0 1.0
结果说明
rn1 (没有partition) | rn2 (根据 dept 分 partition ) | |
---|---|---|
第1行 | 小于等于1000的行数为1,因此 1/5=0.2 | 分组内小于等于1000的行数为1,分组内总数为3,因此 1/3=0.333 |
第2行 | 小于等于2000的行数为2,因此 2/5=0.4 | 分组内小于等于2000的行数为2,分组内总数为3,因此 2/3=0.666 |
第3行 | 小于等于3000的行数为3,因此 3/5=0.6 | 分组内小于等于3000的行数为3,分组内总数为3,因此 3/3=1.0 |
第4行 | 小于等于4000的行数为4,因此 4/5=0.8 | 分组内小于等于4000的行数为1,分组内总数为2,因此 1/2=0.5 |
第5行 | 小于等于5000的行数为5,因此 5/5=1.0 | 分组内小于等于5000的行数为2,分组内总数为2,因此 2/2=1.0 |
3.3 PERCENT_RANK
percent_rank 分组内当前行的rank值 -1 / 分组内总行数-1
SELECT
dept,
userid,
sal,
PERCENT_RANK() OVER(ORDER BY sal) AS rn1, --分组内
PERCENT_RANK() OVER(PARTITION BY dept ORDER BY sal) AS rn2
FROM t2;
--查询结果
dept userid sal rn2 rn2
-----------------------------
d1 user1 1000 0.0 0.0
d1 user2 2000 0.25 0.5
d1 user3 3000 0.5 1.0
d2 user4 4000 0.75 0.0
d2 user5 5000 1.0 1.0
结果说明
rn1 (没有partition) | rn2 (根据 dept 分 partition ) | |
---|---|---|
第1行 | 小于等于1000的行数为1,因此,(1-1)/(5-1)=0.0 | 分组内小于等于1000的行数为1, 分组内总数为3, 因此(1-1)/(3-1)=0.0 |
第2行 | 小于等于2000的行数为2,因此,(2-1)/(5-1)=0.25 | 分组内小于等于2000的行数为2, 分组内总数为3,因此(2-1)/(3-1)=0.5 |
第3行 | 小于等于3000的行数为3,因此,(3-1)/(5-1)=0.5 | 分组内小于等于3000的行数为3, 分组内总数为3,因此(3-1)/(3-1)=1.0 |
第4行 | 小于等于4000的行数为4,因此,(4-1)/(5-1)=0.75 | 分组内小于等于4000的行数为1, 分组内总数为2,因此(1-1)/(2-1)=0.0 |
第5行 | 小于等于5000的行数为5,因此,(5-1)/(5-1)=1.0 | 分组内小于等于5000的行数为2, 分组内总数为2,因此(2-1)/(2-1)=1.0 |
4. 分析函数
4.1 数据准备
CREATE TABLE t3 (
cookieid string,
createtime string, --页面访问时间
url STRING --被访问页面
) ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',' ;
select * from t3;
cookieid createtime url
------------------------------------
cookie1 2015-04-10 10:00:02 url2
cookie1 2015-04-10 10:00:00 url1
cookie1 2015-04-10 10:03:04 1url3
cookie1 2015-04-10 10:50:05 url6
cookie1 2015-04-10 11:00:00 url7
cookie1 2015-04-10 10:10:00 url4
cookie1 2015-04-10 10:50:01 url5
cookie2 2015-04-10 10:00:02 url22
cookie2 2015-04-10 10:00:00 url11
cookie2 2015-04-10 10:03:04 1url33
cookie2 2015-04-10 10:50:05 url66
cookie2 2015-04-10 11:00:00 url77
cookie2 2015-04-10 10:10:00 url44
cookie2 2015-04-10 10:50:01 url55
4.2 LAG
lag ( col , n , default ) 用于统计窗口往上第n 行值
参数说明:
col:列名
n:往上第 n 行(可选, 默认为 1 )
default:默认值(当往上第 n 行为null 的时候, 取默认值, 否则为null)
SELECT
cookieid,
createtime,
url,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,--组内排名
LAG(createtime,1,'1970-01-01 00:00:00') OVER(PARTITION BY cookieid ORDER BY createtime) AS last_1_time, --取出上一个值
LAG(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime) AS last_2_time --取出上两个值
FROM t3;
--查询结果
cookieid createtime url rn last_1_time last_2_time
----------------------------------------------------------------------------------
cookie1 2015-04-10 10:00:00 url1 1 1970-01-01 00:00:00 NULL
cookie1 2015-04-10 10:00:02 url2 2 2015-04-10 10:00:00 NULL
cookie1 2015-04-10 10:03:04 1url3 3 2015-04-10 10:00:02 2015-04-10 10:00:00
cookie1 2015-04-10 10:10:00 url4 4 2015-04-10 10:03:04 2015-04-10 10:00:02
cookie1 2015-04-10 10:50:01 url5 5 2015-04-10 10:10:00 2015-04-10 10:03:04
cookie1 2015-04-10 10:50:05 url6 6 2015-04-10 10:50:01 2015-04-10 10:10:00
cookie1 2015-04-10 11:00:00 url7 7 2015-04-10 10:50:05 2015-04-10 10:50:01
cookie2 2015-04-10 10:00:00 url11 1 1970-01-01 00:00:00 NULL
cookie2 2015-04-10 10:00:02 url22 2 2015-04-10 10:00:00 NULL
cookie2 2015-04-10 10:03:04 1url33 3 2015-04-10 10:00:02 2015-04-10 10:00:00
cookie2 2015-04-10 10:10:00 url44 4 2015-04-10 10:03:04 2015-04-10 10:00:02
cookie2 2015-04-10 10:50:01 url55 5 2015-04-10 10:10:00 2015-04-10 10:03:04
cookie2 2015-04-10 10:50:05 url66 6 2015-04-10 10:50:01 2015-04-10 10:10:00
cookie2 2015-04-10 11:00:00 url77 7 2015-04-10 10:50:05 2015-04-10 10:50:01
结果说明
last_1_time (往上第1行的值,默认’1970-01-01 00:00:00’) | last_2_time (指定了往上第2行的值,默认值为null) | |
---|---|---|
第1行 | last_1_time往上1行为NULL,取默认值 1970-01-01 00:00:00 | last_2_time往上2行取值为NULL |
第2行 | last_1_time往上1行值为第1行,值为2015-04-10 10:00:00 | last_2_time往上2行取值为NULL |
第3行 | last_1_time往上1行值为第2行,值为2015-04-10 10:03:02 | last_2_time往上2行取值为第1行,值为2015-04-10 10:00:00 |
第4行 | last_1_time往上1行值为第3行,值为2015-04-10 10:00:04 | last_2_time往上2行取值为第2行,值为2015-04-10 10:00:02 |
第5行 | last_1_time往上1行值为第4行,值为2015-04-10 10:10:00 | last_2_time往上2行取值为第3行,值为2015-04-10 10:03:04 |
第6行 | last_1_time往上1行值为第5行,值为2015-04-10 10:50:01 | last_2_time往上2行取值为第4行,值为2015-04-10 10:10:00 |
第7行 | last_1_time往上1行值为第6行,值为2015-04-10 10:50:05 | last_2_time往上2行取值为第5行,值为2015-04-10 10:50:01 |
4.3 LEAD
lead 与lag 相反
lead (col , n , default) 用于统计窗口往下第 n 行的值
参数说明:
col:列名
n:往下第 n 行(可选, 默认为 1 )
default: 默认值(当往下第 n 行为null 的时候, 取默认值, 否则为null)
SELECT
cookieid,
createtime,
url,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn, ---组内排名名
LEAD(createtime,1,'1970-01-01 00:00:00') OVER(PARTITION BY cookieid ORDER BY createtime) AS next_1_time, -- 取下一个的值
LEAD(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime) AS next_2_time --取下两个的值
FROM t3;
--查询结果
cookieid createtime url rn next_1_time next_2_time
-----------------------------------------------------------------------------------
cookie1 2015-04-10 10:00:00 url1 1 2015-04-10 10:00:02 2015-04-10 10:03:04
cookie1 2015-04-10 10:00:02 url2 2 2015-04-10 10:03:04 2015-04-10 10:10:00
cookie1 2015-04-10 10:03:04 1url3 3 2015-04-10 10:10:00 2015-04-10 10:50:01
cookie1 2015-04-10 10:10:00 url4 4 2015-04-10 10:50:01 2015-04-10 10:50:05
cookie1 2015-04-10 10:50:01 url5 5 2015-04-10 10:50:05 2015-04-10 11:00:00
cookie1 2015-04-10 10:50:05 url6 6 2015-04-10 11:00:00 NULL
cookie1 2015-04-10 11:00:00 url7 7 1970-01-01 00:00:00 NULL
cookie2 2015-04-10 10:00:00 url11 1 2015-04-10 10:00:02 2015-04-10 10:03:04
cookie2 2015-04-10 10:00:02 url22 2 2015-04-10 10:03:04 2015-04-10 10:10:00
cookie2 2015-04-10 10:03:04 1url33 3 2015-04-10 10:10:00 2015-04-10 10:50:01
cookie2 2015-04-10 10:10:00 url44 4 2015-04-10 10:50:01 2015-04-10 10:50:05
cookie2 2015-04-10 10:50:01 url55 5 2015-04-10 10:50:05 2015-04-10 11:00:00
cookie2 2015-04-10 10:50:05 url66 6 2015-04-10 11:00:00 NULL
cookie2 2015-04-10 11:00:00 url77 7 1970-01-01 00:00:00 NULL
结果说明
next_1_time(指定了往上第1行的值,默认为’1970-01-01 00:00:00’) | next_2_time(指定了往上第2行的值,默认值为null) | |
---|---|---|
第1行 | last_1_time往下1行值为第2行,值为2015-04-10 10:00:02 | last_2_time往下2行取值为第3行,值为2015-04-10 10:03:04 |
第2行 | last_1_time往下1行值为第3行,值为2015-04-10 10:03:04 | last_2_time往下2行取值为第4行,值为2015-04-10 10:10:00 |
第3行 | last_1_time往下1行值为第4行,值为2015-04-10 10:10:00 | last_2_time往下2行取值为第5行,值为2015-04-10 10:50:01 |
第4行 | last_1_time往下1行值为第5行,值为2015-04-10 10:50:01 | last_2_time往下2行取值为第6行,值为2015-04-10 10:50:05 |
第5行 | last_1_time往下1行值为第6行,值为2015-04-10 10:50:05 | last_2_time往下2行取值为第7行,值为2015-04-10 11:00:00 |
第6行 | last_1_time往下1行值为第7行,值为2015-04-10 11:00:00 | last_2_time往下2行取值为null |
第7行 | last_1_time往下1行为NULL,取默认值1970-01-01 00:00:00 | last_2_time往下2行取值为null |
4.4 FIRST_VALUE
取分组内排序后, 截止到当前行, 第一个值
SELECT
cookieid,
createtime,
url,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,
FIRST_VALUE(url) OVER(PARTITION BY cookieid ORDER BY createtime) AS first1
FROM t3;
cookieid createtime url rn first1
-------------------------------------------------
cookie1 2015-04-10 10:00:00 url1 1 url1
cookie1 2015-04-10 10:00:02 url2 2 url1
cookie1 2015-04-10 10:03:04 1url3 3 url1
cookie1 2015-04-10 10:10:00 url4 4 url1
cookie1 2015-04-10 10:50:01 url5 5 url1
cookie1 2015-04-10 10:50:05 url6 6 url1
cookie1 2015-04-10 11:00:00 url7 7 url1
cookie2 2015-04-10 10:00:00 url11 1 url11
cookie2 2015-04-10 10:00:02 url22 2 url11
cookie2 2015-04-10 10:03:04 1url33 3 url11
cookie2 2015-04-10 10:10:00 url44 4 url11
cookie2 2015-04-10 10:50:01 url55 5 url11
cookie2 2015-04-10 10:50:05 url66 6 url11
cookie2 2015-04-10 11:00:00 url77 7 url11
4.5 LAST_VALUE
取分组内排序后,截止到当前行,最后一个值
SELECT
cookieid,
createtime,
url,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,
LAST_VALUE(url) OVER(PARTITION BY cookieid ORDER BY createtime) AS last1
FROM t3;
--查询结果
cookieid createtime url rn last1
-------------------------------------------------
cookie1 2015-04-10 10:00:00 url1 1 url1
cookie1 2015-04-10 10:00:02 url2 2 url2
cookie1 2015-04-10 10:03:04 1url3 3 1url3
cookie1 2015-04-10 10:10:00 url4 4 url4
cookie1 2015-04-10 10:50:01 url5 5 url5
cookie1 2015-04-10 10:50:05 url6 6 url6
cookie1 2015-04-10 11:00:00 url7 7 url7
cookie2 2015-04-10 10:00:00 url11 1 url11
cookie2 2015-04-10 10:00:02 url22 2 url22
cookie2 2015-04-10 10:03:04 1url33 3 1url33
cookie2 2015-04-10 10:10:00 url44 4 url44
cookie2 2015-04-10 10:50:01 url55 5 url55
cookie2 2015-04-10 10:50:05 url66 6 url66
cookie2 2015-04-10 11:00:00 url77 7 url77
--如果想要取分组内排序后最后一个值,则需要变通一下
SELECT
cookieid,
createtime,
url,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,
LAST_VALUE(url) OVER(PARTITION BY cookieid ORDER BY createtime) AS last1,--截止到当前行的最后一个值, 也就是当前值
FIRST_VALUE(url) OVER(PARTITION BY cookieid ORDER BY createtime DESC) AS last2 --取出分组内最后一个值
FROM t3
ORDER BY cookieid,createtime;
--查询结果
cookieid createtime url rn last1 last2
--------------------------------------------------------
cookie1 2015-04-10 10:00:00 url1 1 url1 url7
cookie1 2015-04-10 10:00:02 url2 2 url2 url7
cookie1 2015-04-10 10:03:04 1url3 3 1url3 url7
cookie1 2015-04-10 10:10:00 url4 4 url4 url7
cookie1 2015-04-10 10:50:01 url5 5 url5 url7
cookie1 2015-04-10 10:50:05 url6 6 url6 url7
cookie1 2015-04-10 11:00:00 url7 7 url7 url7
cookie2 2015-04-10 10:00:00 url11 1 url11 url77
cookie2 2015-04-10 10:00:02 url22 2 url22 url77
cookie2 2015-04-10 10:03:04 1url33 3 1url33 url77
cookie2 2015-04-10 10:10:00 url44 4 url44 url77
cookie2 2015-04-10 10:50:01 url55 5 url55 url77
cookie2 2015-04-10 10:50:05 url66 6 url66 url77
cookie2 2015-04-10 11:00:00 url77 7 url77 url77
--如果不指定ORDER BY,则默认按照记录在文件中的偏移量进行排序,会出现错误的结果,如下
SELECT
cookieid,
createtime,
url,
FIRST_VALUE(url) OVER(PARTITION BY cookieid) AS first2
FROM t3;
cookieid createtime url first2
----------------------------------------------
cookie1 2015-04-10 10:00:02 url2 url2
cookie1 2015-04-10 10:00:00 url1 url2
cookie1 2015-04-10 10:03:04 1url3 url2
cookie1 2015-04-10 10:50:05 url6 url2
cookie1 2015-04-10 11:00:00 url7 url2
cookie1 2015-04-10 10:10:00 url4 url2
cookie1 2015-04-10 10:50:01 url5 url2
cookie2 2015-04-10 10:00:02 url22 url22
cookie2 2015-04-10 10:00:00 url11 url22
cookie2 2015-04-10 10:03:04 1url33 url22
cookie2 2015-04-10 10:50:05 url66 url22
cookie2 2015-04-10 11:00:00 url77 url22
cookie2 2015-04-10 10:10:00 url44 url22
cookie2 2015-04-10 10:50:01 url55 url22
SELECT
cookieid,
createtime,
url,
LAST_VALUE(url) OVER(PARTITION BY cookieid) AS last2
FROM t3;
--查询结果
cookieid createtime url last2
----------------------------------------------
cookie1 2015-04-10 10:00:02 url2 url5
cookie1 2015-04-10 10:00:00 url1 url5
cookie1 2015-04-10 10:03:04 1url3 url5
cookie1 2015-04-10 10:50:05 url6 url5
cookie1 2015-04-10 11:00:00 url7 url5
cookie1 2015-04-10 10:10:00 url4 url5
cookie1 2015-04-10 10:50:01 url5 url5
cookie2 2015-04-10 10:00:02 url22 url55
cookie2 2015-04-10 10:00:00 url11 url55
cookie2 2015-04-10 10:03:04 1url33 url55
cookie2 2015-04-10 10:50:05 url66 url55
cookie2 2015-04-10 11:00:00 url77 url55
cookie2 2015-04-10 10:10:00 url44 url55
cookie2 2015-04-10 10:50:01 url55 url55
提示:在使用分析函数的过程中,要特别注意ORDER BY子句,用的不恰当,统计出的结果就不是你所期望的。
5. 分析函数
这几个分析函数通常用OLAP中, 不能累加, 而且需要根据不同维度上钻和下钻的指标系统, 比如 分, 小时, 天, 月的UV数。
5.1 数据准备
CREATE TABLE t4 (
month STRING,
day STRING,
cookieid STRING
) ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',' ;
select * from t4;
month day cookieid
------------------------------
2015-03 2015-03-10 cookie1
2015-03 2015-03-10 cookie5
2015-03 2015-03-12 cookie7
2015-04 2015-04-12 cookie3
2015-04 2015-04-13 cookie2
2015-04 2015-04-13 cookie4
2015-04 2015-04-16 cookie4
2015-03 2015-03-10 cookie2
2015-03 2015-03-10 cookie3
2015-04 2015-04-12 cookie5
2015-04 2015-04-13 cookie6
2015-04 2015-04-15 cookie3
2015-04 2015-04-15 cookie2
2015-04 2015-04-16 cookie1
5.2 GROUPING SETS..GROUPING_ID
在一个group by 查询中, 根据不同的维度组合进行聚合,等价于将不同维度的group by 结果进行union all
SELECT
month,
day,
COUNT(DISTINCT cookieid) AS uv,
grouping_id() as grouping_id
FROM t4
GROUP BY month,day
GROUPING SETS (month,day)
ORDER BY grouping_id();
--等价于
SELECT month,NULL,COUNT(DISTINCT cookieid) AS uv,1 AS grouping_id FROM t4 GROUP BY month
UNION ALL
SELECT NULL,day,COUNT(DISTINCT cookieid) AS uv,2 AS grouping_id FROM t4 GROUP BY day
--查询结果
month day uv grouping_id
------------------------------------------------
2015-03 NULL 5 1
2015-04 NULL 6 1
NULL 2015-04-12 2 2
NULL 2015-04-13 3 2
NULL 2015-03-12 1 2
NULL 2015-04-16 2 2
NULL 2015-04-15 2 2
NULL 2015-03-10 4 2
SELECT
month,
day,
COUNT(DISTINCT cookieid) AS uv,
grouping_id() b
FROM t4
GROUP BY month,day
GROUPING SETS (month,day,(month,day))
ORDER BY grouping_id();
--等价于
SELECT month,NULL,COUNT(DISTINCT cookieid) AS uv,1 AS grouping_id FROM t4 GROUP BY month
UNION ALL
SELECT NULL,day,COUNT(DISTINCT cookieid) AS uv,2 AS grouping_id FROM t4 GROUP BY day
UNION ALL
SELECT month,day,COUNT(DISTINCT cookieid) AS uv,3 AS grouping_id FROM t4 GROUP BY month,day
--查询结果
month day uv grouping_id
--------------------------------------------------
2015-03 2015-03-12 1 0
2015-04 2015-04-16 2 0
2015-04 2015-04-13 3 0
2015-03 2015-03-10 4 0
2015-04 2015-04-12 2 0
2015-04 2015-04-15 2 0
2015-03 NULL 5 1
2015-04 NULL 6 1
NULL 2015-04-12 2 2
NULL 2015-04-13 3 2
NULL 2015-03-12 1 2
NULL 2015-04-16 2 2
NULL 2015-04-15 2 2
NULL 2015-03-10 4 2
其中的 GROUPING__ID,表示结果属于哪一个分组集合。
5.3 CUBE
cube根据group by 的维度的所有组合进行聚合
SELECT
month,
day,
COUNT(DISTINCT cookieid) AS uv,
GROUPING_ID ()
FROM t4
GROUP BY month,day
WITH CUBE
ORDER BY GROUPING_ID();
--等价于
SELECT NULL,NULL,COUNT(DISTINCT cookieid) AS uv,0 AS GROUPING__ID FROM t4
UNION ALL
SELECT month,NULL,COUNT(DISTINCT cookieid) AS uv,1 AS GROUPING__ID FROM t4 GROUP BY month
UNION ALL
SELECT NULL,day,COUNT(DISTINCT cookieid) AS uv,2 AS GROUPING__ID FROM t4 GROUP BY day
UNION ALL
SELECT month,day,COUNT(DISTINCT cookieid) AS uv,3 AS GROUPING__ID FROM t4 GROUP BY month,day
--查询结果
month day uv GROUPING__ID
-----------------------------------------
2015-03 2015-03-12 1 0
2015-04 2015-04-16 2 0
2015-04 2015-04-13 3 0
2015-03 2015-03-10 4 0
2015-04 2015-04-12 2 0
2015-04 2015-04-15 2 0
2015-03 NULL 5 1
2015-04 NULL 6 1
NULL 2015-04-12 2 2
NULL 2015-04-13 3 2
NULL 2015-03-12 1 2
NULL 2015-04-16 2 2
NULL 2015-04-15 2 2
NULL 2015-03-10 4 2
NULL NULL 7 3
5.4 ROLLUP
rollup 是cube的子集, 以最左侧的维度为主, 从该维度进行层次聚合
--比如,以month维度进行层级聚合:
SELECT
month,
day,
COUNT(DISTINCT cookieid) AS uv,
GROUPING_ID()
FROM t4
GROUP BY month,day
WITH ROLLUP
ORDER BY GROUPING_ID();
--可以实现这样的过程:月天的UV->月的UV->总UV
--查询结果
month day uv GROUPING_ID
---------------------------------------------------
2015-03 2015-03-12 1 0
2015-04 2015-04-16 2 0
2015-04 2015-04-13 3 0
2015-03 2015-03-10 4 0
2015-04 2015-04-12 2 0
2015-04 2015-04-15 2 0
2015-03 NULL 5 1
2015-04 NULL 6 1
NULL NULL 7 3
--把month和day调换顺序,则以day维度进行层级聚合:
SELECT
day,
month,
COUNT(DISTINCT cookieid) AS uv,
GROUPING_ID()
FROM t4
GROUP BY day,month
WITH ROLLUP
ORDER BY GROUPING_ID();
--可以实现这样的上钻过程:天月的UV->天的UV->总UV
--查询结果
day month uv GROUPING_ID
-------------------------------------------------------
2015-03-12 2015-03 1 0
2015-04-12 2015-04 2 0
2015-04-15 2015-04 2 0
2015-03-10 2015-03 4 0
2015-04-13 2015-04 3 0
2015-04-16 2015-04 2 0
2015-04-12 NULL 2 1
2015-04-13 NULL 3 1
2015-04-15 NULL 2 1
2015-03-10 NULL 4 1
2015-04-16 NULL 2 1
2015-03-12 NULL 1 1
NULL NULL 7 3
这里, 根据天和月进行聚合, 和根据天聚合结果是一样的, 因为有父子关系, 所以是其他维度组合的话, 就会不一样)
这种函数, 需要结合实际场景和数据去使用和研究, 只看说明的话, 很难理解