You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1317 lines
37 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

SHOW DATABASES;
CREATE DATABASE IF NOT EXISTS szt;
USE szt;
SHOW TABLES;
--1 ODS 原始表, 不做改动, 直接加载
DROP TABLE IF EXISTS ods_szt_data;
CREATE EXTERNAL TABLE ods_szt_data(
deal_date String,
close_date String,
card_no String,
deal_value String,
deal_type String,
company_name String,
car_no String,
station String,
conn_mark String,
deal_money String,
equ_no String)
PARTITIONED BY(DAY STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/warehouse/szt.db/ods/ods_szt_data';
LOAD DATA INPATH '/warehouse/szt.db/ods/szt-etl-data_2018-09-01.csv' OVERWRITE INTO TABLE ods_szt_data PARTITION(DAY = '2018-09-01');
SELECT * FROM ods_szt_data WHERE DAY = '2018-09-01' LIMIT 10;
SELECT collect_set(deal_type) FROM ods_szt_data; --["地铁出站", "地铁入站", "巴士"]
--2 DWD 过滤掉 巴士, 过滤掉不在运营时间的地铁出入站数据
-- DROP TABLE IF EXISTS dwd_fact_szt_detail;
DROP TABLE IF EXISTS dwd_fact_szt_in_out_detail;--地铁出入站
CREATE EXTERNAL TABLE dwd_fact_szt_in_out_detail(
deal_date String,
close_date String,
card_no String,
deal_value String,
deal_type String,
company_name String,
car_no String,
station String,
conn_mark String,
deal_money String,
equ_no String)
PARTITIONED BY(DAY STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/warehouse/szt.db/dwd/dwd_fact_szt_in_out_detail';
INSERT OVERWRITE TABLE dwd_fact_szt_in_out_detail partition(DAY = '2018-09-01')
SELECT deal_date,
close_date,
card_no,
deal_value,
deal_type,
company_name,
car_no,
station,
conn_mark,
deal_money,
equ_no
FROM ods_szt_data
WHERE deal_type != '巴士'
AND unix_timestamp(deal_date, 'yyyy-MM-dd HH:mm:ss') > unix_timestamp('2018-09-01 06:14:00', 'yyyy-MM-dd HH:mm:ss')
AND unix_timestamp(deal_date, 'yyyy-MM-dd HH:mm:ss') < unix_timestamp('2018-09-01 23:59:00', 'yyyy-MM-dd HH:mm:ss')
AND DAY = '2018-09-01'
ORDER BY deal_date;
SELECT count(*) FROM dwd_fact_szt_in_out_detail; --780937
-- -- - dwd_fact_szt_in_detail
DROP TABLE IF EXISTS dwd_fact_szt_in_detail;
CREATE EXTERNAL TABLE dwd_fact_szt_in_detail(
deal_date String,
close_date String,
card_no String,
deal_value String,
deal_type String,
company_name String,
car_no String,
station String,
conn_mark String,
deal_money String,
equ_no String)
PARTITIONED BY(DAY STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/warehouse/szt.db/dwd/dwd_fact_szt_in_detail';
INSERT OVERWRITE TABLE dwd_fact_szt_in_detail partition(DAY = '2018-09-01')
SELECT deal_date,
close_date,
card_no,
deal_value,
deal_type,
company_name,
car_no,
station,
conn_mark,
deal_money,
equ_no
FROM dwd_fact_szt_in_out_detail
WHERE deal_type = '地铁入站'
AND DAY = '2018-09-01'
ORDER BY deal_date ;
SELECT count(*) FROM dwd_fact_szt_in_detail;
--415386
-- -- - dwd_fact_szt_out_detail
DROP TABLE IF EXISTS dwd_fact_szt_out_detail;
CREATE EXTERNAL TABLE dwd_fact_szt_out_detail(
deal_date String,
close_date String,
card_no String,
deal_value String,
deal_type String,
company_name String,
car_no String,
station String,
conn_mark String,
deal_money String,
equ_no String)
PARTITIONED BY(DAY STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/warehouse/szt.db/dwd/dwd_fact_szt_out_detail';
SELECT count(*) FROM dwd_fact_szt_out_detail;
INSERT OVERWRITE TABLE dwd_fact_szt_out_detail partition(DAY = '2018-09-01')
SELECT deal_date,
close_date,
card_no,
deal_value,
deal_type,
company_name,
car_no,
station,
conn_mark,
deal_money,
equ_no
FROM dwd_fact_szt_in_out_detail
WHERE deal_type = '地铁出站'
AND DAY = '2018-09-01'
ORDER BY deal_date ;--365551
-- --DWS 宽表
DROP TABLE IF EXISTS dws_card_record_day_wide;
CREATE EXTERNAL TABLE dws_card_record_day_wide(
card_no STRING,
deal_date_arr ARRAY < STRING > ,
deal_value_arr ARRAY < STRING > ,
deal_type_arr ARRAY < STRING > ,
company_name_arr ARRAY < STRING > ,
station_arr ARRAY < STRING > ,
conn_mark_arr ARRAY < STRING > ,
deal_money_arr ARRAY < STRING > ,
equ_no_arr ARRAY < STRING > ,
`count` int)
PARTITIONED BY(DAY STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/warehouse/szt.db/dws/dws_card_record_day_wide';
INSERT OVERWRITE TABLE dws_card_record_day_wide PARTITION(DAY = '2018-09-01')
SELECT card_no,
collect_list(deal_date),
collect_list(deal_value),
collect_list(deal_type),
collect_list(company_name),
collect_list(station),
collect_list(conn_mark),
collect_list(deal_money),
collect_list(equ_no),
count(*) c
FROM dwd_fact_szt_in_out_detail
WHERE DAY = '2018-09-01'
GROUP BY card_no
ORDER BY c DESC;
--412082
SELECT * FROM dws_card_record_day_wide LIMIT 100;
--ADS 业务表, 当天的表现
DROP TABLE IF EXISTS ads_in_station_day_top;
CREATE EXTERNAL TABLE ads_in_station_day_top(
station STRING,
deal_date_arr ARRAY < STRING > ,
card_no_arr ARRAY < STRING > ,
company_name_arr ARRAY < STRING > ,
equ_no_arr ARRAY < STRING > ,
`count` int)
PARTITIONED BY(DAY string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/warehouse/szt.db/ads/ads_in_station_day_top';
INSERT OVERWRITE TABLE ads_in_station_day_top PARTITION(DAY = '2018-09-01')
SELECT station,
collect_list(deal_date),
collect_list(card_no),
collect_list(company_name),
collect_list(equ_no),
count(*) c
FROM dwd_fact_szt_in_detail
WHERE DAY = '2018-09-01'
GROUP BY station
ORDER BY c DESC;
SELECT * FROM ads_in_station_day_top LIMIT 20;
---
DROP TABLE IF EXISTS ads_out_station_day_top;
CREATE EXTERNAL TABLE ads_out_station_day_top(
station STRING,
deal_date_arr ARRAY < STRING > ,
card_no_arr ARRAY < STRING > ,
deal_value_arr ARRAY < STRING > ,
company_name_arr ARRAY < STRING > ,
conn_mark_arr ARRAY < STRING > ,
deal_money_arr ARRAY < STRING > ,
equ_no_arr ARRAY < STRING > ,
`count` int)
PARTITIONED BY(DAY string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/warehouse/szt.db/ads/ads_out_station_day_top';
INSERT OVERWRITE TABLE ads_out_station_day_top PARTITION(DAY = '2018-09-01')
SELECT station,
collect_list(deal_date),
collect_list(card_no),
collect_list(deal_value),
collect_list(company_name),
collect_list(conn_mark),
collect_list(deal_money),
collect_list(equ_no),
count(*) c
FROM dwd_fact_szt_out_detail
WHERE DAY = '2018-09-01'
GROUP BY station
ORDER BY c DESC;
SELECT * FROM ads_out_station_day_top LIMIT 20;
SELECT collect_list(station) FROM ads_out_station_day_top;
SELECT collect_list(station) FROM ads_in_station_day_top;
---
---ads_in_out_station_day_top
DROP TABLE IF EXISTS ads_in_out_station_day_top;
CREATE EXTERNAL TABLE ads_in_out_station_day_top(
station STRING,
deal_date_arr ARRAY < STRING > ,
card_no_arr ARRAY < STRING > ,
deal_value_arr ARRAY < STRING > ,
deal_type_arr ARRAY < STRING > ,
company_name_arr ARRAY < STRING > ,
conn_mark_arr ARRAY < STRING > ,
deal_money_arr ARRAY < STRING > ,
equ_no_arr ARRAY < STRING > ,
`count` int)
PARTITIONED BY(DAY string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/warehouse/szt.db/ads/ads_in_out_station_day_top';
INSERT OVERWRITE TABLE ads_in_out_station_day_top PARTITION(DAY = '2018-09-01')
SELECT station,
collect_list(deal_date),
collect_list(card_no),
collect_list(deal_value),
collect_list(deal_type),
collect_list(company_name),
collect_list(conn_mark),
collect_list(deal_money),
collect_list(equ_no),
count(*) c
FROM dwd_fact_szt_in_out_detail
WHERE DAY = '2018-09-01'
GROUP BY station
ORDER BY c DESC;
SELECT * FROM ads_in_out_station_day_top LIMIT 20;
-------------------------------------------------------------
---卡片单日消费排行榜
DROP TABLE IF EXISTS ads_card_deal_day_top;
CREATE EXTERNAL TABLE ads_card_deal_day_top (
card_no STRING,
deal_date_arr ARRAY<STRING>,
deal_sum DOUBLE,
company_name_arr ARRAY<STRING>,
station_arr ARRAY<STRING>,
conn_mark_arr ARRAY<STRING>,
deal_m_sum DOUBLE,
equ_no_arr ARRAY<STRING>,
`count` INT)
PARTITIONED BY(DAY string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/warehouse/szt.db/ads/ads_card_deal_day_top';
---
insert overwrite table ads_card_deal_day_top partition (day='2018-09-01')
SELECT
t1.card_no,
t1.deal_date_arr,
t2.deal_sum,
t1.company_name_arr,
t1.station_arr,
t1.conn_mark_arr,
t3.deal_m_sum,
t1.equ_no_arr,
t1.`count`
from
dws_card_record_day_wide as t1,
(SELECT card_no, sum(deal_v) OVER(PARTITION BY card_no) AS deal_sum FROM dws_card_record_day_wide LATERAL VIEW explode(deal_value_arr) tmp as deal_v )t2,
(SELECT card_no, sum(deal_m) OVER(PARTITION BY card_no) AS deal_m_sum FROM dws_card_record_day_wide LATERAL VIEW explode(deal_money_arr) tmp as deal_m )t3
WHERE t1.day='2018-09-01' AND
t1.card_no = t2.card_no AND
t2.card_no = t3.card_no
ORDER BY t2.deal_sum DESC--ok
;--ok
SELECT * from ads_card_deal_day_top LIMIT 100;
----------
SELECT card_no, sum(deal_v) OVER(PARTITION BY card_no) AS deal_sum
FROM dws_card_record_day_wide LATERAL VIEW explode(deal_value_arr) tmp as deal_v ORDER BY deal_sum DESC;--ok
SELECT sum(c1) FROM
dws_card_record_day_wide LATERAL VIEW explode(deal_value_arr) tmp as c1
WHERE card_no='HHAAJICJB';--ok
-- 分组
SELECT
card_no,
sum(deal_value) OVER(PARTITION BY card_no) as deal_sum
FROM
dwd_fact_szt_out_detail
ORDER BY deal_sum desc
;--ok
-----------------------------
SELECT collect_set(company_name) FROM dwd_fact_szt_out_detail;
--["地铁一号线","地铁四号线","地铁九号线","地铁五号线","地铁十一号线","地铁二号线","地铁三号线","地铁七号线"]
-- ads_line_send_passengers_day_top
DROP TABLE IF EXISTS ads_line_send_passengers_day_top;
CREATE EXTERNAL TABLE ads_line_send_passengers_day_top(
company_name String,
`count` int
)
PARTITIONED BY(DAY STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/warehouse/szt.db/ads/ads_line_send_passengers_day_top';
-- 查询 dwd_fact_szt_in_out_detail 进出站详情表
---------------------------------------------------------------------------
-- 进站,线路 +1
-- 出站直达,线路不加,-- 什么也不做
-- 出站联程,线路 +1.
--t1
SELECT company_name,
deal_type,
conn_mark,
count(*) c
FROM dwd_fact_szt_in_out_detail
WHERE DAY = '2018-09-01' and deal_type='地铁入站'
GROUP BY company_name,deal_type,conn_mark
ORDER BY c DESC ;
--LIMIT 10;
--t2
SELECT company_name,
deal_type,
conn_mark,
count(*) c
FROM dwd_fact_szt_in_out_detail
WHERE DAY = '2018-09-01' and deal_type='地铁出站' and conn_mark='1'
GROUP BY company_name,deal_type,conn_mark
ORDER BY c DESC ;
--t3
INSERT OVERWRITE TABLE ads_line_send_passengers_day_top PARTITION(DAY = '2018-09-01')
SELECT t1.company_name,
t1.c+t2.c AS c
FROM
(SELECT company_name,
deal_type,
conn_mark,
count(*) c
FROM dwd_fact_szt_in_out_detail
WHERE DAY = '2018-09-01' and deal_type='地铁入站'
GROUP BY company_name,deal_type,conn_mark)
t1,
(SELECT company_name,
deal_type,
conn_mark,
count(*) c
FROM dwd_fact_szt_in_out_detail
WHERE DAY = '2018-09-01' and deal_type='地铁出站' and conn_mark='1'
GROUP BY company_name,deal_type,conn_mark)
t2
WHERE t1.company_name=t2.company_name
ORDER BY c DESC ;
select * from ads_line_send_passengers_day_top;
-- 每日运输乘客最多的区间排行榜
----------------------------------------------------------------------------------
drop table if exists dws_in_out_sorted_card_date;
create external table dws_in_out_sorted_card_date (
card_no string,
deal_date string,
deal_type string,
station string
)
partitioned by(day string) row format delimited fields terminated by ',' location '/warehouse/szt.db/dws/dws_in_out_sorted_card_date';
insert overwrite table dws_in_out_sorted_card_date partition (day="2018-09-01")
select
card_no,
deal_date,
deal_type,
station
from dwd_fact_szt_in_out_detail
where day="2018-09-01"
order by card_no, deal_date;
----------------------------------------------------------------------------------
drop table if exists temp02;
create table temp02 as
select
card_no,
deal_date,
deal_type,
station,
concat_ws('@', deal_type, station) deal_type_station
from dws_in_out_sorted_card_date;
drop table if exists temp03;
create table temp03 as
select
card_no,
deal_date,
deal_type_station,
LEAD(deal_type_station,1) over(partition by card_no order by deal_date) as next_station
from temp02;
drop table if exists temp04;
create table temp04 as
select
card_no,
deal_type_station,
next_station,
concat_ws('>', deal_type_station, next_station) as station2station
from
temp03 where
substr(deal_type_station,0,4)='地铁入站'
and
substr(next_station,0,4)='地铁出站'
;
drop table if exists temp05;
create table temp05 as
select
regexp_replace(station2station,'地铁入站@|地铁出站@','') short_stations
from temp04;
drop table if exists temp06;
create table temp06 as
select
short_stations,
count(*) as `count`
from temp05
group by short_stations
order by `count` desc;
----------------------------------------------------------------------------------
-- 合并
drop table if exists ads_stations_send_passengers_day_top;
create external table ads_stations_send_passengers_day_top(
short_stations string,
`count` int
)
partitioned by(day string) row format delimited fields terminated by ',' location '/warehouse/szt.db/ads/ads_stations_send_passengers_day_top';
insert overwrite table ads_stations_send_passengers_day_top partition (day="2018-09-01")
select
short_stations,
count(*) as `count`
from (
select
regexp_replace(station2station,'地铁入站@|地铁出站@','') short_stations
from (
select
card_no,
deal_type_station,
next_station,
concat_ws('>', deal_type_station, next_station) as station2station
from (
select
card_no,
deal_date,
deal_type_station,
LEAD(deal_type_station,1) over(partition by card_no order by deal_date) as next_station
from (
select
card_no,
deal_date,
deal_type,
station,
concat_ws('@', deal_type, station) deal_type_station
from dws_in_out_sorted_card_date
where day='2018-09-01'
) temp02
) temp03 where
substr(deal_type_station,0,4)='地铁入站'
and
substr(next_station,0,4)='地铁出站'
) temp04
) temp05
group by short_stations
order by `count` desc;
----------------------------------------------------------------------------------
----------------------------------------------------------------------------------
--- 每条线路单程直达乘客耗时平均值排行榜
--建宽表
drop table if exists dws_in_out_sorted_card_date_wide;
create external table dws_in_out_sorted_card_date_wide (
card_no string,
deal_date string,
ts string,
deal_value string,
deal_type string,
company_name string,
station string,
conn_mark string,
deal_money string,
equ_no string
)
partitioned by(day string) row format delimited fields terminated by ',' location '/warehouse/szt.db/dws/dws_in_out_sorted_card_date_wide';
insert overwrite table dws_in_out_sorted_card_date_wide partition (day="2018-09-01")
select
card_no,
deal_date,
unix_timestamp(deal_date) ts,
deal_value,
deal_type,
company_name,
station,
conn_mark,
deal_money,
equ_no
from dwd_fact_szt_in_out_detail
where day="2018-09-01"
order by card_no, deal_date;
----------------------------------------------------------------------------------
-- 拼接单程,起始时间
drop table if exists temp02;
create table temp02 COMMENT '临时中间表,拼接单程+起始时间戳'
row format delimited fields terminated by ',' location '/warehouse/szt.db/temp/temp02'
as select
card_no,
ts,
deal_type,
company_name,
station,
conn_mark,
concat_ws('@',ts, deal_type, station) ts_deal_type_station
from dws_in_out_sorted_card_date_wide
where day='2018-09-01'
;
-- 寻找下一程
drop table if exists temp03;
create table temp03 COMMENT '临时中间表,开窗寻找下一程'
row format delimited fields terminated by ',' location '/warehouse/szt.db/temp/temp03'
as select
card_no,
ts,
company_name,
conn_mark,
ts_deal_type_station,
LEAD(ts_deal_type_station,1) over(partition by card_no order by ts) as ts_next_station
from temp02;
select substr('1535767701@地铁出站@少年宫',0,10);
select substr('1535767701@地铁出站@少年宫',11,1);
select substr('1535765725@地铁入站@向西村',12,4);
select substr('1535767701@地铁出站@少年宫',0,10)-substr('1535765725@地铁入站@向西村',0,10) as time_s;
-- 过滤合法行程
drop table if exists temp04;
create table temp04 COMMENT '临时中间表,过滤合法记录'
row format delimited fields terminated by ',' location '/warehouse/szt.db/temp/temp04'
as select
card_no,
company_name,
ts_deal_type_station,
ts_next_station,
-- 求时差
substr(ts_next_station,0,10)-substr(ts_deal_type_station,0,10) as time_s
from
temp03 where
substr(ts_deal_type_station ,12,4)='地铁入站'
and
substr(ts_next_station ,12,4)='地铁出站'
and
conn_mark ='0'
;
-- 分组求平均
drop table if exists ads_line_single_ride_average_time_day_top;
create external table ads_line_single_ride_average_time_day_top(
company_name string,
avg_time_s double
)COMMENT '每条线路单程直达乘客耗时平均值排行榜'
partitioned by(day string) row format delimited fields terminated by ',' location '/warehouse/szt.db/ads/ads_line_single_ride_average_time_day_top';
insert overwrite table ads_line_single_ride_average_time_day_top partition (day="2018-09-01")
select
company_name,
avg(time_s) avg_time_s
from temp04
group by company_name
order by avg_time_s
;
select * from ads_line_single_ride_average_time_day_top;
----------------------------------------------------------------------------------
-- 合并
drop table if exists ads_line_single_ride_average_time_day_top;
create external table ads_line_single_ride_average_time_day_top(
company_name string,
avg_time_s double
)COMMENT '每条线路单程直达乘客耗时平均值排行榜'
partitioned by(day string) row format delimited fields terminated by ',' location '/warehouse/szt.db/ads/ads_line_single_ride_average_time_day_top';
insert overwrite table ads_line_single_ride_average_time_day_top partition (day="2018-09-01")
select
company_name,
avg(time_s) avg_time_s
from (
select
card_no,
company_name,
ts_deal_type_station,
ts_next_station,
-- 求时差
substr(ts_next_station,0,10)-substr(ts_deal_type_station,0,10) as time_s
from (
select
card_no,
ts,
company_name,
conn_mark,
ts_deal_type_station,
LEAD(ts_deal_type_station,1) over(partition by card_no order by ts) as ts_next_station
from (
select
card_no,
ts,
deal_type,
company_name,
station,
conn_mark,
concat_ws('@',ts, deal_type, station) ts_deal_type_station
from dws_in_out_sorted_card_date_wide
where day='2018-09-01'
) temp02
) temp03 where
substr(ts_deal_type_station ,12,4)='地铁入站'
and
substr(ts_next_station ,12,4)='地铁出站'
and
conn_mark ='0'
) temp04
group by company_name
order by avg_time_s
;
-----------------------------------------------------------------------------------
--所有乘客通勤时间平均值
--ads_all_passengers_single_ride_spend_time_average
-- 拼接单程,起始时间
drop table if exists temp02;
create table temp02 COMMENT '临时中间表,拼接单程+起始时间戳'
row format delimited fields terminated by ',' location '/warehouse/szt.db/temp/temp02'
as select
card_no,
ts,
deal_type,
company_name,
station,
conn_mark,
concat_ws('@',ts, deal_type, station) ts_deal_type_station
from dws_in_out_sorted_card_date_wide where day='2018-09-01'
;
-- 寻找下一程
drop table if exists temp03;
create table temp03 COMMENT '临时中间表,开窗寻找下一程'
row format delimited fields terminated by ',' location '/warehouse/szt.db/temp/temp03'
as select
card_no,
ts,
company_name,
conn_mark,
ts_deal_type_station,
LEAD(ts_deal_type_station,1) over(partition by card_no order by ts) as ts_next_station
from temp02;
select substr('1535767701@地铁出站@少年宫',0,10);
select substr('1535767701@地铁出站@少年宫',11,1);
select substr('1535765725@地铁入站@向西村',12,4);
select substr('1535767701@地铁出站@少年宫',0,10)-substr('1535765725@地铁入站@向西村',0,10) as time_s;
-- 过滤合法行程,允许多程,添加联程字段
drop table if exists temp04;
create table temp04 COMMENT '临时中间表,过滤合法记录'
row format delimited fields terminated by ',' location '/warehouse/szt.db/temp/temp04'
as select
card_no,
company_name,
conn_mark,
ts_deal_type_station,
ts_next_station,
-- 求时差
substr(ts_next_station,0,10)-substr(ts_deal_type_station,0,10) as time_s
from
temp03 where
substr(ts_deal_type_station ,12,4)='地铁入站'
and
substr(ts_next_station ,12,4)='地铁出站'
;
-- 全表求平均
drop table if exists ads_all_passengers_single_ride_spend_time_average;
create external table ads_all_passengers_single_ride_spend_time_average (
all_avg_time_s double
)COMMENT '所有乘客通勤时间平均值'
partitioned by(day string) row format delimited fields terminated by ',' location '/warehouse/szt.db/ads/ads_all_passengers_single_ride_spend_time_average';
insert overwrite table ads_all_passengers_single_ride_spend_time_average partition (day="2018-09-01")
select
avg(time_s) all_avg_time_s
from temp04
;
select * from ads_all_passengers_single_ride_spend_time_average;
----------------------------------------------------------------------------------
-- 合并,注意这里有一些可以复用的临时表
drop table if exists ads_all_passengers_single_ride_spend_time_average;
create external table ads_all_passengers_single_ride_spend_time_average (
all_avg_time_s double
)COMMENT '所有乘客通勤时间平均值'
partitioned by(day string) row format delimited fields terminated by ',' location '/warehouse/szt.db/ads/ads_all_passengers_single_ride_spend_time_average';
SET hive.load.dynamic.partitions.thread=15;
insert overwrite table ads_all_passengers_single_ride_spend_time_average partition (day="2018-09-01")
select
avg(time_s) all_avg_time_s
from (
select
card_no,
company_name,
conn_mark,
ts_deal_type_station,
ts_next_station,
-- 求时差
substr(ts_next_station,0,10)-substr(ts_deal_type_station,0,10) as time_s
from (
select
card_no,
ts,
company_name,
conn_mark,
ts_deal_type_station,
LEAD(ts_deal_type_station,1) over(partition by card_no order by ts) as ts_next_station
from (
select
card_no,
ts,
deal_type,
company_name,
station,
conn_mark,
concat_ws('@',ts, deal_type, station) ts_deal_type_station
from dws_in_out_sorted_card_date_wide
where day='2018-09-01'
) temp02
) temp03 where
substr(ts_deal_type_station ,12,4)='地铁入站'
and
substr(ts_next_station ,12,4)='地铁出站'
) temp04
;
select * from ads_all_passengers_single_ride_spend_time_average;
-----------------------------------------------------------------------------------
--深圳地铁乘客单程通勤时间排行榜 ads_passenger_spend_time_day_top
--合并 过滤合法行程+ ads 建表
drop table if exists ads_passenger_spend_time_day_top;
create external table ads_passenger_spend_time_day_top
(
card_no string,
company_name string,
ts_deal_type_station string,
ts_next_station string,
time_s double
)
COMMENT '通勤时间排行榜倒序'
partitioned by (day string) row format delimited fields terminated by ','
location '/warehouse/szt.db/ads/ads_passenger_spend_time_day_top';
insert overwrite table ads_passenger_spend_time_day_top partition (day='2018-09-01')
select
card_no,
company_name,
ts_deal_type_station,
ts_next_station,
substr(ts_next_station,0,10)-substr(ts_deal_type_station,0,10) as time_s
from (
select
card_no,
ts,
company_name,
conn_mark,
ts_deal_type_station,
LEAD(ts_deal_type_station,1) over(partition by card_no order by ts) as ts_next_station
from
(
select
card_no,
ts,
deal_type,
company_name,
station,
conn_mark,
concat_ws('@',ts, deal_type, station) ts_deal_type_station
from dws_in_out_sorted_card_date_wide
where day='2018-09-01'
) temp02
)
temp03 where
substr(ts_deal_type_station ,12,4)='地铁入站'
and
substr(ts_next_station ,12,4)='地铁出站'
order by time_s desc
;
select * from ads_passenger_spend_time_day_top;
-----------------------------------------------------------------
-----------------------------------------------------------------
--内部表 每个站点入站闸机数量 ads_station_in_equ_num_top
--内部表 每个站点出站闸机数量 ads_station_out_equ_num_top
drop table if exists ads_station_in_equ_num_top;
create table ads_station_in_equ_num_top
row format delimited fields terminated by ',' location '/warehouse/szt.db/ads/ads_station_in_equ_num_top'
as
select company_name,station,
count(distinct equ_no) c
from dwd_fact_szt_in_detail
group by company_name,station
order by c desc
;
select * from ads_station_in_equ_num_top;
drop table if exists ads_station_out_equ_num_top;
create table ads_station_out_equ_num_top
row format delimited fields terminated by ',' location '/warehouse/szt.db/ads/ads_station_out_equ_num_top'
as
select company_name,station,
count(distinct equ_no) c
from dwd_fact_szt_out_detail
group by company_name,station
order by c desc
;
select * from ads_station_out_equ_num_top;
-- 各线路进站闸机数统计排行榜
drop table if exists ads_line_in_equ_num_top;
create table ads_line_in_equ_num_top
row format delimited fields terminated by ',' location '/warehouse/szt.db/ads/ads_line_in_equ_num_top'
as
select
company_name,
count(distinct equ_no) c
from
dwd_fact_szt_in_detail
group by company_name
order by c desc
;
select * from ads_line_in_equ_num_top;
-- 各线路出站闸机数统计排行榜
drop table if exists ads_line_out_equ_num_top;
create table ads_line_out_equ_num_top
row format delimited fields terminated by ',' location '/warehouse/szt.db/ads/ads_line_out_equ_num_top'
as
select
company_name,
count(distinct equ_no) c
from
dwd_fact_szt_out_detail
group by company_name
order by c desc
;
SELECT * from ads_line_out_equ_num_top;
------------------------------------------------------------------------------------
-- 车站收入排行榜 ads_station_deal_top
drop table if exists ads_station_deal_top;
create external table ads_station_deal_top(
company_name string,
station string,
deal_money_sum double,
deal_value_sum double
)
partitioned by(day string) row format delimited fields terminated by ',' location '/warehouse/szt.db/ads/ads_station_deal_top';
insert overwrite table ads_station_deal_top partition (day="2018-09-01")
select
company_name,
station,
sum(deal_money)/100 deal_money_sum,
sum(deal_value)/100 deal_value_sum
from dwd_fact_szt_out_detail
where day='2018-09-01'
group by company_name, station
order by deal_money_sum desc
;
select * from ads_station_deal_top;
-- 线路收入排行榜 ads_line_deal_top
drop table if exists ads_line_deal_top;
create external table ads_line_deal_top(
company_name string,
deal_money_sum double,
deal_value_sum double
)
partitioned by(day string) row format delimited fields terminated by ',' location '/warehouse/szt.db/ads/ads_line_deal_top';
insert overwrite table ads_line_deal_top partition (day="2018-09-01")
select
company_name,
sum(deal_money)/100 deal_money_sum,
sum(deal_value)/100 deal_value_sum
from dwd_fact_szt_out_detail
where day='2018-09-01'
group by company_name
order by deal_money_sum desc
;
select * from ads_line_deal_top;
------------------------------------------------------------------------------------
-- 各线路换乘出站乘客百分比排行榜
-- 各线路出站乘客统计
SELECT
company_name,
count(*) c1
from dwd_fact_szt_out_detail
where day='2018-09-01'
GROUP BY company_name
;--t1
-- 各线路出站直达乘客统计
SELECT
company_name,
count(*) c2
from dwd_fact_szt_out_detail
where day='2018-09-01' and conn_mark='1'
GROUP BY company_name
;--t2
-- 求比例,排序
drop table if exists ads_conn_ratio_day_top;
CREATE EXTERNAL TABLE ads_conn_ratio_day_top(
company_name STRING,
ratio DOUBLE
)
COMMENT '联程百分比'
partitioned by(day string) row format delimited fields terminated by ',' location '/warehouse/szt.db/ads/ads_conn_ratio_day_top';
insert overwrite table ads_conn_ratio_day_top partition (day="2018-09-01")
SELECT
t1.company_name,
t2.c2/t1.c1*100 rate
from
(
SELECT
company_name,
count(*) c1
from dwd_fact_szt_out_detail
where day='2018-09-01'
GROUP BY company_name
)
t1 FULL join
(
SELECT
company_name,
count(*) c2
from dwd_fact_szt_out_detail
where day='2018-09-01' and conn_mark='1'
GROUP BY company_name
)
t2
on t1.company_name=t2.company_name
ORDER BY rate desc
;
SELECT * from ads_conn_ratio_day_top;
------------------------------------------------------------------------------------
---各线路车费优惠人次百分比
-- 各线路出站乘客统计
SELECT
company_name,
count(*) c1
from dwd_fact_szt_out_detail
where day='2018-09-01'
GROUP BY company_name
;--t1
-- 各线路直达出站并且享受优惠的人数
SELECT
company_name,
count( if(deal_value - deal_money=0,null,1) ) sale_count
from dwd_fact_szt_out_detail
where day='2018-09-01' and conn_mark='0'
GROUP BY company_name
;--t2
-------------------------
drop table if exists ads_line_sale_ratio_top;
CREATE EXTERNAL TABLE ads_line_sale_ratio_top(
company_name STRING,
ratio DOUBLE
)
partitioned by(day string) row format delimited fields terminated by ',' location '/warehouse/szt.db/ads/ads_line_sale_ratio_top';
insert overwrite table ads_line_sale_ratio_top partition (day="2018-09-01")
SELECT t1.company_name,
t2.sale_count/t1.c1*100 ratio
from
(
SELECT
company_name,
count(*) c1
from dwd_fact_szt_out_detail
where day='2018-09-01'
GROUP BY company_name
)
t1,
(
SELECT
company_name,
count( if(deal_value - deal_money=0,null,1) ) sale_count
from dwd_fact_szt_out_detail
where day='2018-09-01' and conn_mark='0'
GROUP BY company_name
)
t2
where t1.company_name=t2.company_name
order by ratio desc
;
SELECT * from ads_line_sale_ratio_top;
------------------------------------------------------------------------------------
-------------------------------------------------------------
-- 换乘时间最久的乘客排行榜
-- 拼接单程,起始时间
drop table if exists temp022;
create table temp022 COMMENT '临时中间表,拼接单程:前缀:联程标记+# 起始时间戳+行程'
row format delimited fields terminated by ',' location '/warehouse/szt.db/temp/temp022'
as select
card_no,
ts,
deal_type,
company_name,
station,
conn_mark,
concat_ws('@',conn_mark, ts, deal_type, station) conn_ts_deal_type_station
from dws_in_out_sorted_card_date_wide where day='2018-09-01'
;
-- 寻找下一程
drop table if exists temp033;
create table temp033 COMMENT '临时中间表,开窗寻找下一程'
row format delimited fields terminated by ',' location '/warehouse/szt.db/temp/temp033'
as select
card_no,
ts,
company_name,
conn_mark,
conn_ts_deal_type_station,
LEAD(conn_ts_deal_type_station,1) over(partition by card_no order by ts) as conn_ts_next_station
from temp022;
select substr('0@1535760350@地铁入站@益田',3,10);
-- 过滤合法行程,允许多程,添加联程字段
drop table if exists temp044;
create table temp044 COMMENT '临时中间表,过滤合法记录'
row format delimited fields terminated by ',' location '/warehouse/szt.db/temp/temp044'
as select
card_no,
company_name,
conn_mark,
conn_ts_deal_type_station,
conn_ts_next_station,
substr(conn_ts_next_station,3,10)-substr(conn_ts_deal_type_station,3,10) as time_s
from
temp033 where
substr(conn_ts_deal_type_station ,14,4)='地铁入站'
and
substr(conn_ts_next_station ,14,4)='地铁出站'
;
-- 创建行程表
create external table dws_travel_info(
card_no string,
company_name string,
conn_mark string,
conn_ts_deal_type_station string,
conn_ts_next_station string,
time_s double
)
partitioned by(day string) row format delimited fields terminated by ',' location '/warehouse/szt.db/dws/dws_travel_info';
insert overwrite table dws_travel_info partition (day="2018-09-01")
select
card_no,
company_name,
conn_mark,
conn_ts_deal_type_station,
conn_ts_next_station,
substr(conn_ts_next_station,3,10)-substr(conn_ts_deal_type_station,3,10) as time_s
from
(
select
card_no,
ts,
company_name,
conn_mark,
conn_ts_deal_type_station,
LEAD(conn_ts_deal_type_station,1) over(partition by card_no order by ts) as conn_ts_next_station
from
(
select
card_no,
ts,
deal_type,
company_name,
station,
conn_mark,
concat_ws('@',conn_mark, ts, deal_type, station) conn_ts_deal_type_station
from dws_in_out_sorted_card_date_wide where day='2018-09-01'
)
temp022
)
temp033 where
substr(conn_ts_deal_type_station ,14,4)='地铁入站'
and
substr(conn_ts_next_station ,14,4)='地铁出站'
;
drop table if exists temp055;
create table temp055
row format delimited fields terminated by ',' location '/warehouse/szt.db/temp/temp055'
as select
card_no,
company_name,
substr(conn_ts_deal_type_station,19) pre_station,
substr(conn_ts_next_station,19) last_station,
time_s
from temp044
where
substr(conn_ts_next_station,1,1)='1'
order by time_s desc
;
select
card_no,
company_name,
pre_station,
last_station,
time_s
from temp055
where pre_station != last_station
order by time_s desc
;
------------------------合并
drop table if exists ads_conn_spend_time_top;
CREATE EXTERNAL TABLE ads_conn_spend_time_top(
card_no string,
company_name STRING,
pre_station string,
last_station string,
time_s double
)
partitioned by(day string) row format delimited fields terminated by ',' location '/warehouse/szt.db/ads/ads_conn_spend_time_top';
insert overwrite table ads_conn_spend_time_top partition (day="2018-09-01")
select
card_no,
company_name,
pre_station,
last_station,
time_s
from
(
select
card_no,
company_name,
substr(conn_ts_deal_type_station,19) pre_station,
substr(conn_ts_next_station,19) last_station,
time_s
from
dws_travel_info where day='2018-09-01'
where
substr(conn_ts_next_station,1,1)='1'
order by time_s desc
)
temp055
where pre_station != last_station
order by time_s desc
;
select * from ads_conn_spend_time_top;
------------------------------------------------------------------------------------
--------------------------------------- 孖岭
drop table if exists ods_line_station;
create table ods_line_station
(
line_no string comment '线路编号',
line_name string comment '线路名称',
station_no string comment '车站编号',
station_name string comment '车站名称',
line_station_id string comment '线站唯一名称'
) comment '线站详情表|孖岭|szmc.net-metro.csv' row format delimited fields terminated by ',' location '/warehouse/szt.db/ods/ods_line_station';
load data inpath '/warehouse/szt.db/ods/szmc.net-metro.csv' overwrite into table ods_line_station;
select * from ods_line_station;
----------------------------------------------------------------------------
-- 线路规划表,指定必要字段
drop table if exists ods_travel_plan;
create table ods_travel_plan
(
start_station_id string comment '始发站 id|qidiancode',
end_station_id string comment '终到站 id|zhondiancode',
switch_counts string comment '换乘次数|times',
tralev_all string comment '所有行程'
) comment '线路规划、换乘方案来自深圳地铁官网45932 条|MiniTime.jsons -->45932 csv'
row format delimited fields terminated by '\t' location '/warehouse/szt.db/ods/ods_travel_plan';
load data inpath '/warehouse/szt.db/ods/travel_plan.csv' overwrite into table ods_travel_plan;
select * from ods_travel_plan;
select get_json_object(tralev_all, "$.[0,1,2].next_sid")
from ods_travel_plan limit 1000,1050;
select start_station_id,
end_station_id,
switch_counts,
get_json_object( tralev_all, "$.[0,1,2,3,4,5].travel_time") travel_time,
get_json_object( tralev_all, "$.[0,1,2,3,4,5].switch_time") switch_time,
get_json_object( tralev_all, "$.[0:5:1].next_sid") next_sid,
get_json_object( tralev_all, "$.[0].next_sid") next_sid1,
get_json_object( tralev_all, "$.[0::1].next_sid") next_sid2
from ods_travel_plan limit 1000,1100;
------------------------------------------------------------------------------------------------------
-- 合并 ods_line_station + ods_travel_plan ---> dwd_travel_plan_detail
drop table if exists dwd_travel_plan_detail;
drop table if exists dwd_travel_plan_detail;
create table dwd_travel_plan_detail row format delimited fields terminated by '\t' location '/warehouse/szt.db/dwd/dwd_travel_plan_detail' as
select start_station_id,
line1.station_name as start_station_name,
end_station_id,
line2.station_name as end_station_name,
switch_counts,
tralev_all
from ods_line_station as line1,
ods_travel_plan as plan,
ods_line_station as line2
where line1.line_station_id =
plan.start_station_id
and plan.end_station_id =
line2.line_station_id;
-- 所有换乘方案
select * from dwd_travel_plan_detail;