-- 反例 selectid, user_id from partition_table_name; -- 执行起来会很慢,因为需要扫描所有分区 -- 正例 selectid,user_id from partition_table_name wheredate = '${date}'-- 只扫描制定的 date 分区
2. 不要偷懒使用 * 查全部数据字段,需要什么字段拿什么
对于一些大宽表来说,可能有七八十个字段,但是实际上只用到了其中两三个字段,造成了严重的资源浪费
-- 反例 select su.star_id, au.user_id, au.device_id from star_user_profile as su join ( select * from aweme_user_profile -- aweme_user_profile 可能有上百个字段 wheredate = ${date} and user_id > 0 ) as au on au.user_id = su.core_user_id; -- 正例 select su.star_id, au.user_id, au.device_id from star_user_profile as su join ( select user_id, device_id from aweme_user_profile wheredate = ${date} and user_id > 0 ) as au on au.user_id = su.core_user_id;
3. 分区条件都放到表后,join 前
-- 反例 select ui.id, ui.user_id, ui.device_id, di.device_brand from partition_user_profile_info as ui leftjoin partition_device_info as di on di.device_id = ui.device_id and di.device_id > 0 where ui.date = '${date}'and di.date = '${date}'
-- 正例 select ui.id, ui.user_id, ui.device_id, di.device_brand from (selectid, user_id, device_id from partition_user_profile_info wheredate = '${date}') as ui leftjoin (select device_id, device_brand from partition_device_info wheredate = '${date}') as di on di.device_id = ui.device_id
4. join时,改表的条件紧跟当前表,不要join一堆表后都放到一个on里面
-- 反例 select g.star_id, g.gender_distribution, a.age_distribution, p.province_distribution, c.city_distribution, d.device_brand_distribution, unix_timestamp() as update_time from p_gender_dist as g join p_age_dist as a join p_province_dist as p join p_city_dist as c join p_device_brand_dist as d on g.core_user_id = a.core_user_id and a.core_user_id = p.core_user_id and p.core_user_id = c.core_user_id and c.core_user_id = d.core_user_id -- 正例 select g.star_id, g.gender_distribution, a.age_distribution, p.province_distribution, c.city_distribution, d.device_brand_distribution, unix_timestamp() as update_time from p_gender_dist as g join p_age_dist as a on g.core_user_id = a.core_user_id join p_province_dist as p on a.core_user_id = p.core_user_id join p_city_dist as c on p.core_user_id = c.core_user_id join p_device_brand_dist as d on c.core_user_id = d.core_user_id
set spark.shuffle.io.maxRetries=1; set spark.shuffle.io.retryWait=0s; set spark.network.timeout=120s;
使用 external shuffle service 提高性能
set spark.shuffle.hdfs.enabled
6. sql 的严谨性
一条记录里经常会有各种空字段,记得对空字段进行处理
-- 反例 concat(":", tmp.province, cast(tmp.cnt as string))
-- 正例 concat_ws(":", case when tmp.province is NULL or tmp.province="" then "unknown" else tmp.province end, cast(tmp.cnt asstring))
做数据统计的时候,不要过分信任数据的准确性,需要自己做一层判断,例如
-- 反例 (select author_id, item_id, user_id from aweme_user_item_info) as item_reach;
-- 正例 (select author_id, item_id, user_id from aweme_user_item_info where item_id > 0) as item_reach -- 排除 item_id = 0 的脏数据
其他常用函数
获取group by 后每一组对该列的排位。使用场景举例:获取所有博主的最近 2 条微博
table: author_item_dict
author_id
weibo_id
create_time
1
23489032
1615217421
1
12309780
1615217419
2
43242511
1615217418
1
45435346
1615217422
select author_id, weibo_id from ( select author_id, weibo_id, row_number over(partitionby author_id orderby create_time) as rn from author_item_dict ) as ai where ai.rn >= 2