高端电子商务网站建设,蓝顿长沙网站制作公司,上海市建设工程咨询奖,济南做外贸网站的公司目录 1. 问题描述2. 问题复现3. 问题原因4. 问题解决 1. 问题描述
在Hive中#xff08;其他类似SQL#xff0c;比如PostgreSQL可能也存在此问题#xff09;#xff0c;当对多张表#xff08;3张及以上#xff09;进行full join时#xff0c;会存在每张表的主键都是唯一… 目录 1. 问题描述2. 问题复现3. 问题原因4. 问题解决 1. 问题描述
在Hive中其他类似SQL比如PostgreSQL可能也存在此问题当对多张表3张及以上进行full join时会存在每张表的主键都是唯一但当full join后会发现主键可能有重复。
2. 问题复现
2.1. 插入数据
with temp1 as (
select 1 as id ,张三 as name
union all
select 2 as id ,李四 as name
union all
select 3 as id ,王五 as name
),
temp2 as (
select 1 as id ,深圳 as city
union all
select 3 as id ,北京 as city
union all
select 4 as id ,上海 as city
),
temp3 as (
select 1 as id ,5000 as salary
union all
select 4 as id ,10000 as salary
union all
select 5 as id ,12000 as salary
)2.2. 查询SQL以及问题
selectcoalesce(a.id, b.id, c.id) as id, a.name, b.city, c.salary
from temp1 as afull join temp2 as b
on a.id b.idfull join temp3 as c
on a.id c.id当执行如上查询SQL时会发现其中 id 4 的数据有重复如下图所示
3. 问题原因
之所以会出现这样的问题是因为是以a表为主表但a表中只有 id 为 1、2、3 的数据但在b表中有id为4c表中也有id为4此时full join时a表会以空值和b表、c表中id为4的数据join这样关联后的表中就会出现2条id为4的数据
4. 问题解决
在后续的表full join时不单单使用第一张表的主键full join因为是full join所以肯定会存在第一张表为null而其他表有值的数据而使用 coalesce 方法对所有前面已经full join了的主键进行条件关联如下代码 方法1
selectcoalesce(a.id, b.id, c.id) as id, a.name, b.city, c.salary
from temp1 as afull join temp2 as b
on a.id b.idfull join temp3 as c
on coalesce(a.id, b.id) c.id结果如下 方法2
select temp.id, temp.name, temp.city, c.salary
from
(selectcoalesce(a.id, b.id) as id, a.name , b.city
from temp1 as afull join temp2 as b
on a.id b.id) temp
full join
temp3 as c
on temp.id c.id方法3
select
temp.id
,temp1.name
,temp2.city
,temp3.salary
from
(select id
from
(select
id
from temp1
union all
select
id
from temp2
union all
select
id
from temp3
) tt
group by id
) temp
left join temp1
on temp.id temp1.id
left join temp2
on temp.id temp2.id
left join temp3
on temp.id temp3.id