2016-04-26 69 views
1
-- Derived-table fragment (aliased b): yields one row per id from batting.
-- NOTE(review): HAB adds up the per-row hits/ab ratios; that is not the same
-- value as SUM(hits)/SUM(ab) -- confirm which one is intended.
(SELECT 
    id, 
    SUM(hits/ab) AS HAB 
FROM batting 
GROUP BY id 
) b 

-- Reads id, bmonth and bstate from master (aliased a), keeping only rows with
-- bmonth >= 0 and a non-NULL bstate, grouped by (bmonth, bstate).
-- NOTE(review): id is selected but is neither aggregated nor listed in
-- GROUP BY -- confirm the intended grouping before running this.
SELECT id, bmonth, bstate FROM master a 

WHERE bmonth >= 0 AND bstate is NOT NULL 
GROUP By bmonth,bstate 

到目前为止,我只写出了上面这段不完整的代码,但不知道该如何构造连接(join)并继续往下写,也不知道应该从哪里入手。应该使用连接(join)还是子查询?请参考下面的表结构给予帮助。(标题:Hive 连接还是子查询的困惑)

看:

-- External table over comma-delimited rows stored under the given LOCATION.
-- Created only if it does not already exist.
CREATE EXTERNAL TABLE IF NOT EXISTS batting (
    id          STRING,
    year        INT,
    team        STRING,
    league      STRING,
    games       INT,
    ab          INT,
    runs        INT,
    hits        INT,
    doubles     INT,
    triples     INT,
    homeruns    INT,
    rbi         INT,
    sb          INT,
    cs          INT,
    walks       INT,
    strikeouts  INT,
    ibb         INT,
    hbp         INT,
    sh          INT,
    sf          INT,
    gidp        INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION '/home/hduser/hivetest/batting';

-- External table over comma-delimited rows stored under the given LOCATION.
-- Created only if it does not already exist.
CREATE EXTERNAL TABLE IF NOT EXISTS master (
    id         STRING,
    byear      INT,
    bmonth     INT,
    bday       INT,
    bcountry   STRING,
    bstate     STRING,
    bcity      STRING,
    dyear      INT,
    dmonth     INT,
    dday       INT,
    dcountry   STRING,
    dstate     STRING,
    dcity      STRING,
    fname      STRING,
    lname      STRING,
    name       STRING,
    weight     INT,
    height     INT,
    bats       STRING,
    throws     STRING,
    debut      STRING,
    finalgame  STRING,
    retro      STRING,
    bbref      STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION '/home/hduser/hivetest/master';

回答

1

首先要确保至少有 3 名球员来自同一个州(state)和同一个月份(month)。为此,按州和月份分组,统计每组的 id 数量,并只保留 count(id) >= 3 的结果。

-- (bstate, bmonth) combinations in master that have at least 3 non-NULL ids.
SELECT
    bstate,
    bmonth
FROM master
GROUP BY
    bstate,
    bmonth
HAVING COUNT(id) >= 3

然后,将上述结果集与 batting(击球)表连接,按月份和州分组,按 sum(hits)/sum(at bats)(安打总数除以打数总数)排序,并取第一行。

-- For every (bmonth, bstate) combination shared by at least 3 ids in master,
-- divide the total hits by the total ab of all matching batting rows, then
-- return the single combination with the smallest ratio.
SELECT
    a.bmonth,
    a.bstate,
    -- Both operands come from batting; master.bats is a STRING column and
    -- cannot serve as the denominator.
    SUM(c.hits) / SUM(c.ab) AS hb
FROM (
    -- Qualifying (bmonth, bstate) groups: at least 3 non-NULL ids each.
    SELECT
        bmonth,
        bstate
    FROM master
    GROUP BY
        bmonth,
        bstate
    HAVING COUNT(id) >= 3
) a
-- Re-join master to recover the ids that belong to each qualifying group.
-- The columns are named bstate/bmonth; master has no state or month column.
INNER JOIN master b
    ON a.bstate = b.bstate
    AND a.bmonth = b.bmonth
INNER JOIN batting c
    ON b.id = c.id
GROUP BY
    a.bmonth,
    a.bstate
-- A group with no at-bats has an undefined ratio; exclude it so it cannot
-- be the row returned by LIMIT 1.
HAVING SUM(c.ab) > 0
ORDER BY hb
LIMIT 1;
+0

你好,感谢你的回答,但我得到了一个错误:FAILED: ParseException,无法识别输入 '(' '(' 'select'(in from source)– dedpo

+0

@dedpo 我已经编辑了答案,请现在再试一次。抱歉,我目前无法访问集群。 –

+0

这不是预期的输出,但它对于如何执行多连接非常有用 – dedpo

0

下面是该查询

-- Per-id ratio of total hits to total ab, restricted to ids whose master row
-- has bmonth >= 0 and a non-NULL bstate.
SELECT
    p.id,
    SUM(p.hits) / SUM(p.ab) AS output
FROM (
    SELECT
        m.id,
        b.ab,
        b.hits
    FROM master m
    INNER JOIN batting b
        ON m.id = b.id
    WHERE m.bmonth >= 0
        AND m.bstate IS NOT NULL
) p  -- Hive requires every subquery in FROM to be given a name
GROUP BY p.id