2017-07-05 21 views
0

我正在创建一个外部表 customer,其中包含客户 ID、姓名和配偶姓名。

-- Customer table whose name columns are structs of (fname, lname).
-- With a delimited row format only ONE "collection items" delimiter can be
-- declared, and it is used for every struct column of the table.
CREATE TABLE customer (
    cust_id     int,
    name        struct<fname:string, lname:string>,
    spouse_name struct<fname:string, lname:string>
)
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '$';

我想知道,如果传入的数据来源是这样的

1,FNAME1$LNAME1,SPOUSE_FNAME1#SPOUSE_LNAME1 
2,FNAME2$LNAME2,SPOUSE_FNAME2#SPOUSE_LNAME2 

我无法在 "collection items terminated by" 子句中指定两个分隔符。'$' 分隔符只会拆分 FNAME* 和 LNAME*,对 SPOUSE_FNAME* 和 SPOUSE_LNAME* 不起任何作用。我们是否需要为此编写一个自定义 SerDe?我不确定真实环境中的数据是什么样子,但很可能在某个时候我们会收到这样的数据。

+0

数据处理应该经过仔细的规划和管理。文本字段本身也可能包含符号 ','、'$' 或 '#'。“拿到什么数据就用什么数据,等问题出现了再处理”的做法不会让你走得太远。 –

回答

0

一种可能的方法是将结构加载为简单的字符串并在视图中执行数据操作。

-- Staging table: both name fields are read as raw strings, so the
-- struct delimiter of each column is not interpreted at load time.
CREATE EXTERNAL TABLE customer (
    cust_id     INT,
    name        STRING,
    spouse_name STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
;

-- Show the raw rows; columns are listed explicitly instead of using *.
SELECT
    cust_id,
    name,
    spouse_name
FROM customer
;

+---------+---------------+-----------------------------+ 
| cust_id |  name  |   spouse_name   | 
+---------+---------------+-----------------------------+ 
|  1 | FNAME1$LNAME1 | SPOUSE_FNAME1#SPOUSE_LNAME1 | 
|  2 | FNAME2$LNAME2 | SPOUSE_FNAME2#SPOUSE_LNAME2 | 
+---------+---------------+-----------------------------+ 

-- Exposes the raw string columns of customer as (fname, lname) structs.
-- split() takes a regular expression, so '$' is escaped; '#' is used as is.
CREATE VIEW customer_v
AS
SELECT
    split_names.cust_id,
    named_struct('fname', split_names.name_parts[0],
                 'lname', split_names.name_parts[1])   AS name,
    named_struct('fname', split_names.spouse_parts[0],
                 'lname', split_names.spouse_parts[1]) AS spouse_name
FROM (
    SELECT
        cust_id,
        split(name, '\\$')      AS name_parts,
        split(spouse_name, '#') AS spouse_parts
    FROM customer
) split_names
;

-- Show the struct-typed rows; columns are listed explicitly instead of using *.
SELECT
    cust_id,
    name,
    spouse_name
FROM customer_v
;

+---------+-------------------------------------+---------------------------------------------------+ 
| cust_id |    name     |     spouse_name     | 
+---------+-------------------------------------+---------------------------------------------------+ 
|  1 | {"fname":"FNAME1","lname":"LNAME1"} | {"fname":"SPOUSE_FNAME1","lname":"SPOUSE_LNAME1"} | 
|  2 | {"fname":"FNAME2","lname":"LNAME2"} | {"fname":"SPOUSE_FNAME2","lname":"SPOUSE_LNAME2"} | 
+---------+-------------------------------------+---------------------------------------------------+ 
0

试试这个

-- Raw staging table: every field of the comma-separated file is a plain column.
CREATE TABLE customer (
    cust_id     int,
    name        string,
    spouse_name string
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- Replace the table's contents with the input file from HDFS.
LOAD DATA INPATH '<hdfs path of input file>' OVERWRITE INTO TABLE customer;

-- External text table at a fixed HDFS location; it receives the rows
-- rewritten with a single struct delimiter.
CREATE EXTERNAL TABLE customer_tmp (
    cust_id     int,
    name        string,
    spouse_name string
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/hdfs_location_of_customer_tmp';

-- Normalise both struct delimiters ('$' and '#') to ':' so that one
-- "collection items" delimiter can parse both name columns.
-- Only these two characters are replaced, so spaces or other punctuation
-- inside a name are left untouched.
INSERT OVERWRITE TABLE customer_tmp
SELECT
    cust_id,
    regexp_replace(name, '[$#]', ':')        AS name,
    regexp_replace(spouse_name, '[$#]', ':') AS spouse_name
FROM customer;

-- Final table: both name columns are structs that share ':' as the
-- collection-item delimiter.
CREATE TABLE customer_final (
    cust_id     int,
    name        struct<fname:string, lname:string>,
    spouse_name struct<fname:string, lname:string>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY ':'
STORED AS TEXTFILE;

-- Replace the table's contents with the files under the customer_tmp location.
LOAD DATA INPATH '/hdfs_location_of_customer_tmp/*' OVERWRITE INTO TABLE customer_final;

请别忘了告诉我们这个方法是否有效 :)