2017-03-16 63 views
1

我想转换以下数据集:按 ID 将某一列的值展开成多列(每个 ID 占一行)

transaction_id productsku 
1    SK0001 
1    SK0002 
2    AB0001 
2    AC0001 
2    AC0002 
3    BC0001 
4    BC0002 

理想的数据集是:

transaction_id x1  x2  x3 
1    SK0001 SK0002 
2    AB0001 AC0001 AC0002 
3    BC0001 
4    BC0002 

于是我用下面的代码进行转换,但失败了。我的想法是先按 transaction_id 分组,然后取出每组的 productsku:

# Reshape attempt from the question; this call fails with the error shown below.
order <- cast(order_0, transaction_id ~ productsku) 

Using productsku as value column. Use the value argument to cast to override this choice 
Error in `[.data.frame`(data, , variables, drop = FALSE) : 
    undefined columns selected 

回答

1

试试 split。按 transaction_id 拆分之后,把列表中的每个元素都按最大的 productsku 个数取子集(长度不足的位置会补 NA),再用 rbind 把列表合并成矩阵。

# Group the productsku values by transaction_id into a named list.
sku_by_id <- split(df$productsku, df$transaction_id)
# Length of the longest group; shorter groups are padded up to this length.
n_cols <- max(lengths(sku_by_id))
# Indexing past the end of a vector yields NA, which pads the short groups.
padded <- lapply(sku_by_id, function(skus) skus[seq_len(n_cols)])
# Stack the padded vectors row-wise: one row per transaction_id.
do.call(rbind, padded)
# [,1]  [,2]  [,3]  
#1 "SK0001" "SK0002" NA  
#2 "AB0001" "AC0001" "AC0002" 
#3 "BC0001" NA  NA  
#4 "BC0002" NA  NA 


DATA

# Example data from the question: seven rows spread over four transactions.
df <- data.frame(
  transaction_id = c(1L, 1L, 2L, 2L, 2L, 3L, 4L),
  productsku = c(
    "SK0001", "SK0002", "AB0001", "AC0001", "AC0002", "BC0001", "BC0002"
  ),
  stringsAsFactors = FALSE
)
0

这是一种方法。思路是先把同一组内的值拼接成一个字符串,然后用 separate 把它拆分到不同的列:

library(tidyverse)

# Collapse each transaction's SKUs into one comma-separated string, then split
# that string back out into one column per SKU.
df %>%
  group_by(transaction_id) %>%
  summarise(product = paste(productsku, collapse = ", ")) %>%
  # fill = "right" pads transactions that have fewer than three SKUs with NA on
  # the right instead of emitting a "Too few values" warning.
  separate(product, c("x1", "x2", "x3"), sep = ", ", fill = "right")

# A tibble: 4 × 4
#   transaction_id x1     x2     x3
#            <int> <chr>  <chr>  <chr>
# 1              1 SK0001 SK0002 <NA>
# 2              2 AB0001 AC0001 AC0002
# 3              3 BC0001 <NA>   <NA>
# 4              4 BC0002 <NA>   <NA>
0

使用 data.table 的一种简单快速的替代方案,只需两步:

library(data.table)

# Convert df into a data.table by reference; the steps below use data.table's
# `[i, j, by]` syntax, which is not available on a plain data.frame.
setDT(df)

# Step 1: gather the productsku values of each transaction_id into a single
# string. toString() joins the values with ", " (comma followed by a space).
temp <- df[, .(product = toString(productsku)), by = transaction_id]

# Step 2: split that string into separate columns. The separator must be the
# same ", " that toString() produced, otherwise x2 and x3 keep a leading space.
# Short rows are padded with ""; use fill = NA to pad with NA instead.
temp[, c("x1", "x2", "x3") := tstrsplit(product, ", ", fixed = TRUE, fill = "")]

temp
#> transaction_id    product  x1  x2  x3 
#> 1:    1   SK0001, SK0002 SK0001 SK0002   
#> 2:    2 AB0001, AC0001, AC0002 AB0001 AC0001 AC0002 
#> 3:    3     BC0001 BC0001     
#> 4:    4     BC0002 BC0002  

另一种快速的替代方案是使用 data.table 的 dcast,其输出略有不同:

# Using dcast: one output column per distinct productsku value.
# as.data.table() lets the call run whether df is still a plain data.frame or
# has already been converted; value.var is named explicitly so dcast does not
# have to guess which column holds the cell values.
dcast(as.data.table(df), transaction_id ~ productsku, value.var = "productsku")

#> transaction_id AB0001 AC0001 AC0002 BC0001 BC0002 SK0001 SK0002 
#> 1:    1  NA  NA  NA  NA  NA SK0001 SK0002 
#> 2:    2 AB0001 AC0001 AC0002  NA  NA  NA  NA 
#> 3:    3  NA  NA  NA BC0001  NA  NA  NA 
#> 4:    4  NA  NA  NA  NA BC0002  NA  NA