2012-07-29 74 views
1

我正在尝试重新格式化数据集my.data以获取my.data2语句下面显示的输出。具体来说,我想把最后4列my.data放在一行上,每record.id,其中my.data的最后4个列将占据新数据矩阵的列2-5,如果group=1和列6-9,如果group=2R:将组合的行按行组合

我写了下面的繁琐代码,但双for循环导致了一个我根本找不到的错误。 即使双for循环的工作,我怀疑有办成 同样的事情的一个更有效的方式 - (?也许reshape

感谢您的帮助纠正双for循环或更高效码。

my.data <- "record.id group s1 s2 s3 s4 
    1 1  2  0  1  3 
    1 2  0  0  0  12 
    2 1  0  0  0  0 
    3 1  10  0  0  0 
    4 1  1  0  0  0 
    4 2  0  0  0  0 
    8 2  0  2  2  0 
    9 1  0  0  0  0 
    9 2  0  0  0  0"  

my.data2 <- read.table(textConnection(my.data), header=T) 

# desired output 
# 
# 1  2  0  1  3  0  0  0  12 
# 2  0  0  0  0  0  0  0  0 
# 3 10  0  0  0  0  0  0  0 
# 4  1  0  0  0  0  0  0  0 
# 8  0  0  0  0  0  2  2  0 
# 9  0  0  0  0  0  0  0  0 

代码:

dat_sorted <- sort(unique(my.data2[,1])) 
my.seq <- match(my.data2[,1],dat_sorted) 

my.data3 <- cbind(my.seq, my.data2) 

group.min <- tapply(my.data3$group, my.data3$my.seq, min) 
group.max <- tapply(my.data3$group, my.data3$my.seq, max) 

# my.min <- group.min[my.data3[,1]] 
# my.max <- group.max[my.data3[,1]] 

my.records <- matrix(0, nrow=length(unique(my.data3$record.id)), ncol=9) 

x <- 1 

for(i in 1:max(my.data3$my.seq)) { 

    for(j in group.min[i]:group.max[i]) { 

     if(my.data3[x,1] == i) my.records[i,1] = i 

     # the two lines below seem to be causing an error 
     if((my.data3[x,1] == i) & (my.data3[x,3] == 1)) (my.records[i,2:5] = my.data3[x,4:7]) 
     if((my.data3[x,1] == i) & (my.data3[x,3] == 2)) (my.records[i,6:9] = my.data3[x,4:7]) 

     x <- x + 1 

    } 
} 

回答

2

你是对的,reshape在这里帮助。

library(reshape2) 
m <- melt(my.data2, id.var = c("record.id", "group")) 
dcast(m, record.id ~ group + variable, fill = 0) 
    record.id 1_s1 1_s2 1_s3 1_s4 2_s1 2_s2 2_s3 2_s4 
1   1 2 0 1 3 0 0 0 12 
2   2 0 0 0 0 0 0 0 0 
3   3 10 0 0 0 0 0 0 0 
4   4 1 0 0 0 0 0 0 0 
5   8 0 0 0 0 0 2 2 0 
6   9 0 0 0 0 0 0 0 0 

比较:

dfTest <- data.frame(record.id = rep(1:10e5, each = 2), group = 1:2, 
s1 = sample(1:10, 10e5 * 2, replace = TRUE), 
s2 = sample(1:10, 10e5 * 2, replace = TRUE), 
s3 = sample(1:10, 10e5 * 2, replace = TRUE), 
s4 = sample(1:10, 10e5 * 2, replace = TRUE)) 


system.time({ 
...# Your code 
}) 
Error in my.records[i, 1] = i : incorrect number of subscripts on matrix 
Timing stopped at: 41.61 0.36 42.56 

system.time({m <- melt(dfTest, id.var = c("record.id", "group")) 
       dcast(m, record.id ~ group + variable, fill = 0)}) 
    user system elapsed 
    25.04 2.78 28.72 
1

朱利叶斯的回答是好,但为了完整,我觉得我设法得到以下for循环的工作:

dat_x <- (unique(my.data2[,1])) 
my.seq <- match(my.data2[,1],dat_x) 

my.data3 <- as.data.frame(cbind(my.seq, my.data2)) 

my.records <- matrix(0, nrow=length(unique(my.data3$record.id)), ncol=9) 
my.records <- as.data.frame(my.records) 

my.records[,1] = unique(my.data3[,2]) 

for(i in 1:9) { 

     if(my.data3[i,3] == 1) (my.records[my.data3[i,1],c(2:5)] = my.data3[i,c(4:7)]) 
     if(my.data3[i,3] == 2) (my.records[my.data3[i,1],c(6:9)] = my.data3[i,c(4:7)]) 

}