2015-06-22 45 views
4

R:保留某一列中第二种取值的首次出现

我有一个非常简单的数据集:

ID Value  Time 
1 censored 1 
1 censored 2 
1 uncensored 3 
1 uncensored 4 
1 censored 5 
1 censored 6 
2 censored 1 
2 uncensored 2 
2 uncensored 3 
2 uncensored 4 
2 censored 5 

我想保留每个ID中第一次出现的uncensored行,以及在该uncensored之后第一次出现的censored行。例如:

ID Value  Time 
1 uncensored 3 
1 censored  5 
2 uncensored 2 
2 censored  5 

并不是每个ID的第一个censored都出现在时间5,这仅仅是一个例子。
Value是一个二元变量:1表示censored,0表示uncensored,不过我已经给它们加上了标签。

+0

感谢大家的回答,他们真的很有帮助 – Lb93

回答

4

下面是另一个可能的data.table解决方案

library(data.table) 
# For each ID, report the Time of the first "uncensored" row and the Time of
# the first "censored" row that comes after it.
# The censored row is searched for relative to the FIRST uncensored row rather
# than taken as "the row after the last uncensored row", so interleaved runs
# (uncensored, censored, uncensored, censored) still give the intended row.
# A group with no matching row yields NA for the corresponding Time.
# Note: setDT() converts df1 to a data.table by reference.
setDT(df1)[, {
  first_unc <- match("uncensored", Value)
  # Restrict the search to rows positioned after first_unc; which(...)[1L]
  # is NA when no such row exists.
  next_cens <- which(Value == "censored" & seq_len(.N) > first_unc)[1L]
  list(Value = c("uncensored", "censored"),
       Time = c(Time[first_unc], Time[next_cens]))
}, by = ID]
# ID  Value Time 
# 1: 1 uncensored 3 
# 2: 1 censored 5 
# 3: 2 uncensored 2 
# 4: 2 censored 5 

或者类似地,使用which而不是match

# Same result as the match() variant, written with which(): for each ID, the
# Time of the first "uncensored" row and of the first "censored" row after it.
# The censored row is chosen among the censored positions that lie after the
# first uncensored position, so it stays correct when censored and uncensored
# runs alternate more than once. Missing rows give NA.
# Note: setDT() converts df1 to a data.table by reference.
setDT(df1)[, {
  unc_pos <- which(Value == "uncensored")[1L]
  cens_pos <- which(Value == "censored")
  list(Value = c("uncensored", "censored"),
       Time = c(Time[unc_pos], Time[cens_pos[cens_pos > unc_pos][1L]]))
}, by = ID]
2

尝试

library(data.table) 
# Collect, per ID, the global row numbers (.I) of the first "uncensored" row
# and of the first "censored" row that is not part of the group's first run.
# The run ids are computed inside j instead of being added to df1 with `:=`,
# so df1 is not left carrying a helper column after this snippet has run.
indx <- setDT(df1)[, {
  run_id <- rleid(Value)
  c(.I[Value == "uncensored"][1L],
    .I[Value == "censored" & run_id > 1L][1L])
}, by = ID]$V1
df1[indx]
# ID  Value Time 
#1: 1 uncensored 3 
#2: 1 censored 5 
#3: 2 uncensored 2 
#4: 2 censored 5 

或者,使用与@Thomas发布的答案类似的思路

# Per ID: take the global row number of the first "uncensored" row, then the
# smallest global row number of a "censored" row that is larger than it.
# The two row numbers are returned as one list column and pulled out via $V1.
indx <- setDT(df1)[, {
  first_unc <- .I[Value == "uncensored"][1L]
  cens_rows <- .I[Value == "censored"]
  next_cens <- cens_rows[cens_rows > first_unc][1L]
  list(c(first_unc, next_cens))
}, by = ID]$V1
df1[indx]
# ID  Value Time 
#1: 1 uncensored 3 
#2: 1 censored 5 
#3: 2 uncensored 2 
#4: 2 censored 5 

或者使用dplyr

library(dplyr) 
# Within each ID: drop every row before the first "uncensored" row, then keep
# the first "uncensored" and the first "censored" row of what remains.
df1 %>%
  group_by(ID) %>%
  slice(seq(which(Value == "uncensored")[1L], n())) %>%
  slice(match(c("uncensored", "censored"), Value))
# ID  Value Time 
#1 1 uncensored 3 
#2 1 censored 5 
#3 2 uncensored 2 
#4 2 censored 5 
0

尝试

# Build `result`: for every ID in df, the first row with Value == 0
# (uncensored) followed by the first later row with Value == 1 (censored).
# Assumes Value is coded 0/1 as described in the question.
# Pieces are collected in a preallocated list and bound once at the end
# instead of growing `result` with rbind() on every iteration.
ids <- unique(df$ID)
pieces <- vector("list", length(ids))
for (k in seq_along(ids)) {
  subdf <- df[which(df$ID == ids[k]), ]
  first_unc <- which(subdf$Value == 0)[1L]
  # Positions are taken within subdf itself, so the index can be used on
  # subdf directly (no offset correction needed).
  first_cens <- which(subdf$Value == 1 &
                        seq_len(nrow(subdf)) > first_unc)[1L]
  keep <- c(first_unc, first_cens)
  # Skip rows that do not exist for this ID rather than indexing with NA.
  pieces[[k]] <- subdf[keep[!is.na(keep)], ]
}
result <- do.call(rbind, pieces)

假设所需的观察结果总是存在。

4

您可以用标准的"拆分-应用-合并"(split-apply-combine)策略做到这一点:

# Split d by ID, pick two rows from each piece (the first "uncensored" row and
# the first "censored" row positioned after it), and stack the pieces again.
do.call(
  rbind,
  lapply(split(d, d$ID), function(grp) {
    pos <- seq_len(nrow(grp))
    first_unc <- which(grp$Value == "uncensored")[1L]
    next_cens <- which(grp$Value == "censored" & pos > first_unc)[1L]
    grp[c(first_unc, next_cens), ]
  })
)

结果:

 ID  Value Time 
1.3 1 uncensored 3 
1.5 1 censored 5 
2.8 2 uncensored 2 
2.11 2 censored 5 
+1

这是一个不错的选择,你也可以使用 `setDT(d)[, list({u1 <- which(Value == "uncensored")[1]; c1 <- which((Value == "censored") & seq_along(Value) > u1)[1]; Value = c(u1, c1)}), by = ID]` –

0

当您想找出某一列的取值相对于上一行发生变化的那些行时,可以使用以下方法(即使该列是多水平的分类列或数值列也同样适用)

# Read a table from the clipboard and keep the rows whose second column
# differs from the row directly above (the first row is never kept).
df <- read.table("clipboard")
col2 <- df[[2L]]
# Map every value (NA included) to the index of its first occurrence, so that
# "same as previous row" becomes a zero difference between adjacent codes.
# This matches duplicated()'s notion of equality on each adjacent pair.
codes <- match(col2, unique(col2))
# a[i] is TRUE when row i repeats the value of row i - 1; a[1] is TRUE by
# definition. Computed in one vectorised step, which also handles tables with
# zero or one row (where 1:(nrow(df) - 1) would count backwards).
a <- c(TRUE, diff(codes) == 0L)
df[!a, ]
1

既然你提到Value是一个二元变量,这里是另一个使用dplyr的思路

library(dplyr) 
df %>%
  group_by(ID) %>%
  # Recode the labels as binary: 1 for censored, 0 for uncensored.
  mutate(Value = ifelse(Value == "censored", 1, 0)) %>%
  # Keep at most two rows per ID: the first uncensored row (first 0) and the
  # first censored row (1) positioned after it. Comparing positions, rather
  # than testing lag(Value) == 0, avoids returning every later censored row
  # that happens to follow an uncensored one.
  filter(
    row_number() == match(0, Value) |
      row_number() == which(Value == 1 & row_number() > match(0, Value))[1L]
  )

其中给出:

#Source: local data frame [4 x 3] 
#Groups: ID 
# 
# ID Value Time 
#1 1  0 3 
#2 1  1 5 
#3 2  0 2 
#4 2  1 5 
相关问题