2016-07-29 51 views
3

问题:data.table——按组筛选最近 24 小时内的观测值。假设我有一个如下所示的 data.table:

# Example data: three groups, each with three ISO-8601 timestamp strings that
# carry an explicit UTC offset.
dt <- data.table(
  group = rep(c(1, 2, 3), each = 3),
  time = c(
    "2016-03-09T08:31:00-05:00",
    "2016-03-08T11:31:00-05:00",
    "2016-03-06T08:31:00-05:00",
    "2016-04-04T23:28:00-04:00",
    "2016-04-10T23:28:00-04:00",
    "2016-04-09T23:28:00-04:00",
    "2016-05-11T19:52:00-04:00",
    "2016-05-10T20:52:00-04:00",
    "2016-04-11T19:52:00-04:00"
  )
)

dt 
    group      time 
1:  1 2016-03-09T08:31:00-05:00 
2:  1 2016-03-08T11:31:00-05:00 
3:  1 2016-03-06T08:31:00-05:00 
4:  2 2016-04-04T23:28:00-04:00 
5:  2 2016-04-10T23:28:00-04:00 
6:  2 2016-04-09T23:28:00-04:00 
7:  3 2016-05-11T19:52:00-04:00 
8:  3 2016-05-10T20:52:00-04:00 
9:  3 2016-04-11T19:52:00-04:00 

对于此 data.table 中的每个组,我只想保留距该组最新时间 24 小时以内的观测值。我为此写了一个比较笨拙的解决方案,但在大型数据集上它的速度达不到我的要求。

library(lubridate)

# Parse the timestamp strings in place, replacing the character column.
dt[, time := ymd_hms(time)]

# Attach each group's latest time to every one of its rows, then keep only the
# rows strictly newer than one day before that latest time.
dt[, .(mostRecent = max(time), time), by = group
   ][time > (mostRecent - days(1)), .(group, time)]

    group    time 
1:  1 2016-03-09 13:31:00 
2:  1 2016-03-08 16:31:00 
3:  2 2016-04-11 03:28:00 
4:  3 2016-05-11 23:52:00 
5:  3 2016-05-11 00:52:00 

有没有人有如何完成更优雅/更快的提示?

回答

4

一个简单的解决方案是为每个组创建一个截止时间(假定 time 列已经转换为日期时间类型):

# Add a per-group cutoff (latest time minus 24 hours, in seconds) by reference,
# then keep the rows strictly after their group's cutoff.
dt[, cutoff_time := max(time) - 24*60*60, by = group][
  time > cutoff_time
]

编辑:

关于“GForce 优化的 max”的评论让我很好奇,所以为了比较速度,我创建了一些更大的假数据。需要注意的是,integer 类型与 max 和 >= 都能很好地配合:

# Load the benchmark dependencies. library() stops with an error when a package
# is missing, whereas require() only returns FALSE and lets the script fail
# later with a less helpful message.
library(data.table)

library(microbenchmark)

# Number of rows and number of groups in the simulated data.
N <- 100000
N_g <- 100

# Every minute between the two end points, as POSIXct ...
all_times <- seq(from = as.POSIXct("2016-01-01 10:00:00"),
                 to = as.POSIXct("2016-06-30 10:00:00"),
                 by = 60)

# ... and the same instants as integer seconds.
all_times_int <- as.integer(all_times)

# Draw N random positions so `time` and `time_int` describe the same instants.
idx <- sample(seq.int(length(all_times)), N, replace = TRUE)

dt <- data.table(group = sample(seq.int(N_g), N, replace = TRUE),
                 time = all_times[idx],
                 time_int = all_times_int[idx])

# Keep, per group, the rows whose `time` is at or after the group's latest
# `time` minus 24 hours. Adds (or overwrites) the `cutoff_time` column on `x`
# by reference and returns a table with the `group` and `time` columns.
f1a <- function(x) {
    x[, cutoff_time := max(time) - 24*60*60, by = group]
    x[time >= cutoff_time, .(group, time)]
}

# Same filter as the POSIXct variant, but the cutoff and the comparison use the
# `time_int` column. Adds (or overwrites) `cutoff_time` on `x` by reference and
# returns a table with the `group` and `time` columns.
f1b <- function(x) {
    x[, cutoff_time := max(time_int) - 24*60*60, by = group]
    x[time_int >= cutoff_time, .(group, time)]
}

# Build a one-row-per-group table of cutoffs (latest `time` minus 24 hours),
# then rolling-join `x` onto it and return the `group` and `time` columns of the
# rows that found a cutoff.
f2 <- function(x) {
    cutoffs <- x[, .(time = max(time)), by = group]
    cutoffs[, time := time - 24*60*60]
    kept <- cutoffs[x, on = c("group", "time"), roll = TRUE, nomatch = 0]
    kept[, .(group, time)]
}

# Time the three implementations on the same table.
microbenchmark(f1a(dt), f1b(dt), f2(dt))

Unit: milliseconds 
    expr  min  lq  mean median  uq  max neval 
f1a(dt) 9.842106 10.593243 11.593148 11.62311 12.478853 14.335338 100 
f1b(dt) 3.391178 3.763598 4.403264 4.00142 5.018182 8.335717 100 
    f2(dt) 14.422669 15.701397 17.090674 16.56990 17.695653 52.926897 100 

# Check that the three implementations return exactly the same table.
identical(f1a(dt), f1b(dt)) # TRUE 
identical(f1a(dt), f2(dt)) # TRUE 

编辑2:另一组测试,N = 1,000,000 行,N_g = 10,000 个组:

> microbenchmark(f1a(dt), 
+    f1b(dt), 
+    f2(dt), 
+    times = 10) 
Unit: milliseconds 
    expr  min  lq  mean median  uq  max neval 
f1a(dt) 634.91473 647.5662 670.74597 663.28238 694.29595 728.2481 10 
f1b(dt) 64.61488 67.3692 76.68925 68.42335 72.36862 113.1407 10 
    f2(dt) 205.67688 208.6491 229.65610 213.59476 249.16703 278.7713 10 

> microbenchmark(f1a(dt), 
+    f1b(dt), 
+    f2(dt), 
+    times = 10) 
Unit: milliseconds 
    expr  min  lq  mean median  uq  max neval 
f1a(dt) 620.11090 624.33587 645.0220 642.13648 657.74347 697.27674 10 
f1b(dt) 64.80214 67.43851 67.9140 67.99647 68.63552 69.74466 10 
    f2(dt) 198.39200 199.56088 209.6908 204.60183 216.23255 241.76792 10 

> microbenchmark(f1a(dt), 
+    f1b(dt), 
+    f2(dt), 
+    times = 10) 
Unit: milliseconds 
    expr  min  lq  mean median  uq  max neval 
f1a(dt) 619.2903 645.22617 656.58883 660.99508 664.82678 682.7618 10 
f1b(dt) 63.2454 67.31781 72.10255 68.19679 71.91441 106.7493 10 
    f2(dt) 195.9335 210.06171 222.19868 215.75979 241.74100 245.9022 10 
+0

我对 all_times_int 感到困惑。它们在某种意义上仍然是时间吗? – Frank

+0

据我所知,只需从某个起点开始计算秒数,就可以把时间(以秒为单位)转换为 `integer`。如果你只关心时间差,直接使用 `integer` 即可,因为你并不关心实际的 `YYYY-mm-dd HH:MM:SS` 表示……使用 `integer` 可能比使用 `POSIX` 类型更快。也许更有经验的人可以谈谈 `data.table::IDateTime`…… – sbstn

+0

好的,谢谢。听起来这样会丢失毫秒精度以及易读的表示形式。 – Frank

4

首先,创建一个阈值表:

# One row per group: the latest time, shifted back by 24 hours (in seconds).
# The trailing [] lets the table print normally after the := step.
thresh_dt <- dt[
  , .(time = max(time)), by = group
][
  , time := time - 24*60*60
][]

这里把求 max 与减去一天的秒数分成两步进行,是为了利用 “GForce” 优化的 max。另请参阅 ?datatable.optimize

接下来,做一个滚动或非相等连接:

# Join dt onto the thresholds by group and time. With roll=TRUE a dt row
# matches when its group has a threshold at or before the row's time
# (presumably data.table's last-observation-carried-forward roll -- see
# ?data.table); nomatch=0 drops the dt rows that find no match.
thresh_dt[dt, on=c("group", "time"), roll=TRUE, nomatch=0] 

# or, on data.table 1.9.7+ 
# Non-equi form of the same filter: threshold time <= row time within group.
thresh_dt[dt, on=.(group, time <= time), nomatch=0] 

    group    time 
1:  1 2016-03-09 13:31:00 
2:  1 2016-03-08 16:31:00 
3:  2 2016-04-11 03:28:00 
4:  2 2016-04-10 03:28:00 
5:  3 2016-05-11 23:52:00 
6:  3 2016-05-11 00:52:00 

基准测试。只有当组的数量足够多时,GForce max 和滚动连接的优势才会显现出来。我的示例数据扩展了 @sbstn 的数据,使组数成为一个参数:

# Number of rows and number of groups for the larger benchmark.
N <- 5e6
ng <- 1e5

# Every minute between the two end points, as POSIXct and as integer seconds.
all_times <- seq(
  from = as.POSIXct("2016-01-01 10:00:00"),
  to = as.POSIXct("2016-06-30 10:00:00"),
  by = 60
)
all_times_int <- as.integer(all_times)

# Draw N random positions so `time` and `time_int` describe the same instants.
idx <- sample(seq_along(all_times), N, replace = TRUE)
dt <- data.table(
  group = sample(ng, N, replace = TRUE),
  time = all_times[idx],
  time_int = all_times_int[idx]
)

# sbstn, no gmax 
# Times the single-step version: the per-group cutoff (max(time) minus one day
# in seconds) is computed inside one grouped := call, then the rows at or
# after their cutoff are selected. The selected rows are discarded; only the
# elapsed time is of interest.
system.time({ 
    dt[, cutoff_time := max(time) - 24*60*60, by = group] 
    dt[time >= cutoff_time] 
}) 
# user system elapsed 
# 8.50 0.01 8.47 

# sbstn, with gmax
# max() is computed on its own in the grouped step (intended to let
# data.table's GForce optimisation handle it); the 24-hour offset is then
# subtracted in a separate, ungrouped step.
system.time({
    dt[, maxtime := max(time), by = group][, cutoff_time := maxtime - 24*60*60]
    # Filter against the cutoff. Comparing with maxtime would keep only the
    # rows equal to the group maximum and leave cutoff_time unused.
    dt[time >= cutoff_time]
})
# user system elapsed 
# 4.98 0.01 4.99 

# gmax and roll 
# Times the two-table version: build one cutoff row per group (max(time) in j
# by itself, then minus one day in seconds), and rolling-join dt onto that
# table, keeping only the group and time columns of the matched rows.
system.time({ 
    thresh_dt = dt[, .(time = max(time)), by=group][, time := time - 24*60*60] 
    thresh_dt[dt, on=c("group", "time"), roll=TRUE, nomatch=0][, list(group, time)] 
}) 
# user system elapsed 
# 1.29 0.06 1.36 
# (Caveat: I didn't verify that these results match.) 

我的答案需要对行进行两次分组(一次用于计算最大值,另一次用于与原始数据连接)。Clayton Stanley 的答案把它缩减为一次分组操作,因此同样很快(至少我认为是这个原因):

# Times the sort-based approach: order so that each group's latest time comes
# first, flag that first row per group (`head`), join it back onto every row of
# the group as `rTime`, and keep rows strictly newer than rTime minus 24 hours.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
system.time(dt[order(group, -time)
    ][, groupP := shift(group, type='lag')
    ][, head := is.na(groupP) | group != groupP
    ][, copy(.SD)[.SD[head == TRUE], rTime := i.time, on=c(group='group')]
    ][time > (rTime - 24*60*60)
    ][, .(group, time)
    ][order(group, -time)
    ])
# user system elapsed 
# 1.32 0.25 1.14 
3

瓶颈很可能是按组(by)计算 max(*)。如果是这样的话:

# Avoid a grouped max(): order so that each group's latest time comes first,
# flag that first row per group (`head`), join it back onto every row of the
# group as `rTime`, and keep rows strictly newer than rTime minus 24 hours.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
dt[order(group, -time)
    ][, groupP := shift(group, type='lag')
    ][, head := is.na(groupP) | group != groupP
    ][, copy(.SD)[.SD[head == TRUE], rTime := i.time, on=c(group='group')]
    ][time > (rTime - 24*60*60)
    ][, .(group, time)
    ][order(group, -time)
    ]

    group    time 
1:  1 2016-03-09 13:31:00 
2:  1 2016-03-08 16:31:00 
3:  2 2016-04-11 03:28:00 
4:  3 2016-05-11 23:52:00 
5:  3 2016-05-11 00:52:00 
> 
+1

供参考:当 `max` 是 `j` 中唯一的内容时,会有一些额外的优化,可以通过 `dt[, max(time), by = group, verbose = TRUE]` 看到 – Frank

+0

@Frank 我试了你的测试代码,发现使用 days(*) 做减法才是这段代码的瓶颈。我很好奇这个方案在你的机器上与你测试的其他方案相比表现如何,因为它们可能很接近。 –

+1

1.14秒,所以比滚动快一点,是的。 (非Equi加入,未在上述基准中显示,与滚动类似。) – Frank