2017-04-23 119 views
2

伪数据组:(从我的数据集差异ITEM_CODE是串在我的情况)优化for循环中的R

in_cluster <- data.frame(item_code = c(1:500)) 
in_cluster$cluster <- 
     sample(5, size = nrow(in_cluster), replace = TRUE) 
real_sales <- data.frame(item_code = numeric(0), sales = numeric(0)) 
real_sales <- 
    data.frame(
      item_code = sample(500, size = 100000, replace = TRUE), 
      sales = sample(500, size = 100000, replace = TRUE) 
    ) 

mean_trajectory <- data.frame(sales = c(1:52)) 
mean_trajectory$sales <- sample(500, size = 52, replace = TRUE) 
training_df <- data.frame(
     LTF_t_minus_1 = numeric(0), 
     LTF_t = numeric(0), 
     LTF_t_plus_1 = numeric(0), 
     RS_t_minus_1 = numeric(0), 
     RS_t = numeric(0), 
     STF_t_plus_1 = numeric(0) 
) 
training_df[nrow(training_df) + 1, ] <- 
     c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19) # week 0 

week = 2 

我有R中一个简单的函数中,所有我做的是:

system.time({ 
    for (r in 1:nrow(in_cluster)) { 
      item <- in_cluster[r,] 
      sale_row <- 
        dplyr::filter(real_sales, item_code == item$item_code) 
      if (nrow(sale_row) > 2) { 
        new_df <- data.frame(
          LTF_t_minus_1 = mean_trajectory$sales[[week - 1]], 
          LTF_t = mean_trajectory$sales[[week]], 
          LTF_t_plus_1 = mean_trajectory$sales[[week + 1]], 
          RS_t_minus_1 = sale_row$sales[[week - 1]], 
          RS_t = sale_row$sales[[week]], 
          STF_t_plus_1 = sale_row$sales[[week + 1]] 
        ) 
        training_df <- 
          bind_rows(training_df, new_df) 
      } 
    } 
}) 

我很新的R和发现这个很奇怪看多小时实在是又多久(421.59 seconds通过500行循环)它通过数据帧采取循环。

EDIT_IMPORTANT:不过,对于上面给出的伪数据组时采取了1.10 seconds得到输出>可这是因为具有ITEM_CODE字符串?是否需要很多时间来处理字符串item_code。 (我没有使用假人数据集的字符串,因为我不知道该怎么对item_code 500个独特的字符串in_cluster,并且具有相同的字符串作为real_salesitem_code

我通过建议的方式少的其他文章阅读来优化将R代码,并使用用于bind_rows代替rbind或:使用bind_rows

training_df[nrow(training_df) + 1,] <- 
    c(mean_trajectory$sales[[week-1]], mean_trajectory$sales[[week]], mean_trajectory$sales[[week+1]], sale_row$sales[[week-1]], sale_row$sales[[week]], sale_row$sales[[week+1]]) 

似乎已经通过36秒通过500行数据帧的循环时改善了性能in_cluster

在这种情况下可以使用lapply吗?我想下面的代码,并得到了一个错误:

Error in filter_impl(.data, dots) : $ operator is invalid for atomic vectors

myfun <- function(item, sales, mean_trajectory, week) { 
sale_row<- filter(sales, item_code == item$item_code) 
data.frame(
    LTF_t_minus_1 = mean_trajectory$sales[[week-1]], 
    LTF_t = mean_trajectory$sales[[week]], 
    LTF_t_plus_1 = mean_trajectory$sales[[week+1]], 
    RS_t_minus_1 = sale_row$sales[[week-1]], 
    RS_t = sale_row$sales[[week]], 
    STF_t_plus_1 = sale_row$sales[[week+1]]) 
} 

system.time({ 
     lapply(in_cluster, myfun, sales= sales, mean_trajectory = mean_trajectory) %>% bind_rows() 
}) 

帮助lapply将不胜感激,但是我的主要目标是加快循环。

+1

请包括一个[可重现的示例](http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example/5963610),这将使其他人更容易来帮你。 – Jaap

+0

@Jaap当然,是在上面。不过谢谢。 –

回答

5

好的,所以在你的代码中有很多不好的做法。

  1. 你每行操作
  2. 你正在创建2(!),每行(很贵)新数据帧
  3. 你成长在一个循环对象)training_df <- bind_rows(training_df, new_df)保持在每个迭代增长,而运行一个相当昂贵的操作(bind_rows))
  4. 你一次又一次地运行相同的操作,当你可以只运行一次(为什么你运行mean_trajectory$sales[[week-1]]和每行AL,而mean_trajectory与循环无关?你可以之后分配)。
  5. 而这样的例子不胜枚举...

我建议的替代简单data.table解决方案,将执行好得多。这个想法是首先在in_clusterreal_sales之间进行二进制连接(并且在连接时运行所有操作而不创建额外的数据帧并绑定它们)。然后,只运行一次所有mean_trajectory相关行。(我忽略了training_df[nrow(training_df) + 1, ] <- c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19)初始化,因为它是无关紧要这里,你可以在事后只用添加和rbind

library(data.table) #v1.10.4 
## First step 
res <- 
    setDT(real_sales)[setDT(in_cluster), # binary join 
        if(.N > 2) .(RS_t_minus_1 = sales[week - 1], # The stuff you want to do 
           RS_t = sales[week],    # by condition 
           STF_t_plus_1 = sales[week + 1]), 
        on = "item_code", # The join key 
        by = .EACHI] # Do the operations per each join 

## Second step (run the `mean_trajectory` only once) 
res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1], 
      LTF_t = mean_trajectory$sales[week], 
      LTF_t_plus_1 = mean_trajectory$sales[week + 1])] 

一些性能测试:

### Creating your data sets 
set.seed(123) 
N <- 1e5 
N2 <- 5e7 

in_cluster <- data.frame(item_code = c(1:N)) 

real_sales <- 
    data.frame(
    item_code = sample(N, size = N2, replace = TRUE), 
    sales = sample(N, size = N2, replace = TRUE) 
) 

mean_trajectory <- data.frame(sales = sample(N, size = 25, replace = TRUE)) 

training_df <- data.frame(
    LTF_t_minus_1 = numeric(0), 
    LTF_t = numeric(0), 
    LTF_t_plus_1 = numeric(0), 
    RS_t_minus_1 = numeric(0), 
    RS_t = numeric(0), 
    STF_t_plus_1 = numeric(0) 
) 
week = 2 

############################### 
################# Your solution 
system.time({ 
    for (r in 1:nrow(in_cluster)) { 
    item <- in_cluster[r,, drop = FALSE] 
    sale_row <- 
     dplyr::filter(real_sales, item_code == item$item_code) 
    if (nrow(sale_row) > 2) { 
     new_df <- data.frame(
     LTF_t_minus_1 = mean_trajectory$sales[[week - 1]], 
     LTF_t = mean_trajectory$sales[[week]], 
     LTF_t_plus_1 = mean_trajectory$sales[[week + 1]], 
     RS_t_minus_1 = sale_row$sales[[week - 1]], 
     RS_t = sale_row$sales[[week]], 
     STF_t_plus_1 = sale_row$sales[[week + 1]] 
    ) 
     training_df <- 
     bind_rows(training_df, new_df) 
    } 
    } 
}) 
### Ran forever- I've killed it after half an hour 


###################### 
########## My solution 
library(data.table) 
system.time({ 
res <- 
    setDT(real_sales)[setDT(in_cluster), 
        if(.N > 2) .(RS_t_minus_1 = sales[week - 1], 
           RS_t = sales[week], 
           STF_t_plus_1 = sales[week + 1]), 
        on = "item_code", 
        by = .EACHI] 
res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1], 
      LTF_t = mean_trajectory$sales[week], 
      LTF_t_plus_1 = mean_trajectory$sales[week + 1])] 
}) 

# user system elapsed 
# 2.42 0.05 2.47 

所以对于50MM行data.table解决方案跑了约2秒,而你的解决方案无休止地运行,直到我杀死它(半小时后)。