2012-10-09 70 views
1

我有一个数据框,其中包含课程ID、学生ID、周数(第一周为1,第二周为2,依此类推),以及每个用户每周在每门课程中的一些活动信息。如果教师在某一周对该学生进行了“干预”,那么数据框的最后两列为非NA,否则为NA。我想比较每个学生在第一次干预之前与之后的行为。数据框中的特定聚合

所以我想创建一列'HasIntervened':当周数小于该学生第一次干预所在的周时为FALSE,当周数大于或等于该周时为TRUE。但创建这个看似简单的列却让我费尽了周折。我相当肯定aggregate是可行的办法,但我就是没能以正确的方式思考这个问题。

下面是数据框前60行的dput输出(5名学生的数据):

structure(list(UserID = c(4188948L, 4188948L, 4188948L, 4188948L, 
4188948L, 4188948L, 4735684L, 4735684L, 4735684L, 4735684L, 4735684L, 
4735684L, 6292486L, 6292486L, 6292486L, 6292486L, 6292486L, 6292486L, 
6469671L, 6469671L, 6469671L, 6469671L, 6469671L, 6469671L, 6538263L, 
6538263L, 6538263L, 6538263L, 6538263L, 6538263L, 6621258L, 6621258L, 
6621258L, 6621258L, 6621258L, 6621258L, 6891869L, 6891869L, 6891869L, 
6891869L, 6891869L, 6891869L, 6891869L, 6891869L, 6891869L, 6891869L, 
6891869L, 6891869L, 6978155L, 6978155L, 6978155L, 6978155L, 6978155L, 
6978155L, 7195846L, 7195846L, 7195846L, 7195846L, 7195846L, 7195846L 
), CourseID = c(6567871L, 6567871L, 6567871L, 6567871L, 6567871L, 
6567871L, 6567168L, 6567168L, 6567168L, 6567168L, 6567168L, 6567168L, 
6567864L, 6567864L, 6567864L, 6567864L, 6567864L, 6567864L, 6567159L, 
6567159L, 6567159L, 6567159L, 6567159L, 6567159L, 6567162L, 6567162L, 
6567162L, 6567162L, 6567162L, 6567162L, 6567853L, 6567853L, 6567853L, 
6567853L, 6567853L, 6567853L, 6567159L, 6567159L, 6567159L, 6567159L, 
6567159L, 6567159L, 6567864L, 6567864L, 6567864L, 6567864L, 6567864L, 
6567864L, 6567873L, 6567873L, 6567873L, 6567873L, 6567873L, 6567873L, 
6567859L, 6567859L, 6567859L, 6567859L, 6567859L, 6567859L), 
WeekInCourse = c(1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 
3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 
4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 
5, 6, 1, 2, 3, 4, 5, 6), WeekPostCount = c(1L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 5L, 3L, 4L, 3L, 3L, 0L, 4L, 
0L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 2L, 
2L, 0L, 0L, 4L, 0L, 3L, 0L, 3L, 0L, 0L, 0L), WeekLoginCount = c(2L, 
1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 4L, 4L, 1L, 0L, 
0L, 0L, 3L, 3L, 1L, 0L, 0L, 0L, 2L, 1L, 0L, 0L, 0L, 0L, 1L, 
1L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 0L, 0L, 4L, 1L, 0L, 0L, 
0L, 0L, 3L, 3L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L), 
WeekPointsPercent = c(0, 0, 0, 0, 0, 0, 0, 0.185714285714286, 
0.375, 0.2, 0, 0, 0, 0.85, 0.7, 0.4, 0.7, 0.7, 0, 0.857142857142857, 
0.35, 0, 0, 0.712765957446808, 0, 1, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, 0, 0, 0, 0, 0, 0.5, 0.5, 
0, 0, 0.7, 1, 1, 0.375, 0.723076923076923, 0, 0.738636363636364 
), CumulativePointsPercent = c(0, 0, 0, 0, 0, 0, 0, 0.185714285714286, 
0.254545454545455, 0.235294117647059, 0.235294117647059, 
0.10958904109589, 0, 0.85, 0.8, 0.533333333333333, 0.55, 
0.563636363636364, 0, 0.857142857142857, 0.623076923076923, 
0.476470588235294, 0.476470588235294, 0.600558659217877, 
0, 1, 0.0666666666666667, 0.0666666666666667, 0.0461538461538462, 
0.0461538461538462, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0.25, 0.166666666666667, 0.0555555555555556, 0.05, 0.0454545454545455, 
0, 0.5, 0.5, 0.166666666666667, 0.15, 0.2, 1, 1, 0.615384615384615, 
0.669230769230769, 0.621428571428571, 0.666666666666667), 
RiskEstimate = c(0.627717786405816, 0.986868933315635, 0.986687587608184, 
0.993909863003438, 0.997123961252086, 0.995862152216296, 
0.914011371723269, 0.925359536086114, 0.902625588346349, 
0.956922151061089, 0.977244888475535, 0.975006380719003, 
0.215420992232115, 0.174623555825523, 0.241380495376484, 
0.699712463799006, 0.692014530298594, 0.697966901130338, 
0.765071150059092, 0.763071307309743, 0.767261726128078, 
0.835918063362269, 0.854949153314029, 0.805318343915736, 
0.792873572656207, 0.790581615380765, 0.82622599277251, 0.9330287497742, 
0.965763061363497, 0.951226314109191, 0.851355921713566, 
0.991081300877175, 0.989671569185701, 0.995402298000919, 
0.997671718747865, 0.996593366142757, 0.738690043138604, 
0.865412845144037, 0.831369850200541, 0.93845410260835, 0.968400480533385, 
0.9533338828382, 0.624930735381371, 0.981915016747928, 0.985037736895337, 
0.994680902796769, 0.996907588471311, 0.995388109404559, 
0.887995464972052, 0.970620002831325, 0.97136665697772, 0.992618626388727, 
0.99543249839328, 0.992149889176406, 0.923802324633255, 0.984464950934932, 
0.978726967214146, 0.971473084822075, 0.97886220009245, 0.979311013989987 
), RiskBin = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L), InterventionID = c(NA, 26L, NA, NA, NA, 
NA, NA, NA, NA, NA, 50L, NA, NA, NA, NA, NA, 73L, NA, NA, 
NA, NA, NA, 56L, NA, NA, NA, NA, 46L, NA, NA, NA, 33L, NA, 
NA, NA, NA, 15L, NA, NA, 43L, 53L, NA, NA, NA, NA, NA, 71L, 
NA, NA, NA, NA, NA, 78L, NA, NA, 36L, NA, NA, 80L, NA), InterventionType = structure(c(NA, 
2L, NA, NA, NA, NA, NA, NA, NA, NA, 3L, NA, NA, NA, NA, NA, 
2L, NA, NA, NA, NA, NA, 3L, NA, NA, NA, NA, 3L, NA, NA, NA, 
2L, NA, NA, NA, NA, 3L, NA, NA, 3L, 2L, NA, NA, NA, NA, NA, 
2L, NA, NA, NA, NA, NA, 2L, NA, NA, 3L, NA, NA, 3L, NA), .Label = c("", 
"At-Risk Form", "Email", "Other", "Phone"), class = "factor")), .Names = c("UserID", 
"CourseID", "WeekInCourse", "WeekPostCount", "WeekLoginCount", 
"WeekPointsPercent", "CumulativePointsPercent", "RiskEstimate", 
"RiskBin", "InterventionID", "InterventionType"), row.names = c(NA, 
60L), class = "data.frame") 
+0

我很确定不需要aggregate,你应该改用'ave'。 –

回答

2
# Mark each row TRUE once the student has had at least one intervention in
# that course, FALSE before. Within every UserID/CourseID group the running
# count of non-NA InterventionID values is taken in the existing row order,
# so a count of zero (FALSE) means "no intervention seen yet".
courses$HasIntervened <- as.logical(
  ave(
    courses$InterventionID,
    courses$UserID, courses$CourseID, # grouping factors
    FUN = function(ids) cumsum(!is.na(ids))
  )
)
+0

我很确定这将是我的解决方案,但是'with'部分是什么?我正在查看它的文档,我不明白为什么它在这里有帮助。 –

+1

它允许您把列名当作变量使用,使代码更具可读性。否则你就得写:'ave(courses$InterventionID, courses$UserID, courses$CourseID, FUN = function(x) {...})'。从技术上讲,您是在创建一个“环境”,其中各列是具名对象,而在.GlobalEnv中,这些值无法按名称访问。 –

+0

啊,好吧……这有点像临时调用'attach'。非常感谢!我原本就希望有某个我没听说过、但应该使用的函数,看来确实如此。 –

1

试试这个:

# Add `HasIntervened` to `your.data`: TRUE for rows whose week is at or after
# the first intervention week of that student in that course, FALSE otherwise.
#
# Work on a copy so `your.data$WeekInCourse` keeps its real values.
foo <- your.data
# Weeks without an intervention become Inf so they can never be the minimum.
# A student/course with no intervention at all ends up with FirstW = Inf, and
# every one of its rows compares FALSE below.
foo$WeekInCourse[is.na(foo$InterventionID)] <- Inf
# Group on student AND course: the same student can appear in several courses
# and each enrolment has its own first intervention. `drop = TRUE` removes
# unused student/course combinations so `min` never sees an empty group.
# `ave` returns one value per input row, in the original row order, so the
# comparison below stays row-aligned (merge() re-sorts rows and would not).
foo$FirstW <- ave(
  foo$WeekInCourse,
  interaction(foo$UserID, foo$CourseID, drop = TRUE),
  FUN = min
)
your.data$HasIntervened <- your.data$WeekInCourse >= foo$FirstW
0

这应该工作:

library(plyr)
# Return `df` with a `HasIntervened` column: within each student/course group,
# FALSE before the first row that carries an intervention and TRUE from that
# row onward. Groups without any intervention are FALSE throughout.
# Rows are assumed to already be in week order within each group.
ddply(df, .(UserID, CourseID), function(x) {
  # Running count of non-NA InterventionID values: it becomes positive at the
  # first intervention row (regardless of the ID's numeric value) and stays
  # positive afterwards. An all-NA group yields all zeros, hence all FALSE.
  x$HasIntervened <- cumsum(!is.na(x$InterventionID)) > 0
  x
})
1

一种data.table方法,兼顾代码的优雅与内存效率

library(data.table)
# The source data frame is expected to be available as DF.
DT <- as.data.table(DF)
# Keying sorts the table, so rows end up in week order inside every
# user/course combination before the running count is taken.
setkeyv(DT, c("UserID", "CourseID", "WeekInCourse"))
# Add the flag by reference: within each course/user group it is TRUE from
# the first row with a non-NA InterventionID onward.
DT[, hasIntervened := cumsum(!is.na(InterventionID)) > 0,
   by = c("CourseID", "UserID")]

data.table语法避免了with