只是为了好玩我还创建了一个RCPP功能(比@JosephWood快不了多少):
########### original function
#(modified to return most frequent value instead of index)
agg.fun <- function(x,...){
if(sum(x)==0){
return(NA)
} else {
as.integer(names(which.max(table(x))))
}
}
########### @JosephWood function
fasterAgg.Fun <- function(x,...) {
myRle.Alt <- function (x1) {
n1 <- length(x1)
y1 <- x1[-1L] != x1[-n1]
i <- c(which(y1), n1)
x1[i][which.max(diff(c(0L, i)))]
}
if (sum(x)==0) {
return(NA)
} else {
myRle.Alt(sort(x, method="quick"))
}
}
########### Rcpp function
library(Rcpp)
library(inline)
aggrRcpp <- cxxfunction(signature(values='integer'), '
Rcpp::IntegerVector v(clone(values));
std::sort(v.begin(),v.end());
int n = v.size();
double sum = 0;
int currentValue = 0, currentCount = 0, maxValue = 0, maxCount = 0;
for(int i=0; i < n; i++) {
int value = v[i];
sum += value;
if(i==0 || currentValue != value){
if(currentCount > maxCount){
maxCount = currentCount;
maxValue = currentValue;
}
currentValue = value;
currentCount = 0;
}else{
currentCount++;
}
}
if(sum == 0){
return Rcpp::IntegerVector::create(NA_INTEGER);
}
if(currentCount > maxCount){
maxCount = currentCount;
maxValue = currentValue;
}
return wrap(maxValue) ;
', plugin="Rcpp", verbose=FALSE,
includes='')
# wrap it to support "..." argument
aggrRcppW <- function(x,...)aggrRcpp(x);
基准:
require(raster)
set.seed(123)
x <- matrix(rpois(10^8, 2), 10000)
a <- raster(x)
system.time(a1<-aggregate(a,fact=100,fun=agg.fun))
# user system elapsed
# 35.13 0.44 35.87
system.time(a2<-aggregate(a,fact=100,fun=fasterAgg.Fun))
# user system elapsed
# 8.20 0.34 8.59
system.time(a3<-aggregate(a,fact=100,fun=aggrRcppW))
# user system elapsed
# 5.77 0.39 6.22
########### all equal ?
all(TRUE,all.equal(a1,a2),all.equal(a2,a3))
# > [1] TRUE
请注意,'which.max(table(x))'返回具有最大重复值的索引,而不是值。在你的情况下,大多数情况下,索引将与值重合,但要确保具有应该使用'as.numeric(names(which.max(table(x))))'...的值... – digEmAll
就性能而言...嗯,我想你应该诉诸一些Rcpp代码块来获得一些东西... – digEmAll