R 3与自定义函数

library(data.table) 

# Example table (dput-style literal): ten rows mixing numeric and factor
# columns, each containing some NA values.
# Note: the `.Names` attribute below overrides the names given inside list(),
# so the third column ends up named "continuousFactorTwo" even though it is
# written as continuousNumericTwo and holds numeric data.
df <- structure(list(
continuousNumericOne = c(3.82495116149284, 0.915662542284416, 0.751001771620762, NA, NA, 8.07583989184169, 4.57303752008246, 4.02747047825306, 2.79953011697721, 4.28614794390785), 
catagoricalFactorOne = structure(c(3L, 3L, 3L, NA, 3L, NA, 2L, 2L, 2L, NA), .Label = c("blue", "green", "red"), class = "factor"), 
continuousNumericTwo = c(NA, NA, 2.58285715825289, -2.71316582700148, 3.95645652249594, 1.96862094118233, 4.96960533647993, 6.15199683070215, 3.98091405116921, NA), 
catagoricalFactorTwo = structure(c(3L, 3L, 3L, NA, 3L, 3L, 2L, 2L, 2L, 1L), .Label = c("blue", "orange", "red"), class = "factor"), 
continuousNumericThree = c(3.43332616062442, 2.21448227693603, 2.31889349781533, NA, NA, 3.57539465909581, 3.28076535012702, NA, 3.15063300766727, 2.9556632429251), 
continuousNumericFour = c(7.77131807052585, NA, 6.5830522592014, NA, 7.36003333388333, 8.25217350122047, 7.18282902739316, 8.60641407074177, 4.87689328481095, NA)), 
.Names = c("continuousNumericOne", "catagoricalFactorOne", "continuousFactorTwo", "catagoricalFactorTwo", "continuousNumericThree", "continuousNumericFour"), 
row.names = c(NA, -10L), 
# The class vector is what makes this object a data.table.
class = c("data.table", "data.frame")) 

> df 
    continuousNumericOne catagoricalFactorOne continuousFactorTwo catagoricalFactorTwo continuousNumericThree continuousNumericFour 
1:   3.8249512     red     NA     red    3.433326    7.771318 
2:   0.9156625     red     NA     red    2.214482     NA 
3:   0.7510018     red   2.582857     red    2.318893    6.583052 
4:     NA     NA   -2.713166     NA      NA     NA 
5:     NA     red   3.956457     red      NA    7.360033 
6:   8.0758399     NA   1.968621     red    3.575395    8.252174 
7:   4.5730375    green   4.969605    orange    3.280765    7.182829 
8:   4.0274705    green   6.151997    orange      NA    8.606414 
9:   2.7995301    green   3.980914    orange    3.150633    4.876893 
10:   4.2861479     NA     NA     blue    2.955663     NA

怎样编写一个自定义函数，按下面的规则逐列处理这个数据表？最佳做法是什么？

如果该列是分类型的（因子，factor），则用空字符串 "" 替换所有 NA。
如果该列是连续型的（数值，numeric），则希望能灵活地做进一步处理，例如先将数据缩放到 0 到 1 之间，然后在需要时把 NA 替换为某个值（比如 -1.1）。

我已经花了大量时间折腾各种列表，试图记录列名以及每一列是否为因子，并尝试用 apply 系列函数对不同类型的列应用不同的处理函数，但仍然没有成功。

如果有更好的方法，我洗耳恭听。

来源

2017-08-11 ben_says

我们可以创建一个函数：

# Clean every column of a data.table according to its type.
#
# Factor columns gain an extra "" level and have their NAs set to "".
# Numeric columns are standardised with scale() and then have their NAs
# set to -1.1. Columns of any other type are left untouched.
#
# Args:
#   dat: a data.table; its columns are overwritten with `:=`.
# Returns:
#   The processed data.table (the trailing `[]` makes it print when the
#   function is called at top level).
f1 <- function(dat) {
  # vapply() always yields a logical vector, so which() also works when `dat`
  # has no columns (sapply() returns an empty list there and which() fails).
  iCat <- which(vapply(dat, is.factor, logical(1)))
  iNum <- which(vapply(dat, is.numeric, logical(1)))

  # Skip a step outright when there is no column of that type to update.
  if (length(iCat) > 0L) {
    dat[, (iCat) := lapply(.SD, function(x) {
      levels(x) <- c(levels(x), "")
      x[is.na(x)] <- ""
      x
    }), .SDcols = iCat]
  }

  if (length(iNum) > 0L) {
    dat[, (iNum) := lapply(.SD, function(x) {
      # NOTE: scale() centres and divides by the standard deviation; it does
      # not map the values onto the [0, 1] range.
      x1 <- as.vector(scale(x))
      x1[is.na(x1)] <- -1.1
      x1
    }), .SDcols = iNum]
  }

  dat[]
}

# Run f1 on the example table; the printed result is shown in the comments below.
f1(df) 
#continuousNumericOne catagoricalFactorOne continuousFactorTwo 
# 1:   0.07257304     red   -1.1000000 
# 2:   -1.18235090     red   -1.1000000 
# 3:   -1.25337745     red   -0.1400258 
# 4:   -1.10000000        -1.9826003 
# 5:   -1.10000000     red   0.3378723 
# 6:   1.90619723        -0.3537288 
# 7:   0.39526068    green   0.6903636 
# 8:   0.15992990    green   1.1017373 
# 9:   -0.36974314    green   0.3463815 
#10:   0.27151063        -1.1000000 
# catagoricalFactorTwo continuousNumericThree continuousNumericFour 
# 1:     red    0.83246346   0.43436598 
# 2:     red   -1.45562130   -1.10000000 
# 3:     red   -1.25961447   -0.52487557 
# 4:         -1.10000000   -1.10000000 
# 5:     red   -1.10000000   0.10235154 
# 6:     red    1.09916272   0.82254218 
# 7:    orange    0.54606741   -0.04069872 
# 8:    orange   -1.10000000   1.10850704 
# 9:    orange    0.30177540   -1.90219245 
#10:     blue   -0.06423321   -1.10000000

来源

2017-08-11 17:05:41 akrun

首先，您的示例数据中有一个数值列的列名里带有 "Factor"（continuousFactorTwo）。因此我改用自己的示例数据。

library(data.table) 

# Reproducible example table: two numeric columns, two factor columns and one
# character column, ten rows each.
set.seed(1)
df <- local({
  # Draw the columns one at a time, in the same order as they appear in the
  # table, so the random number stream is consumed in that order.
  num1 <- runif(10)
  fac1 <- factor(sample(letters, 10))
  num2 <- runif(10)
  fac2 <- factor(sample(letters, 10))
  char <- sample(letters, 10)
  data.table(num1 = num1, fac1 = fac1, num2 = num2, fac2 = fac2, char = char)
})

至于这个任务，使用泛型函数的方法分派（S3 method dispatch）正好合适。

# S3 method of process() for factors.
# Placeholder implementation: returns a character vector holding "f" once
# per element of `x`. Replace with actual logic.
process.factor <- function(x) {
  rep("f", length.out = length(x))
}

# S3 method of process() for numeric vectors.
# Placeholder implementation: returns a character vector holding "n" once
# per element of `x`. Replace with actual logic.
process.numeric <- function(x) {
  n_elements <- length(x)
  rep.int("n", n_elements)
}

# Fallback S3 method of process() for classes without a dedicated method.
# Placeholder implementation: returns a character vector holding "d" once
# per element of `x`. Replace with actual logic for "other" classes.
process.default <- function(x) {
  rep("d", times = length(x))
}

# S3 generic: dispatches on the class of `x` to the matching process.<class>
# method, falling back to process.default when no specific method exists.
process <- function(x) UseMethod("process")

然后只需在 data.table 内对各列使用 lapply，得到的结果是另一个 data.table。

# Apply process() to every column; each column is dispatched on its own class.
df[, lapply(.SD, process)]
#  num1 fac1 num2 fac2 char 
# 1: n f n f d 
# 2: n f n f d 
# 3: n f n f d 
# 4: n f n f d 
# 5: n f n f d 
# 6: n f n f d 
# 7: n f n f d 
# 8: n f n f d 
# 9: n f n f d 
# 10: n f n f d

来源

2017-08-11 17:14:17

如果还有其他列既不是因子也不是数值，该怎么办？用 `process.default <- function(x) x` 吗？ – Frank

`process.default` 是一个兜底方法，用于处理那些没有对应 `process.foo` 方法的对象。您可以按需编写 `process.character`、`process.raw` 等方法，而 `process` 本身始终只是对 `UseMethod` 的调用。编辑：已在回答中加入 `process.default`，因为这才是正确的做法。 –

因此，您可能需要把占位实现替换为：`process.factor <- function(x) { levels(x) <- c(levels(x), ""); x[is.na(x)] <- ""; x }` 以及 `process.numeric <- function(x) { x <- as.vector(scale(x)); x[is.na(x)] <- -1.1; x }` – akrun

R 3与自定义函数

回答

相关问题