2016-10-04 42 views
0

对相关变量求和:我有一个包含 200 个变量的列表,我想把其中高度相关的变量加在一起求和。

假设这是我的数据

# Example data set: 16 observations of five integer-valued variables.
# APPLE/PEAR are nearly identical, as are BANANA/LEMON.
mydata <- data.frame(
  APPLE = c(
    1L, 2L, 5L, 4L, 366L, 65L, 43L, 456L,
    876L, 78L, 687L, 378L, 378L, 34L, 53L, 43L
  ),
  PEAR = c(
    2L, 2L, 5L, 4L, 366L, 65L, 43L, 456L,
    876L, 78L, 687L, 378L, 378L, 34L, 53L, 41L
  ),
  PLUM = c(
    10L, 20L, 10L, 20L, 10L, 20L, 1L, 0L,
    1L, 2010L, 20L, 10L, 10L, 10L, 10L, 10L
  ),
  BANANA = c(
    2L, 10L, 31L, 2L, 2L, 5L, 2L, 5L,
    1L, 52L, 1L, 2L, 52L, 6L, 2L, 1L
  ),
  LEMON = c(
    4L, 10L, 31L, 2L, 2L, 5L, 2L, 5L,
    1L, 52L, 1L, 2L, 52L, 6L, 2L, 3L
  )
)

我找到了下面这段代码,但不知道该如何调整它来满足我的需求:https://stackoverflow.com/a/39484353/4797853

library(igraph)

# Pairwise Pearson correlations between all columns of `mydata`.
var.corelation <- cor(as.matrix(mydata), method = "pearson")

# Zero out the diagonal and upper triangle so each pair of variables is
# considered only once (prevents duplicated pairs and self-correlations).
var.corelation <- var.corelation * lower.tri(var.corelation)

# Row/column indices of every pair whose correlation exceeds the threshold.
check.corelation <- which(var.corelation > 0.62, arr.ind = TRUE)

# Treat each correlated pair as an undirected edge; connected components of
# that graph are the groups of mutually (transitively) correlated variables.
# graph_from_data_frame()/components() replace the deprecated
# graph.data.frame()/clusters().
graph.cor <- graph_from_data_frame(
  as.data.frame(check.corelation),
  directed = FALSE
)

# Vertex names are the variable indices (stored as character). Reading them
# from the graph guarantees they line up with the membership vector instead
# of relying on the edge list happening to be in the same order.
# Variables with no correlation above the threshold are not part of the
# graph and therefore do not appear in any group.
groups.cor <- split(
  as.integer(V(graph.cor)$name),
  components(graph.cor)$membership
)

# Show the variable names that belong to each group.
lapply(groups.cor, function(list.cor) rownames(var.corelation)[list.cor])

我需要调整输出:我想要得到的是如下两个数据框:

DF1

GROUP1  GROUP2  
    3    16 
    4    40 
ETC.. 

其中的值是每一组内各变量取值的总和

DF2

ORIGINAL_VAR GROUP 

APPLE   1 
PEAR   1 
PLUM   2 
BANANA  2 
LEMON   2 

回答

0

试试这个(假设只聚成了 2 组):

# Build DF1: one column per correlation group, holding the row-wise sum of
# the variables in that group. Works for any number of groups rather than
# exactly two.
group.sums <- lapply(groups.cor, function(idx) {
  # drop = FALSE keeps a data frame even when a group has a single variable,
  # so rowSums() does not fail on a bare vector.
  rowSums(mydata[, idx, drop = FALSE])
})
names(group.sums) <- paste0("GROUP", seq_along(group.sums))
DF1 <- do.call(cbind.data.frame, group.sums)
DF1

    GROUP1 GROUP2 
1  3  16 
2  4  40 
3  10  72 
4  8  24 
5  732  14 
6  130  30 
7  86  5 
8  912  10 
9 1752  3 
10 156 2114 
11 1374  22 
12 756  14 
13 756 114 
14  68  22 
15 106  14 
16  84  14 

# Build DF2: a lookup table mapping each original variable name to the
# number of the group it was assigned to. Built in one vectorized step, so
# it handles any number of groups (including none) and avoids growing a
# data frame with rbind() inside a loop.
DF2 <- data.frame(
  ORIGINAL_VAR = rownames(var.corelation)[unlist(groups.cor, use.names = FALSE)],
  # Repeat each group number once per member of that group.
  GROUP = rep(seq_along(groups.cor), lengths(groups.cor))
)

DF2

    ORIGINAL_VAR GROUP 
1   PEAR  1 
2  APPLE  1 
3  BANANA  2 
4  LEMON  2 
5   PLUM  2 
+0

谢谢,这正是我想要的。 – user4797853