2017-08-09 13 views
0

我有一个CSV文件是这样的:多列的csv文本存在缺乏马特里

col1 col2 col3 
r1 a,b,c e,f  g 
r2 h,i    j,k 
r3 l  m,n,o 

一些细胞有多种文字逗号分隔,有的有单,部分none.I要转换此类似:

col1 col2 col3 
a 1  0  0 
b 1  0  0 
c 1  0  0 
e 0  1  0 
f 0  1  0 
g 0  0  1 
h 1  0  0 
i 1  0  0 
j 0  0  1 
k 0  0  1 
l 1  0  0 
m 0  1  0 
n 0  1  0 
o 0  1  0 

有什么建议吗?我尝试在Excel中的数据透视表,但没有得到所需的输出。 在此先感谢。

问候 Zillur

+0

在Excel中使用'countif()' – Kevin

回答

0

不知道这是否是最短的解决方案(可能不是),但它产生所需的输出。基本上,我们会遍历所有三列,并计算字符串的出现次数,然后获得一个长格式的数据帧,然后我们将其转换为您想要的宽格式。

library(tidyr) 
library(purrr) 

df <- data_frame(col1 = c("a,b,c", "h,i", "l"), 
      col2 = c("e,f", "", "m,n,o"), 
      col3 = c("g", "j,k", "")) 

let_df <- map_df(df, function(col){ 
    # map_df applies the function to each column of df 

    # split strings at "," and unlist to get vector of letters 
    letters <- unlist(str_split(col, ",")) 
    # delete "" 
    letters <- letters[nchar(letters) > 0] 

    # count occurrences for each letter 
    tab <- table(letters) 

    # replace with 1 if occurs more often 
    tab[tab > 1] <- 1 

    # create data frame from table 
    df <- data_frame(letter = names(tab), count = tab) 
    return(df) 
}, .id = "col") # id adds a column col that contains col1 - col3 

# bring data frame into wide format 
let_df %>% 
    spread(col, count, fill = 0) 
0

这样一个很大的问题要解决。这里是我的基础R:

col1 <- c("a,b,c","h,i","l") 
col2 <- c("e,f","","m,n,o") 
col3 <- c("g","j,k","") 
data <- data.frame(col1, col2, col3, stringsAsFactors = F) 

restructure <- function(df){ 
df[df==""] <- "missing" 
result_rows <- as.character() 
l <- list() 
for (i in seq_along(colnames(df))){ 
    df_col <- sort(unique(unlist(strsplit(gsub(" ", "",toString(df[[i]])), ",")))) 
    df_col <- df_col[!df_col %in% "missing"] 
    result_rows <- sort(unique(c(result_rows, df_col))) 
    l[i] <- list(df_col) 
} 
result <- data.frame(result_rows) 
for (j in seq_along(l)){ 
    result$temp <- NA 
    result$temp[match(l[[j]], result_rows)] <- 1 
    colnames(result)[colnames(result)=="temp"] <- colnames(df)[j] 
} 
result[is.na(result)] <- 0 
return(result) 
} 

> restructure(data) 
# result_rows col1 col2 col3 
#1   a 1 0 0 
#2   b 1 0 0 
#3   c 1 0 0 
#4   e 0 1 0 
#5   f 0 1 0 
#6   g 0 0 1 
#7   h 1 0 0 
#8   i 1 0 0 
#9   j 0 0 1 
#10   k 0 0 1 
#11   l 1 0 0 
#12   m 0 1 0 
#13   n 0 1 0 
#14   o 0 1 0 
+0

非常感谢。它完美的作品。我认为这将导致出版物,如果它发生,我也会在那里承认。最好的问候Zillur –