2016-10-28 34 views
1

在 R 中比较多个数据框里的字符串,并提取一致的开头字符。我在 R 中有以下两个数据框:

# Example data: one nucleotide string per site in each data frame.
# Built with data.frame() instead of as.data.frame(cbind(...)): cbind()
# first creates a character matrix, which silently turns the numeric Site
# column into text. stringsAsFactors = FALSE keeps Nucs as plain character
# on every R version.
df1 <- data.frame(
  Site = c(1, 2, 3, 4, 5),
  Nucs = c("ACTG", "ACT", "GTAC", "GTC", "GACT"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  Site = c(1, 2, 3, 4, 5),
  Nucs = c("AC", "ATC", "GTCA", "GC", "GAC"),
  stringsAsFactors = FALSE
)

我想确定在上面这两个数据框的 Nucs 列之间,每个位点上一致的最长开头字符串是什么。到目前为止,我已经试过这样:

# Break every sequence into a vector of single characters, one vector per site.
x1 <- lapply(as.character(df1$Nucs), function(s) strsplit(s, "")[[1]])
x2 <- lapply(as.character(df2$Nucs), function(s) strsplit(s, "")[[1]])
# Per site, keep the distinct characters that occur in both vectors.
# intersect() ignores position, so this is a set intersection, not a prefix.
x <- mapply(intersect, x1, x2, SIMPLIFY = FALSE)
# Glue each site's shared characters back into a single string.
sapply(x, function(chars) paste0(chars, collapse = ""))

这给出了以下结果:

[1] "AC" "ACT" "GTAC" "GC" "GAC" 

这并不完全是我想要的:例如在位点(Site)3 上,两个字符串分别是 GTAC 和 GTCA,我只需要开头一致的那两个字符,即 GT。

有没有人有任何想法,我该如何去做这件事?

回答

0

下面是我的解决方案,虽然效率不是很高:

# Example data: one nucleotide string per site in each data frame.
# data.frame() keeps Site numeric; as.data.frame(cbind(...)) would coerce it
# to text because cbind() builds a character matrix first.
df1 <- data.frame(
  Site = c(1, 2, 3, 4, 5),
  Nucs = c("ACTG", "ACT", "GTAC", "GTC", "GACT"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  Site = c(1, 2, 3, 4, 5),
  Nucs = c("AC", "ATC", "GTCA", "GC", "GAC"),
  stringsAsFactors = FALSE
)

# Split every sequence into single characters, one vector per site.
x1 <- strsplit(as.character(df1$Nucs), "")
x2 <- strsplit(as.character(df2$Nucs), "")

# Return the leading characters of `a` that agree position by position with
# `b`, stopping at the first mismatch.
#   a, b: character vectors holding one character per element.
#   Returns a character vector (possibly character(0)).
matching_prefix_chars <- function(a, b) {
  n <- min(length(a), length(b))
  idx <- seq_len(n) # seq_len(0) is empty, unlike 1:0
  same <- a[idx] == b[idx]
  # cumprod() drops to 0 at the first FALSE and stays there, so the sum is the
  # length of the leading run of matches. A plain sum(same) would also count
  # matches that occur after a mismatch (e.g. ABCD vs AXCD).
  head(a, sum(cumprod(same)))
}

# Print the common prefix of each site as a vector of characters.
for (i in seq_len(nrow(df1))) {
  print(matching_prefix_chars(x1[[i]], x2[[i]]))
}

输出:

[1] "A" "C" 
[1] "A" 
[1] "G" "T" 
[1] "G" 
[1] "G" "A" "C" 

需要我给代码加上注释吗?

0

你可以试试这个,虽然有点冗长:

# For each row, return the longest common prefix of the two Nucs strings.
# vapply() pins the result to one string per row, and seq_len() makes an
# empty data frame yield character(0) instead of iterating over c(1, 0).
vapply(seq_len(nrow(df1)), function(x) {
  s1 <- strsplit(as.character(df1$Nucs[x]), split = "")[[1]]
  s2 <- strsplit(as.character(df2$Nucs[x]), split = "")[[1]]
  n <- min(length(s1), length(s2))
  # Advance i to the first position where the strings differ (or past n).
  i <- 1
  while (i <= n && s1[i] == s2[i]) {
    i <- i + 1
  }
  # i - 1 characters matched. seq_len(0) selects nothing, so a mismatch at
  # the very first character gives "" (1:(i - 1) would be c(1, 0) and would
  # wrongly keep the first character).
  paste(s1[seq_len(i - 1)], collapse = "")
}, character(1))
# [1] "AC" "A" "GT" "G" "GAC"
1

我也找到了解决方案,您可以尝试:

# Return the longest common prefix of two character vectors as one string.
#   x, y: character vectors holding one character per element
#         (e.g. the result of strsplit(s, "")[[1]]).
#   Returns a length-one character vector; "" when nothing matches or when
#   either input is empty.
CompareVectors <- function(x, y) {
  comp_length <- min(length(x), length(y))
  # seq_len(0) is empty, whereas 1:0 is c(1, 0) and would index element 1 of
  # an empty vector, producing NA and a result of "NA".
  idx <- seq_len(comp_length)
  compare <- x[idx] == y[idx]
  # A comparison involving NA is treated as a mismatch.
  compare[is.na(compare)] <- FALSE
  # cumprod() becomes 0 at the first mismatch and stays 0, so the sum is the
  # number of leading positions that agree.
  prefix_length <- sum(cumprod(compare))
  paste(x[seq_len(prefix_length)], collapse = "")
}

OUTPUT:

# Apply CompareVectors() to each pair of split sequences. seq_along() is safe
# when x1 is empty (1:length(x1) would iterate over c(1, 0)), and vapply()
# guarantees one string per pair.
vapply(seq_along(x1), function(i) CompareVectors(x1[[i]], x2[[i]]), character(1))

[1] "AC" "A" "GT" "G" "GAC" 
0

这里是另一种解决方案。它可能没有涵盖所有可能的情况,但应该很容易扩展。

# Example data: one nucleotide string per site in each data frame.
# data.frame() keeps Site numeric; as.data.frame(cbind(...)) would coerce it
# to text because cbind() builds a character matrix first.
df1 <- data.frame(
  Site = c(1, 2, 3, 4, 5),
  Nucs = c("ACTG", "ACT", "GTAC", "GTC", "GACT"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  Site = c(1, 2, 3, 4, 5),
  Nucs = c("AC", "ATC", "GTCA", "GC", "GAC"),
  stringsAsFactors = FALSE
)

# Return the longest string that both `x` and `y` start with.
#   x, y: single strings (factors are converted to character first).
#   Returns a length-one character vector; "" when the first characters
#   differ or either string is empty.
shared_prefix <- function(x, y) {
  x <- as.character(x) # the string functions below do not work with factors
  y <- as.character(y)

  prefix <- ""
  # Grow a candidate prefix of x one character at a time and keep it for as
  # long as y starts with it. Only min(nchar) characters can ever match, so
  # there is no need to swap the longer string into first position, and
  # seq_len() skips the loop entirely when one string is empty.
  for (i in seq_len(min(nchar(x), nchar(y)))) {
    candidate <- substr(x, 1L, i)
    if (!startsWith(y, candidate)) {
      break
    }
    prefix <- candidate
  }
  # Always return the last accepted prefix, including when the loop runs to
  # completion (identical strings) or never accepts anything.
  prefix
}

# as.list() keeps the result unnamed; a bare character vector as the first
# argument would make mapply() use the strings themselves as names.
mapply(x = as.list(df1$Nucs), y = as.list(df2$Nucs), FUN = shared_prefix)

[1] "AC" "A" "GT" "G" "GAC"