2013-02-21 55 views
0

我有一个程序,我正在用Java编写,必须做2件事情,找到最长的公用子序列并对齐常见字符。 LCS工作得很好,但对齐部分只是循环或不做任何事情。最长的公共子序列差异

我尝试做这个算法我在维基百科上

function printDiff(C[0..m,0..n], X[1..m], Y[1..n], i, j) 
if i > 0 and j > 0 and X[i] = Y[j] 
    printDiff(C, X, Y, i-1, j-1) 
    print " " + X[i] 
else if j > 0 and (i = 0 or C[i,j-1] ≥ C[i-1,j]) 
    printDiff(C, X, Y, i, j-1) 
    print "+ " + Y[j] 
else if i > 0 and (j = 0 or C[i,j-1] < C[i-1,j]) 
    printDiff(C, X, Y, i-1, j) 
    print "- " + X[i] 
else 
    print "" 

发现这里是我写的代码(我删除了LCS部分)

static char[] input1 = "ABCDE".toCharArray(); 
static char[] input2 = "ACDC".toCharArray(); 
static int M = input1.length; 
static int N = input2.length; 
static int[][] opt = new int[M + 1][N + 1]; 

public static void printDiff(int opt[][], char input1[], char input2[]) { 
    int i = 0, j = 0; 
    while (i < input1.length && j < input2.length) { 
    if (i > 0 && j > 0 && input1[i] == input2[j]) { 
     System.out.print(" " + input1[i]); 
     i++; 
     j++; 
    } else if (j > 0 && (i == 0 || opt[i][j - 1] >= opt[i - 1][j])) { 
     System.out.print("+ " + input2[j]); 
     j++; 
    } else if (i > 0 && (j == 0 || opt[i][j - 1] < opt[i - 1][j])) { 
     System.out.print("- " + input1[i]); 
     i++; 
    } else { 
     System.out.print(""); 

    } 
    } 
} 

回答

2

我重写代码中使用了Wikipedia algorithm 。换句话说,我使用递归而不是where子句。我必须改变其中一个条件,因为Java是基于零索引的,而维基百科算法是基于一个索引的。

我不得不重新添加LCS功能,以便我可以计算出int[][]opt

我在if语句中添加了括号,以确保操作按照我想要的顺序完成。

我也修正了输出。维基百科算法具有"+ ""- "作为输出。这似乎是一个错字。输出应分别为" +"" -"

这是我的代码版本。

public class PrintDiff { 

    char[] input1 = "ABCDE".toCharArray(); 
    char[] input2 = "ACDC".toCharArray(); 
    int  M  = input1.length; 
    int  N  = input2.length; 

    public void run() { 
     int[][] opt = lcsLength(input1, input2); 
     printDiff(opt, input1, input2, M - 1, N - 1); 
    } 

    public int[][] lcsLength(char[] input1, char[] input2) { 
     int[][] opt = new int[M][N]; 
     for (int i = 1; i < input1.length; i++) { 
      for (int j = 1; j < input2.length; j++) { 
       if (input1[i] == input2[j]) { 
        opt[i][j] = opt[i - 1][j - 1] + 1; 
       } else { 
        opt[i][j] = Math.max(opt[i][j - 1], opt[i - 1][j]); 
       } 
      } 
     } 
     return opt; 
    } 

    public void printDiff(int opt[][], char input1[], char input2[], int i, 
      int j) { 
     if ((i >= 0) && (j >= 0) && (input1[i] == input2[j])) { 
      printDiff(opt, input1, input2, i - 1, j - 1); 
      System.out.print(" " + input1[i]); 
     } else if ((j > 0) && ((i == 0) || (opt[i][j - 1] >= opt[i - 1][j]))) { 
      printDiff(opt, input1, input2, i, j - 1); 
      System.out.print(" +" + input2[j]); 
     } else if ((i > 0) && ((j == 0) || (opt[i][j - 1] < opt[i - 1][j]))) { 
      printDiff(opt, input1, input2, i - 1, j); 
      System.out.print(" -" + input1[i]); 
     } else { 
      System.out.print(""); 
     } 
    } 

    public static void main(String[] args) { 
     new PrintDiff().run(); 
    } 

} 

这里是我的输出。

A -B C D -E +C 
1

这里是一个返回所有最长公共子序列的diff文件版本(基本上使用缓存表回溯 - 类似采取让所有最长公共子序列在All Longest Common Subsequences的方法)(或者,你可以参考我的博客@:http://codingworkout.blogspot.com/2014/07/longest-common-subsequence.html

对于ex,对于GAC和AGCAT,它返回=> {{“[G] [A] C”,“[G] A [C]”,“G [A] [C]“},{”A [G] C [A] T“,”A [G] [C] AT“,”[A] G [C] AT“}其中GA,GC和AC最长子序列...

string[][] GetDiffs(string A, string B, int aIndex, int bIndex, 
      int[][] DP_LCS_AllPrefixes_Cache) 
     { 
      if((aIndex == 0) && (bIndex ==0)) 
      { 
       return null; 
      } 
      if (DP_LCS_AllPrefixes_Cache[aIndex][bIndex] == 0) 
      { 
       var r = new string[2][]; 
       r[0] = new string[] { A.Substring(0, aIndex) }; 
       r[1] = new string[] { B.Substring(0, bIndex) }; 
       return r; 
      } 
      if (A[aIndex - 1] == B[bIndex - 1]) 
      { 
       var r = this.GetDiffs(A, B, aIndex - 1, bIndex - 1, 
        DP_LCS_AllPrefixes_Cache); 
       string ch = string.Format("[{0}]", A[aIndex - 1]); 
       if (r == null) 
       { 
        r = new string[2][]; 
        r[0] = new string[] { ch }; 
        r[1] = new string[] { ch }; 
       } 
       else 
       { 
        r[0] = r[0].Select(s => s + ch).ToArray(); 
        r[1] = r[1].Select(s => s + ch).ToArray(); 
       } 
       return r; 
      } 
      int lcs_up_direction = DP_LCS_AllPrefixes_Cache[aIndex - 1][bIndex]; 
      int lcs_left_direction = DP_LCS_AllPrefixes_Cache[aIndex][bIndex - 1]; 
      string[][] lcs_up = null, lcs_left = null; 
      if (lcs_up_direction == lcs_left_direction) 
      { 
       lcs_up = this.GetDiffs(A, B, aIndex - 1, bIndex, 
        DP_LCS_AllPrefixes_Cache); 
       lcs_left = this.GetDiffs(A, B, aIndex, bIndex - 1, 
        DP_LCS_AllPrefixes_Cache); 
      } 
      else if (lcs_up_direction > lcs_left_direction) 
      { 
       lcs_up = this.GetDiffs(A, B, aIndex - 1, bIndex, 
        DP_LCS_AllPrefixes_Cache); 
      } 
      else 
      { 
       lcs_left = this.GetDiffs(A, B, aIndex, bIndex - 1, DP_LCS_AllPrefixes_Cache); 
      } 
      char a = A[aIndex - 1], b = B[bIndex - 1]; 
      string[][] rl = new string[2][]; 
      rl[0] = new string[0]; 
      rl[1] = new string[0]; 
      if(lcs_up != null) 
      { 
       //we moved upward, that is we accepted that they differ with 'A' at aIndex-1 (a) 
       rl[0] = lcs_up[0].Select(s => s + a.ToString()).ToArray(); 
       rl[1] = lcs_up[1]; 
      } 
      if (lcs_left != null) 
      { 
       //we moved left, that is we accepted that they differ with 'B' at bIndex-1 (b) 
       rl[0] = rl[0].Union(lcs_left[0]).ToArray(); ; 
       rl[1] = rl[1].Union(lcs_left[1].Select(s => s + b.ToString())).ToArray(); 
      } 
      return rl.ToArray(); 
     } 

其中呼叫者是

string[][] GetDiffs(string A, string B, int[][] DP_LCS_AllPrefixes_Cache) 
     { 
      var r = this.GetDiffs(A, B, A.Length, B.Length, 
       DP_LCS_AllPrefixes_Cache); 
      return r; 
     } 

而捕获LCS长度回溯

public int[][] LCS_OfAllPrefixes_Length(string A, string B) 
     { 
      A.ThrowIfNullOrWhiteSpace("a"); 
      B.ThrowIfNullOrWhiteSpace("b"); 
      int[][] DP_LCS_AllPrefixes_Cache = new int[A.Length+1][]; 
      for(int i = 0;i<DP_LCS_AllPrefixes_Cache.Length; i++) 
      { 
       DP_LCS_AllPrefixes_Cache[i] = new int[B.Length + 1]; 
      } 
      for (int rowIndexOfCache = 1; rowIndexOfCache <= A.Length; rowIndexOfCache++) 
      { 
       for (int columnIndexOfCache = 1; columnIndexOfCache <= B.Length; columnIndexOfCache++) 
       { 
        //LCS(Ai, Bj) = 0 if i <=0, or j <= 0 
        //    LCS(Ai, Bj) + 1 if Ai == Bj 
        //    Max(LCS(Ai-1, Bj), LCS(Ai, Bj-1)) 
        if(A[rowIndexOfCache-1] == B[columnIndexOfCache-1]) 
        { 
         DP_LCS_AllPrefixes_Cache[rowIndexOfCache][columnIndexOfCache] = DP_LCS_AllPrefixes_Cache[rowIndexOfCache - 1][columnIndexOfCache - 1] + 1; 
        } 
        else 
        { 
         DP_LCS_AllPrefixes_Cache[rowIndexOfCache][columnIndexOfCache] = Utilities.Max(DP_LCS_AllPrefixes_Cache[rowIndexOfCache - 1][columnIndexOfCache], 
          DP_LCS_AllPrefixes_Cache[rowIndexOfCache][columnIndexOfCache - 1]); 
        } 
       } 
      } 
      return DP_LCS_AllPrefixes_Cache; 
     } 

TestMethod的

[TestMethod] 
     public void LCS_Tests() 
     { 
      string A = "GAC", B = "AGCAT"; 
      var DP_LCS_AllPrefixes_Cache = this.LCS_OfAllPrefixes_Length(A, B); 
      Assert.IsTrue(DP_LCS_AllPrefixes_Cache[A.Length][B.Length] == 2); 
      var lcs_sequences = this.GetLongestCommonSubsequences(A, B, DP_LCS_AllPrefixes_Cache); 
      Assert.IsNotNull(lcs_sequences); 
      var diffs = this.GetDiffs(A, B, DP_LCS_AllPrefixes_Cache); 
      Assert.IsNotNull(diffs); 
      Assert.IsTrue(diffs.Length == 2); 
      Assert.IsTrue(diffs[0].Length == lcs_sequences.Length); 
      Assert.IsTrue(diffs[1].Length == lcs_sequences.Length); 
      Assert.IsTrue(lcs_sequences.Any(s => "AC".Equals(s))); 
      Assert.IsTrue(lcs_sequences.Any(s => "GC".Equals(s))); 
      Assert.IsTrue(lcs_sequences.Any(s => "GA".Equals(s))); 
      var DP_LCS_AllPrefixes_Subsequences_Cache = this.LCS_OfAllPrefixes_Subsequences(A, B); 
      Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Length == 2); 
      Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences 
       .Any(s => "AC".Equals(s))); 
      Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences 
       .Any(s => "GC".Equals(s))); 
      Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences 
       .Any(s => "GA".Equals(s))); 
      A = "ABCDGH"; B = "AEDFHR"; 
      DP_LCS_AllPrefixes_Cache = this.LCS_OfAllPrefixes_Length(A, B); 
      Assert.IsTrue(DP_LCS_AllPrefixes_Cache[A.Length][B.Length] == 3); 
      lcs_sequences = this.GetLongestCommonSubsequences(A, B, DP_LCS_AllPrefixes_Cache); 
      Assert.IsNotNull(lcs_sequences); 
      diffs = this.GetDiffs(A, B, DP_LCS_AllPrefixes_Cache); 
      Assert.IsNotNull(diffs); 
      Assert.IsTrue(diffs.Length == 2); 
      Assert.IsTrue(diffs[0].Length == lcs_sequences.Length); 
      Assert.IsTrue(diffs[1].Length == lcs_sequences.Length); 
      Assert.IsTrue(lcs_sequences.Any(s => "ADH".Equals(s))); 
      DP_LCS_AllPrefixes_Subsequences_Cache = this.LCS_OfAllPrefixes_Subsequences(A, B); 
      Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Length == 3); 
      Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences 
       .Any(s => "ADH".Equals(s))); 
      A = "AGGTAB"; B = "GXTXAYB"; 
      DP_LCS_AllPrefixes_Cache = this.LCS_OfAllPrefixes_Length(A, B); 
      Assert.IsTrue(DP_LCS_AllPrefixes_Cache[A.Length][B.Length] == 4); 
      lcs_sequences = this.GetLongestCommonSubsequences(A, B, DP_LCS_AllPrefixes_Cache); 
      Assert.IsNotNull(lcs_sequences); 
      diffs = this.GetDiffs(A, B, DP_LCS_AllPrefixes_Cache); 
      Assert.IsNotNull(diffs); 
      Assert.IsTrue(diffs.Length == 2); 
      Assert.IsTrue(diffs[0].Length == 2); 
      Assert.IsTrue(diffs[1].Length == lcs_sequences.Length); 
      Assert.IsTrue(lcs_sequences.Any(s => "GTAB".Equals(s))); 
      DP_LCS_AllPrefixes_Subsequences_Cache = this.LCS_OfAllPrefixes_Subsequences(A, B); 
      Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Length == 4); 
      Assert.IsTrue(DP_LCS_AllPrefixes_Subsequences_Cache[A.Length][B.Length].Subsequences 
       .Any(s => "GTAB".Equals(s))); 
      A = "ABCDEF"; B = "UVWXYZ"; 
      DP_LCS_AllPrefixes_Cache = this.LCS_OfAllPrefixes_Length(A, B); 
      Assert.IsTrue(DP_LCS_AllPrefixes_Cache[A.Length][B.Length] == 0); 
      lcs_sequences = this.GetLongestCommonSubsequences(A, B, DP_LCS_AllPrefixes_Cache); 
      diffs = this.GetDiffs(A, B, DP_LCS_AllPrefixes_Cache); 
      Assert.IsNotNull(diffs); 
      Assert.IsTrue(diffs.Length == 2); 
      Assert.IsTrue(diffs[0].Length == 1); 
      Assert.IsTrue(diffs[1].Length == 1); 
     } 
的DP方法