2011-06-20 44 views
6

我试图计算一个字节序列在另一个字节序列中出现的所有次数。但是,如果它已经对它们进行了计数,则它不能重新使用一个字节。例如,给定字符串
k.k.k.k.k.k.让我们假设字节序列为k.k然后它将只找到3次出现而不是5次,因为它们将被分解为:[k.k].[k.k].[k.k].而不是像[k.[k].[k].[k].[k].k]那里它们重叠并且基本上只是将2移到右侧。使用另一个字节列表/数组来计算字节列表/数组中的出现次数

理想的想法是了解压缩字典或运行时间编码的外观。所以我们的目标是将

k.k.k.k.k.k.减少到只有2个部分,因为(k.k.k.)是您可以拥有的最大和最好的符号。

这里是源至今:

using System; 
using System.Collections.Generic; 
using System.Collections; 
using System.Linq; 
using System.Text; 
using System.IO; 


    static class Compression 
    { 
     static int Main(string[] args) 
     { 

      List<byte> bytes = File.ReadAllBytes("ok.txt").ToList(); 
      List<List<int>> list = new List<List<int>>(); 

      // Starting Numbers of bytes - This can be changed manually. 
      int StartingNumBytes = bytes.Count; 
      for (int i = StartingNumBytes; i > 0; i--) 
      { 
       Console.WriteLine("i: " + i); 

       for (int ii = 0; ii < bytes.Count - i; ii++) 
       { 
        Console.WriteLine("ii: " + i); 
        // New pattern comes with refresh data. 
        List<byte> pattern = new List<byte>(); 

        for (int iii = 0; iii < i; iii++) 
        { 
         pattern.Add(bytes[ii + iii]); 
        } 



        DisplayBinary(bytes, "red"); 
        DisplayBinary(pattern, "green"); 

        int matches = 0; 
        // foreach (var position in bytes.ToArray().Locate(pattern.ToArray())) 
        for (int position = 0; position < bytes.Count; position++) { 
         if (pattern.Count > (bytes.Count - position)) 
         { 
          continue; 
         } 


         for (int iiii = 0; iiii < pattern.Count; iiii++) 
         { 
          if (bytes[position + iiii] != pattern[iiii]) 
          { 
           //Have to use goto because C# doesn't support continue <level> 
           goto outer; 
          } 

         } 

         // If it made it this far, it has found a match. 
         matches++; 
         Console.WriteLine("Matches: " + matches + " Orig Count: " + bytes.Count + " POS: " + position); 
         if (matches > 1) 
         { 
          int numBytesToRemove = pattern.Count; 
          for (int ra = 0; ra < numBytesToRemove; ra++) 
          { 
           // Remove it at the position it was found at, once it 
           // deletes the first one, the list will shift left and you'll need to be here again. 
           bytes.RemoveAt(position); 
          } 
          DisplayBinary(bytes, "red"); 
          Console.WriteLine(pattern.Count + " Bytes removed."); 

          // Since you deleted some bytes, set the position less because you will need to redo the pos. 
          position = position - 1; 
         } 


         outer: 
          continue; 
        } 

        List<int> sublist = new List<int>(); 
        sublist.Add(matches); 
        sublist.Add(pattern.Count); 
        // Some sort of calculation to determine how good the symbol was 
        sublist.Add(bytes.Count-((matches * pattern.Count)-matches)); 
        list.Add(sublist); 

       } 

      } 



      Display(list); 
      Console.Read(); 
      return 0; 
     } 


     static void DisplayBinary(List<byte> bytes, string color="white") 
     { 
      switch(color){ 
       case "green": 
        Console.ForegroundColor = ConsoleColor.Green; 
        break; 
       case "red": 
        Console.ForegroundColor = ConsoleColor.Red; 
        break; 
       default: 
        break; 
      } 


      for (int i=0; i<bytes.Count; i++) 
      { 
       if (i % 8 ==0) 
        Console.WriteLine(); 
       Console.Write(GetIntBinaryString(bytes[i]) + " "); 
      } 
      Console.WriteLine(); 
      Console.ResetColor(); 
     } 
     static string GetIntBinaryString(int n) 
     { 
      char[] b = new char[8]; 
      int pos = 7; 
      int i = 0; 

      while (i < 8) 
      { 
       if ((n & (1 << i)) != 0) 
       { 
        b[pos] = '1'; 
       } 
       else 
       { 
        b[pos] = '0'; 
       } 
       pos--; 
       i++; 
      } 
      //return new string(b).TrimStart('0'); 
      return new string(b); 
     } 

     static void Display(List<List<int>> list) 
     { 
      // 
      // Display everything in the List. 
      // 
      Console.WriteLine("Elements:"); 
      foreach (var sublist in list) 
      { 
       foreach (var value in sublist) 
       { 
        Console.Write("{0,4}", value); 

       } 
       Console.WriteLine(); 
      } 

      // 
      // Display total count. 
      // 
      int count = 0; 
      foreach (var sublist in list) 
      { 
       count += sublist.Count; 
      } 
      Console.WriteLine("Count:"); 
      Console.WriteLine(count); 
     } 

     static public int SearchBytePattern(byte[] pattern, byte[] bytes) 
     { 
      int matches = 0; 
      // precomputing this shaves some seconds from the loop execution 
      int maxloop = bytes.Length - pattern.Length; 
      for (int i = 0; i < maxloop; i++) 
      { 
       if (pattern[0] == bytes[i]) 
       { 
        bool ismatch = true; 
        for (int j = 1; j < pattern.Length; j++) 
        { 
         if (bytes[i + j] != pattern[j]) 
         { 
          ismatch = false; 
          break; 
         } 
        } 
        if (ismatch) 
        { 
         matches++; 
         i += pattern.Length - 1; 
        } 
       } 
      } 
      return matches; 
     } 
    } 

参考后得到文件的非二进制应该是,这里是二进制数据: 011010110010111001101011001011100110101100101110011010110010111001101011001011100110101100101110我希望能有比它如何小开始。

+0

可能调用上下文在这里是相关的。例如。如果我们知道我们将在单个字节数组中寻找许多模式,并且每个模式具有相同的长度,那么为搜索数组中的每个可能子字符串预先排列哈希代码可能是有意义的 - 然后可以比较哈希代码(通常是一个int32),而不是序列中的每个字节。但是如果代码匹配,你仍然必须测试每个字节(散列码告诉你一个序列绝对不匹配,否则它/可能/匹配) – redcalx

+0

你可以看看这里:http://stackoverflow.com/questions/283456/byte-array-pattern-search这涵盖了你正在寻找的大部分内容。 – Variant

回答

5
private static int CountOccurences(byte[] target, byte[] pattern) 
{ 
    var targetString = BitConverter.ToString(target); 
    var patternString = BitConverter.ToString(pattern); 
    return new Regex(patternString).Matches(targetString).Count; 
} 
+0

+1不错,简单,用'byte [] target = new byte [] {1,2,3,2,3,1,2,3}'进行测试。 byte [] pattern = new byte [] {2,3};' – shelleybutterfly

+0

有趣但值得注意的是,这是一种非常低效的方法 - 这一点与处理大量数据和/或在具有严格要求的实时服务器环境中运行在rountrip时间和吞吐量。 – redcalx

1

快速和肮脏,没有正则表达式。虽然我不确定它是否回答了问题的意图,但它应该相对较快。我想我要运行对正则表达式一些时间测试,看看肯定的相对速度:

private int CountOccurrences(string TestString, string TestPattern) 
    { 
     int PatternCount = 0; 
     int SearchIndex = 0; 

     if (TestPattern.Length == 0) 
      throw new ApplicationException("CountOccurrences: Unable to process because TestPattern has zero length."); 

     if (TestString.Length == 0) 
      return 0; 

     do 
     { 
      SearchIndex = TestString.IndexOf(TestPattern, SearchIndex); 

      if (SearchIndex >= 0) 
      { 
       ++PatternCount; 
       SearchIndex += TestPattern.Length; 
      } 
     } 
     while ((SearchIndex >= 0) && (SearchIndex < TestString.Length)); 

     return PatternCount; 
    } 

    private void btnTest_Click(object sender, EventArgs e) 
    { 
     string TestString1 = "k.k.k.k.k.k.k.k.k.k.k.k"; 
     string TestPattern1 = "k.k"; 

     System.Console.WriteLine(CountOccurrences(TestString1, TestPattern1).ToString()); // outputs 6 
     System.Console.WriteLine(CountOccurrences(TestString1 + ".k", TestPattern1).ToString()); // still 6 
     System.Console.WriteLine(CountOccurrences(TestString1, TestPattern1 + ".").ToString()); // only 5 
    } 
+0

另外,你的帖子让我想到了哈夫曼编码,我总是尽情玩耍......如果你有兴趣,这里有一个很好的介绍:http://www.cs.auckland.ac.nz/~jmor159 /PLDS210/huffman.html – shelleybutterfly

+0

进一步与你的问题的代码进行比较,我发现真的这不是你需要的,因为你正在做二进制代码......我会留下代码说:我会试图尝试做一些事情就像传递字节[],然后投射到字符串和搜索例如“string TestString = System.Text.Encoding.ASCII.GetString(TestBytes);”除了有些文化的一些匹配规则会妨碍这种正常工作。好的。 :) – shelleybutterfly

2

有了这个解决方案你有机会获得相匹配(虽然枚举)的单独的索引或者你可以拨打Count()在结果上,看看有多少场比赛有:

public static IEnumerable<int> Find<T>(T[] pattern, T[] sequence, bool overlap) 
{ 
    int i = 0; 
    while (i < sequence.Length - pattern.Length + 1) 
    { 
     if (pattern.SequenceEqual(sequence.Skip(i).Take(pattern.Length))) 
     { 
      yield return i; 
      i += overlap ? 1 : pattern.Length; 
     } 
     else 
     { 
      i++; 
     } 
    } 
}

overlap: false调用它来解决您的问题或overlap: true看到重叠比赛

我有交流(如果你有兴趣。)其他方法与API略有不同(以及更好的性能)here,包括一个直接在字节流上工作的方法。

相关问题