2012-09-26 35 views
1

我想要一个有效的算法来查找大模式内所有出现的模式。在较大的序列中查找所有出现的模式

例如,给定下列输入:
图样: GAS
序列: ASDFGASDFGASDFADFASDFGA

预期输出:{4, 9}

根据接受answersimilar question实现的算法用于实现期望的任务。但是,一个comment报告算法是“在大字节数组上很慢”。

经过阅读,看起来最好的算法是Boyer-Moore String search algrorithm在C#上的实现CodeProject,但我无法实现泛型枚举。

是否有任何基于Boyer-Moore算法的解决方案来查找在.NET中的通用序列中的所有模式?

注意
虽然我在我的例子使用的字符串我想,关于实现IEnumerable的任何数据问题的解答。换句话说,它不仅适用于字符串,而且适用于任何类型的字符串。

+0

或许这将帮助? http://www.blackbeltcoder.com/Articles/algorithms/fast-text-search-with-boyer-moore –

回答

0

经过徒劳​​地理解Boyer-Moore算法之后,我把这个代码放在一起,这个代码在一个较大的集合上进行了一次遍历。

我还没有能够对Boyer-Moore算法进行测试,但它的工作效率非常高,当整个序列是该模式的重复时,它是最坏情况下的性能。

这是我的实现。让我知道你的看法。

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 

namespace ConsoleApplication1 
{ 
    class Program 
    { 
     static void Main(string[] args) 
     { 
      Console.WriteLine("Enter the string you want to search within."); 
      string hayStack = Console.ReadLine(); 
      Console.WriteLine("Enter the string you want to search for."); 
      string needle = Console.ReadLine(); 

      var ps = new PatternSearch<char>(needle.ToCharArray()); 

      Console.WriteLine(); 
      Console.WriteLine(); 

      Console.WriteLine(hayStack); 

      var matches = ps.Matches(hayStack.ToCharArray()).ToList(); 

      for (int i = 0; i < hayStack.Length; i++) 
       Console.Write(matches.Contains(i) ? "↑" : " "); 

      Console.WriteLine(); 

      Console.ReadLine(); 
     } 
    } 

    /// <summary>Implements a pattern searching algorithm with <b>O(nm)</b> worst-case performance.</summary> 
    /// <typeparam name="T">The data type of the array to search.</typeparam> 
    public class PatternSearch<T> 
    { 
     private struct MatchInfo 
     { 
      public MatchInfo(int startIndex, int matchLength) 
      { 
       this.StartIndex = startIndex; 
       this.MatchLength = matchLength; 
      } 
      public int StartIndex; 
      public int MatchLength; 
     } 

     private IEnumerable<T> pattern; 
     private List<MatchInfo> found; 
     private Func<T, T, bool> eqComp; 

     //optimization for IEnumerables that do not implement IList 
     int patLen = -1; 
     int seqLen = -1; 

     /// <summary>Initializes a new instance of the <see cref="PatternSearch{T}" /> class.</summary> 
     /// <param name="pattern">The pattern that will be searched for.</param> 
     public PatternSearch(T[] pattern) : this(pattern, (x, y) => x.Equals(y)) { } 

     /// <summary> 
     /// Initializes a new instance of the <see cref="PatternSearch{T}"/> class with the specified equality comparer. 
     /// </summary> 
     /// <param name="pattern">The pattern to be searched for.</param> 
     /// <param name="equalityComparer">The equality comparer to use for matching elements in the array.</param> 
     public PatternSearch(T[] pattern, Func<T, T, bool> equalityComparer) 
     { 
      patLen = pattern.Length; 

      if (pattern == null) 
       throw new ArgumentNullException("pattern", "The search pattern cannot be null."); 
      if (equalityComparer == null) 
       throw new ArgumentNullException("equalityComparer", "The equality comparer cannot be null."); 

      if (patLen <= 0) 
       throw new ArgumentException("pattern", "The pattern cannot be empty."); 

      // assign the values 
      this.pattern = pattern; 
      found = new List<MatchInfo>(); 
      eqComp = equalityComparer; 
     } 

     /// <summary> 
     /// Returns the start index of all occurrences of the search pattern within the specified array. 
     /// </summary> 
     /// <param name="seq">The larger sequence to find occurrences of the search pattern within.</param> 
     public IEnumerable<int> Matches(IEnumerable<T> seq) 
     { 
      seqLen = seqLen == -1 ? seq.Count() : seqLen; 
      return this.Matches(seq, 0, seqLen); 
     } 

     /// <summary> 
     /// Returns the start index of all occurrences of the search pattern within the specified array. 
     /// </summary> 
     /// <param name="seq">The larger sequence to find occurrences of the search pattern within.</param> 
     /// <param name="startIndex">The index in <paramref name="seq"/> to start searching at.</param> 
     public IEnumerable<int> Matches(IEnumerable<T> seq, int startIndex) 
     { 
      seqLen = seqLen == -1 ? seq.Count() : seqLen; 
      return this.Matches(seq, startIndex, seqLen); 
     } 

     /// <summary> 
     /// Returns the start index of all occurrences of the search pattern within the specified array. 
     /// </summary> 
     /// <param name="seq">The larger sequence to find occurrences of the search pattern within.</param> 
     /// <param name="count">The maximum number of items in <paramref name="seq"/> to match.</param> 
     public IEnumerable<int> Matches(IEnumerable<T> seq, int startIndex, int count) 
     { 
      patLen = patLen == -1 ? pattern.Count() : patLen; 
      seqLen = seqLen == -1 ? seq.Count() : seqLen; 
      bool addedNew = false; 

      var endPoint = Math.Min(seqLen, startIndex + count); 

      if (seq == null ||      // sequence cannot be null 
       seqLen < patLen ||     // pattern cannot be longer than sequence 
       (endPoint - startIndex) < patLen) // start to end cannot be less than pattern 
       yield break; 

      for (int i = startIndex; i < endPoint; i++) 
      { 
       addedNew = false; 

       // add the first item if a match is found 
       if (eqComp(seq.ElementAt(i), pattern.ElementAt(0))) 
       { 
        if (patLen == 1) 
         yield return i; 

        found.Add(new MatchInfo(i, 1)); 
        addedNew = true; 
       } 

       // check incomplete matches 
       for (int m = found.Count - 1; m >= 0; m--) 
       { 
        //skip the last item added 
        if (addedNew && m == found.Count - 1) 
         continue; 

        var match = found[m]; 

        // check incomplete matches 
        if ((i - match.StartIndex < patLen) && 
         eqComp(seq.ElementAt(i), pattern.ElementAt(match.MatchLength))) 
        { 
         match.MatchLength += 1; 
         found[m] = match; 

         // determine if a complete match has been found 
         if (match.MatchLength == patLen) 
         { 
          yield return match.StartIndex; 
          found.RemoveAt(m); 
         } 
        } 
        else 
         found.RemoveAt(m); 
       } 
      } 
     } 

    } 
} 
3

最坏情况下的性能是O(纳米)(其中n = seq.Count)当所述序列是图案的重复和图案是其他图案重复m次(指正,如果我错)。

List<int> LookFor<T>(IEnumerable<T> seq, T[ ] pattern) 
     where T : IEquatable<T> { 

    var partialMatches = new LinkedList<int>(); 
    var matches = new List<int>(); 

    int i = 0; 
    foreach (T item in seq) { 
     if (item.Equals(pattern[ 0 ])) 
      partialMatches.AddLast(0); 

     var n = partialMatches.First; 
     while(n != null) { 
      if (item.Equals(pattern[ n.Value ])) { 
       n.Value += 1; 
       if (n.Value == pattern.Length) { 
        matches.Add(i - pattern.Length + 1); 

        var next = n.Next; 
        partialMatches.Remove(n); 
        n = next; 

        continue; 
       } 
      } 
      else partialMatches.Remove(n); 

      n = n.Next; 
     } 

     i += 1; 
    } 

    return matches; 
} 

测试:

void Main() 
{ 
    var matches = LookFor("abcabcabcabcabcabcabc", 
     new char[ ] { 'a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c' }); 

    foreach (var x in matches) 
     Console.WriteLine("{0}", x); 
} 

输出:

0 
3 
6 
9 
12 
+0

你的代码无法检测到'重复'匹配。例如,在'dododo'中搜索'dodo'应该产生'0,2',但是使用你的代码,输出只是'0'。 –

+0

@AlexEssilfie修复了这个问题,请给我看看 – BlackBear

+0

我试过了你的代码,它的工作原理,谢谢。我选择我的代码作为接受的答案,因为它允许在您指定的任何索引处进行搜索。并为你的努力+1。 ;-) –

相关问题