根据您的描述(120个文件,70K-80K个单词,每个文件1-2 MB),最好的方法似乎是只读取一次文件并构建一个可搜索的索引。下面我给出了一个示例来说明如何实现这一点;但如果您需要比精确匹配或前缀匹配更复杂的搜索词匹配,这种方法对您的用处可能有限。
如果您需要更复杂的文本搜索匹配(同时获得良好性能),我建议您查看专门为此目的而构建的出色Lucene库。
/// <summary>
/// Immutable position of a single word occurrence: the file, the line
/// within that file, and the word's index within that line.
/// </summary>
public struct WordLocation
{
    /// <summary>File containing the word.</summary>
    public readonly string FileName;

    /// <summary>Line within the file.</summary>
    public readonly int LineNumber;

    /// <summary>Index within the line.</summary>
    public readonly int WordIndex;

    /// <summary>Creates a location from its three components.</summary>
    /// <param name="fileName">File containing the word.</param>
    /// <param name="lineNumber">Line within the file.</param>
    /// <param name="wordIndex">Index of the word within the line.</param>
    public WordLocation(string fileName, int lineNumber, int wordIndex)
    {
        this.FileName = fileName;
        this.LineNumber = lineNumber;
        this.WordIndex = wordIndex;
    }
}
/// <summary>
/// Immutable record of every place a word was found: a count plus the
/// array of locations. "Adding" an occurrence returns a new value rather
/// than mutating this one.
/// </summary>
public struct WordOccurrences
{
    private WordOccurrences(int nOccurrences, WordLocation[] locations)
    {
        NumberOfOccurrences = nOccurrences;
        Locations = locations;
    }

    /// <summary>The "not found" value: zero occurrences and an empty location array.</summary>
    public static readonly WordOccurrences None = new WordOccurrences(0, new WordLocation[0]);

    /// <summary>Creates the value for a word that has been seen exactly once.</summary>
    /// <param name="fileName">File containing the word.</param>
    /// <param name="lineNumber">Line within the file.</param>
    /// <param name="wordIndex">Index of the word within the line.</param>
    /// <returns>A value with one occurrence at the given location.</returns>
    public static WordOccurrences FirstOccurrence(string fileName, int lineNumber, int wordIndex)
    {
        return new WordOccurrences(1, new [] { new WordLocation(fileName, lineNumber, wordIndex) });
    }

    /// <summary>
    /// Returns a new value containing all current locations followed by the given one.
    /// </summary>
    /// <param name="fileName">File containing the word.</param>
    /// <param name="lineNumber">Line within the file.</param>
    /// <param name="wordIndex">Index of the word within the line.</param>
    /// <returns>A new value with the occurrence count incremented by one.</returns>
    public WordOccurrences AddOccurrence(string fileName, int lineNumber, int wordIndex)
    {
        // default(WordOccurrences) bypasses the constructor and leaves Locations
        // null; treat that as "no locations yet" instead of throwing from Concat.
        var existing = Locations ?? new WordLocation[0];

        return new WordOccurrences(
            NumberOfOccurrences + 1,
            existing
                .Concat(new [] { new WordLocation(fileName, lineNumber, wordIndex) })
                .ToArray());
    }

    /// <summary>Number of times the word was found.</summary>
    public readonly int NumberOfOccurrences;

    /// <summary>Location of each occurrence, in the order they were added.</summary>
    public readonly WordLocation[] Locations;
}
/// <summary>
/// Accumulates word occurrences and then produces a searchable <see cref="IWordIndex"/>.
/// </summary>
public interface IWordIndexBuilder
{
/// <summary>Registers one occurrence of <paramref name="word"/>.</summary>
/// <param name="word">The word that was found.</param>
/// <param name="fileName">File containing the word.</param>
/// <param name="lineNumber">Line within the file.</param>
/// <param name="wordIndex">Index of the word within the line.</param>
void AddWordOccurrence(string word, string fileName, int lineNumber, int wordIndex);
/// <summary>Produces the index from the occurrences added so far.</summary>
/// <returns>An index that can be queried with <c>Find</c>.</returns>
IWordIndex Build();
}
/// <summary>
/// A built word index that can be queried for the occurrences of a word.
/// </summary>
public interface IWordIndex
{
/// <summary>Looks up <paramref name="word"/> in the index.</summary>
/// <param name="word">The word to search for.</param>
/// <returns>The recorded occurrences of the word.</returns>
WordOccurrences Find(string word);
}
/// <summary>
/// Extension helpers that feed the contents of files into an <see cref="IWordIndexBuilder"/>.
/// </summary>
public static class BuilderExtensions
{
    /// <summary>
    /// Reads each file line by line, splits every line into words, registers
    /// each word with the builder, and finally builds the index.
    /// </summary>
    /// <param name="builder">Builder that receives every word occurrence.</param>
    /// <param name="wordFiles">Files to read.</param>
    /// <returns>The index returned by <c>builder.Build()</c>.</returns>
    public static IWordIndex BuildIndexFromFiles(this IWordIndexBuilder builder, IEnumerable<FileInfo> wordFiles)
    {
        char[] separators = {',', ' ', '\t', ';' /* etc */ };

        foreach (FileInfo sourceFile in wordFiles)
        {
            using (StreamReader input = sourceFile.OpenText())
            {
                // Line numbers reported to the builder start at 1.
                for (int lineNumber = 1; !input.EndOfStream; lineNumber++)
                {
                    string[] tokens = input
                        .ReadLine()
                        .Split(separators, StringSplitOptions.RemoveEmptyEntries);

                    // Word positions reported to the builder also start at 1.
                    for (int position = 0; position < tokens.Length; position++)
                    {
                        builder.AddWordOccurrence(
                            tokens[position].Trim(),
                            sourceFile.FullName,
                            lineNumber,
                            position + 1);
                    }
                }
            }
        }

        return builder.Build();
    }
}
那么最简单的索引实现(即只能做精确匹配查找)可以在内部使用一个字典:
/// <summary>
/// Index builder backed by a <see cref="Dictionary{TKey,TValue}"/>. The index it
/// produces supports exact-match lookup only, using the comparer given at construction.
/// </summary>
public class DictionaryIndexBuilder : IWordIndexBuilder
{
    // Becomes null once Build() has handed the dictionary over to the index.
    private Dictionary<string, WordOccurrences> _dict;

    /// <summary>Read-only index wrapping the dictionary populated by the builder.</summary>
    private class DictionaryIndex : IWordIndex
    {
        private readonly Dictionary<string, WordOccurrences> _dict;

        public DictionaryIndex(Dictionary<string, WordOccurrences> dict)
        {
            _dict = dict;
        }

        /// <summary>Looks up a word by exact match.</summary>
        /// <param name="word">The word to search for.</param>
        /// <returns>
        /// The recorded occurrences, or <see cref="WordOccurrences.None"/> when the word is not indexed.
        /// </returns>
        public WordOccurrences Find(string word)
        {
            WordOccurrences found;
            if (_dict.TryGetValue(word, out found))
            {
                return found;
            }
            return WordOccurrences.None;
        }
    }

    /// <summary>Creates an empty builder.</summary>
    /// <param name="comparer">Comparer used to decide whether two words are the same key.</param>
    public DictionaryIndexBuilder(IEqualityComparer<string> comparer)
    {
        _dict = new Dictionary<string, WordOccurrences>(comparer);
    }

    /// <summary>Registers one occurrence of <paramref name="word"/>.</summary>
    /// <param name="word">The word that was found.</param>
    /// <param name="fileName">File containing the word.</param>
    /// <param name="lineNumber">Line within the file.</param>
    /// <param name="wordIndex">Index of the word within the line.</param>
    /// <exception cref="System.InvalidOperationException">Build() has already been called.</exception>
    public void AddWordOccurrence(string word, string fileName, int lineNumber, int wordIndex)
    {
        ThrowIfAlreadyBuilt();

        WordOccurrences current;
        if (!_dict.TryGetValue(word, out current))
        {
            _dict[word] = WordOccurrences.FirstOccurrence(fileName, lineNumber, wordIndex);
        }
        else
        {
            _dict[word] = current.AddOccurrence(fileName, lineNumber, wordIndex);
        }
    }

    /// <summary>
    /// Hands the accumulated dictionary to a new index. The builder cannot be used afterwards.
    /// </summary>
    /// <returns>An index over every occurrence added so far.</returns>
    /// <exception cref="System.InvalidOperationException">Build() has already been called.</exception>
    public IWordIndex Build()
    {
        ThrowIfAlreadyBuilt();

        // Transfer ownership so later builder calls cannot mutate the index's data.
        var dict = _dict;
        _dict = null;
        return new DictionaryIndex(dict);
    }

    // Fails with a descriptive error instead of a NullReferenceException on reuse.
    private void ThrowIfAlreadyBuilt()
    {
        if (_dict == null)
        {
            throw new System.InvalidOperationException(
                "Build() has already been called; this builder can no longer be used.");
        }
    }
}
用法:
// Create a builder that compares words with the default string equality comparer.
DictionaryIndexBuilder builder = new DictionaryIndexBuilder(EqualityComparer<string>.Default);
// Read every file and build the index.
IWordIndex index = builder.BuildIndexFromFiles(myListOfFiles);
// Look up a word by exact match.
WordOccurrences matchSocks = index.Find("Socks");
如果你还想支持前缀查找,可以基于已排序的字典来实现索引构建器/索引类(并更改IWordIndex.Find方法以返回多个匹配项,或向接口添加新方法以查找部分/模式匹配)。
如果你想做更复杂的查找,请选择Lucene。
与其把每个文件都加载到RichTextBox中,不如把它读入一个字符串对象再使用。 –
搜索什么?没有上下文就无法回答。而且'RichTextBox'与这个问题无关。 –
@SriramSakthivel,假设我有120个代码文件,我想在这些文件中搜索'public'并返回它们出现的位置? – PurpleXenon