优化:将一个大文件分割成多个小文件。我想根据每条记录第一个字段的键,把一个大文件(逗号分隔,每个字段用双引号包裹)分割成许多较小的文件;通常同一个键会对应多条记录。
这个大文件的大小在1GB到2GB之间,生成的文件数量在10,000–30,000范围内,每个文件放在以该键命名的子文件夹中。我的做法是对每一行调用 StreamReader.ReadLine(),把结果拼接起来,直到遇到一个不同的键(表示上一个键的数据已结束),然后调用一个函数把该批数据异步(asynchronously)写出。我先调用 Windows 的排序命令对文件排序,使相同的键连续出现(这样每个输出文件只需打开一次),但整个操作仍需要 20 分钟。有什么办法可以加快速度吗?
// Cache the delegate once so CSVParse can BeginInvoke SaveFileData for each key's batch.
sfd = new SaveFileDataDelegate(this.SaveFileData);
/// <summary>
/// Splits a pre-sorted CSV (sorted by first field, so equal keys are contiguous)
/// into one buffer per key and hands each completed buffer to SaveFileData
/// asynchronously via the sfd delegate. On any error, logs it and sets hadError.
/// </summary>
/// <param name="filename">Path of the sorted CSV file to parse.</param>
/// <param name="unzippedFilePath">Unused here; kept for caller compatibility.</param>
/// <param name="feedname">Feed name used as a sub-directory under outputDirectory.</param>
private void CSVParse(string filename, string unzippedFilePath, string feedname)
{
    IAsyncResult result = null;
    try
    {
        // using-statements guarantee both streams close on every path —
        // the old catch block closed only the StreamReader and leaked the FileStream.
        using (FileStream readerStream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.None, 120000, FileOptions.SequentialScan))
        using (StreamReader filestream = new StreamReader(readerStream, Encoding.UTF8, false, 120000))
        {
            string tempstring;
            // StringBuilder instead of string concatenation: appending with "+" in this
            // loop was O(n^2) and dominated the parse time.
            StringBuilder buffer = new StringBuilder();
            string lastlotkey = "";
            activityLog.Log("Parsing File: " + filename);
            // Keep looping after EOF (tempstring == null) until the final buffer is flushed.
            while (((tempstring = filestream.ReadLine()) != null) || buffer.Length != 0)
            {
                if (tempstring == null)
                {
                    tempstring = "";
                }
                // Key = first comma-separated field with the surrounding quotes stripped.
                string lotkey = tempstring.Replace("\"", "").Split(',').First();
                // A non-empty line with no quotes and no commas equals its own key:
                // treated as end of parseable data.
                if (lotkey == tempstring && tempstring != "")
                {
                    break;
                }
                // Skip the header row.
                if (lotkey == "DealerID")
                {
                    continue;
                }
                if (lastlotkey == "")
                {
                    lastlotkey = lotkey;
                }
                if (lotkey != lastlotkey && buffer.Length > 0)
                {
                    // Finish the previous async write before starting the next one.
                    // The original overwrote 'result' without EndInvoke, leaking the
                    // async-call resources and awaiting only the last write.
                    if (result != null)
                    {
                        sfd.EndInvoke(result);
                    }
                    string basename = (filename.Split('\\').Last()).Split('.').First();
                    result = sfd.BeginInvoke(
                        outputDirectory + @"\" + feedname + @"\" + lastlotkey + @"\" + basename + ".txt",
                        buffer.ToString(),
                        outputDirectory + @"\" + feedname + @"\" + lastlotkey,
                        null, null);
                    lastlotkey = lotkey;
                    buffer.Clear();
                    // EOF sentinel line carries no data; don't append it.
                    if (tempstring == "")
                    {
                        continue;
                    }
                }
                if (buffer.Length > 0)
                {
                    buffer.Append("\r\n");
                }
                buffer.Append(tempstring);
            }
        }
        // Wait for (and clean up) the final pending write.
        if (result != null)
        {
            sfd.EndInvoke(result);
        }
    }
    catch (Exception e)
    {
        activityLog.Log("Error Occurred: " + e.ToString());
        hadError = true;
    }
}
/// <summary>
/// Writes one key's accumulated records to its own file: headerLine first,
/// then the buffered rows. Creates the key-named directory on demand and
/// overwrites any existing file. On error, logs and sets hadError.
/// </summary>
/// <param name="file">Full path of the output file.</param>
/// <param name="buffer">Pre-joined record lines for this key.</param>
/// <param name="directory">Directory that must exist before the file is opened.</param>
private void SaveFileData(string file, string buffer, string directory)
{
    try
    {
        // Inside the try so a directory failure follows the same log-and-flag
        // policy as every other error here (this runs on a threadpool thread).
        Directory.CreateDirectory(directory);
        // FileMode.Create means create-or-truncate, replacing the racy
        // File.Exists check + OpenOrCreate/Truncate branch pair.
        // using-declarations dispose writer before stream in the correct order;
        // the old catch closed fs before temp, causing a secondary exception.
        using (FileStream fs = new FileStream(file, FileMode.Create, FileAccess.Write, FileShare.None, 120000))
        using (StreamWriter temp = new StreamWriter(fs, Encoding.UTF8, 120000))
        {
            temp.AutoFlush = false;
            temp.WriteLine(headerLine);
            temp.Write(buffer);
            temp.Flush();
        }
    }
    catch (Exception e)
    {
        activityLog.Log("Error Occurred: " + e.ToString());
        hadError = true;
    }
}
编辑
我翻遍了 Stack Overflow 和互联网的各个角落,在逐行做性能分析之后,发现字符串拼接才是解析程序的主要开销(仅次于文件复制和 Windows 排序)。改用 StringBuilder 之后,总处理时间从 20 分钟(复制+排序+解析)下降到 7 分钟:复制+排序 5 分钟,解析 2 分钟,速度提高约 130%。
StringBuilder 再立新功。我在想,调试那些不当使用 String 拼接的程序所浪费的时间,是否早已超过了字符串驻留(String interning)所节省的全部内存的价值。 –