这可能证明有用的你......从a way to parse PDF data using iTextSharp
using iTextSharp.text.pdf;
using iTextSharp.text;
private void openPDF()
{
string str = "";
string newFile = "c:\\New Document.pdf";
Document doc = new Document();
PdfReader reader = new PdfReader("c:\\New Document.pdf");
for (int i = 1; i <= reader.NumberOfPages; i++)
{
byte[] bt = reader.GetPageContent(i);
str += ExtractTextFromPDFBytes(bt);
}
}
private string ExtractTextFromPDFBytes(byte[] input)
{
if (input == null || input.Length == 0) return "";
try
{
string resultString = "";
// Flag showing if we are we currently inside a text object
bool inTextObject = false;
// Flag showing if the next character is literal
// e.g. '\\' to get a '\' character or '\(' to get '('
bool nextLiteral = false;
//() Bracket nesting level. Text appears inside()
int bracketDepth = 0;
// Keep previous chars to get extract numbers etc.:
char[] previousCharacters = new char[_numberOfCharsToKeep];
for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';
for (int i = 0; i < input.Length; i++)
{
char c = (char)input[i];
if (inTextObject)
{
// Position the text
if (bracketDepth == 0)
{
if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
{
resultString += "\n\r";
}
else
{
if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters)) {
resultString += "\n";
}
else
{
if (CheckToken(new string[] { "Tj" }, previousCharacters))
{
resultString += " ";
}
}
}
}
// End of a text object, also go to a new line.
if (bracketDepth == 0 && CheckToken(new string[] { "ET" }, previousCharacters))
{
inTextObject = false;
resultString += " ";
}
else
{
// Start outputting text
if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
{
bracketDepth = 1;
}
else
{
// Stop outputting text
if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
{
bracketDepth = 0;
}
else
{
// Just a normal text character:
if (bracketDepth == 1)
{
// Only print out next character no matter what.
// Do not interpret.
if (c == '\\' && !nextLiteral)
{
nextLiteral = true;
}
else
{
if (((c >= ' ') && (c <= '~')) || ((c >= 128) && (c < 255)))
{
resultString += c.ToString();
}
nextLiteral = false;
}
}
}
}
}
}
// Store the recent characters for
// when we have to go back for a checking
for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
{
previousCharacters[j] = previousCharacters[j + 1];
}
previousCharacters[_numberOfCharsToKeep - 1] = c;
// Start of a text object
if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
{
inTextObject = true;
}
}
return resultString;
}
catch
{
return string.Empty;
}
}
private bool CheckToken(string[] tokens, char[] recent)
{
foreach (string token in tokens)
{
if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
(recent[_numberOfCharsToKeep - 2] == token[1]) &&
((recent[_numberOfCharsToKeep - 1] == ' ') ||
(recent[_numberOfCharsToKeep - 1] == 0x0d) ||
(recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
((recent[_numberOfCharsToKeep - 4] == ' ') ||
(recent[_numberOfCharsToKeep - 4] == 0x0d) ||
(recent[_numberOfCharsToKeep - 4] == 0x0a)))
{
return true;
}
}
return false;
}
两者一旦数据被解析它,然后每一行转换为一个类(如在您的文章的评论建议安德烈V)的问题,将其序列化为XML,然后将XML文件本身存储到数据库或将XML数据存储到数据库。
一种选择是创建模拟您的数据的类(即字段/属性),为PDF数据中的每个“实体”创建对象并将对象序列化为XML。 StackOverflow和MSDN上的XML序列化中有大量示例。主要想法是创建一个缓冲区(类对象),您可以在其中构建您从PDF接收的数据。 –
如果客户想要的最终产品是数据库中的数据,那么为什么要先将它放入XML中? –
我真的不能告诉你为什么,因为这是一个更大的图片,这样做使我们能够使用代码来处理其他更重要的事情。 – Charlie