是否有开源库可以帮助我在 .NET/C# 中读取/解析 PDF 文档?(在 .NET 中读取 PDF 文档)
回答
iTextSharp 是最好的选择。我曾用它为 Lucene.Net 编写爬虫,使其能够抓取 PDF 文件。
using System;
using System.IO;
using iTextSharp.text.pdf;
using System.Text.RegularExpressions;
namespace Spider.Utils
{
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
public class PDFParser
{
// PDF content-stream operators relevant to text extraction:
//   BT      = begin a text object
//   ET      = end a text object
//   Td / TD = move to the start of the next line
//   Ts      = set text rise ("5 Ts" = superscript, "-5 Ts" = subscript)
#region Fields
#region _numberOfCharsToKeep
/// <summary>
/// Size of the window of most recently read characters that is kept while
/// scanning a content stream. The window is shifted by one for every input
/// byte, and <c>CheckToken</c> inspects its last few entries to recognise
/// operators such as BT, ET, Td and Tj.
/// </summary>
private static int _numberOfCharsToKeep = 15;
#endregion
#endregion
#region ExtractText
/// <summary>
/// Extracts the text of every page of a PDF file and writes it, UTF-8 encoded,
/// to <paramref name="outFileName"/> (pages are separated by a single space).
/// A 68-character progress bar is printed to the console while pages are processed.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="outFileName">the output file name; an existing file is overwritten.</param>
/// <returns>
/// <c>true</c> if all pages were processed and written; <c>false</c> if any
/// exception occurred (the reason is reported on the standard error stream).
/// </returns>
public bool ExtractText(string inFileName, string outFileName)
{
    // Width of the console progress bar, in '#' characters.
    const int progressBarWidth = 68;

    PdfReader reader = null;
    try
    {
        // Open the PDF before creating the output file, so an unreadable
        // input does not leave an empty output file behind.
        reader = new PdfReader(inFileName);

        using (StreamWriter outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8))
        {
            Console.Write("Processing: ");

            int pageCount = reader.NumberOfPages;
            int written = 0;
            for (int page = 1; page <= pageCount; page++)
            {
                outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                // Advance the bar to the share of pages completed so far.
                int target = (int)((long)page * progressBarWidth / pageCount);
                for (; written < target; written++)
                {
                    Console.Write("#");
                }
            }

            // Complete the bar even when the document reported no pages.
            for (; written < progressBarWidth; written++)
            {
                Console.Write("#");
            }
        }
        return true;
    }
    catch (Exception ex)
    {
        // The bool return value is the contract callers rely on; keep it,
        // but do not discard the reason for the failure.
        Console.Error.WriteLine("Failed to extract text from '" + inFileName + "': " + ex.Message);
        return false;
    }
    finally
    {
        // Release the reader's underlying file handle on every path.
        if (reader != null) reader.Close();
    }
}
#endregion
#region ExtractTextFromPDFBytes
// Operator groups checked while scanning; hoisted so that no array is
// allocated for every input byte.
private static readonly string[] _moveToNextLineTokens = { "TD", "Td" };
private static readonly string[] _newLineTokens = { "'", "T*", "\"" };
private static readonly string[] _showTextTokens = { "Tj" };
private static readonly string[] _endTextTokens = { "ET" };
private static readonly string[] _beginTextTokens = { "BT" };

/// <summary>
/// Scans the bytes of an uncompressed PDF content stream and collects the
/// characters of the string operands "( ... )" that appear inside text
/// objects (between the BT and ET operators).
/// </summary>
/// <param name="input">the uncompressed content-stream bytes; may be null or empty.</param>
/// <returns>
/// the collected text after <c>CleanupContent</c> has been applied, or an empty
/// string if <paramref name="input"/> is null/empty or an exception occurs.
/// </returns>
public string ExtractTextFromPDFBytes(byte[] input)
{
    if (input == null || input.Length == 0) return "";
    try
    {
        // StringBuilder avoids re-copying the whole result for every appended character.
        System.Text.StringBuilder result = new System.Text.StringBuilder(input.Length);

        // True while we are between BT and ET.
        bool inTextObject = false;
        // True when the previous character was an unescaped backslash,
        // e.g. '\\' to get a '\' character or '\(' to get '('.
        bool nextLiteral = false;
        // 1 while inside a "( ... )" string operand, otherwise 0.
        int bracketDepth = 0;

        // Window of the most recent characters, used to recognise operators.
        char[] previousCharacters = new char[_numberOfCharsToKeep];
        for (int j = 0; j < previousCharacters.Length; j++) previousCharacters[j] = ' ';

        for (int i = 0; i < input.Length; i++)
        {
            char c = (char)input[i];
            // Byte 213 is mapped to an apostrophe.
            if (input[i] == 213) c = '\'';

            if (inTextObject)
            {
                // Outside a string operand: translate positioning operators
                // that have just been read into whitespace.
                if (bracketDepth == 0)
                {
                    if (CheckToken(_moveToNextLineTokens, previousCharacters))
                    {
                        // Was "\n\r", i.e. a reversed CR/LF pair.
                        result.Append("\r\n");
                    }
                    else if (CheckToken(_newLineTokens, previousCharacters))
                    {
                        result.Append("\n");
                    }
                    else if (CheckToken(_showTextTokens, previousCharacters))
                    {
                        result.Append(" ");
                    }
                }

                if (bracketDepth == 0 && CheckToken(_endTextTokens, previousCharacters))
                {
                    // End of the text object.
                    inTextObject = false;
                    result.Append(" ");
                }
                else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                {
                    // Start of a string operand.
                    bracketDepth = 1;
                }
                else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                {
                    // End of a string operand.
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    if (c == '\\' && !nextLiteral)
                    {
                        // Keep the backslash; CleanupContent resolves the
                        // escape sequences it knows about afterwards.
                        result.Append(c);
                        nextLiteral = true;
                    }
                    else
                    {
                        // Keep printable ASCII and the 128..254 range only.
                        if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                        {
                            result.Append(c);
                        }
                        nextLiteral = false;
                    }
                }
            }

            // Slide the window one position and append the current character.
            Array.Copy(previousCharacters, 1, previousCharacters, 0, previousCharacters.Length - 1);
            previousCharacters[previousCharacters.Length - 1] = c;

            // Start of a text object.
            if (!inTextObject && CheckToken(_beginTextTokens, previousCharacters))
            {
                inTextObject = true;
            }
        }

        return CleanupContent(result.ToString());
    }
    catch (Exception)
    {
        // Best effort by design: a page that cannot be scanned yields no text
        // instead of aborting the extraction of the whole document.
        return "";
    }
}
// Literal escape sequences left in the extracted text, each paired with the
// character it stands for. Keeping every pair in one row prevents the two
// columns from drifting out of sync. Rows are applied in order.
private static readonly string[,] _escapeReplacements =
{
    { @"\(", "(" }, { @"\)", ")" },
    { @"\226", "-" }, { @"\222", "'" }, { @"\223", "\"" }, { @"\224", "\"" },
    { @"\340", "à" }, { @"\342", "â" }, { @"\344", "ä" },
    { @"\300", "À" }, { @"\302", "Â" }, { @"\304", "Ä" },
    { @"\351", "é" }, { @"\350", "è" }, { @"\352", "ê" }, { @"\353", "ë" },
    { @"\311", "É" }, { @"\310", "È" }, { @"\312", "Ê" }, { @"\313", "Ë" },
    { @"\362", "ò" }, { @"\364", "ô" }, { @"\366", "ö" },
    { @"\322", "Ò" }, { @"\324", "Ô" }, { @"\326", "Ö" },
    { @"\354", "ì" }, { @"\356", "î" }, { @"\357", "ï" },
    { @"\314", "Ì" }, { @"\316", "Î" }, { @"\317", "Ï" },
    { @"\347", "ç" }, { @"\307", "Ç" },
    { @"\371", "ù" }, { @"\373", "û" }, { @"\374", "ü" },
    { @"\331", "Ù" }, { @"\333", "Û" }, { @"\334", "Ü" },
    { @"\256", "®" }, { @"\231", "™" }, { @"\253", "«" }, { @"\273", "»" },
    { @"\251", "©" }, { @"\221", "'" }
};

/// <summary>
/// Replaces the escaped parentheses and the known octal escape sequences
/// (for example <c>\351</c>) in <paramref name="text"/> with the characters
/// they represent. Sequences that are not in the table are left untouched.
/// </summary>
/// <param name="text">the raw text collected from a content stream.</param>
/// <returns>the text with all known escape sequences replaced.</returns>
private string CleanupContent(string text)
{
    if (string.IsNullOrEmpty(text)) return text;

    // Every pattern is a plain literal, so an ordinal StringBuilder.Replace
    // gives the same result as the former per-call Regex objects without
    // constructing 46 regular expressions for every page.
    System.Text.StringBuilder cleaned = new System.Text.StringBuilder(text);
    int rows = _escapeReplacements.GetLength(0);
    for (int i = 0; i < rows; i++)
    {
        cleaned.Replace(_escapeReplacements[i, 0], _escapeReplacements[i, 1]);
    }
    return cleaned.ToString();
}
#endregion
#region CheckToken
/// <summary>
/// Checks whether one of the given operator tokens has just been read, i.e.
/// whether the newest characters in <paramref name="recent"/> are
/// "delimiter, token, delimiter", where a delimiter is a space, CR or LF.
/// Tokens of any length are supported, including the one-character
/// operators <c>'</c> and <c>"</c>.
/// </summary>
/// <param name="tokens">the tokens to search for.</param>
/// <param name="recent">the most recently read characters, oldest first; the last element is the newest.</param>
/// <returns><c>true</c> if any token matches; otherwise <c>false</c>.</returns>
private bool CheckToken(string[] tokens, char[] recent)
{
    int last = recent.Length - 1;

    // The newest character must be the delimiter that terminates the token.
    if (last < 0 || !IsTokenDelimiter(recent[last])) return false;

    foreach (string token in tokens)
    {
        if (string.IsNullOrEmpty(token)) continue;

        // Index of the token's first character; the character before it must
        // exist and be a delimiter as well.
        int start = last - token.Length;
        if (start < 1 || !IsTokenDelimiter(recent[start - 1])) continue;

        bool matches = true;
        for (int k = 0; k < token.Length; k++)
        {
            if (recent[start + k] != token[k])
            {
                matches = false;
                break;
            }
        }
        if (matches) return true;
    }
    return false;
}

/// <summary>
/// Tells whether <paramref name="c"/> separates operators: space, CR or LF.
/// </summary>
private static bool IsTokenDelimiter(char c)
{
    return c == ' ' || c == (char)0x0d || c == (char)0x0a;
}
#endregion
}
}
你可以看看这个: http://www.codeproject.com/KB/showcase/pdfrasterizer.aspx 这不是完全免费的,但它看起来非常好。
亚历
这个工具能把PDF转换为纯文本吗?它似乎只是把PDF转换为图像,那样的话我还需要一个OCR库 :-) – JRoppert 2008-09-17 13:33:42
还有LibHaru
链接已失效。http://libharu.org/ – TernaryTopiary 2017-05-08 06:37:18
另外:“在这个时候,libHaru不支持阅读和编辑现有的PDF文件,这种支持不可能出现。” 这实际上是相关的吗? – TernaryTopiary 2017-05-08 06:38:01
iText 是我所知道的最好的库。它最初用 Java 编写,也有一个 .NET 移植版。
这不是官方的移植版,而且该链接已经失效。iText 的官方 .NET 移植版 iTextSharp 可以在 GitHub 上找到:http://github.com/itext/itextsharp – 2015-12-09 15:39:55
我以前用ITextSharp来操作/分割和改造PDF文档 - 它非常简单,也是开源的。
Aspose.Pdf 的效果很好,不过它是需要付费的。
/// <summary>
/// Reads a PDF file and returns the text of all its pages, concatenated in
/// page order, using iTextSharp's <c>SimpleTextExtractionStrategy</c>.
/// </summary>
/// <param name="Filename">the path of the PDF file; must be a <see cref="string"/>.</param>
/// <param name="ReadLibray">not used by this method; kept for signature compatibility.</param>
/// <returns>the extracted text of the whole document.</returns>
public string ReadPdfFile(object Filename, DataTable ReadLibray)
{
    // Open the document once. The original opened a new reader for every
    // page and never closed the outer one.
    PdfReader reader = new PdfReader((string)Filename);
    try
    {
        System.Text.StringBuilder text = new System.Text.StringBuilder();
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            // A new strategy instance per page, as before.
            ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();

            // The extracted string is appended as is. The former round trip
            // through Encoding.Default replaced every character outside the
            // ANSI code page with '?'.
            text.Append(PdfTextExtractor.GetTextFromPage(reader, page, its));
        }
        return text.ToString();
    }
    finally
    {
        // Release the file even if extraction fails.
        reader.Close();
    }
}
自从这个问题在2008年最后一次得到回答以来,iTextSharp 已经大幅改进了其 API。如果您从 http://sourceforge.net/projects/itextsharp/ 下载最新版本的库,就可以使用以下代码片段将 PDF 中的所有文本提取为一个字符串。
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
namespace PdfParser
{
    /// <summary>
    /// Helper for extracting the text of a PDF document with iTextSharp.
    /// </summary>
    public static class PdfTextExtractor
    {
        /// <summary>
        /// Extracts the text of all pages of a PDF file.
        /// </summary>
        /// <param name="path">the path of the PDF file.</param>
        /// <returns>the text of all pages, concatenated in page order.</returns>
        public static string pdfText(string path)
        {
            PdfReader reader = new PdfReader(path);
            try
            {
                System.Text.StringBuilder text = new System.Text.StringBuilder();
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    // Fully qualified on purpose: inside this class the simple
                    // name "PdfTextExtractor" refers to this class itself, which
                    // has no GetTextFromPage method.
                    text.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page));
                }
                return text.ToString();
            }
            finally
            {
                // Release the file even if extraction fails.
                reader.Close();
            }
        }
    }
}
看看Docotic.Pdf library。它不要求你打开你的应用程序的源代码(例如iTextSharp具有病毒性的AGPL 3许可证)。
Docotic.Pdf可用于阅读PDF文件并提取带或不带格式的文本。请查看显示how to extract text from PDFs的示例。
声明:我就职于该库的供应商 Bit Miracle。
- 1. 在.NET中读取文档
- 2. 在.NET中从PDF读取文本
- 3. 在.net中读取大型XML文档
- 4. 在.NET中创建PDF/DOCX/HTML文档
- 5. org.apache.pdfbox.pdmodel.PDDocument不加载/读取PDF文档
- 6. 在Android中阅读PDF文档
- 7. 如何从iPhone中的文档目录中读取pdf文件?
- 8. 有没有可能在iText库中读取pdf文档android
- 9. 在PDF/A文档中读取和写入xml元数据
- 10. 从iOS的pdf文档中读取文本和图像
- 11. 在.Net中原生读取Adobe Framemaker文档?
- 12. 如何在.NET中读取Microsoft Word文档?
- 13. 在Asp.net中从PDF中读取文本
- 14. 从PDF文档中提取文本 - C#
- 15. 使用.NET在PDF文档中提取标记为新版本的文本
- 16. 从文档中读取plist
- 17. 在C#中编程读取PDF文件#
- 18. 如何在android中读取pdf文件?
- 19. 在节点js中读取PDF文件
- 20. 获取PDF文档大纲
- 21. 读取文件在c#.net
- 22. 如何在.NET中的pdf文档中获取页面的方向?
- 23. 读取XML文档
- 24. 在PDF中添加pdf表格文档
- 25. Asp .NET从tar.gz存档中读取文件
- 26. 在ASP.NET中读取XML文档
- 27. 如何在C#中读取XML文档
- 28. 在php中读取word文档
- 29. 在xslt中读取xml文档
- 30. 如何在asp.net中读取word文档
Brock Nusser 提供的答案看起来是最新的解决方案,应当被视为这个问题的正确答案 – ceetheman 2018-01-11 15:38:47