2012-01-07 122 views
3

我使用Pdfstamper在pdf上添加了水印。这里是代码:使用iTextSharp从PDF中删除水印

// Stamp the same rotated, semi-transparent text onto every page of the document.
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
{
    // Page size as reported with rotation applied; its midpoint is where the text is anchored.
    iTextSharp.text.Rectangle pageSize = reader.GetPageSizeWithRotation(pageNumber);
    float centerX = pageSize.Width/2;
    float centerY = pageSize.Height/2;

    // Content layer obtained from the stamper's "under content" for this page.
    PdfContentByte canvas = stamper.GetUnderContent(pageNumber);

    BaseFont watermarkFont = BaseFont.CreateFont(BaseFont.HELVETICA, BaseFont.CP1252,
        BaseFont.NOT_EMBEDDED);
    canvas.SetFontAndSize(watermarkFont, watermarkFontSize);

    // The fill opacity is carried by a graphics-state object.
    PdfGState opacityState = new PdfGState { FillOpacity = watermarkFontOpacity };
    canvas.SetGState(opacityState);
    canvas.SetColorFill(iTextSharp.text.BaseColor.BLACK);

    canvas.BeginText();
    canvas.ShowTextAligned(PdfContentByte.ALIGN_CENTER, "LipikaChatterjee",
        centerX, centerY, watermarkRotation);
    canvas.EndText();
}

这工作正常。现在我想从我的pdf中删除这个水印。我看着iTextSharp,但无法得到任何帮助。我甚至尝试添加水印作为图层,然后删除图层,但无法从pdf中删除图层的内容。我查看了iText的图层删除功能,发现了一个OCGRemover类,但是我无法在iTextsharp中获得相应的类。

+1

如果你有一个过程,增加了水印,然后以最简单的做法是:当你不想水印, **不要添加**。 – 2012-01-07 09:42:41

+0

我需要删除已添加的水印 – Lipika 2012-01-07 11:18:01

+1

,通常意味着您想从别人的内容中删除水印...我错了吗? – 2012-01-07 11:26:27

回答

11

鉴于你提到“我甚至尝试添加水印作为图层”，我姑且相信你（疑点利益归于你），并假设你处理的是自己创建的内容，而不是试图去除别人内容上的水印。

PDF文件使用可选内容组(OCG)来存储对象作为层。如果您将水印文本添加到图层中,稍后可以很容易地将其删除。

下面的代码是一个完整可运行的 C# 2010 WinForms 应用程序，针对 iTextSharp 5.1.1.0。它基于 Bruno 的原始 Java 代码编写。代码分为三部分：第 1 部分创建一个示例 PDF；第 2 部分基于第 1 部分的文件创建一个新的 PDF，并在单独的图层上为每一页添加水印；第 3 部分基于第 2 部分的文件创建最终的 PDF，但移除了包含水印文本的图层。更多细节请参阅代码注释。

创建 PdfLayer 对象时，可以为它指定一个会在 PDF 阅读器中显示的名称。不幸的是，我找不到访问该名称的方法，因此下面的代码是在图层中查找实际的水印文本。如果你的 PDF 中没有使用其他图层，我建议只在内容流中查找 /OC，而不必费时去查找实际的水印文本。如果你找到了按名称查找 /OC 组的方法，请告诉我！

using System; 
using System.Windows.Forms; 
using System.IO; 
using iTextSharp.text; 
using iTextSharp.text.pdf; 

namespace WindowsFormsApplication1 {
    /// <summary>
    /// Demo form that, when loaded, writes three PDFs to the desktop: a plain sample,
    /// a copy with a watermark placed on its own optional-content layer (OCG), and a
    /// copy of that with the watermark content stripped again. The form then closes itself.
    /// </summary>
    public partial class Form1 : Form {
        public Form1() {
            InitializeComponent();
        }

        /// <summary>
        /// Runs the three demo stages in order and closes the form.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e) {
            string workingFolder = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
            string startFile = Path.Combine(workingFolder, "StartFile.pdf");
            string watermarkedFile = Path.Combine(workingFolder, "Watermarked.pdf");
            string unwatermarkedFile = Path.Combine(workingFolder, "Un-watermarked.pdf");
            string watermarkText = "This is a test";

            //SECTION 1: create a 5 page PDF, nothing special here
            CreateSamplePdf(startFile, 5);

            //SECTION 2: add the watermark on a separate layer
            AddWatermarkLayer(startFile, watermarkedFile, watermarkText);

            //SECTION 3: remove the layer content created above
            RemoveWatermarkLayer(watermarkedFile, unwatermarkedFile, watermarkText);

            this.Close();
        }

        /// <summary>
        /// Creates a PDF at <paramref name="path"/> with <paramref name="pageCount"/> letter-sized
        /// pages, each containing the line "This is page N".
        /// </summary>
        private static void CreateSamplePdf(string path, int pageCount) {
            using (FileStream fs = new FileStream(path, FileMode.Create, FileAccess.Write, FileShare.None)) {
                using (Document doc = new Document(PageSize.LETTER)) {
                    using (PdfWriter writer = PdfWriter.GetInstance(doc, fs)) {
                        doc.Open();

                        for (int i = 1; i <= pageCount; i++) {
                            doc.NewPage();
                            doc.Add(new Paragraph(String.Format("This is page {0}", i)));
                        }

                        doc.Close();
                    }
                }
            }
        }

        /// <summary>
        /// Copies <paramref name="sourcePath"/> to <paramref name="targetPath"/>, drawing
        /// <paramref name="watermarkText"/> on every page inside a PdfLayer named "WatermarkLayer".
        /// The only difference from a plain watermark is that the drawing commands are wrapped in
        /// BeginLayer/EndLayer, which binds them to the layer (an OCG, Optional Content Group).
        /// </summary>
        private static void AddWatermarkLayer(string sourcePath, string targetPath, string watermarkText) {
            PdfReader reader = new PdfReader(sourcePath);
            try {
                using (FileStream fs = new FileStream(targetPath, FileMode.Create, FileAccess.Write, FileShare.None)) {
                    using (PdfStamper stamper = new PdfStamper(reader, fs)) {
                        int pageCount = reader.NumberOfPages;
                        //Create a new layer
                        PdfLayer layer = new PdfLayer("WatermarkLayer", stamper.Writer);
                        //The font does not depend on the page, so it is created once
                        BaseFont font = BaseFont.CreateFont(BaseFont.HELVETICA, BaseFont.CP1252, BaseFont.NOT_EMBEDDED);

                        for (int i = 1; i <= pageCount; i++) {
                            iTextSharp.text.Rectangle rect = reader.GetPageSize(i);
                            //Get the ContentByte object
                            PdfContentByte cb = stamper.GetUnderContent(i);
                            //Tell the CB that the next commands should be "bound" to this new layer
                            cb.BeginLayer(layer);
                            cb.SetFontAndSize(font, 50);
                            PdfGState gState = new PdfGState();
                            gState.FillOpacity = 0.25f;
                            cb.SetGState(gState);
                            cb.SetColorFill(BaseColor.BLACK);
                            cb.BeginText();
                            cb.ShowTextAligned(PdfContentByte.ALIGN_CENTER, watermarkText, rect.Width/2, rect.Height/2, 45f);
                            cb.EndText();
                            //"Close" the layer
                            cb.EndLayer();
                        }
                    }
                }
            } finally {
                //The reader is not owned by the using blocks above, so release it explicitly
                reader.Close();
            }
        }

        /// <summary>
        /// Copies <paramref name="sourcePath"/> to <paramref name="targetPath"/>, emptying every page
        /// content stream that contains both the "/OC" token and <paramref name="watermarkText"/>.
        /// The reader is edited in memory first; an otherwise empty stamper then writes it out.
        /// </summary>
        private static void RemoveWatermarkLayer(string sourcePath, string targetPath, string watermarkText) {
            PdfReader reader = new PdfReader(sourcePath);
            try {
                //NOTE, This will destroy all layers in the document, only use if you don't have additional layers
                //Remove the OCG group completely from the document.
                //reader.Catalog.Remove(PdfName.OCPROPERTIES);

                //Clean up the reader, optional
                reader.RemoveUnusedObjects();

                int pageCount = reader.NumberOfPages;
                for (int i = 1; i <= pageCount; i++) {
                    PdfDictionary page = reader.GetPageN(i);
                    //Pages whose /Contents is not available as an array are skipped
                    PdfArray contentArray = page.GetAsArray(PdfName.CONTENTS);
                    if (contentArray == null) {
                        continue;
                    }

                    for (int j = 0; j < contentArray.Size; j++) {
                        //Get the raw byte stream
                        PRStream stream = (PRStream)contentArray.GetAsStream(j);
                        //Convert to a string. NOTE, you might need a different encoding here
                        string content = System.Text.Encoding.ASCII.GetString(PdfReader.GetStreamBytes(stream));
                        //Look for the OCG token in the stream as well as our watermarked text.
                        //Ordinal comparison: this is raw PDF syntax, not culture-sensitive text.
                        bool hasLayerToken = content.IndexOf("/OC", StringComparison.Ordinal) >= 0;
                        bool hasWatermark = content.IndexOf(watermarkText, StringComparison.Ordinal) >= 0;
                        if (hasLayerToken && hasWatermark) {
                            //Remove it by giving it zero length and zero data
                            stream.Put(PdfName.LENGTH, new PdfNumber(0));
                            stream.SetData(new byte[0]);
                        }
                    }
                }

                //Write the content out; the stamper has nothing to add, it only saves the edited reader
                using (FileStream fs = new FileStream(targetPath, FileMode.Create, FileAccess.Write, FileShare.None)) {
                    using (PdfStamper stamper = new PdfStamper(reader, fs)) {
                    }
                }
            } finally {
                //The reader is not owned by the using blocks above, so release it explicitly
                reader.Close();
            }
        }
    }
}
1

作为对 Chris 回答的补充，本帖底部附上了一个用于移除图层的 VB.Net 类，它的处理应该更精确一些。

  1. 它遍历 PDF 文件的图层列表（存储在文件目录 OCProperties 字典的 OCGs 数组中）。该数组包含指向 PDF 文件中各图层对象的间接引用，这些对象中保存着图层名称
  2. 它遍历页面的属性(也存储在字典中)以查找指向图层对象的属性(通过间接引用)
  3. 它对内容流进行实际解析，查找形如 /OC /{PagePropertyReference} BDC {Actual Content} EMC 的片段，因此可以只移除这些相应的片段

随后，代码会尽可能清除所有相关引用。调用代码大致如下：

''' <summary>
''' Reads the PDF at <paramref name="path"/>, removes the layer named "WatermarkLayer"
''' and writes the result to <paramref name="savePath"/>.
''' </summary>
Public Shared Sub RemoveWatermark(path As String, savePath As String)
    ' One Using statement with four resources; they are disposed in reverse order,
    ' so the remover is disposed before the stamper, exactly as with nested blocks.
    Using pdf = New PdfReader(path),
          output As New FileStream(savePath, FileMode.Create, FileAccess.Write, FileShare.None),
          writer As New PdfStamper(pdf, output),
          layerRemover As New PdfLayerRemover(pdf)
        layerRemover.RemoveByName("WatermarkLayer")
    End Using
End Sub

完整的类：

Imports iTextSharp.text 
Imports iTextSharp.text.io 
Imports iTextSharp.text.pdf 
Imports iTextSharp.text.pdf.parser 

''' <summary>
''' Removes named optional-content layers (OCGs) from the document held by a PdfReader.
''' Layer names are queued with <see cref="RemoveByName"/>; the actual removal is performed
''' when the instance is disposed, so it is meant to be used inside a Using block.
''' </summary>
Public Class PdfLayerRemover
    Implements IDisposable

    Private _reader As PdfReader
    Private _layerNames As New List(Of String)

    ''' <summary>Creates a remover that edits <paramref name="reader"/> in place.</summary>
    ''' <exception cref="ArgumentNullException"><paramref name="reader"/> is Nothing.</exception>
    Public Sub New(reader As PdfReader)
        If reader Is Nothing Then Throw New ArgumentNullException("reader")
        _reader = reader
    End Sub

    ''' <summary>Queues the layer called <paramref name="name"/> for removal on dispose.</summary>
    Public Sub RemoveByName(name As String)
        _layerNames.Add(name)
    End Sub

    ''' <summary>
    ''' Strips the queued layers: finds their OCG objects in the catalog, removes the matching
    ''' marked-content groups from every page's content streams, drops the page properties
    ''' that pointed at them and removes the catalog references.
    ''' </summary>
    Private Sub RemoveLayers()
        Dim ocProps = _reader.Catalog.GetAsDict(PdfName.OCPROPERTIES)
        If ocProps Is Nothing Then Return
        Dim ocgs = ocProps.GetAsArray(PdfName.OCGS)
        If ocgs Is Nothing Then Return

        'Collect the object numbers of the layers whose name was queued.
        'Entries that are not indirect references, or that have no name, are ignored.
        Dim layerRefNumbers As New List(Of Integer)
        For Each entry As PdfObject In ocgs
            Dim layerRef = TryCast(entry, PdfIndirectReference)
            If layerRef Is Nothing Then Continue For
            Dim layerDict = TryCast(PdfReader.GetPdfObject(layerRef), PdfDictionary)
            If layerDict Is Nothing Then Continue For
            Dim layerName = layerDict.GetAsString(PdfName.NAME)
            If layerName IsNot Nothing AndAlso _layerNames.Contains(layerName.ToString) Then
                layerRefNumbers.Add(layerRef.Number)
            End If
        Next

        'Loop through the pages
        For i As Integer = 1 To _reader.NumberOfPages
            Dim page = _reader.GetPageN(i)

            'A page without resources or without a /Properties dictionary has nothing to remove
            Dim resources = _reader.GetPageResources(i)
            Dim props As PdfDictionary = Nothing
            If resources IsNot Nothing Then props = resources.GetAsDict(PdfName.PROPERTIES)
            If props Is Nothing Then Continue For

            'Get the page properties which reference the layers to remove
            Dim propsToRemove As New List(Of PdfName)
            For Each key As PdfName In props.Keys
                Dim target = props.GetAsIndirectObject(key)
                If target IsNot Nothing AndAlso layerRefNumbers.Contains(target.Number) Then
                    propsToRemove.Add(key)
                End If
            Next
            If propsToRemove.Count = 0 Then Continue For

            'Property names without the leading "/", materialised once for all streams
            Dim propNames = (From p In propsToRemove Select p.ToString.Substring(1)).ToList

            For Each stream In GetContentStreams(page)
                'Parse the stream data looking for references to a property pointing to the layer.
                Dim streamData = PdfReader.GetStreamBytes(stream)
                Dim newData = GetNewStream(streamData, propNames)

                'GetNewStream hands back the very same array when nothing was removed
                If newData IsNot streamData Then
                    stream.SetData(newData)
                    stream.Put(PdfName.LENGTH, New PdfNumber(newData.Length))
                End If
            Next

            'Remove the properties from the page data
            For Each prop In propsToRemove
                props.Remove(prop)
            Next
        Next

        'Remove references to the layer in the master catalog
        RemoveIndirectReferences(ocProps, layerRefNumbers)

        'Clean up unused objects
        _reader.RemoveUnusedObjects()
    End Sub

    ''' <summary>
    ''' Returns the content streams of <paramref name="page"/>, whether /Contents is an
    ''' array of streams or a single stream. Entries that are not PRStream are skipped.
    ''' </summary>
    Private Shared Function GetContentStreams(page As PdfDictionary) As List(Of PRStream)
        Dim streams As New List(Of PRStream)
        Dim contentArray = page.GetAsArray(PdfName.CONTENTS)
        If contentArray IsNot Nothing Then
            For j As Integer = 0 To contentArray.Size - 1
                Dim element = TryCast(contentArray.GetAsStream(j), PRStream)
                If element IsNot Nothing Then streams.Add(element)
            Next
        Else
            Dim onlyStream = TryCast(page.GetAsStream(PdfName.CONTENTS), PRStream)
            If onlyStream IsNot Nothing Then streams.Add(onlyStream)
        End If
        Return streams
    End Function

    ''' <summary>
    ''' Returns <paramref name="data"/> with every "/OC /{name} BDC ... EMC" group removed
    ''' whose property name is listed in <paramref name="propsToRemove"/>.
    ''' </summary>
    ''' <returns>
    ''' The same array instance when nothing was removed, otherwise a new array that is
    ''' exactly as long as the bytes that were kept.
    ''' </returns>
    Private Shared Function GetNewStream(data As Byte(), propsToRemove As IEnumerable(Of String)) As Byte()
        Dim names As New HashSet(Of String)(propsToRemove)
        If names.Count = 0 Then Return data

        'positions holds alternating boundaries: keep-start, keep-end, keep-start, keep-end, ...
        Dim positions As New List(Of Integer)
        positions.Add(0)

        Dim groupStart As Integer
        Dim inGroup As Boolean = False
        Dim depth As Integer = 0
        Dim tokenizer As New PRTokeniser(New RandomAccessFileOrArray(New RandomAccessSourceFactory().CreateSource(data)))
        While tokenizer.NextToken
            If tokenizer.TokenType = PRTokeniser.TokType.NAME AndAlso tokenizer.StringValue = "OC" Then
                'Step back over the three characters of "/OC" to get the start of the group
                groupStart = CInt(tokenizer.FilePointer - 3)
                If tokenizer.NextToken() AndAlso tokenizer.TokenType = PRTokeniser.TokType.NAME Then
                    If Not inGroup AndAlso names.Contains(tokenizer.StringValue) Then
                        inGroup = True
                        depth = 0
                        positions.Add(groupStart)
                    End If
                End If
            ElseIf inGroup AndAlso tokenizer.TokenType = PRTokeniser.TokType.OTHER Then
                'Count marked-content operators so that a nested BDC/BMC ... EMC pair inside
                'the layer does not end the removal early. The group's own BDC raises depth to 1.
                Select Case tokenizer.StringValue
                    Case "BDC", "BMC"
                        depth += 1
                    Case "EMC"
                        depth -= 1
                        If depth <= 0 Then
                            positions.Add(CInt(tokenizer.FilePointer))
                            inGroup = False
                        End If
                End Select
            End If
        End While

        'Only the initial 0 is present: no group matched
        If positions.Count = 1 Then Return data

        'When a group was never closed it is treated as running to the end of the stream,
        'so no trailing segment is kept; otherwise the tail after the last group is kept.
        If Not inGroup Then positions.Add(data.Length)

        Dim length As Integer = 0
        For i As Integer = 0 To positions.Count - 2 Step 2
            length += positions(i + 1) - positions(i)
        Next

        'VB array declarations take the upper bound, not the element count
        Dim newData(length - 1) As Byte
        Dim offset As Integer = 0
        For i As Integer = 0 To positions.Count - 2 Step 2
            Dim segmentLength = positions(i + 1) - positions(i)
            Array.Copy(data, positions(i), newData, offset, segmentLength)
            offset += segmentLength
        Next

        Return newData
    End Function

    ''' <summary>
    ''' Walks <paramref name="dict"/> recursively and removes, from every array found,
    ''' the indirect references whose object number is in <paramref name="refNumbers"/>.
    ''' </summary>
    Private Shared Sub RemoveIndirectReferences(dict As PdfDictionary, refNumbers As IEnumerable(Of Integer))
        For Each key In dict.Keys
            Dim childDict = dict.GetAsDict(key)
            Dim childArray = dict.GetAsArray(key)
            If childDict IsNot Nothing Then
                RemoveIndirectReferences(childDict, refNumbers)
            ElseIf childArray IsNot Nothing Then
                RemoveIndirectReferencesFromArray(childArray, refNumbers)
            End If
        Next
    End Sub

    ''' <summary>
    ''' Removes matching indirect references from <paramref name="items"/> and descends
    ''' into elements that are themselves direct dictionaries or arrays.
    ''' </summary>
    Private Shared Sub RemoveIndirectReferencesFromArray(items As PdfArray, refNumbers As IEnumerable(Of Integer))
        Dim i As Integer = 0
        While i < items.Size
            Dim indirect = items.GetAsIndirectObject(i)
            If indirect IsNot Nothing Then
                If refNumbers.Contains(indirect.Number) Then
                    'Do not advance: the next element has moved into slot i
                    items.Remove(i)
                    Continue While
                End If
            Else
                'Not a reference, so any dictionary or array here is a direct element
                Dim nestedDict = items.GetAsDict(i)
                Dim nestedArray = items.GetAsArray(i)
                If nestedDict IsNot Nothing Then
                    RemoveIndirectReferences(nestedDict, refNumbers)
                ElseIf nestedArray IsNot Nothing Then
                    RemoveIndirectReferencesFromArray(nestedArray, refNumbers)
                End If
            End If
            i += 1
        End While
    End Sub

#Region "IDisposable Support"
    Private disposedValue As Boolean ' To detect redundant calls

    ''' <summary>
    ''' On the first call with <paramref name="disposing"/> set, performs the queued layer removal.
    ''' </summary>
    Protected Overridable Sub Dispose(disposing As Boolean)
        If Not Me.disposedValue Then
            If disposing Then
                RemoveLayers()
            End If
        End If
        Me.disposedValue = True
    End Sub

    ''' <summary>Applies the queued removals to the reader; later calls do nothing.</summary>
    Public Sub Dispose() Implements IDisposable.Dispose
        Dispose(True)
        GC.SuppressFinalize(Me)
    End Sub
#End Region

End Class