2014-01-06 46 views
2

我怎样才能用 JavaScript 手动把 PDF 文档解析成单词数组?我不关心图像、数字、表格——只需要单词,这样我就可以把它当作 JavaScript 对象来使用。该怎么做?如何把 PDF 解析为 JavaScript 数组?

+0

你想在服务器端还是客户端做这件事?如果是客户端,你如何获取文件?没有跨域问题吗? –

+0

是,客户端) –

+0

@RuzelDavletyarov PDF 文件是已经放在你的网站上,还是由用户上传的? – Cilan

回答

0

假设你已经拿到了 PDF 文件的内容,你可以下载 pdftotext,然后使用下面的代码:

/**
 * Page-side helper that turns PDF bytes into plain text.
 *
 * On construction it registers a "message" listener on `window`, logs
 * "Ready" and posts the string "ready" to the parent frame. Rendering uses
 * the `PDFJS.PDFDoc` / `page.startRendering` calls of the original snippet
 * (presumably an early pdf.js release -- confirm against the bundled version).
 *
 * All members are attached to `this`, so call it as a constructor: `new App()`.
 */
function App() {
    var self = this;

    // Number of pages whose render callback has fired during the current run.
    this.complete = 0;

    /**
     * Renders every page of the document and, once the last render callback
     * has fired, posts the text found in the `.textLayer` elements.
     *
     * @param {ArrayBuffer} data - Raw PDF bytes handed to `PDFJS.PDFDoc`.
     */
    this.pdfToText = function(data) {
        var pdf = new PDFJS.PDFDoc(data);
        var total = pdf.numPages;

        // The original appended canvases to an undeclared `div`. Keep using a
        // page-provided `div` when one exists, otherwise fall back to <body>.
        var container = (typeof div !== 'undefined' && div) ? div : document.body;

        // Start counting from zero so a second document can still reach `total`.
        self.complete = 0;

        // Shared render callback: only the last page to finish extracts text.
        var onPageRendered = function() {
            if (++self.complete !== total) {
                return;
            }
            self.setMessage("Finished rendering. Extracting text...");

            // Extraction is deferred by one second, as in the original.
            window.setTimeout(function() {
                var layers = [];
                // NOTE: this matches every .textLayer in the document, not
                // only the ones created by this call.
                var nodes = document.querySelectorAll(".textLayer > div");
                for (var j = 0; j < nodes.length; j++) {
                    layers.push(nodes[j].textContent + "\n");
                }
                self.sendOutput(layers.join("\n"));

                self.setMessage("Done!");
            }, 1000);
        };

        // `var i` keeps the index local (it was an implicit global before).
        for (var i = 1; i <= total; i++) {
            var page = pdf.getPage(i);

            var canvas = document.createElement('canvas');
            canvas.id = 'page' + i;
            canvas.mozOpaque = true;
            container.appendChild(canvas);

            canvas.width = page.width;
            canvas.height = page.height;

            // Paint a white background before the page is drawn.
            var context = canvas.getContext('2d');
            context.save();
            context.fillStyle = 'rgb(255, 255, 255)';
            context.fillRect(0, 0, canvas.width, canvas.height);
            context.restore();

            self.setMessage("Rendering...");

            // Passed to startRendering as its third argument; the extraction
            // step later reads the <div> children of these elements.
            var textLayer = document.createElement('div');
            textLayer.className = 'textLayer';
            document.body.appendChild(textLayer);

            page.startRendering(context, onPageRendered, textLayer);
        }
    };

    /**
     * "message" event handler: accepts PDF bytes posted by the parent frame.
     * Messages from any other source are ignored.
     *
     * @param {MessageEvent} event - Event whose `data` should be an ArrayBuffer.
     */
    this.receiveInput = function(event) {
        if (event.source !== parent) return;
        // Also guards against null/undefined data, which used to throw here.
        if (!event.data || !event.data.byteLength) {
            return alert("The PDF data needs to be an ArrayBuffer");
        }
        self.setMessage("Received data");
        self.pdfToText(event.data);
    };

    /**
     * Posts `text` to the parent frame with target origin "*".
     *
     * @param {string} text - Payload to send.
     */
    this.sendOutput = function(text) {
        var recipient = parent.postMessage
            ? parent
            : (parent.document.postMessage ? parent.document : undefined);
        // Nothing to post to: skip instead of throwing on `undefined`.
        if (!recipient) return;
        recipient.postMessage(text, "*");
    };

    /**
     * Logs a status message to the console as an array of its
     * space-separated words.
     *
     * @param {string} text - Status message.
     */
    this.setMessage = function(text) {
        console.log(text.split(' '));
    };

    window.addEventListener("message", self.receiveInput, true);
    self.setMessage("Ready");
    self.sendOutput("ready");
}

然后把你的 input 元素的 onchange 设为 App()。

DEMO(在某些浏览器中无法运行)。这会把从 PDF 中提取的单词数组(含标点符号)输出到控制台。

+0

不知为何,我没能取到文件中的文字 :( –

0

使用pdf.js,我这样做:

// Load pdf.js. The code below uses the global `PDFJS`, not this binding
// (NOTE(review): presumably the module registers that global -- confirm for
// the pdf.js build in use).
var pdf = require("pdf.js");

// Read page 1 of document.pdf and collect the `str` of every text item.
// Each step returns its promise so a failure anywhere reaches the final
// rejection handler instead of being dropped.
PDFJS.getDocument('document.pdf')
    .then(function(doc) {
        return doc.getPage(1);
    })
    .then(function(page) {
        return page.getTextContent();
    })
    .then(function(txt) {
        // The text items live on `txt.items`; the original referenced an
        // undeclared `items` and threw a ReferenceError.
        var arrayOfText = txt.items.map(function(item) {
            return item.str;
        });
        // Consume arrayOfText here; it is also the chain's resolved value.
        return arrayOfText;
    })
    .catch(function(err) {
        console.error("Failed to extract text from document.pdf:", err);
    });