2016-07-03 35 views
0

我想使用 pdfminer 逐页提取 PDF 的文本,并把每一页的结果分别存入字典,代码如下:

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 
from pdfminer.converter import TextConverter 
from pdfminer.layout import LAParams 
from pdfminer.pdfpage import PDFPage 
from cStringIO import StringIO 
import re 

def convert_pdf_to_txt(path):
    """Extract the text of every page of a PDF into a dict.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        A dict mapping 1-based page numbers to the text of that page only,
        with every run of spaces collapsed to a single space.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    # NOTE(review): maxpages=0 and an empty pagenos set presumably mean
    # "no limit / all pages" in pdfminer -- confirm against its docs.
    maxpages = 0
    caching = True
    pagenos = set()
    ps = dict()
    try:
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(
                fp, pagenos, maxpages=maxpages, password=password,
                caching=caching, check_extractable=True)
            for i, page in enumerate(pages, 1):
                interpreter.process_page(page)
                ps[i] = re.sub(' +', ' ', retstr.getvalue())
                # The converter keeps appending to the same buffer, so it
                # must be emptied after each page; otherwise every entry
                # would also contain the text of all preceding pages.
                retstr.seek(0)
                retstr.truncate()
    finally:
        # Release the converter and its buffer even if parsing fails.
        device.close()
        retstr.close()
    return ps

# Print the entry stored under key 3 of the dict returned for 'Aak.pdf'.
print convert_pdf_to_txt('Aak.pdf')[3] 

但无论我访问哪一页,得到的都是该页及其之前所有页面的文本。请告诉我如何解决这个问题?

回答

0

这应该有效:每处理完一页就重新创建解释器和输出缓冲区,这样各页的文本就不会累积在一起。

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 
from pdfminer.converter import TextConverter 
from pdfminer.layout import LAParams 
from pdfminer.pdfpage import PDFPage 
from cStringIO import StringIO 

import os 

def set_interpreter():
    """Build a fresh pdfminer text-extraction pipeline.

    Returns:
        A dict with three entries:
        'retstr' -- the StringIO buffer handed to the converter as its
        output, 'device' -- the TextConverter writing into that buffer,
        'interpreter' -- the PDFPageInterpreter bound to that converter.
    """
    manager = PDFResourceManager()
    buf = StringIO()
    converter = TextConverter(
        manager, buf, codec='utf-8', laparams=LAParams())
    return {
        'retstr': buf,
        'device': converter,
        'interpreter': PDFPageInterpreter(manager, converter),
    }

def convert_pdf_to_txt(path):
    """Write each page of a PDF to its own text file and return all text.

    Page N (0-based) is written to 'pagetext_N.txt' in the current working
    directory. A new interpreter/buffer pair is created for every page so
    that each file holds the text of exactly one page.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order.
    """
    password = ""
    # NOTE(review): maxpages=0 and an empty pagenos set presumably mean
    # "no limit / all pages" in pdfminer -- confirm against its docs.
    maxpages = 0
    caching = True
    pagenos = set()
    page_texts = []

    with open(path, 'rb') as fp:
        pages = PDFPage.get_pages(
            fp, pagenos, maxpages=maxpages, password=password,
            caching=caching, check_extractable=True)
        for page_number, page in enumerate(pages):
            si = set_interpreter()
            try:
                si['interpreter'].process_page(page)
                # Read the buffer before it is closed below.
                page_text = si['retstr'].getvalue()
            finally:
                # Close the pipeline that actually processed this page.
                si['device'].close()
                si['retstr'].close()

            with open('pagetext_%d.txt' % page_number, 'w+') as fpp:
                fpp.write(page_text)
            page_texts.append(page_text)

    return ''.join(page_texts)

# Build an absolute path to 'filename.pdf' (resolved against the current
# working directory) and print what convert_pdf_to_txt returns for it.
print convert_pdf_to_txt(os.path.dirname(os.path.realpath('filename.pdf')) + "/filename.pdf")