2010-01-07 15 views
1

这是我的代码。我知道它看起来很糟糕,但除了最后一行之外,其余部分都能按预期工作……最后一行会抛出一个 Unicode 编码错误,我不明白原因。

import pyPdf 
import os 
import csv 

class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.

    Each row is first rendered as UTF-8 by a regular ``csv.writer`` into an
    in-memory queue, then decoded, re-encoded into the target encoding and
    written to ``f``.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        """Prepare the intermediate queue, the csv writer and the encoder.

        f        -- stream that receives the encoded output (``f.write`` is
                    called once per row)
        dialect  -- csv dialect handed to ``csv.writer``
        encoding -- name of the target encoding
        kwds     -- extra keyword arguments forwarded to ``csv.writer``
        """
        # Imported locally: the script's import block only provides pyPdf,
        # os and csv, so referring to these modules at module level raised
        # NameError as soon as the class was instantiated.
        # (cStringIO is a Python 2 module; this class targets Python 2.)
        import codecs
        import cStringIO

        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Write one row; every cell must provide ``encode`` (e.g. unicode)."""
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of ``rows`` in order via ``writerow``."""
        for row in rows:
            self.writerow(row)


    # NOTE(review): everything from here down is indented as if it belonged to
    # the class above, and the `for` statement further down has no indented
    # body. This looks like a paste artifact - confirm against the real script.

    # CSV writer appending to text.doc: space-delimited, every field quoted
    # with '|'. The file object opened here is never explicitly closed.
    PDFWriter = csv.writer(open('/home/nick/TAM_work/text/text.doc', 'a'), delimiter=' ', quotechar='|', quoting=csv.QUOTE_ALL) 

    def getPDFContent(path): 
     """Return the text of all pages of the PDF at `path` as one string.

     The text of each page is concatenated (newline-separated), non-breaking
     spaces are replaced by plain spaces, and every run of whitespace is
     collapsed to a single space.
     """
     content = "" 
     # Load PDF into pyPDF 
     # (the file object passed to the reader is never explicitly closed)
     pdf = pyPdf.PdfFileReader(file(path, "rb")) 
     # Iterate pages 
     for i in range(0, pdf.getNumPages()): 
      # Extract text from page and add to content 
      content += pdf.getPage(i).extractText() + "\n" 
     # Collapse whitespace 
     content = " ".join(content.replace(u"\xa0", " ").strip().split()) 
     return content 

    # Print the extracted text of every entry in the PDF directory.
    for word in os.listdir("/home/nick/TAM_work/TAM_pdfs"): 
    print getPDFContent("/home/nick/TAM_work/TAM_pdfs/" + word) 

    # This is the statement the reported traceback points at
    # (UnicodeEncodeError, 'ascii' codec): the extracted text is handed to
    # csv.writer without being encoded first.
    # NOTE(review): as pasted, this call is not part of the loop body, so it
    # would only see the last value of `word` - confirm intended placement.
    PDFWriter.writerow ([getPDFContent("/home/nick/TAM_work/TAM_pdfs/" + word)]) 

当我运行一切正常,直到它达到这个......

Traceback (most recent call last): 
    File "Saving_fuction_added.py", line 52, in <module> 
    PDFWriter.writerow ([getPDFContent("/home/nick/TAM_work/TAM_pdfs/" + word)]) 
UnicodeEncodeError: 'ascii' codec can't encode character u'\u2122' in position 81: ordinal not in range(128) 

希望能得到大家的帮助。谢谢各位。

Matt

+0

你的文件名里有非 ASCII 字符吗?我有点困惑,因为栈跟踪很短——它似乎表明错误出在列表表达式("TAM_pdfs/" + word)内部,而不是在 writerow() 函数内部? –

+0

我一开始也这么认为,但之后不会失败? – danben

+0

我尝试把 .doc 改成 .csv,并添加了如下代码: `try: x = unicode(value, "ascii")` / `except UnicodeError: value = unicode(value, "utf-8")` / `else: pass  # value 是有效的 ASCII 数据`,但这没有奏效。也许我的思路完全错了?我只需要把提取出的文本写入一个 csv 文件。把 `PDFWriter.writerow([getPDFContent("/home/nick/TAM_work/TAM_pdfs/" + word).encode("ascii", "ignore")])` 放进 for 循环之后,问题再次解决了。 – Matt

回答

1

下面是回答该问题的代码。但现在它只写入最后一个文件。

import pyPdf 
import os 
import csv 

class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.

    Each row is first rendered as UTF-8 by a regular ``csv.writer`` into an
    in-memory queue, then decoded, re-encoded into the target encoding and
    written to ``f``.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        """Prepare the intermediate queue, the csv writer and the encoder.

        f        -- stream that receives the encoded output (``f.write`` is
                    called once per row)
        dialect  -- csv dialect handed to ``csv.writer``
        encoding -- name of the target encoding
        kwds     -- extra keyword arguments forwarded to ``csv.writer``
        """
        # Imported locally: the script's import block only provides pyPdf,
        # os and csv, so referring to these modules at module level raised
        # NameError as soon as the class was instantiated.
        # (cStringIO is a Python 2 module; this class targets Python 2.)
        import codecs
        import cStringIO

        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Write one row; every cell must provide ``encode`` (e.g. unicode)."""
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of ``rows`` in order via ``writerow``."""
        for row in rows:
            self.writerow(row)


# Directory whose entries are read as PDFs, and the file the rows are appended to.
PDF_DIR = "/home/nick/TAM_work/TAM_pdfs"
OUTPUT_PATH = '/home/nick/TAM_work/text/text.doc'


def getPDFContent(path):
    """Return the text of all pages of the PDF at ``path`` as one string.

    The text of each page is joined with newlines, non-breaking spaces are
    replaced by plain spaces, and every run of whitespace is collapsed to a
    single space.
    """
    # The reader is handed the open stream, so keep the file open until every
    # page has been extracted; the ``with`` block then closes it (the original
    # never closed the handle).
    with open(path, "rb") as pdf_file:
        pdf = pyPdf.PdfFileReader(pdf_file)
        pages = [pdf.getPage(i).extractText() for i in range(pdf.getNumPages())]
    # Collapse whitespace; split() without arguments also drops leading and
    # trailing whitespace, so no separate strip() is needed.
    return " ".join("\n".join(pages).replace(u"\xa0", " ").split())


# The output file is closed (and flushed) when the block exits.
with open(OUTPUT_PATH, 'a') as out_file:
    PDFWriter = csv.writer(out_file, delimiter=' ', quotechar='|', quoting=csv.QUOTE_ALL)
    for word in os.listdir(PDF_DIR):
        # Extract once and reuse the text for both the console and the file.
        content = getPDFContent(os.path.join(PDF_DIR, word))
        print(content)
        # Write inside the loop so every PDF gets its own row; with the call
        # placed after the loop only the last file was written.
        # NOTE: encode("ascii", "ignore") avoids the UnicodeEncodeError by
        # silently dropping non-ASCII characters (e.g. u'\u2122').
        PDFWriter.writerow([content.encode("ascii", "ignore")])
+0

。 – Matt

-1

据我理解,你是把一个很大的数字放进了一个很小的变量,所以它才会抛出异常。

我向你推荐一个 C# 工具,它在处理 Unicode 方面做得很好,你可以在你的场景中使用:http://unicode.codeplex.com

我建议修改下面这一行:

for i in range(0, pdf.getNumPages()): 

如果 pdf.getNumPages() 大于 128,就对它加以限制。

+0

-1 OP 遇到的异常是 UnicodeEncodeError,只能非常牵强地把它描述成“把大数字放进小变量”,而且它与 PDF 文件的页数毫无关系。至于你那个没有说明细节的“工具”,你得先让 Python 用户相信它能提供 Python 标准 Unicode 功能之外的东西——但请不要把这番话当成继续发垃圾广告的邀请,恰恰相反。 –