import nltk 
import random 
from nltk.corpus import movie_reviews 
from nltk.classify.scikitlearn import SklearnClassifier 
import pickle 
import sys 
sys.getdefaultencoding() 
import os 

from sklearn.naive_bayes import MultinomialNB, BernoulliNB 
from sklearn.linear_model import SGDClassifier 

from nltk.classify import ClassifierI 
from statistics import mode 

from nltk.tokenize import word_tokenize 

class VoteClassifier(ClassifierI): 
    def __init__(self, *classifiers):  # was misspelled __int__, so it never ran 
        self._classifiers = classifiers 

    def classify(self, features): 
        votes = [] 
        for c in self._classifiers: 
            v = c.classify(features) 
            votes.append(v) 
        return mode(votes)  # majority vote across all classifiers 

    def confidence(self, features): 
        votes = [] 
        for c in self._classifiers: 
            v = c.classify(features) 
            votes.append(v) 

        # fraction of classifiers that agree with the majority vote 
        choice_votes = votes.count(mode(votes)) 
        conf = choice_votes / len(votes) 
        return conf 


short_pos = os.open("positive.txt", os.O_RDONLY).read() 
short_neg = os.open("negative.txt", os.O_RDONLY).read() 

documents = [ ] 

for r in short_pos.split('\n'): 
    documents.append((r, "pos")) 

for r in short_neg.split('\n'): 
    documents.append((r, "neg")) 

# tokenize once, outside the loop above 
all_words = [] 

short_pos_words = word_tokenize(short_pos) 
short_neg_words = word_tokenize(short_neg) 

for w in short_pos_words: 
    all_words.append(w.lower()) 

for w in short_neg_words: 
    all_words.append(w.lower()) 

all_words = nltk.FreqDist(all_words) 

word_features = list(all_words.keys())[:5000] 

def find_features(document): 
    words = set(document) 
    features = {} 
    for w in word_features: 
        features[w] = (w in words) 

    return features 

# print ((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 

featuresets = [(find_features(rev), category) for (rev, category) in documents] 
random.shuffle(featuresets) 

#training with increased data collection 
training_set = featuresets[:10000] 
#testing with increased data collection 
testing_set = featuresets[10000:] 

#define and train classifier 
classifier = nltk.NaiveBayesClassifier.train(training_set) 
#testing classifier 
#print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100) 
#show the 15 most valuable words when it comes to positive or negative reviews 
#classifier.show_most_informative_features(15) 
#saving classifier 
#save_classifier = open("naivebayes.pickle", "wb") 
#pickle.dump(classifier, save_classifier) 
#save_classifier.close() 

#loading classifier 
#classifier_f = open("naivebayes.pickle", "rb") 
#classifier = pickle.load(classifier_f) 
#classifier_f.close() 

print("Original Naive Bayes Alogrithm acurracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 
    classifier.show_most_informative_features(15) 

MNB_classifier = SklearnClassifier(MultinomialNB()) 
MNB_classifier.train(training_set) 
print("MultinomialNB accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100) 

BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) 
BernoulliNB_classifier.train(training_set) 
print("BernoulliNB accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100) 


SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) 
SGDClassifier_classifier.train(training_set) 
print("SGDClassifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100) 

#SVC_classifier = SklearnClassifier(SVC()) 
#SVC_classifier.train(training_set) 
#print("SVC accuracy percent:", nltk.classify.accuracy(SVC_classifier, testing_set)) 

voted_classifier = VoteClassifier(classifier, 
          SGDClassifier_classifier, 
          MNB_classifier, 
          BernoulliNB_classifier) 

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, training_set))*100) 

#print ("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence%:", voted_classifier.confidence(testing_set[0][0])*100) 
#print ("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence%:", voted_classifier.confidence(testing_set[1][0])*100) 
#print ("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence%:", voted_classifier.confidence(testing_set[2][0])*100) 
#print ("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence%:", voted_classifier.confidence(testing_set[3][0])*100) 
#print ("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence%:", voted_classifier.confidence(testing_set[4][0])*100) 
#print ("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence%:", voted_classifier.confidence(testing_set[5][0])*100) 

When I run the code above, I get the following AttributeError:

short_pos = os.open("positive.txt", os.O_RDONLY).read() 
AttributeError: 'int' object has no attribute 'read' 

Why does this error occur, and how can I avoid it in the future?

Answer


This happens because you are calling the .read() method on the return value of os.open(), which is an int (a file descriptor), not a file-like object.
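
For illustration, a minimal sketch of how the low-level os.open() API is actually meant to be used (not from the original post): the integer descriptor it returns is passed to os.read() or wrapped with os.fdopen(); it has no .read() method of its own.

import os

fd = os.open("positive.txt", os.O_RDONLY)   # fd is an int file descriptor
data = os.read(fd, 4096)                    # low-level read returns bytes, not str
os.close(fd)

# or wrap the descriptor in a regular file object that does have .read()
with os.fdopen(os.open("positive.txt", os.O_RDONLY), "r") as f:
    text = f.read()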

I think you meant to use a plain:

with open('filename.txt', 'r') as f: 
    text = f.read() 

or, if you really want a one-liner:

text = open('filename.txt', 'r').read() 

So these two lines:

short_pos = os.open("positive.txt", os.O_RDONLY).read() 
short_neg = os.open("negative.txt", os.O_RDONLY).read() 

should be changed to:

with open("positive.txt", 'r') as f: 
    short_pos = f.read() 

with open("negative.txt", 'r') as f: 
    short_neg = f.read() 

Also, instead of reading the whole file's contents and then splitting them on '\n' like this:

for r in short_pos.split('\n'): # This .split() 
    documents.append((r, "pos")) 

for r in short_neg.split('\n'): # And this .split() 
    documents.append((r, "neg")) 

it would be a better idea to read the file with .readlines() in the first place, rather than read() followed by str.split(). The former returns a list of lines straight from the file stream, and you don't have to worry about the different line-ending conventions used by different operating systems.
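
For example, a minimal sketch of that approach, assuming positive.txt and negative.txt hold one review per line as in the question:

documents = []

with open("positive.txt", "r") as f:
    for line in f.readlines():                 # one string per line
        documents.append((line.strip(), "pos"))  # strip() drops the trailing newline

with open("negative.txt", "r") as f:
    for line in f.readlines():
        documents.append((line.strip(), "neg"))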


Thank you very much. But now I get this error: return codecs.ascii_decode(input, self.errors)[0] UnicodeDecodeError: 'ascii' codec can't decode byte 0xf3 in position 4645: ordinal not in range(128)


@A.Lona It doesn't seem to come from the code you posted, so I can't be sure, but I have a hunch you are trying to decode something that has already been decoded as ascii/utf. If the input variable comes from the file you are opening: had you opened it with os.open(), the output would have been a byte string that you'd have to decode with some codec. You are now opening the file with a plain open(), so the output is already a standard string that needs no decoding. Can you post the full stack trace?
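
A common fix for that kind of UnicodeDecodeError is to pass an explicit encoding to open() rather than relying on the platform default. A sketch, assuming the review files are Latin-1 encoded (byte 0xf3 is valid Latin-1 but not ASCII):

# pass an explicit encoding so Python does not fall back to the ascii codec
with open("positive.txt", "r", encoding="latin-1") as f:   # assumed encoding
    short_pos = f.read()

# or, to skip undecodable bytes instead of guessing the codec:
with open("negative.txt", "r", encoding="utf-8", errors="ignore") as f:
    short_neg = f.read()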