I have written Python code for sentiment analysis of movie reviews, but I am getting an error about a wrong directory:

import re 
import nltk 
from multiprocessing import Pool 
import numpy as np 
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer 
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.svm import LinearSVC 
from sklearn.naive_bayes import GaussianNB 
from nltk.stem.porter import PorterStemmer 
from bs4 import BeautifulSoup 



def lemmatize(l): 
    # proc = CoreNLP("pos", corenlp_jars=["stanford-corenlp-full-2015-04-20/*"], UnicodeDecodeError='skip') 
    lmtzr = WordNetLemmatizer() 
    ''' for i in xrange(0, len(l)): 
     for j in xrange(0, len(l[i])): 
      l[i][j] = lmtzr.lemmatize(l[i][j]) 
      l[i][j] = lmtzr.lemmatize(l[i][j],'v') 
    ''' 

    for i in xrange(0, len(l)): 
     words = l[i].split() 
     word = [lmtzr.lemmatize(lmtzr.lemmatize(w, 'v')) for w in words] 
     # write the lemmatized sentence back into the list 
     l[i] = " ".join(word) 
    return l 


# input: a list l of string 
# output: a list containing the stemmed string in l 
def stem(l): 
    stmr = PorterStemmer() 

    for i in xrange(0, len(l)): 
     words = l[i].split() 
     meaningful = [stmr.stem(w) for w in words] 
     l[i] = " ".join(meaningful) 

    return l 

# input: a list l of string 
# output: a list of string where the stopwords are removed 

def removeStopwords(l): 

    stops = set(stopwords.words("english")) 

    for i in xrange(0, len(l)): 
     words = l[i].lower().split() 
     meaningful = [w for w in words if not w in stops] 
     l[i] = " ".join(meaningful) 

    return l 

# input: a list l of string 
# output: a matrix where the (i,j) component is how many times 
#   the j-th word appear in the i-th document 
def tf(l): 

    result = [[]] 
    vectorizer = CountVectorizer(analyzer = "word", 
          tokenizer = None,  
          preprocessor = None, 
          stop_words = None)      
    result = vectorizer.fit_transform(l).toarray() 
    ''' 
     your code goes here... 
    ''' 

    return result 

# input: a list l of string 
# output: a matrix where the (i,j) component is the tf-idf value of the j-th word in the i-th document 
def tfidf(l): 

    result = [[]] 
    tf_ = tf(l) 
    #print(tf_[2]) 
    vectorizer = TfidfVectorizer(smooth_idf = False) 
    vectorizer.fit_transform(l) 
    idf = vectorizer.idf_ 
    idf = idf -1 
    # scikit learn idf implementation see line 993 below  
    # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py 

    #print(idf[2]) 
    result = tf_*idf 


    return result 

# add any additional preprocessing you find helpful 
def additional(l): 
    result = [] 

    ''' 
     your code goes here... 
    ''' 

    return result 

# input: a list l of string 
# output: a feature matrix like object ready for training (2-D list, numpy array, sparse matrix) 
# you may choose to use a subset of the previous functions that work best for you 
def preprocess(l): 
    print('preprocess done') 
    removeStopwords(l) 

    # print(l[1]) 
    lemmatize(l) 
    #stem(l) 

    return l 

# train_X: feature matrix for training 
# train_t: list of labels for training 
# val_X: feature matrix for validation 
# val_t: list of labels for validation 
# just print out your results, no need to return any value 
def sec2c(train_X, train_t, val_X, val_t): 

    cvalue = [0.001, 0.01, 0.1, 1, 10, 100, 1000] 
    print('LOGREG result:') 
    for c in cvalue: 
     logreg = LogisticRegression(C=c) 
     a = logreg.fit(train_X, train_t).score(val_X, val_t) 
     print(a,c) 


    cvalue = [0.001, 0.01, 0.1, 1, 10, 100] 
    print('SVM result') 
    for c in cvalue: 
     svm = LinearSVC(C= c) 
     a = svm.fit(train_X, train_t).score(val_X, val_t) 
     print(a,c) 



    print('NB result') 
    array = np.asarray(train_X) 
    array[array==0]=1e9 
    train_X = array.tolist() 

    array = np.asarray(val_X) 
    array[array==0]=1e9 
    val_X = array.tolist() 

    n = int(len(train_X)/8) 
    nb = GaussianNB() 
    a = nb.fit(train_X,train_t).score(val_X,val_t) 
    print(a) 
    return 
# input train_text, vali_text, test_text: each being a list of strings 
#  train_labels, vali_labels: each being a list of labels 
# 

def sec2di(train_X, train_t, val_X, val_t, tf= False): 

    if tf: 
     print('Using TF') 
    else: 
     print('Using TF-IDF') 
    cvalue = [0.001, 0.01, 0.1, 1, 10, 100, 1000] 
    print('LOGREG result:') 
    for c in cvalue: 
     logreg = LogisticRegression(C=c) 
     a = logreg.fit(train_X, train_t).score(val_X, val_t) 
     print(a,c) 

def useWord2vec(train_text, train_labels, vali_text, vali_labels, test_text): 

# from gensim.models import Word2Vec 

    # merge your texts here 

    # train your word2vec here 

    # train your classifiers here 
    return 0 
def parse(doc,text, label, test= False): 

    if test: 
     for sentence in doc: 
      review = BeautifulSoup(sentence).get_text() 
      if len(review)>0: 
       letters_only = re.sub("[^a-zA-Z]"," ",review) 
       text.append(letters_only) 
    else: 
     for sentence in doc: 
      review = BeautifulSoup(sentence).get_text() 
      if len(review)>0: 
       if review[0:1] == '+': 
        label.append(1) 
       else: 
        label.append(-1)  
       review = review[3:] 
       letters_only = re.sub("[^a-zA-Z]"," ",review) 
       text.append(letters_only) 

def main(): 
# read data and extract texts and labels 
    pool = Pool(processes=3) 
    train = open('small_train.txt', 'r') 

    # do preprocessing 
    trainSentences = re.split(r'\n', train.read()) 
    trainLabel = [] 
    trainText = [] 

    valid = open('small_valid.txt', 'r') 

    validSentences = re.split(r'\n', valid.read()) 
    validLabel = [] 
    validText = [] 

    test = open('small_test.txt', 'r')  
    testSentences = re.split(r'\n', test.read()) 
    testLabel = [] 
    testText = [] 

    parse(trainSentences, trainText, trainLabel) 
    print('parsed train') 
    parse(validSentences, validText, validLabel) 
    print('parsed valid') 
    parse(testSentences, testText, testLabel, test=True) 
    print('parsed test') 

    pool.map(preprocess, [trainText, validText, testText]) 
    ''' 
    preprocess(trainText) 
    print('preprocesed train') 
    preprocess(validText) 
    print('preprocesed valid') 
    preprocess(testText) 
    print('preprocesed test') 
    ''' 


    #ts = tfidf(trainText) 
    # print(ts[2]) 
    # print(trainText[1]) 
    # train the model 

    # make predictions and save them 
    return 0 

if __name__ == '__main__': 

    main() 

But I get the following error:

Traceback (most recent call last):
  File "C:\Users\jre\Desktop\SentimentAnalysis-master\SentimentAnalysis-master\main.py", line 261, in <module>
    main()
  File "C:\Users\jre\Desktop\SentimentAnalysis-master\SentimentAnalysis-master\main.py", line 222, in main
    valid = open('small_valid.txt', 'r')
IOError: [Errno 2] No such file or directory: 'small_valid.txt'

Can you help me solve this problem?


What part of the error do you not understand? 'IOError: [Errno 2] No such file or directory: 'small_valid.txt'' means you are trying to read a file that does not exist. – That1Guy


I do have small_valid.txt, and I put it in the SentimentAnalysis-master folder. But I don't know why it isn't working (I'm really new to Python). –


Are you sure the file is available in the working directory the script is run from? You could try catching the IOError and creating the file if it does not exist, then continuing normally. That would also show you where the program expects the file to be. –
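
A minimal sketch of that suggestion (the create-on-missing fallback is an assumption, not from the original thread):

import os

print(os.getcwd())  # shows the working directory the script is actually run from

try:
    valid = open('small_valid.txt', 'r')
except IOError:
    # the file is not in the working directory: create an empty placeholder
    # (or change the path to point at the real file)
    valid = open('small_valid.txt', 'w+')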

Answer


The error is clear: No such file or directory: 'small_valid.txt'. Move the file to this path:

C:\Users\jre\Desktop\SentimentAnalysis-master\SentimentAnalysis-master 

or update the following lines of code to use an absolute path:

train = open('C:\..path_to_file..\small_train.txt', 'r') 

valid = open('C:\..path_to_file..\small_valid.txt', 'r')
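
If you hard-code a Windows path like that, use a raw string (for example r'C:\path\to\small_valid.txt') or double backslashes, so the backslashes are not treated as escape sequences. Another option, a minimal sketch assuming the data files sit in the same folder as main.py, is to build the paths from the script's own location so they no longer depend on the working directory:

import os

# assumes small_train.txt and small_valid.txt live next to main.py
base_dir = os.path.dirname(os.path.abspath(__file__))

train = open(os.path.join(base_dir, 'small_train.txt'), 'r')
valid = open(os.path.join(base_dir, 'small_valid.txt'), 'r')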