2017-07-17 62 views
0

我有一个已经训练好的分类器,它一直工作正常。(问题标题:Python 机器学习——训练分类器(Classifier)时报错“索引越界”)

我试图修改它,用循环来处理多个 .csv 文件,但这个改动把它弄坏了:就连原来的代码(之前一直工作正常)现在处理以前毫无问题的 .csv 文件时,也会返回同样的错误。

我非常困惑,想不出是什么原因让这个错误突然出现,因为在此之前一切都正常。原始的(可以正常工作的)代码如下:

# -*- coding: utf-8 -*- 

    # Python 2 script (print statements, raw_input). It loads one CSV of
    # named rows, runs a previously saved classifier over every row, asks the
    # user about names it has not seen yet, refits the classifier on the
    # resulting labels and writes a plain-text report.
    import csv 
    import pandas 
    import numpy as np 
    import sklearn.ensemble as ske 
    import re 
    import os 
    import collections 
    import pickle 
    from sklearn.externals import joblib 
    from sklearn import model_selection, tree, linear_model, svm 


    # Load the dataset and use the 'Name' column as the row index, so that
    # `index` in the loop below is the row's name.
    url = 'test_6_During_100.csv' 
    dataset = pandas.read_csv(url) 
    dataset.set_index('Name', inplace = True) 
    ##dataset = dataset[['ProcessorAffinity','ProductVersion','Handle','Company', 
    ##   'UserProcessorTime','Path','Product','Description',]] 

    # Open the report file; its name is the CSV name with the trailing
    # ".csv" removed plus " output report".
    new_url = re.sub('\.csv$', '', url) 
    f = open(new_url + " output report", 'w') 
    f.write(new_url + " output report\n") 
    f.write("\n") 


    # Print the dataset shape and record it in the report.
    print(dataset.shape) 
    print("\n") 
    f.write("Dataset shape " + str(dataset.shape) + "\n") 
    f.write("\n") 

    # Load the saved classifier from classifier/classifier.pkl, resolved
    # relative to the directory containing this script.
    clf = joblib.load(os.path.join(
      os.path.dirname(os.path.realpath(__file__)), 
      'classifier/classifier.pkl')) 


    # Names collected per class; a name is appended once per row it appears
    # on, so duplicates are kept and counted in the report below.
    Class_0 = [] 
    Class_1 = [] 
    # `prob` is never used in this listing.
    prob = [] 

    # Predict each row on its own. Only rows predicted as 0 are examined;
    # rows with any other prediction are not added to either list.
    for index, row in dataset.iterrows(): 
     res = clf.predict([row]) 
     if res == 0: 
      # NOTE(review): `malware` is not assigned anywhere in this listing --
      # presumably Class_0 was intended; confirm.
      if index in malware: 
       Class_0.append(index) 
      elif index in Class_1: 
       Class_1.append(index)   
      else: 
       # Unknown name: ask the user. Exactly "No" puts it in Class_0,
       # any other answer puts it in Class_1.
       print "Is ", index, " recognised?" 
       designation = raw_input() 

       if designation == "No": 
        Class_0.append(index) 
       else: 
        Class_1.append(index) 

    # Label every row 1, then relabel as 0 the rows whose name matches the
    # pattern built by joining the Class_0 names with '|'.
    # NOTE(review): the names are not regex-escaped, and an empty Class_0
    # gives the pattern '' -- verify how str.contains treats both cases.
    dataset['Type'] = 1      
    dataset.loc[dataset.index.str.contains('|'.join(Class_0)), 'Type'] = 0 

    print "\n" 

    results = [] 

    # Store the names of the rows labelled 0 as OrderedDict keys (first
    # occurrence order, duplicates collapsed).
    results.append(collections.OrderedDict.fromkeys(dataset.index[dataset['Type'] == 0])) 
    print (results) 

    # Features are every column except 'Type'; 'Type' holds the labels.
    X = dataset.drop(['Type'], axis=1).values 
    Y = dataset['Type'].values 


    # Raise n_estimators by 40 with warm_start enabled, refit on this file's
    # data and overwrite the saved classifier.
    # NOTE(review): the model is loaded from the script's directory but
    # dumped to a path relative to the current working directory.
    # NOTE(review): if Y holds only one class, this refit may leave the saved
    # model unable to predict afterwards -- confirm before refitting.
    clf.set_params(n_estimators = len(clf.estimators_) + 40, warm_start = True) 
    clf.fit(X, Y) 
    joblib.dump(clf, 'classifier/classifier.pkl') 

    # Report how many times each name was put in Class_0.
    output = collections.Counter(Class_0) 

    print "Class_0; \n" 
    f.write ("Class_0; \n") 

    for key, value in output.items():  
     f.write(str(key) + " ; " + str(value) + "\n") 
     print(str(key) + " ; " + str(value)) 

    print "\n" 
    f.write ("\n") 

    # Same report for Class_1.
    output_1 = collections.Counter(Class_1) 

    print "Class_1; \n" 
    f.write ("Class_1; \n") 

    for key, value in output_1.items():  
     f.write(str(key) + " ; " + str(value) + "\n") 
     print(str(key) + " ; " + str(value)) 

    print "\n" 

    f.close() 

我的新代码基本相同,只是被包在两层嵌套循环里,这样只要文件夹中还有待处理的文件,脚本就会持续运行。新的代码(即导致错误的代码)如下:

# -*- coding: utf-8 -*- 

# Python 2 script (print statements, raw_input). For every file in the
# current directory named processes_*.csv it classifies each row with a saved
# classifier, asks the user about names it has not seen yet, refits and
# re-saves the classifier, then deletes the file. The report-writing code
# sits after the while loop.
import csv 
import pandas 
import numpy as np 
import sklearn.ensemble as ske 
import re 
import os 
import time 
import collections 
import pickle 
from sklearn.externals import joblib 
from sklearn import model_selection, tree, linear_model, svm 

# Lists that accumulate names across ALL processed files; they are reported
# at the end. `prob` is never used in this listing.
Class_0 = [] 
Class_1 = [] 
prob = [] 
results = [] 

# Open the report file, named after the current timestamp.
timestr = time.strftime("%Y%m%d%H%M%S") 

f = open(timestr + " output report.txt", 'w') 
f.write(timestr + " output report\n") 
f.write("\n") 

# Number of entries in the current directory (all entries, not only the
# matching CSV files).
count = len(os.listdir('.')) 

# NOTE(review): `count` is never updated inside this loop, so the condition
# never changes after the first check; if it is initially true the loop does
# not end and the report code below is not reached.
while (count > 0): 
    # Scan the directory and process every processes_*.csv file.
    for filename in os.listdir('.'): 
      if filename.endswith('.csv') and filename.startswith("processes_"): 

       url = filename 

       # Load the file and use the 'Name' column as the row index.
       dataset = pandas.read_csv(url) 
       dataset.set_index('Name', inplace = True) 

       # Reload the classifier for each file from classifier/classifier.pkl
       # next to this script, i.e. the version saved after the previous
       # file's refit when both paths point at the same file.
       clf = joblib.load(os.path.join(
         os.path.dirname(os.path.realpath(__file__)), 
         'classifier/classifier.pkl'))    

       # Predict each row on its own. Only rows predicted as 0 are examined.
       for index, row in dataset.iterrows(): 
        res = clf.predict([row]) 
        if res == 0: 
         if index in Class_0: 
          Class_0.append(index) 
         elif index in Class_1: 
          Class_1.append(index)   
         else: 
          # Unknown name: ask the user. Exactly "No" puts it in Class_0,
          # any other answer puts it in Class_1.
          print "Is ", index, " recognised?" 
          designation = raw_input() 

          if designation == "No": 
           Class_0.append(index) 
          else: 
           Class_1.append(index) 

       # Label every row 1, then relabel as 0 the rows whose name matches the
       # pattern built by joining the Class_0 names with '|'.
       # NOTE(review): the names are not regex-escaped, and an empty Class_0
       # gives the pattern '' -- verify how str.contains treats both cases.
       dataset['Type'] = 1      
       dataset.loc[dataset.index.str.contains('|'.join(Class_0)), 'Type'] = 0 

       print "\n" 

       # Append this file's names labelled 0 (duplicates collapsed) and
       # print the whole accumulated list.
       results.append(collections.OrderedDict.fromkeys(dataset.index[dataset['Type'] == 0])) 
       print (results) 

       # Features are every column except 'Type'; 'Type' holds the labels.
       X = dataset.drop(['Type'], axis=1).values 
       Y = dataset['Type'].values 


       # Raise n_estimators by 40 with warm_start enabled, refit on this
       # file's data and overwrite the saved classifier.
       # NOTE(review): the model is loaded from the script's directory but
       # dumped to a path relative to the current working directory.
       # NOTE(review): if Y holds only one class, this refit may leave the
       # saved model unable to predict on the next file -- confirm before
       # refitting.
       clf.set_params(n_estimators = len(clf.estimators_) + 40, warm_start = True) 
       clf.fit(X, Y) 
       joblib.dump(clf, 'classifier/classifier.pkl') 

       # Delete the processed file so a later directory scan skips it.
       os.remove(filename) 


# Report how many times each name was put in Class_0.
output = collections.Counter(Class_0) 

print "Class_0; \n" 
f.write ("Class_0; \n") 

for key, value in output.items():  
    f.write(str(key) + " ; " + str(value) + "\n") 
    print(str(key) + " ; " + str(value)) 

print "\n" 
f.write ("\n") 

# Same report for Class_1.
output_1 = collections.Counter(Class_1) 

print "Class_1; \n" 
f.write ("Class_1; \n") 

for key, value in output_1.items():  
    f.write(str(key) + " ; " + str(value) + "\n") 
    print(str(key) + " ; " + str(value)) 

print "\n" 

f.close() 

错误(IndexError: index 1 is out of bounds for size 1)指向预测那一行:res = clf.predict([row])。据我理解,问题在于数据中没有足够的“类别”,即标签类型不够(我要的是二元分类器)?但我一直在使用完全相同的方法(只是没有放在嵌套循环里),从未出过问题。

https://codeshare.io/Gkpb44 —— 这个 codeshare 链接包含上面代码中提到的 .csv 文件的数据。

回答

0

所以我已经意识到了问题所在。

我先加载分类器,然后利用 warm_start 用新数据重新拟合(refit)来更新它,试图模拟增量/在线学习的形式。当我处理的数据同时包含两种类别时,这样做效果很好。但是,如果数据中只有正类(只有一个类别),重新拟合就会把分类器弄坏。

目前我已经把下面这几行注释掉了:

# Refit step: raise n_estimators by 40 with warm_start enabled, fit on X/Y,
# and overwrite the saved classifier file.
clf.set_params(n_estimators = len(clf.estimators_) + 40, warm_start = True) 
clf.fit(X, Y) 
joblib.dump(clf, 'classifier/classifier.pkl') 

这样就解决了问题。今后我可能会再加一个(又一个!)条件判断,用来决定是否应该重新拟合数据。

我本想删除这个问题,但在搜索过程中没有找到任何涉及这一点的内容,所以我决定还是把问题和答案留在这里,以便遇到同样问题的人能够找到。

0

问题在于 [row] 是一个长度为 1 的数组。你的程序试图访问索引 1,而它并不存在(索引从 0 开始)。看起来你可能应该写 res = clf.predict(row),或者再检查一下 row 变量。希望这对你有帮助。