2017-06-11 44 views
1

我想用 GridSearchCV 在一系列 alpha(拉普拉斯平滑参数)取值上进行搜索,找出哪个取值能使伯努利朴素贝叶斯模型的准确率最高。GridSearchCV 初始化

def binarize_pixels(data, threshold=0.784):
    """Map every feature value to 0.0 or 1.0 by comparing it with ``threshold``.

    Args:
        data: Array-like of numeric feature values. Any shape is accepted
            (a single sample, a 2-D samples-by-features matrix, ...).
        threshold: Values strictly greater than this become 1.0; all other
            values become 0.0.

    Returns:
        A new float ``numpy.ndarray`` with the same shape as ``data``. The
        input is not modified.
    """
    # A single vectorized comparison replaces the per-column Python loop, so
    # the function no longer requires the input to have exactly two dimensions.
    return (np.asarray(data) > threshold).astype(float)

# Binarize the mini training set using the default threshold (0.784).
binarized_train_data = binarize_pixels(mini_train_data)

def BNB():
    """Fit a Bernoulli Naive Bayes model and print its accuracy on the dev set.

    Reads the module-level ``binarized_train_data``, ``mini_train_labels``,
    ``dev_data`` and ``dev_labels``.

    Returns:
        The fitted ``BernoulliNB`` classifier, so the result can be reused
        (for example as a pipeline step) instead of being discarded.
    """
    clf = BernoulliNB()
    clf.fit(binarized_train_data, mini_train_labels)

    # The dev features go through the same thresholding as the training
    # features; otherwise the model is trained on 0/1 values produced by
    # binarize_pixels but asked to predict on raw values.
    predsNB = clf.predict(binarize_pixels(dev_data))

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Bernoulli binarized model accuracy: {:.4}".format(np.mean(predsNB == dev_labels)))
    return clf

该模型运行正常,但我的 GridSearchCV 交叉验证却无法运行:

# The single 'classifier' step is whatever BNB() returns; BNB() is called
# right here, while the pipeline is being constructed.
pipeline = Pipeline([('classifier', BNB())]) 
def P8(alphas): 
    """Build a grid search over ``alphas`` and print a dev-set classification report.

    Args:
        alphas: Parameter grid handed to GridSearchCV as ``param_grid``.
    """
    # NOTE(review): gs_clf.fit(...) is never called in this function, yet
    # best_estimator_ is read on the very next line -- this is where the
    # AttributeError reported below comes from.
    gs_clf = GridSearchCV(pipeline, param_grid = alphas, refit=True) 
    y_predictions = gs_clf.best_estimator_.predict(dev_data) 
    print classification_report(dev_labels, y_predictions) 
# NOTE(review): when the estimator is a Pipeline, grid keys presumably need the
# step-name prefix ('classifier__alpha') -- confirm against the scikit-learn docs.
alphas = {'alpha' : [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]} 
P8(alphas) 

我得到了 AttributeError:'GridSearchCV' 对象没有属性 'best_estimator_'

回答

1

出现“没有该属性”错误的原因在于以下两行:

# The search object is only constructed here; fit() is never called before
# best_estimator_ is accessed on the next line.
gs_clf = GridSearchCV(pipeline, param_grid = alphas, refit=True) 
y_predictions = gs_clf.best_estimator_.predict(dev_data) 

请注意,在调用 predict 之前,必须先拟合(fit)模型,也就是先调用 gs_clf.fit。请参阅官方文档(documentation)中的以下示例:

>>> from sklearn import svm, datasets 
>>> from sklearn.model_selection import GridSearchCV 
>>> iris = datasets.load_iris() 
>>> # Candidate parameter values handed to GridSearchCV.
>>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} 
>>> svr = svm.SVC() 
>>> clf = GridSearchCV(svr, parameters) 
>>> # fit() is called first; cv_results_ is only read after this call.
>>> clf.fit(iris.data, iris.target) 
...        
GridSearchCV(cv=None, error_score=..., 
     estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=..., 
        decision_function_shape=None, degree=..., gamma=..., 
        kernel='rbf', max_iter=-1, probability=False, 
        random_state=None, shrinking=True, tol=..., 
        verbose=False), 
     fit_params={}, iid=..., n_jobs=1, 
     param_grid=..., pre_dispatch=..., refit=..., return_train_score=..., 
     scoring=..., verbose=...) 
>>> sorted(clf.cv_results_.keys()) 
...        
['mean_fit_time', 'mean_score_time', 'mean_test_score',... 
'mean_train_score', 'param_C', 'param_kernel', 'params',... 
'rank_test_score', 'split0_test_score',... 
'split0_train_score', 'split1_test_score', 'split1_train_score',... 
'split2_test_score', 'split2_train_score',... 
'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score'...]