2017-09-24 43 views
1
## Load the data ## 

# Read the Kaggle house-price training and test tables from disk.
train = pd.read_csv("../kagglehouse/train.csv")
test = pd.read_csv("../kagglehouse/test.csv")
# Stack the shared feature columns (MSSubClass .. SaleCondition) of both
# tables into one frame; the training rows come first.
all_data = pd.concat((train.loc[:, "MSSubClass":"SaleCondition"],
                      test.loc[:, "MSSubClass":"SaleCondition"]))

NFOLDS = 5
SEED = 0
NROWS = None

ntrain = train.shape[0]
ntest = test.shape[0]

# Creating matrices for sklearn.
# The target is converted to a plain NumPy array, exactly like the features,
# so that the integer index arrays produced by KFold select rows by position.
# Indexing a pandas Series with those arrays is label-based instead, which
# can yield NaN (or a KeyError) whenever the labels do not match positions.
y_train = np.array(train["SalePrice"])
x_train = np.array(all_data[:ntrain])
x_test = np.array(all_data[ntrain:])

# NOTE(review): KFold(n, n_folds=...) looks like the legacy iterable KFold
# signature -- confirm against the installed scikit-learn version.
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

class SklearnWrapper(object):
    """Thin adapter giving any estimator class a train/predict interface.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory) called as ``clf(**params)``.
    seed : int
        Value passed to the estimator as ``random_state``.
    params : dict or None
        Keyword arguments for the estimator. The dict is copied, so the
        caller's object is never modified; ``None`` means no extra arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: writing ``random_state`` into the caller's dict
        # would leak state between wrappers, and ``None`` cannot be indexed.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the data passed in by the caller."""
        # Use the arguments, not module-level globals, so that each
        # cross-validation fold is fitted on its own subset.
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

def get_oof(clf):
    """Compute out-of-fold predictions for one wrapped model.

    For every fold of the module-level ``kf`` splitter the model is trained
    on the fold's training rows of ``x_train``/``y_train``; it then predicts
    the held-out rows and the whole of ``x_test``.

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_train, oof_test)`` as column vectors of shape ``(ntrain, 1)``
        and ``(ntest, 1)``. ``oof_test`` is the mean of the per-fold
        predictions on ``x_test``.
    """
    oof_train = np.zeros((ntrain,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    # The fold indices are positional. Index a NumPy view of the target so
    # that a pandas Series is not looked up by label, which can produce NaN
    # targets when the labels do not match row positions.
    y_all = np.asarray(y_train)

    for i, (train_index, test_index) in enumerate(kf):
        clf.train(x_train[train_index], y_all[train_index])

        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    # Average the test-set predictions over the folds.
    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


# Keyword arguments for the extra-trees and random-forest regressors.
et_params = dict(n_jobs=16)
rf_params = dict(n_jobs=16)

# Keyword arguments kept for an XGBoost model.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.7,
    silent=1,
    subsample=0.7,
)

# Regularisation strength for the ridge (rd) and lasso (ls) models.
rd_params = dict(alpha=10)
ls_params = dict(alpha=0.005)


# Wrap each base regressor with its hyper-parameters and the shared seed.
# Generator expressions keep the helper loop variables out of module scope.
et, rf, rd, ls = (
    SklearnWrapper(clf=estimator, seed=SEED, params=hyper)
    for estimator, hyper in (
        (ExtraTreesRegressor, et_params),
        (RandomForestRegressor, rf_params),
        (Ridge, rd_params),
        (Lasso, ls_params),
    )
)

# Out-of-fold predictions for every base model, in the same order.
(
    (et_oof_train, et_oof_test),
    (rf_oof_train, rf_oof_test),
    (rd_oof_train, rd_oof_test),
    (ls_oof_train, ls_oof_test),
) = (get_oof(model) for model in (et, rf, rd, ls))

运行这段代码时遇到了错误:输入包含 NaN、无穷大,或对 dtype('float64') 而言过大的值。

 
    ValueError        Traceback (most recent call 
     last) 
      in() 
       135 
       136 xg_oof_train, xg_oof_test = get_oof(xg) 
      --> 137 et_oof_train, et_oof_test = get_oof(et) 
       138 rf_oof_train, rf_oof_test = get_oof(rf) 
       139 rd_oof_train, rd_oof_test = get_oof(rd) 

      in get_oof(clf) 
       77   x_te = x_train[test_index] 
       78 
      ---> 79   clf.train(x_tr, y_tr) 
       80 
       81   oof_train[test_index] = clf.predict(x_te) 

      in train(self, x_train, y_train) 
       46  def train(self, x_train, y_train): 
       47   #self.clf.fit(x_train, y_train) 
      ---> 48   self.clf.fit(x_train, y_train) 
       49 
       50  def predict(self, x): 

      E:\graphLab\Anaconda2\lib\site-packages\sklearn\ensemble\forest.pyc 
     in fit(self, X, y, sample_weight) 
       245   # Validate or convert input data 
       246   X = check_array(X, accept_sparse="csc", dtype=DTYPE) 
      --> 247   y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) 
       248   if sample_weight is not None: 
       249    sample_weight = check_array(sample_weight, ensure_2d=False) 

      E:\graphLab\Anaconda2\lib\site-packages\sklearn\utils\validation.pyc 
     in check_array(array, accept_sparse, dtype, order, copy, 
     force_all_finite, ensure_2d, allow_nd, ensure_min_samples, 
     ensure_min_features, warn_on_dtype, estimator) 
       420        % (array.ndim, estimator_name)) 
       421   if force_all_finite: 
      --> 422    _assert_all_finite(array) 
       423 
       424  shape_repr = _shape_repr(array.shape) 

      E:\graphLab\Anaconda2\lib\site-packages\sklearn\utils\validation.pyc 
     in _assert_all_finite(X) 
       41    and not np.isfinite(X).all()): 
       42   raise ValueError("Input contains NaN, infinity" 
      ---> 43       " or a value too large for %r." % X.dtype) 
       44 
       45 

      ValueError: Input contains NaN, infinity or a value too large for dtype('float64'). 

当我使用 np.isnan(all_data.all()) 时,它返回 False;使用 np.isfinite(all_data.all()) 时,它返回 True,所以我很困惑。为什么还会出现这个错误?

回答

3

您没有正确检查all_data

# Mis-ordered check: all_data.all() is evaluated first, so isnan/isfinite
# only ever see its reduced result, not the original values.
np.isnan(all_data.all()) 
np.isfinite(all_data.all()) 

并不是检查数据的正确方式。

你是把 np.isnan() / np.isfinite() 应用到了 all_data.all() 的输出上,而该输出始终只是布尔值(True/False),因此它永远是有限值,也永远不是 NaN。

您应该这样检查数据:

# Element-wise test first, then reduce the resulting booleans with all().
np.isfinite(all_data).all() 
# NOTE(review): all() here is true only when every value is NaN; to detect
# the presence of any NaN, .any() is presumably what is wanted -- confirm.
np.isnan(all_data).all() 

注意:all() 是作用在 np.isfinite() / np.isnan() 的输出之上,而不是反过来。

+0

好的。首先,非常感谢你回答我的问题。我用你给出的正确方法检查了 all_data,其中并不存在 NaN 或无穷值,但仍然出现 ValueError – zengcaifei

+0

@zengcaifei请编辑你的问题,以反映这个新的信息。 – Shai

+1

哦,我刚刚发现:我用了 x_train = np.array(all_data[:train.shape[0]]) 和 x_test = np.array(all_data[train.shape[0]:]),却忘了 y_train 也需要转换成 numpy 数组。于是我把 y_train = train["SalePrice"] 改成了 y_train = np.array(train["SalePrice"]),现在可以正常运行了,但我仍然不知道为什么会出现这个错误 – zengcaifei

相关问题