0

我目前正在使用 TensorFlow,按照其“wide”(线性模型)教程实现一个逻辑回归分类器:https://www.tensorflow.org/tutorials/wide
我的代码与教程基本一致,但是,当我评估模型时,它每次都预测负类(负类约占数据的 77%)。我怎样才能让模型敢于预测正类?我没有使用正则化,所以方差应该已经是最大的了。教程文档宣称准确率约为 84%,而我使用的是完全相同的数据集。可能是哪里出了问题?以下是训练代码(原问题标题:Tensorflow LinearClassifier() 总是预测负类):

def train_logistic_model(training_path, response, predictors, num_labels, steps=200):
    """Train and evaluate a TensorFlow linear (logistic) classifier from a CSV.

    The CSV header is read and every column name is stripped of
    non-alphanumeric characters. The first 90% of the rows are used for
    training and the remaining rows are held out for testing. A binary
    ``label`` column is derived from the response column (1 when the value
    contains ``">50K"``, else 0) and the response column is then removed.

    NOTE: relies on the ``tf.contrib`` APIs, which only exist in
    TensorFlow 1.x.

    Args:
        training_path: Location of the CSV file, passed to ``pd.read_csv``.
        response: Positional index of the response column.
        predictors: Mapping of column position to ``'Categorical'`` or
            ``'Continuous'``; entries with any other value are ignored.
        num_labels: Number of classes, forwarded as ``n_classes``.
        steps: Number of training steps passed to ``fit``. Defaults to 200,
            the value that was previously hard-coded.

    Returns:
        A tuple ``(eval_result_test, model_dir)``: the evaluation result on
        the held-out rows and the temporary directory holding the model.
    """
    # Read the CSV once and rename the columns in place instead of parsing
    # the file a second time just to apply the sanitized names.
    df_all = pd.read_csv(training_path, header=0)
    column_names = [re.sub('[^A-Za-z0-9]+', '', col) for col in df_all.columns.values]
    df_all.columns = column_names

    # Positional split with an integer boundary. Label-based ``.loc`` slices
    # include both endpoints, so the boundary row could end up in both the
    # training and the test set. ``.copy()`` makes both frames independent,
    # so adding the label column below does not write into a view.
    split_at = int(len(df_all.index) * .9)
    df_train = df_all.iloc[:split_at].copy()
    df_test = df_all.iloc[split_at:].copy()

    # Remove rows with missing values before deriving the label; a NaN in
    # the response column cannot be searched for a substring.
    df_train = df_train.dropna(how='any', axis=0)
    df_test = df_test.dropna(how='any', axis=0)

    response_name = column_names[response]

    LABEL_COLUMN = "label"
    df_train[LABEL_COLUMN] = df_train[response_name].str.contains(">50K", regex=False).astype(int)
    df_test[LABEL_COLUMN] = df_test[response_name].str.contains(">50K", regex=False).astype(int)

    # The raw response must not be visible to the model as a feature.
    df_train = df_train.drop(columns=[response_name])
    df_test = df_test.drop(columns=[response_name])

    CATEGORICAL_COLUMNS = [
        column_names[key] for key, value in predictors.items() if value == 'Categorical'
    ]
    CONTINUOUS_COLUMNS = [
        column_names[key] for key, value in predictors.items() if value == 'Continuous'
    ]

    def input_fn(df):
        """Build the (features, label) tensors for one data frame."""
        continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}

        # Each categorical column becomes an (n_rows, 1) sparse tensor with
        # one value per row.
        categorical_cols = {
            k: tf.SparseTensor(
                indices=[[i, 0] for i in range(df[k].size)],
                values=df[k].values,
                dense_shape=[df[k].size, 1])
            for k in CATEGORICAL_COLUMNS
        }

        # Merges the two dictionaries into one.
        feature_cols = {**continuous_cols, **categorical_cols}

        label = tf.constant(df[LABEL_COLUMN].values)

        return feature_cols, label

    def train_input_fn():
        return input_fn(df_train)

    def eval_input_fn_test():
        return input_fn(df_test)

    cat_tensors = [
        tf.contrib.layers.sparse_column_with_hash_bucket(
            column_name=col, hash_bucket_size=100)
        for col in CATEGORICAL_COLUMNS
    ]
    cont_tensors = [
        tf.contrib.layers.real_valued_column(cont) for cont in CONTINUOUS_COLUMNS
    ]

    feature_columns = cat_tensors + cont_tensors

    model_dir = tempfile.mkdtemp()

    logistic_model = tf.contrib.learn.LinearClassifier(
        feature_columns=feature_columns, n_classes=num_labels, model_dir=model_dir)

    logistic_model.fit(input_fn=train_input_fn, steps=steps)

    # Test the model on reserve data
    eval_result_test = logistic_model.evaluate(input_fn=eval_input_fn_test, steps=1)

    # Test the model on training data. The training input function is
    # reused here; the original referenced an undefined
    # ``eval_input_fn_train`` and raised NameError.
    eval_result_train = logistic_model.evaluate(input_fn=train_input_fn, steps=1)

    for key in sorted(eval_result_train):
        print("%s: %s" % (key, eval_result_train[key]))

    return eval_result_test, model_dir

回答

0

我认为你需要添加交叉特征列(cross_columns),这样线性模型的效果才会更好。