2017-04-12 39 views
0

CSV 中多出了一个不需要的列。我的代码如下:

# Question script: fit a decision tree on the Kaggle train.csv and write the
# predictions for test.csv to "5.csv".
# NOTE(review): `pd` and `np` are used below but never imported in this snippet;
# presumably `import pandas as pd` / `import numpy as np` were run beforehand.
from sklearn import tree
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
# Encode Sex as integers (male -> 0, female -> 1).
# NOTE(review): `frame[col][mask] = value` is chained indexing; pandas documents
# `frame.loc[mask, col] = value` as the reliable form. Confirm these writes
# actually take effect on the pandas version in use.
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
# Fill missing values: Embarked with "S", Age with the column median.
train["Embarked"] = train["Embarked"].fillna("S")
train["Age"] = train["Age"].fillna(train["Age"].median())
# Encode Embarked as integers (S -> 0, C -> 1, Q -> 2). The column is not part
# of the feature list used below.
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
# Labels and the four-column feature matrix used for training.
target = train["Survived"].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values
my_tree_one = tree.DecisionTreeClassifier()
my_tree_one = my_tree_one.fit(features_one, target)
# Apply the same preprocessing to the test set.
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)
# Overwrite the Fare of row 152 with the column median.
# NOTE(review): presumably row 152 is the only missing Fare -- confirm.
test.Fare[152] = test["Fare"].median()
test["Sex"][test["Sex"] == "male"] = 0
test["Sex"][test["Sex"] == "female"] = 1
test["Embarked"] = test["Embarked"].fillna("S")
test["Age"] = test["Age"].fillna(test["Age"].median())
test["Embarked"][test["Embarked"] == "S"] = 0
test["Embarked"][test["Embarked"] == "C"] = 1
test["Embarked"][test["Embarked"] == "Q"] = 2
test_features = test[["Pclass", "Sex", "Age", "Fare"]].values
my_prediction = my_tree_one.predict(test_features)
PassengerId = np.array(test["PassengerId"]).astype(int)
# The second positional argument of DataFrame is the index, so PassengerId
# becomes the index and the single prediction column keeps the default label 0.
my_solution = pd.DataFrame(my_prediction, PassengerId)
# Two index labels are passed although the frame has a single index level;
# this is the call the question is about (the output gains a column named 0).
# Note the first label is spelled "PassangerId" (sic).
my_solution.to_csv("5.csv", index_label = ["PassangerId", "Survived"])

如你所见,我只想保存一个包含两列的 CSV,但打开文件 5.csv 时,发现其中多了一个名为 0 的列。有人知道这是为什么吗?

回答

0

之所以出现这种情况,是因为数据只有一个索引,你却传入了两个 index_label。

你可以改为给该列命名,例如:

# Name the single data column (it otherwise keeps the default label 0).
my_solution.columns = ['Survived']

然后像这样为索引指定标签:

# The frame has one index level, so pass exactly one index label.
my_solution.to_csv("5.csv", index_label=["PassengerId"])
+0

哦,你是绝对正确的。非常感谢 –

+0

不客气。如果这个回答对你有帮助,请考虑点击对勾接受它。祝你编码愉快。 – bernie

0

尝试这种略带优化的解决方案:

import pandas as pd
from sklearn import tree

train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
# Feature columns fed to the classifier.
cols = ["Pclass", "Sex", "Age", "Fare"]

# Per-column lookup tables that turn categorical strings into integer codes.
mappings = {
    'Sex': {'male': 0, 'female': 1},
}


def cleanup(df, mappings=mappings):
    """Encode categorical columns and fill missing values, in place.

    Args:
        df: DataFrame to clean. It is modified in place.
        mappings: ``{column: {raw_value: code}}`` lookup tables. Every listed
            column must exist in ``df``. Values absent from a column's table
            become NaN and are then filled like any other missing value.

    Returns:
        The same ``df`` object, with mapped columns replaced by their codes
        and every column that contained NaN filled with that column's mean.
    """
    # Map non-numeric columns to their integer codes.
    for column, codes in mappings.items():
        df[column] = df[column].map(codes)
    # Replace NaNs with the column average.
    for column in df.columns[df.isnull().any()]:
        # Assign the filled Series back instead of calling
        # `df[column].fillna(..., inplace=True)`: an in-place fill on a
        # selected column may act on a temporary object and leave `df`
        # unchanged (this is always the case under pandas Copy-on-Write).
        df[column] = df[column].fillna(df[column].mean())
    return df

# Load the training set (features plus the 'Survived' label), clean it, and
# fit the decision tree.
train = cleanup(pd.read_csv(train_url, usecols=cols + ['Survived']))
my_tree_one = tree.DecisionTreeClassifier()
# Select the features by name so the column order is exactly `cols` for both
# fit and predict (read_csv's `usecols` keeps the file's own column order).
my_tree_one.fit(train[cols], train['Survived'])

# Load the test set; PassengerId is kept aside for the output and removed
# from the feature frame before cleaning.
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url, usecols=cols + ['PassengerId'])
result = test.pop('PassengerId').to_frame('PassengerId')
test = cleanup(test)

# Write exactly two columns (PassengerId, Survived); index=False keeps the
# row index out of the CSV.
result['Survived'] = my_tree_one.predict(test[cols])
result.to_csv("5.csv", index=False)