问题是,该数据包含 NaN:
代码:
import pandas as pd
from numpy import nanmean
import numpy as np
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression model on train.csv and predict labels for
# test.csv, using only the columns listed in FEATURE_COLUMNS as inputs.
FEATURE_COLUMNS = ["Sex", "Age", "Pclass", "Fare", "Embarked"]
SEX_CODES = {"male": 0, "female": 1}
EMBARKED_CODES = {"S": 0, "C": 1, "Q": 2}


def _encode_and_impute(frame):
    """Return a copy of *frame* whose feature columns are numeric and NaN-free.

    "Sex" and "Embarked" are converted to integer codes with ``Series.map``
    rather than chained ``frame[col][mask] = value`` assignment: pandas may
    apply a chained assignment to a temporary copy (SettingWithCopyWarning),
    in which case the strings would silently stay in place. Labels that are
    not in the code tables become NaN and are imputed like other gaps.

    Missing values are filled column by column with a statistic of that same
    column (mean for "Age" and "Fare", most frequent value for "Sex",
    "Pclass" and "Embarked"). A blanket ``frame.fillna(mean_age)`` would
    instead write the mean age into every column that has gaps.
    Columns outside FEATURE_COLUMNS are left untouched.
    """
    prepared = frame.copy()
    prepared["Sex"] = prepared["Sex"].map(SEX_CODES)
    prepared["Embarked"] = prepared["Embarked"].map(EMBARKED_CODES)
    for column in ("Age", "Fare"):
        prepared[column] = prepared[column].fillna(prepared[column].mean())
    for column in ("Sex", "Pclass", "Embarked"):
        most_common = prepared[column].mode()
        # mode() is empty when the whole column is NaN; nothing to fill with.
        if not most_common.empty:
            prepared[column] = prepared[column].fillna(most_common.iloc[0])
    return prepared


train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Mean age per data set, ignoring NaN. Selected by column name instead of by
# position so a change in column order cannot pick up a different column.
nan_mean_age = nanmean(test["Age"])
nan_mean_age2 = nanmean(train["Age"])

train = _encode_and_impute(train)
test = _encode_and_impute(test)

# dtype=float fails loudly here if any non-numeric value is still present,
# instead of handing an object array to the estimator.
train_features = np.asarray(train[FEATURE_COLUMNS], dtype=float)
test_features = np.asarray(test[FEATURE_COLUMNS], dtype=float)

lg = LogisticRegression()
# define your target variable y and then fit
# NOTE(review): the target is the second column of train.csv (presumably
# "Survived" in the Kaggle Titanic data) - confirm against the file header.
y_train = train.iloc[:, 1]
lg.fit(train_features, y_train)
# In a REPL/notebook this expression displays the predicted labels; when run
# as a plain script the return value is discarded.
lg.predict(test_features)
结果:
array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
1, 0, 0, 0], dtype=int64)
像这样的写法应该可以正常运行。
你可以上传测试数据集吗?另外,这个测试数据集中是否有 NaN 或字符串? – sera
https://www.kaggle.com/c/titanic/data 我已经填补了所有 NaN,并用字符串替换所有分类变量,然后将其提供给预测函数 –
问题是,您使用的“性别”列包含字符串而不是数字。predict() 函数只接受数值型数据集作为输入。你需要将“性别”转换为二进制编码,例如女性为 1、男性为 0 – sera