2016-12-07 29 views
0

我想用 Python(scikit-learn)大致重现 R(e1071 包)中 C-分类 SVM 的交叉验证预测技能,但得到的结果与 R 的预测技能相差甚远。对于下面的训练和测试数据(由一个长度大得多的数据集取平均得到),R 的预测技能是 0.87(1 表示完美),而 Python 的技能只有 0.55,并不比随机猜测好多少。请注意,我并不是要得到完全相同的结果;我只是希望,如果 R 在某个数据集上做得相当好,那么 Python 在同一个数据集上也能做到。我已经把数据按 50-50 分成训练集和测试集,并试图根据浮点数预测二项结果。R 和 Python 代码如下。我检查过,R 和 Python 之间所有默认的 SVM 参数(gamma、C(cost)、shrinking、tol 等)都是相同的。简而言之:Python 的技能与 R 相比很差。

R 代码:

library("e1071") 

data <- c(-108.604150711185, -131.880188127745, -18.3017441809734, 32.011639982337, -71.6651360870381, -107.587087751331, 21.316311739316, -36.015324564807, 138.22302265079, 47.9322592065447, -129.007749732555, -150.41808326425, -141.00589707504, -105.912063885407, 76.2956568174239, 141.457541434218, -20.6676395937811, -226.505644333494, -151.229861588686, -160.18717733968, -107.01667849677, -7.52794131287047, -93.1147621027003, 5.59630172385392, 38.741091785708, -32.9061390503546, -78.5031246062325, -9.64080356337477, -54.1430873201472, -108.127067430103, -12.2589074567133, 129.212940940854, 132.670728015743, 107.075153550768, 167.176831103164, -20.6839530330714, 102.677911281291, -109.423698849103, -154.454318421757, 140.52342226202, 110.184351332211, -16.6842057565239, -11.1688984829787, 178.441845032635, 37.0689292040101, 166.610506783818, -79.2764182099804, 99.1136693164655, 82.0929274697289, 15.1752041486536, 178.489001782771, 145.332200036106, -185.977800430997, -90.5440753976243, 78.0459300120412, 144.297553387967, 99.5945824957091, 110.803195137024, 81.3094331750562, -396.825240330405, -166.038928089807, -78.863983688682, 138.309908804212, -148.647304302406, -2.23135233624276, 129.411511929621, -111.664324254549, -96.4151180340831, 129.219227225386, 90.7050615157428, 141.986869866474, 93.0147970463941, 142.807435791073, -75.8426755946232, 122.537973092667, 117.078515092191, 134.166968023265, 90.8512172789568, 146.367129646428, 125.539182526718, -70.485058023267, -46.967575223949, 116.210349687502, -91.2992704167832, 104.052231138142, -114.580693287221, -82.9991067628608, -111.649187979413) 
class <- as.factor(c(0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0)) 
df_training <- data.frame(data, class) 

data <- c(133.75999742845, 22.9386702890105, -126.959902277009, -116.317935595297, -33.9418594804197, -49.0102540773413, -159.266630498512, -8.92296705690401, 114.328300224712, 66.0706175847251, -154.385344188283, 70.7868284982941, -28.334490887314, 118.755307949047, 154.362286178401, 101.331675190569, 96.2196681290104, 99.5694296232446, 210.160787371823, 65.8474210711036, -125.475676456606, 66.7541385125748, -161.001356357477, -40.1416817172267, 38.6877489907967, -7.12706914419719, -10.3967176519225, -80.6831091111636, 128.604227270616, 75.4219966516171, 184.951786958864, 90.9170782990185, 66.7190886024699, 81.377280661573, -82.4053965286415, -65.6718687269108, 61.1679518726262, 190.532649096311, 199.917670153196, 104.558442558929, 113.747065157369, 106.640501329133, 80.593201532054, 75.0176280888154, 155.538654396817, 30.0548798029353, 116.900219512636, 131.431417509576, 33.3308447581156, -121.191534016935, -80.4203785670198, 157.737407847885, 66.5956228628815, 50.8340706561446, -113.713450848071, -18.7787225270887, 113.832326071127, -45.5884280143408, 221.782395098832, 70.1660982367319, 235.005982636939, 80.8180320055801, -74.7107276814795, 133.925782624001, 97.9261686360971, -127.954532027281, 58.9295075974962, 96.1702797891484, -49.6048543914143, -42.1842037639683, -235.694708213157, 13.4862841916787, 126.396462591781, 214.297316240176, 125.148658464391, 84.8887673204376, 78.2717096234718, 139.677936314095, -168.649300541479, 103.40253638232, 69.2727189156141, 153.017155534869, -238.07168745534, -166.929968475244, 113.414489211719, 85.552, 120.582346886614, -214.850084749638, 96.8090523924549) 
class <- as.factor(c(1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1)) 
df_test <- data.frame(data, class) 

# Train the model.
# No `ranges` grid is supplied, so best.tune() fits a single C-classification
# SVM with the fixed hyper-parameters below. Note that e1071::svm() scales the
# predictors to zero mean / unit variance by default (scale = TRUE), and the
# same scaling is re-applied to new data inside predict().
best.svm <- best.tune(
  svm,
  class ~ data,
  data = df_training,
  kernel = "radial",
  cost = 1,
  gamma = 0.01,
  type = "C-classification"
)

# Make predictions. For a classification machine predict() already returns the
# predicted class labels; predict.svm() has no `type` argument, so it is not
# passed here.
TrainingPredictions <- predict(best.svm, df_training)
TestPredictions <- predict(best.svm, df_test)

# Skill = fraction of test cases classified correctly (1 is perfect).
Skill <- mean(TestPredictions == df_test[["class"]])
print(Skill) #value is 0.87

Python代码:

import numpy as np 
from sklearn.svm import SVC 

Data = np.array([-108.604150711185, -131.880188127745,-18.3017441809734, 32.011639982337, -71.6651360870381, -107.587087751331, 21.316311739316, -36.015324564807, 138.22302265079, 47.9322592065447, -129.007749732555, -150.41808326425, -141.00589707504, -105.912063885407, 76.2956568174239, 141.457541434218, -20.6676395937811, -226.505644333494, -151.229861588686, -160.18717733968, -107.01667849677, -7.52794131287047, -93.1147621027003, 5.59630172385392, 38.741091785708, -32.9061390503546, -78.5031246062325, -9.64080356337477, -54.1430873201472, -108.127067430103, -12.2589074567133, 129.212940940854, 132.670728015743, 107.075153550768, 167.176831103164, -20.6839530330714, 102.677911281291, -109.423698849103, -154.454318421757, 140.52342226202, 110.184351332211, -16.6842057565239, -11.1688984829787, 178.441845032635, 37.0689292040101, 166.610506783818, -79.2764182099804, 99.1136693164655, 82.0929274697289, 15.1752041486536, 178.489001782771, 145.332200036106, -185.977800430997, -90.5440753976243, 78.0459300120412, 144.297553387967, 99.5945824957091, 110.803195137024, 81.3094331750562,-396.825240330405, -166.038928089807, -78.863983688682, 138.309908804212, -148.647304302406, -2.23135233624276, 129.411511929621, -111.664324254549, -96.4151180340831, 129.219227225386, 90.7050615157428, 141.986869866474, 93.0147970463941, 142.807435791073, -75.8426755946232, 122.537973092667, 117.078515092191, 134.166968023265, 90.8512172789568, 146.367129646428, 125.539182526718, -70.485058023267, -46.967575223949, 116.210349687502, -91.2992704167832, 104.052231138142, -114.580693287221, -82.9991067628608, -111.649187979413]) 
Class = np.array([0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0]) 
df_training = np.array([Data, Class]) 

Data = np.array([133.75999742845, 22.9386702890105, -126.959902277009, -116.317935595297, -33.9418594804197, -49.0102540773413, -159.266630498512, -8.92296705690401, 114.328300224712, 66.0706175847251, -154.385344188283, 70.7868284982941, -28.334490887314, 118.755307949047, 154.362286178401, 101.331675190569, 96.2196681290104, 99.5694296232446, 210.160787371823, 65.8474210711036, -125.475676456606, 66.7541385125748, -161.001356357477, -40.1416817172267, 38.6877489907967, -7.12706914419719, -10.3967176519225, -80.6831091111636, 128.604227270616, 75.4219966516171, 184.951786958864, 90.9170782990185, 66.7190886024699, 81.377280661573, -82.4053965286415, -65.6718687269108, 61.1679518726262, 190.532649096311, 199.917670153196, 104.558442558929, 113.747065157369, 106.640501329133,80.593201532054, 75.0176280888154, 155.538654396817, 30.0548798029353, 116.900219512636, 131.431417509576, 33.3308447581156, -121.191534016935, -80.4203785670198, 157.737407847885, 66.5956228628815, 50.8340706561446, -113.713450848071, -18.7787225270887, 113.832326071127, -45.5884280143408, 221.782395098832, 70.1660982367319, 235.005982636939, 80.8180320055801, -74.7107276814795, 133.925782624001, 97.9261686360971, -127.954532027281, 58.9295075974962, 96.1702797891484, -49.6048543914143, -42.1842037639683, -235.694708213157, 13.4862841916787, 126.396462591781, 214.297316240176, 125.148658464391, 84.8887673204376, 78.2717096234718, 139.677936314095, -168.649300541479, 103.40253638232, 69.2727189156141, 153.017155534869, -238.07168745534, -166.929968475244, 113.414489211719,85.552, 120.582346886614, -214.850084749638, 96.8090523924549]) 
Class = np.array([1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1]) 
df_test = np.array([Data, Class]) 

# train model
# NOTE: the feature is fed to the SVM unscaled in this script.
clf = SVC(verbose=True, gamma=0.01, kernel='rbf', C=1)

# make predictions
# scikit-learn expects X as a 2-D (n_samples, n_features) array and y as a
# 1-D array; reshape(-1, 1) infers the sample count instead of hard-coding it.
X_train = df_training[0].reshape(-1, 1)
X_test = df_test[0].reshape(-1, 1)
clf.fit(X_train, df_training[1])
TrainingPredictions = clf.predict(X_train)
TestPredictions = clf.predict(X_test)
# Skill = fraction of test samples classified correctly (1 is perfect).
Skill = np.sum(TestPredictions==df_test[1])/float(len(TestPredictions))
print(Skill) #value is 0.55
+2

作为一个只会 R 的“使用者/思考者”,我觉得很奇怪:一个关于 Python 的抱怨居然带着 R 标签。有人贴出用另一种计算机语言写的代码(却没有清晰的自然语言描述),还期望我们这些只懂 R 的用户提供建议,这也让我感到恼火。此外,支持向量机可能是非确定性的,因此仅凭在一个二项预测的小数据集上的单次运行输出,并不能令人信服地证明存在“真正的差异”。 –

+0

Grumpy 先生:我已经努力把这篇帖子写得简洁明了。我估计问题出在为获得最高技能而调整 SVM 的某个细节上。支持向量机可能不是确定性的,但可以通过调参来提高技能(R 中的 best.svm 函数做的就是这件事)。所发布的数据是由 20 倍数据量的原始数据平滑得到的,因此噪声不会太大。 –

+0

所以……你真的认为 R 标签对将来想查找如何用 Python 做 SVM 的人有用吗?另外,既然这些“值”要拿来比较,你不觉得在问题中说明它们是由比所示数据大得多的数据集构建的,会对问题有所补充吗? –

回答

2

观察到的差异可能来自以下事实:在 R 中,svm() 默认会对数据进行缩放(参见文档第 6 页)。

如果使用 scikit-learn 的 StandardScaler,最终会得到与 R 相当接近的结果:

import numpy as np 
from sklearn.svm import SVC 
from sklearn.preprocessing import StandardScaler 

scaler = StandardScaler() 
Data = np.array([-108.604150711185, -131.880188127745,-18.3017441809734, 32.011639982337, -71.6651360870381, -107.587087751331, 21.316311739316, -36.015324564807, 138.22302265079, 47.9322592065447, -129.007749732555, -150.41808326425, -141.00589707504, -105.912063885407, 76.2956568174239, 141.457541434218, -20.6676395937811, -226.505644333494, -151.229861588686, -160.18717733968, -107.01667849677, -7.52794131287047, -93.1147621027003, 5.59630172385392, 38.741091785708, -32.9061390503546, -78.5031246062325, -9.64080356337477, -54.1430873201472, -108.127067430103, -12.2589074567133, 129.212940940854, 132.670728015743, 107.075153550768, 167.176831103164, -20.6839530330714, 102.677911281291, -109.423698849103, -154.454318421757, 140.52342226202, 110.184351332211, -16.6842057565239, -11.1688984829787, 178.441845032635, 37.0689292040101, 166.610506783818, -79.2764182099804, 99.1136693164655, 82.0929274697289, 15.1752041486536, 178.489001782771, 145.332200036106, -185.977800430997, -90.5440753976243, 78.0459300120412, 144.297553387967, 99.5945824957091, 110.803195137024, 81.3094331750562,-396.825240330405, -166.038928089807, -78.863983688682, 138.309908804212, -148.647304302406, -2.23135233624276, 129.411511929621, -111.664324254549, -96.4151180340831, 129.219227225386, 90.7050615157428, 141.986869866474, 93.0147970463941, 142.807435791073, -75.8426755946232, 122.537973092667, 117.078515092191, 134.166968023265, 90.8512172789568, 146.367129646428, 125.539182526718, -70.485058023267, -46.967575223949, 116.210349687502, -91.2992704167832, 104.052231138142, -114.580693287221, -82.9991067628608, -111.649187979413]) 
# StandardScaler works on 2-D (n_samples, n_features) input, so present the
# single feature as a column, fit + scale it, then flatten back to 1-D so the
# rest of the script can keep treating Data as a plain vector.
Data = scaler.fit_transform(Data.reshape(-1, 1)).ravel()
Class = np.array([0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0]) 
df_training = np.array([Data, Class]) 

Data = np.array([133.75999742845, 22.9386702890105, -126.959902277009, -116.317935595297, -33.9418594804197, -49.0102540773413, -159.266630498512, -8.92296705690401, 114.328300224712, 66.0706175847251, -154.385344188283, 70.7868284982941, -28.334490887314, 118.755307949047, 154.362286178401, 101.331675190569, 96.2196681290104, 99.5694296232446, 210.160787371823, 65.8474210711036, -125.475676456606, 66.7541385125748, -161.001356357477, -40.1416817172267, 38.6877489907967, -7.12706914419719, -10.3967176519225, -80.6831091111636, 128.604227270616, 75.4219966516171, 184.951786958864, 90.9170782990185, 66.7190886024699, 81.377280661573, -82.4053965286415, -65.6718687269108, 61.1679518726262, 190.532649096311, 199.917670153196, 104.558442558929, 113.747065157369, 106.640501329133,80.593201532054, 75.0176280888154, 155.538654396817, 30.0548798029353, 116.900219512636, 131.431417509576, 33.3308447581156, -121.191534016935, -80.4203785670198, 157.737407847885, 66.5956228628815, 50.8340706561446, -113.713450848071, -18.7787225270887, 113.832326071127, -45.5884280143408, 221.782395098832, 70.1660982367319, 235.005982636939, 80.8180320055801, -74.7107276814795, 133.925782624001, 97.9261686360971, -127.954532027281, 58.9295075974962, 96.1702797891484, -49.6048543914143, -42.1842037639683, -235.694708213157, 13.4862841916787, 126.396462591781, 214.297316240176, 125.148658464391, 84.8887673204376, 78.2717096234718, 139.677936314095, -168.649300541479, 103.40253638232, 69.2727189156141, 153.017155534869, -238.07168745534, -166.929968475244, 113.414489211719,85.552, 120.582346886614, -214.850084749638, 96.8090523924549]) 
# Scale the test feature with the mean/std already learned from the training
# set (transform, not fit_transform): re-fitting on the test data would put the
# two sets on different scales. Input must be a 2-D column; flatten afterwards.
# NOTE(review): the score quoted at the end of this script was measured with
# the scaler re-fitted on the test set, so expect it to shift slightly.
Data = scaler.transform(Data.reshape(-1, 1)).ravel()
Class = np.array([1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1]) 
df_test = np.array([Data, Class]) 

# train model
clf = SVC(verbose=True, gamma=0.01, kernel='rbf', C=1)

# make predictions
# scikit-learn expects X as a 2-D (n_samples, n_features) array and y as a
# 1-D array; reshape(-1, 1) infers the sample count instead of hard-coding it.
X_train = df_training[0].reshape(-1, 1)
X_test = df_test[0].reshape(-1, 1)
clf.fit(X_train, df_training[1])
TrainingPredictions = clf.predict(X_train)
TestPredictions = clf.predict(X_test)
# Skill = fraction of test samples classified correctly (1 is perfect).
Skill = np.sum(TestPredictions==df_test[1])/float(len(TestPredictions))
print("Skill: "+str(Skill)) #value is 0.84
+0

这正是我一直在寻找的关键信息。 –