2012-05-25 83 views
0

我想与R使用KDD杯99数据集,但不幸的是,我得到非常糟糕的结果。基本上,预测器是猜测(交叉验证集约50%的错误)。我的代码中可能存在一个错误,但我找不到位置。使用KDD杯99数据集和机器学习与R

KDD cup 99数据集由大约4百万个例子组成,这些例子分为四类不同类型的攻击+“正常”类。首先,我将数据集分成5个文件(每个类一个+“正常”类一个),并将非数字数据转换为数字数据。目前,我正在研究“远程到本地”(r2l)类。根据关于该主题的论文结果选择一些功能。之后,我抽取了大量等于r2l实例数量的“正常”实例,以避免类错误的问题。我还用标签“attack”替换了所有不同类型r2l攻击的标签,这样我就可以训练一个两级分类器。然后,我将该样本加入到一个新数据集中的r2l实例中。最后,我申请了10倍交叉验证来评估我的模型,它是利用SVM构建和我得到了机器学习的历史上最坏的结果... :(

这里是我的代码:

r2l <- read.table("kddcup_r2l.data",sep=",",header=T) 
#u2r <- read.table("kddcup_u2r.data",sep=",",header=T) 
#probe_original <- read.table("kddcup_probe.data",sep=",",header=T) 
#dos <- read.table("kddcup_dos.data",sep=",",header=T) 
normal <- read.table("kddcup_normal.data",sep=",",header=T) 

#probe <- probe_original[sample(1:dim(probe_original)[1],10000),] 

# Features selected by the three algorithms svm, lgp and mars 
# for the different classes of attack 
######################################################################## 

features.r2l.svm <- c("srv_count","service","duration","count","dst_host_count") 
features.r2l.lgp <- c("is_guest_login","num_access_files","dst_bytes","num_failed_logins","logged_in") 
features.r2l.mars <- c("srv_count","service","dst_host_srv_count","count","logged_in") 
features.r2l.combined <- unique(c(features.r2l.svm,features.r2l.lgp,features.r2l.mars)) 



#  Sample the training set containing the normal labels 
#  for each class of attack in order to have the same number 
#  of training data belonging to the "normal" class and the 
#  "attack" class 
####################################################################### 

normal_sample.r2l <- normal[sample(1:dim(normal)[1],dim(r2l)[1]),] 


# This part was useful before the separation normal/attack because 
# attack was composed of different types for each class 
###################################################################### 

normal.r2l.Y <- matrix(normal_sample.r2l[,c("label")]) 


####################################################################### 
#  Class of attack Remote to Local (r2l) 
####################################################################### 

# Select the features according to the algorithms(svm,lgp and mars) 
# for this particular type of attack. Combined contains the 
# combination of the features selected by the 3 algorithms 
####################################################################### 
#features.r2l.svm <- c(features.r2l.svm,"label") 
r2l_svm <- r2l[,features.r2l.svm] 
r2l_lgp <- r2l[,features.r2l.lgp] 
r2l_mars <- r2l[,features.r2l.mars] 
r2l_combined <- r2l[,features.r2l.combined] 
r2l_ALL <- r2l[,colnames(r2l) != "label"] 

r2l.Y <- matrix(r2l[,c("label")]) 
r2l.Y[,1] = "attack" 



# Merge the "normal" instances and the "r2l" instances and shuffle the result 
############################################################################### 

r2l_svm.tr <- rbind(normal_sample.r2l[,features.r2l.svm],r2l_svm) 
r2l_svm.tr <- r2l_svm.tr[sample(1:nrow(r2l_svm.tr),replace=F),] 
r2l_lgp.tr <- rbind(normal_sample.r2l[,features.r2l.lgp],r2l_lgp) 
r2l_lgp.tr <- r2l_lgp.tr[sample(1:nrow(r2l_lgp.tr),replace=F),] 
r2l_mars.tr <- rbind(normal_sample.r2l[,features.r2l.mars],r2l_mars) 
r2l_mars.tr <- r2l_mars.tr[sample(1:nrow(r2l_mars.tr),replace=F),] 
r2l_ALL.tr <- rbind(normal_sample.r2l[,colnames(normal_sample.r2l) != "label"],r2l_ALL) 
r2l_ALL.tr <- r2l_ALL.tr[sample(1:nrow(r2l_ALL.tr),replace=F),] 

r2l.Y.tr <- rbind(normal.r2l.Y,r2l.Y) 
r2l.Y.tr <- matrix(r2l.Y.tr[sample(1:nrow(r2l.Y.tr),replace=F),]) 

####################################################################### 
# 
#  10-fold CROSS-VALIDATION to assess the models accuracy 
# 
####################################################################### 

# CV for Remote to Local 
########################  
cv(r2l_svm.tr, r2l_lgp.tr, r2l_mars.tr, r2l_ALL.tr, r2l.Y.tr) 

和交叉验证功能:

cv <- function(svm.tr, lgp.tr, mars.tr, ALL.tr, Y.tr){ 

Jcv.svm_mean <- NULL 

#Compute the size of the cross validation 
# ======================================= 
index=sample(1:dim(svm.tr)[1]) 
size.CV<-floor(dim(svm.tr)[1]/10) 

Jcv.svm <- NULL 

#Start 10-fold Cross validation 
# ============================= 
for (i in 1:10) { 
    # if m is the size of the training set 
    # (nr of rows in svm.tr for example) 
    # take n observations for test and (m-n) for training 
    # with n << m (here n = m/10) 
    # =================================================== 
    i.ts<-(((i-1)*size.CV+1):(i*size.CV)) 
    i.tr<-setdiff(index,i.ts) 

    Y.tr.tr <- as.factor(Y.tr[i.tr])  
    Y.tr.ts <- as.factor(matrix(Y.tr[i.ts],ncol=1)) 

    svm.tr.tr <- svm.tr[i.tr,] 
    svm.tr.ts <- svm.tr[i.ts,] 


    # Get the model for the algorithms 
    # ============================================== 


    model.svm <- svm(Y.tr.tr~.,svm.tr.tr,type="C-classification") 

    # Compute the prediction 
    # ============================================== 
    Y.hat.ts.svm <- predict(model.svm,svm.tr.ts) 

    # Compute the error 
    # ============================================== 

    h.svm <- NULL 

    h.svm <- matrix(Y.hat.ts.svm,ncol=1) 

    Jcv.svm <- c(Jcv.svm ,sum(!(h.svm == Y.tr.ts))/size.CV) 
    print(table(h.svm,Y.tr.ts)) 

} 

Jcv.svm_mean <- c(Jcv.svm_mean, mean(Jcv.svm)) 

d <- 10 
print(paste("Jcv.svm_mean: ", round(Jcv.svm_mean,digits=d))) 
} 

我得到很奇怪的结果看来,该算法并没有真正看到的情况有什么区别它看起来像一个猜测比预测更我。也尝试了攻击类“探测器”,但获得了相同的结果我前面提到的论文在等级r2l上为30%,在探头上为60-98%(取决于多项式等级)。

下面是10倍交叉验证中的一个预测:

h.svm(攻击)& Y.tr.ts(攻击) - > 42个实例

小时。 SVM(攻击)& Y.tr.ts(正常) - (。正常)> 44个实例

h.svm & Y.tr.ts(攻击) - > 71个实例

小时。 svm(normal。)& Y.tr.ts(normal。) - > 68 insta nces

如果有人能告诉我我的代码出了什么问题,我将非常感激。

预先感谢您

+0

显然,没有人似乎回答...是否因为我的问题没有很好地形成?还是因为没有人看到什么是错的? – Alex

+0

这属于[DataScience.SE](http://datascience.stackexchange.com),但现在太老,无法迁移。推荐你在那里试试。 – smci

回答