2014-06-12 21 views
2

我一直在玩 MNIST 数字识别数据集,但有点卡住了。我阅读了一些研究论文,并实现了我所理解的全部内容。基本上我的做法是:首先创建训练集和用于评估分类器的交叉验证集,然后对数据运行 PCA,再使用 KNN 和 SVM 执行分类任务。我面临的主要问题是:应该先在全部数据上运行 PCA 再划分训练集和交叉验证集,还是先划分、再分别在两个集合上单独运行 PCA。我很抱歉问一个自己已经尝试过的问题——两种情况我都试过了:第一种情况下分类器表现出色,但我猜这是因为 PCA 在构建主成分时用到了测试数据,从而给模型引入了偏差;另一种情况下准确率只有 20% 到 30% 左右,非常低。所以我有点困惑,不知道该如何改进我的模型,高度感谢任何帮助和指导。我已经把代码粘贴在下面以供参考。(标题:MNIST 数字识别数据集性能不佳)

library(ggplot2) 
library(e1071) 
library(ElemStatLearn) 
library(plyr) 
library(class) 

# Read a comma-separated file that has a header row, keeping string
# columns as plain character vectors (no factor conversion).
import.csv <- function(filename) {
  read.csv(filename, sep = ",", header = TRUE, stringsAsFactors = FALSE)
}

# Load the labelled MNIST training file, then carve out a held-out test
# slice (rows 30001-32000) before shrinking the training slice to the
# first 6000 rows.  Order matters: test.data must be taken before
# train.data is overwritten.
train.data <- import.csv("train.csv")
test.data  <- train.data[30001:32000, ]
train.data <- train.data[seq_len(6000), ]

#Performing PCA on the dataset to reduce the dimensionality of the data 

# Project the feature columns of `dataset` onto its leading principal
# components.
#
# Drops the "label" column and any zero-variance columns (prcomp with
# scale. = TRUE cannot standardise a constant column), then runs PCA
# with centering and scaling.  Draws a scree plot as a side effect.
#
# dataset     - data frame of numeric predictors; an optional "label"
#               column is excluded from the decomposition.
# num.of.comp - number of leading components to keep, capped at the
#               number actually available; default 50 matches the
#               original hard-coded value.
#
# Returns a data frame of the retained component scores (PC1, PC2, ...).
get_PCA <- function(dataset, num.of.comp = 50) {
  features <- dataset[, !(colnames(dataset) %in% "label"), drop = FALSE]

  # BUG FIX: the original computed the zero-variance columns but then
  # dropped them from `dataset` instead of `dataset.features`, silently
  # re-admitting the label column into the PCA.  Drop them from the
  # feature matrix itself.
  zero.variance <- sapply(features, function(v) var(v, na.rm = TRUE) == 0)
  features <- features[, !zero.variance, drop = FALSE]

  pr.comp <- prcomp(features, retx = TRUE, center = TRUE, scale. = TRUE)

  # Proportion of variance retained must use sdev^2 (variances), not sdev:
  # print(paste0("%age of variance contained = ",
  #              sum(pr.comp$sdev[seq_len(num.of.comp)]^2) / sum(pr.comp$sdev^2)))
  screeplot(pr.comp, type = "lines", main = "Principal Components")

  keep <- seq_len(min(num.of.comp, ncol(pr.comp$x)))
  return(data.frame(pr.comp$x[, keep, drop = FALSE]))
}

#Perform k-fold cross validation 

# Perform k-fold cross-validation of a classifier over data frame `df`.
#
# `classifier` is a string such as "svm" (dispatches to get_pred_svm) or
# "5nn" (digits + "nn" dispatch to get_pred_knn).  NOTE(review): the
# digit prefix is parsed but never forwarded to the knn model because
# get_pred_knn's original interface takes only (train, test) — confirm
# whether the neighbour count should be wired through.
#
# df         - data frame with a "label" column plus feature columns.
# k          - number of folds.
# classifier - classifier spec string as above.
#
# Prints each fold's confusion matrix and returns the rbind-of-lists
# structure the original produced: one row per fold with the fold ID
# (sample_ID) and its accuracy percentage (Accuracy).
do_cv_class <- function(df, k, classifier) {
  num_of_nn <- gsub("[^[:digit:]]", "", classifier)
  base_name <- gsub("[[:digit:]]", "", classifier)
  if (num_of_nn == "") {
    func_name <- paste0("get_pred_", base_name)
  } else {
    func_name <- paste0("get_pred_k", base_name)
    num_of_nn <- as.numeric(num_of_nn)  # parsed but currently unused (see NOTE)
  }
  pred_fun <- match.fun(func_name)

  n <- nrow(df)

  # Fold sizes: n rows split as evenly as possible across k folds.
  # BUG FIX: the original looped over 1:n here, producing n fold sizes
  # instead of k.
  size_distr <- integer(k)
  for (i in seq_len(k)) {
    a <- 1 + (((i - 1) * n) %/% k)
    b <- (i * n) %/% k
    size_distr[i] <- b - a + 1
  }

  # Randomly partition the row indices into k disjoint folds.
  # BUG FIX: the original passed the whole size_distr vector to sample();
  # the size of fold i is the single value size_distr[i].
  row_num <- seq_len(n)
  sampling <- vector("list", k)
  for (i in seq_len(k)) {
    s <- sample(row_num, size_distr[i])
    sampling[[i]] <- s
    row_num <- setdiff(row_num, s)
  }

  # The original's if/else branches in this loop were byte-identical
  # (apart from a debug print), so they are merged into one path.
  outcome.list <- list()
  for (i in seq_len(k)) {
    test_rows <- sampling[[i]]
    train_set <- df[-test_rows, ]
    test_set <- df[test_rows, ]

    result <- pred_fun(train_set, test_set)
    confusion.matrix <- table(pred = result, true = test_set$label)
    accuracy <- 100 * sum(diag(confusion.matrix)) / sum(confusion.matrix)
    print(confusion.matrix)
    outcome.list <- rbind(outcome.list,
                          list(sample_ID = i, Accuracy = accuracy))
  }
  return(outcome.list)
}

# Support Vector Machine classifier.  NOTE: the original header said
# "linear kernel" but the model is fitted with kernel = "radial" (RBF).
#
# train, test - data frames with a "label" column plus feature columns.
# Returns the factor of predicted labels for the rows of `test`.
get_pred_svm <- function(train, test) {
  digit.class.train <- as.factor(train$label)
  # BUG FIX: the original used train[, -train$label], which removes the
  # columns *numbered by the label values* (0-9), not the label column.
  train.features <- train[, !(colnames(train) %in% "label"), drop = FALSE]
  test.features <- test[, !(colnames(test) %in% "label"), drop = FALSE]
  svm.model <- svm(train.features, digit.class.train,
                   cost = 10, gamma = 0.0001, kernel = "radial")
  return(predict(svm.model, test.features))
}

# k-nearest-neighbour classifier.
#
# train, test       - data frames with a "label" column plus feature columns.
# num.of.neighbours - k for class::knn; defaults to 1 (knn's own default),
#                     so the original two-argument calls keep working.
# Returns the factor of predicted labels for the rows of `test`.
get_pred_knn <- function(train, test, num.of.neighbours = 1) {
  digit.class.train <- as.factor(train$label)
  train.features <- train[, !(colnames(train) %in% "label"), drop = FALSE]
  # BUG FIX: the original subset the test frame with colnames(train),
  # which is only harmless when both frames share an identical layout.
  test.features <- test[, !(colnames(test) %in% "label"), drop = FALSE]
  return(knn(train.features, test.features, digit.class.train,
             k = num.of.neighbours))
}

=========================================== =============================

回答

4

将 PCA 视为应用到数据上的一种变换。你需要记住两件事:

  1. 测试集模拟的是“真实世界”中你从未见过的样本,因此除了评估分类器之外,你不能把测试集用于任何其他用途。
  2. 您需要对所有样本应用相同的转换。

因此,你需要只在训练集上拟合 PCA,并保存该变换的参数。这包括两条信息:

  1. 均值:需要从每个样本中减去它,以便对数据做中心化。
  2. 变换矩阵,即协方差矩阵的特征向量。

然后把完全相同的变换应用到测试集上。

+0

谢谢你的帮助和指导:)。 – user37940

+0

如果它有帮助,考虑upvoting /接受:) –

+0

我很感谢你帮助我,因为我得到了96.87%的准确性,这是惊人的,我很抱歉没有投票,因为它一直说我需要有15这样做的声誉。 – user37940