2015-12-02 28 views
0

对于我的作业,我必须将K-means算法应用于600个数据点,它们都有60个维度(或属性,如果您愿意的话)。我存储了600个数据点到6群(所以K = 6),这是我为它至今:K均值算法C++,交换点?

#include <iostream> //std::cout 
#include <fstream> //std::ifstream 
#include <string> //std::string 
#include <sstream> //std::istringstream 
#include <vector> //std::vector 
#include <cmath> //std::cmath 
#include <array> //std::array 

#define K 6 
#define MAX_ITERATIONS 10000000 
#define NUM_ATTRIBUTES 60 

struct Point{ 
     std::array<double, NUM_ATTRIBUTES> point; 
     std::string classType; 
}; 

struct Cluster{ 
     std::vector<Point> points; 
     Point centroid; 
}; 

int randNumGenerator(int max){ 
     int num = (rand() % max); 
     return num; 
} 

void setData(Point p, std::string line, std::vector<Point> &data, int index){ 
     std::stringstream s(line); 
     std::string classes[] = {"Normal", "Cyclic", "Increasing trend", 
           "Decreasing trend", "Upward shift", "Downward shift"}; 

     for(int i = 0; i < NUM_ATTRIBUTES; i++){ 
       double num; 
       if(s >> num){ 
         p.point[i] = num; 
       } 
     } 

     if(index > 0 && index <= 100){ 
       p.classType = classes[0]; 
     } 
     else if(index > 100 && index <= 200){ 
       p.classType = classes[1]; 
     } 
     else if(index > 200 && index <= 300){ 
       p.classType = classes[2]; 
     } 
     else if(index > 300 && index <= 400){ 
       p.classType = classes[3]; 
     } 
     else if(index > 400 && index <= 500){ 
       p.classType = classes[4]; 
     } 
     else if(index > 500 && index <= 600){ 
       p.classType = classes[5]; 
     } 
     data.push_back(p); 
} 

void initializeCentroids(std::vector<Point> &points, int num_clusters, 
           std::vector<Point> &centroids){ 
     Point p; 
     std::vector<bool> numsUsedAlready(points.size()); 
     for(int i = 0; i < num_clusters; i++){ 
       int randNum = randNumGenerator(points.size()); 
       while(numsUsedAlready[randNum]){ 
         randNum = randNumGenerator(points.size()); 
       } 
       numsUsedAlready[randNum] = true; 
       p = points[randNum]; 
       centroids.push_back(p); 
     } 
} 

double calculateDistance(Point p, Point centroid){ 
     double ret = 0; 
     for(int i = 0; i < p.point.size(); i++){ 
       double distance = p.point[i] - centroid.point[i]; 
       ret += distance * distance; 
     } 
     return sqrt(ret); 
} 

void setCentroids(std::vector<Cluster> &clusters, std::vector<Point> centroids){ 
     for(int i = 0; i < centroids.size(); i++){ 
       Cluster c; 
       c.points.push_back(centroids[i]); 
       c.centroid = centroids[i]; 
       clusters.push_back(c); 
     } 
} 

void setClusters(std::vector<Cluster> &clusters, std::vector<Point> points){ 
     int sendToCluster = 99999999; 
     for(int index = 0; index < points.size(); index++){ 
       double minDist = 99999999999; 
       Point p = points[index]; 
       for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){ 
         double tempDist = calculateDistance(p, clusters[clusterNum].centroid); 
         //std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl; 
         if(tempDist < minDist){ 
           minDist = tempDist; 
           sendToCluster = clusterNum; 
         } 
       } 
       //std::cout << "Pushing to clusterNUm " << sendToCluster << std::endl; 
       clusters[sendToCluster].points.push_back(p); 
     } 
} 

void updateCentroid(std::vector<Cluster> &clusters){ 
     for(int i = 0; i < clusters.size(); i++){ 
       Cluster c = clusters[i]; 
       for(int j = 0; j < NUM_ATTRIBUTES; j++){ 
         double avg = 0; 
         for(int h = 0; h < c.points.size(); h++){ 
           Point p = c.points[h]; 
           avg += p.point[j]; 
         } 
         double oldCentroidValue = c.centroid.point[j]; 
         c.centroid.point[j] = avg/c.points.size(); 
         std::cout << "old: " << oldCentroidValue << " new: " << c.centroid.point[j] << std::endl; 
       } 
     } 
} 

void k_clustering(std::vector<Point> &points, int num_clusters){ 
     std::vector<Point> centroids; 
     initializeCentroids(points, num_clusters, centroids); 
     std::vector<Cluster> clusters; 
     setCentroids(clusters, centroids); 

     /*for(int i = 0; i < clusters.size(); i++){ 
       for(int j = 0; j < NUM_ATTRIBUTES; j++){ 
         std::cout << clusters[i].centroid.point[j] << " "; 
       } 
       std::cout << std::endl; 
     } 
     */ 

     setClusters(clusters, points); 
     updateCentroid(clusters); 
     for(int i = 0; i < clusters.size(); i++){ 
       std::cout << i << " " << clusters[i].points.size() << std::endl; 
     } 
} 

void readInputFile(std::ifstream &file, std::vector<Point> &data){ 
     std::string line; 
     int counter = 0; 
     while(getline(file,line)){ 
       counter++; 
       Point p; 
       setData(p, line, data, counter); 
     } 
} 

void usageString(){ 
     std::cout << "Usage: output <input_file>" << std::endl; 
} 

char* checkFile(int argc, char *argv[]){ 
     if(argc < 2){ 
       usageString(); 
       exit(0); 
     } 
     else{ 
       char *inputfile = argv[1]; 
       return inputfile; 
     } 
} 

int main(int argc, char *argv[]){ 
     char *inputfile; 
     inputfile = checkFile(argc, argv); 

     std::ifstream input(inputfile); 
     if(!input.is_open()){ 
       std::cerr << "Error: Data file doesn't exist" << std::endl; 
       return EXIT_FAILURE; 
     } 
     srand(time(NULL)); 
     std::vector<Point> data; 
     readInputFile(input, data); 
     k_clustering(data, K); 
     //printData(data); 
     //Attribute closest cluster to each data point 
      //For each point, calculate distance to each centroid 
      //Assign point to cluster (and update centroid value, by finding mean values of all points) 
      //Repeat until nothing changed or after a certain number of iterations 

     return 1; 
} 

然而,我发现我的代码工作正常,只在一个迭代。但是在第一次迭代之后,在我的'setClusters'函数中,我推出了一个Point。我将不得不将这个Point移动到不同的集群(如果需要的话),但是我很困惑如何做到这一点,而不需要从该集群删除该Point并将其推送到另一个集群的漫长路由。我确信有一个更有效的方法可以将Point转换到另一个群集。

回答

0

我还没有时间浏览整个代码,但如果您有兴趣优化setClusters(),那么也许您可以尝试C++的移动语义和vectoremplace_back()

void setClusters(std::vector<Cluster> &clusters, std::vector<Point> points){ 
     int sendToCluster = 99999999; 
     for(int index = 0; index < points.size(); index++){ 
       double minDist = 99999999999; 
       Point& p = points[index]; // use a reference to avoid copying, unless you don't want to alter the original "points" vector 
       for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){ 
         double tempDist = calculateDistance(p, clusters[clusterNum].centroid); 
         //std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl; 
         if(tempDist < minDist){ 
           minDist = tempDist; 
           sendToCluster = clusterNum; 
         } 
       } 
       //std::cout << "Pushing to clusterNUm " << sendToCluster << std::endl; 
       clusters[sendToCluster].points 
             .emplace_back( // construct the new item in place 
              std::move(p) // by "moving" from p 
             ); 
     } 
     points.clear(); // clear the input vector, unless you don't want to alter it. cf. the comment on using references 
} 

另一个想法是Cluster s将持有“指数”,而不是实际的“点”。在这种情况下,你会声明结构是这样的:

struct Cluster{ 
     std::vector<int> pointIndices; 
     Point centroid; 
}; 

然后,在setClusters(),你可以推指数:

void setClusters(std::vector<Cluster> &clusters, const std::vector<Point> points){ // points can be const now 
     int sendToCluster = 99999999; 
     for(int index = 0; index < points.size(); index++){ 
       double minDist = 99999999999; 
       const Point& p = points[index]; // use a reference to avoid copying 
       for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){ 
         double tempDist = calculateDistance(p, clusters[clusterNum].centroid); 
         //std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl; 
         if(tempDist < minDist){ 
           minDist = tempDist; 
           sendToCluster = clusterNum; 
         } 
       } 
       //std::cout << "Pushing to clusterNUm " << sendToCluster << std::endl; 
       clusters[sendToCluster].pointIndices 
             .push_back(index); // cheap compared to pushing a "Point" 
     } 
} 

我希望它能帮助。