2017-04-06 9 views
0

标题:使用matplotlib标记K-means簇数据点

我从一个.csv文件(databoth.csv)中读取了数据,实现了k-means聚类,并使用matplotlib绘制结果。数据共有3列(国家、出生率、预期寿命)。

我需要帮助输出: 属于每个群集的国家数量。 属于每个群集的国家列表。 每个群集的平均预期寿命和出生率。

这里是我的代码:

import csv 
import matplotlib.pyplot as plt 
import sys 
import pylab as plt 
import numpy as np 
plt.ion() 


def kMeans(data, K, maxIters = 10, plot_progress = None):
    """Cluster the rows of ``data`` into ``K`` groups with k-means.

    Parameters:
        data: 2-D array, one point per row.
        K: number of clusters.
        maxIters: number of assignment/update rounds to run.
        plot_progress: optional callback invoked after every round as
            ``plot_progress(data, labels, centroids)``.

    Returns:
        ``(centroids, C)`` where ``centroids`` is a ``(K, n_features)`` array
        and ``C`` holds, for each row of ``data``, the index of the centroid
        it was assigned to in the last round.
    """
    data = np.asarray(data)
    n_points = len(data)

    # Seed with distinct rows whenever possible: sampling with replacement
    # can pick the same row twice, which leaves one cluster empty and turns
    # its mean into NaN.
    seeds = np.random.choice(n_points, K, replace=K > n_points)
    centroids = data[seeds, :].astype(float)

    # Only returned unchanged when maxIters <= 0.
    C = np.zeros(n_points, dtype=int)

    for _ in range(maxIters):
        # Cluster assignment step: squared distance of every point to every
        # centroid, then the index of the nearest centroid per point.
        offsets = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        C = np.argmin((offsets ** 2).sum(axis=2), axis=1)

        # Move centroids step: an empty cluster keeps its previous centroid
        # instead of becoming NaN.
        for k in range(K):
            members = data[C == k]
            if len(members):
                centroids[k] = members.mean(axis=0)

        if plot_progress is not None:
            plot_progress(data, C, centroids.copy())

    return centroids, C


def euclidean_dist(data, centroids, clusters):
    """Assign every point in ``data`` to its nearest centroid.

    Parameters:
        data: 2-D array-like, one point per row.
        centroids: sequence of centroid coordinates.
        clusters: dict mapping centroid index -> list of points; it is
            updated in place and also returned.

    Returns:
        ``clusters``, with each point appended to the list of the centroid
        closest to it (Euclidean distance; the lowest index wins ties).
    """
    points = np.asarray(data)

    for instance in points:
        distances = [np.linalg.norm(instance - centre) for centre in centroids]
        mu_index = int(np.argmin(distances))
        clusters.setdefault(mu_index, []).append(instance)

    # If any cluster is empty then assign one point from the data set
    # randomly so as to not have empty clusters and 0 means.  Iterate over
    # centroid indices so clusters that never received a point are covered.
    if len(points):
        for index in range(len(centroids)):
            if not clusters.get(index):
                clusters[index] = [points[np.random.randint(0, len(points))]]

    return clusters


def csvRead(file):
    """Read comma-separated numeric data from ``file`` and return it.

    ``file`` is handed straight to ``np.genfromtxt``, so it may be a path or
    any other input that function accepts.

    NOTE(review): with the default float dtype, non-numeric fields (e.g. a
    country-name column) presumably come back as ``nan`` - confirm against
    the real CSV layout.
    """
    # Use the caller's path and return the array: the data was previously
    # read from a hard-coded file name and then discarded.
    return np.genfromtxt(file, delimiter=',')




def show(X, C, centroids, keep = False):
    """Plot the clustered points and their centroids on the current axes.

    Parameters:
        X: 2-D array of points; columns 0 and 1 are plotted as x and y.
        C: array of cluster labels, one per row of ``X``.
        centroids: 2-D array of centroid coordinates, one row per cluster.
        keep: when true, turn interactive mode off and block in
            ``plt.show()`` so the final figure stays on screen.
    """
    import time

    # Short pause so successive iterations are visible as an animation.
    time.sleep(0.5)
    plt.cla()

    # One colour per cluster; the first three match the original blue/red/
    # green scheme, and the palette repeats if there are more clusters.
    # Magenta is left out because it marks the centroids.
    palette = 'brgcyk'
    for k in range(len(centroids)):
        members = X[C == k]
        plt.plot(members[:, 0], members[:, 1], '*' + palette[k % len(palette)])

    plt.plot(centroids[:, 0], centroids[:, 1], '*m', markersize=20)
    plt.draw()

    if keep:
        plt.ioff()
        plt.show()

# Driver script: builds a synthetic 3-cluster data set, runs k-means on it
# and displays the result.
# NOTE(review): `data` is not used anywhere below; the clustering runs on the
# synthetic sample X instead of the CSV contents - confirm this is intended.
data = csvRead('dataBoth.csv')
# Mean vector and covariance matrix for each of the three synthetic clusters.
# NOTE(review): cov1 and cov3 are not symmetric as written - confirm the
# intended values.
m1, cov1 = [9, 8], [[1.5, 2], [1, 2]]
m2, cov2 = [5, 13], [[2.5, -1.5], [-1.5, 1.5]]
m3, cov3 = [3, 7], [[0.25, 0.5], [-0.1, 0.5]]
# Draw 250, 180 and 100 samples from the three distributions.
data1 = np.random.multivariate_normal(m1, cov1, 250)
data2 = np.random.multivariate_normal(m2, cov2, 180)
data3 = np.random.multivariate_normal(m3, cov3, 100)
# Stack all samples into one array and shuffle its rows in place.
X = np.vstack((data1,np.vstack((data2,data3))))
np.random.shuffle(X)


# calls to the functions
# first to find centroids using k-means (show is called after each iteration)
centroids, C = kMeans(X, K = 3, plot_progress = show)
# second to show the centroids on the graph; keep=True is passed so show()
# takes its `keep` branch
show(X, C, centroids, True)

回答

0

也许你可以使用 annotate:http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.annotate

更多例子: http://matplotlib.org/users/annotations.html#plotting-guide-annotation

这样就可以在每个点附近放置一个文本标签。

或者,你也可以像这篇帖子(post)里那样用颜色来区分各个簇。

+0

嗨 @Dadep,我编辑了我的问题,把需要帮助的地方写得更清楚了。 – Zambo004

+0

所以你应该遍历每个'cluster'中已放入的所有'instance',并对它们进行统计。我稍后会尽量编辑我的回答。 – Dadep

+0

拜托了,那会对我帮助很大! – Zambo004