2011-06-21 50 views
3

我正在编写一段代码,对一组数据执行k-means聚类。实际上我使用的是O'Reilly出版的集体智慧一书中的代码。一切正常,但书中的代码是在命令行(交互式解释器)里运行的,而我想把所有内容都写在Notepad++里。我的问题是:如何在Python中打印出数组中的对象?作为参考,他的代码行是:

>>>kclust=clusters.kcluster(data,k=10) 
>>>[rownames[r] for r in k[0]] 

这里是我的代码:

from PIL import Image,ImageDraw 
def readfile(filename):
    """Load a tab-separated table of numeric rows from *filename*.

    The first line holds the column titles.  On every following line the
    first field is the row name and the remaining fields are parsed as
    floats.  Whitespace-only data lines are skipped.

    Returns:
        A ``(rownames, colnames, data)`` tuple where ``data[i]`` is the list
        of floats belonging to ``rownames[i]``.

    Raises:
        IndexError: if the file contains no lines at all.
        ValueError: if a data field cannot be parsed as a float.
    """
    # open() replaces the Python-2-only file() builtin, and the with-block
    # closes the handle even when parsing fails further down.
    with open(filename) as handle:
        lines = [line for line in handle]

    # First line is the column titles; its first field labels the name column.
    colnames = lines[0].strip().split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        stripped = line.strip()
        if not stripped:
            # A blank line (e.g. a trailing newline at EOF) would otherwise
            # add a row with an empty name and an empty vector.
            continue
        fields = stripped.split('\t')
        # First column in each row is the row name, the rest is its data.
        rownames.append(fields[0])
        data.append([float(x) for x in fields[1:]])
    return rownames, colnames, data


from math import sqrt 
def pearson(v1,v2):
    """Return the Pearson correlation distance between two vectors.

    The result is ``1.0 - r`` where ``r`` is the Pearson correlation
    coefficient, so smaller values mean the vectors are more alike.

    Args:
        v1: sequence of numbers; its length is used as the sample count.
        v2: sequence of numbers, indexed in step with *v1*.

    Returns:
        ``1.0 - r``, or ``0`` when the correlation is undefined (empty input
        or a vector with no variance).

    Raises:
        IndexError: if *v2* is shorter than *v1*.
    """
    # Float count so integer inputs do not hit truncating division on
    # Python 2.
    n = float(len(v1))
    if n == 0:
        return 0

    # Simple sums
    sum1 = sum(v1)
    sum2 = sum(v2)

    # Sums of the squares
    sum1_sq = sum(v ** 2 for v in v1)
    sum2_sq = sum(v ** 2 for v in v2)

    # Sum of the products
    p_sum = sum(a * v2[i] for i, a in enumerate(v1))

    num = p_sum - (sum1 * sum2 / n)
    # Rounding can push a (near-)zero variance term slightly below zero,
    # which would make sqrt() raise; clamp each term at zero instead.
    var1 = max(sum1_sq - sum1 ** 2 / n, 0.0)
    var2 = max(sum2_sq - sum2 ** 2 / n, 0.0)
    den = sqrt(var1 * var2)
    if den == 0:
        return 0

    return 1.0 - num / den

class bicluster:
    """Node of a clustering tree.

    Holds a data vector, an identifier, optional references to a left and a
    right child node, and the distance value supplied for this node.
    """

    def __init__(self,vec,left=None,right=None,distance=0.0,id=None):
        # Payload of the node and its identifier.
        self.vec, self.id = vec, id
        # Child nodes; both stay None unless the caller supplies them.
        self.left, self.right = left, right
        self.distance = distance

def hcluster(rows,distance=pearson):
    """Agglomeratively cluster *rows* into a binary tree of biclusters.

    Every row starts as its own leaf cluster whose id is the row index.  The
    two closest clusters are repeatedly merged into a new node whose vector
    is the element-wise mean of the pair, until a single cluster remains.

    Args:
        rows: sequence of numeric vectors (assumed to share one length).
        distance: callable taking two vectors and returning a number;
            smaller means closer.

    Returns:
        The root ``bicluster``.  Merged nodes carry negative ids
        (-1, -2, ...) and their ``distance`` is the distance between the two
        clusters they were built from.

    Raises:
        IndexError: if *rows* is empty.
    """
    # Cache of pairwise distances keyed by (id, id).  Ids are never reused,
    # so cached entries stay valid across merges.
    cache = {}
    next_id = -1

    # Clusters are initially just the rows.
    clust = [bicluster(row, id=i) for i, row in enumerate(rows)]

    while len(clust) > 1:
        lowestpair = (0, 1)
        # Seeded from the first cached pair below rather than by an extra,
        # uncached distance() call on every pass.
        closest = None

        # Loop through every pair looking for the smallest distance.
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                key = (clust[i].id, clust[j].id)
                if key not in cache:
                    cache[key] = distance(clust[i].vec, clust[j].vec)

                d = cache[key]
                # Strict '<' keeps the earliest pair on ties.
                if closest is None or d < closest:
                    closest = d
                    lowestpair = (i, j)

        first = clust[lowestpair[0]]
        second = clust[lowestpair[1]]

        # The merged vector is the average of the two clusters.
        mergevec = [
            (first.vec[i] + second.vec[i]) / 2.0
            for i in range(len(clust[0].vec))
        ]

        newcluster = bicluster(mergevec, left=first, right=second,
                               distance=closest, id=next_id)

        # Cluster ids that weren't in the original set are negative.
        next_id -= 1
        # Delete the higher index first so the lower one stays valid.
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)

    return clust[0]

def kcluster(rows,distance=pearson,k=4):
    """Partition *rows* into *k* groups with k-means clustering.

    Centroids start at random positions inside the per-column value range
    of the data.  Each pass assigns every row to its nearest centroid and
    then moves each centroid to the mean of its members.  The loop stops as
    soon as the assignment repeats, or after 100 passes.  An
    ``Iteration N`` line is printed for every pass.

    Args:
        rows: sequence of numeric vectors (assumed to share one length).
        distance: callable taking two vectors and returning a number;
            smaller means closer.
        k: number of centroids.

    Returns:
        A list of *k* lists; entry ``i`` holds the indices into *rows* of
        the rows assigned to centroid ``i`` on the last pass.

    Raises:
        IndexError: if *rows* is empty.
    """
    # Imported locally: no import of random is visible at module level, so
    # the random.random() call below would otherwise raise NameError.
    import random

    dims = range(len(rows[0]))

    # Minimum and maximum value of every column.
    ranges = [(min(row[i] for row in rows), max(row[i] for row in rows))
              for i in dims]

    # Create k randomly placed centroids inside those ranges.
    clusters = [[random.random() * (hi - lo) + lo for lo, hi in ranges]
                for _ in range(k)]

    lastmatches = None
    for t in range(100):
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print('Iteration %d' % t)
        bestmatches = [[] for _ in range(k)]

        # Find which centroid is the closest for each row.
        for j, row in enumerate(rows):
            bestmatch = 0
            best_d = None
            for i in range(k):
                d = distance(clusters[i], row)
                # Remember the best distance instead of recomputing it on
                # every comparison; strict '<' keeps the first on ties.
                if best_d is None or d < best_d:
                    best_d = d
                    bestmatch = i
            bestmatches[bestmatch].append(j)

        # If the results are the same as last time, this is complete.
        if bestmatches == lastmatches:
            break
        lastmatches = bestmatches

        # Move the centroids to the average of their members; a centroid
        # with no members keeps its previous position.
        for i in range(k):
            members = bestmatches[i]
            if members:
                totals = [0.0] * len(rows[0])
                for rowid in members:
                    for m, value in enumerate(rows[rowid]):
                        totals[m] += value
                clusters[i] = [total / len(members) for total in totals]

    return bestmatches
+2

对一个归结起来相当简单的问题(“如何打印……”)给出了很好的描述,但这里的代码太多了。请考虑少贴一些代码,只保留相关的部分。 –

回答

2

我猜你是想把它作为一个程序来运行?最简单的方法就是在文件底部添加这几行:

# Run the clustering and print the row names assigned to the first cluster.
# NOTE(review): assumes `rownames` and `data` were loaded beforehand and that
# the clustering code is importable as `clusters`; drop the `clusters.`
# prefix if these lines live in the same file as kcluster -- confirm.
kclust=clusters.kcluster(data,k=10)
# Index the result (kclust); `k` is only the keyword argument name above.
print([rownames[r] for r in kclust[0]])

不过,如果你打算以后扩展这个程序,可能需要再添加一个main函数。

编辑:关于main函数,Guido写过一篇说明,见here