
Maybe I am missing something, because this seems like a simple problem, but I have searched online and could not find anything on it in the literature: k-means clustering with a constraint on node values.

Basically, what I need to do is cluster a set of destination cities into a fixed number of clusters based on their location (so latitude/longitude as the features of each node, and Euclidean distance as the similarity measure). Everything seems fine, and a plain k-means would do the trick. However, I have the following constraint for each cluster: every city (node) has a value assigned to it, and the sum of these values within each cluster must not exceed a fixed threshold (the same threshold for all clusters). Is there an easy way to do this?
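For reference, a minimal sketch of the unconstrained baseline described above, assuming a hypothetical data frame cities with columns lat, lon and value, and a hypothetical threshold:

coords <- as.matrix(cities[, c("lat", "lon")])
km <- kmeans(coords, centers = 5)       # plain k-means, no constraint yet

# the part that still needs to be enforced: per-cluster sums of the city values
tapply(cities$value, km$cluster, sum)   # each sum should stay <= threshold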


This is a **knapsack**-type problem, not a clustering problem. Also, do not use k-means on latitude/longitude. –
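(As a side note on the latitude/longitude point: a common workaround, sketched here with the hypothetical names latlon_to_xyz and cities, is to convert degrees to 3-D Cartesian coordinates so that the Euclidean distance used by kmeans() approximates geographic proximity.)

latlon_to_xyz <- function(lat, lon, R = 6371) {  # R: Earth radius in km
    lat <- lat * pi / 180
    lon <- lon * pi / 180
    cbind(x = R * cos(lat) * cos(lon),
          y = R * cos(lat) * sin(lon),
          z = R * sin(lat))
}

# xyz <- latlon_to_xyz(cities$lat, cities$lon)
# km  <- kmeans(xyz, centers = 5)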


No, this is not a knapsack problem. It is simply clustering with constraints. –

Answers

0

You have two options:

- You could instead use the rpart package as a clustering method, using its weights and minbucket options. However, the clusters it gives you will be rectangular.

- You can look at the source code, which I found at https://searchcode.com/codesearch/view/18689414/:

kmeans <- 
function(x, centers, iter.max = 10, nstart = 1, 
     algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen")) 
{ 
    do_one <- function(nmeth) { 
     Z <- 
      switch(nmeth, 
        { # 1 
         Z <- .Fortran(C_kmns, x, m, p, 
           centers = centers, 
           as.integer(k), c1 = integer(m), integer(m), 
           nc = integer(k), double(k), double(k), integer(k), 
           double(m), integer(k), integer(k), 
           as.integer(iter.max), wss = double(k), 
           ifault = 0L) 
         switch(Z$ifault, 
           stop("empty cluster: try a better set of initial centers", 
            call.=FALSE), 
           warning(gettextf("did not converge in %d iterations", 
               iter.max), call.=FALSE, domain =NA), 
           stop("number of cluster centres must lie between 1 and nrow(x)", 
            call.=FALSE) 
          ) 
         Z 
        }, 
        { # 2 
         Z <- .C(C_kmeans_Lloyd, x, m, p, 
           centers = centers, as.integer(k), 
           c1 = integer(m), iter = as.integer(iter.max), 
           nc = integer(k), wss = double(k)) 
         if(Z$iter > iter.max) 
          warning("did not converge in ", 
            iter.max, " iterations", call.=FALSE) 
         if(any(Z$nc == 0)) 
          warning("empty cluster: try a better set of initial centers", call.=FALSE) 
         Z 
        }, 
        { # 3 
         Z <- .C(C_kmeans_MacQueen, x, m, p, 
           centers = as.double(centers), as.integer(k), 
           c1 = integer(m), iter = as.integer(iter.max), 
           nc = integer(k), wss = double(k)) 
         if(Z$iter > iter.max) 
          warning("did not converge in ", 
            iter.max, " iterations", call.=FALSE) 
         if(any(Z$nc == 0)) 
          warning("empty cluster: try a better set of initial centers", call.=FALSE) 
         Z 
        }) 
     Z 
    } 
    x <- as.matrix(x) 
    m <- as.integer(nrow(x)) 
    if(is.na(m)) stop("invalid nrow(x)") 
    p <- as.integer(ncol(x)) 
    if(is.na(p)) stop("invalid ncol(x)") 
    if(missing(centers)) 
    stop("'centers' must be a number or a matrix") 
    nmeth <- switch(match.arg(algorithm), 
        "Hartigan-Wong" = 1, 
        "Lloyd" = 2, "Forgy" = 2, 
        "MacQueen" = 3) 
    if(length(centers) == 1L) { 
    if (centers == 1) nmeth <- 3 
    k <- centers 
     ## we need to avoid duplicates here 
     if(nstart == 1) 
      centers <- x[sample.int(m, k), , drop = FALSE] 
     if(nstart >= 2 || any(duplicated(centers))) { 
      cn <- unique(x) 
      mm <- nrow(cn) 
      if(mm < k) 
       stop("more cluster centers than distinct data points.") 
      centers <- cn[sample.int(mm, k), , drop=FALSE] 
     } 
    } else { 
    centers <- as.matrix(centers) 
     if(any(duplicated(centers))) 
      stop("initial centers are not distinct") 
     cn <- NULL 
    k <- nrow(centers) 
     if(m < k) 
      stop("more cluster centers than data points") 
    } 
    if(iter.max < 1) stop("'iter.max' must be positive") 
    if(ncol(x) != ncol(centers)) 
    stop("must have same number of columns in 'x' and 'centers'") 
    if(!is.double(x)) storage.mode(x) <- "double" 
    if(!is.double(centers)) storage.mode(centers) <- "double" 
    Z <- do_one(nmeth) 
    best <- sum(Z$wss) 
    if(nstart >= 2 && !is.null(cn)) 
    for(i in 2:nstart) { 
     centers <- cn[sample.int(mm, k), , drop=FALSE] 
     ZZ <- do_one(nmeth) 
     if((z <- sum(ZZ$wss)) < best) { 
     Z <- ZZ 
     best <- z 
     } 
    } 
    centers <- matrix(Z$centers, k) 
    dimnames(centers) <- list(1L:k, dimnames(x)[[2L]]) 
    cluster <- Z$c1 
    if(!is.null(rn <- rownames(x))) 
     names(cluster) <- rn 
    totss <- sum(scale(x, scale = FALSE)^2) 
    structure(list(cluster = cluster, centers = centers, totss = totss, 
        withinss = Z$wss, tot.withinss = best, 
        betweenss = totss - best, size = Z$nc), 
      class = "kmeans") 
} 

## modelled on print methods in the cluster package 
print.kmeans <- function(x, ...) 
{ 
    cat("K-means clustering with ", length(x$size), " clusters of sizes ", 
     paste(x$size, collapse=", "), "\n", sep="") 
    cat("\nCluster means:\n") 
    print(x$centers, ...) 
    cat("\nClustering vector:\n") 
    print(x$cluster, ...) 
    cat("\nWithin cluster sum of squares by cluster:\n") 
    print(x$withinss, ...) 
    cat(sprintf(" (between_SS/total_SS = %5.1f %%)\n", 
     100 * x$betweenss/x$totss), 
    "Available components:\n", sep="\n") 
    print(names(x)) 
    invisible(x) 
} 

fitted.kmeans <- function(object, method = c("centers", "classes"), ...) 
{ 
    method <- match.arg(method) 
    if (method == "centers") object$centers[object$cl, , drop=FALSE] 
    else object$cl 
} 

Note that in this code the check for an improvement happens in these lines:

if((z <- sum(ZZ$wss)) < best) { 
     Z <- ZZ 
     best <- z 
     } 

This is where you can add your constraint.
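For example, a minimal sketch (not the original code; value is the per-city value vector and threshold the per-cluster limit, both hypothetical names) of how that check could be extended so that a restart is only accepted when every cluster respects the constraint:

feasible <- function(cluster, value, threshold) {
    all(tapply(value, cluster, sum) <= threshold)  # per-cluster sums of city values
}

if((z <- sum(ZZ$wss)) < best && feasible(ZZ$c1, value, threshold)) {
    Z <- ZZ
    best <- z
}

Note that this only filters the random restarts; it does not force any individual run to satisfy the threshold.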

0

You can use the same principle as k-means. Iterate over steps 2-3 until convergence:

  1. Assign the cities to clusters (randomly).
  2. Compute the centroid of each cluster.
  3. Reassign the points to the centroids such that:
    • the sum of the distances from the points to their assigned centroids is minimized, and
    • the threshold constraint is respected.

In standard k-means there is no constraint, so the reassignment step simply assigns each point to its closest centroid. Here, you instead have to solve an optimization problem in that step. It may be faster if you simply model it as an integer programming problem; OR Tools has facilities for solving integer programming problems.
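A sketch of that reassignment step as a binary integer program, written here with the lpSolve R package as a stand-in for the OR Tools library mentioned above (all names -- assign_step, d, w, threshold, xy, centers -- are hypothetical):

library(lpSolve)

# d:         n x k matrix of squared distances from city i to centroid j
# w:         length-n vector of city values
# threshold: maximum total value allowed in any single cluster
assign_step <- function(d, w, threshold) {
    n <- nrow(d); k <- ncol(d)
    nv <- n * k                  # one binary variable x[i, j] per (city, cluster) pair
    obj <- as.vector(t(d))       # variable order: x[1,1..k], x[2,1..k], ...

    # each city must be assigned to exactly one cluster
    A1 <- matrix(0, n, nv)
    for (i in 1:n) A1[i, ((i - 1) * k + 1):(i * k)] <- 1

    # the total value of each cluster must not exceed the threshold
    A2 <- matrix(0, k, nv)
    for (j in 1:k) A2[j, seq(j, nv, by = k)] <- w

    sol <- lp(direction = "min", objective.in = obj,
              const.mat = rbind(A1, A2),
              const.dir = c(rep("=", n), rep("<=", k)),
              const.rhs = c(rep(1, n), rep(threshold, k)),
              all.bin = TRUE)

    # recover each city's cluster index from the flattened 0/1 solution
    max.col(matrix(sol$solution, n, k, byrow = TRUE))
}

# Usage inside the iteration above, given current centroids `centers` and
# city coordinates `xy` (squared Euclidean cross-distances in base R):
# d <- outer(rowSums(xy^2), rowSums(centers^2), "+") - 2 * xy %*% t(centers)
# cluster <- assign_step(d, w, threshold)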

Here is a Python implementation of k-means clustering with various constraints, including a maximum on the total weight of the instances in a cluster.