2

作为代码所示,次通过螺纹的增加,减少和通过螺纹的减小而增加,最佳为1。多线程增加矩阵乘法示例中的时间?

import java.util.concurrent.ExecutorService; 
import java.util.concurrent.Executors; 
import java.util.concurrent.TimeUnit; 

public class MatrixMultiplication { 
    public static class Matrix { 
     int[][] container; 
     int rows; 
     int cols; 

     public Matrix(int rows, int cols) { 
      this.rows = rows; 
      this.cols = cols; 
      container = new int[rows][cols]; 
     } 
    } 

    public static class MultiplyThread implements Runnable { 
     private int row; 
     private int col; 
     Matrix a; 
     Matrix b; 
     Matrix c; 

     public MultiplyThread(Matrix a, Matrix b, Matrix c, int row, int col) { 
      this.row = row; 
      this.col = col; 
      this.a = a; 
      this.b = b; 
      this.c = c; 
     } 

     @Override 
     public void run() { 
      int sum = 0; 
      for (int i = 0; i < a.cols; i++) // or b.rows 
       sum += a.container[row][i] * b.container[i][col]; 
      c.container[row][col] = sum; 
     } 
    } 

    public static Matrix multiplyMatrices(Matrix a, Matrix b, int threads) throws InterruptedException { 
     if (a.cols != b.rows) 
      return null; 
     int rows = a.rows; 
     int cols = b.cols; 
     Matrix ret = new Matrix(rows, cols); 
     ExecutorService executor = Executors.newFixedThreadPool(threads); 
     int i = 0, j = 0; 
     for (i = 0; i < rows; i++) 
      for (j = 0; j < cols; j++) 
       executor.submit(new MultiplyThread(a, b, ret, i, j)); 
     executor.shutdown(); 
     executor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); 
     return ret; 
    } 

    public static void main(String[] args) throws InterruptedException { 
     int[][] mat = new int[][] { { 1, 1, 1, 1 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 } }; 
     Matrix a = new Matrix(4, 4); 
     a.container = mat; 
     Matrix b = new Matrix(4, 4); 
     b.container = mat; 

     int numProcessors = Runtime.getRuntime().availableProcessors(); 
     System.out.println(numProcessors); 
     for (int i = 10; i >= 1; i--) { 
      long time = System.currentTimeMillis(); 
      for (int j = 0; j < 10000; j++) 
       multiplyMatrices(a, b, i); 
      time = System.currentTimeMillis() - time; 
      System.out.println("number of threads: " + i + ", and the time: " + time); 
     } 
    } 

} 

作为代码所示,次数增加通过螺纹的增加,减少和由线程数减少,最佳值为1.

+4

a)http://stackoverflow.com/questions/504103/how-do-i-write-a-correct-micro-benchmark-in-java; b)这些是小矩阵 - 启动额外线程的开销将大大超过任何潜在的加速。 –

+0

查看我使用Akka进行的大型矩阵并行化的示例:https://github.com/GreaterMKEMeetup/akka-examples我使用BigInteger可以将大型矩阵乘以大数。使用线程池的速度非常快。 – aglassman

回答

1

创建线程在注释中指出的开销有一个问题。

此外,当多个CPU尝试修改每个CPU的高速缓存行上表示的内存时,该高速缓存行的第一个修改会导致其他CPU中的高速缓存行被标记为无效。这在SMP系统中造成了真正的性能问题。这个问题被称为虚假分享。

当您修改多个CPU的地址空间中靠近的内存区域时,您很可能会发生虚假共享。对于出色的写作情况,see Dr. Dobbs

您的代码似乎可能会遭受虚假分享,因为您符合一般标准。您正在修改数据,数据在内存中紧密相连,并且您正在多线程上执行此操作。如果你也有多个CPU核心,那么几乎可以肯定至少有一定程度的错误分享。