流式内在函数（stream intrinsic）降低了性能

我正在尝试使用 _mm_stream_ps 内在函数（intrinsic），但在理解它的性能表现方面遇到了一些困难。

这里是我工作的代码片段... 流版本：

#include <stdio.h> 
#include <stdint.h> 
#include <stdlib.h> 
#include <omp.h> 

#include <immintrin.h> 

#define NUM_ELEMENTS 10000000L 

/*
 * Copy 4 * NUM_ELEMENTS floats from x to y with ordinary (temporal)
 * 128-bit stores.
 *
 * The loop runs NUM_ELEMENTS / 2 times and every pass moves two 4-float
 * vectors, i.e. 8 floats. The aligned load/store intrinsics are used, so
 * both buffers must be 16-byte aligned.
 */
static void copy_temporal(float* restrict x, float* restrict y)
{
    const uint64_t pairs = NUM_ELEMENTS/2;

    for (uint64_t pair = 0; pair < pairs; ++pair) {
        float *src = x + 8 * pair;
        float *dst = y + 8 * pair;

        /* Keep the load/store/load/store order of the two halves. */
        const __m128 first = _mm_load_ps(src);
        _mm_store_ps(dst, first);

        const __m128 second = _mm_load_ps(src + 4);
        _mm_store_ps(dst + 4, second);
    }
}
/*
 * Copy 4 * NUM_ELEMENTS floats from x to y with non-temporal (streaming)
 * 128-bit stores.
 *
 * The loop runs NUM_ELEMENTS / 2 times and every pass moves two 4-float
 * vectors, i.e. 8 floats. _mm_load_ps and _mm_stream_ps both require
 * 16-byte aligned addresses.
 */
static void copy_nontemporal(float* restrict x, float* restrict y)
{
    for(uint64_t i = 0; i < NUM_ELEMENTS/2; ++i){
        _mm_stream_ps(y,_mm_load_ps(x));
        _mm_stream_ps(y+4,_mm_load_ps(x+4));
        x+=8;
        y+=8;
    }

    /*
     * Streaming stores are weakly ordered and may still sit in
     * write-combining buffers when the loop ends. Fence here so the data is
     * globally visible before returning, and so a caller that times this
     * function measures the complete copy.
     */
    _mm_sfence();
}

/*
 * Benchmark driver.
 *
 * Allocates two 32-byte aligned buffers of 4 * NUM_ELEMENTS floats, fills
 * the source with random values and the destination with zeros, then times
 * copy_nontemporal() followed by copy_temporal() and prints a bandwidth
 * figure for each.
 *
 * Both bandwidth figures use a factor of 3 * sizeX (two reads plus one
 * write) so the two numbers stay directly comparable.
 *
 * Returns 0 on success, EXIT_FAILURE if either buffer cannot be allocated.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    uint64_t sizeX = sizeof(float) * 4 * NUM_ELEMENTS;
    float *x = (float*) _mm_malloc(sizeX,32);
    float *y = (float*) _mm_malloc(sizeX,32);

    /* Each buffer is large; fail loudly instead of dereferencing NULL. */
    if (x == NULL || y == NULL) {
        fprintf(stderr, "failed to allocate two buffers of %llu bytes\n",
                (unsigned long long)sizeX);
        if (x != NULL) {
            _mm_free(x);
        }
        if (y != NULL) {
            _mm_free(y);
        }
        return EXIT_FAILURE;
    }

    //initialization
    for(uint64_t i = 0 ; i < 4 * NUM_ELEMENTS; ++i){
        x[i] = (float)rand()/RAND_MAX;
        y[i] = 0;
    }

    printf("%g MB allocated\n",(2 * sizeX)/1024.0/1024.0);

    double start = omp_get_wtime();
    copy_nontemporal(x, y);
    double time = omp_get_wtime() - start;
    printf("Bandwidth (non-temporal): %g GB/s\n",((3 * sizeX)/1024.0/1024.0/1024.0)/time);

    start = omp_get_wtime();
    copy_temporal(x, y);
    time = omp_get_wtime() - start;
    printf("Bandwidth: %g GB/s\n",((3 * sizeX)/1024.0/1024.0/1024.0)/time);

    _mm_free(x);
    _mm_free(y);

    return 0;
}

性能测试结果：

2.3 GHz Core i7 (I7-3615QM) (Laptop): 
    305.176 MB allocated 
    Bandwidth (non-temporal): 24.2242 GB/s 
    Bandwidth: 21.4136 GB/s 

Xeon(R) CPU E5-2650 0 @ 2.00GHz (cluster (exclusive job)): 
    305.176 MB allocated 
    Bandwidth (non-temporal): 8.33133 GB/s 
    Bandwidth: 8.20684 GB/s

真正让我困惑的是：在 Xeon CPU 上（而不是在我的笔记本电脑上），如果我改用非对齐的加载和存储（即 storeu_ps/loadu_ps），反而会得到更好的性能：

305.176 MB allocated 
Bandwidth (non-temporal): 8.30105 GB/s 
Bandwidth: 12.7056 GB/s

由于非流式版本需要对 y 进行冗余的加载（load），我原本期望流式版本比非流式版本更快。但是，测量结果显示，流式版本实际上比非流式版本慢两倍。

你对此有任何解释吗？使用的编译器：Intel 14.0.1；编译器标志：-O3 -restrict -xAVX；使用的CPU：Intel Xeon E5-2650。

谢谢。

来源

2014-01-18 user1829358

没有必要展开循环。循环展开只在存在依赖链时才有用，而这里并没有依赖链，CPU可以替你处理好这个问题。不过我有个问题：在你的带宽计算中，系数3代表什么？ –

两次读取+一次写入。即使非时间（non-temporal）版本只做一次读取，我仍然保留系数3，以便于比较。 – user1829358

正如ScottD指出的那样，问题的答案在于生成的汇编代码。显然，英特尔编译器足够智能，能够检测出访问模式，即使对于时间版本也会自动生成非时间存储指令（即下面汇编中的 vmovntps）。

下面是编译器为时间版本生成的汇编代码：

..___tag_value___Z13copy_temporalPfS_.35:      # 
     xor  edx, edx          #22.4 
     xor  eax, eax          # 
..B2.2:       # Preds ..B2.2 ..B2.1 
     vmovups xmm0, XMMWORD PTR [rax+rdi]     #23.34 
     inc  rdx           #22.4 
     vmovntps XMMWORD PTR [rax+rsi], xmm0     #23.20 
     vmovups xmm1, XMMWORD PTR [16+rax+rdi]    #24.36 
     vmovntps XMMWORD PTR [16+rax+rsi], xmm1    #24.20 
     vmovups xmm2, XMMWORD PTR [32+rax+rdi]    #23.34 
     vmovntps XMMWORD PTR [32+rax+rsi], xmm2    #23.20 
     vmovups xmm3, XMMWORD PTR [48+rax+rdi]    #24.36 
     vmovntps XMMWORD PTR [48+rax+rsi], xmm3    #24.20 
     add  rax, 64          #22.4 
     cmp  rdx, 5000000         #22.4 
     jb  ..B2.2  # Prob 99%      #22.4

这就留下了以下问题：

为什么在 E5-2650 CPU 上，未对齐的时间版本比非时间版本性能更好（请参阅上文）？我已经查看了生成的汇编代码，编译器确实生成了 vmovups 指令（因为不存在对齐保证）。

来源

2014-01-20 23:32:53 user1829358

如果ICC做的事情与你在代码中写的不一样，那就令人失望了。我更希望它按照你的要求原样实现这些内在函数。 –

流式（stream）版本会直接向DRAM发起流水线化的突发写入，其速度应该大致与DRAM的速度相当。标准存储则写入缓存（但是，如果数据尚不在缓存中，则会先将其读入缓存）。如果数据已经在缓存中，标准存储就以缓存写入的速度运行。一般来说，当写入的数据量远大于最后一级缓存的容量时，使用流式方法会快得多；而小规模的写入通常使用标准存储更快。请尝试用几GB大小的缓冲区运行测试，此时流式方法应该更快。

下面是一个用于演示的基准测试（benchmark），结果来自英特尔 Core i7-2600K：

#define __USE_MINGW_ANSI_STDIO 1 
#include <stdlib.h> 
#include <intrin.h> 
#include <windows.h> 
#include <stdio.h> 
#include <stdint.h> 

//-----------------------------------------------------------------------------
//
// queryPerformanceCounter - thin wrapper around the Win32
//       QueryPerformanceCounter call that hands the tick count back as a
//       plain 64-bit integer instead of filling a LARGE_INTEGER.

uint64_t queryPerformanceCounter (void)
    {
    LARGE_INTEGER ticks;

    QueryPerformanceCounter (&ticks);
    return (uint64_t) ticks.QuadPart;
    }

//-----------------------------------------------------------------------------
//
// queryPerformanceFrequency - thin wrapper around the Win32
//       QueryPerformanceFrequency call that returns the counter frequency
//       directly as a plain 64-bit integer.

uint64_t queryPerformanceFrequency (void)
    {
    LARGE_INTEGER ticksPerSecond;

    QueryPerformanceFrequency (&ticksPerSecond);
    return (uint64_t) ticksPerSecond.QuadPart;
    }

//--------------------------------------------------------------------------- 

// testNontemporal - copy numberOfVectors 4-float vectors from x to y using
//       non-temporal (streaming) stores.
//
// x, y            - source and destination; both are accessed with aligned
//                   intrinsics and therefore must be 16-byte aligned.
// numberOfVectors - number of 128-bit vectors to copy (4 floats each).
//
// The main loop moves two vectors per pass. An odd trailing vector is copied
// separately so that no element is silently skipped.

static void testNontemporal (float *x, float *y, uint64_t numberOfVectors)
    {
    uint64_t i;
    for(i = 0; i < numberOfVectors/2; ++i)
     {
     _mm_stream_ps(y,_mm_load_ps(x));
     _mm_stream_ps(y+4,_mm_load_ps(x+4));
     y+=8; x+=8;
     }

    // odd count: one vector is left over after the pairwise loop
    if (numberOfVectors & 1)
     {
     _mm_stream_ps(y,_mm_load_ps(x));
     }

    // Streaming stores are weakly ordered; fence so that the data is
    // globally visible (and write-combining buffers are drained) before the
    // caller stops its timer or reads the destination.
    _mm_sfence();
    }

//--------------------------------------------------------------------------- 

// testTemporal - copy numberOfVectors 4-float vectors from x to y using
//       ordinary (temporal) aligned stores.
//
// x, y            - source and destination; both are accessed with aligned
//                   intrinsics and therefore must be 16-byte aligned.
// numberOfVectors - number of 128-bit vectors to copy (4 floats each).
//
// The main loop moves two vectors per pass. An odd trailing vector is copied
// separately so that no element is silently skipped.

static void testTemporal (float *x, float *y, uint64_t numberOfVectors)
    {
    uint64_t i;
    for(i = 0; i < numberOfVectors/2; ++i)
     {
     _mm_store_ps(y,_mm_load_ps(x));
     _mm_store_ps(y+4,_mm_load_ps(x+4));
     y+=8; x+=8;
     }

    // odd count: one vector is left over after the pairwise loop
    if (numberOfVectors & 1)
     {
     _mm_store_ps(y,_mm_load_ps(x));
     }
    }

//--------------------------------------------------------------------------- 

// runtests - allocate two 400 MiB buffers, fill them, time one copy pass and
//       print the resulting rate in GB/s.
//
// nonTemporal - non-zero selects testNontemporal, zero selects testTemporal.
//
// The printed figure counts numberOfBytes once (the size of one buffer).
// Exits the process with status 1 if either allocation fails.

static void runtests (int nonTemporal)
    {
    const uint64_t numberOfBytes = 400 * 0x100000ull;
    const uint64_t numberOfFloats = numberOfBytes/sizeof (float);
    const uint64_t numberOfVectors = numberOfFloats/4;
    float *x = _mm_malloc (numberOfBytes, 32);
    float *y = _mm_malloc (numberOfBytes, 32);
    uint64_t index, startCount, elapsed;
    double gigabytes, gbPerSecond;

    if (!x || !y)
     exit (1);

    // Write every element of both buffers before timing. The source gets
    // valid floating point data (the original author's note: this avoids a
    // performance penalty); the destination is zeroed.
    for (index = 0; index < numberOfFloats; ++index)
     {
     x [index] = (float) index;
     y [index] = 0;
     }

    // timed section: exactly one copy pass
    startCount = queryPerformanceCounter();
    if (nonTemporal)
     testNontemporal (x, y, numberOfVectors);
    else
     testTemporal (x, y, numberOfVectors);
    elapsed = queryPerformanceCounter() - startCount;

    // bytes -> GiB, then ticks -> seconds via the counter frequency
    gigabytes = (double) numberOfBytes/0x40000000;
    gbPerSecond = gigabytes * queryPerformanceFrequency()/elapsed;
    printf ("%.2f GB/s\n", gbPerSecond);

    _mm_free (x);
    _mm_free (y);
    }

//--------------------------------------------------------------------------- 

// main - run the copy benchmark twice: first with temporal stores, then with
//       non-temporal stores, printing a heading before each result.

int main (void)
    {
    int nonTemporal;

    // raise our priority to increase measurement accuracy
    SetPriorityClass (GetCurrentProcess(), REALTIME_PRIORITY_CLASS);

    // 0 = temporal stores, 1 = non-temporal stores
    for (nonTemporal = 0; nonTemporal <= 1; ++nonTemporal)
     {
     fputs (nonTemporal ? "using non-temporal stores\n"
                        : "using temporal stores\n", stdout);
     runtests (nonTemporal);
     }
    return 0;
    }

//---------------------------------------------------------------------------

输出：

using temporal stores 
5.57 GB/s 
using non-temporal stores 
8.35 GB/s

来源

2014-01-18 17:49:32 ScottD

感谢您的回复。我已经在使用大小为400MB的缓冲区（比我系统中的任何缓存都大得多）。此外，我还对代码进行了插桩（instrument）以读取一些硬件计数器，结果是明确的（即，使用 stream_ps 会导致 L2 写未命中）。但是，我仍然无法解释这两个版本之间的巨大性能差异。 – user1829358

我将添加一个示例基准测试，尝试展示非时间（流式）存储在大缓冲区上的优势。它写得比较粗糙（quick and dirty），但我认为是正确的。其中使用了不可移植的（Windows）计时函数。 – ScottD

我已经更新了原始文章，但是我无法重现您的结果（即使我将您的代码移植到Linux中，结果也保持不变）。你知道为什么会出现这种情况吗？此外，你有解释为什么不对齐版本更快？这实际上可能指向真正的问题，因为流确实需要对齐。 – user1829358

据我所知，非时间存储会把目标地址对应的缓存行从所有缓存中逐出。如果该缓存行在被自然逐出之前又被再次访问，那么你反而降低了性能。

来源

2014-01-18 17:49:41 gsg

但是这不是这个代码片段的情况，是吗？ – user1829358

我本想说这取决于相关缓冲区的大小，但是您给ScottD的评论表明它们非常大。此时我也不确定发生了什么。你可以尝试各种做法，比如注释掉'#pragma'、用'-xAVX'等选项编译，然后观察常规存储与非时间存储之间性能比例的变化。 – gsg

流式内在函数（stream intrinsic）降低了性能

回答

相关问题