我有一段代码,它根据另一个小数组中的值来更新一个数组。问题是:访问 5 字节的结构体比访问 8 字节的结构体慢很多。
// For every element of `result`, subtract the sum of the two `one` entries
// selected by the matching element of `cards`.
for (var i = 0; i < result.Length; i++)
{
// Copy the struct element out of the array into a local before reading its fields.
var c = cards[i];
// C0 and C1 are used directly as indexes into `one`.
result[i] -= one[c.C0] + one[c.C1];
}
其中 c 是一个结构体,由一对字节组成,每个字节表示一副牌中的一张牌。one 是一个长度为 52 的数组(一副牌的 52 张牌各对应一个条目)。
我写了一个基准测试来分析这段代码:
/// <summary>
/// Runs the update loop over the whole <paramref name="result"/> array
/// <paramref name="testRepetitions"/> times, modifying <paramref name="result"/> in place.
/// </summary>
/// <param name="testRepetitions">Number of full passes over <paramref name="result"/>.</param>
/// <param name="result">Array updated in place; its length drives the inner loop.</param>
/// <param name="one">Lookup table indexed by the card bytes C0 and C1.</param>
/// <param name="cards">Card pairs, read at the same index as <paramref name="result"/>.</param>
private void TestCards2(int testRepetitions, float[] result, float[] one, Cards[] cards)
{
// Outer loop only repeats the work; it has no braces, so its body is the inner for statement.
for (var r = 0; r < testRepetitions; r++)
for (var i = 0; i < result.Length; i++)
{
// Copy the struct element into a local, then use both of its bytes as indexes into `one`.
var c = cards[i];
result[i] -= one[c.C0] + one[c.C1];
}
}
将 testRepetitions 设为 2500 万(25*1000*1000),并使用含 256 个元素的数组(result.Length = 256)时,它在我的机器上运行约 8.5 秒。
下面是 Cards 结构体:
/// <summary>
/// A pair of cards, each stored as a single byte (two bytes of field data in total).
/// </summary>
struct Cards
{
    /// <summary>Byte identifying the first card.</summary>
    public byte C0;

    /// <summary>Byte identifying the second card.</summary>
    public byte C1;

    /// <summary>Creates a pair holding the two given card bytes.</summary>
    /// <param name="c0">Value stored in <see cref="C0"/>.</param>
    /// <param name="c1">Value stored in <see cref="C1"/>.</param>
    public Cards(byte c0, byte c1)
    {
        this.C0 = c0;
        this.C1 = c1;
    }
}
当我把结构体改为容纳 5 张牌(5 个字节)时,同样的基准测试现在需要约 13 秒。为什么会这样?计算结果相同,其余 3 张牌并未使用,而且所有数组都足够小,可以放进 L1 缓存。
更奇怪的是,如果我进一步把 Cards 结构体改为容纳 8 个字节,基准测试反而变快了,只需约 10 秒。
我的设置:
VS 2015 Update 3.
.NET 4.6.2
Release Build x64
CPU: Haswell i7-5820K CPU @ 3.30GHz
下面是我得到的确切耗时:
Test With 2 Cards. Time = 8582 ms
Test With 5 Cards. Time = 12910 ms
Test With 8 Cards. Time = 10180 ms
这到底是怎么回事?
基准代码:
/// <summary>
/// Micro-benchmark that times the same array-update loop over three element types
/// (Cards, Cards5 and Cards8) that differ only in how many byte fields they declare.
/// </summary>
class TestAdjustment
{
/// <summary>
/// Builds the input arrays, runs the three timed loops one after another and
/// writes each elapsed time in milliseconds.
/// </summary>
public void Test()
{
// Raise the priority of the current process; the Process object is disposed right after.
using (Process p = Process.GetCurrentProcess())
p.PriorityClass = ProcessPriorityClass.High;
// Number of elements in result and in every cards array.
var size = 256;
// Lookup table with 52 entries; card values below are generated in 0..51 so they index it safely.
float[] one = ArrayUtils.CreateRandomFloatArray(size:52);
int[] card0 = ArrayUtils.RandomIntArray(size, minValue:0, maxValueInclusive:51);
int[] card1 = ArrayUtils.RandomIntArray(size, minValue: 0, maxValueInclusive: 51);
// All three arrays are built from the same card0/card1 values, so C0 and C1 match across them.
Cards[] cards = CreateCardsArray(card0, card1);
Cards5[] cards5 = CreateCards5Array(card0, card1);
Cards8[] cards8 = CreateCards8Array(card0, card1);
float[] result = ArrayUtils.CreateRandomFloatArray(size);
// Snapshot of the starting values, used to reset result before the later runs.
float[] resultClone = result.ToArray();
// 25 million passes over the 256-element array per test.
var testRepetitions = 25*1000*1000;
// NOTE(review): no warm-up call precedes any timed call, so each measurement
// also covers whatever one-time cost the first invocation of that method carries.
var sw = Stopwatch.StartNew();
TestCards2(testRepetitions, result, one, cards);
WriteLine($"Test With 2 Cards. Time = {sw.ElapsedMilliseconds} ms");
result = resultClone.ToArray(); //restore original array from the clone, so that next method works on the same data
sw.Restart();
TestCards5(testRepetitions, result, one, cards5);
WriteLine($"Test With 5 Cards. Time = {sw.ElapsedMilliseconds} ms");
// Reset result again so the third run starts from the same data.
result = resultClone.ToArray();
sw.Restart();
TestCards8(testRepetitions, result, one, cards8);
WriteLine($"Test With 8 Cards. Time = {sw.ElapsedMilliseconds} ms");
}
/// <summary>
/// Repeats the update loop <paramref name="testRepetitions"/> times over Cards elements,
/// modifying <paramref name="result"/> in place.
/// </summary>
private void TestCards2(int testRepetitions, float[] result, float[] one, Cards[] cards)
{
// The outer loop has no braces; its body is the inner for statement.
for (var r = 0; r < testRepetitions; r++)
for (var i = 0; i < result.Length; i++)
{
// Copy the struct element into a local, then use C0 and C1 as indexes into `one`.
var c = cards[i];
result[i] -= one[c.C0] + one[c.C1];
}
}
/// <summary>
/// Same loop as TestCards2, but the element type is Cards5; only C0 and C1 are read.
/// </summary>
private void TestCards5(int testRepetitions, float[] result, float[] one, Cards5[] cards)
{
for (var r = 0; r < testRepetitions; r++)
for (var i = 0; i < result.Length; i++)
{
var c = cards[i];
result[i] -= one[c.C0] + one[c.C1];
}
}
/// <summary>
/// Same loop as TestCards2, but the element type is Cards8; only C0 and C1 are read.
/// </summary>
private void TestCards8(int testRepetitions, float[] result, float[] one, Cards8[] cards)
{
for (var r = 0; r < testRepetitions; r++)
for (var i = 0; i < result.Length; i++)
{
var c = cards[i];
result[i] -= one[c.C0] + one[c.C1];
}
}
/// <summary>
/// Builds a Cards array with one element per entry of <paramref name="c0"/>.
/// <paramref name="c1"/> is read at the same indexes as <paramref name="c0"/>.
/// </summary>
private Cards[] CreateCardsArray(int[] c0, int[] c1)
{
var result = new Cards[c0.Length];
for (var i = 0; i < result.Length; i++)
// Each int is narrowed to byte with an explicit cast.
result[i] = new Cards((byte)c0[i], (byte)c1[i]);
return result;
}
/// <summary>
/// Builds a Cards5 array with one element per entry of <paramref name="c0"/>;
/// only the first two bytes of each element come from the inputs.
/// </summary>
private Cards5[] CreateCards5Array(int[] c0, int[] c1)
{
var result = new Cards5[c0.Length];
for (var i = 0; i < result.Length; i++)
result[i] = new Cards5((byte)c0[i], (byte)c1[i]);
return result;
}
/// <summary>
/// Builds a Cards8 array with one element per entry of <paramref name="c0"/>;
/// only the first two bytes of each element come from the inputs.
/// </summary>
private Cards8[] CreateCards8Array(int[] c0, int[] c1)
{
var result = new Cards8[c0.Length];
for (var i = 0; i < result.Length; i++)
result[i] = new Cards8((byte)c0[i], (byte)c1[i]);
return result;
}
}
/// <summary>
/// A pair of cards, each stored as a single byte (two bytes of field data in total).
/// </summary>
struct Cards
{
    /// <summary>Byte identifying the first card.</summary>
    public byte C0;

    /// <summary>Byte identifying the second card.</summary>
    public byte C1;

    /// <summary>Creates a pair holding the two given card bytes.</summary>
    /// <param name="c0">Value stored in <see cref="C0"/>.</param>
    /// <param name="c1">Value stored in <see cref="C1"/>.</param>
    public Cards(byte c0, byte c1)
    {
        this.C0 = c0;
        this.C1 = c1;
    }
}
/// <summary>
/// Five card bytes declared in order C0..C4. The constructor fills only the
/// first two from its arguments and zeroes the remaining three.
/// </summary>
struct Cards5
{
    // Fields are declared one per line, in the same order as before.
    public byte C0;
    public byte C1;
    public byte C2;
    public byte C3;
    public byte C4;

    /// <summary>Creates a value with the two given card bytes and zeroes in C2..C4.</summary>
    /// <param name="c0">Value stored in <see cref="C0"/>.</param>
    /// <param name="c1">Value stored in <see cref="C1"/>.</param>
    public Cards5(byte c0, byte c1)
    {
        this.C0 = c0;
        this.C1 = c1;
        this.C2 = 0;
        this.C3 = 0;
        this.C4 = 0;
    }
}
/// <summary>
/// Eight card bytes declared in order C0..C7. The constructor fills only the
/// first two from its arguments and zeroes the remaining six.
/// </summary>
struct Cards8
{
    // Fields are declared one per line, in the same order as before.
    public byte C0;
    public byte C1;
    public byte C2;
    public byte C3;
    public byte C4;
    public byte C5;
    public byte C6;
    public byte C7;

    /// <summary>Creates a value with the two given card bytes and zeroes in C2..C7.</summary>
    /// <param name="c0">Value stored in <see cref="C0"/>.</param>
    /// <param name="c1">Value stored in <see cref="C1"/>.</param>
    public Cards8(byte c0, byte c1)
    {
        this.C0 = c0;
        this.C1 = c1;
        this.C2 = 0;
        this.C3 = 0;
        this.C4 = 0;
        this.C5 = 0;
        this.C6 = 0;
        this.C7 = 0;
    }
}
编辑:我用 1 亿次迭代重新运行了基准测试。结果如下:
Test With 5 Cards. Time = 52245 ms
Test With 8 Cards. Time = 40531 ms
以相反的顺序运行:
Test With 8 Cards. Time = 41041 ms
Test With 5 Cards. Time = 52034 ms
在 Surface Pro 4 上运行(Skylake 微架构,i7-6650U,睿频至约 3.4 GHz):
Test With 8 Cards. Time = 47913 ms
Test With 5 Cards. Time = 55182 ms
所以差异依然存在,并且与运行顺序无关。
我还使用英特尔 VTune 进行了性能分析,结果显示“5 张牌”版本的 CPI 为 0.3,“8 张牌”版本的 CPI 为 0.27。
编辑 2:补充了用于创建初始随机数组的 ArrayUtils 类。
/// <summary>
/// Helpers that build arrays of pseudo-random values from one shared, fixed-seed generator.
/// </summary>
public static class ArrayUtils
{
    // Fixed seed: the sequence of generated values is the same on every run.
    private static readonly Random rand = new Random(137);

    /// <summary>
    /// Creates an array of <paramref name="size"/> floats, each produced by narrowing
    /// a <see cref="Random.NextDouble"/> sample to <see cref="float"/>.
    /// </summary>
    /// <param name="size">Number of elements to create.</param>
    /// <returns>A new array of length <paramref name="size"/>.</returns>
    public static float[] CreateRandomFloatArray(int size)
    {
        var result = new float[size];
        for (int i = 0; i < result.Length; i++)
        {
            result[i] = (float)rand.NextDouble();
        }
        return result;
    }

    /// <summary>
    /// Creates an array of <paramref name="size"/> integers drawn from the closed range
    /// [<paramref name="minValue"/>, <paramref name="maxValueInclusive"/>].
    /// </summary>
    /// <param name="size">Number of elements to create.</param>
    /// <param name="minValue">Smallest value that may be produced.</param>
    /// <param name="maxValueInclusive">Largest value that may be produced.</param>
    /// <returns>A new array of length <paramref name="size"/>.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="minValue"/> is greater than <paramref name="maxValueInclusive"/>.
    /// </exception>
    public static int[] RandomIntArray(int size, int minValue, int maxValueInclusive)
    {
        // Without this check, minValue == maxValueInclusive + 1 would slip through
        // Random.Next and yield values above the requested maximum.
        if (minValue > maxValueInclusive)
        {
            throw new ArgumentOutOfRangeException(
                nameof(minValue),
                "minValue must not be greater than maxValueInclusive.");
        }

        var result = new int[size];

        if (maxValueInclusive < int.MaxValue)
        {
            // Usual case: the exclusive upper bound fits in an int, so the
            // generator is consumed exactly as before.
            int exclusiveMax = maxValueInclusive + 1;
            for (int i = 0; i < result.Length; i++)
            {
                result[i] = rand.Next(minValue, exclusiveMax);
            }
            return result;
        }

        // maxValueInclusive == int.MaxValue: adding 1 would wrap to int.MinValue and
        // make Random.Next throw, so scale a [0, 1) sample over the range in 64-bit math.
        long span = (long)maxValueInclusive - minValue + 1;
        for (int i = 0; i < result.Length; i++)
        {
            long offset = (long)(rand.NextDouble() * span);
            result[i] = (int)(minValue + offset);
        }
        return result;
    }
}
我无法重现此问题。在我这里,2 张牌的测试耗时最长,而 8 张牌的测试最快。我甚至不知道该如何解释 :) 你的情况可能与这一行中的浅拷贝有关:`var c = cards[i];`。浅拷贝一个含 8 个字节字段的结构体,比拷贝含 5 个或 2 个字节字段的结构体更耗时。 –
@Yeldar在我的基准测试中,5字节的结构比8字节慢,而2字节是最快的。 – Michal
对这种*非常*快的代码做基准测试太难了。2 张牌和 8 张牌测试之间的差异仅为每次赋值约 0.25 纳秒,还不到几个时钟周期。只要重新排列测试顺序,就可能得到任意不同的结果。你真正*测试*的是你的机器给处理器散热的能力。看起来风扇启动得有点慢,这并不罕见。如果想得到更一致的结果,就不要让机器发热太多;2500 万次迭代并不会让结果更好。打开机箱,把积尘清理干净。 –