我需要使用 AVX 优化 8x4 和 4x8 浮点矩阵的转置。我使用的是 Agner Fog 的向量类库(Vector Class Library):http://www.agner.org/optimize/#vectorclass
实际任务是构建 BVH 并对最小值/最大值求和。每个循环的最后阶段都会用到转置(这些循环也已通过多线程进行了优化,但任务数量可能非常多)。
代码现在看起来像:
void transpose(register Vec4f (&fin)[8], register Vec8f (&mat)[4]) {
for (int i = 0;i < 8;i++) {
fin[i] = lookup<28>(Vec4i(0, 8, 16, 24) + i, (float *)mat);
}
}
我需要一个优化后的版本。如何针对 SIMD 优化此函数?
我最近用向量类编写了自己的转置变体(4x8 和 8x4)。 1.0 版。
void transpose(register Vec4f(&fin)[8], register Vec8f(&mat)[4]) {
register Vec8f a00 = blend8f<0, 8, 1, 9, 2, 10, 3, 11>(mat[0], mat[1]);
register Vec8f a10 = blend8f<0, 8, 1, 9, 2, 10, 3, 11>(mat[2], mat[3]);
register Vec8f a01 = blend8f<4, 12, 5, 13, 6, 14, 7, 15>(mat[0], mat[1]);
register Vec8f a11 = blend8f<4, 12, 5, 13, 6, 14, 7, 15>(mat[2], mat[3]);
register Vec8f v0_1 = blend8f<0, 1, 8, 9, 2, 3, 10, 11>(a00, a10);
register Vec8f v2_3 = blend8f<4, 5, 12, 13, 6, 7, 14, 15>(a00, a10);
register Vec8f v4_5 = blend8f<0, 1, 8, 9, 2, 3, 10, 11>(a01, a11);
register Vec8f v6_7 = blend8f<4, 5, 12, 13, 6, 7, 14, 15>(a01, a11);
fin[0] = v0_1.get_low();
fin[1] = v0_1.get_high();
fin[2] = v2_3.get_low();
fin[3] = v2_3.get_high();
fin[4] = v4_5.get_low();
fin[5] = v4_5.get_high();
fin[6] = v6_7.get_low();
fin[7] = v6_7.get_high();
}
void transpose(register Vec8f(&fin)[4], register Vec4f(&mat)[8]) {
register Vec8f a0_1 = Vec8f(mat[0], mat[1]);
register Vec8f a2_3 = Vec8f(mat[2], mat[3]);
register Vec8f a4_5 = Vec8f(mat[4], mat[5]);
register Vec8f a6_7 = Vec8f(mat[6], mat[7]);
register Vec8f a00 = blend8f<0, 4, 8 , 12, 1, 5, 9 , 13>(a0_1, a2_3);
register Vec8f a10 = blend8f<0, 4, 8 , 12, 1, 5, 9 , 13>(a4_5, a6_7);
register Vec8f a01 = blend8f<2, 6, 10, 14, 3, 7, 11, 15>(a0_1, a2_3);
register Vec8f a11 = blend8f<2, 6, 10, 14, 3, 7, 11, 15>(a4_5, a6_7);
fin[0] = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(a00, a10);
fin[1] = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(a00, a10);
fin[2] = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(a01, a11);
fin[3] = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(a01, a11);
}
需要2.0版本。