我正在尝试编写一个支持 SSE 的 alpha 合成器,这就是我想出的。首先,混合两个 4 像素向量的代码:
// alpha blend two 128-bit (16 byte) SSE vectors containing 4 pre-multiplied ARGB values each
//
__attribute__((always_inline))
static inline __m128i blend4(__m128i under, __m128i over) {
// shuffle masks for alpha and 255 vector for 255-alpha
//
// NOTE: storing static __m128i here with _mm_set_si128 was _very_ slow, compiler doesn't seem
// to know it can store this as a const, so it had guard variables and did real static initialization,
// stick with arrays.
//
static const uint64_t allo[2] __attribute__((aligned(16))) = { 0x03ff03ff03ff03ff, 0x07ff07ff07ff07ff };
static const uint64_t alhi[2] __attribute__((aligned(16))) = { 0x0bff0bff0bff0bff, 0x0fff0fff0fff0fff };
static const uint64_t m255[2] __attribute__((aligned(16))) = { 0xff00ff00ff00ff00, 0xff00ff00ff00ff00 };
// replicate top two pixels from under
__m128i underhi = (__m128i)_mm_movehl_ps(
(__m128)under,
(__m128)under
);
__m128i u16_0 = _mm_cvtepu8_epi16(under); // convert 8-bit fields to 16-bit with zero extension
__m128i u16_1 = _mm_cvtepu8_epi16(underhi);
__m128i al8_0 = _mm_shuffle_epi8 (over, *(__m128i*)&allo); // replicate (alpha << 8) to each field
__m128i al8_1 = _mm_shuffle_epi8 (over, *(__m128i*)&alhi);
__m128i mal_0 = _mm_sub_epi8 (*(__m128i*)&m255, al8_0); // compute 255-alpha
__m128i mal_1 = _mm_sub_epi8 (*(__m128i*)&m255, al8_1);
__m128i mul_0 = _mm_mulhi_epu16 (u16_0, mal_0); // under*(255-over.alpha)
__m128i mul_1 = _mm_mulhi_epu16 (u16_1, mal_1);
__m128i pixel = _mm_packus_epi16 (mul_0, mul_1);
// add to background pixel with saturation
return _mm_adds_epi8(over, pixel);
}
其次,一个包装器展开多个像素操作并聚合加载/存储。每次迭代处理约 32 个像素似乎是最佳平衡点:
// perform N 4-pixel blending operations at once, load/blend/store paradigm. We take a template parameter
// for the size so the compiler is more likely to unroll the loops for us.
//
template <ssize_t N>
__attribute__((always_inline, optimize("unroll-loops")))
static inline void blendN(__m128i *dst, const __m128i *punder, const __m128i *pover, bool single=false) {
    __m128i bg[N];  // background ("under") vectors; blended in place below
    __m128i fg[N];  // foreground ("over") vectors
    // First over vector, reused for every slot when 'single' is requested.
    const __m128i first = _mm_loadu_si128(pover);
    // gather every load up front ...
    for (ssize_t k = 0; k != N; ++k) {
        bg[k] = _mm_loadu_si128(punder + k);
        if (single) {
            fg[k] = first;
        } else {
            fg[k] = _mm_loadu_si128(pover + k);
        }
    }
    // ... blend each pair ...
    for (ssize_t k = 0; k != N; ++k) {
        bg[k] = blend4(bg[k], fg[k]);
    }
    // ... then flush all the stores together.
    for (ssize_t k = 0; k != N; ++k) {
        _mm_storeu_si128(dst + k, bg[k]);
    }
}
调用方式如下:
// blend 32/16/8/4 pixels at a time
//
// ii counts chunks at the current granularity. Each `ii *= 2` rescales the
// chunk index as the chunk size halves (32 -> 16 -> 8 -> 4 pixels), so each
// loop resumes exactly at the byte offset where the previous size class
// stopped. The first `ii *= 2` is a no-op since ii starts at 0.
ssize_t ii=0;
for (ii *= 2; ii < len/32; ii++) { blendN<8>(vdst+8*ii, vunder+8*ii, vover+8*ii); }
for (ii *= 2; ii < len/16; ii++) { blendN<4>(vdst+4*ii, vunder+4*ii, vover+4*ii); }
for (ii *= 2; ii < len/8; ii++) { blendN<2>(vdst+2*ii, vunder+2*ii, vover+2*ii); }
for (ii *= 2; ii < len/4; ii++) { blendN<1>(vdst+1*ii, vunder+1*ii, vover+1*ii); }
// handle remainder
// rescale from 4-pixel-vector index to single-pixel index for the scalar tail
// (presumably blend() is the scalar one-pixel version of blend4 — defined
// elsewhere in the file; confirm it matches the /256 approximation).
ii *= 4;
for (; ii < len; ii++) {
*(pdst+ii) = blend(*(punder+ii), *(pover+ii));
}
使用此代码,我可以在 i7-2600K 上获得大约 2.5 像素/周期的吞吐量。很好奇是否有人能对我的 SIMD 代码提出改进建议。
编辑:这是与 Peter Cordes 交谈后的一些更新代码。
__attribute__((always_inline))
static inline __m128i blend4(__m128i under, __m128i over) {
// shuffle masks for alpha and 255 vector for 255-alpha
//
// NOTE: storing static __m128i is _very_ slow, compiler doesn't seem to know it can store
// this as a const, so it had guard variables and did real static initialization. Stick with
// just const
//
const __m128i allo = (__m128i)_mm_setr_epi32(0x03ff03ff, 0x03ff03ff, 0x07ff07ff, 0x07ff07ff);
const __m128i alhi = (__m128i)_mm_setr_epi32(0x0bff0bff, 0x0bff0bff, 0x0fff0fff, 0x0fff0fff);
const __m128i zero = (__m128i)_mm_setr_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000);
const __m128 m255 = (__m128 )_mm_setr_epi32(0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00);
__m128i u16_0 = _mm_cvtepu8_epi16(under); // convert 8-bit fields to 16-bit with zero extension
__m128i u16_1 = _mm_unpackhi_epi8(under, zero);
__m128i al8_0 = _mm_shuffle_epi8 (over, allo); // replicate (alpha << 8) to each field
__m128i al8_1 = _mm_shuffle_epi8 (over, alhi);
__m128i mal_0 = (__m128i)_mm_xor_ps(m255, (__m128)al8_0); // compute 255-alpha
__m128i mal_1 = (__m128i)_mm_xor_ps(m255, (__m128)al8_1);
__m128i mul_0 = _mm_mulhi_epu16 (u16_0, mal_0); // under*(255-over.alpha)
__m128i mul_1 = _mm_mulhi_epu16 (u16_1, mal_1);
__m128i pixel = _mm_packus_epi16 (mul_0, mul_1);
// add to background pixel with saturation
return _mm_adds_epi8(over, pixel);
}
最大的变化是使用 unpackhi 而不是 cvtepu8 将像素下的前 8 个字节扩展到 16 位。然后使用异或而不是减法来计算 255-alpha。 xor 可以在多个端口上运行,而不是减法仅限于一个端口。在我的 i7-2600K 上,这大约每秒混合 22 亿像素,这似乎足够了。