我会这样做。未经测试。
// Load 16 bytes starting at `pointer` and clear every byte located at or after
// the first '\0', so the result holds the string prefix followed by zero padding.
// If none of the 16 bytes is '\0', the loaded bytes are returned unchanged.
//
// The load is unaligned and always reads the full 16 bytes, even when the string
// ends earlier: the caller must guarantee that pointer[0..15] is readable.
//
// Declared `static inline` rather than plain `inline`: in C99/C11 a plain `inline`
// definition does not emit an external definition, so any call the compiler
// decides not to inline (e.g. at -O0) fails to link.
static inline __m128i loadNullTerminated( const char* pointer )
{
    // Unaligned 16-byte load
    const __m128i chars = _mm_loadu_si128( ( const __m128i* )pointer );
    const __m128i zero = _mm_setzero_si128();

    // 0xFF in every byte that was '\0', 0x00 everywhere else
    __m128i zeroBytes = _mm_cmpeq_epi8( chars, zero );

    // Optional early exit for long strings where most chunks contain no '\0'
    // (needs SSE4.1 for _mm_testz_si128; the caller could also be handed a flag
    // telling it when to stop):
    //     if( _mm_testz_si128( zeroBytes, zeroBytes ) ) return chars;

    // Smear the first 0xFF byte towards the end of the register.
    // _mm_slli_si128 shifts the whole register by bytes towards higher indices,
    // so OR-ing shifts of 1, 2, 4 and 8 bytes marks every byte at or after the first '\0'.
    // The original author expected these 8 single-cycle instructions to beat a
    // _mm_movemask_epi8 / bit-scan / mask-table-load sequence; this was not measured.
    zeroBytes = _mm_or_si128( zeroBytes, _mm_slli_si128( zeroBytes, 1 ) );
    zeroBytes = _mm_or_si128( zeroBytes, _mm_slli_si128( zeroBytes, 2 ) );
    zeroBytes = _mm_or_si128( zeroBytes, _mm_slli_si128( zeroBytes, 4 ) );
    zeroBytes = _mm_or_si128( zeroBytes, _mm_slli_si128( zeroBytes, 8 ) );

    // Keep only the bytes that are not marked: ~zeroBytes & chars
    return _mm_andnot_si128( zeroBytes, chars );
}
更新:这是另一个版本,采用了 Noah 提出的对 int64 执行减 1(-1)运算的思路。
可能会稍微快一些。反汇编结果见:https://godbolt.org/z/Pojs8d
__m128i loadNullTerminated_v2( const char* pointer )
{
// Load 16 bytes
const __m128i chars = _mm_loadu_si128( ( const __m128i* )pointer );
const __m128i zero = _mm_setzero_si128();
// 0xFF for bytes that were '\0', 0 otherwise
const __m128i zeroBytes = _mm_cmpeq_epi8( chars, zero );
// If you have long strings and expect most calls to not have any zeros, uncomment the line below.
// You can return a flag to the caller, to know when to stop.
// if( _mm_testz_si128( eq_zero, eq_zero ) ) return chars;
// Using the fact that v-1 == v+(-1), and -1 has all bits set
const __m128i ones = _mm_cmpeq_epi8( zero, zero );
__m128i mask = _mm_add_epi64( zeroBytes, ones );
// This instruction makes a mask filled with lowest valid bytes in each 64-bit lane
mask = _mm_andnot_si128( zeroBytes, mask );
// Now need to propagate across 64-bit lanes
// ULLONG_MAX if there were no zeros in the corresponding 8-byte long pieces of the string
__m128i crossLaneMask = _mm_cmpeq_epi64( zeroBytes, zero );
// Move the lower 64-bit lanes of noZeroes64 into higher position
crossLaneMask = _mm_unpacklo_epi64( mask, crossLaneMask );
// Update the mask.
// Lower 8 bytes will not change because _mm_unpacklo_epi64 copied that part from the mask.
// However, upper lane may become zeroed out.
// Happens when _mm_cmpeq_epi64 detected at least 1 '\0' in any of the first 8 characters.
mask = _mm_and_si128( mask, crossLaneMask );
// Apply that mask
return _mm_and_si128( mask, chars );
}