使用 AVX 内在函数代替 SSE 并不能提高速度 - 为什么?

2024-05-07

我已经使用 Intel 的 SSE 内在函数相当长一段时间了,并取得了良好的性能提升。因此,我希望 AVX 内在函数能够进一步加速我的程序。不幸的是,直到现在情况并非如此。可能我犯了一个愚蠢的错误,所以如果有人能帮助我,我将非常感激。

我使用 Ubuntu 11.10 和 g++ 4.6.1。我编译了我的程序(见下文)

g++ simpleExample.cpp -O3 -march=native -o simpleExample

测试系统具有Intel i7-2600 CPU。

这是说明我的问题的代码。在我的系统上,我得到输出

98.715 ms, b[42] = 0.900038 // Naive
24.457 ms, b[42] = 0.900038 // SSE
24.646 ms, b[42] = 0.900038 // AVX

请注意,选择计算 sqrt(sqrt(sqrt(x))) 只是为了确保内存带宽不会限制执行速度;这只是一个例子。

简单的示例.c++:

#include <immintrin.h>
#include <iostream>
#include <math.h> 
#include <sys/time.h>

using namespace std;

// -----------------------------------------------------------------------------
// This function returns the current time, expressed as seconds since the Epoch
// -----------------------------------------------------------------------------
double getCurrentTime(){
  struct timeval curr;
  struct timezone tz;
  gettimeofday(&curr, &tz);
  double tmp = static_cast<double>(curr.tv_sec) * static_cast<double>(1000000)
             + static_cast<double>(curr.tv_usec);
  return tmp*1e-6;
}

// -----------------------------------------------------------------------------
// Main routine
// -----------------------------------------------------------------------------
int main() {

  srand48(0);            // seed PRNG
  double e,s;            // timestamp variables
  float *a, *b;          // data pointers
  float *pA,*pB;         // work pointer
  __m128 rA,rB;          // variables for SSE
  __m256 rA_AVX, rB_AVX; // variables for AVX

  // define vector size 
  const int vector_size = 10000000;

  // allocate memory 
  a = (float*) _mm_malloc (vector_size*sizeof(float),32);
  b = (float*) _mm_malloc (vector_size*sizeof(float),32);

  // initialize vectors //
  for(int i=0;i<vector_size;i++) {
    a[i]=fabs(drand48());
    b[i]=0.0f;
  }

// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// Naive implementation
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  s = getCurrentTime();
  for (int i=0; i<vector_size; i++){
    b[i] = sqrtf(sqrtf(sqrtf(a[i])));
  }
  e = getCurrentTime();
  cout << (e-s)*1000 << " ms" << ", b[42] = " << b[42] << endl;

// -----------------------------------------------------------------------------
  for(int i=0;i<vector_size;i++) {
    b[i]=0.0f;
  }
// -----------------------------------------------------------------------------

// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// SSE2 implementation
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  pA = a; pB = b;

  s = getCurrentTime();
  for (int i=0; i<vector_size; i+=4){
    rA   = _mm_load_ps(pA);
    rB   = _mm_sqrt_ps(_mm_sqrt_ps(_mm_sqrt_ps(rA)));
    _mm_store_ps(pB,rB);
    pA += 4;
    pB += 4;
  }
  e = getCurrentTime();
  cout << (e-s)*1000 << " ms" << ", b[42] = " << b[42] << endl;

// -----------------------------------------------------------------------------
  for(int i=0;i<vector_size;i++) {
    b[i]=0.0f;
  }
// -----------------------------------------------------------------------------

// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// AVX implementation
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  pA = a; pB = b;

  s = getCurrentTime();
  for (int i=0; i<vector_size; i+=8){
    rA_AVX   = _mm256_load_ps(pA);
    rB_AVX   = _mm256_sqrt_ps(_mm256_sqrt_ps(_mm256_sqrt_ps(rA_AVX)));
    _mm256_store_ps(pB,rB_AVX);
    pA += 8;
    pB += 8;
  }
  e = getCurrentTime();
  cout << (e-s)*1000 << " ms" << ", b[42] = " << b[42] << endl;

  _mm_free(a);
  _mm_free(b);

  return 0;
}

任何帮助表示赞赏!


这是因为VSQRTPS(AVX 指令)所需的周期恰好是SQRTPS(SSE 指令)在 Sandy Bridge 处理器上。请参阅 Agner Fog 的优化指南:指令表 http://www.agner.org/optimize/instruction_tables.pdf,第 88 页。

平方根和除法等指令无法从 AVX 中受益。另一方面,加法、乘法等也是如此。

本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)

使用 AVX 内在函数代替 SSE 并不能提高速度 - 为什么? 的相关文章

随机推荐