这是为了回答您的问题“我该怎么做才能让它运行得更快?”正如我在评论中提到的,计时方法(可能)存在问题,我对速度改进的主要建议是使用“经典并行缩减”算法。以下代码实现了更好的(在我看来)计时测量,并且还将您的内核转换为缩减风格的内核:
#include <stdio.h>
#include <assert.h>
#include <cstdlib>
#define N (1<<24)
#define nTPB 512
#define NBLOCKS 32
__global__ void incrementArrayOnDevice(int *a, int n, int *count)
{
__shared__ int lcnt[nTPB];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int lcount = 0;
while (id < n) {
if (a[id] == 3) lcount++;
id += gridDim.x * blockDim.x;
}
lcnt[threadIdx.x] = lcount;
__syncthreads();
int stride = blockDim.x;
while(stride > 1) {
// assume blockDim.x is a power of 2
stride >>= 1;
if (threadIdx.x < stride) lcnt[threadIdx.x] += lcnt[threadIdx.x + stride];
__syncthreads();
}
if (threadIdx.x == 0) atomicAdd(count, lcnt[0]);
}
int main(void)
{
int *a_h; // host memory
int *a_d; // device memory
cudaEvent_t gstart1,gstart2,gstop1,gstop2,cstart,cstop;
float etg1, etg2, etc;
cudaEventCreate(&gstart1);
cudaEventCreate(&gstart2);
cudaEventCreate(&gstop1);
cudaEventCreate(&gstop2);
cudaEventCreate(&cstart);
cudaEventCreate(&cstop);
// allocate array on host
a_h = (int*)malloc(sizeof(int) * N);
for(int i = 0; i < N; ++i)
a_h[i] = (i % 3 == 0 ? 3 : 1);
// allocate arrays on device
cudaMalloc(&a_d, sizeof(int) * N);
int blockSize = nTPB;
int nBlocks = NBLOCKS;
printf("number of blocks: %d\n", nBlocks);
int count;
int *devCount;
cudaMalloc(&devCount, sizeof(int));
cudaMemset(devCount, 0, sizeof(int));
// copy data from host to device
cudaEventRecord(gstart1);
cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMemset(devCount, 0, sizeof(int));
cudaEventRecord(gstart2);
// do calculation on device
incrementArrayOnDevice<<<nBlocks, blockSize>>> (a_d, N, devCount);
cudaEventRecord(gstop2);
// retrieve result from device
cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(gstop1);
printf("GPU count = %d\n", count);
int hostCount = 0;
cudaEventRecord(cstart);
for (int i=0; i < N; i++)
if (a_h[i] == 3) hostCount++;
cudaEventRecord(cstop);
printf("CPU count = %d\n", hostCount);
cudaEventSynchronize(cstop);
cudaEventElapsedTime(&etg1, gstart1, gstop1);
cudaEventElapsedTime(&etg2, gstart2, gstop2);
cudaEventElapsedTime(&etc, cstart, cstop);
printf("GPU total time = %fs\n", (etg1/(float)1000) );
printf("GPU compute time = %fs\n", (etg2/(float)1000));
printf("CPU time = %fs\n", (etc/(float)1000));
free(a_h);
cudaFree(a_d);
cudaFree(devCount);
}
当我在相当快的 GPU(Quadro 5000,比 Tesla M2050 慢一点)上运行时,我得到以下结果:
number of blocks: 32
GPU count = 5592406
CPU count = 5592406
GPU total time = 0.025714s
GPU compute time = 0.000793s
CPU time = 0.017332s
我们发现 GPU 的计算部分比这种(简单的单线程)CPU 实现要快得多。当我们添加传输数据的成本时,GPU 版本会更慢,但不会慢 30 倍。
通过比较,当我对你的原始算法进行计时时,我得到了这样的数字:
GPU total time = 0.118131s
GPU compute time = 0.093213s
我的系统配置是 Xeon X5560 CPU、RHEL 5.5、CUDA 5.0、Quadro5000 GPU。