// A small __shfl_*_sync example, tested on a V100.
#include <cuda_runtime.h>
//#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#define warpSize 32
/*
__global__ void scan4(float* a, float* b) {
int laneId = threadIdx.x & 0x1f;
float value;
value = a[laneId];
value = __shfl_up_sync(0xffffffff, value, 4);
b[laneId] = value;
}
*/
/**
 * Inclusive prefix sum computed independently inside each 8-lane segment of
 * a warp, using __shfl_up_sync with a shuffle width of 8.
 *
 * Every lane starts from the value 1.0f, so after the scan the lane at
 * position k (0..7) of its segment holds k + 1.
 *
 * Launch requirements: the shuffle uses the full mask 0xffffffff, so every
 * warp of the launch must have all 32 lanes present (blockDim.x a multiple
 * of 32). Requires the *_sync warp intrinsics (CUDA 9+).
 *
 * @param a  Unused by this kernel; kept so the launch signature is unchanged.
 * @param b  Output buffer, indexed by lane ID (threadIdx.x & 0x1f), so it
 *           must hold at least 32 floats.
 */
__global__ void scan4(float* a, float* b) {
    const int laneId = threadIdx.x & 0x1f;
    // Position of this lane inside its 8-lane segment (loop invariant).
    const int segmentLane = laneId & 7;

    // Seed value for every lane; float literal avoids a double constant.
    float value = 1.0f;

    // log2(8) == 3 steps: accumulate from the lane 1, 2, then 4 positions
    // below within the segment.
    for (int offset = 1; offset <= 4; offset *= 2) {
        // The shuffle is executed unconditionally by all lanes named in the
        // mask; only the use of its result is conditional. Lanes whose
        // source would fall below the segment start skip the addition.
        const float fromBelow = __shfl_up_sync(0xffffffff, value, offset, 8);
        if (segmentLane >= offset) {
            value += fromBelow;
        }
    }

    b[laneId] = value;
}
/**
 * Print a labelled vector of floats to stdout.
 *
 * Output format: "<desc> =" on its own line, followed by every element
 * formatted with " %5.2f " on a single line, then a newline.
 *
 * @param desc     Label printed before the values (string literals are
 *                 accepted; the text is never modified).
 * @param ptr_vec  Host-readable array of at least n floats; only read.
 * @param n        Number of elements to print; 0 prints just the label
 *                 and an empty line.
 */
void printVector(const char* desc, const float* ptr_vec, unsigned int n) {
    printf("%s =\n", desc);
    // Unsigned index matches the type of n (no signed/unsigned comparison).
    for (unsigned int i = 0; i < n; ++i) {
        printf(" %5.2f ", ptr_vec[i]);
    }
    printf("\n");
}
/* Report a failed CUDA runtime call on stderr; returns true on success. */
static bool checkCuda(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Host driver: fills two warp-sized host vectors, copies them to the device,
 * runs scan4 on a single warp, and prints the output vector copied back.
 *
 * @return 0 when every allocation, copy and the kernel succeeded; 1 otherwise.
 */
int main() {
    const size_t bytes = warpSize * sizeof(float);
    float* a_h = (float*)malloc(bytes);
    float* b_h = (float*)malloc(bytes);
    float* a_d = NULL;
    float* b_d = NULL;
    int exitCode = 1;

    if (a_h == NULL || b_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a_h);  // free(NULL) is a no-op
        free(b_h);
        return exitCode;
    }

    // Both host vectors start as 100, 101, ..., 131.
    for (int i = 0; i < warpSize; i++) {
        a_h[i] = i + 100.0f;
        b_h[i] = i + 100.0f;
    }
    printVector("a_h", a_h, warpSize);
    printVector("b_h", b_h, warpSize);

    // Short-circuit chain: the first failing call is reported and the
    // remaining device work is skipped, falling through to cleanup.
    if (checkCuda(cudaMalloc((void**)&a_d, bytes), "cudaMalloc(a_d)") &&
        checkCuda(cudaMalloc((void**)&b_d, bytes), "cudaMalloc(b_d)") &&
        checkCuda(cudaMemcpy(a_d, a_h, bytes, cudaMemcpyHostToDevice),
                  "cudaMemcpy(a_d <- a_h)") &&
        checkCuda(cudaMemcpy(b_d, b_h, bytes, cudaMemcpyHostToDevice),
                  "cudaMemcpy(b_d <- b_h)")) {
        // One block of exactly one full warp.
        scan4<<< 1, warpSize >>>(a_d, b_d);

        // cudaGetLastError reports launch-configuration errors;
        // cudaDeviceSynchronize reports errors raised while the kernel ran.
        if (checkCuda(cudaGetLastError(), "scan4 launch") &&
            checkCuda(cudaDeviceSynchronize(), "scan4 execution") &&
            checkCuda(cudaMemcpy(b_h, b_d, bytes, cudaMemcpyDeviceToHost),
                      "cudaMemcpy(b_h <- b_d)")) {
            printVector("b_d", b_h, warpSize);
            exitCode = 0;
        }
    }

    // Release everything on every path; cudaFree on a null pointer does nothing.
    cudaFree(a_d);
    cudaFree(b_d);
    free(a_h);
    free(b_h);
    return exitCode;
}