#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <cutil.h>
// Simple utility function to check for CUDA runtime errors.
// NOTE: declaration only and left disabled -- no definition of
// checkCUDAError appears in this file.
// void checkCUDAError(const char* msg);

// Number of additions applied to every array element by both the GPU kernel
// and the CPU loops: iteration i adds i, for i = 1 .. LOOP_ADD_TIME.
#define LOOP_ADD_TIME 100
// Part 2 of 2: the kernel.
//
// Each thread works on the single element at its flat global index and adds
// 1, 2, ..., LOOP_ADD_TIME to it, one addition per loop iteration.
// Despite the name, no element is moved or reversed.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// There is no bounds check: every launched thread writes d_a[gid], so
// gridDim.x * blockDim.x must not exceed the number of elements in d_a.
__global__ void reverseArrayBlock(int *d_a)
{
    const int gid = blockDim.x * blockIdx.x + threadIdx.x;

    int step = 1;
    while (step <= LOOP_ADD_TIME)
    {
        d_a[gid] += step;
        ++step;
    }
}
// GPU benchmark.
//
// Fills a host array with 0 .. dimA-1, copies it to the device, runs
// reverseArrayBlock over it and copies the result back. The reported time is
// taken with clock() around the whole round trip, so it covers both copies
// plus the kernel, not the kernel alone. Afterwards every element is compared
// with the value the same additions produce on the host.
//
// Returns 0 when the device result matches the host reference, non-zero when
// the host allocation fails or at least one element differs.
int gpu_test()
{
    // define grid and block size
    const int numThreadsPerBlock = 512;

    // 512 * 21056 = 10,780,672 ints (about 41 MiB)
    const int dimA = 512 * 21056;

    // Part 1 of 2: compute number of blocks needed based on array size and
    // desired block size. The kernel has no bounds check, so the grid has to
    // cover the array exactly; a truncating division would otherwise leave
    // the tail of the array unprocessed without any diagnostic.
    assert(dimA % numThreadsPerBlock == 0);
    const int numBlocks = dimA / numThreadsPerBlock;
    printf("blocks: %d\n", numBlocks);

    // allocate host and device memory; the size is derived from dimA because
    // dimA is what bounds every host-side loop below
    const size_t memSize = (size_t)dimA * sizeof(int);
    int *h_a = (int *)malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "gpu_test: cannot allocate %lu bytes of host memory\n",
                (unsigned long)memSize);
        return 1;
    }

    int *d_a = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_a, memSize));

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }

    const clock_t start = clock();

    // Copy host array to device array
    CUDA_SAFE_CALL(cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice));

    // launch kernel
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock<<<dimGrid, dimBlock>>>(d_a);
    // A kernel launch returns nothing; an invalid launch configuration is
    // only visible through cudaGetLastError().
    CUDA_SAFE_CALL(cudaGetLastError());

    // device to host copy; this blocking copy also waits for the kernel to
    // finish, so the clock() reading below is taken after the work is done
    CUDA_SAFE_CALL(cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost));

    const clock_t finish = clock();
    const double duration = (double)(finish - start) * 1000 / CLOCKS_PER_SEC;
    printf("gpu time is %f ms\n", duration);

    // Host reference: every element j receives the same total increment
    // 1 + 2 + ... + LOOP_ADD_TIME, so compute it once instead of building a
    // second full-size array.
    int increment = 0;
    for (int k = 1; k <= LOOP_ADD_TIME; ++k)
    {
        increment += k;
    }

    int mismatches = 0;
    for (int j = 0; j < dimA; ++j)
    {
        if (h_a[j] != j + increment)
        {
            ++mismatches;
        }
    }
    if (mismatches != 0)
    {
        // one summary line instead of one line per wrong element
        printf("error! %d of %d elements differ from the host reference\n",
               mismatches, dimA);
    }

    // free host memory
    free(h_a);

    // free device memory
    CUDA_SAFE_CALL(cudaFree(d_a));

    return mismatches == 0 ? 0 : 1;
}
// CPU benchmark.
//
// Fills a host array with 0 .. dimA-1, then adds 1, 2, ..., LOOP_ADD_TIME to
// every element with plain nested loops and prints the time clock() reports
// for those loops.
//
// Returns 0 on success, non-zero when the host allocation fails.
int cpu_test()
{
    // 512 * 21056 = 10,780,672 ints (about 41 MiB)
    const int dimA = 512 * 21056;

    // allocate host memory
    const size_t memSize = (size_t)dimA * sizeof(int);
    int *h_a = (int *)malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "cpu_test: cannot allocate %lu bytes of host memory\n",
                (unsigned long)memSize);
        return 1;
    }

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }
    printf("\n");

    // NOTE(review): the array is never read after the timed loops, so an
    // optimizing build might drop the work being measured -- confirm against
    // the compiler flags actually used.
    const clock_t start = clock();
    for (int j = 0; j < dimA; ++j)
    {
        for (int k = 1; k <= LOOP_ADD_TIME; k++)
        {
            h_a[j] += k;
        }
    }
    const clock_t finish = clock();

    const double duration = (double)(finish - start) * 1000 / CLOCKS_PER_SEC;
    printf("cpu time is %f ms\n", duration);

    // free host memory
    free(h_a);

    return 0;
}
// Program main.
//
// Runs the GPU benchmark and then the CPU benchmark, bracketed by the
// CUT_DEVICE_INIT / CUT_EXIT macros from cutil.h (their definitions are not
// visible in this file; presumably device setup and program shutdown --
// TODO confirm). The benchmarks' return codes are not inspected.
int main(int argc, char **argv)
{
    CUT_DEVICE_INIT(argc, argv);

    (void)gpu_test();
    (void)cpu_test();

    CUT_EXIT(argc, argv);
}