2、完成一个尺寸512*512的二维数组的每一行最大值的并行程序实现,数据类型设置为float。需要完成4个版本。
(1) 不使用共享内存,只使用全局内存;采用具有分支发散的并行归约;
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define index 512
# define TILE_WIDTH 2
// Row-wise max of a Width x Width matrix (version 1: global memory only,
// reduction with intentional branch divergence).
// Launch contract: one block per row, blockDim.x == row length (a power of
// two). After the kernel, the maximum of row r sits in AA[r * blockDim.x].
// NOTE(review): Width is unused here; the row length is taken from blockDim.x.
__global__ void calcSum(float* AA, int Width)
{
    unsigned int tid = threadIdx.x;
    int Row = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int step = 1;
    while (step < blockDim.x)
    {
        // Make the previous round's global-memory writes visible block-wide.
        __syncthreads();
        // Divergent pattern: the active threads (tid multiple of 2*step) are
        // scattered across every warp, so warps execute with partial masks.
        if (tid % (step << 1) == 0)
        {
            if (AA[Row + step] > AA[Row])
                AA[Row] = AA[Row + step];
        }
        step <<= 1;
    }
}
// Driver for version 1: builds a 512x512 float matrix, reduces each row to
// its maximum on the GPU, then prints the per-row maxima and the matrix.
int main()
{
    cudaError_t cudaStatus = cudaSuccess;
    // Build the host matrix: index x index floats, row-major.
    int Ndim = 0, Pdim = 0, Width = 0;
    Ndim = Pdim = Width = index;
    int szA = Ndim * Pdim;
    float* A, * AA;
    A = (float*)malloc(szA * sizeof(float));
    if (A == NULL) {
        fprintf(stderr, "host malloc failed!");
        return 1;
    }
    int i; // fill with 1..szA (could be replaced with a student-ID pattern)
    for (i = 0; i < szA; i++)
        A[i] = (float)(i + 1);
    cudaStatus = cudaMalloc((void**)&AA, szA * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc1 failed!");
    }
    cudaStatus = cudaMemcpy(AA, A, szA * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy1 failed!");
    }
    // One block per row, one thread per column.
    dim3 dimGrid = index;
    dim3 dimBlock = index;
    calcSum <<<dimGrid, dimBlock>>> (AA, Width);
    // BUG FIX: a kernel launch returns no status directly; the original code
    // re-checked the stale cudaStatus from the last memcpy here. Launch
    // (configuration) errors must be fetched with cudaGetLastError().
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "calcSum failed!");
        return 1;
    }
    // Execution errors inside the kernel surface at this synchronization.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    }
    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(A, AA, szA * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }
    // Each row's maximum now sits in that row's first element.
    for (int i = 0; i < szA; i += Width)
        printf("The RoWmax is :%.1f\n", A[i]);
    printf("\nArray A:\n");
    for (i = 0; i < Ndim; i++) {
        for (int j = 0; j < Pdim; j++)
            printf("%.1f\t", A[i * Pdim + j]);
        printf("\n");
    }
    cudaFree(AA);
    free(A);
    return 0;
}
(2)不使用共享内存,只使用全局内存;采用无分支发散的并行归约;
注:与上题一样,只是核函数改变
// Row-wise max, version 2: global memory only, divergence-free reduction.
// Launch contract: one block per row, blockDim.x == row length (a power of
// two); the row maximum ends up in the row's first element.
// Active threads each round form the contiguous prefix [0, half), so whole
// warps retire together instead of individual lanes diverging.
__global__ void calcSum(float* AA, int Width)
{
    unsigned int tid = threadIdx.x;
    int Row = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int half = blockDim.x >> 1;
    while (half > 0)
    {
        // Fence off the previous round's writes before reading partners.
        __syncthreads();
        if (tid < half)
        {
            float rival = AA[Row + half];
            if (rival > AA[Row])
                AA[Row] = rival;
        }
        half >>= 1;
    }
}
(3) 使用共享内存;采用具有分支发散的并行归约;
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define index 512
#define TILE_WIDTH 2

// Row-wise max, version 3: shared-memory staging, reduction WITH branch
// divergence (active threads are scattered by the modulo test).
// Launch contract: one block per row, blockDim.x == index == row length
// (a power of two), so one full row fits in the shared array.
// (Fix: removed the pasted "(N)" line-number artifacts that made this
// block non-compilable.)
__global__ void calcSum(float* AA, int Width)
{
    __shared__ float middleware[index]; // one whole row, staged per block
    unsigned int tid = threadIdx.x;
    int Row = blockIdx.x * blockDim.x + threadIdx.x;
    middleware[tid] = AA[Row];          // each thread loads its own element
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
        // First iteration's barrier also covers the initial shared load.
        __syncthreads();
        if (tid % (2 * stride) == 0 && middleware[tid + stride] > middleware[tid])
            middleware[tid] = middleware[tid + stride];
    }
    if (tid == 0) AA[Row] = middleware[0]; // row max -> first element of the row
}
(24)
// Driver for version 3 (shared-memory kernel); identical flow to version 1.
// (Fix: removed the pasted "(N)" line-number artifacts and a stray
// markdown "**" wrapper that made this block non-compilable.)
int main()
{
    cudaError_t cudaStatus = cudaSuccess;
    // Build the host matrix: index x index floats, row-major.
    int Ndim = 0, Pdim = 0, Width = 0;
    Ndim = Pdim = Width = index;
    int szA = Ndim * Pdim;
    float* A, * AA;
    A = (float*)malloc(szA * sizeof(float));
    if (A == NULL) {
        fprintf(stderr, "host malloc failed!");
        return 1;
    }
    int i; // fill with 1..szA (could be replaced with a student-ID pattern)
    for (i = 0; i < szA; i++)
        A[i] = (float)(i + 1);
    cudaStatus = cudaMalloc((void**)&AA, szA * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc1 failed!");
    }
    cudaStatus = cudaMemcpy(AA, A, szA * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy1 failed!");
    }
    // One block per row, one thread per column.
    dim3 dimGrid = index;
    dim3 dimBlock = index;
    calcSum <<<dimGrid, dimBlock>>> (AA, Width);
    // BUG FIX: launch errors are reported by cudaGetLastError(); the
    // original re-checked the stale cudaStatus from the last memcpy.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "calcSum failed!");
        return 1;
    }
    // Execution errors inside the kernel surface at this synchronization.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    }
    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(A, AA, szA * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }
    // Each row's maximum now sits in that row's first element.
    for (int i = 0; i < szA; i += Width)
        printf("The RoWmax is :%.1f\n", A[i]);
    printf("\nArray A:\n");
    for (i = 0; i < Ndim; i++) {
        for (int j = 0; j < Pdim; j++)
            printf("%.1f\t", A[i * Pdim + j]);
        printf("\n");
    }
    cudaFree(AA);
    free(A);
    return 0;
}
(4)使用共享内存,采用无分支发散的并行归约;
注:核函数改变,截图如上
// Row-wise max, version 4: shared-memory staging, divergence-free reduction
// (active threads form the contiguous prefix [0, stride)).
// Launch contract: one block per row, blockDim.x == index == row length
// (a power of two); the row maximum is written to the row's first element.
__global__ void calcSum(float* AA, int Width)
{
    __shared__ float middleware[index]; // one whole row, staged per block
    unsigned int tid = threadIdx.x;
    int Row = blockIdx.x * blockDim.x + threadIdx.x;
    middleware[tid] = AA[Row];          // each thread loads its own element
    // BUG FIX: stride must start at blockDim.x / 2, not blockDim.x.
    // With stride == blockDim.x the guard tid < stride excludes nothing and
    // every thread reads middleware[tid + blockDim.x], which is past the end
    // of the 'index'-element shared array (out-of-bounds, garbage results).
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        // First iteration's barrier also covers the initial shared load.
        __syncthreads();
        if (tid < stride && middleware[tid + stride] > middleware[tid])
            middleware[tid] = middleware[tid + stride];
    }
    if (tid == 0) AA[Row] = middleware[0]; // row max -> first element of the row
}
测试16*16时结果是否正确: