方法一:自己写
创建.cu文件。
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include "cuda_runtime.h"
template <int BLOCK_SIZE>
__global__ void MatrixMulCUDA(float* C, float* A, float* B, int wA, int wB)
{
int bx = blockIdx.x; // Block index
int by = blockIdx.y; // Block index
int tx = threadIdx.x; // Thread index thIdx.x=threadIdx.x+blockDim.x*blockIdx.x
int ty = threadIdx.y; // Thread index thIdx.y=threadIdx.y+blockDim.y*blockIdx.y
int aBegin = wA * BLOCK_SIZE * by; // 320*32*by A的行划分10份
int aEnd = aBegin + wA - 1; // 320*32*by+319 每份320个值*(32)
int aStep = BLOCK_SIZE; // 32 per block Astep = 32
int bBegin = BLOCK_SIZE * bx; // 32*bx B的列划分20份 每份320*32个值
int bStep = BLOCK_SIZE * wB; // 32*640 Bstep =
float Csub = 0;
// a=wA*32*by:32:wA*32*by+319 b=32*bx:32*wB:end
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; // 声明共享内存 32*32
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; // 声明共享内存 32*32
As[ty][tx] = A[a + wA * ty + tx]; // load matrices from device to shared
Bs[ty][tx] = B[b + wB * ty + tx]; // load matrices from device to shared
__syncthreads(); // make sure the matrices are loaded
#pragma unroll // 用于循环展开
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx]; // C_ty_tx = sum_k(A_ty_k * B_k_ty)
}
__syncthreads(); // make sure computation is done
}
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; // Write to device memory;
C[c + wB * ty + tx] = Csub;
}
void ConstantInit(float* data, int size, float val) {
for (int i = 0; i < size; ++i) {
data[i] = val;
}
}
int MatrixMultiply(int block_size, const dim3& dimsA, const dim3& dimsB)
{
// 1. Allocate host memory
float* h_A, * h_B, * h_C;
unsigned int size_A = dimsA.x * dimsA.y; // 320*320
unsigned int mem_size_A = sizeof(float) * size_A; // memory size
cudaMallocHost(&h_A, mem_size_A); // Allocate host memory
unsigned int size_B = dimsB.x * dimsB.y; // 640*320
unsigned int mem_size_B = sizeof(float) * size_B; // memory size
cudaMallocHost(&h_B, mem_size_B); // Allocate host memory
ConstantInit(h_A, size_A, 1.0f); // initial host memory
ConstantInit(h_B, size_B, 0.1f); // initial host memory
dim3 dimsC(dimsB.x, dimsA.y, 1); // 640*320*1
unsigned int size_C = dimsC.x * dimsC.y; // 640*320
unsigned int mem_size_C = sizeof(float) * size_C; // memory size
cudaMallocHost(&h_C, mem_size_C); // Allocate host memory
if (h_C == NULL) {
fprintf(stderr, "Failed to allocate host matrix C!\n");
exit(EXIT_FAILURE);
}
// 2. Allocate device memory
float* d_A, * d_B, * d_C;
cudaMalloc(reinterpret_cast<void**>(&d_A), mem_size_A);
cudaMalloc(reinterpret_cast<void**>(&d_B), mem_size_B);
cudaMalloc(reinterpret_cast<void**>(&d_C), mem_size_C);
cudaEvent_t start, stop;
cudaEventCreate(&start); // Allocate CUDA events for timing
cudaEventCreate(&stop); // Allocate CUDA events for timing
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); // 异步系统
// 3. Copy host memory to device memory
cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream); // copy
cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream); // copy
dim3 threads(block_size, block_size); // 32*32
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); // 20*10
// 4. Compute on GPU
printf("Computing result using CUDA Kernel...\n");
MatrixMulCUDA<32> <<< grid, threads, 0, stream >>> (d_C, d_A, d_B, dimsA.x, dimsB.x);
printf("done\n");
cudaStreamSynchronize(stream);
// 5. Compute 300 times
cudaEventRecord(start, stream);
int nIter = 300;
for (int j = 0; j < nIter; j++) {
if (block_size == 32) {
MatrixMulCUDA<32> <<< grid, threads, 0, stream >>> (d_C, d_A, d_B, dimsA.x, dimsB.x);
}
}
cudaEventRecord(stop, stream);
cudaEventSynchronize(stop);
float msecTotal = 0.0f;
cudaEventElapsedTime(&msecTotal, start, stop);
float msecPerMatrixMul = msecTotal / nIter;
double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
printf(
"Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
" WorkgroupSize= %u threads/block\n",
gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);
// 6. Copy result from device to host
cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
printf("Checking computed result for correctness: ");
bool correct = true;
double eps = 1.e-6;
for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
double abs_err = fabs(h_C[i] - (dimsA.x * 0.1f));
double dot_length = dimsA.x;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
h_C[i], dimsA.x * 0.1f, eps);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
// 7. Clean up memory
cudaFreeHost(h_A);
cudaFreeHost(h_B);
cudaFreeHost(h_C);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaEventDestroy(start);
cudaEventDestroy(stop);
if (correct) {
return EXIT_SUCCESS;
}
else {
return EXIT_FAILURE;
}
}
int main()
{
int num_devices;
cudaGetDeviceCount(&num_devices);
struct cudaDeviceProp device_0_prop;
cudaGetDeviceProperties(&device_0_prop, 0);
printf("[Matrix Multiply Using CUDA] - Starting...\n");
int block_size = 32;
dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1); // 320*320
dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1); // 640*320
if (dimsA.x != dimsB.y)
{
printf("Error: outer matrix dim must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
exit(EXIT_FAILURE);
}
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
int matrix_result = MatrixMultiply(block_size, dimsA, dimsB); // 320*320*1 640*320*1
exit(matrix_result);
}
方法二:利用cuBLAS函数
创建.cpp文件。
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
typedef struct _matrixSize
{
unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize;
void matrixMulCPU(float* C, const float* A, const float* B, unsigned int hA, unsigned int wA, unsigned int wB)
{
for (unsigned int i = 0; i < hA; ++i)
for (unsigned int j = 0; j < wB; ++j)
{
double sum = 0;
for (unsigned int k = 0; k < wA; ++k)
{
double a = A[i * wA + k];
double b = B[k * wB + j];
sum += a * b;
}
C[i * wB + j] = (float)sum;
}
}
void randomInit(float* data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
}
void initializeCUDA(int& devID, int& iSizeMultiple, sMatrixSize& matrix_size)
{
cudaDeviceProp deviceProp;
cudaError_t error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d\n", error);
exit(EXIT_FAILURE);
}
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
int block_size = 32;
matrix_size.uiWA = 3 * block_size * iSizeMultiple;
matrix_size.uiHA = 4 * block_size * iSizeMultiple;
matrix_size.uiWB = 2 * block_size * iSizeMultiple;
matrix_size.uiHB = 3 * block_size * iSizeMultiple;
matrix_size.uiWC = 2 * block_size * iSizeMultiple;
matrix_size.uiHC = 4 * block_size * iSizeMultiple;
printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
matrix_size.uiHA, matrix_size.uiWA,
matrix_size.uiHB, matrix_size.uiWB,
matrix_size.uiHC, matrix_size.uiWC);
}
int matrixMultiply(int devID, sMatrixSize& matrix_size)
{
// allocate host memory for matrices A and B
unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*)malloc(mem_size_A);
unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*)malloc(mem_size_B);
srand(2006);
// initialize host memory
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// allocate host memory for the result
unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*)malloc(mem_size_C);
// allocate device memory
float* d_A, * d_B, * d_C;
cudaMalloc((void**)&d_A, mem_size_A);
cudaMalloc((void**)&d_B, mem_size_B);
cudaMalloc((void**)&d_C, mem_size_C);
// copy host to device
cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
// setup execution parameters
int block_size = 32;
dim3 threads(block_size, block_size);
dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);
printf("Computing result using CUBLAS...");
// execute the kernel
int nIter = 30;
// CUBLAS version 2.0
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cudaEvent_t start, stop;
cublasCreate(&handle);
//Perform warmup operation with cublas
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
&alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
// Allocate CUDA events that we'll use for timing
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, NULL); // Record the start event
for (int j = 0; j < nIter; j++)
{
// note cublas is column primary! need to transpose the order
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
&alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
}
printf("done.\n");
cudaEventRecord(stop, NULL); // Record the stop event
cudaEventSynchronize(stop); // Wait for the stop event to complete
float msecTotal = 0.0f;
cudaEventElapsedTime(&msecTotal, start, stop);
// Compute and print the performance
float msecPerMatrixMul = msecTotal / nIter;
double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
gigaFlops, msecPerMatrixMul, flopsPerMatrixMul);
// copy result from device to host
cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
// Destroy the handle
cublasDestroy(handle);
// compute reference solution
printf("Computing result using host CPU...");
float* reference = (float*)malloc(mem_size_C);
matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
printf("done.\n");
// clean up memory
free(h_A);
free(h_B);
free(h_C);
free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
return EXIT_SUCCESS;
}
int main(int argc, char** argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n");
int devID = 0, sizeMult = 5;
sMatrixSize matrix_size;
initializeCUDA(devID, sizeMult, matrix_size);
int matrix_result = matrixMultiply(devID, matrix_size);
return matrix_result;
}