#include <iostream>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
/*
A * B = C
A is m x l; each row holds [0, 1, 2, ..., 1023]
B is l x n, filled entirely with ones
C is m x n (so each element of C should equal 0+1+...+1023)
*/
// Matrix dimensions shared by the CPU reference implementations and the
// GPU kernel driver below.
const int M = 2048;
const int L = 1024;
const int N = 512;
/*cpu 上矩阵相乘 一般矩阵乘法*/
/* CPU matrix multiply, naive triple loop (i-j-k order).
 * Computes c = a * b where a is m x l, b is l x n and c is m x n,
 * all stored row-major. c is fully overwritten.
 * The dot product is accumulated in double regardless of T.
 */
template <typename T>
void mat_dot_cpu_1(T* a, T* b, T* c, int m, int n, int l)
{
    for (int row = 0; row < m; ++row)
    {
        const T* a_row = a + row * l;   // current row of A
        T* c_row = c + row * n;         // matching output row of C
        for (int col = 0; col < n; ++col)
        {
            double acc = 0.0;
            for (int k = 0; k < l; ++k)
            {
                acc += a_row[k] * b[k * n + col];
            }
            c_row[col] = acc;
        }
    }
}
/* CPU matrix multiply, loop-interchanged (i-k-j) variant.
 * Computes c = a * b where a is m x l, b is l x n and c is m x n, row-major.
 * A is read sequentially: each element of A scales one full row of B, which
 * is accumulated into the matching row of C — the inner loop walks both b
 * and c contiguously, unlike the naive j-inner version.
 * Fix: c is zeroed before accumulation, so the result no longer depends on
 * the caller pre-initializing c (mat_dot_cpu_1/3 likewise overwrite c, and
 * main() never zeroes it).
 */
template <typename T>
void mat_dot_cpu_2(T* a, T* b, T* c, int m, int n, int l)
{
    for (int i = 0; i < m; ++i)
    {
        // Clear the output row first: the k-loop below accumulates into it.
        for (int j = 0; j < n; ++j)
        {
            c[i * n + j] = T(0);
        }
        for (int k = 0; k < l; ++k)
        {
            double temp = a[i * l + k];
            for (int j = 0; j < n; ++j)
            {
                c[i * n + j] += temp * b[k * n + j];
            }
        }
    }
}
/* CPU matrix multiply via transposed B.
 * Computes c = a * b where a is m x l, b is l x n and c is m x n, row-major.
 * B is first transposed into a scratch buffer so the innermost dot product
 * reads both operands contiguously (cache-friendly and vectorizable): the
 * "row of A times column of B" product becomes "row of A times row of b_t".
 */
template <typename T>
void mat_dot_cpu_3(T* a, T* b, T* c, int m, int n, int l)
{
    // Fix: size the scratch buffer from the parameters (l, n); the original
    // used the file-scope constants L and N, overflowing the heap whenever
    // l * n exceeds L * N.
    T* b_t = (T*)malloc((size_t)l * n * sizeof(T));
    if (NULL == b_t)
    {
        return; // allocation failed: leave c untouched
    }
    // Transpose b (l x n) into b_t (n x l).
    for (int i = 0; i < n; ++i)
    {
        for (int j = 0; j < l; ++j)
        {
            b_t[i * l + j] = b[j * n + i];
        }
    }
    // Row-against-row dot products: row i of a with row j of b_t.
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < n; ++j)
        {
            double temp = 0.0;
            for (int k = 0; k < l; ++k)
            {
                temp += a[i * l + k] * b_t[j * l + k];
            }
            c[i * n + j] = temp;
        }
    }
    free(b_t);
}
/* GPU matrix multiply kernel: one thread per output element.
 * a is m x l, b is l x n, c is m x n — row-major device pointers.
 * Launch with a 2D grid whose x dimension covers at least n columns and
 * whose y dimension covers at least m rows.
 */
template <typename T>
__global__ void mat_dot_gpu(T* a, T* b, T* c, int m, int n, int l)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Bounds check: the grid is rounded up and may overshoot the matrix.
    if (row < m && col < n)
    {
        // Fix: accumulate in T, not float, so instantiations with double
        // do not silently lose precision. (Params also renamed to lowercase
        // to stop shadowing the file-scope constants M/N/L.)
        T temp = 0;
        for (int k = 0; k < l; k++)
        {
            temp += a[row * l + k] * b[k * n + col];
        }
        c[row * n + col] = temp;
    }
}
/* Driver: fills A (each row is 0..L-1) and B (all ones), runs the GPU
 * multiply 100 times and reports elapsed wall time in milliseconds.
 * Fixes vs. original:
 *  - host matrices heap-allocated (the ~13 MB of stack arrays overflowed
 *    typical stack limits),
 *  - the uninitialized C buffer is no longer copied host-to-device,
 *  - grid size uses exact ceil-division instead of always adding a block,
 *  - clock() ticks are converted to milliseconds before printing,
 *  - kernel launch errors are surfaced via cudaGetLastError().
 */
int main()
{
    float* a = new float[(size_t)M * L];
    float* b = new float[(size_t)L * N];
    float* c = new float[(size_t)M * N];
    for (int i = 0; i < M; ++i)
    {
        for (int j = 0; j < L; ++j)
        {
            a[(size_t)i * L + j] = (float)j;
        }
    }
    for (int i = 0; i < L; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            b[(size_t)i * N + j] = 1.0f;
        }
    }
    float* dev_a = NULL, * dev_b = NULL, * dev_c = NULL;
    cudaMalloc(&dev_a, sizeof(float) * M * L);
    cudaMemcpy(dev_a, a, sizeof(float) * M * L, cudaMemcpyHostToDevice);
    cudaMalloc(&dev_b, sizeof(float) * L * N);
    cudaMemcpy(dev_b, b, sizeof(float) * L * N, cudaMemcpyHostToDevice);
    cudaMalloc(&dev_c, sizeof(float) * M * N);
    // The kernel writes every element of C, so no host-to-device copy of
    // the (previously uninitialized) C buffer is needed.
    dim3 threads_per_block(16, 16, 1); // a 16 x 16 block of threads
    // Ceil-division: covers the whole matrix without a redundant extra
    // row/column of blocks when the size divides evenly.
    dim3 number_of_blocks((N + threads_per_block.x - 1) / threads_per_block.x,
                          (M + threads_per_block.y - 1) / threads_per_block.y, 1);
    clock_t start = clock();
    for (int i = 0; i < 100; ++i)
    {
        mat_dot_gpu<<<number_of_blocks, threads_per_block>>>(dev_a, dev_b, dev_c, M, N, L);
    }
    cudaError_t err = cudaGetLastError(); // catches bad launch configurations
    // Blocking copy: also waits for all queued kernels to finish, so the
    // timing below includes the kernel work, not just launch overhead.
    cudaMemcpy(c, dev_c, sizeof(float) * M * N, cudaMemcpyDeviceToHost);
    clock_t end = clock();
    if (err != cudaSuccess)
    {
        std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl;
    }
    std::cout << 1000.0 * (end - start) / CLOCKS_PER_SEC << "ms" << std::endl;
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    delete[] a;
    delete[] b;
    delete[] c;
    return 0;
}
// --- Second example: calling the cuBLAS library ---
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include <iostream>
#include <stdlib.h>
using namespace std;
// Dimensions of the test matrices: A is A_ROW x A_COL, B is B_ROW x B_COL.
// A_COL must equal B_ROW for the product A*B (A_ROW x B_COL) to be defined.
int const A_ROW = 5;
int const A_COL = 6;
int const B_ROW = 6;
int const B_COL = 7;
/* cuBLAS demo: C = A * B for small random row-major matrices.
 * cuBLAS is column-major, so both row-major inputs are passed with
 * CUBLAS_OP_T and their row-major leading dimensions; the product lands in
 * d_C in column-major order, i.e. the linear buffer holds (A*B) transposed.
 * Fixes vs. original:
 *  - cublasStatus_t results are actually checked (status was unused),
 *  - the cuBLAS handle is destroyed (the original leaked it),
 *  - the transposed-result printout breaks lines every A_ROW elements
 *    (the transpose is B_COL x A_ROW), not every B_COL.
 */
int main()
{
    cublasStatus_t status;
    float* h_A, * h_B, * h_C;                            // host-side matrices
    h_A = (float*)malloc(sizeof(float) * A_ROW * A_COL); // allocate host buffers
    h_B = (float*)malloc(sizeof(float) * B_ROW * B_COL);
    h_C = (float*)malloc(sizeof(float) * A_ROW * B_COL);
    // Fill A and B with random integers in [1, 10].
    for (int i = 0; i < A_ROW * A_COL; i++)
    {
        h_A[i] = (float)(rand() % 10 + 1);
    }
    for (int i = 0; i < B_ROW * B_COL; i++)
    {
        h_B[i] = (float)(rand() % 10 + 1);
    }
    // Print the input matrices.
    cout << "矩阵 A :" << endl;
    for (int i = 0; i < A_ROW * A_COL; i++)
    {
        cout << h_A[i] << " ";
        if ((i + 1) % A_COL == 0) cout << endl;
    }
    cout << endl;
    cout << "矩阵 B :" << endl;
    for (int i = 0; i < B_ROW * B_COL; i++)
    {
        cout << h_B[i] << " ";
        if ((i + 1) % B_COL == 0) cout << endl;
    }
    cout << endl;
    float* d_A, * d_B, * d_C; // device-side matrices
    cudaMalloc((void**)& d_A, sizeof(float) * A_ROW * A_COL);
    cudaMalloc((void**)& d_B, sizeof(float) * B_ROW * B_COL);
    cudaMalloc((void**)& d_C, sizeof(float) * A_ROW * B_COL);
    cublasHandle_t handle;
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        cout << "cublasCreate failed: " << status << endl;
        cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
        free(h_A); free(h_B); free(h_C);
        return 1;
    }
    cudaMemcpy(d_A, h_A, sizeof(float) * A_ROW * A_COL, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, sizeof(float) * B_ROW * B_COL, cudaMemcpyHostToDevice);
    float a = 1, b = 0; // gemm computes C = alpha*op(A)*op(B) + beta*C
    status = cublasSgemm(
        handle,
        CUBLAS_OP_T, // A is row-major: transpose its column-major view
        CUBLAS_OP_T, // likewise for B
        A_ROW,       // m: rows of op(A) and of C
        B_COL,       // n: columns of op(B) and of C
        A_COL,       // k: columns of op(A) / rows of op(B) (== B_ROW)
        &a,          // alpha
        d_A,         // left operand
        A_COL,       // lda: row length of A as stored (row-major)
        d_B,         // right operand
        B_COL,       // ldb: row length of B as stored (row-major)
        &b,          // beta
        d_C,         // result
        A_ROW        // ldc: C is written column-major, ld = its row count
    );
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        cout << "cublasSgemm failed: " << status << endl;
    }
    cublasDestroy(handle); // fix: original leaked the handle
    // d_C holds A*B in column-major order; read linearly it is the
    // row-major transpose (B_COL rows of A_ROW elements each).
    std::cout << "计算结果的转置 ( (A*B)的转置 ):" << std::endl;
    cudaMemcpy(h_C, d_C, sizeof(float) * A_ROW * B_COL, cudaMemcpyDeviceToHost);
    for (int i = 0; i < A_ROW * B_COL; ++i)
    {
        std::cout << h_C[i] << " ";
        if ((i + 1) % A_ROW == 0) std::cout << std::endl;
    }
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    system("pause");
    return 0;
}