我用 C++ 编写了一个简单的矩阵乘法程序,并且它有效。我只是 C++ 的初学者,但我已经成功地让它工作了。
让我困惑的是它比 NumPy 慢得多。我已经对它进行了基准测试。
因此,我尝试使用 OpenMP 来加速,但我观察到性能完全没有变化:
#include <algorithm>
#include <chrono>
#include <iostream>
#include <omp.h>
#include <string>
#include <vector>
using std::vector;
using std::chrono::high_resolution_clock;
using std::chrono::duration;
using std::chrono::duration_cast;
using std::chrono::microseconds;
using std::cout;
using line = vector<double>;
using matrix = vector<line>;
void fill(line &l) {
std::generate(l.begin(), l.end(), []() { return ((double)rand() / (RAND_MAX)); });
}
matrix random_matrx(int64_t height, int64_t width) {
matrix mat(height, line(width));
std::for_each(mat.begin(), mat.end(), fill);
return mat;
}
matrix dot_product(const matrix &mat0, const matrix &mat1) {
size_t h0, w0, h1, w1;
h0 = mat0.size();
w0 = mat0[0].size();
h1 = mat1.size();
w1 = mat1[0].size();
if (w0 != h1) {
throw std::invalid_argument("matrices cannot be cross multiplied");
}
matrix out(h0, line(w1));
for (int y = 0; y < h0; y++) {
for (int x = 0; x < w1; x++) {
double s = 0;
for (int z = 0; z < w0; z++) {
s += mat0[y][z] * mat1[z][x];
}
out[y][x] = s;
}
}
return out;
}
matrix dot_product_omp(const matrix& mat0, const matrix& mat1) {
size_t h0, w0, h1, w1;
h0 = mat0.size();
w0 = mat0[0].size();
h1 = mat1.size();
w1 = mat1[0].size();
if (w0 != h1) {
throw std::invalid_argument("matrices cannot be cross multiplied");
}
matrix out(h0, line(w1));
omp_set_num_threads(4);
#pragma omp parallel for private(y, x, z) schedule(dynamic)
for (int y = 0; y < h0; y++) {
for (int x = 0; x < w1; x++) {
double s = 0;
for (int z = 0; z < w0; z++) {
s += mat0[y][z] * mat1[z][x];
}
out[y][x] = s;
}
}
return out;
}
int main()
{
matrix a, b;
a = random_matrx(16, 9);
b = random_matrx(9, 24);
auto start = high_resolution_clock::now();
for (int64_t i = 0; i < 65536; i++) {
dot_product(a, b);
}
auto end = high_resolution_clock::now();
duration<double, std::nano> time = end - start;
double once = time.count() / 65536000;
cout << "mat(16, 9) * mat(9, 24): " + std::to_string(once) + " microseconds\n";
a = random_matrx(128, 256);
b = random_matrx(256, 512);
start = high_resolution_clock::now();
for (int64_t i = 0; i < 512; i++) {
dot_product(a, b);
}
end = high_resolution_clock::now();
time = end - start;
once = time.count() / 512000;
cout << "mat(128, 256) * mat(256, 512): " + std::to_string(once) + " microseconds\n";
start = high_resolution_clock::now();
for (int64_t i = 0; i < 512; i++) {
dot_product_omp(a, b);
}
end = high_resolution_clock::now();
time = end - start;
once = time.count() / 512000;
cout << "mat(128, 256) * mat(256, 512) omp: " + std::to_string(once) + " microseconds\n";
}
PS D:\MyScript> C:\Users\Xeni\source\repos\matmul\x64\Release\matmul.exe
mat(16, 9) * mat(9, 24): 5.200116 microseconds
mat(128, 256) * mat(256, 512): 30128.739453 microseconds
mat(128, 256) * mat(256, 512) omp: 30116.103125 microseconds
我使用 Visual Studio 2022、C++20 标准、编译器标志编译它:
/permissive- /ifcOutput "x64\Release\" /GS /GL /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /Ob2 /sdl /Fd"x64\Release\vc143.pdb" /Zc:inline /fp:precise /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /std:c17 /Gd /Oi /MD /std:c++20 /FC /Fa"x64\Release\" /EHsc /nologo /Fo"x64\Release\" /Ot /Fp"x64\Release\matmul.pch" /diagnostics:column
附加标志:
/arch:AVX2 /fp:fast
只是为什么没有任何改善呢?我怎样才能真正改善它?
我已将 OMP 版本更改为:
matrix dot_product_omp(const matrix& mat0, const matrix& mat1) {
size_t h0, w0, h1, w1;
h0 = mat0.size();
w0 = mat0[0].size();
h1 = mat1.size();
w1 = mat1[0].size();
if (w0 != h1) {
throw std::invalid_argument("matrices cannot be cross multiplied");
}
matrix out(h0, line(w1));
omp_set_num_threads(4);
#pragma omp parallel for schedule(dynamic)
for (int y = 0; y < h0; y++) {
for (int x = 0; x < w1; x++) {
double s = 0;
for (int z = 0; z < w0; z++) {
s += mat0[y][z] * mat1[z][x];
}
out[y][x] = s;
}
}
return out;
}
我已经编译了/openmp
flag,我已经进行了多次基准测试,它只使代码运行时间约为原始时间的四分之一:
PS D:\MyScript> C:\Users\Xeni\source\repos\matmul\x64\Release\matmul.exe
mat(16, 9) * mat(9, 24): 5.126476 microseconds
mat(128, 256) * mat(256, 512): 30999.137109 microseconds
mat(128, 256) * mat(256, 512) omp: 8574.475195 microseconds
NumPy 更快:
In [374]: a = np.random.random((128, 256))
In [375]: b = np.random.random((256, 512))
In [376]: %timeit a @ b
382 µs ± 19.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
我的代码慢了一个数量级。那么如何才能缩小性能差距呢?