由于 MPI-3 具有共享内存并行功能,并且它似乎与我的应用程序完美匹配,因此我正在认真考虑将我的混合 OpenMP-MPI 代码重写为纯 MPI 实现。
为了给棺材里钉上最后一颗钉子,我决定运行一个小程序来测试 OpenMP fork/join 机制的延迟。这是代码(为英特尔编译器编写):
// Writes sin(x) * cos(x) into every element of t1, where x is the element of
// t2 at the same position. The loop is an OpenMP `parallel for` with a static
// schedule, so every call enters and leaves exactly one parallel region.
//
// t1: destination; its size determines the iteration count.
// t2: source; indexed over [0, t1.size()), so it must be at least that long.
void action1(std::vector<double>& t1, std::vector<double>& t2)
{
    double* const out = t1.data();
    const double* const in = t2.data();
    const std::size_t count = t1.size();

#pragma omp parallel for schedule(static) num_threads(std::thread::hardware_concurrency())
    for (std::size_t i = 0; i < count; ++i)
    {
        // Read once before the store so the result is the same even if the
        // caller passes the same vector for both arguments.
        const double x = in[i];
        out[i] = std::sin(x) * std::cos(x);
    }
}
// Writes x * sin(x) into every element of t1, where x is the element of t2 at
// the same position. Runs as a single statically scheduled OpenMP
// `parallel for` region.
//
// t1: destination; its size determines the iteration count.
// t2: source; indexed over [0, t1.size()), so it must be at least that long.
void action2(std::vector<double>& t1, std::vector<double>& t2)
{
    const std::size_t element_count = t1.size();

#pragma omp parallel for schedule(static) num_threads(std::thread::hardware_concurrency())
    for (std::size_t pos = 0; pos < element_count; ++pos)
    {
        const double x = t2[pos];
        t1[pos] = x * std::sin(x);
    }
}
// Writes the square of each t2 element into the matching slot of t1. Runs as
// a single statically scheduled OpenMP `parallel for` region.
//
// t1: destination; its size determines the iteration count.
// t2: source; indexed over [0, t1.size()), so it must be at least that long.
void action3(std::vector<double>& t1, std::vector<double>& t2)
{
    double* const squares = t1.data();
    const double* const values = t2.data();
    const std::size_t count = t1.size();

#pragma omp parallel for schedule(static) num_threads(std::thread::hardware_concurrency())
    for (std::size_t i = 0; i < count; ++i)
    {
        const double value = values[i];
        squares[i] = value * value;
    }
}
// Writes the square root of each t2 element into the matching slot of t1.
// Runs as a single statically scheduled OpenMP `parallel for` region.
//
// t1: destination; its size determines the iteration count.
// t2: source; indexed over [0, t1.size()), so it must be at least that long.
void action4(std::vector<double>& t1, std::vector<double>& t2)
{
    const std::size_t element_count = t1.size();
    double* const roots = t1.data();
    const double* const radicands = t2.data();

#pragma omp parallel for schedule(static) num_threads(std::thread::hardware_concurrency())
    for (std::size_t pos = 0; pos < element_count; ++pos)
    {
        roots[pos] = std::sqrt(radicands[pos]);
    }
}
// Writes twice the value of each t2 element into the matching slot of t1.
// Runs as a single statically scheduled OpenMP `parallel for` region.
//
// t1: destination; its size determines the iteration count.
// t2: source; indexed over [0, t1.size()), so it must be at least that long.
void action5(std::vector<double>& t1, std::vector<double>& t2)
{
    const std::size_t count = t1.size();

#pragma omp parallel for schedule(static) num_threads(std::thread::hardware_concurrency())
    for (std::size_t i = 0; i < count; ++i)
    {
        const double x = t2[i];
        t1[i] = x * 2.0;
    }
}
// Performs the arithmetic of action1..action5 inside ONE OpenMP
// `parallel for` region instead of five.
//
// Every statement stores to the same slot of t1, so only the last store
// (x * 2.0) is observable afterwards. The earlier stores are kept so this
// loop performs the same per-element operations as the five separate actions.
//
// t1: destination; its size determines the iteration count.
// t2: source; indexed over [0, t1.size()), so it must be at least that long.
void all_actions(std::vector<double>& t1, std::vector<double>& t2)
{
    double* const out = t1.data();
    const double* const in = t2.data();
    const std::size_t count = t1.size();

#pragma omp parallel for schedule(static) num_threads(std::thread::hardware_concurrency())
    for (std::size_t i = 0; i < count; ++i)
    {
        // The source element is deliberately re-read for every statement:
        // caching it would change the result when t1 and t2 are the same
        // vector.
        out[i] = std::sin(in[i]) * std::cos(in[i]);
        out[i] = in[i] * std::sin(in[i]);
        out[i] = in[i] * in[i];
        out[i] = std::sqrt(in[i]);
        out[i] = in[i] * 2.0;
    }
}
// Benchmark driver.
//
// Phase 1 times `repetitions` rounds of action1..action5, i.e. five parallel
// regions per round. Phase 2 times the same number of rounds of all_actions,
// i.e. one parallel region per round. The difference between the two elapsed
// times, divided by the four extra regions per round, is reported as the
// fork/join latency.
//
// Throws std::logic_error when the split phase turns out faster than the
// fused phase.
int main()
{
    // Problem size and number of timed rounds.
    const auto element_count = std::size_t{8000000};
    const auto repetitions = std::size_t{500};

    // Value-initialized (all-zero) working buffers.
    auto output = std::vector<double>(element_count);
    auto input = std::vector<double>(element_count);

    const auto stopwatch = timer::spot_timer();

    // ---- Phase 1: five separate parallel regions per round ----
    const auto split_begin = stopwatch.duration_in_us();
    for (std::size_t round = 0; round < repetitions; ++round)
    {
        // `#pragma noinline` applies to the call that follows it (the
        // surrounding text says the code targets the Intel compiler).
#pragma noinline
        action1(output, input);
#pragma noinline
        action2(output, input);
#pragma noinline
        action3(output, input);
#pragma noinline
        action4(output, input);
#pragma noinline
        action5(output, input);
    }
    const auto split_end = stopwatch.duration_in_us();

    // ---- Phase 2: one fused parallel region per round ----
    const auto fused_begin = stopwatch.duration_in_us();
    for (std::size_t round = 0; round < repetitions; ++round)
    {
#pragma noinline
        all_actions(output, input);
    }
    const auto fused_end = stopwatch.duration_in_us();

    const auto split_elapsed = split_end - split_begin;
    const auto fused_elapsed = fused_end - fused_begin;
    if (split_elapsed < fused_elapsed)
    {
        throw std::logic_error("negative_latency_error");
    }

    // Five regions versus one region leaves four extra fork/joins per round.
    const auto extra_regions = repetitions * 4;
    const auto fork_join_latency = (split_elapsed - fused_elapsed) / extra_regions;

    // NOTE(review): the reported count comes from omp_get_max_threads(), while
    // the loops request std::thread::hardware_concurrency() threads; confirm
    // the two agree in the environment under test.
    std::cout << "Ran the program with " << omp_get_max_threads() << ", the calculated fork/join latency is: " << fork_join_latency << " us" << std::endl;
    return 0;
}
正如您所看到的,其想法是单独执行一组操作(每个操作都在一个 OpenMP 循环内)并计算其平均持续时间,然后一起执行所有这些操作(在同一个 OpenMP 循环内)并计算其平均持续时间。然后我们有一个两个变量的线性方程组,其中之一是 fork/join 机制的延迟,可以求解该方程以获得该值。
问题:
- 我是否忽略了什么?
- 目前,我正在使用“-O0”来阻止 smarty-pants 编译器执行其有趣的操作。我应该使用哪种编译器优化,这些优化也会对延迟本身等产生影响吗?
- 在我的 6 核 Coffee Lake 处理器上,我测得延迟约为 850 us。这听起来正确吗?
Edit 3
1) 根据 @paleonix 的建议,我在一开始就加入了热身计算,
2) 为了简单起见,我减少了操作的数量,并且,
3) 我已切换到“omp_get_wtime”以使其易于理解。
我现在使用标志 -O3 运行以下代码:
// Replaces every element of t1 with its sine, in place. Runs as a single
// statically scheduled OpenMP `parallel for` region.
//
// t1: buffer that is both read and overwritten.
void action1(std::vector<double>& t1)
{
    double* const values = t1.data();
    const std::size_t count = t1.size();

#pragma omp parallel for schedule(static) num_threads(std::thread::hardware_concurrency())
    for (std::size_t i = 0; i < count; ++i)
    {
        values[i] = std::sin(values[i]);
    }
}
// Replaces every element of t1 with its cosine, in place. Runs as a single
// statically scheduled OpenMP `parallel for` region.
//
// t1: buffer that is both read and overwritten.
void action2(std::vector<double>& t1)
{
    const std::size_t element_count = t1.size();

#pragma omp parallel for schedule(static) num_threads(std::thread::hardware_concurrency())
    for (std::size_t pos = 0; pos < element_count; ++pos)
    {
        double& cell = t1[pos];
        cell = std::cos(cell);
    }
}
// Replaces every element of t1 with its arctangent, in place. Runs as a
// single statically scheduled OpenMP `parallel for` region.
//
// t1: buffer that is both read and overwritten.
void action3(std::vector<double>& t1)
{
    double* const values = t1.data();
    const std::size_t count = t1.size();

#pragma omp parallel for schedule(static) num_threads(std::thread::hardware_concurrency())
    for (std::size_t i = 0; i < count; ++i)
    {
        const double angle_input = values[i];
        values[i] = std::atan(angle_input);
    }
}
// Applies sin to t1, cos to t2 and atan to t3, element-wise and in place,
// inside ONE statically scheduled OpenMP `parallel for` region.
//
// t1: updated with sin; its size determines the iteration count.
// t2: updated with cos; indexed over [0, t1.size()).
// t3: updated with atan; indexed over [0, t1.size()).
void all_actions(std::vector<double>& t1, std::vector<double>& t2, std::vector<double>& t3)
{
    double* const sines = t1.data();
    double* const cosines = t2.data();
    double* const arctangents = t3.data();
    const std::size_t count = t1.size();

#pragma omp parallel for schedule(static) num_threads(std::thread::hardware_concurrency())
    for (std::size_t i = 0; i < count; ++i)
    {
        // NOTE(review): the optimize off/on pair presumably aims to keep the
        // three updates from being optimized; confirm the compiler honours
        // this pragma inside a function body.
#pragma optimize("", off)
        sines[i] = std::sin(sines[i]);
        cosines[i] = std::cos(cosines[i]);
        arctangents[i] = std::atan(arctangents[i]);
#pragma optimize("", on)
    }
}
// Benchmark driver (revised version).
//
// After a warm-up phase, part A times `experiment_count` rounds of
// action1/action2/action3 (three parallel regions per round) and part B times
// the same number of rounds of all_actions (one parallel region per round).
// The elapsed-time difference divided by the two extra regions per round is
// printed as the fork/join latency in microseconds.
int main()
{
    // 1.5M doubles = 12 MB per buffer; the author intends this to exceed
    // cache capacity.
    const auto n = std::size_t{1500000};
    const auto experiment_count = std::size_t{1000};

    // Value-initialized (all-zero) buffers. Each phase gets its own set:
    // warm-up uses 1-3, part A uses 4-6, part B uses 7-9.
    auto warm_sin = std::vector<double>(n);
    auto warm_cos = std::vector<double>(n);
    auto warm_atan = std::vector<double>(n);
    auto split_sin = std::vector<double>(n);
    auto split_cos = std::vector<double>(n);
    auto split_atan = std::vector<double>(n);
    auto fused_sin = std::vector<double>(n);
    auto fused_cos = std::vector<double>(n);
    auto fused_atan = std::vector<double>(n);

    // ---- Warm-up: untimed rounds so thread start-up cost is paid here ----
    const auto warmup_rounds = experiment_count / 10;
    for (std::size_t round = 0; round < warmup_rounds; ++round)
    {
        all_actions(warm_sin, warm_cos, warm_atan);
    }

    // ---- Part A: three separate parallel regions per round ----
    const auto split_begin = omp_get_wtime();
    for (std::size_t round = 0; round < experiment_count; ++round)
    {
        action1(split_sin);
        action2(split_cos);
        action3(split_atan);
    }
    const auto split_end = omp_get_wtime();

    // ---- Part B: one fused parallel region per round ----
    const auto fused_begin = omp_get_wtime();
#pragma nofusion
    for (std::size_t round = 0; round < experiment_count; ++round)
    {
        all_actions(fused_sin, fused_cos, fused_atan);
    }
    const auto fused_end = omp_get_wtime();

    const auto split_elapsed = split_end - split_begin;
    const auto fused_elapsed = fused_end - fused_begin;

    // Three regions versus one region leaves two extra fork/joins per round.
    const auto extra_regions = experiment_count * 2;
    const auto fork_join_latency = (split_elapsed - fused_elapsed) / extra_regions;

    // The latency is scaled by 1E+6 and labelled "us" (microseconds) below.
    std::cout << "Ran the program with " << omp_get_max_threads() << ", the calculated fork/join latency is: "
              << fork_join_latency * 1E+6 << " us" << std::endl;
    return 0;
}
这样,测得的延迟现在为 115 us。现在令我困惑的是,当动作改变时,这个值也会随之变化。根据我的逻辑,由于我在 A 部分和 B 部分中执行相同的操作,因此实际上应该没有任何变化。为什么会发生这种情况?