cuda cpu功能-gpu内核重叠

2024-04-20

我在开发一个用于练习 CUDA 的应用程序时遇到了并发问题。我想利用 cudaMemcpyAsync 和 CUDA 内核的异步行为，让 GPU 和 CPU 分担工作，但我无法成功地让 CPU 执行和 GPU 执行重叠。

它与主机到设备的数据传输重叠，但内核执行不重叠。它基本上等待 CPU 完成并调用同步函数，然后内核开始在设备上执行。我无法理解这种行为，内核不是总是与 CPU 线程异步吗？

我的 GPU 是 Nvidia Geforce GT 550m（具有 1 个复制引擎和 1 个计算引擎的 Fermi 架构）。

我使用 CUDA 6.0 和 Nsight 4.0。

这是代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdlib.h>
#include <stdio.h>

#include <iostream>
#include <thread>
#include <chrono>
using namespace std;

// Four-component single-precision point used for both the host and device buffers.
struct point4D
{
    float x, y, z, w;
};

// Forward declaration of the driver that processes part of the data on the CPU
// and the rest on the GPU; the definition appears after main().
void heterogenous_1way_plus(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC);

// Forward declarations of the host-side helpers defined after main().
bool correct_output(point4D * data, unsigned int size);       // compares every element with the reference value
void flush_buffer(point4D * data, unsigned int size);         // sets every component of every element to 0
void initialize_input(point4D *& data, unsigned int size);    // fills the buffer with (1, 0, 0, 0)
void cudaCheckError(cudaError_t cudaStatus, char* err);       // on failure: prints err, resets the device, exits

// GPU-side cross product for 4D points: each thread handles one element.
//
// The element index is blockIdx.x * blockDim.x + threadIdx.x (1D layout) and it
// is used WITHOUT a bounds check, so the launch must not create more threads
// than there are elements in d_ptrData / d_out; elements beyond the launched
// thread count are left untouched.
//
// Each output component is a signed 3x3 determinant built from the other three
// components of d_ptrData[i], pB and pC.
__global__ void gpu_kernel(point4D * d_ptrData, point4D * d_out, point4D pB, point4D pC)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const point4D a = d_ptrData[gid];

    // Accumulator starts at zero and is added to, exactly like the CPU version.
    point4D r = {0, 0, 0, 0};

    r.x +=  a.y*(pB.z*pC.w - pC.z*pB.w) - a.z*(pB.y*pC.w - pC.y*pB.w) + a.w*(pB.y*pC.z - pC.y*pB.z);
    r.y += -a.x*(pB.z*pC.w - pC.z*pB.w) + a.z*(pB.x*pC.w - pC.x*pB.w) - a.w*(pB.x*pC.z - pC.x*pB.z);
    r.z +=  a.x*(pB.y*pC.w - pC.y*pB.w) - a.y*(pB.x*pC.w - pC.x*pB.w) + a.w*(pB.x*pC.y - pC.x*pB.y);
    r.w += -a.x*(pB.y*pC.z - pC.y*pB.z) + a.y*(pB.x*pC.z - pC.x*pB.z) - a.z*(pB.x*pC.y - pC.x*pB.y);

    d_out[gid] = r;
}

// CPU-side cross product for 4D points; same formulas as gpu_kernel.
//
// For each of the h_dataSize elements, h_out[i] is first cleared and then
// receives the signed 3x3 determinants built from h_ptrData[i], pB and pC.
void cpu_function(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC)
{
    for (unsigned int i = 0; i != h_dataSize; ++i)
    {
        point4D & dst = h_out[i];

        // Clear the destination before the input is read, then accumulate into it.
        dst.x = 0; dst.y = 0; dst.z = 0; dst.w = 0;

        const point4D a = h_ptrData[i];

        dst.x +=  a.y*(pB.z*pC.w - pC.z*pB.w) - a.z*(pB.y*pC.w - pC.y*pB.w) + a.w*(pB.y*pC.z - pC.y*pB.z);
        dst.y += -a.x*(pB.z*pC.w - pC.z*pB.w) + a.z*(pB.x*pC.w - pC.x*pB.w) - a.w*(pB.x*pC.z - pC.x*pB.z);
        dst.z +=  a.x*(pB.y*pC.w - pC.y*pB.w) - a.y*(pB.x*pC.w - pC.x*pB.w) + a.w*(pB.x*pC.y - pC.x*pB.y);
        dst.w += -a.x*(pB.y*pC.z - pC.y*pB.z) + a.y*(pB.x*pC.z - pC.x*pB.z) - a.z*(pB.x*pC.y - pC.x*pB.y);
    }
}


// Program entry point.
//
// Selects CUDA device 0, allocates and initializes the host input/output
// buffers, runs the combined CPU+GPU computation once and verifies the result.
// Returns EXIT_SUCCESS when every output element matches the reference value,
// EXIT_FAILURE otherwise (including when a host allocation fails).
int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;

    printf("[%s] - Starting...\n", argv[0]);

    int device_count;
    cudaCheckError(cudaGetDeviceCount(&device_count), "Couldn't get device count!");

    if (device_count == 0)
    {
        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    devID = 0;
    cudaCheckError(cudaSetDevice(devID), "Couldn't set device!");
    cudaCheckError(cudaGetDeviceProperties(&deviceProps, devID), "Couldn't get Device Properties");
    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProps.name, deviceProps.major, deviceProps.minor);

    // Start from a clean device state; a failure here is reported instead of ignored.
    cudaCheckError(cudaDeviceReset(), "cudaDeviceReset failed!");

    const unsigned int DATA_SIZE = 30000000;

    // Constant operands of the cross product.
    point4D pointB;
    pointB.x = 1; pointB.y = 1; pointB.z = 0; pointB.w = 0;

    point4D pointC;
    pointC.x = 1; pointC.y = 1; pointC.z = 1; pointC.w = 0;

    // Each buffer is DATA_SIZE * sizeof(point4D) bytes, so the allocations can
    // realistically fail; check them before touching the memory.
    point4D * data = (point4D*) malloc(DATA_SIZE * sizeof(point4D));
    point4D * out_points = (point4D*) malloc(DATA_SIZE * sizeof(point4D));
    if (data == NULL || out_points == NULL)
    {
        fprintf(stderr, "Host allocation of %u points failed.\n", DATA_SIZE);
        free(out_points);   // free(NULL) is a no-op
        free(data);
        return EXIT_FAILURE;
    }

    initialize_input(data, DATA_SIZE);
    flush_buffer(out_points, DATA_SIZE);
    cout << endl << endl;

    // 1+way: CPU and GPU each process their share of the data.
    heterogenous_1way_plus(data, DATA_SIZE, out_points, pointB, pointC);
    const bool bFinalResults = correct_output(out_points, DATA_SIZE); // checking correctness

    free(out_points);
    free(data);

    return bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE;
}

// Computes the cross product of every input point with pB and pC, splitting the
// work between the GPU (leading part of the buffers) and the CPU (trailing part).
//
// Parameters:
//   h_ptrData  - host input buffer holding h_dataSize points
//   h_dataSize - total number of points to process
//   h_out      - host output buffer holding h_dataSize points
//   pB, pC     - constant operands passed through to gpu_kernel / cpu_function
//
// The GPU work (H2D copy, kernel, D2H copy) is enqueued asynchronously on one
// stream; the CPU share is then computed on the calling thread before the
// stream is synchronized. Any CUDA failure terminates the process through
// cudaCheckError. The device is reset before returning on the GPU path.
void heterogenous_1way_plus(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC)
{
    cout << "1-way_plus: STARTS!!!" << endl;

    // Run 25% of the data on the CPU, the rest on the GPU.
    const unsigned int ratioPercentCPUtoGPU = 25;
    // Block size kept from the original configuration (note: not a multiple of the warp size).
    const unsigned int threadsPerBlock = 1000;
    const unsigned int totalSize = h_dataSize;

    // 64-bit intermediate: totalSize * percent can exceed 32 bits for large inputs.
    unsigned int d_dataSize =
        (unsigned int)(((unsigned long long)totalSize * (100 - ratioPercentCPUtoGPU)) / 100);
    // gpu_kernel has no bounds check, so the GPU share must be a whole number of blocks.
    d_dataSize -= d_dataSize % threadsPerBlock;
    // The CPU takes everything the GPU does not, so no element is left unprocessed.
    h_dataSize = totalSize - d_dataSize;

    const size_t memorySize = (size_t)d_dataSize * sizeof(point4D);

    cout << "Data Ratio Between CPU and GPU:" << (float)ratioPercentCPUtoGPU/100 << endl;
    cout << "CPU will process " << h_dataSize << " data." << endl;
    cout << "GPU will process " << d_dataSize << " data." << endl;

    float gpu_time = 0.0f;

    if (d_dataSize == 0)
    {
        // Input smaller than one block: a zero-sized launch/allocation would be
        // an error, so the CPU processes everything.
        cpu_function(h_ptrData, h_dataSize, h_out, pB, pC);

        cout << "Execution of GPU: " << gpu_time << "ms" << endl;
        cout << "1-way_plus: ENDS!!!" << endl;
        return;
    }

    // Register the GPU share of the host buffers as page-locked memory
    // (required for cudaMemcpyAsync to actually run asynchronously).
    cudaCheckError(cudaHostRegister(h_ptrData, memorySize, cudaHostRegisterPortable), "cudaHostRegister failed!");
    cudaCheckError(cudaHostRegister(h_out, memorySize, cudaHostRegisterPortable), "cudaHostRegister failed!");

    // allocate device memory
    point4D * d_in = 0; point4D * d_out = 0;
    cudaCheckError(cudaMalloc( (void **)&d_in, memorySize), "cudaMalloc failed!");
    cudaCheckError(cudaMalloc( (void **)&d_out, memorySize), "cudaMalloc failed!");

    // Kernel launch configuration: exactly one thread per GPU element.
    dim3 nThreads = dim3(threadsPerBlock,1);
    dim3 nBlocks = dim3(d_dataSize / threadsPerBlock,1);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda stream
    cudaStream_t stream;
    cudaCheckError(cudaStreamCreate(&stream), "cudaStreamCreate failed!");

    // create cuda event handles
    cudaEvent_t start, stop;
    cudaCheckError(cudaEventCreate(&start), "cudaEventCreate failed!");
    cudaCheckError(cudaEventCreate(&stop), "cudaEventCreate failed!");

    // main thread waits for device so the timing starts from an idle GPU
    cudaCheckError(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed!");
    cudaCheckError(cudaEventRecord(start, stream), "cudaEventRecord failed!");

    // Enqueue H2D copy, kernel and D2H copy; all three return immediately.
    cudaCheckError(cudaMemcpyAsync(d_in, h_ptrData, memorySize, cudaMemcpyHostToDevice, stream), "cudaMemcpyAsync (H2D) failed!");
    gpu_kernel<<<nBlocks, nThreads, 0, stream>>>(d_in, d_out, pB, pC);
    // A launch does not return a status; configuration errors surface here.
    cudaCheckError(cudaGetLastError(), "gpu_kernel launch failed!");
    cudaCheckError(cudaMemcpyAsync(h_out, d_out, memorySize, cudaMemcpyDeviceToHost, stream), "cudaMemcpyAsync (D2H) failed!");

    cudaCheckError(cudaEventRecord(stop, stream), "cudaEventRecord failed!");

    // The CPU share starts right after the GPU share in both buffers and is
    // processed while the stream work is in flight.
    cpu_function(h_ptrData + d_dataSize, h_dataSize, h_out + d_dataSize, pB, pC);

    cudaCheckError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed!");

    cudaCheckError(cudaEventElapsedTime(&gpu_time, start, stop), "cudaEventElapsedTime failed!");

    cudaCheckError(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed!");

    // release resources
    cudaCheckError(cudaEventDestroy(start), "cudaEventDestroy failed!");
    cudaCheckError(cudaEventDestroy(stop), "cudaEventDestroy failed!");
    cudaCheckError(cudaHostUnregister(h_ptrData), "cudaHostUnregister failed!");
    cudaCheckError(cudaHostUnregister(h_out), "cudaHostUnregister failed!");
    cudaCheckError(cudaFree(d_in), "cudaFree failed!");
    cudaCheckError(cudaFree(d_out), "cudaFree failed!");
    cudaCheckError(cudaStreamDestroy(stream), "cudaStreamDestroy failed!");

    cudaCheckError(cudaDeviceReset(), "cudaDeviceReset failed!");

    cout << "Execution of GPU: " << gpu_time << "ms" << endl;
    cout << "1-way_plus: ENDS!!!" << endl;
}

// Checks that every element of data equals the reference result (0, 0, 0, -1),
// which is what the cross-product formulas yield for input points (1, 0, 0, 0)
// with pB = (1, 1, 0, 0) and pC = (1, 1, 1, 0).
//
// Parameters:
//   data - buffer of computed points
//   size - number of points in data
//
// Returns true when all elements match. On the first mismatch it prints the
// offending element together with the reference value and returns false.
bool correct_output(point4D * data, unsigned int size)
{
    const static float x = 0, y = 0, z = 0, w = -1;

    for (unsigned int i = 0; i < size; i++)
    {
        // Each component is compared against its own reference component.
        if (data[i].x != x || data[i].y != y ||
            data[i].z != z || data[i].w != w)
        {
            // %u: the index is unsigned.
            printf("Error! data[%u] = [%f, %f, %f, %f], ref = [%f, %f, %f, %f]\n",
            i, data[i].x, data[i].y, data[i].z, data[i].w, x, y, z, w);

            return false;
        }
    }
    return true;
}

// Resets the buffer: every component of each of the `size` points becomes 0.
void flush_buffer(point4D * data, unsigned int size)
{
    const point4D zero = {0, 0, 0, 0};

    for (point4D * p = data, * const end = data + size; p != end; ++p)
    {
        *p = zero;
    }
}

// Fills the input buffer for the simulation: each of the `size` points is set
// to (1, 0, 0, 0). The pointer itself is not modified.
void initialize_input(point4D *& data, unsigned int size)
{
    const point4D seed = {1, 0, 0, 0};

    for (unsigned int i = 0; i != size; ++i)
    {
        data[i] = seed;
    }
}

// Terminates the program when a CUDA runtime call has failed.
//
// Parameters:
//   cudaStatus - status returned by a CUDA runtime call
//   err        - caller-supplied description of the failed operation
//
// On success the function returns immediately. Otherwise it prints the
// description together with the runtime's own error string to stderr, resets
// the device and exits with EXIT_FAILURE.
void cudaCheckError(cudaError_t cudaStatus, char* err)
{
    if (cudaStatus == cudaSuccess)
    {
        return;
    }

    // The message is passed as an argument, never as the format string, so a
    // '%' inside it cannot be interpreted as a conversion specifier.
    fprintf(stderr, "%s (%s)\n", err != NULL ? err : "CUDA error", cudaGetErrorString(cudaStatus));
    cudaDeviceReset();
    exit(EXIT_FAILURE);
}

And here is the Nsight screenshot :

从我在你的分析器图像上看到的情况来看，你得到了适当的重叠。我运行你的代码并看到类似的东西。

一般来说，代码中的关键顺序是这样的：

cudaMemcpyAsync H2D
内核调用
cudaMemcpyAsync D2H
cpu 函数（cpu_function）
cudaStreamSynchronize

CPU 线程按顺序处理这些步骤。步骤 1-3 是异步的，这意味着控制权会立即返回给 CPU 线程，而无需等待底层 CUDA 操作完成。并且您希望步骤 4 与步骤 1,2 和 3 尽可能多地重叠。

我们看到的是，cudaStreamSynchronize() 调用在时间线中大约出现在内核执行的开始处。这意味着在 cudaStreamSynchronize() 调用之前的所有 CPU 线程活动，在那个时间点（即大约在实际内核执行的开始点）已经全部完成。因此，我们希望与步骤 1-3 重叠的 cpu 函数（步骤 4）实际上在步骤 2 开始时（就实际 CUDA 执行而言）就已经执行完毕。因此，您的 cpu 函数与第一个主机->设备 memcpy 操作完全重叠。

所以它按预期工作。因为 cudaStreamSynchronize() 调用会阻塞 CPU 线程，直到所有流活动完成为止，所以它在时间线上占用从遇到该调用时起到流活动完成时为止的这一段。

cudaStreamSynchronize() 调用恰好与内核执行的开始相重合，并且 H2D memcpy 的结束和内核的开始之间存在间隙，这一现象看起来有些奇怪，可能是由 WDDM 的命令批处理造成的。当我在 Linux 下分析你的代码时，我没有看到这种间隙和精确的重合，但除此之外，总体流程是相同的。这是我在 Linux 下使用可视化分析器看到的结果：

请注意，在上图中，cudaStreamSynchronize() 实际上是在 H2D memcpy 操作期间（内核开始之前）被遇到的。

在回答评论中的问题时，我修改了应用程序，使分割百分比从 25 变为 50：

unsigned int ratioPercentCPUtoGPU = 50;

新的探查器输出如下所示：

我们看到 CPU 相对于 GPU 内核调用花费了更多时间，因此 CPU 线程直到 D2H memcpy 操作期间才遇到 cudaStreamSynchronize() 调用。我们继续看到，在 Linux 下这个点和内核执行的开始没有固定的关系。现在，CPU 执行与 H2D memcpy、内核执行以及 D2H memcpy 的一小部分完全重叠。

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

C11

CUDA

cuda cpu功能-gpu内核重叠的相关文章

GoogleTest 枚举类的测试错误打印

我正在使用 GoogleTest 版本 1 7 0 来测试我的 C 应用程序我有一个枚举定义如下 namespace MyNamespace enum class MyEnum MyEnumValue MyEnumValue2 Googl
为什么大多数平台上没有“aligned_realloc”？

MSVC有自己的非标准函数 aligned malloc aligned realloc and aligned free C 17和C11引入了 std aligned alloc 其结果可以是de分配有free or realloc B
std::condition_variable::wait_for 和 std::condition_variable::wait_until 有什么区别？

The 我正在使用的参考 http en cppreference com w cpp thread condition variable对两者的解释如下 wait for 阻塞当前线程直到条件变量被唤醒或在指定的超时持续时间之后 wai
C++0x 可变参数模板按引用传递

我想为我的应用程序使用可变参数模板功能但我不希望对象按值传递因为在我的情况下对象非常复杂我想通过引用传递它们而不是作为指针 void func template
VS 程序在调试模式下崩溃，但在发布模式下不崩溃？

我正在 VS 2012 中运行以下程序来尝试 Thrust 函数查找 include cuda runtime h include device launch parameters h include
make_shared<>() 中的 WKWYL 优化是否会给某些多线程应用程序带来惩罚？

前几天我偶然看到这个非常有趣的演示 http channel9 msdn com Events GoingNative GoingNative 2012 STL11 Magic Secrets作者 Stephan T Lavavej 其中提
模板化的 typedef？

我正在使用 libgc 一个用于 C 和 C 的垃圾收集器为了使 STL 容器可被垃圾回收必须使用 gc allocator 而不是写作 std vector
如何使用 std::array 模拟 C 数组初始化“int arr[] = { e1, e2, e3, ... }”行为？

注意这个问题是关于不必指定元素数量并且仍然允许直接初始化嵌套类型这个问题 https stackoverflow com questions 6111565 now that we have stdarray what uses are
过滤参数包的类型

我想知道是否可以过滤传递给可变参数模板基于谓词模板的类型以生成另一个包含满足谓词的类型的可变参数模板 Filter a parameter pack template
是否允许将 std::vector 的元素插入到同一向量中？

考虑以下insert and emplace的成员函数std vector
C++11 Geany 设置

我正在学习 C 我需要在 Geany 中为 C 11 正确设置编译和构建命令我以为我的理解是正确的但是当使用时auto 我收到以下错误 warning auto will change meaning in C 0x please re
“gld/st_throughput”和“dram_read/write_throughput”指标之间有什么区别？

在 CUDA 可视化分析器版本 5 中我知道 gld st requested throughput 是应用程序请求的内存吞吐量然而当我试图找到硬件的实际吞吐量时我很困惑因为有两对似乎合格的指标它们是 gld st throug
在 NaN 情况下 to_string() 可以返回什么

我使用 VS 2012 遇到了非常令人恼火的行为有时我的浮点数是 NaN auto dbgHelp std to string myFloat dbgHelp最终包含5008角色你不能发明这个东西其中大部分为0 最终结果是 0 INF
Linux 上哪个版本的 C++ 库符合“ISO C++ 11”标准？

目前我的计算机上有 Debian Squeeze AMD64 linux libstdc 5 和 libstdc 6 这些 C 库符合 ISO 标准 C 11 吗不它们并不完全符合但它们有一些元素 stdlibc 上的 C 11 支持
用于虚拟继承的虚拟函数表中的虚拟基偏移量

代码如下在Ubuntu 16 04上用G 5 4编译的C 11代码 include
在 C++11 中移出 stdpriority_queue 的元素

最小的工作示例 include
如何运行和理解CUDA Visual Profiler？

我已经设置了 CUDA 5 0 并且我的 CUDA 项目运行良好但我不知道如何使用 Visual Profiler 分析我的 CUDA 项目如何运行它我还需要安装更多吗又该如何做呢我的电脑使用Window 7 64位 CUDA 5
C++11 基于范围的 for 循环效率“const auto &i”与“auto i”

在 C 11 中我可以像这样迭代一些容器 for auto i vec std cout lt lt i lt lt std endl 但我知道这是不必要的不必要地因为我只需要print的价值观vec 复制 EDIT 的每个元素vec
具有 Cuda Thrust 的多个 GPU？

如何将 Thrust 与多个 GPU 一起使用这只是使用 cudaSetDevice deviceId 的问题吗然后运行相关的 Thrust 代码使用 CUDA 4 0 或更高版本 cudaSetDevice deviceId 接下来
局部静态变量初始化是线程安全的[重复]

这个问题在这里已经有答案了假设我有一个包含三个静态函数的类如下所示 include

随机推荐

在Python中删除小数点后的尾随零

我正在使用Python 2 7 我需要更换 0 字符串在末尾比如说 a 2 50 a a replace 0 我得到 a 2 5 我对这个结果很满意现在a 200 a a replace 0 我得到 a 2 这个输出是按照我同意的设计的
如何从文本文件中逐行读取并按字符分割行？ [复制]

这个问题在这里已经有答案了我正在写一个 Bash 脚本我的问题是我想从文本文件中逐行读取并按字符分割行我想要纯 Bash 代码假设我在文本文件中有这个格式姓名用户名代码 John Doe johnDoe 534092 Joh
识别并解决 Oracle ITL 死锁

我有一个 Oracle DB 包它经常导致我认为是 ITL 感兴趣的事务列表死锁跟踪文件的相关部分如下 Deadlock graph Blocker s Waiter s Resource Name process session h
i18next 翻译外部组件

我是 i18next 的新手正在尝试本地化翻译网站一切都适用于组件内部的翻译但在外部意味着带有 i18n t 的 json 文件它不会检索所需的信息而是显示默认值我正在使用 create react app 它是文件夹引用的
Qt 样式表：无法使用 ID 选择器

我正在学习使用 Qt 样式表向我的应用程序添加不同的样式我在网上查看了 Qt 文档其中说你可以使用名为ID Selector可以将主题应用于某些对象这就是我实现此功能的方式 QPushButton button color red 但
为什么 Java 的 keytool 没有显示使用 openssl 创建的 PKCS12 信任存储的条目？

我不确定为什么 Javakeytool认为我的 p12密钥库为空如果我创建一个新的自签名证书并将其放入truststore p12pkcs12 密钥库openssl 像这样 openssl req x509 newkey rsa 4096
使用 BroadcastReceiver 进行推送通知时不会被调用

我正在尝试在插件中使用 Android 的 GCM 推送通知我正在从 Google 服务器获取有效的注册 ID 但是当我从服务器发送任何数据时 BroadcastReceiver根本没有被叫到我认为问题在于清单中的权限这是我正在使用的
如何在 Rails 表单中添加下拉

cuda cpu功能-gpu内核重叠

cuda cpu功能-gpu内核重叠 的相关文章

随机推荐

cuda cpu功能-gpu内核重叠的相关文章