cuda nms

2023-11-02

  /**
   * Batched non-maximum suppression over per-image candidate detections.
   *
   * Two modes, selected by the workspace arguments (CUB style):
   *   - workspace == nullptr or workspace_size == 0: nothing is launched; the
   *     number of scratch bytes required for `count` candidates is returned.
   *   - otherwise: NMS is run for every image in the batch on `stream`.
   *
   * @param batch_size        number of images to process
   * @param inputs            device pointers: [0] scores (float), [1] boxes (float4),
   *                          [2] classes (float), [3] points (float4), each holding
   *                          `count` entries per image
   * @param outputs           device pointers with the same ordering as `inputs`, each
   *                          holding `detections_per_im` entries per image
   * @param count             number of candidates per image
   * @param detections_per_im maximum number of detections written per image
   * @param nms_thresh        threshold forwarded to nms_kernel
   * @param workspace         device scratch buffer (or nullptr for a size query)
   * @param workspace_size    size of `workspace` in bytes (or 0 for a size query)
   * @param stream            CUDA stream all work is enqueued on
   * @return required scratch size in query mode; otherwise 0 on success and a
   *         non-zero value if a CUDA call or the kernel launch reported an error.
   *
   * Only out_scores is zero-padded when fewer than `detections_per_im` detections
   * survive; the remaining output slots of boxes/classes/points are left untouched.
   */
  int YoloLayerPlugin::nms_fun(int batch_size, void **inputs, void *const* outputs, size_t count, int detections_per_im, float nms_thresh, void *workspace, size_t workspace_size, cudaStream_t stream) const {

        if (!workspace || !workspace_size) {
            // Return required scratch space size cub style
            workspace_size  = get_size_aligned<bool>(count);  // flags
            workspace_size += get_size_aligned<int>(count);   // indices
            workspace_size += get_size_aligned<int>(count);   // indices_sorted
            workspace_size += get_size_aligned<float>(count); // scores
            workspace_size += get_size_aligned<float>(count); // scores_sorted

            // Ask CUB how much temporary storage each primitive needs; the two
            // primitives never run concurrently, so the larger request suffices.
            size_t temp_size_flag = 0;
            cub::DeviceSelect::Flagged((void *)nullptr, temp_size_flag,
                cub::CountingInputIterator<int>(count),
                (bool *)nullptr, (int *)nullptr, (int *)nullptr, count);
            size_t temp_size_sort = 0;
            cub::DeviceRadixSort::SortPairsDescending((void *)nullptr, temp_size_sort,
                (float *)nullptr, (float *)nullptr, (int *)nullptr, (int *)nullptr, count);
            workspace_size += std::max(temp_size_flag, temp_size_sort);

            return workspace_size;
        }

        auto on_stream = thrust::cuda::par.on(stream);

        // Carve the typed scratch arrays out of the workspace, in the same order
        // the size query above accounted for them. What is left in
        // workspace/workspace_size afterwards is handed to CUB as temp storage.
        auto flags = get_next_ptr<bool>(count, workspace, workspace_size);
        auto indices = get_next_ptr<int>(count, workspace, workspace_size);
        auto indices_sorted = get_next_ptr<int>(count, workspace, workspace_size);
        auto scores = get_next_ptr<float>(count, workspace, workspace_size);
        auto scores_sorted = get_next_ptr<float>(count, workspace, workspace_size);

        for (int batch = 0; batch < batch_size; batch++) {
            // Offsets are computed in size_t so large batch * count products cannot overflow int.
            const size_t in_offset = static_cast<size_t>(batch) * count;
            const size_t out_offset = static_cast<size_t>(batch) * detections_per_im;

            auto in_scores = static_cast<const float *>(inputs[0]) + in_offset;
            auto in_boxes = static_cast<const float4 *>(inputs[1]) + in_offset;
            auto in_classes = static_cast<const float *>(inputs[2]) + in_offset;
            // NOTE(review): points are read as one float4 per candidate so that the
            // gather below matches the float4 output element type (the previous
            // `const float *` view could not be gathered into a float4 output).
            // Confirm against the layer that produces inputs[3].
            auto in_points = static_cast<const float4 *>(inputs[3]) + in_offset;

            auto out_scores = static_cast<float *>(outputs[0]) + out_offset;
            auto out_boxes = static_cast<float4 *>(outputs[1]) + out_offset;
            auto out_classes = static_cast<float *>(outputs[2]) + out_offset;
            auto out_points = static_cast<float4 *>(outputs[3]) + out_offset;

            // Discard null scores
            thrust::transform(on_stream, in_scores, in_scores + count, flags, thrust::placeholders::_1 > 0.0f);

            // indices_sorted is not needed until the first sort, so its first
            // element doubles as the device-side "number selected" counter.
            int *num_selected = reinterpret_cast<int *>(indices_sorted);
            cub::DeviceSelect::Flagged(workspace, workspace_size, cub::CountingInputIterator<int>(0), flags, indices, num_selected, count, stream);
            // The counter is read on the host next, so the stream must have finished.
            if (cudaStreamSynchronize(stream) != cudaSuccess) {
                return 1;
            }
            int num_detections = *thrust::device_pointer_cast(num_selected);

            // Sort scores and corresponding indices
            thrust::gather(on_stream, indices, indices + num_detections, in_scores, scores);
            cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size, scores, scores_sorted, indices, indices_sorted, num_detections, 0, sizeof(*scores) * 8, stream);

            // Launch actual NMS kernel - 1 block with each thread handling n detections
            const int max_threads = 1024;
            const int num_per_thread = (num_detections + max_threads - 1) / max_threads; // integer ceil-div
            nms_kernel<<<1, max_threads, 0, stream>>>(num_per_thread, nms_thresh, num_detections,
                indices_sorted, scores_sorted, in_classes, in_boxes);
            // Launch-configuration errors are only visible through cudaGetLastError().
            if (cudaGetLastError() != cudaSuccess) {
                return 1;
            }

            // Re-sort with updated scores
            cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
                scores_sorted, scores, indices_sorted, indices, num_detections, 0, sizeof(*scores) * 8, stream);

            // Gather filtered scores, boxes, classes
            num_detections = std::min(detections_per_im, num_detections);
            cudaMemcpyAsync(out_scores, scores, num_detections * sizeof *scores, cudaMemcpyDeviceToDevice, stream);
            if (num_detections < detections_per_im) {
                thrust::fill_n(on_stream, out_scores + num_detections, detections_per_im - num_detections, 0);
            }
            thrust::gather(on_stream, indices, indices + num_detections, in_boxes, out_boxes);
            thrust::gather(on_stream, indices, indices + num_detections, in_classes, out_classes);
            thrust::gather(on_stream, indices, indices + num_detections, in_points, out_points);

            #ifdef YOLO_NMS_DEBUG
            {
                // Debug dump of the first few point values of this image. Compiled
                // out by default: it forces a host sync per image and prints on
                // every call.
                float tmp[10] = {0.0f};
                // Never read past this image's slice of out_points (4 floats per detection).
                const size_t available = static_cast<size_t>(detections_per_im) * 4;
                const size_t to_copy = std::min<size_t>(10, available);
                cudaMemcpyAsync(tmp, out_points, to_copy * sizeof(float), cudaMemcpyDeviceToHost, stream);
                cudaStreamSynchronize(stream); // make sure tmp is populated before printing
                printf("out_points %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f\n", tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5], tmp[6], tmp[7], tmp[8], tmp[9]);
            }
            #endif
        }

        return 0;
  }

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

C基础

c

算法

开发语言

cuda nms 的相关文章

如何使用.NET Core（FtpWebRequest）通过squid代理通过FTP获取文件？

根据https learn microsoft com en us dotnet api system net ftpwebrequest proxy view netcore 3 1 https learn microsoft com e
“RouteCollection”不包含“MapMvcAttributeRoutes”的定义

我尝试使用基于属性的路由但是当我尝试以下代码片段来激活基于属性的路由时我收到以下错误消息 RouteCollection 不包含定义 MapMvcAttributeRoutes 这是我的代码 public class RouteConf
如何重命名序列化对象列表后生成的 XML 属性

我正在序列化对象列表List
is_integral 与 is_integer：其中之一是多余的吗？

是积分 http en cppreference com w cpp types is integral and 是整数 http en cppreference com w cpp types numeric limits is inte
C# 并行与并行线程代码性能

我一直在测试 System Threading Parallel 与线程的性能我很惊讶地发现并行比线程花费更长的时间来完成任务我确信这是由于我对并行的了解有限我刚刚开始阅读我想我会分享一些片段如果有人可以向我指出并行代码比线程代码
当 f & g 修改同一个全局变量时，表达式 f() > g() 的值是否未定义或未指定？

UPDATE 由用户标记ecatmur 它是重复的在 C99 中 f g 是未定义还是只是未指定 https stackoverflow com questions 3951017 in c99 is fg undefined or mer
将 void *user_data 转换为对象

我该如何将 void *something 转换为标准 C++ 中的对象？具体来说，我想把 void *userdata 转换为 std::map
计算复杂数组的abs()值的最快方法

我想计算 C 或 C 中复杂数组元素的绝对值最简单的方法是 for int i 0 i lt N i b i cabs a i 但对于大向量来说速度会很慢有没有办法加快速度例如使用并行化语言可以是 C 或 C 鉴于所有循环迭代都是
增量决策树 C++ 实现

有谁知道决策树分类器的增量实现吗这样当您将新实例添加到训练集中时它可以根据现有决策树分类器以低计算量并尽可能快地生成最佳决策树分类器换句话说我有一个最优决策树分类器集A 其中命名为T 1 现在我想添加实例X to set A并找到
我要恢复我的记忆！我怎样才能真正处理一个控件？

我正在制作一个应用程序它创建大量的窗口控件按钮和标签等它们都是通过函数动态生成的我遇到的问题是当我删除控件并处置它们时它们不会从内存中删除 void loadALoadOfStuff while tabControlToClea
修剪 UIImage 边框

这是我想要修剪的图像的示例我想去掉图像周围的边框在本例中是顶部和底部的黑条我在Github上找到了一个库 CKImageAdditions https github com cmkilger CKImageAdditions 但是它似
多维数组和指向指针的指针

创建多维数组时char a 10 10 根据我的书它说你必须使用类似于char a 10 将数组传递给函数为什么必须这样指定长度您不是只是将双指针传递给 with 并且该双指针不是已经指向分配的内存吗那么为什么参数不能是char a
如何在Azure功能中添加razor视图文件？

我正在创建一个应用程序它是 azure 函数项目我想在该项目中使用 Razor 视图我应该在 azure 函数中使用任何模板引擎吗得益于一些方面的进步剃刀之光项目 https github com toddams RazorLigh
除空字符串外的任何内容的正则表达式

是否可以使用正则表达式来检测任何不是空字符串的内容如下所示 string s1 string s2 string s3 string s4 etc 我知道我可以使用修剪等但我想使用正则表达式 s 将匹配任何包含至少一个非空格字符的字
C++：LPWSTR 在 cout 中打印为地址

我有一个类型变量LPTSTR 我打印到std cout with lt lt 在 ANSI 系统中不知道它是在哪里确定的它工作得很好它打印了字符串现在在 Unicode 系统中我得到的是十六进制地址而不是字符串那么为什么LP
使用 _Alignas 进行结构成员对齐

我想知道以下问题：C11 中新增的 _Alignas 对齐说明符适用于结构成员吗？我一直这样假设，但彻底阅读 N1570 公开草案后，似乎表明对齐说明符不能出现在说明符限定符列表（specifier-qualifier-list）中，而这正是我期望它出现的位置（如果得到支持的话）。我已经读过几遍语
将多个 Blob 输入传递到 QueueTrigger Azure 函数的最佳方法

问题触发后生成 3 个 XML 文件完成后将它们通过 ftp 传输到站点目前的方法我有一个 HTTP 触发器 Azure 函数运行时将构造 3 个 XML 文件并将它们保存到 Azure 存储 Blob 容器中由于有多个输出
在Framework 4.6项目中使用.net core DLL

我已经在 net core 2 0 中构建了一个 DLL 现在我想在使用 net 4 6 1 框架的 WinForms 项目中使用它我可以引用该 dll 但收到 System IO FileLoadException 表示找不到 Syst
是否可以编写一个在另一个 Windows 应用程序中选择文本时收到通知的 Windows 应用程序？

我很好奇是否可以编写一个程序来监视我的文本选择一种可能的用途是编写一个与编辑器 IDE 无关的代码格式化程序应用程序服务 P 启动并以某种方式挂接到窗口中以便在任何窗口中选择文本时收到通知启动其他一些应用程序 A 用户选择 A 中
如何使用 Ioc Unity 注入依赖属性

我有以下课程 public interface IServiceA string MethodA1 public interface IServiceB string MethodB1 public class ServiceA IServ

随机推荐

在Ubuntu中安装eclipse

1 下载JDK和eclipse jdk下载网址 http www oracle com technetwork java javase downloads jdk8 downloads 2133151 html eclipse下载网址 ht
AD20铺铜操作及设置

AD20铺铜方法首先建议铺铜前先滴泪提高信号完整性 1 铺铜放置 gt 铺铜 2 铺铜时先设置属性如果以前设置过不用重设点键盘上的 Tab 键调出属性界面 3 操作沿着四个点到第4点时点鼠标左键完成划区域如
stm32同芯片但不同flash工程更换Device出现报错

目录 1 问题描述 2 解决方案 1 问题描述 stm32同芯片但不同flash工程更换Device出现报错 2 解决方案更换Device 我是从ZE换为C8 把这个从HD更换为MD 解决
Three.js - 透视相机（PerspectiveCamera）（三）

简介在three js中摄像机的作用就是不断的拍摄我们创建好的场景然后通过渲染器渲染到屏幕中想通过不同的角度观看场景就需要修改摄像机的位置来拍摄场景本文详细介绍的是透视相机 PerspectiveCamera 它是用来模拟人眼所
OpenWrt自定义luci页面来修改配置文件

在使用OpenWrt路由器的过程中经常需要根据需要改改配置文件然后重新启动服务什么的一般的做法是SSH登录路由器后台使用vi编辑器修改文件然后使用 etc init d xxxx restart 来重启服务次数多了就会觉得很繁琐
一步步写嵌入式操作系统中断处理

简单的中断处理程序简单的中断处理程序 1 获取被中断模式的将要执行的指令的地址到LR 2 将LR压入中断模式栈 3 将pc置为公共的中断服务函数入口地址并记录下一条指令地址到LR 4 从公共的中断服务函数返回 5 从spsr恢复被中断模
Ribbon负载均衡器

两种 1 1 集中式负载均衡服务端负载均衡硬件 nginx 轮询负载哈希随机权重为什么要做负载均衡 1 2 客户端负载均衡器用客户端负载均衡器很多机制可以自定义小知识不想让别人调自己只想用别人的怎么做只需要不注
用simulink 模型自动生成代码之 SPWM

正弦PWM的信号波为正弦波就是正弦波等效成一系列等幅不等宽的矩形脉冲波形其脉冲宽度是由正弦波和三角波自然相交生成的正弦波波形产生的方法有很多种但较典型的主要有对称规则采样法不对称规则采样法和平均对称规则采样法三种第一种方法由于
qt工具栏和菜单栏

以前用过qt 但是老是忘得现查现记录如下 1 一个menubar可以有多个menu 2 一个menu可以有多个action 菜单栏里的各项叫做action 而不是Menu action可以当作实体 3 每个action对应事件比如 1
Git 介绍

一理解 Git 1 分布式版本控制 Git 版本控制系统的设计思想是去中心化传统的 CVS SVN 等工具采用的是 C S 架构只有一个中心代码仓库位于服务器端而一旦由于服务器系统宕机网络不通等各种原因造成中心仓库不可用整个
使用taro框架注意避免的一些问题

1 参数名的问题 Taro request url path data prams header 这里注意header是没有s的不然消息体里的数据就会是 object Object Content Type application jso
com.google.zxing.NotFoundException 问题分析

这仅仅是一篇问题分析哈提供理解这个问题的思路并不是解决这个问题的方法背景先说背景项目中需要一个扫描二维码的功能网上找了一个比较火的 BGAQRCode Android 用了一圈感觉还不错但是在扫描页面 logcat 总是报 W
redhat中文文件名、文件夹乱码问题解决

redhat在没有安装中文rpm包之前中文会显示为乱码的小方块字样利用ssh客户端在上传中文文件名的文件或文件夹时均不能识别中文给开发应用造成很大的困扰首先安装fonts chinese 3 02 9 6 el5 noarch r
在服务器上安装vasp如何得到输出文件,科学网—VASP各输出文件解读-更新中 - 叶小球的博文...

PROCAR file For static calculations the file PROCAR contains the spd and site projected wave function character of each
单片机MPU9250/6050陀螺仪芯片驱动

单片机MPU9250 6050陀螺仪芯片驱动 CubeMX配置驱动代码项目需要陀螺仪检测设备位置角度信息所以就有了本文章代码借鉴了github上大佬写的应用了卡尔曼滤波关于寄存器的说明参考当然驱动代码中也附带了说明 https
用jquery实现仿淘宝焦点图的动画
百万前端之js生成用户登录图形验证码

用户登录的图形验证码 jquey生成引入图形验证码和前端判断是否正确参考代码如下 css login title width 20 height 3rem margin 0 auto margin top 2rem text align
如何设计一个数据库

前言我们知道软件工程是为了解决软件危机的它是采用工程的概念原理技术和方法来开发与维护软件把经过时间考验而证明正确的管理技术和当前能够得到的最好的技术方法结合起来在软件开发的过程中数据库设计是非常重要的它需要根据需求分析设抽
Android合并音频文件

java view plain copy 需求将两个amr格式音频文件合并为1个注意 amr格式的头文件为6个字节的长度 param partsPaths 各部分路径 param unitedFilePath 合并后路径 public
cuda nms

int YoloLayerPlugin nms fun int batch size void inputs void const outputs size t count int detections per im float nms t

cuda nms

cuda nms 的相关文章

随机推荐

热门标签