caffe模型TensorRT部署实践（一）

2023-05-16

参考代码

TensorRT安装包下的samples/sampleMNIST/sampleMNIST.cpp

1.设置使用的gpu id，如果不设置，默认使用第0块。

// Select GPU 3 for this process (device 0 is used when this call is omitted).
// The result is checked so that an invalid device index fails here rather than
// in a later, unrelated CUDA call.
CHECK(cudaSetDevice(3));

2.定义模型的输入输出和logger

static const int INPUT_H = 299; // input image height
static const int INPUT_W = 299;// input image width
static const int CHANNELS = 3;// number of input image channels
static const int OUTPUT_SIZE = 1536;// dimension of the output feature vector
static Logger gLogger;

const char* INPUT_BLOB_NAME = "data";// name of the input layer as defined in the deploy file
const char* OUTPUT_BLOB_NAME = "pool_8x8_s2";// name of the output layer as defined in the deploy file

3.定义GIE模型，并将训练好的caffe模型转换到GIE模型

// create a GIE model from the caffe model and serialize it to a stream
IHostMemory *gieModelStream{nullptr};
caffeToGIEModel("deploy.prototxt", "inceptionv4.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, 1, gieModelStream);

4.准备输入图像，可以采用opencv读取，也可用其他方式，根据情况编写预处理部分，最终存入一个float*中

    float data[INPUT_H*INPUT_W*CHANNELS];

    cv::Mat im = imread("gap.jpg") ;
    cv::resize(im, im, cv::Size(INPUT_W, INPUT_H));
    int mean_data[] = {104, 117, 123}; //均值
    float *pdata = data;
    for(int c = 0; c < CHANNELS; ++c)
    {
        for(int h = 0; h < INPUT_H; ++h)
        {
            for(int w = 0; w < INPUT_W; ++w)
            {
                *pdata++ = float(im.at<Vec3b>(h,w)[c] - mean_data[c]) ;
            }
        }
    }

5. 反序列化前向引擎

    // deserialize the engine
    IRuntime* runtime = createInferRuntime(gLogger);
    ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), nullptr);
    if (gieModelStream) gieModelStream->destroy();

6.开始前向推断

    // Create the execution context used to run inference on the deserialized engine.
    IExecutionContext *context = engine->createExecutionContext();

    std::cout << "begin inference\n";
    // run inference
    // NOTE(review): CProTimer is not defined in this article; presumably it starts
    // timing when constructed - confirm against its implementation.
    CProTimer timet;
    // Host buffer that receives the OUTPUT_SIZE output values.
    float prob[OUTPUT_SIZE];
    // Batch size 1: uploads `data`, runs the network and downloads the result into `prob`.
    doInference(*context, data, prob, 1);

    // NOTE(review): GetTime(true) presumably returns the elapsed time; its unit and
    // the meaning of the flag are not visible here.
    std::cout << "end inference " << timet.GetTime(true) << "\n";

7.释放资源并输出结果

    // destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // print a histogram of the output distribution
    std::cout << "\n\n";
    for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    {
        std::cout << prob[i] << " ";
    }
    std::cout << std::endl;

caffeToGIEModel和doInference可参考开头给出的示例cpp。

/**
 * Builds a TensorRT (GIE) engine from a Caffe model and returns it in serialized form.
 *
 * @param deployFile     Caffe deploy prototxt name; resolved through locateFile().
 * @param modelFile      Caffe weights file name; resolved through locateFile().
 * @param outputs        Names of the blobs to mark as network outputs.
 * @param maxBatchSize   Maximum batch size the engine is built for; must be at least
 *                       as large as the batch used later for inference.
 * @param gieModelStream Receives the serialized engine.
 *
 * Every failure (model cannot be parsed, an output blob name does not exist, the
 * engine cannot be built or serialized) is reported through assert(), matching the
 * error handling already used in this file.
 */
void caffeToGIEModel(const std::string& deployFile,             // name for caffe prototxt
                     const std::string& modelFile,              // name for model
                     const std::vector<std::string>& outputs,   // network outputs
                     unsigned int maxBatchSize,                 // batch size - NB must be at least as large as the batch we want to run with
                     IHostMemory *&gieModelStream)    // output buffer for the GIE model
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);

    // parse the caffe model to populate the network, then set the outputs
    INetworkDefinition* network = builder->createNetwork();
    assert(network != nullptr);
    ICaffeParser* parser = createCaffeParser();
    assert(parser != nullptr);
    const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile, directories).c_str(),
                                                              locateFile(modelFile, directories).c_str(),
                                                              *network,
                                                              nvinfer1::DataType::kFLOAT);
    // parse() yields a null pointer when the prototxt/caffemodel cannot be read.
    assert(blobNameToTensor != nullptr && "failed to parse the caffe model");

    // specify which tensors are outputs
    for (const auto& s : outputs)
    {
        // find() yields a null pointer for a blob name that is not in the network;
        // catch the typo here instead of dereferencing null in markOutput().
        auto* tensor = blobNameToTensor->find(s.c_str());
        assert(tensor != nullptr && "requested output blob not found in the network");
        network->markOutput(*tensor);
    }

    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    // NOTE(review): 1 << 20 bytes (1 MiB) of workspace is taken over from the MNIST
    // sample; a larger network may need more - confirm for the model in use.
    builder->setMaxWorkspaceSize(1 << 20);

    ICudaEngine* engine = builder->buildCudaEngine(*network);
    assert(engine);

    // we don't need the network any more, and we can destroy the parser
    network->destroy();
    parser->destroy();

    // serialize the engine, then close everything down
    gieModelStream = engine->serialize();
    assert(gieModelStream != nullptr && "failed to serialize the engine");
    engine->destroy();
    builder->destroy();
    shutdownProtobufLibrary();
}
/**
 * Runs one synchronous forward pass.
 *
 * @param context   Execution context of an engine with exactly one input binding
 *                  (INPUT_BLOB_NAME) and one output binding (OUTPUT_BLOB_NAME).
 * @param input     Host buffer holding batchSize * CHANNELS * INPUT_H * INPUT_W floats.
 * @param output    Host buffer receiving batchSize * OUTPUT_SIZE floats.
 * @param batchSize Number of images in `input`.
 *
 * Device buffers and the CUDA stream are created and released on every call.
 */
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // input and output buffer pointers that we pass to the engine - the engine requires exactly
    // IEngine::getNbBindings() of these, but in this case we know that there is exactly one
    // input and one output.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // getBindingIndex() returns -1 for an unknown name, which must never be used as an index
    // into buffers[].
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    assert(inputIndex >= 0 && inputIndex < 2 && "input blob name not found in the engine");
    assert(outputIndex >= 0 && outputIndex < 2 && "output blob name not found in the engine");
    assert(inputIndex != outputIndex);

    // Byte counts are computed in size_t so that a large batch cannot overflow int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * CHANNELS * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    const bool enqueued = context.enqueue(batchSize, buffers, stream, nullptr);
    assert(enqueued && "IExecutionContext::enqueue failed");
    (void)enqueued; // keeps release builds (NDEBUG) free of an unused-variable warning
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Block until the copy back has finished; `output` is only valid after this point.
    CHECK(cudaStreamSynchronize(stream));

    // release the stream and the buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

8. 编写Makefile文件并编译

代码需依赖cuda, cudnn 和TensorRT库，gcc版本5.3以上，其他库可根据自身需要设定

# Install locations of the third-party dependencies; adjust them to the local machine.
OPENCV_INC_DIR="/user/3rdparty/opencv-3.1.0/include/"
OPENCV_LIB_DIR="/user/3rdparty/opencv-3.1.0/lib/"
CUDA_INC_DIR="/user/3rdparty/cuda/include/"
CUDA_LIB_DIR="/user/3rdparty/cuda/lib64/"
CUDNN_INC_DIR="/user/3rdparty/cudnn_7.0.5/include/"
CUDNN_LIB_DIR="/user/3rdparty/cudnn_7.0.5/lib64/"
TENSORRT_INC_DIR="/user/3rdparty/TensorRT-4.0.0.3/include/"
TENSORRT_LIB_DIR="/user/3rdparty/TensorRT-4.0.0.3/lib/"

# Put the gcc 5.3 toolchain first on PATH for the recipes below.
# The reference must be written $(PATH): make reads "$PATH" as "$(P)ATH", which
# would replace the search path with the literal text "ATH". ":=" is needed because
# a recursively expanded variable may not refer to itself.
export PATH := /user/3rdparty/gcc-5.3.0/bin:$(PATH)

# Header search paths.
INCLUFLAGS = -I${OPENCV_INC_DIR} \
             -I${CUDA_INC_DIR} -I${CUDNN_INC_DIR}\
             -I../common/ \
             -I${TENSORRT_INC_DIR}

# Library search paths and the libraries to link against.
LIBFLAGS = -L${OPENCV_LIB_DIR} -lopencv_imgcodecs -lopencv_imgproc -lopencv_core -lopencv_highgui \
           -L${CUDA_LIB_DIR} -L${CUDNN_LIB_DIR} -lcudnn -lcublas -lcudart_static -lnvToolsExt -lcudart \
           -L${TENSORRT_LIB_DIR} -lnvinfer -lnvparsers -lnvinfer_plugin

LIBFLAGS += -lrt -ldl -lpthread

SOURCES = main.cpp

CXXFLAGS = -Wall -std=c++11

EXE = inceptionv4_tensorrt

OBJECTS = $(subst .c,.o,$(SOURCES:%.cpp=%.o))

# Neither target creates a file carrying its own name.
.PHONY: all clean

# Recipe lines must begin with a tab character, not spaces.
all:
	g++ -o $(EXE) $(SOURCES) $(CXXFLAGS) $(INCLUFLAGS) $(LIBFLAGS)
clean:
	rm -f $(OBJECTS) $(EXE)

9.精度和速度对比

TensorRT的float32模型与原始caffe精度基本无差异，但速度快很多，单batch的平均gpu前向速度是原始caffe模型的4～5倍左右，优化还是很给力。

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

caffe

Tensorrt

部署实践

caffe模型TensorRT部署实践（一）的相关文章

ubuntu16.04 安装CUDA 8.0 和 cuDNN 5.1 /cudnn6.0，可适用于gpu版本的(tensorflow,caffe,mxnet)

转载https zhuanlan zhihu com p 27890924文章略有修改感谢原作者环境 ubuntu 16 04 GTX 960 安装步骤安装Nvidia驱动系统设置 gt 软件与更新 gt 附加驱动如下图选择nv
TensorRT Samples: GoogleNet

关于TensorRT的介绍可以参考 http blog csdn net fengbingchun article details 78469551 以下是参考TensorRT 2 1 2中的sampleGoogleNet cpp文件改写的
Caffe源码中caffe.proto文件分析

Caffe源码 caffe version 09868ac date 2015 08 15 中有一些重要文件这里介绍下caffe proto文件在src caffe proto目录下有一个caffe proto文件 proto目录下除了
Tensorflow的非对称填充假设

为什么 TensorFlow 选择在右下角填充 With SAME填充对我来说在第一个真实像素处启动内核的中心锚点是合乎逻辑的由于使用了不对称填充这导致与其他一些框架存在差异我确实明白原则上不对称填充是好的因为否则会留下未使用
Caffe sigmoid交叉熵损失

我正在使用 sigmoid 交叉熵损失函数来解决多标签分类问题如下所示本教程然而在他们的教程结果和我的结果中输出预测都在范围内 Inf Inf 而 sigmoid 的范围是 0 1 sigmoid 仅在反向传播中处理吗也就是说前
使用tensorrt加速深度学习模型推断

使用tensorrt加速深度学习模型推断 1 import以及数据加载构建engine函数 2 导入官方模型及CIFAR100数据集 3 不采用tensort的推断时间 4 采用tensort加速使用tensorrt 库 4 1 导出o
Caffe Sigmoid交叉熵损失层损失函数

我正在查看Caffe的代码Sigmoid 交叉熵损失层 https github com BVLC caffe blob master src caffe layers sigmoid cross entropy loss layer cp
如何在caffe中将多个N维数组输入到网络中？

我想在 caffe 中创建一个用于语义分割的自定义损失层需要多个输入我希望这个损失函数有一个额外的输入因子以惩罚小物体的漏检为此我创建了一个图像 GT 其中每个像素都包含一个权重如果像素属于小物体则权重较高我是 caffe
OpenCV 深度学习人脸检测函数“cv::dnn::ConvolutionLayerImpl::getMemoryShapes”中的断言错误

我跟着tutorial https www pyimagesearch com 2018 02 26 face detection with opencv and deep learning 实现人脸检测image https ibb co
Caffe 中的批处理模式 - 没有性能提升

继这个线程 https stackoverflow com q 32504394 1103412我重新实现了图像处理代码以便一次发送 10 张图像即我现在将输入 blob 的 num 属性设置为 100 而不是 10 然而处理这批货所
Caffe的Python接口：“导入caffe”时出错

我正在尝试在 Caffe 的 Python 界面中运行它我已经运行了命令make pycaffe在 caffe 目录中并且运行良好现在当我运行命令时import caffe在终端的 python 环境中 Ubuntu 14 04 我收
Caffe：如何通过代码获取`solver.prototxt`参数？

我想访问solver prototxt参数如base lr 基础学习率或weight decay来自Python代码有什么方法可以从solver net目的谢谢根据本教程 http nbviewer jupyter org gith
GoogLeNet 模型的微调

我从头开始训练 GoogLeNet 模型但它并没有给我带来有希望的结果作为替代方案我想在我的数据集上对 GoogLeNet 模型进行微调有谁知道我应该遵循什么步骤假设您正在尝试进行图像分类这些应该是微调模型的步骤 1 分类层原
Caffe 求解器中的average_loss 字段有什么用？

有什么用average loss 有人可以举个例子或者用通俗易懂的语言解释一下吗您可以在caffe proto https github com BVLC caffe blob master src caffe proto caffe p
PyInstaller“ValueError：太多值无法解压”

pyinstaller 版本 3 2 操作系统 win10 我的 python 脚本在 Winpython Python 解释器中运行良好但是当我使用 Pyinstaller 包时 python 脚本包含 caffe 模块我将面临的问题
nvcc fatal：安装 cuda 9.1+caffe+openCV 3.4.0 时不支持 gpu 架构“compute_20”

我已经安装了CUDA 9 1 cudnn 9 1 opencv 3 4 0 caffe 当我尝试跑步时make all j8 in caffe目录下出现这个错误 nvcc fatal 不支持的 GPU 架构 compute 20 我尝试过
Caffe：如果内存中只能容纳一小部分，我该怎么办？

我正在尝试训练一个非常大的模型因此我只能将非常小的批量大小放入 GPU 内存中处理小批量的结果非常噪声梯度估计 https stackoverflow com a 33717093 1714410 我该怎么做才能避免这个问题您可以更
如何加载 caffe 模型并转换为 numpy 数组？

我有一个 caffemodel 文件其中包含 ethereon 的 caffe tensorflow 转换实用程序不支持的层我想生成我的咖啡模型的 numpy 表示我的问题是如何将 caffemodel 文件我还有 prototx
Caffe，在层中设置自定义权重

I have a network In one place I want to use concat As on this picture 不幸的是该网络无法训练为了理解为什么我想连续改变权重这意味着 FC4096 中的所有值一开始都
如何在 Caffe 中沿着通道分割 Blob

我想在Caffe中分割Blob通道这样我就可以分割一个Blob N c w h 分成两个大小相同的输出 Blob N c 2 w h 我上面描述的是非常笼统的我实际上想做的是将一个两通道输入图像分离成两个不同的图像一个进入卷积层另一

随机推荐

C++ 通过TCP Socket实现简单Http服务器

实现一个简单的Http服务器 xff0c 基于windows 平台总共五个文件 HttpServer hpp HttpServer cpp Utils hpp Utils cpp main cpp Utils hpp span class
Java并发编程--自旋锁的实现和使用

1 自旋锁介绍自旋锁是这样一类锁：当线程等待加锁时，不会阻塞，不会进入等待状态，而是保持运行状态大致的思路是：让当前线程不停地的在循环体内执行，当循环的条件被其他线
[Ubuntu] 可用云盘-尝试记录

Ubuntu可用云盘 xff1a https github com Aruelius cloud189 说明 xff1a 这是一个调用天翼云API的开源命令行工具 xff0c 基于Python编写 xff1b 天翼云官方没有客户端支持Lin
【无人机】基于遗传算法实现无人机编队位置规划附matlab代码

1 内容介绍现代社会的无人机成本造价低不易损耗轻巧灵便易躲藏能精确打击目标这些特点 xff0c 使其在一些高危任务中发挥了不可替代的作用 5 无人机的用处主要有两种 xff1a 民用和军事在民用方面 xff0c 我们可以运用无
【路径规划】基于DWA实现机器人动态避障附matlab代码

1 内容介绍 DWA 算法是基于机器人运动学与动力学理论的一种局部避障算法 xff0c 它将对机器人的位置控制转换为对机器人的速度控制 DWA 算法可以概括为三步一是根据机器人自身的限制以及环境制约将速度的采样空间约束在一定范围内二是根
飞控pixhawk硬件框架

本文转载于 xff1a https blog csdn net csshuke article details 78952026 xfeff xfeff 1 Phxhawk连接线路 2 Phxhawk硬件芯片列表处理器 STM32F427
PCB_layout_misc

AD的规则设置参考 https blog csdn net geek monkey article details 80249217 一些PCB厂家的工艺嘉立创https www jlc com portal vtechnology ht
怎样把经纬度坐标转换为空间直角坐标

怎样把经纬度坐标转换为空间直角坐标假设你的空间直角坐标以地球球心为原点原点到北极为正z轴原点到经纬度 0 0 为正x轴那么纬度a 北正南负经度b 东正西负的空间直角坐标为 x 61 Rcos a cos b y 61 Rcos
APM添加参数

APM添加参数参考 https ardupilot org dev docs code overview adding a new parameter html 添加应用参数模块例如 battery Parameters h k par
pixhawk6x/5x 电源插座/插头的型号

型号：molex CLIK Mate 5024430670 5024390600
intellij idea: git tag操作及 master branch相互合并操作

git tag和git branches区别 xff1f tag就像是里程碑标志的一个点 branch是一个新的征程的一条线 tag是静态的是只读的不能修改而branch是要往前走的稳定版本备份用tag 新功能开发多人用branch
C++之STL和Boost

最近一年我电话面试了数十位 C 43 43 应聘者 xff0c 惯用的暖场问题是工作中使用过 STL 的哪些组件 xff1f 使用过 Boost 的哪些组件 xff1f 得到的答案大多集中在 vector map 和 shared ptr
ubuntu 下安装intel realsense驱动

在安装之前一定要确保系统是ubuntu 14.04.3 64位！在安装之前一定要确保系统是ubuntu 14.04.3 64位！在安装之前一定要确保系统是ubuntu 14.04.3 64位！重要的事情说
windows下安装numpy,scipy遇到的问题总结

最近开始研究3D手势识别 xff0c 下载的源码包是基于python的 xff0c 需要用到扩展包numpy scipy等 xff0c 安装过程汇总遇到的问题总结如下 xff1a 1 安装numpy 下载numpy编译包 xff0c 进入该
Linux大数据处理踩坑实录

最近开发需要在linux服务器上做大数据处理 xff0c 由于对Linux开发并不是很熟悉 xff0c 因此踩了很多坑 xff0c 先作如下记录 xff1a 1 bash shell实现多进程背景如下需要从hadoop的hdfs上向服务
Deep Compression阅读理解及Caffe源码修改

更新 xff1a 没想到这篇文章写出后有这么多人关注和索要源码 xff0c 有点受宠若惊说来惭愧 xff0c 这个工作当时做的很粗糙 xff0c 源码修改的比较乱 xff0c 所以一直不太好拿出手最近终于有时间整理了一下代码并开源出来了
Tensorflow 离线安装跳坑总结

TensorFlow作为目前最被看好的深度学习开源框架 xff0c 又顶着Google爸爸的光环 xff0c 使得很多CNN网络的部署都基于此框架最近开始研究GAN xff08 生成对抗网络 xff09 xff0c 鉴于大部分源码都基于T
ARM NEON常用函数总结

NEON 技术是 ARM Cortex A 系列处理器的 128 位 SIMD xff08 单指令 xff0c 多数据 xff09 架构扩展 xff0c 旨在为消费性多媒体应用程序提供灵活强大的加速功能 xff0c 从而显著改善用户体验
TensorRT cuda8.0 cudnn 7.0.5 tar包安装

总体步骤参考这篇文章 https://zhuanlan.zhihu.com/p/35468450 1 准备环境 TensorRT 依赖cuda和cudnn，且根据下载的TensorRT版本，需要严格保证cuda和cudn
caffe模型TensorRT部署实践（一）

参考代码 TensorRT安装包下的samples/sampleMNIST/sampleMNIST.cpp 1.设置使用的gpu id，如果不设置，默认使用第0块。cudaSetDevice(3); //set device