54. Notes on YOLOv7: training, plus deployment to ncnn, MNN, the RK3399 Pro NPU, OpenVINO, the OAK VPU, and TensorRT

2023-05-16

Basic idea: document training YOLOv7 for head detection and deploying it on an OAK VPU camera and on an RK3399 Pro development board.

ubuntu@ubuntu:~/yolov7$ pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113

1. Prepare the dataset (based on a public dataset; it had some issues, which have been fixed)

Link: https://pan.baidu.com/s/1SUGKRgxeV7ddZpLdILhOwA?pwd=atjt  extraction code: atjt

Experimental data and models:

Link: https://pan.baidu.com/s/1mjYitnFfGvzLLcCzSQwLbg?pwd=7yaw  extraction code: 7yaw

2. Convert the existing dataset to YOLO format

ubuntu@ubuntu:~$ git clone https://github.com/WongKinYiu/yolov7.git
Cloning into 'yolov7'...
remote: Enumerating objects: 1094, done.
remote: Total 1094 (delta 0), reused 0 (delta 0), pack-reused 1094
Receiving objects: 100% (1094/1094), 69.89 MiB | 2.21 MiB/s, done.
Resolving deltas: 100% (517/517), done.

Download the pretrained weights and give them a quick test:

ubuntu@ubuntu:~/yolov7/weights$ wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt
ubuntu@ubuntu:~/yolov7/weights$ cd ..

Run a test:

ssh://ubuntu@10.10.71.226:22/usr/bin/python -u /home/ubuntu/yolov7/detect.py
Namespace(agnostic_nms=False, augment=False, classes=None, conf_thres=0.25, device='', exist_ok=False, img_size=640, iou_thres=0.45, name='exp', no_trace=False, nosave=False, project='runs/detect', save_conf=False, save_txt=False, source='inference/images', update=False, view_img=False, weights='yolov7.pt')
YOLOR 🚀 v0.1-116-g8c0bf3f torch 1.9.0+cu111 CUDA:0 (NVIDIA GeForce RTX 3060, 12053.0MB)

Fusing layers... 
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
/home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/functional.py:718: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at  /pytorch/c10/core/TensorImpl.h:1156.)
  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
Model Summary: 306 layers, 36905341 parameters, 6652669 gradients, 104.5 GFLOPS
 Convert model to Traced-model... 
 traced_script_module saved! 
 model is traced! 

4 persons, 1 bus, 1 tie, Done. (11.6ms) Inference, (3.0ms) NMS
 The image with the result is saved in: runs/detect/exp2/bus.jpg
5 horses, Done. (10.8ms) Inference, (0.4ms) NMS
 The image with the result is saved in: runs/detect/exp2/horses.jpg
2 persons, 1 tie, 1 cake, Done. (12.3ms) Inference, (0.4ms) NMS
 The image with the result is saved in: runs/detect/exp2/image1.jpg
2 persons, 1 sports ball, Done. (10.9ms) Inference, (0.3ms) NMS
 The image with the result is saved in: runs/detect/exp2/image2.jpg
1 dog, 1 horse, Done. (10.7ms) Inference, (0.3ms) NMS
 The image with the result is saved in: runs/detect/exp2/image3.jpg
3 persons, 1 tie, Done. (9.6ms) Inference, (0.4ms) NMS
 The image with the result is saved in: runs/detect/exp2/zidane.jpg
Done. (0.335s)

Process finished with exit code 0

Test results

3. Convert the dataset

1) Split the files

# coding:utf-8

import os
import random
import argparse

parser = argparse.ArgumentParser()
# Path to the XML files; adjust for your own data. XML annotations usually live under Annotations
parser.add_argument('--xml_path', default='/home/ubuntu/yolov7/trainData/Annotations', type=str, help='input xml label path')
# Where to write the split files; point this to ImageSets/Main under your data
parser.add_argument('--txt_path', default='/home/ubuntu/yolov7/trainData/ImageSets/Main', type=str, help='output txt label path')
opt = parser.parse_args()

trainval_percent = 1.0  # use all of the data
train_percent = 0.9     # within that, 90% train / 10% val
xmlfilepath = opt.xml_path
txtsavepath = opt.txt_path
total_xml = os.listdir(xmlfilepath)
if not os.path.exists(txtsavepath):
    os.makedirs(txtsavepath)

num = len(total_xml)
list_index = range(num)
tv = int(num * trainval_percent)
tr = int(tv * train_percent)
trainval = random.sample(list_index, tv)
train = random.sample(trainval, tr)

file_train = open(txtsavepath + '/train.txt', 'w')
file_val = open(txtsavepath + '/val.txt', 'w')

for i in list_index:
    name = total_xml[i][:-4] + '\n'
    if i in trainval:
        if i in train:
            file_train.write(name)
        else:
            file_val.write(name)


file_train.close()
file_val.close()
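
Assuming the script above is saved as split_train_val.py (a name chosen here for illustration), running it with its defaults writes train.txt and val.txt under ImageSets/Main:

ubuntu@ubuntu:~/yolov7$ python3 split_train_val.py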

2) Generate the labels

# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import os
from os import getcwd

sets = ['train', 'val']
classes = ["person"]  # 改成自己的类别voc2012


def convert(size, box):
    dw = 1. / (size[0])
    dh = 1. / (size[1])
    x = (box[0] + box[1]) / 2.0 - 1
    y = (box[2] + box[3]) / 2.0 - 1
    w = box[1] - box[0]
    h = box[3] - box[2]
    x = x * dw
    w = w * dw
    y = y * dh
    h = h * dh
    return x, y, w, h


def convert_annotation(image_id):
    in_file = open('/home/ubuntu/yolov7/trainData/Annotations/%s.xml' % (image_id))
    out_file = open('/home/ubuntu/yolov7/trainData/labels/%s.txt' % (image_id), 'w')
    tree = ET.parse(in_file)
    root = tree.getroot()
    size = root.find('size')
    w = int(size.find('width').text)
    h = int(size.find('height').text)
    for obj in root.iter('object'):
        # read the difficult flag, tolerating either capitalization or a missing tag
        difficult_node = obj.find('difficult')
        if difficult_node is None:
            difficult_node = obj.find('Difficult')
        difficult = difficult_node.text if difficult_node is not None else '0'
        cls = obj.find('name').text
        if cls not in classes or int(difficult) == 1:
            continue
        cls_id = classes.index(cls)
        xmlbox = obj.find('bndbox')
        b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text),
             float(xmlbox.find('ymax').text))
        b1, b2, b3, b4 = b
        # clamp annotations that run past the image border
        if b2 > w:
            b2 = w
        if b4 > h:
            b4 = h
        b = (b1, b2, b3, b4)
        bb = convert((w, h), b)
        out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')


wd = getcwd()
for image_set in sets:
    if not os.path.exists('/home/ubuntu/yolov7/trainData/labels/'):
        os.makedirs('/home/ubuntu/yolov7/trainData/labels/')
    image_ids = open('/home/ubuntu/yolov7/trainData/ImageSets/Main/%s.txt' % (image_set)).read().strip().split()
    list_file = open('/home/ubuntu/yolov7/trainData/%s.txt' % (image_set), 'w')
    for image_id in image_ids:
        list_file.write('/home/ubuntu/yolov7/trainData/images/%s.jpg\n' % (image_id))
        convert_annotation(image_id)
    list_file.close()
print("voc_scrip finish")

4. Modify coco.yaml

# COCO 2017 dataset http://cocodataset.org

# download command/URL (optional)
#download: bash ./scripts/get_coco.sh

# train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/]
train: /home/ubuntu/yolov7/trainData/train.txt
val: /home/ubuntu/yolov7/trainData/val.txt
#test: ./coco/test-dev2017.txt  # 20288 of 40670 images, submit to https://competitions.codalab.org/competitions/20794

# number of classes
nc: 1

# class names
names: [ 'person']

Modify yolov7.yaml:

nc: 1  # number of classes

5. Start training

ubuntu@ubuntu:~/yolov7$ python3 train.py --weights weights/yolov7.pt --cfg cfg/training/yolov7.yaml --data data/coco.yaml --device 0 --batch-size 8 --epoch 1000 --hyp data/hyp.scratch.p5.yaml

Test:

ubuntu@ubuntu:~/yolov7$ python3 detect.py --weights runs/train/exp3/weights/best.pt 
Namespace(agnostic_nms=False, augment=False, classes=None, conf_thres=0.25, device='', exist_ok=False, img_size=640, iou_thres=0.45, name='exp', no_trace=False, nosave=False, project='runs/detect', save_conf=False, save_txt=False, source='inference/images', update=False, view_img=False, weights=['runs/train/exp3/weights/best.pt'])
YOLOR 🚀 v0.1-116-g8c0bf3f torch 1.9.0+cu111 CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 3910.5MB)

Fusing layers... 
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
IDetect.fuse
/home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/functional.py:718: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at  /pytorch/c10/core/TensorImpl.h:1156.)
  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
Model Summary: 314 layers, 36481772 parameters, 6194944 gradients, 103.2 GFLOPS
 Convert model to Traced-model... 
 traced_script_module saved! 
 model is traced! 

3 persons, Done. (22.3ms) Inference, (5.9ms) NMS
 The image with the result is saved in: runs/detect/exp3/bus.jpg
Done. (18.4ms) Inference, (0.2ms) NMS
 The image with the result is saved in: runs/detect/exp3/horses.jpg
4 persons, Done. (21.1ms) Inference, (0.6ms) NMS
 The image with the result is saved in: runs/detect/exp3/image1.jpg
2 persons, Done. (18.7ms) Inference, (0.6ms) NMS
 The image with the result is saved in: runs/detect/exp3/image2.jpg
2 persons, Done. (17.9ms) Inference, (0.6ms) NMS
 The image with the result is saved in: runs/detect/exp3/image3.jpg
2 persons, Done. (15.5ms) Inference, (0.6ms) NMS
 The image with the result is saved in: runs/detect/exp3/zidane.jpg
Done. (0.550s)

Test results (embarrassing, right? The horse's face and the dog's face get detected as human heads...)

6. Export the model to ONNX (do not add NMS to the graph)

ubuntu@ubuntu:~/yolov7$ python3 export.py --weights runs/train/exp3/weights/best.pt --simplify --img-size 640
Import onnx_graphsurgeon failure: No module named 'onnx_graphsurgeon'
Namespace(batch_size=1, conf_thres=0.25, device='cpu', dynamic=False, dynamic_batch=False, end2end=False, fp16=False, grid=False, img_size=[640, 640], include_nms=False, int8=False, iou_thres=0.45, max_wh=None, simplify=True, topk_all=100, weights='runs/train/exp3/weights/best.pt')
YOLOR 🚀 v0.1-116-g8c0bf3f torch 1.9.0+cu111 CPU

Fusing layers... 
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
IDetect.fuse
/home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/functional.py:718: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at  /pytorch/c10/core/TensorImpl.h:1156.)
  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
Model Summary: 314 layers, 36481772 parameters, 6194944 gradients, 103.2 GFLOPS

Starting TorchScript export with torch 1.9.0+cu111...
TorchScript export success, saved as runs/train/exp3/weights/best.torchscript.pt
CoreML export failure: No module named 'coremltools'

Starting TorchScript-Lite export with torch 1.9.0+cu111...
TorchScript-Lite export success, saved as runs/train/exp3/weights/best.torchscript.ptl

Starting ONNX export with onnx 1.12.0...

Starting to simplify ONNX...
ONNX export success, saved as runs/train/exp3/weights/best.onnx

Export complete (11.29s). Visualize with https://github.com/lutzroeder/netron.
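
As an optional sanity check (this snippet is not part of the original workflow; it assumes onnxruntime is installed), the exported graph can be run once with a dummy input. Since the export used neither --grid nor --end2end, it should emit the three raw detection heads:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession('runs/train/exp3/weights/best.onnx')
inp = sess.get_inputs()[0]
dummy = np.zeros((1, 3, 640, 640), dtype=np.float32)
for out in sess.run(None, {inp.name: dummy}):
    # for nc=1, expect roughly (1, 3, 80, 80, 6), (1, 3, 40, 40, 6), (1, 3, 20, 20, 6)
    print(out.shape)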

7. Convert to ncnn; the official ncnn example can be used directly

ubuntu@ubuntu:~/ncnn/build/install/bin$ ./onnx2ncnn /home/ubuntu/yolov7/runs/train/exp3/weights/best.onnx /home/ubuntu/yolov7/runs/train/exp3/weights/best.param /home/ubuntu/yolov7/runs/train/exp3/weights/best.bin

Reference source code: https://github.com/Tencent/ncnn/tree/master/examples

In the generated .param file, the three Reshape sizes (the three red boxes in the screenshot) must be changed to -1, otherwise you will get garbage boxes.
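
For reference, the lines in question are the Reshape layers of the three detection heads. A hypothetical before/after sketch (layer and blob names will differ in your param file; the first line is the original, the second the fixed one, where the hard-coded element count becomes -1 so it is resolved at runtime):

Reshape          Reshape_211              1 1 in_blob out_blob 0=6400 1=3 2=6
Reshape          Reshape_211              1 1 in_blob out_blob 0=-1 1=3 2=6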

CMakeLists.txt

cmake_minimum_required(VERSION 3.16)
project(untitled22)
set(CMAKE_CXX_FLAGS "-std=c++11")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp ")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
set(CMAKE_CXX_STANDARD 11)
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${CMAKE_SOURCE_DIR}/include)
include_directories(${CMAKE_SOURCE_DIR}/include/ncnn)
find_package(OpenCV REQUIRED)
#message(STATUS ${OpenCV_INCLUDE_DIRS})
# add OpenCV header paths
include_directories(${OpenCV_INCLUDE_DIRS})
# link the OpenCV libraries (see target_link_libraries below)

add_library(libncnn STATIC IMPORTED)
set_target_properties(libncnn PROPERTIES IMPORTED_LOCATION ${CMAKE_SOURCE_DIR}/lib/libncnn.a)


add_executable(untitled22 main.cpp)
target_link_libraries(untitled22 ${OpenCV_LIBS} libncnn )

main.cpp

// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "layer.h"
#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

#define MAX_STRIDE 32

struct Object
{
    cv::Rect_<float> rect;
    int label;
    float prob;
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
{
    int i = left;
    int j = right;
    float p = objects[(left + right) / 2].prob;

    while (i <= j)
    {
        while (objects[i].prob > p)
            i++;

        while (objects[j].prob < p)
            j--;

        if (i <= j)
        {
            // swap
            std::swap(objects[i], objects[j]);

            i++;
            j--;
        }
    }

#pragma omp parallel sections
    {
#pragma omp section
        {
            if (left < j) qsort_descent_inplace(objects, left, j);
        }
#pragma omp section
        {
            if (i < right) qsort_descent_inplace(objects, i, right);
        }
    }
}

static void qsort_descent_inplace(std::vector<Object>& objects)
{
    if (objects.empty())
        return;

    qsort_descent_inplace(objects, 0, objects.size() - 1);
}

static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = faceobjects.size();

    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = faceobjects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = faceobjects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
                keep = 0;
        }

        if (keep)
            picked.push_back(i);
    }
}

static inline float sigmoid(float x)
{
    return static_cast<float>(1.f / (1.f + exp(-x)));
}

static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
{
    const int num_grid = feat_blob.h;

    int num_grid_x;
    int num_grid_y;
    if (in_pad.w > in_pad.h)
    {
        num_grid_x = in_pad.w / stride;
        num_grid_y = num_grid / num_grid_x;
    }
    else
    {
        num_grid_y = in_pad.h / stride;
        num_grid_x = num_grid / num_grid_y;
    }

    const int num_class = feat_blob.w - 5;

    const int num_anchors = anchors.w / 2;

    for (int q = 0; q < num_anchors; q++)
    {
        const float anchor_w = anchors[q * 2];
        const float anchor_h = anchors[q * 2 + 1];

        const ncnn::Mat feat = feat_blob.channel(q);

        for (int i = 0; i < num_grid_y; i++)
        {
            for (int j = 0; j < num_grid_x; j++)
            {
                const float* featptr = feat.row(i * num_grid_x + j);
                float box_confidence = sigmoid(featptr[4]);
                if (box_confidence >= prob_threshold)
                {
                    // find class index with max class score
                    int class_index = 0;
                    float class_score = -FLT_MAX;
                    for (int k = 0; k < num_class; k++)
                    {
                        float score = featptr[5 + k];
                        if (score > class_score)
                        {
                            class_index = k;
                            class_score = score;
                        }
                    }
                    float confidence = box_confidence * sigmoid(class_score);
                    if (confidence >= prob_threshold)
                    {
                        float dx = sigmoid(featptr[0]);
                        float dy = sigmoid(featptr[1]);
                        float dw = sigmoid(featptr[2]);
                        float dh = sigmoid(featptr[3]);

                        float pb_cx = (dx * 2.f - 0.5f + j) * stride;
                        float pb_cy = (dy * 2.f - 0.5f + i) * stride;

                        float pb_w = pow(dw * 2.f, 2) * anchor_w;
                        float pb_h = pow(dh * 2.f, 2) * anchor_h;

                        float x0 = pb_cx - pb_w * 0.5f;
                        float y0 = pb_cy - pb_h * 0.5f;
                        float x1 = pb_cx + pb_w * 0.5f;
                        float y1 = pb_cy + pb_h * 0.5f;

                        Object obj;
                        obj.rect.x = x0;
                        obj.rect.y = y0;
                        obj.rect.width = x1 - x0;
                        obj.rect.height = y1 - y0;
                        obj.label = class_index;
                        obj.prob = confidence;

                        objects.push_back(obj);
                    }
                }
            }
        }
    }
}

static int detect_yolov7(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net yolov7;

    yolov7.opt.use_vulkan_compute = true;
    // yolov7.opt.use_bf16_storage = true;

    // original pretrained model from https://github.com/WongKinYiu/yolov7
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    yolov7.load_param("/home/ubuntu/yolov7/runs/train/exp3/weights/best.param");
    yolov7.load_model("/home/ubuntu/yolov7/runs/train/exp3/weights/best.bin");

    const int target_size = 640;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // letterbox pad to multiple of MAX_STRIDE
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    int wpad = (w + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - w;
    int hpad = (h + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolov7.create_extractor();

    ex.input("images", in_pad);

    std::vector<Object> proposals;

    // stride 8
    {
        ncnn::Mat out;
        ex.extract("output", out);

        ncnn::Mat anchors(6);
        anchors[0] = 12.f;
        anchors[1] = 16.f;
        anchors[2] = 19.f;
        anchors[3] = 36.f;
        anchors[4] = 40.f;
        anchors[5] = 28.f;

        std::vector<Object> objects8;
        generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8);

        proposals.insert(proposals.end(), objects8.begin(), objects8.end());
    }

    // stride 16
    {
        ncnn::Mat out;

        ex.extract("534", out);

        ncnn::Mat anchors(6);
        anchors[0] = 36.f;
        anchors[1] = 75.f;
        anchors[2] = 76.f;
        anchors[3] = 55.f;
        anchors[4] = 72.f;
        anchors[5] = 146.f;

        std::vector<Object> objects16;
        generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16);

        proposals.insert(proposals.end(), objects16.begin(), objects16.end());
    }

    // stride 32
    {
        ncnn::Mat out;

        ex.extract("554", out);

        ncnn::Mat anchors(6);
        anchors[0] = 142.f;
        anchors[1] = 110.f;
        anchors[2] = 192.f;
        anchors[3] = 243.f;
        anchors[4] = 459.f;
        anchors[5] = 401.f;

        std::vector<Object> objects32;
        generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32);

        proposals.insert(proposals.end(), objects32.begin(), objects32.end());
    }

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    int count = picked.size();

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];

        // adjust offset to original unpadded
        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;

        // clip
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }

    return 0;
}

static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
            "person"
    };

    static const unsigned char colors[19][3] = {
            {54, 67, 244},
            {99, 30, 233},
            {176, 39, 156},
            {183, 58, 103},
            {181, 81, 63},
            {243, 150, 33},
            {244, 169, 3},
            {212, 188, 0},
            {136, 150, 0},
            {80, 175, 76},
            {74, 195, 139},
            {57, 220, 205},
            {59, 235, 255},
            {7, 193, 255},
            {0, 152, 255},
            {34, 87, 255},
            {72, 85, 121},
            {158, 158, 158},
            {139, 125, 96}
    };

    int color_index = 0;

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        const unsigned char* color = colors[color_index % 19];
        color_index++;

        cv::Scalar cc(color[0], color[1], color[2]);

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, cc, 2);

        char text[256];
        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cc, -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(255, 255, 255));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    cv::Mat m = cv::imread("/home/ubuntu/yolov7/inference/images/bus.jpg");
    if (m.empty())
    {
        fprintf(stderr, "cv::imread failed\n");
        return -1;
    }

    std::vector<Object> objects;
    detect_yolov7(m, objects);

    draw_objects(m, objects);

    return 0;
}
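
A minimal out-of-source build, assuming the layout implied by the CMakeLists.txt above (ncnn headers under include/, libncnn.a under lib/):

ubuntu@ubuntu:~/untitled22$ mkdir -p build && cd build && cmake .. && make
ubuntu@ubuntu:~/untitled22/build$ ./untitled22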

Test results

8. Convert to MNN and test

ubuntu@ubuntu:~/MNN/build$ ./MNNConvert -f ONNX --modelFile /home/ubuntu/yolov7/runs/train/exp3/weights/best.onnx --MNNModel /home/ubuntu/yolov7/runs/train/exp3/weights/best.mnn --bizCode MNN
Start to Convert Other Model Format To MNN Model...
[10:36:41] /home/ubuntu/MNN/tools/converter/source/onnx/onnxConverter.cpp:40: ONNX Model ir version: 6
Start to Optimize the MNN Net...
inputTensors : [ images, ]
outputTensors: [ 534, 554, output, ]
Converted Success!

CMakeLists.txt

cmake_minimum_required(VERSION 3.16)
project(untitled22)
set(CMAKE_CXX_FLAGS "-std=c++11")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp ")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
set(CMAKE_CXX_STANDARD 11)
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${CMAKE_SOURCE_DIR}/include)
include_directories(${CMAKE_SOURCE_DIR}/include/MNN)
find_package(OpenCV REQUIRED)
#message(STATUS ${OpenCV_INCLUDE_DIRS})
# add OpenCV header paths
include_directories(${OpenCV_INCLUDE_DIRS})
# link the OpenCV libraries (see target_link_libraries below)

add_library(libmnn SHARED IMPORTED)
set_target_properties(libmnn PROPERTIES IMPORTED_LOCATION ${CMAKE_SOURCE_DIR}/lib/libMNN.so)


add_executable(untitled22 main.cpp)
target_link_libraries(untitled22 ${OpenCV_LIBS} libmnn )

main.cpp


#include <iostream>
#include <algorithm>
#include <vector>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/opencv.hpp>
#include <cfloat>   // DBL_EPSILON used in GetIOU
#include <chrono>   // std::chrono timing below
#include <MNN/Interpreter.hpp>
#include <MNN/ImageProcess.hpp>
using namespace std;
using namespace cv;

typedef struct {
    int width;
    int height;
} YoloSize;


typedef struct {
    std::string name;
    int stride;
    std::vector<YoloSize> anchors;
} YoloLayerData;

class BoxInfo
{
public:
    int x1,y1,x2,y2,label,id;
    float score;
};

static inline float sigmoid(float x)
{
    return static_cast<float>(1.f / (1.f + exp(-x)));
}
double GetIOU(cv::Rect_<float> bb_test, cv::Rect_<float> bb_gt)
{
    float in = (bb_test & bb_gt).area();
    float un = bb_test.area() + bb_gt.area() - in;

    if (un < DBL_EPSILON)
        return 0;

    return (double)(in / un);
}
std::vector<BoxInfo> decode_infer(MNN::Tensor & data, int stride,  int net_size, int num_classes,
                                  const std::vector<YoloSize> &anchors, float threshold)
{
    std::vector<BoxInfo> result;
    int batchs, channels, height, width, pred_item ;
    batchs = data.shape()[0];
    channels = data.shape()[1];
    height = data.shape()[2];
    width = data.shape()[3];
    pred_item = data.shape()[4];

    auto data_ptr = data.host<float>();
    for(int bi=0; bi<batchs; bi++)
    {
        auto batch_ptr = data_ptr + bi*(channels*height*width*pred_item);
        for(int ci=0; ci<channels; ci++)
        {
            auto channel_ptr = batch_ptr + ci*(height*width*pred_item);
            for(int hi=0; hi<height; hi++)
            {
                auto height_ptr = channel_ptr + hi*(width * pred_item);
                for(int wi=0; wi<width; wi++)
                {
                    auto width_ptr = height_ptr + wi*pred_item;
                    auto cls_ptr = width_ptr + 5;

                    auto confidence = sigmoid(width_ptr[4]);

                    for(int cls_id=0; cls_id<num_classes; cls_id++)
                    {
                        float score = sigmoid(cls_ptr[cls_id]) * confidence;
                        if(score > threshold)
                        {
                            float cx = (sigmoid(width_ptr[0]) * 2.f - 0.5f + wi) * (float) stride;
                            float cy = (sigmoid(width_ptr[1]) * 2.f - 0.5f + hi) * (float) stride;
                            float w = pow(sigmoid(width_ptr[2]) * 2.f, 2) * anchors[ci].width;
                            float h = pow(sigmoid(width_ptr[3]) * 2.f, 2) * anchors[ci].height;

                            BoxInfo box;

                            box.x1 = std::max(0, std::min(net_size, int((cx - w / 2.f) )));
                            box.y1 = std::max(0, std::min(net_size, int((cy - h / 2.f) )));
                            box.x2 = std::max(0, std::min(net_size, int((cx + w / 2.f) )));
                            box.y2 = std::max(0, std::min(net_size, int((cy + h / 2.f) )));
                            box.score = score;
                            box.label = cls_id;
                            result.push_back(box);
                        }
                    }
                }
            }
        }
    }

    return result;
}

void nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH) {
    std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
    std::vector<float> vArea(input_boxes.size());
    for (int i = 0; i < int(input_boxes.size()); ++i) {
        vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
                   * (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
    }
    for (int i = 0; i < int(input_boxes.size()); ++i) {
        for (int j = i + 1; j < int(input_boxes.size());) {
            float xx1 = std::max(input_boxes[i].x1, input_boxes[j].x1);
            float yy1 = std::max(input_boxes[i].y1, input_boxes[j].y1);
            float xx2 = std::min(input_boxes[i].x2, input_boxes[j].x2);
            float yy2 = std::min(input_boxes[i].y2, input_boxes[j].y2);
            float w = std::max(float(0), xx2 - xx1 + 1);
            float h = std::max(float(0), yy2 - yy1 + 1);
            float inter = w * h;
            float ovr = inter / (vArea[i] + vArea[j] - inter);
            if (ovr >= NMS_THRESH) {
                input_boxes.erase(input_boxes.begin() + j);
                vArea.erase(vArea.begin() + j);
            } else {
                j++;
            }
        }
    }
}
void scale_coords(std::vector<BoxInfo> &boxes, int w_from, int h_from, int w_to, int h_to)
{
    float w_ratio = float(w_to)/float(w_from);
    float h_ratio = float(h_to)/float(h_from);


    for(auto &box: boxes)
    {
        box.x1 *= w_ratio;
        box.x2 *= w_ratio;
        box.y1 *= h_ratio;
        box.y2 *= h_ratio;
    }
    return ;
}

cv::Mat draw_box(cv::Mat & cv_mat, std::vector<BoxInfo> &boxes, const std::vector<std::string> &labels,unsigned char colors[][3])
{

    for(auto box : boxes)
    {
        int width = box.x2-box.x1;
        int height = box.y2-box.y1;
        cv::Point p = cv::Point(box.x1, box.y1);
        cv::Rect rect = cv::Rect(box.x1, box.y1, width, height);
        cv::rectangle(cv_mat, rect, cv::Scalar(colors[box.label][0],colors[box.label][1],colors[box.label][2]));
        string text = labels[box.label] + ":" + std::to_string(box.score) ;
        cv::putText(cv_mat, text, p, cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(colors[box.label][0],colors[box.label][1],colors[box.label][2]));
    }
    return cv_mat;
}

int main(int argc, char **argv) {


    std::vector<std::string> labels = {
            "person"
    };
    unsigned char colors[][3] = {
            {255, 0, 0}
    };

    cv::Mat bgr = cv::imread("/home/ubuntu/yolov7/inference/images/bus.jpg"); // note: plain resize below (no letterbox) differs from the original repo's preprocessing, which affects the results

    int target_size = 640;

    cv::Mat resize_img;
    cv::resize(bgr, resize_img, cv::Size(target_size, target_size));
    float cls_threshold = 0.25;

    // MNN inference
    auto mnnNet = std::shared_ptr<MNN::Interpreter>(
            MNN::Interpreter::createFromFile("/home/ubuntu/yolov7/runs/train/exp3/weights/best.mnn"));
    auto t1 = std::chrono::steady_clock::now();
    MNN::ScheduleConfig netConfig;
    netConfig.type = MNN_FORWARD_CPU;
    netConfig.numThread = 4;

    auto session = mnnNet->createSession(netConfig);
    auto input = mnnNet->getSessionInput(session, nullptr);

    mnnNet->resizeTensor(input, {1, 3, (int) target_size, (int) target_size});
    mnnNet->resizeSession(session);
    MNN::CV::ImageProcess::Config config;

    const float mean_vals[3] = {0, 0, 0};

    const float norm_255[3] = {1.f / 255, 1.f / 255.f, 1.f / 255};

    std::shared_ptr<MNN::CV::ImageProcess> pretreat(
            MNN::CV::ImageProcess::create(MNN::CV::BGR, MNN::CV::RGB, mean_vals, 3,
                                          norm_255, 3));

    pretreat->convert(resize_img.data, (int) target_size, (int) target_size, resize_img.step[0], input);


    mnnNet->runSession(session);

    std::vector<YoloLayerData> yolov7_layers{
            {"554",    32, {{142, 110}, {192, 243}, {459, 401}}},
            {"534",    16, {{36,  75}, {76,  55},  {72,  146}}},
            {"output", 8,  {{12,  16}, {19,  36},  {40,  28}}},
    };

    auto output = mnnNet->getSessionOutput(session, yolov7_layers[2].name.c_str());

    MNN::Tensor outputHost(output, output->getDimensionType());
    output->copyToHostTensor(&outputHost);

    // millisecond-level
    std::vector<float> vec_scores;
    std::vector<float> vec_new_scores;
    std::vector<int> vec_labels;
    int outputHost_shape_c = outputHost.channel();
    int outputHost_shape_d = outputHost.dimensions();
    int outputHost_shape_w = outputHost.width();
    int outputHost_shape_h = outputHost.height();

    printf("shape_d=%d shape_c=%d shape_h=%d shape_w=%d outputHost.elementSize()=%d\n", outputHost_shape_d,
           outputHost_shape_c, outputHost_shape_h, outputHost_shape_w, outputHost.elementSize());
    auto yolov7_534 = mnnNet->getSessionOutput(session, yolov7_layers[1].name.c_str());

    MNN::Tensor output_534_Host(yolov7_534, yolov7_534->getDimensionType());
    yolov7_534->copyToHostTensor(&output_534_Host);


    outputHost_shape_c = output_534_Host.channel();
    outputHost_shape_d = output_534_Host.dimensions();
    outputHost_shape_w = output_534_Host.width();
    outputHost_shape_h = output_534_Host.height();
    printf("shape_d=%d shape_c=%d shape_h=%d shape_w=%d output_534_Host.elementSize()=%d\n", outputHost_shape_d,
           outputHost_shape_c, outputHost_shape_h, outputHost_shape_w, output_534_Host.elementSize());

    auto yolov7_554 = mnnNet->getSessionOutput(session, yolov7_layers[0].name.c_str());

    MNN::Tensor output_554_Host(yolov7_554, yolov7_554->getDimensionType());
    yolov7_554->copyToHostTensor(&output_554_Host);

    outputHost_shape_c = output_554_Host.channel();
    outputHost_shape_d = output_554_Host.dimensions();
    outputHost_shape_w = output_554_Host.width();
    outputHost_shape_h = output_554_Host.height();
    printf("shape_d=%d shape_c=%d shape_h=%d shape_w=%d output_554_Host.elementSize()=%d\n", outputHost_shape_d,
           outputHost_shape_c, outputHost_shape_h, outputHost_shape_w, output_554_Host.elementSize());


    std::vector<YoloLayerData> & layers = yolov7_layers;

    std::vector<BoxInfo> result;
    std::vector<BoxInfo> boxes;
    float threshold = 0.3;
    float nms_threshold = 0.7;



    boxes = decode_infer(outputHost, layers[2].stride, target_size, labels.size(), layers[2].anchors, threshold);
    result.insert(result.begin(), boxes.begin(), boxes.end());

    boxes = decode_infer(output_534_Host, layers[1].stride, target_size, labels.size(), layers[1].anchors, threshold);
    result.insert(result.begin(), boxes.begin(), boxes.end());

    boxes = decode_infer(output_554_Host, layers[0].stride, target_size, labels.size(), layers[0].anchors, threshold);
    result.insert(result.begin(), boxes.begin(), boxes.end());

    nms(result, nms_threshold);
    scale_coords(result, target_size, target_size, bgr.cols, bgr.rows);
    cv::Mat frame_show = draw_box(bgr, result, labels,colors);
    cv::imshow("out",bgr);
    cv::imwrite("dp.jpg",bgr);
    cv::waitKey(0);
    mnnNet->releaseModel();
    mnnNet->releaseSession(session);
    return 0;
}
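
The build mirrors the ncnn demo, assuming libMNN.so sits under lib/ as in the CMakeLists.txt above:

ubuntu@ubuntu:~/untitled22$ mkdir -p build && cd build && cmake .. && make && ./untitled22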

Test results

9. RK3399 Pro

Convert the model:

from rknn.api import RKNN

ONNX_MODEL = '/home/ubuntu/yolov7/runs/train/exp3/weights/best.onnx'
RKNN_MODEL = '/home/ubuntu/yolov7/runs/train/exp3/weights/best.rknn'

if __name__ == '__main__':

    # Create RKNN object
    rknn = RKNN(verbose=True)

    # pre-process config
    print('--> config model')
    rknn.config(mean_values=[[0, 0, 0]], std_values=[[255, 255, 255]], reorder_channel='0 1 2',
                target_platform='rk3399pro',
                quantized_dtype='asymmetric_affine-u8', optimization_level=3, output_optimize=1)
    print('done')

    print('--> Loading model')
    ret = rknn.load_onnx(model=ONNX_MODEL)
    if ret != 0:
        print('Load model  failed!')
        exit(ret)
    print('done')

    # Build model
    print('--> Building model')
    ret = rknn.build(do_quantization=True, dataset='dataset.txt')  # ,pre_compile=True
    if ret != 0:
        print('Build yolov7 failed!')
        exit(ret)
    print('done')

    # Export rknn model
    print('--> Export RKNN model')
    ret = rknn.export_rknn(RKNN_MODEL)
    if ret != 0:
        print('Export yolov7 rknn failed!')
        exit(ret)
    print('done')

    rknn.release()
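
The dataset.txt passed to rknn.build() is the quantization calibration list: a plain-text file with one image path per line, for example (paths are illustrative):

./bus.jpg
./head_0001.jpg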

Python test:

import os
import numpy as np
import cv2
from rknn.api import RKNN


ONNX_MODEL = '/home/ubuntu/yolov7/runs/train/exp3/weights/best.onnx'
RKNN_MODEL = '/home/ubuntu/yolov7/runs/train/exp3/weights/best.rknn'
IMG_PATH = './bus.jpg'
DATASET = './dataset.txt'

QUANTIZE_ON = True

BOX_THRESH = 0.5
NMS_THRESH = 0.6
IMG_SIZE = 640

CLASSES = ("person")
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def xywh2xyxy(x):
    # Convert [x, y, w, h] to [x1, y1, x2, y2]
    y = np.copy(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
    return y

def process(input, mask, anchors):

    anchors = [anchors[i] for i in mask]
    grid_h, grid_w = map(int, input.shape[0:2])

    box_confidence = sigmoid(input[..., 4])
    box_confidence = np.expand_dims(box_confidence, axis=-1)

    box_class_probs = sigmoid(input[..., 5:])

    box_xy = sigmoid(input[..., :2])*2 - 0.5

    col = np.tile(np.arange(0, grid_w), grid_w).reshape(-1, grid_w)
    row = np.tile(np.arange(0, grid_h).reshape(-1, 1), grid_h)
    col = col.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2)
    row = row.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2)
    grid = np.concatenate((col, row), axis=-1)
    box_xy += grid
    box_xy *= int(IMG_SIZE/grid_h)

    box_wh = pow(sigmoid(input[..., 2:4])*2, 2)
    box_wh = box_wh * anchors

    box = np.concatenate((box_xy, box_wh), axis=-1)

    return box, box_confidence, box_class_probs

def filter_boxes(boxes, box_confidences, box_class_probs):
    """Filter boxes with box threshold. It's a bit different with origin yolov5 post process!

    # Arguments
        boxes: ndarray, boxes of objects.
        box_confidences: ndarray, confidences of objects.
        box_class_probs: ndarray, class_probs of objects.

    # Returns
        boxes: ndarray, filtered boxes.
        classes: ndarray, classes for boxes.
        scores: ndarray, scores for boxes.
    """
    box_classes = np.argmax(box_class_probs, axis=-1)
    box_class_scores = np.max(box_class_probs, axis=-1)
    pos = np.where(box_confidences[...,0] >= BOX_THRESH)


    boxes = boxes[pos]
    classes = box_classes[pos]
    scores = box_class_scores[pos]

    return boxes, classes, scores

def nms_boxes(boxes, scores):
    """Suppress non-maximal boxes.

    # Arguments
        boxes: ndarray, boxes of objects.
        scores: ndarray, scores of objects.

    # Returns
        keep: ndarray, index of effective boxes.
    """
    x = boxes[:, 0]
    y = boxes[:, 1]
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]

    areas = w * h
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)

        xx1 = np.maximum(x[i], x[order[1:]])
        yy1 = np.maximum(y[i], y[order[1:]])
        xx2 = np.minimum(x[i] + w[i], x[order[1:]] + w[order[1:]])
        yy2 = np.minimum(y[i] + h[i], y[order[1:]] + h[order[1:]])

        w1 = np.maximum(0.0, xx2 - xx1 + 0.00001)
        h1 = np.maximum(0.0, yy2 - yy1 + 0.00001)
        inter = w1 * h1

        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        inds = np.where(ovr <= NMS_THRESH)[0]
        order = order[inds + 1]
    keep = np.array(keep)
    return keep


def yolov7_post_process(input_data):
    masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
    anchors = [[12,16], [ 19,36], [40,28],
               [36,75], [76,55],[72,146],
               [142,110], [192,243], [459,401]]

    boxes, classes, scores = [], [], []
    for input,mask in zip(input_data, masks):
        b, c, s = process(input, mask, anchors)
        b, c, s = filter_boxes(b, c, s)
        boxes.append(b)
        classes.append(c)
        scores.append(s)

    boxes = np.concatenate(boxes)
    boxes = xywh2xyxy(boxes)
    classes = np.concatenate(classes)
    scores = np.concatenate(scores)

    nboxes, nclasses, nscores = [], [], []
    for c in set(classes):
        inds = np.where(classes == c)
        b = boxes[inds]
        c = classes[inds]
        s = scores[inds]

        keep = nms_boxes(b, s)

        nboxes.append(b[keep])
        nclasses.append(c[keep])
        nscores.append(s[keep])

    if not nclasses and not nscores:
        return None, None, None

    boxes = np.concatenate(nboxes)
    classes = np.concatenate(nclasses)
    scores = np.concatenate(nscores)

    return boxes, classes, scores

def draw(image, boxes, scores, classes):
    """Draw the boxes on the image.

    # Argument:
        image: original image.
        boxes: ndarray, boxes of objects.
        classes: ndarray, classes of objects.
        scores: ndarray, scores of objects.
        all_classes: all classes name.
    """
    for box, score, cl in zip(boxes, scores, classes):
        top, left, right, bottom = box
        print('class: {}, score: {}'.format(CLASSES[cl], score))
        print('box coordinate left,top,right,down: [{}, {}, {}, {}]'.format(top, left, right, bottom))
        top = int(top)
        left = int(left)
        right = int(right)
        bottom = int(bottom)

        cv2.rectangle(image, (top, left), (right, bottom), (255, 0, 0), 2)
        cv2.putText(image, '{0} {1:.2f}'.format(CLASSES[cl], score),
                    (top, left - 6),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6, (0, 0, 255), 2)


def letterbox(im, new_shape=(640, 640), color=(0, 0, 0)):
    # Resize and pad image while meeting stride-multiple constraints
    shape = im.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])

    # Compute padding
    ratio = r, r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding

    dw /= 2  # divide padding into 2 sides
    dh /= 2

    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return im, ratio, (dw, dh)


if __name__ == '__main__':

    # Create RKNN object
    rknn = RKNN()

    if not os.path.exists(RKNN_MODEL):
        print("model not exist")
        exit(-1)

    # Load RKNN model
    print("--> Loading model")
    ret = rknn.load_rknn(RKNN_MODEL)
    if ret != 0:
        print("Load rknn model failed!")
        exit(ret)
    print("done")
    # init runtime environment
    print('--> Init runtime environment')
    ret = rknn.init_runtime()
    # ret = rknn.init_runtime('rk1808', device_id='1808')
    if ret != 0:
        print('Init runtime environment failed')
        exit(ret)
    print('done')

    # Set inputs
    img = cv2.imread(IMG_PATH)
    # img, ratio, (dw, dh) = letterbox(img, new_shape=(IMG_SIZE, IMG_SIZE))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img,(IMG_SIZE, IMG_SIZE))

    # Inference
    print('--> Running model')
    outputs = rknn.inference(inputs=[img])

    # post process
    input0_data = outputs[0]
    input1_data = outputs[1]
    input2_data = outputs[2]

    input0_data=np.transpose(input0_data, (0, 1, 4, 2,3))
    input1_data=np.transpose(input1_data, (0, 1, 4, 2,3))
    input2_data =np.transpose(input2_data, (0, 1, 4, 2,3))

    input0_data = input0_data.reshape([3,-1]+list(input0_data.shape[-2:]))
    input1_data = input1_data.reshape([3,-1]+list(input1_data.shape[-2:]))
    input2_data = input2_data.reshape([3,-1]+list(input2_data.shape[-2:]))

    input_data = list()
    input_data.append(np.transpose(input0_data, (2, 3, 0, 1)))
    input_data.append(np.transpose(input1_data, (2, 3, 0, 1)))
    input_data.append(np.transpose(input2_data, (2, 3, 0, 1)))

    boxes, classes, scores = yolov7_post_process(input_data)

    img_1 = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    if boxes is not None:
        draw(img_1, boxes, scores, classes)
    cv2.imshow("post process result", img_1)
    cv2.imwrite("post.jpg",img_1)
    cv2.waitKeyEx(0)

    rknn.release()

Test results

Interestingly, inspecting the model in netron reveals:

Right after the Reshape the tensor is already 1×3×6×20×20, so we can simply take the outputs at that point and there is no need to modify the official demo model.

There are three options here: (1) take the tensor ids right before the Transpose when loading the model (drawback: convenient only from Python); (2) modify the PyTorch source so the Transpose is never generated in the export; (3) edit the ONNX directly and remove the last two nodes of each head. A sketch of option 3 follows.
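
A sketch of option 3 using onnx.utils.extract_model, which trims the graph after the chosen tensors (the names 513/533/553 come from this particular export and the onnx package must be installed; check your own tensor names in netron first):

import onnx.utils

onnx.utils.extract_model(
    '/home/ubuntu/yolov7/runs/train/exp3/weights/best.onnx',
    '/home/ubuntu/yolov7/runs/train/exp3/weights/best_cut.onnx',
    input_names=['images'],
    output_names=['513', '533', '553'],
)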

Option 1, taking the pre-Transpose tensors while loading the model, can be done like this:

from rknn.api import RKNN

ONNX_MODEL = '/home/ubuntu/yolov7/runs/train/exp3/weights/best.onnx'
RKNN_MODEL = '/home/ubuntu/yolov7/runs/train/exp3/weights/best_remove_transpose.rknn'

if __name__ == '__main__':

    # Create RKNN object
    rknn = RKNN(verbose=True)

    # pre-process config
    print('--> config model')
    rknn.config(mean_values=[[0, 0, 0]], std_values=[[255, 255, 255]], reorder_channel='0 1 2',
                target_platform='rk3399pro',
                quantized_dtype='asymmetric_affine-u8', optimization_level=3, output_optimize=1)
    print('done')

    print('--> Loading model')
    ret = rknn.load_onnx(model=ONNX_MODEL,outputs=[513,533,553])
    if ret != 0:
        print('Load model  failed!')
        exit(ret)
    print('done')

    # Build model
    print('--> Building model')
    ret = rknn.build(do_quantization=True, dataset='dataset.txt')  # ,pre_compile=True
    if ret != 0:
        print('Build yolov7 failed!')
        exit(ret)
    print('done')

    # Export rknn model
    print('--> Export RKNN model')
    ret = rknn.export_rknn(RKNN_MODEL)
    if ret != 0:
        print('Export yolov7 rknn failed!')
        exit(ret)
    print('done')

    rknn.release()

For testing, simply reuse the yolov5-style test code:

import os
import numpy as np
import cv2
from rknn.api import RKNN


ONNX_MODEL = '/home/ubuntu/yolov7/runs/train/exp3/weights/best.onnx'
RKNN_MODEL = '/home/ubuntu/yolov7/runs/train/exp3/weights/best_remove_transpose.rknn'
IMG_PATH = './bus.jpg'
DATASET = './dataset.txt'

QUANTIZE_ON = True

BOX_THRESH = 0.5
NMS_THRESH = 0.6
IMG_SIZE = 640

CLASSES = ("person")
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def xywh2xyxy(x):
    # Convert [x, y, w, h] to [x1, y1, x2, y2]
    y = np.copy(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
    return y

def process(input, mask, anchors):

    anchors = [anchors[i] for i in mask]
    grid_h, grid_w = map(int, input.shape[0:2])

    box_confidence = sigmoid(input[..., 4])
    box_confidence = np.expand_dims(box_confidence, axis=-1)

    box_class_probs = sigmoid(input[..., 5:])

    box_xy = sigmoid(input[..., :2])*2 - 0.5

    col = np.tile(np.arange(0, grid_w), grid_w).reshape(-1, grid_w)
    row = np.tile(np.arange(0, grid_h).reshape(-1, 1), grid_h)
    col = col.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2)
    row = row.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2)
    grid = np.concatenate((col, row), axis=-1)
    box_xy += grid
    box_xy *= int(IMG_SIZE/grid_h)

    box_wh = pow(sigmoid(input[..., 2:4])*2, 2)
    box_wh = box_wh * anchors

    box = np.concatenate((box_xy, box_wh), axis=-1)

    return box, box_confidence, box_class_probs

def filter_boxes(boxes, box_confidences, box_class_probs):
    """Filter boxes with box threshold. It's a bit different with origin yolov5 post process!

    # Arguments
        boxes: ndarray, boxes of objects.
        box_confidences: ndarray, confidences of objects.
        box_class_probs: ndarray, class_probs of objects.

    # Returns
        boxes: ndarray, filtered boxes.
        classes: ndarray, classes for boxes.
        scores: ndarray, scores for boxes.
    """
    box_classes = np.argmax(box_class_probs, axis=-1)
    box_class_scores = np.max(box_class_probs, axis=-1)
    pos = np.where(box_confidences[...,0] >= BOX_THRESH)


    boxes = boxes[pos]
    classes = box_classes[pos]
    scores = box_class_scores[pos]

    return boxes, classes, scores

def nms_boxes(boxes, scores):
    """Suppress non-maximal boxes.

    # Arguments
        boxes: ndarray, boxes of objects.
        scores: ndarray, scores of objects.

    # Returns
        keep: ndarray, index of effective boxes.
    """
    x = boxes[:, 0]
    y = boxes[:, 1]
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]

    areas = w * h
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)

        xx1 = np.maximum(x[i], x[order[1:]])
        yy1 = np.maximum(y[i], y[order[1:]])
        xx2 = np.minimum(x[i] + w[i], x[order[1:]] + w[order[1:]])
        yy2 = np.minimum(y[i] + h[i], y[order[1:]] + h[order[1:]])

        w1 = np.maximum(0.0, xx2 - xx1 + 0.00001)
        h1 = np.maximum(0.0, yy2 - yy1 + 0.00001)
        inter = w1 * h1

        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        inds = np.where(ovr <= NMS_THRESH)[0]
        order = order[inds + 1]
    keep = np.array(keep)
    return keep


def yolov7_post_process(input_data):
    masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
    anchors = [[12,16], [ 19,36], [40,28],
               [36,75], [76,55],[72,146],
               [142,110], [192,243], [459,401]]

    boxes, classes, scores = [], [], []
    for input,mask in zip(input_data, masks):
        b, c, s = process(input, mask, anchors)
        b, c, s = filter_boxes(b, c, s)
        boxes.append(b)
        classes.append(c)
        scores.append(s)

    boxes = np.concatenate(boxes)
    boxes = xywh2xyxy(boxes)
    classes = np.concatenate(classes)
    scores = np.concatenate(scores)

    nboxes, nclasses, nscores = [], [], []
    for c in set(classes):
        inds = np.where(classes == c)
        b = boxes[inds]
        c = classes[inds]
        s = scores[inds]

        keep = nms_boxes(b, s)

        nboxes.append(b[keep])
        nclasses.append(c[keep])
        nscores.append(s[keep])

    if not nclasses and not nscores:
        return None, None, None

    boxes = np.concatenate(nboxes)
    classes = np.concatenate(nclasses)
    scores = np.concatenate(nscores)

    return boxes, classes, scores

def draw(image, boxes, scores, classes):
    """Draw the boxes on the image.

    # Argument:
        image: original image.
        boxes: ndarray, boxes of objects.
        classes: ndarray, classes of objects.
        scores: ndarray, scores of objects.
        all_classes: all classes name.
    """
    for box, score, cl in zip(boxes, scores, classes):
        top, left, right, bottom = box
        print('class: {}, score: {}'.format(CLASSES[cl], score))
        print('box coordinate left,top,right,down: [{}, {}, {}, {}]'.format(top, left, right, bottom))
        top = int(top)
        left = int(left)
        right = int(right)
        bottom = int(bottom)

        cv2.rectangle(image, (top, left), (right, bottom), (255, 0, 0), 2)
        cv2.putText(image, '{0} {1:.2f}'.format(CLASSES[cl], score),
                    (top, left - 6),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6, (0, 0, 255), 2)


def letterbox(im, new_shape=(640, 640), color=(0, 0, 0)):
    # Resize and pad image while meeting stride-multiple constraints
    shape = im.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])

    # Compute padding
    ratio = r, r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding

    dw /= 2  # divide padding into 2 sides
    dh /= 2

    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return im, ratio, (dw, dh)


if __name__ == '__main__':

    # Create RKNN object
    rknn = RKNN()

    if not os.path.exists(RKNN_MODEL):
        print("model not exist")
        exit(-1)

    # Load RKNN model
    print("--> Loading model")
    ret = rknn.load_rknn(RKNN_MODEL)
    if ret != 0:
        print("Load rknn model failed!")
        exit(ret)
    print("done")
    # init runtime environment
    print('--> Init runtime environment')
    ret = rknn.init_runtime()
    # ret = rknn.init_runtime('rk1808', device_id='1808')
    if ret != 0:
        print('Init runtime environment failed')
        exit(ret)
    print('done')

    # Set inputs
    img = cv2.imread(IMG_PATH)
    # img, ratio, (dw, dh) = letterbox(img, new_shape=(IMG_SIZE, IMG_SIZE))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img,(IMG_SIZE, IMG_SIZE))

    # Inference
    print('--> Running model')
    outputs = rknn.inference(inputs=[img])

    # post process
    input0_data = outputs[0]
    input1_data = outputs[1]
    input2_data = outputs[2]

    # input0_data=np.transpose(input0_data, (0, 1, 4, 2,3))
    # input1_data=np.transpose(input1_data, (0, 1, 4, 2,3))
    # input2_data =np.transpose(input2_data, (0, 1, 4, 2,3))

    input0_data = input0_data.reshape([3,-1]+list(input0_data.shape[-2:]))
    input1_data = input1_data.reshape([3,-1]+list(input1_data.shape[-2:]))
    input2_data = input2_data.reshape([3,-1]+list(input2_data.shape[-2:]))

    print(input0_data.shape)
    print(input1_data.shape)
    print(input2_data.shape)

    input_data = list()
    input_data.append(np.transpose(input0_data, (2, 3, 0, 1)))
    input_data.append(np.transpose(input1_data, (2, 3, 0, 1)))
    input_data.append(np.transpose(input2_data, (2, 3, 0, 1)))

    print(np.transpose(input0_data, (2, 3, 0, 1)).shape)
    print(np.transpose(input1_data, (2, 3, 0, 1)).shape)
    print(np.transpose(input2_data, (2, 3, 0, 1)).shape)

    boxes, classes, scores = yolov7_post_process(input_data)

    img_1 = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    if boxes is not None:
        draw(img_1, boxes, scores, classes)
    cv2.imshow("post process result", img_1)
    cv2.waitKeyEx(0)

    rknn.release()

The results are identical.

C++ test for RK3399 Pro & RV1126. The walkthrough below targets the RK3399 Pro; the hardware I actually used is an RV1126.

The test hardware is an RV1126 camera development board (from 深圳思核科技有限公司); development is done on top of it.

Step 1: get into the system via adb and change the IP address. My LAN segment is 192.168.10.* and no amount of command-line scanning could find the board; per the manual, you can look up the current IP first and then change it, hence this approach.

Scanning the same subnet from Python failed to find the board, so adb had to be used instead. The scanner, for reference:

import platform
import sys
import os
import time
import threading
 
live_ip = 0
 
 
def get_os():
    os = platform.system()
    if os == "Windows":
        return "n"
    else:
        return "c"
 
 
def ping_ip(ip_str):
    cmd = ["ping", "-{op}".format(op=get_os()),
           "1", ip_str]
    output = os.popen(" ".join(cmd)).readlines()
    for line in output:
        if str(line).upper().find("TTL") >= 0:
            print("ip: %s is ok ***" % ip_str)
            global live_ip
            live_ip += 1
            break
 
 
def find_ip(ip_prefix):
    '''
    Given a /24 prefix such as 192.168.10, ping every address in the segment.
    '''
    threads = []
    for i in range(1, 256):
        ip = '%s.%s' % (ip_prefix, i)
        threads.append(threading.Thread(target=ping_ip, args=(ip,)))  # args must be a tuple
    for i in threads:
        i.start()
    for i in threads:
        i.join()
 
 
if __name__ == "__main__":
    print("start time %s" % time.ctime())
    cmd_args = sys.argv[1:]
    args = "".join(cmd_args)
    ip_pre = "192.168.10"
    find_ip(ip_pre)
    print("end time %s" % time.ctime())
    print('本次扫描共检测到本网络存在%s台设备' % live_ip)
 

Enter the system with the adb command:

C:\Users\Administrator>f:
 
F:\>adb.exe
Android Debug Bridge version 1.0.41
Version 29.0.5-5949299
Installed as F:\adb.exe
 
global options:
 -a         listen on all network interfaces, not just localhost
 -d         use USB device (error if multiple devices connected)
 -e         use TCP/IP device (error if multiple TCP/IP devices available)
 -s SERIAL  use device with given serial (overrides $ANDROID_SERIAL)
 -t ID      use device with given transport id
 -H         name of adb server host [default=localhost]
 -P         port of adb server [default=5037]
 -L SOCKET  listen on given socket for adb server [default=tcp:localhost:5037]
 
 
F:\>adb shell
* daemon not running; starting now at tcp:5037
* daemon started successfully
[root@owlvtech:/]# ifconfig
eth0      Link encap:Ethernet  HWaddr F6:F9:51:F0:5B:C7
          inet addr:192.168.1.100  Bcast:192.168.1.255  Mask:255.255.255.0
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          RX packets:3289 errors:0 dropped:0 overruns:0 frame:0
          TX packets:117 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000
          RX bytes:247207 (241.4 KiB)  TX bytes:10048 (9.8 KiB)
          Interrupt:61
lo        Link encap:Local Loopback
          inet addr:127.0.0.1  Mask:255.0.0.0
          UP LOOPBACK RUNNING  MTU:65536  Metric:1
          RX packets:57 errors:0 dropped:0 overruns:0 frame:0
          TX packets:57 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000
          RX bytes:5472 (5.3 KiB)  TX bytes:5472 (5.3 KiB)
[root@owlvtech:/]# df -h
Filesystem                Size  Used Avail Use% Mounted on
/dev/root                 944M  169M  717M  20% /
devtmpfs                  239M     0  239M   0% /dev
tmpfs                     240M     0  240M   0% /dev/shm
tmpfs                     240M  312K  239M   1% /tmp
tmpfs                     240M  268K  239M   1% /run
/dev/mmcblk0p7            183M  138M   33M  82% /oem
/dev/mmcblk0p8           1008M  3.0M 1005M   1% /userdata
/dev/block/by-name/media  5.0G   11M  4.7G   1% /userdata/media
[root@owlvtech:/]#

Step 2: pull a video stream first. The official docs say RTSP streaming is supported; I was going through a WiFi-to-wired adapter with a shaky connection, but the test showed the stream works:

C:\Users\Administrator>ffplay rtsp://192.168.10.100/live/mainstream  -x 640 -y 480
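
If ffplay is not at hand, a minimal OpenCV sketch (assuming the same stream URL) can also confirm the RTSP feed is alive:

import cv2

# Quick liveness check of the board's RTSP main stream.
cap = cv2.VideoCapture("rtsp://192.168.10.100/live/mainstream")
ok, frame = cap.read()
print("stream alive:", ok, "frame shape:", frame.shape if ok else None)
cap.release()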

Step 3: port the model and give it a try.

ubuntu@sxj731533730:~$ ssh root@192.168.10.100
root@192.168.10.100's password:rochchip
route: SIOCADDRT: File exists
[root@owlvtech:~]#

Cross-compilation is required; build it yourself, only the source code is provided here.

cmakelists.txt

cmake_minimum_required(VERSION 3.16)
project(untitled10)
set(CMAKE_CXX_FLAGS "-std=c++11")
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp ")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")

include_directories(${CMAKE_SOURCE_DIR})
include_directories(${CMAKE_SOURCE_DIR}/include)
find_package(OpenCV REQUIRED)
#message(STATUS ${OpenCV_INCLUDE_DIRS})
# add OpenCV header directories
include_directories(${OpenCV_INCLUDE_DIRS})
# import the prebuilt rknn_api shared library (linked below together with OpenCV)
add_library(librknn_api SHARED IMPORTED)
set_target_properties(librknn_api PROPERTIES IMPORTED_LOCATION ${CMAKE_SOURCE_DIR}/lib64/librknn_api.so)


add_executable(untitled10 main.cpp)
target_link_libraries(untitled10 ${OpenCV_LIBS} librknn_api )

main.cpp

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <queue>
#include "rknn_api.h"
#include "opencv2/core/core.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
#include <chrono>
#define OBJ_NAME_MAX_SIZE 16
#define OBJ_NUMB_MAX_SIZE 200
#define OBJ_CLASS_NUM     1
#define PROP_BOX_SIZE     (5+OBJ_CLASS_NUM)
using namespace std;

typedef struct _BOX_RECT {
    int left;
    int right;
    int top;
    int bottom;
} BOX_RECT;

typedef struct __detect_result_t {
    char name[OBJ_NAME_MAX_SIZE];
    int class_index;
    BOX_RECT box;
    float prop;
} detect_result_t;

typedef struct _detect_result_group_t {
    int id;
    int count;
    detect_result_t results[OBJ_NUMB_MAX_SIZE];
} detect_result_group_t;

const int anchor0[6] = {12,16, 19,36, 40,28};
const int anchor1[6] = {36,75, 76,55, 72,146};
const int anchor2[6] = {142,110, 192,243, 459,401};

void printRKNNTensor(rknn_tensor_attr *attr) {
    printf("index=%d name=%s n_dims=%d dims=[%d %d %d %d] n_elems=%d size=%d "
           "fmt=%d type=%d qnt_type=%d fl=%d zp=%d scale=%f\n",
           attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2],
           attr->dims[1], attr->dims[0], attr->n_elems, attr->size, attr->fmt, attr->type,
           attr->qnt_type, attr->fl, attr->zp, attr->scale);
}

float sigmoid(float x) {
    return 1.0 / (1.0 + expf(-x));
}

float unsigmoid(float y) {
    return -1.0 * logf((1.0 / y) - 1.0);
}

int process_fp(float *input, int *anchor, int grid_h, int grid_w, int height, int width, int stride,
               std::vector<float> &boxes, std::vector<float> &boxScores, std::vector<int> &classId,
               float threshold) {

    int validCount = 0;
    int grid_len = grid_h * grid_w;
    float thres_sigmoid = unsigmoid(threshold);
    for (int a = 0; a < 3; a++) {
        for (int i = 0; i < grid_h; i++) {
            for (int j = 0; j < grid_w; j++) {
                float box_confidence = input[(PROP_BOX_SIZE * a + 4) * grid_len + i * grid_w + j];
                if (box_confidence >= thres_sigmoid) {
                    int offset = (PROP_BOX_SIZE * a) * grid_len + i * grid_w + j;
                    float *in_ptr = input + offset;
                    float box_x = sigmoid(*in_ptr) * 2.0 - 0.5;
                    float box_y = sigmoid(in_ptr[grid_len]) * 2.0 - 0.5;
                    float box_w = sigmoid(in_ptr[2 * grid_len]) * 2.0;
                    float box_h = sigmoid(in_ptr[3 * grid_len]) * 2.0;
                    box_x = (box_x + j) * (float) stride;
                    box_y = (box_y + i) * (float) stride;
                    box_w = box_w * box_w * (float) anchor[a * 2];
                    box_h = box_h * box_h * (float) anchor[a * 2 + 1];
                    box_x -= (box_w / 2.0);
                    box_y -= (box_h / 2.0);
                    boxes.push_back(box_x);
                    boxes.push_back(box_y);
                    boxes.push_back(box_w);
                    boxes.push_back(box_h);

                    float maxClassProbs = in_ptr[5 * grid_len];
                    int maxClassId = 0;
                    for (int k = 1; k < OBJ_CLASS_NUM; ++k) {
                        float prob = in_ptr[(5 + k) * grid_len];
                        if (prob > maxClassProbs) {
                            maxClassId = k;
                            maxClassProbs = prob;
                        }
                    }
                    float box_conf_f32 = sigmoid(box_confidence);
                    float class_prob_f32 = sigmoid(maxClassProbs);
                    boxScores.push_back(box_conf_f32 * class_prob_f32);
                    classId.push_back(maxClassId);
                    validCount++;
                }
            }
        }
    }
    return validCount;
}

float CalculateOverlap(float xmin0, float ymin0, float xmax0, float ymax0, float xmin1, float ymin1, float xmax1,
                       float ymax1) {
    float w = fmax(0.f, fmin(xmax0, xmax1) - fmax(xmin0, xmin1) + 1.0);
    float h = fmax(0.f, fmin(ymax0, ymax1) - fmax(ymin0, ymin1) + 1.0);
    float i = w * h;
    float u = (xmax0 - xmin0 + 1.0) * (ymax0 - ymin0 + 1.0) + (xmax1 - xmin1 + 1.0) * (ymax1 - ymin1 + 1.0) - i;
    return u <= 0.f ? 0.f : (i / u);
}

int nms(int validCount, std::vector<float> &outputLocations, std::vector<int> &order, float threshold) {
    for (int i = 0; i < validCount; ++i) {
        if (order[i] == -1) {
            continue;
        }
        int n = order[i];
        for (int j = i + 1; j < validCount; ++j) {
            int m = order[j];
            if (m == -1) {
                continue;
            }
            float xmin0 = outputLocations[n * 4 + 0];
            float ymin0 = outputLocations[n * 4 + 1];
            float xmax0 = outputLocations[n * 4 + 0] + outputLocations[n * 4 + 2];
            float ymax0 = outputLocations[n * 4 + 1] + outputLocations[n * 4 + 3];

            float xmin1 = outputLocations[m * 4 + 0];
            float ymin1 = outputLocations[m * 4 + 1];
            float xmax1 = outputLocations[m * 4 + 0] + outputLocations[m * 4 + 2];
            float ymax1 = outputLocations[m * 4 + 1] + outputLocations[m * 4 + 3];

            float iou = CalculateOverlap(xmin0, ymin0, xmax0, ymax0, xmin1, ymin1, xmax1, ymax1);

            if (iou > threshold) {
                order[j] = -1;
            }
        }
    }
    return 0;
}

int quick_sort_indice_inverse(
        std::vector<float> &input,
        int left,
        int right,
        std::vector<int> &indices) {
    float key;
    int key_index;
    int low = left;
    int high = right;
    if (left < right) {
        key_index = indices[left];
        key = input[left];
        while (low < high) {
            while (low < high && input[high] <= key) {
                high--;
            }
            input[low] = input[high];
            indices[low] = indices[high];
            while (low < high && input[low] >= key) {
                low++;
            }
            input[high] = input[low];
            indices[high] = indices[low];
        }
        input[low] = key;
        indices[low] = key_index;
        quick_sort_indice_inverse(input, left, low - 1, indices);
        quick_sort_indice_inverse(input, low + 1, right, indices);
    }
    return low;
}

int clamp(float val, int min, int max) {
    return val > min ? (val < max ? val : max) : min;
}

int post_process_fp(float *input0, float *input1, float *input2, int model_in_h, int model_in_w,
                    int h_offset, int w_offset, float resize_scale, float conf_threshold, float nms_threshold,
                    detect_result_group_t *group, const char *labels[]) {
    memset(group, 0, sizeof(detect_result_group_t));
    std::vector<float> filterBoxes;
    std::vector<float> boxesScore;
    std::vector<int> classId;
    int stride0 = 8;
    int grid_h0 = model_in_h / stride0;
    int grid_w0 = model_in_w / stride0;
    int validCount0 = 0;
    validCount0 = process_fp(input0, (int *) anchor0, grid_h0, grid_w0, model_in_h, model_in_w,
                             stride0, filterBoxes, boxesScore, classId, conf_threshold);

    int stride1 = 16;
    int grid_h1 = model_in_h / stride1;
    int grid_w1 = model_in_w / stride1;
    int validCount1 = 0;
    validCount1 = process_fp(input1, (int *) anchor1, grid_h1, grid_w1, model_in_h, model_in_w,
                             stride1, filterBoxes, boxesScore, classId, conf_threshold);

    int stride2 = 32;
    int grid_h2 = model_in_h / stride2;
    int grid_w2 = model_in_w / stride2;
    int validCount2 = 0;
    validCount2 = process_fp(input2, (int *) anchor2, grid_h2, grid_w2, model_in_h, model_in_w,
                             stride2, filterBoxes, boxesScore, classId, conf_threshold);

    int validCount = validCount0 + validCount1 + validCount2;
    // no object detect
    if (validCount <= 0) {
        return 0;
    }

    std::vector<int> indexArray;
    for (int i = 0; i < validCount; ++i) {
        indexArray.push_back(i);
    }

    quick_sort_indice_inverse(boxesScore, 0, validCount - 1, indexArray);

    nms(validCount, filterBoxes, indexArray, nms_threshold);

    int last_count = 0;
    /* box valid detect target */
    for (int i = 0; i < validCount; ++i) {

        if (indexArray[i] == -1 || boxesScore[i] < conf_threshold || last_count >= OBJ_NUMB_MAX_SIZE) {
            continue;
        }
        int n = indexArray[i];

        float x1 = filterBoxes[n * 4 + 0];
        float y1 = filterBoxes[n * 4 + 1];
        float x2 = x1 + filterBoxes[n * 4 + 2];
        float y2 = y1 + filterBoxes[n * 4 + 3];
        int id = classId[n];

        group->results[last_count].box.left = (int) ((clamp(x1, 0, model_in_w) - w_offset) / resize_scale);
        group->results[last_count].box.top = (int) ((clamp(y1, 0, model_in_h) - h_offset) / resize_scale);
        group->results[last_count].box.right = (int) ((clamp(x2, 0, model_in_w) - w_offset) / resize_scale);
        group->results[last_count].box.bottom = (int) ((clamp(y2, 0, model_in_h) - h_offset) / resize_scale);
        group->results[last_count].prop = boxesScore[i];
        group->results[last_count].class_index = id;
        const char *label = labels[id];
        strncpy(group->results[last_count].name, label, OBJ_NAME_MAX_SIZE);

        // printf("result %2d: (%4d, %4d, %4d, %4d), %s\n", i, group->results[last_count].box.left, group->results[last_count].box.top,
        //        group->results[last_count].box.right, group->results[last_count].box.bottom, label);
        last_count++;
    }
    group->count = last_count;

    return 0;
}

float deqnt_affine_to_f32(uint8_t qnt, uint8_t zp, float scale) {
    return ((float) qnt - (float) zp) * scale;
}

int32_t __clip(float val, float min, float max) {
    float f = val <= min ? min : (val >= max ? max : val);
    return f;
}

uint8_t qnt_f32_to_affine(float f32, uint8_t zp, float scale) {
    float dst_val = (f32 / scale) + zp;
    uint8_t res = (uint8_t) __clip(dst_val, 0, 255);
    return res;
}

int process_u8(uint8_t *input, int *anchor, int grid_h, int grid_w, int height, int width, int stride,
               std::vector<float> &boxes, std::vector<float> &boxScores, std::vector<int> &classId,
               float threshold, uint8_t zp, float scale) {

    int validCount = 0;
    int grid_len = grid_h * grid_w;
    float thres = unsigmoid(threshold);
    uint8_t thres_u8 = qnt_f32_to_affine(thres, zp, scale);
    for (int a = 0; a < 3; a++) {
        for (int i = 0; i < grid_h; i++) {
            for (int j = 0; j < grid_w; j++) {
                uint8_t box_confidence = input[(PROP_BOX_SIZE * a + 4) * grid_len + i * grid_w + j];
                if (box_confidence >= thres_u8) {
                    int offset = (PROP_BOX_SIZE * a) * grid_len + i * grid_w + j;
                    uint8_t *in_ptr = input + offset;
                    float box_x = sigmoid(deqnt_affine_to_f32(*in_ptr, zp, scale)) * 2.0 - 0.5;
                    float box_y = sigmoid(deqnt_affine_to_f32(in_ptr[grid_len], zp, scale)) * 2.0 - 0.5;
                    float box_w = sigmoid(deqnt_affine_to_f32(in_ptr[2 * grid_len], zp, scale)) * 2.0;
                    float box_h = sigmoid(deqnt_affine_to_f32(in_ptr[3 * grid_len], zp, scale)) * 2.0;
                    box_x = (box_x + j) * (float) stride;
                    box_y = (box_y + i) * (float) stride;
                    box_w = box_w * box_w * (float) anchor[a * 2];
                    box_h = box_h * box_h * (float) anchor[a * 2 + 1];
                    box_x -= (box_w / 2.0);
                    box_y -= (box_h / 2.0);
                    boxes.push_back(box_x);
                    boxes.push_back(box_y);
                    boxes.push_back(box_w);
                    boxes.push_back(box_h);

                    uint8_t maxClassProbs = in_ptr[5 * grid_len];
                    int maxClassId = 0;
                    for (int k = 1; k < OBJ_CLASS_NUM; ++k) {
                        uint8_t prob = in_ptr[(5 + k) * grid_len];
                        if (prob > maxClassProbs) {
                            maxClassId = k;
                            maxClassProbs = prob;
                        }
                    }
                    float box_conf_f32 = sigmoid(deqnt_affine_to_f32(box_confidence, zp, scale));
                    float class_prob_f32 = sigmoid(deqnt_affine_to_f32(maxClassProbs, zp, scale));
                    boxScores.push_back(box_conf_f32 * class_prob_f32);
                    classId.push_back(maxClassId);
                    validCount++;
                }
            }
        }
    }
    return validCount;
}

int post_process_u8(uint8_t *input0, uint8_t *input1, uint8_t *input2, int model_in_h, int model_in_w,
                    int h_offset, int w_offset, float resize_scale, float conf_threshold, float nms_threshold,
                    std::vector<uint8_t> &qnt_zps, std::vector<float> &qnt_scales,
                    detect_result_group_t *group, const char *labels[]) {

    memset(group, 0, sizeof(detect_result_group_t));

    std::vector<float> filterBoxes;
    std::vector<float> boxesScore;
    std::vector<int> classId;
    int stride0 = 8;
    int grid_h0 = model_in_h / stride0;
    int grid_w0 = model_in_w / stride0;
    int validCount0 = 0;
    validCount0 = process_u8(input0, (int *) anchor0, grid_h0, grid_w0, model_in_h, model_in_w,
                             stride0, filterBoxes, boxesScore, classId, conf_threshold, qnt_zps[0], qnt_scales[0]);

    int stride1 = 16;
    int grid_h1 = model_in_h / stride1;
    int grid_w1 = model_in_w / stride1;
    int validCount1 = 0;
    validCount1 = process_u8(input1, (int *) anchor1, grid_h1, grid_w1, model_in_h, model_in_w,
                             stride1, filterBoxes, boxesScore, classId, conf_threshold, qnt_zps[1], qnt_scales[1]);

    int stride2 = 32;
    int grid_h2 = model_in_h / stride2;
    int grid_w2 = model_in_w / stride2;
    int validCount2 = 0;
    validCount2 = process_u8(input2, (int *) anchor2, grid_h2, grid_w2, model_in_h, model_in_w,
                             stride2, filterBoxes, boxesScore, classId, conf_threshold, qnt_zps[2], qnt_scales[2]);

    int validCount = validCount0 + validCount1 + validCount2;
    // no object detect
    if (validCount <= 0) {
        return 0;
    }

    std::vector<int> indexArray;
    for (int i = 0; i < validCount; ++i) {
        indexArray.push_back(i);
    }

    quick_sort_indice_inverse(boxesScore, 0, validCount - 1, indexArray);

    nms(validCount, filterBoxes, indexArray, nms_threshold);

    int last_count = 0;
    group->count = 0;
    /* box valid detect target */
    for (int i = 0; i < validCount; ++i) {

        if (indexArray[i] == -1 || boxesScore[i] < conf_threshold || last_count >= OBJ_NUMB_MAX_SIZE) {
            continue;
        }
        int n = indexArray[i];

        float x1 = filterBoxes[n * 4 + 0];
        float y1 = filterBoxes[n * 4 + 1];
        float x2 = x1 + filterBoxes[n * 4 + 2];
        float y2 = y1 + filterBoxes[n * 4 + 3];
        int id = classId[n];

        group->results[last_count].box.left = (int) ((clamp(x1, 0, model_in_w) - w_offset) / resize_scale);
        group->results[last_count].box.top = (int) ((clamp(y1, 0, model_in_h) - h_offset) / resize_scale);
        group->results[last_count].box.right = (int) ((clamp(x2, 0, model_in_w) - w_offset) / resize_scale);
        group->results[last_count].box.bottom = (int) ((clamp(y2, 0, model_in_h) - h_offset) / resize_scale);
        group->results[last_count].prop = boxesScore[i];
        group->results[last_count].class_index = id;
        const char *label = labels[id];
        strncpy(group->results[last_count].name, label, OBJ_NAME_MAX_SIZE);

        // printf("result %2d: (%4d, %4d, %4d, %4d), %s\n", i, group->results[last_count].box.left, group->results[last_count].box.top,
        //        group->results[last_count].box.right, group->results[last_count].box.bottom, label);
        last_count++;
    }
    group->count = last_count;

    return 0;
}
void letterbox(cv::Mat rgb,cv::Mat &img_resize,int target_width,int target_height){

    float shape_0=rgb.rows;
    float shape_1=rgb.cols;
    float new_shape_0=target_height;
    float new_shape_1=target_width;
    float r=std::min(new_shape_0/shape_0,new_shape_1/shape_1);
    float new_unpad_0=int(round(shape_1*r));
    float new_unpad_1=int(round(shape_0*r));
    float dw=new_shape_1-new_unpad_0;
    float dh=new_shape_0-new_unpad_1;
    dw=dw/2;
    dh=dh/2;
    cv::Mat copy_rgb=rgb.clone();
    if(int(shape_1)!=int(new_unpad_0) || int(shape_0)!=int(new_unpad_1)){ // resize only when the size actually changes
        cv::resize(copy_rgb,img_resize,cv::Size(new_unpad_0,new_unpad_1));
        copy_rgb=img_resize;
    }
    int top=int(round(dh-0.1));
    int bottom=int(round(dh+0.1));
    int left=int(round(dw-0.1));
    int right=int(round(dw+0.1));
    cv::copyMakeBorder(copy_rgb, img_resize,top, bottom, left, right, cv::BORDER_CONSTANT, cv::Scalar(0,0,0));

}
int main(int argc, char **argv) {
    const char *img_path = "../bus.jpg";
    const char *model_path = "../best_remove_transpose.rknn";
    const char *post_process_type = "fp";//fp
    const int target_width = 640;
    const int target_height = 640;
    const char *image_process_mode = "letter_box";
    float resize_scale = 0;
    int h_pad=0;
    int w_pad=0;
    const float nms_threshold = 0.6;
    const float conf_threshold = 0.3;
    const char *labels[] = {"person"};
    // Load image
    cv::Mat bgr = cv::imread(img_path);
    if (!bgr.data) {
        printf("cv::imread %s fail!\n", img_path);
        return -1;
    }
    cv::Mat rgb;
    //BGR->RGB
    cv::cvtColor(bgr, rgb, cv::COLOR_BGR2RGB);

    cv::Mat img_resize;
    float correction[2] = {0, 0};
    float scale_factor[] = {0, 0};
    int width=rgb.cols;
    int height=rgb.rows;
    // Letter box resize
    float img_wh_ratio = (float) width / (float) height;
    float input_wh_ratio = (float) target_width / (float) target_height;
    int resize_width;
    int resize_height;
    if (img_wh_ratio >= input_wh_ratio) {
        //pad height dim
        resize_scale = (float) target_width / (float) width;
        resize_width = target_width;
        resize_height = (int) ((float) height * resize_scale);
        w_pad = 0;
        h_pad = (target_height - resize_height) / 2;
    } else {
        //pad width dim
        resize_scale = (float) target_height / (float) height;
        resize_width = (int) ((float) width * resize_scale);
        resize_height = target_height;
        w_pad = (target_width - resize_width) / 2;
        h_pad = 0;
    }
    if(strcmp(image_process_mode,"letter_box")==0){
    letterbox(rgb,img_resize,target_width,target_height);
    }else {
        cv::resize(rgb, img_resize, cv::Size(target_width, target_height));
    }
    // Load model
    FILE *fp = fopen(model_path, "rb");
    if (fp == NULL) {
        printf("fopen %s fail!\n", model_path);
        return -1;
    }
    fseek(fp, 0, SEEK_END);
    int model_len = ftell(fp);
    void *model = malloc(model_len);
    fseek(fp, 0, SEEK_SET);
    if (model_len != fread(model, 1, model_len, fp)) {
        printf("fread %s fail!\n", model_path);
        free(model);
        return -1;
    }


    rknn_context ctx = 0;

    int ret = rknn_init(&ctx, model, model_len, 0);
    if (ret < 0) {
        printf("rknn_init fail! ret=%d\n", ret);
        return -1;
    }

    /* Query sdk version */
    rknn_sdk_version version;
    ret = rknn_query(ctx, RKNN_QUERY_SDK_VERSION, &version,
                     sizeof(rknn_sdk_version));
    if (ret < 0) {
        printf("rknn_init error ret=%d\n", ret);
        return -1;
    }
    printf("sdk version: %s driver version: %s\n", version.api_version,
           version.drv_version);


    /* Get input,output attr */
    rknn_input_output_num io_num;
    ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));
    if (ret < 0) {
        printf("rknn_init error ret=%d\n", ret);
        return -1;
    }
    printf("model input num: %d, output num: %d\n", io_num.n_input,
           io_num.n_output);

    rknn_tensor_attr input_attrs[io_num.n_input];
    memset(input_attrs, 0, sizeof(input_attrs));
    for (int i = 0; i < io_num.n_input; i++) {
        input_attrs[i].index = i;
        ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]),
                         sizeof(rknn_tensor_attr));
        if (ret < 0) {
            printf("rknn_init error ret=%d\n", ret);
            return -1;
        }
        printRKNNTensor(&(input_attrs[i]));
    }

    rknn_tensor_attr output_attrs[io_num.n_output];
    memset(output_attrs, 0, sizeof(output_attrs));
    for (int i = 0; i < io_num.n_output; i++) {
        output_attrs[i].index = i;
        ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]),
                         sizeof(rknn_tensor_attr));
        printRKNNTensor(&(output_attrs[i]));
    }

    int input_channel = 3;
    int input_width = 0;
    int input_height = 0;
    if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) {
        printf("model is NCHW input fmt\n");
        input_width = input_attrs[0].dims[0];
        input_height = input_attrs[0].dims[1];
        printf("input_width=%d input_height=%d\n", input_width, input_height);
    } else {
        printf("model is NHWC input fmt\n");
        input_width = input_attrs[0].dims[1];
        input_height = input_attrs[0].dims[2];
        printf("input_width=%d input_height=%d\n", input_width, input_height);
    }

    printf("model input height=%d, width=%d, channel=%d\n", input_height, input_width,
           input_channel);


    /* Init input tensor */
    rknn_input inputs[1];
    memset(inputs, 0, sizeof(inputs));
    inputs[0].index = 0;
    inputs[0].buf = img_resize.data;
    inputs[0].type = RKNN_TENSOR_UINT8;
    inputs[0].size = input_width * input_height * input_channel;
    inputs[0].fmt = RKNN_TENSOR_NHWC;
    inputs[0].pass_through = 0;

    /* Init output tensor */
    rknn_output outputs[io_num.n_output];
    memset(outputs, 0, sizeof(outputs));
    for (int i = 0; i < io_num.n_output; i++) {
        if (strcmp(post_process_type, "fp") == 0) {
            outputs[i].want_float = 1;
        } else if (strcmp(post_process_type, "u8") == 0) {
            outputs[i].want_float = 0;
        }
    }
    printf("img.cols: %d, img.rows: %d\n", img_resize.cols, img_resize.rows);
    auto t1=std::chrono::steady_clock::now();
    rknn_inputs_set(ctx, io_num.n_input, inputs);
    ret = rknn_run(ctx, NULL);
    if (ret < 0) {
        printf("ctx error ret=%d\n", ret);
        return -1;
    }
    ret = rknn_outputs_get(ctx, io_num.n_output, outputs, NULL);
    if (ret < 0) {
        printf("outputs error ret=%d\n", ret);
        return -1;
    }
    /* Post process */
    std::vector<float> out_scales;
    std::vector<uint8_t> out_zps;
    for (int i = 0; i < io_num.n_output; ++i) {
        out_scales.push_back(output_attrs[i].scale);
        out_zps.push_back(output_attrs[i].zp);
    }

    detect_result_group_t detect_result_group;
    if (strcmp(post_process_type, "u8") == 0) {
        post_process_u8((uint8_t *) outputs[0].buf, (uint8_t *) outputs[1].buf, (uint8_t *) outputs[2].buf,
                        input_height, input_width,
                        h_pad, w_pad, resize_scale, conf_threshold, nms_threshold, out_zps, out_scales,
                        &detect_result_group, labels);
    } else if (strcmp(post_process_type, "fp") == 0) {
        post_process_fp((float *) outputs[0].buf, (float *) outputs[1].buf, (float *) outputs[2].buf, input_height,
                        input_width,
                        h_pad, w_pad, resize_scale, conf_threshold, nms_threshold, &detect_result_group, labels);
    }
    // elapsed time, in milliseconds
    auto t2=std::chrono::steady_clock::now();
    double dr_ms=std::chrono::duration<double,std::milli>(t2-t1).count();
    printf("%lf ms\n",dr_ms);


    for (int i = 0; i < detect_result_group.count; i++) {
        detect_result_t *det_result = &(detect_result_group.results[i]);
        printf("%s @ (%d %d %d %d) %f\n",
               det_result->name,
               det_result->box.left, det_result->box.top, det_result->box.right, det_result->box.bottom,
               det_result->prop);
        int bx1 = det_result->box.left;
        int by1 = det_result->box.top;
        int bx2 = det_result->box.right;
        int by2 = det_result->box.bottom;
        cv::rectangle(bgr, cv::Point(bx1, by1), cv::Point(bx2, by2), cv::Scalar(231, 232, 143));  // rectangle from two corner points
        char text[256];
        sprintf(text, "%s %.1f%% ", det_result->name, det_result->prop * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        int x = bx1;
        int y = by1 - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > bgr.cols)
            x = bgr.cols - label_size.width;


        cv::rectangle(bgr, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(0, 0, 255), -1);

        cv::putText(bgr, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_DUPLEX, 0.4, cv::Scalar(255, 255, 255), 1, cv::LINE_AA);

        cv::imwrite("../bgr9.jpg", bgr);
    }


    ret = rknn_outputs_release(ctx, io_num.n_output, outputs);

    if (ret < 0) {
        printf("rknn_query fail! ret=%d\n", ret);
        goto Error;
    }


    Error:
    if (ctx > 0)
        rknn_destroy(ctx);
    if (model)
        free(model);
    if (fp)
        fclose(fp);
    return 0;
}

Test result
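
One detail worth spelling out: the h_pad/w_pad/resize_scale bookkeeping in main() is what lets post_process_fp/post_process_u8 map boxes from the 640x640 letterboxed input back to original image coordinates. A small Python sketch of that inverse mapping (my own illustration, assuming a 1920x1080 source frame):

def unletterbox(x1, y1, x2, y2, src_w, src_h, target=640):
    # mirrors (clamp(x) - w_offset) / resize_scale in the C++ above
    scale = min(target / src_w, target / src_h)   # resize_scale
    w_pad = (target - src_w * scale) / 2          # horizontal padding
    h_pad = (target - src_h * scale) / 2          # vertical padding
    return ((x1 - w_pad) / scale, (y1 - h_pad) / scale,
            (x2 - w_pad) / scale, (y2 - h_pad) / scale)

# A box detected in the letterboxed frame of a 1920x1080 image:
print(unletterbox(100, 200, 300, 400, 1920, 1080))  # -> (300.0, 180.0, 900.0, 780.0)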

十、OpenVINO deployment

Convert the model to OpenVINO IR (xml):

ubuntu@ubuntu:~/yolov7$ python /opt/intel/openvino_2021/deployment_tools/model_optimizer/mo.py --input_model /home/ubuntu/yolov7/runs/train/exp3/weights/best.onnx --output_dir /home/ubuntu/yolov7/runs/train/exp3/weights/FP16 --input_shape [1,3,640,640] --data_type FP16 
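
Before moving on to C++, the converted IR can be sanity-checked from Python with the 2021 Inference Engine API; a short sketch (the output names 554/534/output used later come from exactly this kind of inspection):

from openvino.inference_engine import IECore

ie = IECore()
net = ie.read_network(
    model="/home/ubuntu/yolov7/runs/train/exp3/weights/FP16/best.xml",
    weights="/home/ubuntu/yolov7/runs/train/exp3/weights/FP16/best.bin")
print("inputs :", {k: v.input_data.shape for k, v in net.input_info.items()})
print("outputs:", list(net.outputs.keys()))  # expect names like 'output', '534', '554'
exec_net = ie.load_network(network=net, device_name="CPU")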

Reading and debugging the code with CLion (Debug):

  1. First go to the clion.sh location (/home/zranguai/software/CLion-2021.3.2/clion-2021.3.2/bin), then run sh clion.sh

  2. source /opt/intel/openvino_2021/bin/setupvars.sh

cmakelists.txt

cmake_minimum_required(VERSION 3.4.1)
set(CMAKE_CXX_STANDARD 14)


project(untitled23)

find_package(OpenCV REQUIRED)
find_package(ngraph REQUIRED)
find_package(InferenceEngine REQUIRED)

include_directories(
        ${OpenCV_INCLUDE_DIRS}
        ${CMAKE_CURRENT_SOURCE_DIR}
        ${CMAKE_CURRENT_BINARY_DIR}
)

add_executable(untitled23 main.cpp)

target_link_libraries(
        untitled23
        ${InferenceEngine_LIBRARIES}
        ${NGRAPH_LIBRARIES}
        ${OpenCV_LIBS}
)

main.cpp

#include <opencv2/opencv.hpp>
#include <inference_engine.hpp>
#include <iostream>
#include <chrono>
#include <opencv2/dnn/dnn.hpp>
#include <cmath>
using namespace std;
using namespace cv;
using namespace InferenceEngine;


typedef struct {
    int width;
    int height;
} YoloSize;


typedef struct {
    std::string name;
    int stride;
    std::vector<YoloSize> anchors;
} YoloLayerData;

class BoxInfo
{
public:
    int x1,y1,x2,y2,label,id;
    float score;
};
static inline float sigmoid(float x)
{
    return static_cast<float>(1.f / (1.f + exp(-x)));
}
double GetIOU(cv::Rect_<float> bb_test, cv::Rect_<float> bb_gt)
{
    float in = (bb_test & bb_gt).area();
    float un = bb_test.area() + bb_gt.area() - in;

    if (un < DBL_EPSILON)
        return 0;

    return (double)(in / un);
}
std::vector<BoxInfo> decode_infer(const float *data_ptr, int stride,  int net_size, int num_classes,
                                  const std::vector<YoloSize> &anchors, float threshold,int idx)
{
    int s[3]={20,40,80};
    std::vector<BoxInfo> result;
    int batchs, channels, height, width, pred_item ;
    batchs = 1;
    channels = 3;
    height = s[idx];
    width =  s[idx];
    pred_item = 6;
    std::cout<<batchs<<" "<<channels<<" "<<height<<" "<<width<<" "<<pred_item<<" "<<std::endl;

    for(int i=0;i<20;i++){
        std::cout<<data_ptr[i]<<" ";
    }
    std::cout<<std::endl;
    for(int bi=0; bi<batchs; bi++)
    {
        auto batch_ptr = data_ptr + bi*(channels*height*width*pred_item);
        for(int ci=0; ci<channels; ci++)
        {
            auto channel_ptr = batch_ptr + ci*(height*width*pred_item);
            for(int hi=0; hi<height; hi++)
            {
                auto height_ptr = channel_ptr + hi*(width * pred_item);
                for(int wi=0; wi<width; wi++)
                {
                    auto width_ptr = height_ptr + wi*pred_item;
                    auto cls_ptr = width_ptr + 5;

                    auto confidence = sigmoid(width_ptr[4]);

                    for(int cls_id=0; cls_id<num_classes; cls_id++)
                    {
                        float score = sigmoid(cls_ptr[cls_id]) * confidence;
                        if(score > threshold)
                        {
                            float cx = (sigmoid(width_ptr[0]) * 2.f - 0.5f + wi) * (float) stride;
                            float cy = (sigmoid(width_ptr[1]) * 2.f - 0.5f + hi) * (float) stride;
                            float w = pow(sigmoid(width_ptr[2]) * 2.f, 2) * anchors[ci].width;
                            float h = pow(sigmoid(width_ptr[3]) * 2.f, 2) * anchors[ci].height;

                            BoxInfo box;

                            box.x1 = std::max(0, std::min(net_size, int((cx - w / 2.f) )));
                            box.y1 = std::max(0, std::min(net_size, int((cy - h / 2.f) )));
                            box.x2 = std::max(0, std::min(net_size, int((cx + w / 2.f) )));
                            box.y2 = std::max(0, std::min(net_size, int((cy + h / 2.f) )));
                            box.score = score;
                            box.label = cls_id;
                            result.push_back(box);
                        }
                    }
                }
            }
        }
    }

    return result;
}

void nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH) {
    std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
    std::vector<float> vArea(input_boxes.size());
    for (int i = 0; i < int(input_boxes.size()); ++i) {
        vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
                   * (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
    }
    for (int i = 0; i < int(input_boxes.size()); ++i) {
        for (int j = i + 1; j < int(input_boxes.size());) {
            float xx1 = std::max(input_boxes[i].x1, input_boxes[j].x1);
            float yy1 = std::max(input_boxes[i].y1, input_boxes[j].y1);
            float xx2 = std::min(input_boxes[i].x2, input_boxes[j].x2);
            float yy2 = std::min(input_boxes[i].y2, input_boxes[j].y2);
            float w = std::max(float(0), xx2 - xx1 + 1);
            float h = std::max(float(0), yy2 - yy1 + 1);
            float inter = w * h;
            float ovr = inter / (vArea[i] + vArea[j] - inter);
            if (ovr >= NMS_THRESH) {
                input_boxes.erase(input_boxes.begin() + j);
                vArea.erase(vArea.begin() + j);
            } else {
                j++;
            }
        }
    }
}
void scale_coords(std::vector<BoxInfo> &boxes, int w_from, int h_from, int w_to, int h_to)
{
    float w_ratio = float(w_to)/float(w_from);
    float h_ratio = float(h_to)/float(h_from);


    for(auto &box: boxes)
    {
        box.x1 *= w_ratio;
        box.x2 *= w_ratio;
        box.y1 *= h_ratio;
        box.y2 *= h_ratio;
    }
    return ;
}

cv::Mat draw_box(cv::Mat & cv_mat, std::vector<BoxInfo> &boxes, const std::vector<std::string> &labels,unsigned char colors[][3])
{

    for(auto box : boxes)
    {
        int width = box.x2-box.x1;
        int height = box.y2-box.y1;
        cv::Point p = cv::Point(box.x1, box.y1);
        cv::Rect rect = cv::Rect(box.x1, box.y1, width, height);
        cv::rectangle(cv_mat, rect, cv::Scalar(colors[box.label][0],colors[box.label][1],colors[box.label][2]));
        string text = labels[box.label] + ":" + std::to_string(box.score) ;
        cv::putText(cv_mat, text, p, cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(colors[box.label][0],colors[box.label][1],colors[box.label][2]));
    }
    return cv_mat;
}


int main(int argc, char const *argv[]) {

    string _xml_path = "/home/ubuntu/yolov7/runs/train/exp3/weights/FP16/best.xml";
    ExecutableNetwork _network;
    OutputsDataMap _outputinfo;
    // parameter section

    double  _cof_threshold = 0.3;
    double  _nms_area_threshold = 0.5;
    Core ie;
    auto cnnNetwork = ie.ReadNetwork(_xml_path);
    // input configuration
    InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
    InputInfo::Ptr& input = inputInfo.begin()->second;
    string _input_name = inputInfo.begin()->first;
    input->setPrecision(Precision::FP32);
    input->getInputData()->setLayout(Layout::NCHW);
    ICNNNetwork::InputShapes inputShapes = cnnNetwork.getInputShapes();
    SizeVector& inSizeVector = inputShapes.begin()->second;
    cnnNetwork.reshape(inputShapes);
    // output configuration
    _outputinfo = OutputsDataMap(cnnNetwork.getOutputsInfo());
    for (auto &output : _outputinfo) {
        output.second->setPrecision(Precision::FP32);
    }
    // build the executable network
    //_network =  ie.LoadNetwork(cnnNetwork, "GPU");
    _network =  ie.LoadNetwork(cnnNetwork, "CPU");

    Mat src = imread("/home/ubuntu/yolov7/inference/images/bus.jpg");
    Mat inframe;
    resize(src, inframe, Size(640, 640));

    auto start = chrono::high_resolution_clock::now();
    if(inframe.empty()){
        cout << "invalid image input" << endl;
        return -1;
    }

    cvtColor(inframe,inframe,COLOR_BGR2RGB);
    size_t img_size = 640*640;
    InferRequest::Ptr infer_request = _network.CreateInferRequestPtr();
    Blob::Ptr frameBlob = infer_request->GetBlob(_input_name);
    InferenceEngine::LockedMemory<void> blobMapped = InferenceEngine::as<InferenceEngine::MemoryBlob>(frameBlob)->wmap();
    float* blob_data = blobMapped.as<float*>();
    //nchw
    for(size_t row =0;row<640;row++){
        for(size_t col=0;col<640;col++){
            for(size_t ch =0;ch<3;ch++){
                blob_data[img_size*ch + row*640 + col] = float(inframe.at<Vec3b>(row,col)[ch])/255.0f;
            }
        }
    }
    // run inference
    infer_request->Infer();
    // fetch each output layer's result
    std::vector<YoloLayerData> yolov7_layers{
            {"554",    32, {{142, 110}, {192, 243}, {459, 401}}},
            {"534",    16, {{36,  75}, {76,  55},  {72,  146}}},
            {"output", 8,  {{12,  16}, {19,  36},  {40,  28}}},
    };

    vector<float> origin_rect_cof;
    std::vector<YoloLayerData> & layers = yolov7_layers;

    std::vector<BoxInfo> result;
    std::vector<BoxInfo> boxes;
    float threshold = 0.3;
    float nms_threshold = 0.7;
    int i=0;
    std::vector<std::string> labels = {
            "person"
    };
    unsigned char colors[][3] = {
            {0, 255, 0}
    };
    for (auto &output : _outputinfo) {
        auto output_name = output.first;
        std::cout<<output_name<<std::endl;
        Blob::Ptr blob = infer_request->GetBlob(yolov7_layers[2-i].name.c_str());
        LockedMemory<const void> blobMapped = as<MemoryBlob>(blob)->rmap();
        const float *output_blob = blobMapped.as<float *>();
        boxes = decode_infer(output_blob, layers[2-i].stride, 640, labels.size(), layers[2-i].anchors, threshold,2-i);
        result.insert(result.begin(), boxes.begin(), boxes.end());

        ++i;
        std::cout<<std::endl;
    }
    nms(result, nms_threshold);
    scale_coords(result, 640, 640, src.cols, src.rows);
    cv::Mat frame_show = draw_box(src, result, labels,colors);
    cv::imshow("out",src);
    cv::imwrite("dp.jpg",src);
    cv::waitKey(0);
    auto end = chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    cout << "use " << diff.count() << " s" << endl;


    return 0;
}

Test result
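
For reference, the manual NCHW fill loop in the C++ above is equivalent to a couple of numpy lines; a sketch (the image path is illustrative):

import cv2
import numpy as np

img = cv2.cvtColor(cv2.imread("bus.jpg"), cv2.COLOR_BGR2RGB)
img = cv2.resize(img, (640, 640))
# HWC uint8 -> NCHW float32 in [0, 1], the same layout the blob loop produces
blob = (img.astype(np.float32) / 255.0).transpose(2, 0, 1)[None]
print(blob.shape)  # (1, 3, 640, 640)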

十一、Converting the model for OAK (OAK-D 4T)

It seems an official version already exists:

depthai_yolo · master · OAK中国_官方 / DepthAI examples · GitCode

But I still wanted to build my own, so let's get to it.

ubuntu@ubuntu:~$ python3 /opt/intel/openvino_2021/deployment_tools/model_optimizer/mo.py --input_model /home/ubuntu/yolov7/runs/train/exp3/weights/best.onnx --output_dir /home/ubuntu/yolov7/runs/train/exp3/weights/F16_OAK --input_shape [1,3,640,640] --data_type FP16 --scale_values [255.0,255.0,255.0] --mean_values [0,0,0]
ubuntu@ubuntu:/opt/intel/openvino_2021/deployment_tools/tools/compile_tool$ ./compile_tool -m /home/ubuntu/yolov7/runs/train/exp3/weights/F16_OAK/best.xml -ip U8 -d MYRIAD -VPU_NUMBER_OF_SHAVES 4 -VPU_NUMBER_OF_CMX_SLICES 4
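
As an aside, the same two-step conversion (mo.py then compile_tool) can also be done in one shot with the blobconverter pip package; a sketch, assuming the package's default online converter is acceptable:

import blobconverter

# One-step ONNX -> MyriadX blob conversion; shaves=4 matches the
# -VPU_NUMBER_OF_SHAVES 4 flag used with compile_tool above.
blob_path = blobconverter.from_onnx(
    model="/home/ubuntu/yolov7/runs/train/exp3/weights/best.onnx",
    data_type="FP16",
    shaves=4,
)
print(blob_path)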

Test code, adapted from the YOLOX example:

from pathlib import Path
import numpy as np
import cv2
import depthai as dai
import time

syncNN = False

SHAPE = 640
labelMap = [
    "person"
]
BOX_THRESH=0.5
NMS_THRESH=0.5
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def xywh2xyxy(x,ratio):
    # Convert [x, y, w, h] to [x1, y1, x2, y2]
    y = np.copy(x)
    y[:, 0] = (x[:, 0] - x[:, 2] / 2 ) /ratio# top left x
    y[:, 1] = (x[:, 1] - x[:, 3] / 2 )/ratio# top left y
    y[:, 2] = (x[:, 0] + x[:, 2] / 2 ) /ratio# bottom right x
    y[:, 3] = (x[:, 1] + x[:, 3] / 2 )/ratio # bottom right y
    return y

def process(input, mask, anchors):

    anchors = [anchors[i] for i in mask]
    grid_h, grid_w = map(int, input.shape[0:2])

    box_confidence = sigmoid(input[..., 4])
    box_confidence = np.expand_dims(box_confidence, axis=-1)

    box_class_probs = sigmoid(input[..., 5:])

    box_xy = sigmoid(input[..., :2])*2 - 0.5

    col = np.tile(np.arange(0, grid_w), grid_w).reshape(-1, grid_w)
    row = np.tile(np.arange(0, grid_h).reshape(-1, 1), grid_h)
    col = col.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2)
    row = row.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2)
    grid = np.concatenate((col, row), axis=-1)
    box_xy += grid
    box_xy *= int(SHAPE/grid_h)

    box_wh = pow(sigmoid(input[..., 2:4])*2, 2)
    box_wh = box_wh * anchors

    box = np.concatenate((box_xy, box_wh), axis=-1)

    return box, box_confidence, box_class_probs

def filter_boxes(boxes, box_confidences, box_class_probs):
    """Filter boxes with box threshold. It's a bit different with origin yolov5 post process!

    # Arguments
        boxes: ndarray, boxes of objects.
        box_confidences: ndarray, confidences of objects.
        box_class_probs: ndarray, class_probs of objects.

    # Returns
        boxes: ndarray, filtered boxes.
        classes: ndarray, classes for boxes.
        scores: ndarray, scores for boxes.
    """
    box_classes = np.argmax(box_class_probs, axis=-1)
    box_class_scores = np.max(box_class_probs, axis=-1)
    pos = np.where(box_confidences[...,0] >= BOX_THRESH)


    boxes = boxes[pos]
    classes = box_classes[pos]
    scores = box_class_scores[pos]

    return boxes, classes, scores

def nms_boxes(boxes, scores):
    """Suppress non-maximal boxes.

    # Arguments
        boxes: ndarray, boxes of objects.
        scores: ndarray, scores of objects.

    # Returns
        keep: ndarray, index of effective boxes.
    """
    x = boxes[:, 0]
    y = boxes[:, 1]
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]

    areas = w * h
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)

        xx1 = np.maximum(x[i], x[order[1:]])
        yy1 = np.maximum(y[i], y[order[1:]])
        xx2 = np.minimum(x[i] + w[i], x[order[1:]] + w[order[1:]])
        yy2 = np.minimum(y[i] + h[i], y[order[1:]] + h[order[1:]])

        w1 = np.maximum(0.0, xx2 - xx1 + 0.00001)
        h1 = np.maximum(0.0, yy2 - yy1 + 0.00001)
        inter = w1 * h1

        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        inds = np.where(ovr <= NMS_THRESH)[0]
        order = order[inds + 1]
    keep = np.array(keep)
    return keep


def yolov7_post_process(input_data,ratio):
    masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
    anchors = [[12,16], [ 19,36], [40,28],
               [36,75], [76,55],[72,146],
               [142,110], [192,243], [459,401]]

    boxes, classes, scores = [], [], []
    for input,mask in zip(input_data, masks):
        b, c, s = process(input, mask, anchors)
        b, c, s = filter_boxes(b, c, s)
        boxes.append(b)
        classes.append(c)
        scores.append(s)

    boxes = np.concatenate(boxes)
    boxes = xywh2xyxy(boxes,ratio)
    classes = np.concatenate(classes)
    scores = np.concatenate(scores)

    nboxes, nclasses, nscores = [], [], []
    for c in set(classes):
        inds = np.where(classes == c)
        b = boxes[inds]
        c = classes[inds]
        s = scores[inds]

        keep = nms_boxes(b, s)

        nboxes.append(b[keep])
        nclasses.append(c[keep])
        nscores.append(s[keep])

    if not nclasses and not nscores:
        return None, None, None

    boxes = np.concatenate(nboxes)
    classes = np.concatenate(nclasses)
    scores = np.concatenate(nscores)

    return boxes, classes, scores

def draw(image, boxes, scores, classes):
    """Draw the boxes on the image.

    # Argument:
        image: original image.
        boxes: ndarray, boxes of objects.
        classes: ndarray, classes of objects.
        scores: ndarray, scores of objects.
        all_classes: all classes name.
    """
    for box, score, cl in zip(boxes, scores, classes):
        top, left, right, bottom = box
        print('class: {}, score: {}'.format(labelMap[cl], score))
        print('box coordinate left,top,right,down: [{}, {}, {}, {}]'.format(top, left, right, bottom))
        top = int(top)
        left = int(left)
        right = int(right)
        bottom = int(bottom)

        cv2.rectangle(image, (top, left), (right, bottom), (255, 0, 0), 2)
        cv2.putText(image, '{0} {1:.2f}'.format(labelMap[cl], score),
                    (top, left - 6),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6, (0, 0, 255), 2)


def preproc(image, input_size, mean, std, swap=(2, 0, 1)):
    if len(image.shape) == 3:
        padded_img = np.ones((input_size[0], input_size[1], 3)) * 114.0
    else:
        padded_img = np.ones(input_size) * 114.0
    img = np.array(image)
    r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
    resized_img = cv2.resize(
        img,
        (int(img.shape[1] * r), int(img.shape[0] * r)),
        interpolation=cv2.INTER_LINEAR,
    ).astype(np.float32)
    padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img

    padded_img = padded_img[:, :, ::-1]
    if mean is not None:
        padded_img -= mean
    if std is not None:
        padded_img /= std
    padded_img = padded_img.transpose(swap)
    padded_img = np.ascontiguousarray(padded_img, dtype=np.float16)
    return padded_img, r




p = dai.Pipeline()
p.setOpenVINOVersion(dai.OpenVINO.VERSION_2021_4)


class FPSHandler:
    def __init__(self, cap=None):
        self.timestamp = time.time()
        self.start = time.time()
        self.frame_cnt = 0

    def next_iter(self):
        self.timestamp = time.time()
        self.frame_cnt += 1

    def fps(self):
        return self.frame_cnt / (self.timestamp - self.start)

camera = p.create(dai.node.ColorCamera)
camera.setPreviewSize(SHAPE, SHAPE)
camera.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
camera.setInterleaved(False)
camera.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)
camera.setFps(60)

nn = p.create(dai.node.NeuralNetwork)
nn.setBlobPath(str(Path("/home/ubuntu/yolov7/runs/train/exp3/weights/F16_OAK/best.blob").resolve().absolute()))
nn.setNumInferenceThreads(2)
nn.input.setBlocking(True)



# Send camera frames to the host
camera_xout = p.create(dai.node.XLinkOut)
camera_xout.setStreamName("camera")
camera.preview.link(camera_xout.input)

# Send converted frames from the host to the NN
nn_xin = p.create(dai.node.XLinkIn)
nn_xin.setStreamName("nnInput")
nn_xin.out.link(nn.input)

# Send bounding boxes from the NN to the host via XLink
nn_xout = p.create(dai.node.XLinkOut)
nn_xout.setStreamName("nn")
nn.out.link(nn_xout.input)


# Pipeline is defined, now we can connect to the device
with dai.Device(p) as device:
    qCamera = device.getOutputQueue(name="camera", maxSize=4, blocking=False)
    qNnInput = device.getInputQueue("nnInput", maxSize=4, blocking=False)
    qNn = device.getOutputQueue(name="nn", maxSize=4, blocking=True)
    fps = FPSHandler()

    while True:
        inRgb = qCamera.get()
        frame = inRgb.getCvFrame()
        frame=cv2.imread("/home/ubuntu/yolov7/inference/images/bus.jpg")  # comment this out to test the live camera feed instead
        # Set these according to your dataset
        mean = (0,0,0)
        std = (1, 1, 1)

        image, ratio = preproc(frame, (SHAPE, SHAPE), mean, std)
        #print(ratio)
        #frame=cv2.resize(frame,(640,640))
        # NOTE: The model expects an FP16 input image, but ImgFrame accepts a list of ints only. I work around this by
        # spreading the FP16 across two ints
        #print(ratio)
        image = list(image)

        dai_frame = dai.ImgFrame()
        dai_frame.setHeight(SHAPE)
        dai_frame.setWidth(SHAPE)
        dai_frame.setData(image)
        qNnInput.send(dai_frame)

        if syncNN:
            in_nn = qNn.get()
        else:
            in_nn = qNn.tryGet()
        if in_nn is not None:
            fps.next_iter()
            cv2.putText(frame, "Fps: {:.2f}".format(fps.fps()), (2, SHAPE - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color=(255, 255, 255))
            layers = in_nn.getAllLayers()
            for item in layers:
                print(item.name,item.dims)
            data_output = np.array(in_nn.getLayerFp16('output')).reshape(1, 3, 80, 80, len(labelMap)+5)
            data_534 = np.array(in_nn.getLayerFp16('534')).reshape(1, 3, 40, 40, len(labelMap)+5)
            data_554 = np.array(in_nn.getLayerFp16('554')).reshape(1, 3, 20, 20, len(labelMap)+5)
            outputs=[]
            outputs.append(data_output)
            outputs.append(data_534)
            outputs.append(data_554)
            # post process
            input0_data = outputs[0]
            input1_data = outputs[1]
            input2_data = outputs[2]

            input0_data=np.transpose(input0_data, (0, 1, 4, 2,3))
            input1_data=np.transpose(input1_data, (0, 1, 4, 2,3))
            input2_data =np.transpose(input2_data, (0, 1, 4, 2,3))

            input0_data = input0_data.reshape([3, -1] + list(input0_data.shape[-2:]))
            input1_data = input1_data.reshape([3, -1] + list(input1_data.shape[-2:]))
            input2_data = input2_data.reshape([3, -1] + list(input2_data.shape[-2:]))

            print(input0_data.shape)
            print(input1_data.shape)
            print(input2_data.shape)

            input_data = list()
            input_data.append(np.transpose(input0_data, (2, 3, 0, 1)))
            input_data.append(np.transpose(input1_data, (2, 3, 0, 1)))
            input_data.append(np.transpose(input2_data, (2, 3, 0, 1)))

            print(np.transpose(input0_data, (2, 3, 0, 1)).shape)
            print(np.transpose(input1_data, (2, 3, 0, 1)).shape)
            print(np.transpose(input2_data, (2, 3, 0, 1)).shape)

            boxes, classes, scores = yolov7_post_process(input_data,ratio)

            if boxes is not None:
                draw(frame, boxes, scores, classes)

            cv2.imshow("rgb", frame)
            if cv2.waitKey(1) == ord('q'):
                break

Test result
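
About the NOTE in the script above (dai.ImgFrame.setData wants bytes/ints while the model expects FP16 input): the trick is that every FP16 value is just two bytes, so the buffer can be reinterpreted before sending. A numpy sketch of that idea:

import numpy as np

fp16 = np.random.rand(3, 640, 640).astype(np.float16)  # planar FP16 input
raw = fp16.flatten().view(np.uint8)                    # each FP16 value becomes 2 bytes
print(fp16.nbytes, raw.shape)                          # 2457600 bytes either way
# dai_frame.setData(raw) would then ship the buffer to the device unchanged.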

C++ OAK

cmakelists.txt

cmake_minimum_required(VERSION 3.16)
project(depthai)
set(CMAKE_CXX_STANDARD 11)
find_package(OpenCV REQUIRED)
#message(STATUS ${OpenCV_INCLUDE_DIRS})
# add header directories
include_directories(${OpenCV_INCLUDE_DIRS})
include_directories(${CMAKE_SOURCE_DIR}/include/utility)
# OpenCV and depthai libraries
find_package(depthai CONFIG REQUIRED)
add_executable(depthai main.cpp include/utility/utility.cpp)
target_link_libraries(depthai ${OpenCV_LIBS}  depthai::opencv)

main.cpp


#include <stdio.h>
#include <string>
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#include "utility.hpp"
#include <vector>
#include "depthai/depthai.hpp"
using namespace std;
using namespace std::chrono;
using namespace cv;
typedef struct {
    int width;
    int height;
} YoloSize;


typedef struct {
    std::string name;
    int stride;
    std::vector<YoloSize> anchors;
} YoloLayerData;

class BoxInfo
{
public:
    int x1,y1,x2,y2,label,id;
    float score;
};
static inline float sigmoid(float x)
{
    return static_cast<float>(1.f / (1.f + exp(-x)));
}
double GetIOU(cv::Rect_<float> bb_test, cv::Rect_<float> bb_gt)
{
    float in = (bb_test & bb_gt).area();
    float un = bb_test.area() + bb_gt.area() - in;

    if (un < DBL_EPSILON)
        return 0;

    return (double)(in / un);
}
std::vector<BoxInfo> decode_infer(vector<float> data_pt, int stride,  int net_size, int num_classes,
                                  const std::vector<YoloSize> &anchors, float threshold,int idx)
{


    const int s[3]={20,40,80};
    std::vector<BoxInfo> result;
    int batchs, channels, height, width, pred_item ;
    batchs = 1;
    channels = 3;
    height = s[idx];
    width =  s[idx];
    pred_item = 6;
    // read directly from the vector instead of copying into a non-standard
    // (and ~460 KB) variable-length stack array
    const float *data_ptr = data_pt.data();
    std::cout<<batchs*channels*height*width*pred_item<<std::endl<<data_pt.size()<<std::endl;
    std::cout<<batchs<<" "<<channels<<" "<<height<<" "<<width<<" "<<pred_item<<" "<<std::endl;

    for(int i=0;i<20;i++){
        std::cout<<data_ptr[i]<<" ";
    }
    std::cout<<std::endl;
    for(int bi=0; bi<batchs; bi++)
    {
        auto batch_ptr = data_ptr + bi*(channels*height*width*pred_item);
        for(int ci=0; ci<channels; ci++)
        {
            auto channel_ptr = batch_ptr + ci*(height*width*pred_item);
            for(int hi=0; hi<height; hi++)
            {
                auto height_ptr = channel_ptr + hi*(width * pred_item);
                for(int wi=0; wi<width; wi++)
                {
                    auto width_ptr = height_ptr + wi*pred_item;
                    auto cls_ptr = width_ptr + 5;

                    auto confidence = sigmoid(width_ptr[4]);

                    for(int cls_id=0; cls_id<num_classes; cls_id++)
                    {
                        float score = sigmoid(cls_ptr[cls_id]) * confidence;
                        if(score > threshold)
                        {
                            float cx = (sigmoid(width_ptr[0]) * 2.f - 0.5f + wi) * (float) stride;
                            float cy = (sigmoid(width_ptr[1]) * 2.f - 0.5f + hi) * (float) stride;
                            float w = pow(sigmoid(width_ptr[2]) * 2.f, 2) * anchors[ci].width;
                            float h = pow(sigmoid(width_ptr[3]) * 2.f, 2) * anchors[ci].height;

                            BoxInfo box;

                            box.x1 = std::max(0, std::min(net_size, int((cx - w / 2.f) )));
                            box.y1 = std::max(0, std::min(net_size, int((cy - h / 2.f) )));
                            box.x2 = std::max(0, std::min(net_size, int((cx + w / 2.f) )));
                            box.y2 = std::max(0, std::min(net_size, int((cy + h / 2.f) )));
                            box.score = score;
                            box.label = cls_id;
                            result.push_back(box);
                        }
                    }
                }
            }
        }
    }

    return result;
}

void nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH) {
    std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
    std::vector<float> vArea(input_boxes.size());
    for (int i = 0; i < int(input_boxes.size()); ++i) {
        vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
                   * (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
    }
    for (int i = 0; i < int(input_boxes.size()); ++i) {
        for (int j = i + 1; j < int(input_boxes.size());) {
            float xx1 = std::max(input_boxes[i].x1, input_boxes[j].x1);
            float yy1 = std::max(input_boxes[i].y1, input_boxes[j].y1);
            float xx2 = std::min(input_boxes[i].x2, input_boxes[j].x2);
            float yy2 = std::min(input_boxes[i].y2, input_boxes[j].y2);
            float w = std::max(float(0), xx2 - xx1 + 1);
            float h = std::max(float(0), yy2 - yy1 + 1);
            float inter = w * h;
            float ovr = inter / (vArea[i] + vArea[j] - inter);
            if (ovr >= NMS_THRESH) {
                input_boxes.erase(input_boxes.begin() + j);
                vArea.erase(vArea.begin() + j);
            } else {
                j++;
            }
        }
    }
}
void scale_coords(std::vector<BoxInfo> &boxes, int w_from, int h_from, int w_to, int h_to)
{
    float w_ratio = float(w_to)/float(w_from);
    float h_ratio = float(h_to)/float(h_from);


    for(auto &box: boxes)
    {
        box.x1 *= w_ratio;
        box.x2 *= w_ratio;
        box.y1 *= h_ratio;
        box.y2 *= h_ratio;
    }
    return ;
}

cv::Mat draw_box(cv::Mat & cv_mat, std::vector<BoxInfo> &boxes, const std::vector<std::string> &labels,unsigned char colors[][3])
{

    for(auto box : boxes)
    {
        int width = box.x2-box.x1;
        int height = box.y2-box.y1;
        cv::Point p = cv::Point(box.x1, box.y1);
        cv::Rect rect = cv::Rect(box.x1, box.y1, width, height);
        cv::rectangle(cv_mat, rect, cv::Scalar(colors[box.label][0],colors[box.label][1],colors[box.label][2]));
        string text = labels[box.label] + ":" + std::to_string(box.score) ;
        cv::putText(cv_mat, text, p, cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(colors[box.label][0],colors[box.label][1],colors[box.label][2]));
    }
    return cv_mat;
}



int main(int argc, char **argv) {
    std::vector<YoloLayerData> yolov7_layers{
            {"554",    32, {{142, 110}, {192, 243}, {459, 401}}},
            {"534",    16, {{36,  75}, {76,  55},  {72,  146}}},
            {"output", 8,  {{12,  16}, {19,  36},  {40,  28}}},
    };

    vector<float> origin_rect_cof;
    std::vector<YoloLayerData> & layers = yolov7_layers;


    float threshold = 0.3;
    float nms_threshold = 0.7;
    int i=0;
    std::vector<std::string> labels = {
            "person"
    };
    unsigned char colors[][3] = {
            {0, 255, 0}
    };

    int target_width=640;
    int target_height=640;
    dai::Pipeline pipeline;
    //define the host input node
    auto cam = pipeline.create<dai::node::XLinkIn>();
    cam->setStreamName("inFrame");
    auto net = pipeline.create<dai::node::NeuralNetwork>();
    dai::OpenVINO::Blob blob("/home/ubuntu/yolov7/runs/train/exp3/weights/F16_OAK/best.blob");
    net->setBlob(blob);
    net->input.setBlocking(false);

    //getting fairly comfortable with the OAK API by now
    cam->out.link(net->input);



    //define the output nodes
    auto xlinkParserOut = pipeline.create<dai::node::XLinkOut>();
    xlinkParserOut->setStreamName("parseOut");
    auto xlinkoutOut = pipeline.create<dai::node::XLinkOut>();
    xlinkoutOut->setStreamName("out");

    auto xlinkoutpassthroughOut = pipeline.create<dai::node::XLinkOut>();

    xlinkoutpassthroughOut->setStreamName("passthrough");


    net->out.link(xlinkParserOut->input);

    net->passthrough.link(xlinkoutpassthroughOut->input);

    //upload the pipeline to the device
    dai::Device device(pipeline);
    //fetch frames for display
    auto inqueue = device.getInputQueue("inFrame");//maxSize = queue buffer depth
    auto detqueue = device.getOutputQueue("parseOut", 8, false);//maxSize = queue buffer depth

    bool printOutputLayersOnce=true;



    while(true) {
        cv::Mat frame=cv::imread("/home/ubuntu/yolov7/inference/images/bus.jpg");
        if(frame.empty()) break;
        cv::Mat bgr;
        cv::cvtColor(frame,bgr,cv::COLOR_BGR2RGB);
        cv::resize(bgr,bgr,cv::Size(target_width,target_height));
        auto img = std::make_shared<dai::ImgFrame>();
//bgr = resizeKeepAspectRatio(bgr, cv::Size(target_height, target_width), cv::Scalar(0));
        toPlanar(bgr, img->getData());
        img->setTimestamp(steady_clock::now());
        img->setWidth(target_width);
        img->setHeight(target_height);
        inqueue->send(img);

        auto inNN = detqueue->get<dai::NNData>();
        if( printOutputLayersOnce&&inNN) {
            std::cout << "Output layer names: ";
            for(const auto& ten : inNN->getAllLayerNames()) {
                std::cout << ten << ", ";
            }
            std::cout << std::endl;
            printOutputLayersOnce = false;
        }

        std::vector<BoxInfo> result;
        std::vector<BoxInfo> boxes;
        auto output=inNN->getLayerFp16(inNN->getAllLayerNames()[2]);
        boxes = decode_infer(output, layers[2].stride, 640, labels.size(), layers[2].anchors, threshold,2);
        result.insert(result.begin(), boxes.begin(), boxes.end());


        auto output_534=inNN->getLayerFp16(inNN->getAllLayerNames()[0]);
        boxes = decode_infer(output_534, layers[1].stride, 640, labels.size(), layers[1].anchors, threshold,1);
        result.insert(result.begin(), boxes.begin(), boxes.end());

        auto output_554=inNN->getLayerFp16(inNN->getAllLayerNames()[1]);
        boxes = decode_infer(output_554, layers[0].stride, 640, labels.size(), layers[0].anchors, threshold,0);
        result.insert(result.begin(), boxes.begin(), boxes.end());
        nms(result, nms_threshold);
        scale_coords(result, 640, 640, frame.cols, frame.rows);
        cv::Mat frame_show = draw_box(frame, result, labels,colors);

        cv::imshow("demo", frame_show);

        int key = cv::waitKey(1);
        if(key == 'q' || key == 'Q') return 0;
    }


    return 0;
}
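For reference, decode_infer implements the standard YOLOv7 anchor decode: cx = (2*sigmoid(tx) - 0.5 + grid_x) * stride, cy = (2*sigmoid(ty) - 0.5 + grid_y) * stride, w = (2*sigmoid(tw))^2 * anchor_w, h = (2*sigmoid(th))^2 * anchor_h. One fragile spot in the listing above is that the three output tensors are fetched by their position in getAllLayerNames(), whose order is not guaranteed across blobs. A minimal sketch (my own refactor, not from the original code) that fetches each tensor by the layer name already recorded in yolov7_layers:

// Sketch: look up each output tensor by name instead of by index order.
// Assumes the names in yolov7_layers ("554", "534", "output") match the blob.
std::vector<BoxInfo> decode_all(const std::shared_ptr<dai::NNData> &inNN,
                                const std::vector<YoloLayerData> &layers,
                                int num_classes, float threshold) {
    std::vector<BoxInfo> result;
    // layers[0..2] are stride 32/16/8; idx selects the 20/40/80 grid in decode_infer
    for (int idx = 0; idx < (int)layers.size(); idx++) {
        auto output = inNN->getLayerFp16(layers[idx].name); // by name, not index
        auto boxes = decode_infer(output, layers[idx].stride, 640, num_classes,
                                  layers[idx].anchors, threshold, idx);
        result.insert(result.end(), boxes.begin(), boxes.end());
    }
    return result;
}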

Test results

 

YOLOv7-tiny runs at around 13 fps.
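In the still-image test above, the input is stretched to 640x640 with cv::resize; the commented-out resizeKeepAspectRatio call hints at the letterbox alternative. A minimal sketch of that approach (the helper names letterbox/unletterbox are mine), together with the inverse mapping that would replace the plain scale_coords stretch:

// Hedged sketch, not from the post: pad-resize ("letterbox") a frame to 640x640
// so the aspect ratio is preserved, and map decoded boxes back afterwards.
#include <opencv2/imgproc.hpp>

struct Letterbox { float scale; int pad_x, pad_y; };

Letterbox letterbox(const cv::Mat &src, cv::Mat &dst, int size = 640) {
    float scale = std::min(size / (float)src.cols, size / (float)src.rows);
    int w = int(src.cols * scale), h = int(src.rows * scale);
    cv::Mat resized;
    cv::resize(src, resized, cv::Size(w, h));
    dst = cv::Mat::zeros(size, size, src.type());      // black padding
    int pad_x = (size - w) / 2, pad_y = (size - h) / 2;
    resized.copyTo(dst(cv::Rect(pad_x, pad_y, w, h)));
    return {scale, pad_x, pad_y};
}

// Inverse mapping: replaces scale_coords when letterboxing was used.
void unletterbox(BoxInfo &box, const Letterbox &lb) {
    box.x1 = int((box.x1 - lb.pad_x) / lb.scale);
    box.y1 = int((box.y1 - lb.pad_y) / lb.scale);
    box.x2 = int((box.x2 - lb.pad_x) / lb.scale);
    box.y2 = int((box.y2 - lb.pad_y) / lb.scale);
}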


#include <stdio.h>
#include <string>
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#include "utility.hpp"
#include <vector>
#include "depthai/depthai.hpp"

using namespace std;
using namespace std::chrono;
using namespace cv;
typedef struct {
    int width;
    int height;
} YoloSize;


typedef struct {
    std::string name;
    int stride;
    std::vector<YoloSize> anchors;
} YoloLayerData;

class BoxInfo {
public:
    int x1, y1, x2, y2, label, id;
    float score;
};

static inline float sigmoid(float x) {
    return static_cast<float>(1.f / (1.f + exp(-x)));
}

double GetIOU(cv::Rect_<float> bb_test, cv::Rect_<float> bb_gt) {
    float in = (bb_test & bb_gt).area();
    float un = bb_test.area() + bb_gt.area() - in;

    if (un < DBL_EPSILON)
        return 0;

    return (double) (in / un);
}

std::vector<BoxInfo> decode_infer(vector<float> data_pt, int stride, int net_size, int num_classes,
                                  const std::vector<YoloSize> &anchors, float threshold, int idx) {


    const int s[3] = {20, 40, 80};
    std::vector<BoxInfo> result;
    int batchs, channels, height, width, pred_item;
    batchs = 1;
    channels = 3;
    height = s[idx];
    width = s[idx];
    pred_item = 6;
    float data_ptr[batchs * channels * height * width * pred_item];
    std::cout << batchs * channels * height * width * pred_item << std::endl << data_pt.size() << std::endl;
    for (int i = 0; i < data_pt.size(); i++) {
        data_ptr[i] = data_pt[i];
    }
    std::cout << batchs << " " << channels << " " << height << " " << width << " " << pred_item << " " << std::endl;

    for (int i = 0; i < 20; i++) {
        std::cout << data_ptr[i] << " ";
    }
    std::cout << std::endl;
    for (int bi = 0; bi < batchs; bi++) {
        auto batch_ptr = data_ptr + bi * (channels * height * width * pred_item);
        for (int ci = 0; ci < channels; ci++) {
            auto channel_ptr = batch_ptr + ci * (height * width * pred_item);
            for (int hi = 0; hi < height; hi++) {
                auto height_ptr = channel_ptr + hi * (width * pred_item);
                for (int wi = 0; wi < width; wi++) {
                    auto width_ptr = height_ptr + wi * pred_item;
                    auto cls_ptr = width_ptr + 5;

                    auto confidence = sigmoid(width_ptr[4]);

                    for (int cls_id = 0; cls_id < num_classes; cls_id++) {
                        float score = sigmoid(cls_ptr[cls_id]) * confidence;
                        if (score > threshold) {
                            float cx = (sigmoid(width_ptr[0]) * 2.f - 0.5f + wi) * (float) stride;
                            float cy = (sigmoid(width_ptr[1]) * 2.f - 0.5f + hi) * (float) stride;
                            float w = pow(sigmoid(width_ptr[2]) * 2.f, 2) * anchors[ci].width;
                            float h = pow(sigmoid(width_ptr[3]) * 2.f, 2) * anchors[ci].height;

                            BoxInfo box;

                            box.x1 = std::max(0, std::min(net_size, int((cx - w / 2.f))));
                            box.y1 = std::max(0, std::min(net_size, int((cy - h / 2.f))));
                            box.x2 = std::max(0, std::min(net_size, int((cx + w / 2.f))));
                            box.y2 = std::max(0, std::min(net_size, int((cy + h / 2.f))));
                            box.score = score;
                            box.label = cls_id;
                            result.push_back(box);
                        }
                    }
                }
            }
        }
    }

    return result;
}

void nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH) {
    std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
    std::vector<float> vArea(input_boxes.size());
    for (int i = 0; i < int(input_boxes.size()); ++i) {
        vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
                   * (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
    }
    for (int i = 0; i < int(input_boxes.size()); ++i) {
        for (int j = i + 1; j < int(input_boxes.size());) {
            float xx1 = std::max(input_boxes[i].x1, input_boxes[j].x1);
            float yy1 = std::max(input_boxes[i].y1, input_boxes[j].y1);
            float xx2 = std::min(input_boxes[i].x2, input_boxes[j].x2);
            float yy2 = std::min(input_boxes[i].y2, input_boxes[j].y2);
            float w = std::max(float(0), xx2 - xx1 + 1);
            float h = std::max(float(0), yy2 - yy1 + 1);
            float inter = w * h;
            float ovr = inter / (vArea[i] + vArea[j] - inter);
            if (ovr >= NMS_THRESH) {
                input_boxes.erase(input_boxes.begin() + j);
                vArea.erase(vArea.begin() + j);
            } else {
                j++;
            }
        }
    }
}

void scale_coords(std::vector<BoxInfo> &boxes, int w_from, int h_from, int w_to, int h_to) {
    float w_ratio = float(w_to) / float(w_from);
    float h_ratio = float(h_to) / float(h_from);


    for (auto &box: boxes) {
        box.x1 *= w_ratio;
        box.x2 *= w_ratio;
        box.y1 *= h_ratio;
        box.y2 *= h_ratio;
    }
    return;
}

cv::Mat draw_box(cv::Mat &cv_mat, std::vector<BoxInfo> &boxes, const std::vector<std::string> &labels,
                 unsigned char colors[][3]) {

    for (auto box : boxes) {
        int width = box.x2 - box.x1;
        int height = box.y2 - box.y1;
        cv::Point p = cv::Point(box.x1, box.y1);
        cv::Rect rect = cv::Rect(box.x1, box.y1, width, height);
        cv::rectangle(cv_mat, rect, cv::Scalar(colors[box.label][0], colors[box.label][1], colors[box.label][2]));
        string text = labels[box.label] + ":" + std::to_string(box.score);
        cv::putText(cv_mat, text, p, cv::FONT_HERSHEY_PLAIN, 1,
                    cv::Scalar(colors[box.label][0], colors[box.label][1], colors[box.label][2]));
    }
    return cv_mat;
}


int main(int argc, char **argv) {
    std::vector<YoloLayerData> yolov7_layers{
            {"554",    32, {{116, 90}, {156, 198}, {373, 326}}},
            {"534",    16, {{30,  61}, {62,  45},  {59,  119}}},
            {"output", 8,  {{12,  16}, {16,  30},  {33,  23}}},
    };

    vector<float> origin_rect_cof;
    std::vector<YoloLayerData> &layers = yolov7_layers;


    float threshold = 0.4;
    float nms_threshold = 0.5;
    int i = 0;
    std::vector<std::string> labels = {
            "person"
    };
    unsigned char colors[][3] = {
            {0, 255, 0}
    };

    int target_width = 640;
    int target_height = 640;
    dai::Pipeline pipeline;
    auto camRgb = pipeline.create<dai::node::ColorCamera>();
    camRgb->setPreviewSize(640, 640);
    camRgb->setBoardSocket(dai::CameraBoardSocket::RGB);
    camRgb->setResolution(dai::ColorCameraProperties::SensorResolution::THE_1080_P);
    camRgb->setInterleaved(false);
    camRgb->setColorOrder(dai::ColorCameraProperties::ColorOrder::RGB);
    camRgb->setFps(60);


    //define the network node
    //auto cam = pipeline.create<dai::node::XLinkIn>();
    //camRgb->setStreamName("inFrame");
    auto net = pipeline.create<dai::node::NeuralNetwork>();
    dai::OpenVINO::Blob blob("/home/ubuntu/experiment_yolov7/exp7_YOLOV7Tiny_rknn/weights/F16_OAK/best.blob");
    net->setBlob(blob);
    net->input.setBlocking(false);

    //getting fairly comfortable with the OAK API by now
    camRgb->preview.link(net->input);



    //define the output nodes
    auto xlinkParserOut = pipeline.create<dai::node::XLinkOut>();
    xlinkParserOut->setStreamName("parseOut");
    auto xlinkoutOut = pipeline.create<dai::node::XLinkOut>();
    xlinkoutOut->setStreamName("out");

    auto xlinkoutpassthroughOut = pipeline.create<dai::node::XLinkOut>();

    xlinkoutpassthroughOut->setStreamName("passthrough");


    net->out.link(xlinkParserOut->input);

    net->passthrough.link(xlinkoutpassthroughOut->input);

    //upload the pipeline to the device
    dai::Device device(pipeline);
    //fetch frames for display
    // auto inqueue = device.getInputQueue("inFrame");//maxSize = queue buffer depth
    auto detqueue = device.getOutputQueue("parseOut", 8, false);//maxSize = queue buffer depth
    auto passthrough = device.getOutputQueue("passthrough", 8, false);//maxSize = queue buffer depth

    bool printOutputLayersOnce = true;



    auto startTime = steady_clock::now();
    int counter = 0;
    float fps = 0;
    while (true) {


        auto ImgFrame = passthrough->get<dai::ImgFrame>();
        auto frame = ImgFrame->getCvFrame();

        auto inNN = detqueue->get<dai::NNData>();
        if (printOutputLayersOnce && inNN) {
            std::cout << "Output layer names: ";
            for (const auto &ten : inNN->getAllLayerNames()) {
                std::cout << ten << ", ";
            }
            std::cout << std::endl;
            printOutputLayersOnce = false;
        }

        std::vector<BoxInfo> result;
        std::vector<BoxInfo> boxes;
        auto output = inNN->getLayerFp16(inNN->getAllLayerNames()[2]);
        boxes = decode_infer(output, layers[2].stride, 640, labels.size(), layers[2].anchors, threshold, 2);
        result.insert(result.begin(), boxes.begin(), boxes.end());


        auto output_534 = inNN->getLayerFp16(inNN->getAllLayerNames()[0]);
        boxes = decode_infer(output_534, layers[1].stride, 640, labels.size(), layers[1].anchors, threshold, 1);
        result.insert(result.begin(), boxes.begin(), boxes.end());

        auto output_554 = inNN->getLayerFp16(inNN->getAllLayerNames()[1]);
        boxes = decode_infer(output_554, layers[0].stride, 640, labels.size(), layers[0].anchors, threshold, 0);
        result.insert(result.begin(), boxes.begin(), boxes.end());
        nms(result, nms_threshold);
        scale_coords(result, 640, 640, frame.cols, frame.rows);
        cv::Mat frame_show = draw_box(frame, result, labels, colors);

        counter++;
        auto currentTime = steady_clock::now();
        auto elapsed = duration_cast<duration<float>>(currentTime - startTime);
        if (elapsed > seconds(1)) {
            fps = counter / elapsed.count();
            counter = 0;
            startTime = currentTime;
        }


        std::stringstream fpsStr;
        fpsStr << "NN fps: " << std::fixed << std::setprecision(2) << fps;
        cv::putText(frame, fpsStr.str(), cv::Point(2, frame_show.rows - 4), cv::FONT_HERSHEY_TRIPLEX, 0.4,
                    cv::Scalar(255, 255, 255));


        cv::imshow("demo", frame_show);

        int key = cv::waitKey(1);
        if (key == 'q' || key == 'Q') return 0;
        cv::waitKey(1);


    }


    return 0;
}
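Worth noting: depthai also ships a device-side YOLO decoder node, which would remove decode_infer and nms from the host entirely. A hedged sketch using the anchor table from the listing above; whether the exported blob is compatible with the built-in parser has to be verified, which is why this post decodes the raw NeuralNetwork outputs on the host instead:

// Sketch: DepthAI's built-in device-side YOLO decoder (detections then arrive
// as dai::ImgDetections). Anchor masks use the side80/side40/side20 convention
// for a 640x640 input; values taken from yolov7_layers above.
auto yoloNet = pipeline.create<dai::node::YoloDetectionNetwork>();
yoloNet->setBlobPath("best.blob");
yoloNet->setNumClasses(1);                 // "person"
yoloNet->setCoordinateSize(4);
yoloNet->setConfidenceThreshold(0.4f);
yoloNet->setIouThreshold(0.5f);
yoloNet->setAnchors({12,16, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326});
yoloNet->setAnchorMasks({{"side80", {0,1,2}}, {"side40", {3,4,5}}, {"side20", {6,7,8}}});
camRgb->preview.link(yoloNet->input);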

Next, a modified version: yolov7-tiny head detection combined with depth measurement.


#include <stdio.h>
#include <string>
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#include "utility.hpp"
#include <vector>
#include "depthai/depthai.hpp"

static std::atomic<bool> newConfig{false};
using namespace std;
using namespace std::chrono;
using namespace cv;
typedef struct {
    int width;
    int height;
} YoloSize;


typedef struct {
    std::string name;
    int stride;
    std::vector<YoloSize> anchors;
} YoloLayerData;

class BoxInfo {
public:
    int x1, y1, x2, y2, label, id;
    float score;
};

static inline float sigmoid(float x) {
    return static_cast<float>(1.f / (1.f + exp(-x)));
}

double GetIOU(cv::Rect_<float> bb_test, cv::Rect_<float> bb_gt) {
    float in = (bb_test & bb_gt).area();
    float un = bb_test.area() + bb_gt.area() - in;

    if (un < DBL_EPSILON)
        return 0;

    return (double) (in / un);
}

std::vector<BoxInfo> decode_infer(vector<float> data_pt, int stride, int net_size, int num_classes,
                                  const std::vector<YoloSize> &anchors, float threshold, int idx) {


    const int s[3] = {20, 40, 80};
    std::vector<BoxInfo> result;
    int batchs, channels, height, width, pred_item;
    batchs = 1;
    channels = 3;
    height = s[idx];
    width = s[idx];
    pred_item = 6;
    float data_ptr[batchs * channels * height * width * pred_item];
    //std::cout << batchs * channels * height * width * pred_item << std::endl << data_pt.size() << std::endl;
    for (int i = 0; i < data_pt.size(); i++) {
        data_ptr[i] = data_pt[i];
    }
    //std::cout << batchs << " " << channels << " " << height << " " << width << " " << pred_item << " " << std::endl;

//    for (int i = 0; i < 20; i++) {
//        std::cout << data_ptr[i] << " ";
//    }
//    std::cout << std::endl;
    for (int bi = 0; bi < batchs; bi++) {
        auto batch_ptr = data_ptr + bi * (channels * height * width * pred_item);
        for (int ci = 0; ci < channels; ci++) {
            auto channel_ptr = batch_ptr + ci * (height * width * pred_item);
            for (int hi = 0; hi < height; hi++) {
                auto height_ptr = channel_ptr + hi * (width * pred_item);
                for (int wi = 0; wi < width; wi++) {
                    auto width_ptr = height_ptr + wi * pred_item;
                    auto cls_ptr = width_ptr + 5;

                    auto confidence = sigmoid(width_ptr[4]);

                    for (int cls_id = 0; cls_id < num_classes; cls_id++) {
                        float score = sigmoid(cls_ptr[cls_id]) * confidence;
                        if (score > threshold) {
                            float cx = (sigmoid(width_ptr[0]) * 2.f - 0.5f + wi) * (float) stride;
                            float cy = (sigmoid(width_ptr[1]) * 2.f - 0.5f + hi) * (float) stride;
                            float w = pow(sigmoid(width_ptr[2]) * 2.f, 2) * anchors[ci].width;
                            float h = pow(sigmoid(width_ptr[3]) * 2.f, 2) * anchors[ci].height;

                            BoxInfo box;

                            box.x1 = std::max(0, std::min(net_size, int((cx - w / 2.f))));
                            box.y1 = std::max(0, std::min(net_size, int((cy - h / 2.f))));
                            box.x2 = std::max(0, std::min(net_size, int((cx + w / 2.f))));
                            box.y2 = std::max(0, std::min(net_size, int((cy + h / 2.f))));
                            box.score = score;
                            box.label = cls_id;
                            result.push_back(box);
                        }
                    }
                }
            }
        }
    }

    return result;
}

void nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH) {
    std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
    std::vector<float> vArea(input_boxes.size());
    for (int i = 0; i < int(input_boxes.size()); ++i) {
        vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
                   * (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
    }
    for (int i = 0; i < int(input_boxes.size()); ++i) {
        for (int j = i + 1; j < int(input_boxes.size());) {
            float xx1 = std::max(input_boxes[i].x1, input_boxes[j].x1);
            float yy1 = std::max(input_boxes[i].y1, input_boxes[j].y1);
            float xx2 = std::min(input_boxes[i].x2, input_boxes[j].x2);
            float yy2 = std::min(input_boxes[i].y2, input_boxes[j].y2);
            float w = std::max(float(0), xx2 - xx1 + 1);
            float h = std::max(float(0), yy2 - yy1 + 1);
            float inter = w * h;
            float ovr = inter / (vArea[i] + vArea[j] - inter);
            if (ovr >= NMS_THRESH) {
                input_boxes.erase(input_boxes.begin() + j);
                vArea.erase(vArea.begin() + j);
            } else {
                j++;
            }
        }
    }
}

void scale_coords(std::vector<BoxInfo> &boxes, int w_from, int h_from, int w_to, int h_to) {
    float w_ratio = float(w_to) / float(w_from);
    float h_ratio = float(h_to) / float(h_from);


    for (auto &box: boxes) {
        box.x1 *= w_ratio;
        box.x2 *= w_ratio;
        box.y1 *= h_ratio;
        box.y2 *= h_ratio;
    }
    return;
}

cv::Mat draw_box(cv::Mat &cv_mat, std::vector<BoxInfo> &boxes, const std::vector<std::string> &labels,
                 unsigned char colors[][3]) {

    for (auto box : boxes) {
        int width = box.x2 - box.x1;
        int height = box.y2 - box.y1;
        cv::Point p = cv::Point(box.x1, box.y1);
        cv::Rect rect = cv::Rect(box.x1, box.y1, width, height);
        cv::rectangle(cv_mat, rect, cv::Scalar(colors[box.label][0], colors[box.label][1], colors[box.label][2]));
        string text = labels[box.label] + ":" + std::to_string(box.score);
        cv::putText(cv_mat, text, p, cv::FONT_HERSHEY_PLAIN, 1,
                    cv::Scalar(colors[box.label][0], colors[box.label][1], colors[box.label][2]));
    }
    return cv_mat;
}


int main(int argc, char **argv) {
    std::vector<YoloLayerData> yolov7_layers{
            {"554",    32, {{116, 90}, {156, 198}, {373, 326}}},
            {"534",    16, {{30,  61}, {62,  45},  {59,  119}}},
            {"output", 8,  {{12,  16}, {16,  30},  {33,  23}}},
    };

    vector<float> origin_rect_cof;
    std::vector<YoloLayerData> &layers = yolov7_layers;


    float threshold = 0.4;
    float nms_threshold = 0.5;
    int i = 0;
    std::vector<std::string> labels = {
            "head"
    };
    unsigned char colors[][3] = {
            {0, 255, 0}
    };

    float target_width = 640;
    float target_height = 640;
    dai::Pipeline pipeline;
    auto camRgb = pipeline.create<dai::node::ColorCamera>();
    auto rgbOut=pipeline.create<dai::node::XLinkOut>();
    rgbOut->setStreamName("rgbOut");
    camRgb->video.link(rgbOut->input);
    camRgb->setPreviewSize(640, 640);
    camRgb->setBoardSocket(dai::CameraBoardSocket::RGB);
    camRgb->setResolution(dai::ColorCameraProperties::SensorResolution::THE_1080_P);
    camRgb->setInterleaved(false);
    camRgb->setColorOrder(dai::ColorCameraProperties::ColorOrder::RGB);
    camRgb->setFps(60);
    camRgb->setPreviewKeepAspectRatio(false);
    auto monoLeft = pipeline.create<dai::node::MonoCamera>();
    auto monoRight = pipeline.create<dai::node::MonoCamera>();

    auto stereo = pipeline.create<dai::node::StereoDepth>();
    auto spatialDataCalculator = pipeline.create<dai::node::SpatialLocationCalculator>();



    //define the network node
    //auto cam = pipeline.create<dai::node::XLinkIn>();
    //camRgb->setStreamName("inFrame");
    auto net = pipeline.create<dai::node::NeuralNetwork>();
    dai::OpenVINO::Blob blob("../best.blob");
    net->setBlob(blob);
    net->input.setBlocking(false);

    //getting fairly comfortable with the OAK API by now
    camRgb->preview.link(net->input);
    auto xoutDepth = pipeline.create<dai::node::XLinkOut>();
    auto xoutSpatialData = pipeline.create<dai::node::XLinkOut>();
    auto xinSpatialCalcConfig = pipeline.create<dai::node::XLinkIn>();
    xoutDepth->setStreamName("depth");
    xoutSpatialData->setStreamName("spatialData");
    xinSpatialCalcConfig->setStreamName("spatialCalcConfig");

    monoLeft->setResolution(dai::MonoCameraProperties::SensorResolution::THE_720_P);
    monoLeft->setBoardSocket(dai::CameraBoardSocket::LEFT);
    monoRight->setResolution(dai::MonoCameraProperties::SensorResolution::THE_720_P);
    monoRight->setBoardSocket(dai::CameraBoardSocket::RIGHT);
    stereo->setDefaultProfilePreset(dai::node::StereoDepth::PresetMode::HIGH_DENSITY);
    stereo->setLeftRightCheck(true);
    stereo->setDepthAlign(dai::CameraBoardSocket::RGB);
    stereo->setExtendedDisparity(true);
    // LR-check is required for depth alignment
    dai::Point2f topLeft(0.4f, 0.4f);
    dai::Point2f bottomRight(0.6f, 0.6f);

    dai::SpatialLocationCalculatorConfigData config;
    config.depthThresholds.lowerThreshold = 100;
    config.depthThresholds.upperThreshold = 10000;
    config.roi = dai::Rect(topLeft, bottomRight);

    spatialDataCalculator->inputConfig.setWaitForMessage(false);
    spatialDataCalculator->initialConfig.addROI(config);

    // Linking
    monoLeft->out.link(stereo->left);
    monoRight->out.link(stereo->right);

    spatialDataCalculator->passthroughDepth.link(xoutDepth->input);
    stereo->depth.link(spatialDataCalculator->inputDepth);

    spatialDataCalculator->out.link(xoutSpatialData->input);
    xinSpatialCalcConfig->out.link(spatialDataCalculator->inputConfig);




    //define the output nodes
    auto xlinkParserOut = pipeline.create<dai::node::XLinkOut>();
    xlinkParserOut->setStreamName("parseOut");
    auto xlinkoutOut = pipeline.create<dai::node::XLinkOut>();
    xlinkoutOut->setStreamName("out");

    auto xlinkoutpassthroughOut = pipeline.create<dai::node::XLinkOut>();

    xlinkoutpassthroughOut->setStreamName("passthrough");


    net->out.link(xlinkParserOut->input);

    net->passthrough.link(xlinkoutpassthroughOut->input);

    //upload the pipeline to the device
    dai::Device device(pipeline);
    //fetch frames for display
    // auto inqueue = device.getInputQueue("inFrame");//maxSize = queue buffer depth
    auto detqueue = device.getOutputQueue("parseOut", 8, false);//maxSize = queue buffer depth
    auto passthrough = device.getOutputQueue("passthrough", 8, false);//maxSize = queue buffer depth
    auto depthQueue = device.getOutputQueue("depth", 8, false);
    auto spatialCalcQueue = device.getOutputQueue("spatialData", 8, false);
    auto spatialCalcConfigInQueue = device.getInputQueue("spatialCalcConfig");
    auto videoqueue=device.getOutputQueue("rgbOut",8,false);

    bool printOutputLayersOnce = true;

    auto color = cv::Scalar(0, 255, 0);

    auto startTime = steady_clock::now();
    int counter = 0;
    float fps = 0;
    while (true) {

        auto inDepth = depthQueue->get<dai::ImgFrame>();
        auto ImgFrame = videoqueue->get<dai::ImgFrame>();
       // auto ImgFrame = passthrough->get<dai::ImgFrame>();
        auto frame = ImgFrame->getCvFrame();
        target_height = frame.rows * 1.0f;
        target_width = frame.cols * 1.0f;
        cv::Mat depthFrameColor;
        cv::Mat depthFrame = inDepth->getFrame();  // depthFrame values are in millimeters
        cv::normalize(depthFrame, depthFrameColor, 255, 0, cv::NORM_INF, CV_8UC1);
        cv::equalizeHist(depthFrameColor, depthFrameColor);
        cv::applyColorMap(depthFrameColor, depthFrameColor, cv::COLORMAP_HOT);


        auto inNN = detqueue->get<dai::NNData>();
        if (printOutputLayersOnce && inNN) {
            std::cout << "Output layer names: ";
            for (const auto &ten : inNN->getAllLayerNames()) {
                std::cout << ten << ", ";
            }
            std::cout << std::endl;
            printOutputLayersOnce = false;
        }

        std::vector<BoxInfo> result;
        std::vector<BoxInfo> boxes;
        auto output = inNN->getLayerFp16(inNN->getAllLayerNames()[2]);
        boxes = decode_infer(output, layers[2].stride, 640, labels.size(), layers[2].anchors, threshold, 2);
        result.insert(result.begin(), boxes.begin(), boxes.end());


        auto output_534 = inNN->getLayerFp16(inNN->getAllLayerNames()[0]);
        boxes = decode_infer(output_534, layers[1].stride, 640, labels.size(), layers[1].anchors, threshold, 1);
        result.insert(result.begin(), boxes.begin(), boxes.end());

        auto output_554 = inNN->getLayerFp16(inNN->getAllLayerNames()[1]);
        boxes = decode_infer(output_554, layers[0].stride, 640, labels.size(), layers[0].anchors, threshold, 0);
        result.insert(result.begin(), boxes.begin(), boxes.end());
        nms(result, nms_threshold);
        scale_coords(result, 640, 640, frame.cols, frame.rows);

        cv::Mat frame_show = draw_box(frame, result, labels, colors);

        counter++;
        auto currentTime = steady_clock::now();
        auto elapsed = duration_cast<duration<float>>(currentTime - startTime);
        if (elapsed > seconds(1)) {
            fps = counter / elapsed.count();
            counter = 0;
            startTime = currentTime;
        }


        std::stringstream fpsStr;
        fpsStr << "NN fps: " << std::fixed << std::setprecision(2) << fps;
        printf("fps %f\n",fps);
        cv::putText(frame, fpsStr.str(), cv::Point(2, frame_show.rows - 4), cv::FONT_HERSHEY_TRIPLEX, 0.4,
                    cv::Scalar(0, 255, 0));
        for(auto &item:result){
            // normalize the pixel box to the [0,1] range the ROI config expects;
            // since depth is aligned to RGB this is just division by the frame size
            topLeft.x = item.x1 / target_width;
            topLeft.y = item.y1 / target_height;
            bottomRight.x = item.x2 / target_width;
            bottomRight.y = item.y2 / target_height;
            //std::cout<<topLeft.x<<" "<<topLeft.y<<" "<<bottomRight.x<<" "<<bottomRight.y<<" "<<std::endl;
            config.roi = dai::Rect(topLeft, bottomRight);
            dai::SpatialLocationCalculatorConfig cfg;
            cfg.addROI(config);
            spatialCalcConfigInQueue->send(cfg);

            // reuse depthFrame/depthFrameColor computed above; re-creating them here
            // would discard the annotations drawn for earlier detections

            auto spatialData = spatialCalcQueue->get<dai::SpatialLocationCalculatorData>()->getSpatialLocations();
            for(auto depthData : spatialData) {
                auto roi = depthData.config.roi;
                roi = roi.denormalize(depthFrameColor.cols, depthFrameColor.rows);
                auto xmin = (int)roi.topLeft().x;
                auto ymin = (int)roi.topLeft().y;
                auto xmax = (int)roi.bottomRight().x;
                auto ymax = (int)roi.bottomRight().y;

                auto depthMin = depthData.depthMin;
                auto depthMax = depthData.depthMax;
                auto coords = depthData.spatialCoordinates;
                auto distance = std::sqrt(coords.x * coords.x + coords.y * coords.y + coords.z * coords.z);
                auto fontType = cv::FONT_HERSHEY_TRIPLEX;
                std::stringstream depthDistance;
                depthDistance.precision(2);
                depthDistance << fixed << static_cast<float>(distance / 1000.0f) << "m";
                cv::putText(depthFrameColor, depthDistance.str(), cv::Point(item.x1 + 10, item.y1 + 70), fontType, 0.5, color);

                std::stringstream rgbDistance;
                rgbDistance.precision(2);
                rgbDistance << fixed << static_cast<float>(distance / 1000.0f) << "m";
                cv::putText(frame, rgbDistance.str(), cv::Point(item.x1 + 10, item.y1 + 70), fontType, 0.5, color);

                cv::rectangle(frame, cv::Rect(cv::Point(item.x1, item.y1), cv::Point(item.x2, item.y2)), color, cv::FONT_HERSHEY_SIMPLEX);
                std::stringstream rgb_depthX,depthX;
                rgb_depthX << "X: " << (int)depthData.spatialCoordinates.x << " mm";
                cv::putText(frame, rgb_depthX.str(), cv::Point(item.x1 + 10, item.y1 + 20), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
                std::stringstream rgb_depthY,depthY;
                rgb_depthY << "Y: " << (int)depthData.spatialCoordinates.y << " mm";
                cv::putText(frame, rgb_depthY.str(), cv::Point(item.x1 + 10, item.y1 + 35), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
                std::stringstream rgb_depthZ,depthZ;
                rgb_depthZ << "Z: " << (int)depthData.spatialCoordinates.z << " mm";
                cv::putText(frame, rgb_depthZ.str(), cv::Point(item.x1 + 10, item.y1 + 50), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);

                cv::rectangle(depthFrameColor, cv::Rect(cv::Point(xmin, ymin), cv::Point(xmax, ymax)), color, cv::FONT_HERSHEY_SIMPLEX);

                depthX << "X: " << (int)depthData.spatialCoordinates.x << " mm";
                cv::putText(depthFrameColor, depthX.str(), cv::Point(xmin + 10, ymin + 20), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);

                depthY << "Y: " << (int)depthData.spatialCoordinates.y << " mm";
                cv::putText(depthFrameColor, depthY.str(), cv::Point(xmin+ 10, ymin + 35), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);

                depthZ << "Z: " << (int)depthData.spatialCoordinates.z << " mm";
                cv::putText(depthFrameColor, depthZ.str(), cv::Point(xmin + 10, ymin + 50), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
            }


            // Show the frame
            cv::imshow("depth", depthFrameColor);
            cv::imwrite("depth.jpg",depthFrameColor);
        }




        cv::imshow("demo", frame_show);
        cv::imwrite("demo.jpg",frame_show);
        int key = cv::waitKey(1);
        if (key == 'q' || key == 'Q') return 0;
        cv::waitKey(1);


    }


    return 0;
}
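The inline ROI normalization above works but is easy to get wrong. A small helper sketch (my naming, not from the original code; assumes the boxes are already scaled to the RGB frame and the depth map is aligned to RGB):

// Hypothetical helper: clamp a pixel-space box into the normalized [0,1] ROI
// that dai::SpatialLocationCalculatorConfigData expects.
dai::Rect toNormalizedRoi(const BoxInfo &box, float frame_w, float frame_h) {
    auto clamp01 = [](float v) { return std::max(0.f, std::min(1.f, v)); };
    dai::Point2f tl(clamp01(box.x1 / frame_w), clamp01(box.y1 / frame_h));
    dai::Point2f br(clamp01(box.x2 / frame_w), clamp01(box.y2 / frame_h));
    return dai::Rect(tl, br);
}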

Test results

Test model link: https://pan.baidu.com/s/1Fapu0m68M7u_45XxLl7IyQ?pwd=g3f9 extraction code: g3f9

Screenshots omitted here: 1080p head detection with the depth map display.

Cup detection code link: https://pan.baidu.com/s/10SuFJ5dVAYCF4em8aUSlDw?pwd=4va5 extraction code: 4va5


#include <stdio.h>
#include <string>
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#include "utility.hpp"
#include <vector>
#include "depthai/depthai.hpp"

static std::atomic<bool> newConfig{false};
using namespace std;
using namespace std::chrono;
using namespace cv;
typedef struct {
    int width;
    int height;
} YoloSize;


typedef struct {
    std::string name;
    int stride;
    std::vector<YoloSize> anchors;
} YoloLayerData;

class BoxInfo {
public:
    int x1, y1, x2, y2, label, id;
    float score;
};

static inline float sigmoid(float x) {
    return static_cast<float>(1.f / (1.f + exp(-x)));
}

double GetIOU(cv::Rect_<float> bb_test, cv::Rect_<float> bb_gt) {
    float in = (bb_test & bb_gt).area();
    float un = bb_test.area() + bb_gt.area() - in;

    if (un < DBL_EPSILON)
        return 0;

    return (double) (in / un);
}

std::vector<BoxInfo> decode_infer(vector<float> data_pt, int stride, int net_size, int num_classes,
                                  const std::vector<YoloSize> &anchors, float threshold, int idx) {


    const int s[3] = {20, 40, 80};
    std::vector<BoxInfo> result;
    int batchs, channels, height, width, pred_item;
    batchs = 1;
    channels = 3;
    height = s[idx];
    width = s[idx];
    pred_item = 6;
    float data_ptr[batchs * channels * height * width * pred_item];
    //std::cout << batchs * channels * height * width * pred_item << std::endl << data_pt.size() << std::endl;
    for (int i = 0; i < data_pt.size(); i++) {
        data_ptr[i] = data_pt[i];
    }
    //std::cout << batchs << " " << channels << " " << height << " " << width << " " << pred_item << " " << std::endl;

//    for (int i = 0; i < 20; i++) {
//        std::cout << data_ptr[i] << " ";
//    }
//    std::cout << std::endl;
    for (int bi = 0; bi < batchs; bi++) {
        auto batch_ptr = data_ptr + bi * (channels * height * width * pred_item);
        for (int ci = 0; ci < channels; ci++) {
            auto channel_ptr = batch_ptr + ci * (height * width * pred_item);
            for (int hi = 0; hi < height; hi++) {
                auto height_ptr = channel_ptr + hi * (width * pred_item);
                for (int wi = 0; wi < width; wi++) {
                    auto width_ptr = height_ptr + wi * pred_item;
                    auto cls_ptr = width_ptr + 5;

                    auto confidence = sigmoid(width_ptr[4]);

                    for (int cls_id = 0; cls_id < num_classes; cls_id++) {
                        float score = sigmoid(cls_ptr[cls_id]) * confidence;
                        if (score > threshold) {
                            float cx = (sigmoid(width_ptr[0]) * 2.f - 0.5f + wi) * (float) stride;
                            float cy = (sigmoid(width_ptr[1]) * 2.f - 0.5f + hi) * (float) stride;
                            float w = pow(sigmoid(width_ptr[2]) * 2.f, 2) * anchors[ci].width;
                            float h = pow(sigmoid(width_ptr[3]) * 2.f, 2) * anchors[ci].height;

                            BoxInfo box;

                            box.x1 = std::max(0, std::min(net_size, int((cx - w / 2.f))));
                            box.y1 = std::max(0, std::min(net_size, int((cy - h / 2.f))));
                            box.x2 = std::max(0, std::min(net_size, int((cx + w / 2.f))));
                            box.y2 = std::max(0, std::min(net_size, int((cy + h / 2.f))));
                            box.score = score;
                            box.label = cls_id;
                            result.push_back(box);
                        }
                    }
                }
            }
        }
    }

    return result;
}

void nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH) {
    std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
    std::vector<float> vArea(input_boxes.size());
    for (int i = 0; i < int(input_boxes.size()); ++i) {
        vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
                   * (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
    }
    for (int i = 0; i < int(input_boxes.size()); ++i) {
        for (int j = i + 1; j < int(input_boxes.size());) {
            float xx1 = std::max(input_boxes[i].x1, input_boxes[j].x1);
            float yy1 = std::max(input_boxes[i].y1, input_boxes[j].y1);
            float xx2 = std::min(input_boxes[i].x2, input_boxes[j].x2);
            float yy2 = std::min(input_boxes[i].y2, input_boxes[j].y2);
            float w = std::max(float(0), xx2 - xx1 + 1);
            float h = std::max(float(0), yy2 - yy1 + 1);
            float inter = w * h;
            float ovr = inter / (vArea[i] + vArea[j] - inter);
            if (ovr >= NMS_THRESH) {
                input_boxes.erase(input_boxes.begin() + j);
                vArea.erase(vArea.begin() + j);
            } else {
                j++;
            }
        }
    }
}

void scale_coords(std::vector<BoxInfo> &boxes, int w_from, int h_from, int w_to, int h_to) {
    float w_ratio = float(w_to) / float(w_from);
    float h_ratio = float(h_to) / float(h_from);


    for (auto &box: boxes) {
        box.x1 *= w_ratio;
        box.x2 *= w_ratio;
        box.y1 *= h_ratio;
        box.y2 *= h_ratio;
    }
    return;
}

cv::Mat draw_box(cv::Mat &cv_mat, std::vector<BoxInfo> &boxes, const std::vector<std::string> &labels,
                 unsigned char colors[][3]) {

    for (auto box : boxes) {
        int width = box.x2 - box.x1;
        int height = box.y2 - box.y1;
        cv::Point p = cv::Point(box.x1, box.y1);
        cv::Rect rect = cv::Rect(box.x1, box.y1, width, height);
        cv::rectangle(cv_mat, rect, cv::Scalar(colors[box.label][0], colors[box.label][1], colors[box.label][2]));
        string text = labels[box.label] + ":" + std::to_string(box.score);
        cv::putText(cv_mat, text, p, cv::FONT_HERSHEY_PLAIN, 1,
                    cv::Scalar(colors[box.label][0], colors[box.label][1], colors[box.label][2]));
    }
    return cv_mat;
}


int main(int argc, char **argv) {
    std::vector<YoloLayerData> yolov7_layers{
            {"324",    32, {{116, 90}, {156, 198}, {373, 326}}},
            {"304",    16, {{30,  61}, {62,  45},  {59,  119}}},
            {"output", 8,  {{12,  16}, {16,  30},  {33,  23}}},
    };

    vector<float> origin_rect_cof;
    std::vector<YoloLayerData> &layers = yolov7_layers;


    float threshold = 0.4;
    float nms_threshold = 0.5;
    int i = 0;
    std::vector<std::string> labels = {
            "cup"
    };
    unsigned char colors[][3] = {
            {0, 255, 0}
    };

    float target_width = 640;
    float target_height = 640;
    dai::Pipeline pipeline;
    auto camRgb = pipeline.create<dai::node::ColorCamera>();
    auto rgbOut=pipeline.create<dai::node::XLinkOut>();
    rgbOut->setStreamName("rgbOut");
    camRgb->video.link(rgbOut->input);
    camRgb->setPreviewSize(640, 640);
    camRgb->setBoardSocket(dai::CameraBoardSocket::RGB);
    camRgb->setResolution(dai::ColorCameraProperties::SensorResolution::THE_1080_P);
    camRgb->setInterleaved(false);
    camRgb->setColorOrder(dai::ColorCameraProperties::ColorOrder::RGB);
    camRgb->setFps(60);
    camRgb->setPreviewKeepAspectRatio(false);
    auto monoLeft = pipeline.create<dai::node::MonoCamera>();
    auto monoRight = pipeline.create<dai::node::MonoCamera>();

    auto stereo = pipeline.create<dai::node::StereoDepth>();
    auto spatialDataCalculator = pipeline.create<dai::node::SpatialLocationCalculator>();



    //define the network node
    //auto cam = pipeline.create<dai::node::XLinkIn>();
    //camRgb->setStreamName("inFrame");
    auto net = pipeline.create<dai::node::NeuralNetwork>();
    dai::OpenVINO::Blob blob("../model_300_300/best_cup.blob");
    net->setBlob(blob);
    net->input.setBlocking(false);

    //getting fairly comfortable with the OAK API by now
    camRgb->preview.link(net->input);
    auto xoutDepth = pipeline.create<dai::node::XLinkOut>();
    auto xoutSpatialData = pipeline.create<dai::node::XLinkOut>();
    auto xinSpatialCalcConfig = pipeline.create<dai::node::XLinkIn>();
    xoutDepth->setStreamName("depth");
    xoutSpatialData->setStreamName("spatialData");
    xinSpatialCalcConfig->setStreamName("spatialCalcConfig");

    monoLeft->setResolution(dai::MonoCameraProperties::SensorResolution::THE_720_P);
    monoLeft->setBoardSocket(dai::CameraBoardSocket::LEFT);
    monoRight->setResolution(dai::MonoCameraProperties::SensorResolution::THE_720_P);
    monoRight->setBoardSocket(dai::CameraBoardSocket::RIGHT);
    stereo->setDefaultProfilePreset(dai::node::StereoDepth::PresetMode::HIGH_ACCURACY);
    stereo->setLeftRightCheck(true);
    stereo->setDepthAlign(dai::CameraBoardSocket::RGB);
    stereo->setExtendedDisparity(true);
    // LR-check is required for depth alignment
    dai::Point2f topLeft(0.4f, 0.4f);
    dai::Point2f bottomRight(0.6f, 0.6f);

    dai::SpatialLocationCalculatorConfigData config;
    config.depthThresholds.lowerThreshold = 100;
    config.depthThresholds.upperThreshold = 10000;
    config.roi = dai::Rect(topLeft, bottomRight);

    spatialDataCalculator->inputConfig.setWaitForMessage(false);
    spatialDataCalculator->initialConfig.addROI(config);

    // Linking
    monoLeft->out.link(stereo->left);
    monoRight->out.link(stereo->right);

    spatialDataCalculator->passthroughDepth.link(xoutDepth->input);
    stereo->depth.link(spatialDataCalculator->inputDepth);

    spatialDataCalculator->out.link(xoutSpatialData->input);
    xinSpatialCalcConfig->out.link(spatialDataCalculator->inputConfig);




    //define the output nodes
    auto xlinkParserOut = pipeline.create<dai::node::XLinkOut>();
    xlinkParserOut->setStreamName("parseOut");
    auto xlinkoutOut = pipeline.create<dai::node::XLinkOut>();
    xlinkoutOut->setStreamName("out");

    auto xlinkoutpassthroughOut = pipeline.create<dai::node::XLinkOut>();

    xlinkoutpassthroughOut->setStreamName("passthrough");


    net->out.link(xlinkParserOut->input);

    net->passthrough.link(xlinkoutpassthroughOut->input);

    //upload the pipeline to the device
    dai::Device device(pipeline);
    //fetch frames for display
    // auto inqueue = device.getInputQueue("inFrame");//maxSize = queue buffer depth
    auto detqueue = device.getOutputQueue("parseOut", 8, false);//maxSize = queue buffer depth
    auto passthrough = device.getOutputQueue("passthrough", 8, false);//maxSize = queue buffer depth
    auto depthQueue = device.getOutputQueue("depth", 8, false);
    auto spatialCalcQueue = device.getOutputQueue("spatialData", 8, false);
    auto spatialCalcConfigInQueue = device.getInputQueue("spatialCalcConfig");
    auto videoqueue=device.getOutputQueue("rgbOut",8,false);

    bool printOutputLayersOnce = true;

    auto color = cv::Scalar(0, 255, 0);

    auto startTime = steady_clock::now();
    int counter = 0;
    float fps = 0;
    while (true) {

        auto inDepth = depthQueue->get<dai::ImgFrame>();
        auto ImgFrame = videoqueue->get<dai::ImgFrame>();
        // auto ImgFrame = passthrough->get<dai::ImgFrame>();
        auto frame = ImgFrame->getCvFrame();
        target_height = frame.rows * 1.0f;
        target_width = frame.cols * 1.0f;
        cv::Mat depthFrameColor;
        cv::Mat depthFrame = inDepth->getFrame();  // depthFrame values are in millimeters
        cv::normalize(depthFrame, depthFrameColor, 255, 0, cv::NORM_INF, CV_8UC1);
        cv::equalizeHist(depthFrameColor, depthFrameColor);
        cv::applyColorMap(depthFrameColor, depthFrameColor, cv::COLORMAP_HOT);


        auto inNN = detqueue->get<dai::NNData>();
        if (printOutputLayersOnce && inNN) {
            std::cout << "Output layer names: ";
            for (const auto &ten : inNN->getAllLayerNames()) {
                std::cout << ten << ", ";
            }
            std::cout << std::endl;
            printOutputLayersOnce = false;
        }

        std::vector<BoxInfo> result;
        std::vector<BoxInfo> boxes;
        auto output = inNN->getLayerFp16(inNN->getAllLayerNames()[2]);
        boxes = decode_infer(output, layers[2].stride, 640, labels.size(), layers[2].anchors, threshold, 2);
        result.insert(result.begin(), boxes.begin(), boxes.end());


        auto output_534 = inNN->getLayerFp16(inNN->getAllLayerNames()[0]);
        boxes = decode_infer(output_534, layers[1].stride, 640, labels.size(), layers[1].anchors, threshold, 1);
        result.insert(result.begin(), boxes.begin(), boxes.end());

        auto output_554 = inNN->getLayerFp16(inNN->getAllLayerNames()[1]);
        boxes = decode_infer(output_554, layers[0].stride, 640, labels.size(), layers[0].anchors, threshold, 0);
        result.insert(result.begin(), boxes.begin(), boxes.end());
        nms(result, nms_threshold);
        scale_coords(result, 640, 640, frame.cols, frame.rows);

        cv::Mat frame_show = draw_box(frame, result, labels, colors);

        counter++;
        auto currentTime = steady_clock::now();
        auto elapsed = duration_cast<duration<float>>(currentTime - startTime);
        if (elapsed > seconds(1)) {
            fps = counter / elapsed.count();
            counter = 0;
            startTime = currentTime;
        }


        std::stringstream fpsStr;
        fpsStr << "NN fps: " << std::fixed << std::setprecision(2) << fps;
        printf("fps %f\n",fps);
        cv::putText(frame, fpsStr.str(), cv::Point(2, frame_show.rows - 4), cv::FONT_HERSHEY_TRIPLEX, 0.4,
                    cv::Scalar(0, 255, 0));

        for(auto &item:result){
            // normalize the pixel box to the [0,1] range the ROI config expects;
            // since depth is aligned to RGB this is just division by the frame size
            topLeft.x = item.x1 / target_width;
            topLeft.y = item.y1 / target_height;
            bottomRight.x = item.x2 / target_width;
            bottomRight.y = item.y2 / target_height;
            //std::cout<<topLeft.x<<" "<<topLeft.y<<" "<<bottomRight.x<<" "<<bottomRight.y<<" "<<std::endl;
            config.roi = dai::Rect(topLeft, bottomRight);
            dai::SpatialLocationCalculatorConfig cfg;
            cfg.addROI(config);
            spatialCalcConfigInQueue->send(cfg);


            auto spatialData = spatialCalcQueue->get<dai::SpatialLocationCalculatorData>()->getSpatialLocations();
            for(auto depthData : spatialData) {
                auto roi = depthData.config.roi;
                roi = roi.denormalize(depthFrameColor.cols, depthFrameColor.rows);
                auto xmin = (int)roi.topLeft().x;
                auto ymin = (int)roi.topLeft().y;
                auto xmax = (int)roi.bottomRight().x;
                auto ymax = (int)roi.bottomRight().y;

                auto depthMin = depthData.depthMin;
                auto depthMax = depthData.depthMax;
                auto coords = depthData.spatialCoordinates;
                auto distance = std::sqrt(coords.x * coords.x + coords.y * coords.y + coords.z * coords.z);
                auto fontType = cv::FONT_HERSHEY_TRIPLEX;
                std::stringstream depthDistance;
                depthDistance.precision(2);
                depthDistance << fixed << static_cast<float>(distance / 1000.0f) << "m";
                cv::putText(depthFrameColor, depthDistance.str(), cv::Point(item.x1 + 10, item.y1 + 70), fontType, 0.5, color);

                std::stringstream rgbDistance;
                rgbDistance.precision(2);
                rgbDistance << fixed << static_cast<float>(distance / 1000.0f) << "m";
                cv::putText(frame, rgbDistance.str(), cv::Point(item.x1 + 10, item.y1 + 70), fontType, 0.5, color);

                cv::rectangle(frame, cv::Rect(cv::Point(item.x1, item.y1), cv::Point(item.x2, item.y2)), color, cv::FONT_HERSHEY_SIMPLEX);
                std::stringstream rgb_depthX,depthX;
                rgb_depthX << "X: " << (int)depthData.spatialCoordinates.x << " mm";
                cv::putText(frame, rgb_depthX.str(), cv::Point(item.x1 + 10, item.y1 + 20), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
                std::stringstream rgb_depthY,depthY;
                rgb_depthY << "Y: " << (int)depthData.spatialCoordinates.y << " mm";
                cv::putText(frame, rgb_depthY.str(), cv::Point(item.x1 + 10, item.y1 + 35), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
                std::stringstream rgb_depthZ,depthZ;
                rgb_depthZ << "Z: " << (int)depthData.spatialCoordinates.z << " mm";
                cv::putText(frame, rgb_depthZ.str(), cv::Point(item.x1 + 10, item.y1 + 50), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);

                cv::rectangle(depthFrameColor, cv::Rect(cv::Point(xmin, ymin), cv::Point(xmax, ymax)), color, cv::FONT_HERSHEY_SIMPLEX);

                depthX << "X: " << (int)depthData.spatialCoordinates.x << " mm";
                cv::putText(depthFrameColor, depthX.str(), cv::Point(xmin + 10, ymin + 20), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);

                depthY << "Y: " << (int)depthData.spatialCoordinates.y << " mm";
                cv::putText(depthFrameColor, depthY.str(), cv::Point(xmin+ 10, ymin + 35), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);

                depthZ << "Z: " << (int)depthData.spatialCoordinates.z << " mm";
                cv::putText(depthFrameColor, depthZ.str(), cv::Point(xmin + 10, ymin + 50), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
            }


            // Show the frame
        }


        cv::imshow("depth", depthFrameColor);


        cv::imshow("demo", frame_show);
        cv::imwrite("demo.jpg",frame_show);
        int key = cv::waitKey(1);
        if (key == 'q' || key == 'Q') return 0;
        cv::waitKey(1);


    }


    return 0;
}
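One send/get round-trip on spatialCalcConfigInQueue per detection adds latency that grows with the number of boxes. A hedged sketch (reusing the hypothetical toNormalizedRoi helper from the sketch after the head-detection listing) that batches all ROIs into a single config message per frame:

// Sketch under the same queue names as the listing above: one config message
// and one result fetch per frame instead of one per detection.
if (!result.empty()) {
    dai::SpatialLocationCalculatorConfig cfg;
    for (const auto &item : result) {
        dai::SpatialLocationCalculatorConfigData c;
        c.depthThresholds.lowerThreshold = 100;   // mm
        c.depthThresholds.upperThreshold = 10000; // mm
        c.roi = toNormalizedRoi(item, target_width, target_height);
        cfg.addROI(c);
    }
    spatialCalcConfigInQueue->send(cfg);
    auto spatialData = spatialCalcQueue->get<dai::SpatialLocationCalculatorData>()
                           ->getSpatialLocations();
    // spatialData entries should line up with the order of the ROIs sent above
}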

Result image

As a supplement, a detection version that adds encoded-stream handling (hence the FFmpeg linkage in the CMakeLists below).

CMakeLists.txt

cmake_minimum_required(VERSION 3.16)
project(depthai)
set(CMAKE_CXX_STANDARD 11)
find_package(OpenCV REQUIRED)
#message(STATUS ${OpenCV_INCLUDE_DIRS})
# add include directories
include_directories(${OpenCV_INCLUDE_DIRS})
include_directories(${CMAKE_SOURCE_DIR}/include)
include_directories(${CMAKE_SOURCE_DIR}/include/utility)
# link OpenCV, depthai and the FFmpeg libraries
find_package(depthai CONFIG REQUIRED)
add_executable(depthai main.cpp include/utility/utility.cpp)
target_link_libraries(depthai ${OpenCV_LIBS}  depthai::opencv -lavformat -lavcodec -lswscale -lavutil -lz)
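The extra -lavformat -lavcodec -lswscale -lavutil flags pull in FFmpeg for the encoded bitstream. For orientation, a hedged sketch (my own, under the assumption that the program consumes H.264 packets such as those produced by a dai::node::VideoEncoder) of turning one packet into a BGR cv::Mat; error handling is reduced to early exits, and the decoder may need several packets (SPS/PPS first) before it emits a frame:

// Hedged sketch, not taken from the post.
extern "C" {
#include <libavcodec/avcodec.h>
#include <libswscale/swscale.h>
}
#include <opencv2/core.hpp>

AVCodecContext *open_h264_decoder() {
    const AVCodec *codec = avcodec_find_decoder(AV_CODEC_ID_H264);
    AVCodecContext *ctx = avcodec_alloc_context3(codec);
    avcodec_open2(ctx, codec, nullptr);
    return ctx;
}

bool decode_h264_packet(AVCodecContext *ctx, uint8_t *data, int size, cv::Mat &bgr) {
    AVPacket *pkt = av_packet_alloc();
    AVFrame *frm = av_frame_alloc();
    pkt->data = data;
    pkt->size = size;
    bool ok = false;
    if (avcodec_send_packet(ctx, pkt) == 0 && avcodec_receive_frame(ctx, frm) == 0) {
        // convert whatever pixel format the decoder produced (usually YUV420P) to BGR
        SwsContext *sws = sws_getContext(frm->width, frm->height, (AVPixelFormat)frm->format,
                                         frm->width, frm->height, AV_PIX_FMT_BGR24,
                                         SWS_BILINEAR, nullptr, nullptr, nullptr);
        bgr.create(frm->height, frm->width, CV_8UC3);
        uint8_t *dst[1] = {bgr.data};
        int dstStride[1] = {int(bgr.step)};
        sws_scale(sws, frm->data, frm->linesize, 0, frm->height, dst, dstStride);
        sws_freeContext(sws);
        ok = true;
    }
    av_frame_free(&frm);
    av_packet_free(&pkt);
    return ok;
}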

main.cpp


#include <stdio.h>
#include <string>
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
extern "C"
{
#include <libavformat/avformat.h>
#include <libavcodec/avcodec.h>
#include <libavutil/imgutils.h>
#include <libswscale/swscale.h>
}
#include "utility.hpp"
#include <vector>
#include "depthai/depthai.hpp"

static std::atomic<bool> newConfig{false};
using namespace std;
using namespace std::chrono;
using namespace cv;
typedef struct {
    int width;
    int height;
} YoloSize;


typedef struct {
    std::string name;
    int stride;
    std::vector<YoloSize> anchors;
} YoloLayerData;

class BoxInfo {
public:
    int x1, y1, x2, y2, label, id;
    float score;
};

static inline float sigmoid(float x) {
    return static_cast<float>(1.f / (1.f + exp(-x)));
}

double GetIOU(cv::Rect_<float> bb_test, cv::Rect_<float> bb_gt) {
    float in = (bb_test & bb_gt).area();
    float un = bb_test.area() + bb_gt.area() - in;

    if (un < DBL_EPSILON)
        return 0;

    return (double) (in / un);
}

std::vector<BoxInfo> decode_infer(vector<float> data_pt, int stride, int net_size, int num_classes,
                                  const std::vector<YoloSize> &anchors, float threshold, int idx) {


    const int s[3] = {20, 40, 80};
    std::vector<BoxInfo> result;
    int batchs, channels, height, width, pred_item;
    batchs = 1;
    channels = 3;
    height = s[idx];
    width = s[idx];
    pred_item = 6;
    // read directly from the vector; a stack VLA of this size is non-standard C++
    const float *data_ptr = data_pt.data();
    for (int bi = 0; bi < batchs; bi++) {
        auto batch_ptr = data_ptr + bi * (channels * height * width * pred_item);
        for (int ci = 0; ci < channels; ci++) {
            auto channel_ptr = batch_ptr + ci * (height * width * pred_item);
            for (int hi = 0; hi < height; hi++) {
                auto height_ptr = channel_ptr + hi * (width * pred_item);
                for (int wi = 0; wi < width; wi++) {
                    auto width_ptr = height_ptr + wi * pred_item;
                    auto cls_ptr = width_ptr + 5;

                    auto confidence = sigmoid(width_ptr[4]);

                    for (int cls_id = 0; cls_id < num_classes; cls_id++) {
                        float score = sigmoid(cls_ptr[cls_id]) * confidence;
                        if (score > threshold) {
                            float cx = (sigmoid(width_ptr[0]) * 2.f - 0.5f + wi) * (float) stride;
                            float cy = (sigmoid(width_ptr[1]) * 2.f - 0.5f + hi) * (float) stride;
                            float w = pow(sigmoid(width_ptr[2]) * 2.f, 2) * anchors[ci].width;
                            float h = pow(sigmoid(width_ptr[3]) * 2.f, 2) * anchors[ci].height;

                            BoxInfo box;

                            box.x1 = std::max(0, std::min(net_size, int((cx - w / 2.f))));
                            box.y1 = std::max(0, std::min(net_size, int((cy - h / 2.f))));
                            box.x2 = std::max(0, std::min(net_size, int((cx + w / 2.f))));
                            box.y2 = std::max(0, std::min(net_size, int((cy + h / 2.f))));
                            box.score = score;
                            box.label = cls_id;
                            result.push_back(box);
                        }
                    }
                }
            }
        }
    }

    return result;
}

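// Greedy NMS: boxes are sorted by descending score; any box whose IoU with an
// already-kept box reaches NMS_THRESH is erased.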
void nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH) {
    std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
    std::vector<float> vArea(input_boxes.size());
    for (int i = 0; i < int(input_boxes.size()); ++i) {
        vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
                   * (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
    }
    for (int i = 0; i < int(input_boxes.size()); ++i) {
        for (int j = i + 1; j < int(input_boxes.size());) {
            float xx1 = std::max(input_boxes[i].x1, input_boxes[j].x1);
            float yy1 = std::max(input_boxes[i].y1, input_boxes[j].y1);
            float xx2 = std::min(input_boxes[i].x2, input_boxes[j].x2);
            float yy2 = std::min(input_boxes[i].y2, input_boxes[j].y2);
            float w = std::max(float(0), xx2 - xx1 + 1);
            float h = std::max(float(0), yy2 - yy1 + 1);
            float inter = w * h;
            float ovr = inter / (vArea[i] + vArea[j] - inter);
            if (ovr >= NMS_THRESH) {
                input_boxes.erase(input_boxes.begin() + j);
                vArea.erase(vArea.begin() + j);
            } else {
                j++;
            }
        }
    }
}

void scale_coords(std::vector<BoxInfo> &boxes, int w_from, int h_from, int w_to, int h_to) {
    float w_ratio = float(w_to) / float(w_from);
    float h_ratio = float(h_to) / float(h_from);


    for (auto &box: boxes) {
        box.x1 *= w_ratio;
        box.x2 *= w_ratio;
        box.y1 *= h_ratio;
        box.y2 *= h_ratio;
    }
    return;
}

cv::Mat draw_box(cv::Mat &cv_mat, std::vector<BoxInfo> &boxes, const std::vector<std::string> &labels,
                 unsigned char colors[][3]) {

    for (auto box : boxes) {
        int width = box.x2 - box.x1;
        int height = box.y2 - box.y1;
        cv::Point p = cv::Point(box.x1, box.y1);
        cv::Rect rect = cv::Rect(box.x1, box.y1, width, height);
        cv::rectangle(cv_mat, rect, cv::Scalar(colors[box.label][0], colors[box.label][1], colors[box.label][2]));
        string text = labels[box.label] + ":" + std::to_string(box.score);
        cv::putText(cv_mat, text, p, cv::FONT_HERSHEY_PLAIN, 1,
                    cv::Scalar(colors[box.label][0], colors[box.label][1], colors[box.label][2]));
    }
    return cv_mat;
}


int main(int argc, char **argv) {
    std::vector<YoloLayerData> yolov7_layers{
            {"324",    32, {{116, 90}, {156, 198}, {373, 326}}},
            {"304",    16, {{30,  61}, {62,  45},  {59,  119}}},
            {"output", 8,  {{12,  16}, {16,  30},  {33,  23}}},
    };
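    // "324", "304" and "output" are the output-tensor names of this particular
    // exported blob (they are printed once at runtime via getAllLayerNames());
    // strides 32/16/8 use the standard yolov7 anchor sets.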

    std::vector<YoloLayerData> &layers = yolov7_layers;

    float threshold = 0.4;
    float nms_threshold = 0.5;
    std::vector<std::string> labels = {
            "cup"
    };
    unsigned char colors[][3] = {
            {0, 255, 0}
    };

    float target_width = 640;
    float target_height = 640;
    dai::Pipeline pipeline;
    auto camRgb = pipeline.create<dai::node::ColorCamera>();
    auto rgbOut=pipeline.create<dai::node::XLinkOut>();
    rgbOut->setStreamName("rgbOut");
    //camRgb->video.link(rgbOut->input);
    camRgb->setPreviewSize(640, 640);
    camRgb->setBoardSocket(dai::CameraBoardSocket::RGB);
    camRgb->setResolution(dai::ColorCameraProperties::SensorResolution::THE_1080_P);
    camRgb->setInterleaved(false);
    camRgb->setColorOrder(dai::ColorCameraProperties::ColorOrder::RGB);
    camRgb->setFps(30); // set once, before the encoder preset below reads it via getFps()
    camRgb->setPreviewKeepAspectRatio(false);
    auto monoLeft = pipeline.create<dai::node::MonoCamera>();
    auto monoRight = pipeline.create<dai::node::MonoCamera>();

    auto stereo = pipeline.create<dai::node::StereoDepth>();
    auto spatialDataCalculator = pipeline.create<dai::node::SpatialLocationCalculator>();
    auto Encoder = pipeline.create<dai::node::VideoEncoder>();

    Encoder->setDefaultProfilePreset(camRgb->getVideoSize(), camRgb->getFps(),
                                     dai::VideoEncoderProperties::Profile::H264_MAIN);


    camRgb->video.link(Encoder->input);

    // define the neural network node
    //auto cam = pipeline.create<dai::node::XLinkIn>();
    //camRgb->setStreamName("inFrame");
    auto net = pipeline.create<dai::node::NeuralNetwork>();
    dai::OpenVINO::Blob blob("../model_300_300/best_cup.blob");
    net->setBlob(blob);
    net->input.setBlocking(false);

    // by now the OAK node API is fairly familiar
    camRgb->preview.link(net->input);
    camRgb->setVideoSize(1920, 1080);

    auto xoutDepth = pipeline.create<dai::node::XLinkOut>();
    auto xoutSpatialData = pipeline.create<dai::node::XLinkOut>();
    auto xinSpatialCalcConfig = pipeline.create<dai::node::XLinkIn>();
    xoutDepth->setStreamName("depth");
    xoutSpatialData->setStreamName("spatialData");
    xinSpatialCalcConfig->setStreamName("spatialCalcConfig");

    monoLeft->setResolution(dai::MonoCameraProperties::SensorResolution::THE_720_P);
    monoLeft->setBoardSocket(dai::CameraBoardSocket::LEFT);
    monoRight->setResolution(dai::MonoCameraProperties::SensorResolution::THE_720_P);
    monoRight->setBoardSocket(dai::CameraBoardSocket::RIGHT);
    stereo->setDefaultProfilePreset(dai::node::StereoDepth::PresetMode::HIGH_ACCURACY);
    stereo->setLeftRightCheck(true);
    stereo->setDepthAlign(dai::CameraBoardSocket::RGB);
    stereo->setExtendedDisparity(true);
    // LR-check is required for depth alignment
    dai::Point2f topLeft(0.4f, 0.4f);
    dai::Point2f bottomRight(0.6f, 0.6f);

    dai::SpatialLocationCalculatorConfigData config;
    config.depthThresholds.lowerThreshold = 100;
    config.depthThresholds.upperThreshold = 10000;
    config.roi = dai::Rect(topLeft, bottomRight);

    spatialDataCalculator->inputConfig.setWaitForMessage(false);
    spatialDataCalculator->initialConfig.addROI(config);

    // Linking
    monoLeft->out.link(stereo->left);
    monoRight->out.link(stereo->right);

    spatialDataCalculator->passthroughDepth.link(xoutDepth->input);
    stereo->depth.link(spatialDataCalculator->inputDepth);

    spatialDataCalculator->out.link(xoutSpatialData->input);
    xinSpatialCalcConfig->out.link(spatialDataCalculator->inputConfig);

// encoded-video output
    auto xlinkoutpreviewOut = pipeline.create<dai::node::XLinkOut>();
    xlinkoutpreviewOut->setStreamName("out264");
    Encoder->bitstream.link(xlinkoutpreviewOut->input);



    // detection outputs
    auto xlinkParserOut = pipeline.create<dai::node::XLinkOut>();
    xlinkParserOut->setStreamName("parseOut");
    auto xlinkoutOut = pipeline.create<dai::node::XLinkOut>();
    xlinkoutOut->setStreamName("out");

    auto xlinkoutpassthroughOut = pipeline.create<dai::node::XLinkOut>();

    xlinkoutpassthroughOut->setStreamName("passthrough");


    net->out.link(xlinkParserOut->input);

    net->passthrough.link(xlinkoutpassthroughOut->input);
//    auto manip = pipeline.create<dai::node::ImageManip>();
//
//    manip->initialConfig.setResize(416, 416);
//    manip->initialConfig.setFrameType(dai::ImgFrame::Type::NV12);
//    camRgb->preview.link(manip->inputImage);
//    manip->out.link(Encoder->input);
    // build the pipeline and upload it to the device
    dai::Device device(pipeline);
    // output queues for fetching frames; maxSize is the queue buffer depth
    // auto inqueue = device.getInputQueue("inFrame");
    auto detqueue = device.getOutputQueue("parseOut", 8, false);
    auto passthrough = device.getOutputQueue("passthrough", 8, false);
    auto depthQueue = device.getOutputQueue("depth", 8, false);
    auto spatialCalcQueue = device.getOutputQueue("spatialData", 8, false);
    auto spatialCalcConfigInQueue = device.getInputQueue("spatialCalcConfig");
    auto videoqueue = device.getOutputQueue("out264", camRgb->getFps(), false);

    bool printOutputLayersOnce = true;

    auto color = cv::Scalar(0, 255, 0);

    auto startTime = steady_clock::now();
    int counter = 0;
    float fps = 0;


    int width = 1920;
    int height = 1080;
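
    // Host-side H.264 decoding with FFmpeg: find and open the decoder here; in the
    // main loop, packets are fed with avcodec_send_packet() and frames collected
    // with avcodec_receive_frame(), then sws_scale() converts YUV420P to RGB24 for
    // OpenCV. (Note: on FFmpeg >= 5, avcodec_find_decoder() returns const AVCodec *.)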
    AVCodec *pCodec = avcodec_find_decoder(AV_CODEC_ID_H264);
    AVCodecContext *pCodecCtx = avcodec_alloc_context3(pCodec);
    int ret = avcodec_open2(pCodecCtx, pCodec, NULL); // open the decoder
    if (ret < 0) {
        char buf[1024] = {0};
        av_strerror(ret, buf, sizeof(buf) - 1);
        cerr << buf << endl;
        return -1;
    }
    AVFrame *picture = av_frame_alloc();
    picture->width = width;
    picture->height = height;
    picture->format = AV_PIX_FMT_YUV420P;
    ret = av_frame_get_buffer(picture, 1);
    if (ret < 0) {
        char buf[1024] = {0};
        av_strerror(ret, buf, sizeof(buf) - 1);
        cerr << buf << endl;
        return -1;
    }
    AVFrame *pFrame = av_frame_alloc();
    pFrame->width = width;
    pFrame->height = height;
    pFrame->format = AV_PIX_FMT_YUV420P;
    ret = av_frame_get_buffer(pFrame, 1);
    if (ret < 0) {
        char buf[1024] = {0};
        av_strerror(ret, buf, sizeof(buf) - 1);
        cerr << buf << endl;
        return -1;
    }
    AVFrame *pFrameRGB = av_frame_alloc();
    pFrameRGB->width = width;
    pFrameRGB->height = height;
    pFrameRGB->format = AV_PIX_FMT_RGB24;
    ret = av_frame_get_buffer(pFrameRGB, 1);
    if (ret < 0) {
        char buf[1024] = {0};
        av_strerror(ret, buf, sizeof(buf) - 1);
        cerr << buf << endl;
        return -1;
    }


    int picture_size = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, width, height,
                                                1); // bytes needed to store one image in this format
    uint8_t *out_buff = (uint8_t *) av_malloc(picture_size * sizeof(uint8_t));
    av_image_fill_arrays(picture->data, picture->linesize, out_buff, AV_PIX_FMT_YUV420P, width,
                         height, 1);
    // conversion context: decoded YUV420P frames -> RGB24 for OpenCV display
    SwsContext *img_convert_ctx = sws_getContext(width, height, AV_PIX_FMT_YUV420P,
                                                 width, height, AV_PIX_FMT_RGB24, SWS_BICUBIC,
                                                 NULL, NULL, NULL);
    AVPacket *packet = av_packet_alloc();


    while (true) {

        auto inDepth = depthQueue->get<dai::ImgFrame>();
        //auto ImgFrame = videoqueue->get<dai::ImgFrame>();
        // auto ImgFrame = passthrough->get<dai::ImgFrame>();
        //auto frame = ImgFrame->getCvFrame();
        printf("h264\n");
        //videoFile.write((char *) (h265Packet->getData().data()), h265Packet->getData().size());
        auto h264Packet = videoqueue->get<dai::ImgFrame>();
        packet->data = (uint8_t *) h264Packet->getData().data();    // points at one complete H.264 frame from the device
        packet->size = h264Packet->getData().size();        // size of that H.264 frame
        packet->stream_index = 0;
        ret = avcodec_send_packet(pCodecCtx, packet);
        if (ret < 0) {
            char buf[1024] = {0};
            av_strerror(ret, buf, sizeof(buf) - 1);
            cerr << buf << endl;
            continue;
        }
        av_packet_unref(packet);
        ret = avcodec_receive_frame(pCodecCtx, pFrame);
        if (ret < 0) {
            char buf[1024] = {0};
            av_strerror(ret, buf, sizeof(buf) - 1);
            cerr << buf << endl;
            continue;
        }

        sws_scale(img_convert_ctx, pFrame->data, pFrame->linesize, 0,
                  height,
                  pFrameRGB->data, pFrameRGB->linesize);


        // wrap the RGB24 buffer without copying; honor the frame's line stride
        cv::Mat mRGB(height, width, CV_8UC3, pFrameRGB->data[0], pFrameRGB->linesize[0]);
        cv::Mat mBGR;
        cv::cvtColor(mRGB, mBGR, cv::COLOR_RGB2BGR);
        cv::Mat frame = mBGR;


        target_height=frame.rows*1.0;
        target_width=frame.cols*1.0;
        cv::Mat depthFrameColor;
        cv::Mat depthFrame = inDepth->getFrame();  // depthFrame values are in millimeters
        cv::normalize(depthFrame, depthFrameColor, 255, 0, cv::NORM_INF, CV_8UC1);
        cv::equalizeHist(depthFrameColor, depthFrameColor);
        cv::applyColorMap(depthFrameColor, depthFrameColor, cv::COLORMAP_HOT);


        auto inNN = detqueue->get<dai::NNData>();
        if (printOutputLayersOnce && inNN) {
            std::cout << "Output layer names: ";
            for (const auto &ten : inNN->getAllLayerNames()) {
                std::cout << ten << ", ";
            }
            std::cout << std::endl;
            printOutputLayersOnce = false;
        }

        std::vector<BoxInfo> result;
        std::vector<BoxInfo> boxes;
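        // Decode the three heads; the mapping from getAllLayerNames() indices to
        // strides follows the layer names printed above (the output_5xx variable
        // names below look like leftovers from an earlier yolov5 export).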
        auto output = inNN->getLayerFp16(inNN->getAllLayerNames()[2]);
        boxes = decode_infer(output, layers[2].stride, 640, labels.size(), layers[2].anchors, threshold, 2);
        result.insert(result.begin(), boxes.begin(), boxes.end());


        auto output_534 = inNN->getLayerFp16(inNN->getAllLayerNames()[0]);
        boxes = decode_infer(output_534, layers[1].stride, 640, labels.size(), layers[1].anchors, threshold, 1);
        result.insert(result.begin(), boxes.begin(), boxes.end());

        auto output_554 = inNN->getLayerFp16(inNN->getAllLayerNames()[1]);
        boxes = decode_infer(output_554, layers[0].stride, 640, labels.size(), layers[0].anchors, threshold, 0);
        result.insert(result.begin(), boxes.begin(), boxes.end());
        nms(result, nms_threshold);
        scale_coords(result, 640, 640, frame.cols, frame.rows);

        cv::Mat frame_show = draw_box(frame, result, labels, colors);

        counter++;
        auto currentTime = steady_clock::now();
        auto elapsed = duration_cast<duration<float>>(currentTime - startTime);
        if (elapsed > seconds(1)) {
            fps = counter / elapsed.count();
            counter = 0;
            startTime = currentTime;
        }


        std::stringstream fpsStr;
        fpsStr << "NN fps: " << std::fixed << std::setprecision(2) << fps;
        printf("fps %f\n",fps);
        cv::putText(frame, fpsStr.str(), cv::Point(2, frame_show.rows - 4), cv::FONT_HERSHEY_TRIPLEX, 0.4,
                    cv::Scalar(0, 255, 0));

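        // For every detection, send its box (normalized to [0, 1]) as an ROI to the
        // SpatialLocationCalculator and read back X/Y/Z in millimeters.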
        for(auto &item:result){
            topLeft.x = item.x1 / target_width;
            topLeft.y = item.y1 / target_height;
            bottomRight.x = item.x2 / target_width;
            bottomRight.y = item.y2 / target_height;
            //std::cout<<topLeft.x<<" "<<topLeft.y<<" "<<bottomRight.x<<" "<<bottomRight.y<<" "<<std::endl;
            config.roi = dai::Rect(topLeft, bottomRight);
            dai::SpatialLocationCalculatorConfig cfg;
            cfg.addROI(config);
            spatialCalcConfigInQueue->send(cfg);


            auto spatialData = spatialCalcQueue->get<dai::SpatialLocationCalculatorData>()->getSpatialLocations();
            for(auto depthData : spatialData) {
                auto roi = depthData.config.roi;
                roi = roi.denormalize(depthFrameColor.cols, depthFrameColor.rows);
                auto xmin = (int)roi.topLeft().x;
                auto ymin = (int)roi.topLeft().y;
                auto xmax = (int)roi.bottomRight().x;
                auto ymax = (int)roi.bottomRight().y;

                auto depthMin = depthData.depthMin;
                auto depthMax = depthData.depthMax;
                auto coords = depthData.spatialCoordinates;
                auto distance = std::sqrt(coords.x * coords.x + coords.y * coords.y + coords.z * coords.z);
                auto fontType = cv::FONT_HERSHEY_TRIPLEX;
                std::stringstream depthDistance;
                depthDistance.precision(2);
                depthDistance << fixed << static_cast<float>(distance / 1000.0f) << "m";
                cv::putText(depthFrameColor, depthDistance.str(), cv::Point(xmin + 10, ymin + 70), fontType, 0.5, color);

                std::stringstream rgbDistance;
                rgbDistance.precision(2);
                rgbDistance << fixed << static_cast<float>(distance / 1000.0f) << "m";
                cv::putText(frame, rgbDistance.str(), cv::Point(item.x1 + 10, item.y1 + 70), fontType, 0.5, color);

                cv::rectangle(frame, cv::Rect(cv::Point(item.x1, item.y1), cv::Point(item.x2, item.y2)), color, 1);
                std::stringstream rgb_depthX,depthX;
                rgb_depthX << "X: " << (int)depthData.spatialCoordinates.x << " mm";
                cv::putText(frame, rgb_depthX.str(), cv::Point(item.x1 + 10, item.y1 + 20), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
                std::stringstream rgb_depthY,depthY;
                rgb_depthY << "Y: " << (int)depthData.spatialCoordinates.y << " mm";
                cv::putText(frame, rgb_depthY.str(), cv::Point(item.x1 + 10, item.y1 + 35), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
                std::stringstream rgb_depthZ,depthZ;
                rgb_depthZ << "Z: " << (int)depthData.spatialCoordinates.z << " mm";
                cv::putText(frame, rgb_depthZ.str(), cv::Point(item.x1 + 10, item.y1 + 50), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);

                cv::rectangle(depthFrameColor, cv::Rect(cv::Point(xmin, ymin), cv::Point(xmax, ymax)), color, 1);

                depthX << "X: " << (int)depthData.spatialCoordinates.x << " mm";
                cv::putText(depthFrameColor, depthX.str(), cv::Point(xmin + 10, ymin + 20), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);

                depthY << "Y: " << (int)depthData.spatialCoordinates.y << " mm";
                cv::putText(depthFrameColor, depthY.str(), cv::Point(xmin+ 10, ymin + 35), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);

                depthZ << "Z: " << (int)depthData.spatialCoordinates.z << " mm";
                cv::putText(depthFrameColor, depthZ.str(), cv::Point(xmin + 10, ymin + 50), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
            }


            // Show the frame
        }


        cv::imshow("depth", depthFrameColor);


        cv::imshow("demo", frame_show);
        cv::imwrite("demo.jpg",frame_show);
        int key = cv::waitKey(1);
        if (key == 'q' || key == 'Q') return 0;


    }


    return 0;
}
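
Compared with the preview-based demo above, this variant decodes the camera's 1080p H.264 stream on the host, so detections are drawn on the full encoded video rather than on the 640x640 preview, at the cost of an extra FFmpeg decoding step.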

References

mnn-yolov5: Deploy YoloV5 with MNN.

