当前位置:网站首页>CPU/GPU(CUDA)版本的 YOLOv5后处理代码
CPU/GPU(CUDA)版本的 YOLOv5后处理代码
2022-07-22 21:28:00 【HELLOWORLD2424】
CPU/GPU(CUDA)版本的 YOLOv5后处理代码
说明
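这里是 YOLOv5 后处理的代码：载入主干网络保存的输出数组，分别用 CPU、GPU 实现后处理（解码与 NMS），得到检测框。在项目的主文件夹新建 src，把下面的代码放入 src 文件夹，再编写 makefile 来编译；注意 makefile 中 CUDA 相关路径需要按本机环境修改。编译后的二进制文件会输出在 workspace 目录下；通过 make run 运行时，程序会在该目录中读取 predict.data 和 input-image.jpg。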
box.hpp
#ifndef BOX_HPP
#define BOX_HPP
// Axis-aligned detection box: corner coordinates, a confidence score and a class index.
struct Box{
    // Every member has a default initializer so that a default-constructed Box
    // never exposes indeterminate values.
    float left = 0.0f, top = 0.0f, right = 0.0f, bottom = 0.0f;
    float confidence = 0.0f;
    int label = 0;

    Box() = default;

    // Builds a box from its four edges, its confidence and its class index.
    Box(float left, float top, float right, float bottom, float confidence, int label):
        left(left), top(top), right(right), bottom(bottom), confidence(confidence), label(label){
    }
};
#endif // BOX_HPP
gpu_decode.cu
#include <cuda_runtime.h>
// Maps the point (x, y) through the affine transform held in `matrix`, read as
// two consecutive rows of three coefficients, and stores the result in *ox / *oy.
static __device__ void affine_project(float* matrix, float x, float y, float* ox, float* oy){
    const float* x_row = matrix;
    *ox = x_row[0] * x + x_row[1] * y + x_row[2];

    const float* y_row = matrix + 3;
    *oy = y_row[0] * x + y_row[1] * y + y_row[2];
}
// Decodes raw YOLOv5 predictions into candidate boxes; each thread handles one
// prediction row selected by its global 1-D thread index.
//
// predict              : num_bboxes rows of (5 + num_classes) floats, read as
//                        cx, cy, width, height, objectness, then the class scores.
// confidence_threshold : a row is dropped when its objectness, or objectness times
//                        its best class score, is below this value.
// invert_affine_matrix : optional 6-float affine transform applied to the box
//                        corners; pass nullptr to keep the predicted coordinates.
// parray               : parray[0] is the box counter, incremented with atomicAdd
//                        (the caller must zero it before the launch); it is followed
//                        by records of NUM_BOX_ELEMENT floats each, laid out as
//                        left, top, right, bottom, confidence, class, keepflag.
// max_objects          : capacity of parray in records. Boxes past the capacity are
//                        discarded, but the counter still counts them.
// NUM_BOX_ELEMENT      : record stride in floats; seven values are written per record.
static __global__ void decode_kernel(
    float* predict, int num_bboxes, int num_classes, float confidence_threshold,
    float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT
){
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= num_bboxes) return;

    // Without at least one class score the read below would leave the row.
    if (num_classes <= 0) return;

    // Widen before multiplying so the row offset cannot overflow int.
    const float* pitem = predict + (size_t)(5 + num_classes) * (size_t)position;

    float objectness = pitem[4];
    if(objectness < confidence_threshold)
        return;

    // Arg-max over the class scores.
    const float* class_confidence = pitem + 5;
    float confidence = class_confidence[0];
    int label = 0;
    for(int i = 1; i < num_classes; ++i){
        if(class_confidence[i] > confidence){
            confidence = class_confidence[i];
            label = i;
        }
    }

    confidence *= objectness;
    if(confidence < confidence_threshold)
        return;

    // Reserve an output slot. The counter is stored as a float, so it stays
    // exact only up to 2^24 increments.
    int index = (int)atomicAdd(parray, 1.0f);
    if(index >= max_objects)
        return;

    float cx     = pitem[0];
    float cy     = pitem[1];
    float width  = pitem[2];
    float height = pitem[3];
    float left   = cx - width  * 0.5f;
    float top    = cy - height * 0.5f;
    float right  = cx + width  * 0.5f;
    float bottom = cy + height * 0.5f;

    // Apply the caller-supplied transform to both corners when one is given.
    if(invert_affine_matrix != nullptr){
        affine_project(invert_affine_matrix, left,  top,    &left,  &top);
        affine_project(invert_affine_matrix, right, bottom, &right, &bottom);
    }

    // left, top, right, bottom, confidence, class, keepflag
    float* pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
    pout_item[0] = left;
    pout_item[1] = top;
    pout_item[2] = right;
    pout_item[3] = bottom;
    pout_item[4] = confidence;
    pout_item[5] = (float)label;
    pout_item[6] = 1.0f; // 1 = keep, 0 = ignore
}
// Intersection-over-union of boxes A and B, each given as left/top/right/bottom.
// Returns 0 when the boxes do not overlap.
static __device__ float box_iou(
    float aleft, float atop, float aright, float abottom,
    float bleft, float btop, float bright, float bbottom
){
    // Overlap extents, clamped at zero for disjoint boxes.
    const float overlap_w = fmaxf(fminf(aright, bright) - fmaxf(aleft, bleft), 0.0f);
    const float overlap_h = fmaxf(fminf(abottom, bbottom) - fmaxf(atop, btop), 0.0f);
    const float intersection = overlap_w * overlap_h;
    if(intersection == 0.0f)
        return 0.0f;

    const float area_a = fmaxf(0.0f, aright - aleft) * fmaxf(0.0f, abottom - atop);
    const float area_b = fmaxf(0.0f, bright - bleft) * fmaxf(0.0f, bbottom - btop);
    return intersection / (area_a + area_b - intersection);
}
// Non-maximum suppression over the records written after the counter in `bboxes`.
// Each thread owns the record selected by its global 1-D index and clears that
// record's keepflag when a same-class record with a confidence at least as high
// overlaps it with IoU above `threshold`.
// Record layout: left, top, right, bottom, confidence, class, keepflag.
static __global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold, int NUM_BOX_ELEMENT){
    const int self = (blockDim.x * blockIdx.x + threadIdx.x);
    // bboxes[0] holds the number of decoded boxes; it may exceed the capacity.
    const int total = min((int)*bboxes, max_objects);
    if (self >= total)
        return;

    float* mine = bboxes + 1 + self * NUM_BOX_ELEMENT;
    for(int other = 0; other < total; ++other){
        if(other == self)
            continue;

        float* theirs = bboxes + 1 + other * NUM_BOX_ELEMENT;

        // Only records of the same class compete.
        if(mine[5] != theirs[5])
            continue;

        // A record with lower confidence never suppresses this one.
        if(!(theirs[4] >= mine[4]))
            continue;

        // On equal confidence only a later record may suppress this one, so
        // exactly one of the tied records survives.
        if(theirs[4] == mine[4] && other < self)
            continue;

        const float overlap = box_iou(
            mine[0],   mine[1],   mine[2],   mine[3],
            theirs[0], theirs[1], theirs[2], theirs[3]
        );
        if(overlap > threshold){
            mine[6] = 0;    // 1=keep, 0=ignore
            return;
        }
    }
}
// Queues the decode kernel followed by the NMS kernel on `stream`.
//
// predict / parray are handed straight to the kernels. parray[0] is the box
// counter, which decode_kernel only increments, so the caller must zero it
// before calling. Nothing is synchronised here: the caller has to synchronise
// `stream` before reading parray, and can use cudaGetLastError() to detect a
// failed launch.
void decode_kernel_invoker(
    float* predict, int num_bboxes, int num_classes, float confidence_threshold,
    float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT, cudaStream_t stream){

    // With no rows to decode or no room for results there is nothing to launch;
    // returning early also avoids a zero block size and the division by zero it
    // would cause in the grid computation below.
    if(num_bboxes <= 0 || max_objects <= 0)
        return;

    const int max_threads_per_block = 512;

    int block = num_bboxes > max_threads_per_block ? max_threads_per_block : num_bboxes;
    int grid  = (num_bboxes + block - 1) / block;

    /* Editors may underline the <<<...>>> launch syntax; that is harmless. */
    decode_kernel<<<grid, block, 0, stream>>>(
        predict, num_bboxes, num_classes, confidence_threshold,
        invert_affine_matrix, parray, max_objects, NUM_BOX_ELEMENT
    );

    // The NMS launch covers the full capacity; surplus threads exit at once
    // after comparing their index with the counter.
    block = max_objects > max_threads_per_block ? max_threads_per_block : max_objects;
    grid  = (max_objects + block - 1) / block;
    fast_nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold, NUM_BOX_ELEMENT);
}
main.cpp
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <stdio.h>
#include <chrono>
#include <fstream>
#include "box.hpp"
using namespace std;
using namespace cv;
// checkRuntime(call) evaluates a CUDA runtime call and forwards its status,
// together with the call text and source location, to __check_cuda_runtime.
#define checkRuntime(op) __check_cuda_runtime((op), #op, __FILE__, __LINE__)

// Returns true when `code` is cudaSuccess; otherwise prints the failing call,
// its location and the CUDA error name/message, then returns false.
bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line){
    if(code == cudaSuccess)
        return true;

    printf("runtime error %s:%d %s failed. \n code = %s, message = %s\n",
           file, line, op, cudaGetErrorName(code), cudaGetErrorString(code));
    return false;
}
// Reads the whole file at `file` into a byte vector.
// Returns an empty vector when the file cannot be opened, its size cannot be
// determined, or it is empty. If fewer bytes than expected can be read, only
// the bytes actually read are returned.
static std::vector<uint8_t> load_file(const std::string& file){
    std::ifstream in(file, std::ios::in | std::ios::binary);
    if (!in.is_open())
        return {};

    in.seekg(0, std::ios::end);
    // tellg() reports -1 on failure; converting that to size_t would request a
    // gigantic allocation, so keep it signed and test it first.
    const std::streamoff length = in.tellg();
    if (length <= 0)
        return {};

    std::vector<uint8_t> data(static_cast<size_t>(length));
    in.seekg(0, std::ios::beg);
    in.read(reinterpret_cast<char*>(data.data()), static_cast<std::streamsize>(length));
    if (!in){
        // Short read: drop the tail that was never filled.
        data.resize(static_cast<size_t>(in.gcount()));
    }
    return data;
}
// CPU reference for YOLOv5 post-processing: decodes the prediction rows into
// boxes, then applies per-class non-maximum suppression.
//
// predict              : rows * cols floats; each row is read as
//                        cx, cy, width, height, objectness, then cols - 5 class scores.
// confidence_threshold : a row is dropped when its objectness, or objectness times
//                        its best class score, is below this value.
// nms_threshold        : a box is suppressed when its IoU with a kept box of the
//                        same class is greater than or equal to this value.
// Returns the kept boxes ordered by descending confidence; the result is empty
// when there are no rows or no class columns.
vector<Box> cpu_decode(float* predict, int rows, int cols, float confidence_threshold = 0.25f, float nms_threshold = 0.45f){
    vector<Box> boxes;
    const int num_classes = cols - 5;

    // Without class columns the arg-max below would read past the row.
    if(predict == nullptr || rows <= 0 || num_classes <= 0)
        return boxes;

    for(int i = 0; i < rows; ++i){
        const float* pitem = predict + static_cast<size_t>(i) * cols;
        const float objness = pitem[4];
        if(objness < confidence_threshold)
            continue;

        const float* pclass = pitem + 5;
        const int label = static_cast<int>(std::max_element(pclass, pclass + num_classes) - pclass);
        const float confidence = pclass[label] * objness;
        if(confidence < confidence_threshold)
            continue;

        // Centre/size -> corner coordinates.
        const float cx     = pitem[0];
        const float cy     = pitem[1];
        const float width  = pitem[2];
        const float height = pitem[3];
        const float left   = cx - width  * 0.5f;
        const float top    = cy - height * 0.5f;
        const float right  = cx + width  * 0.5f;
        const float bottom = cy + height * 0.5f;
        boxes.emplace_back(left, top, right, bottom, confidence, label);
    }

    // Highest confidence first, so a kept box only suppresses weaker ones.
    std::sort(boxes.begin(), boxes.end(), [](const Box& a, const Box& b){
        return a.confidence > b.confidence;
    });

    std::vector<bool> remove_flags(boxes.size());
    std::vector<Box> box_result;
    box_result.reserve(boxes.size());

    auto iou = [](const Box& a, const Box& b){
        const float cross_left   = std::max(a.left, b.left);
        const float cross_top    = std::max(a.top, b.top);
        const float cross_right  = std::min(a.right, b.right);
        const float cross_bottom = std::min(a.bottom, b.bottom);

        const float cross_area = std::max(0.0f, cross_right - cross_left) * std::max(0.0f, cross_bottom - cross_top);
        const float union_area = std::max(0.0f, a.right - a.left) * std::max(0.0f, a.bottom - a.top)
                               + std::max(0.0f, b.right - b.left) * std::max(0.0f, b.bottom - b.top) - cross_area;
        if(cross_area == 0 || union_area == 0) return 0.0f;
        return cross_area / union_area;
    };

    for(size_t i = 0; i < boxes.size(); ++i){
        if(remove_flags[i]) continue;

        const Box& ibox = boxes[i];
        box_result.emplace_back(ibox);
        for(size_t j = i + 1; j < boxes.size(); ++j){
            if(remove_flags[j]) continue;

            const Box& jbox = boxes[j];
            // Suppression only applies within the same class.
            if(ibox.label == jbox.label && iou(ibox, jbox) >= nms_threshold)
                remove_flags[j] = true;
        }
    }
    return box_result;
}
// Forward declaration of the GPU decode + NMS launcher defined in gpu_decode.cu.
// gpu_decode below calls it with device buffers for `predict` and `parray`.
void decode_kernel_invoker(
float* predict, int num_bboxes, int num_classes, float confidence_threshold,
float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT, cudaStream_t stream);
// GPU version of the YOLOv5 post-processing: uploads the predictions, runs the
// decode and NMS kernels through decode_kernel_invoker, and collects the boxes
// whose keepflag is still set.
//
// predict : host buffer of rows * cols floats; cols - 5 is passed on as the class count.
// Returns the kept boxes, or an empty vector when the input is empty or any
// CUDA call fails (the failure is reported by checkRuntime).
vector<Box> gpu_decode(float* predict, int rows, int cols, float confidence_threshold = 0.25f, float nms_threshold = 0.45f){
    vector<Box> box_result;
    if(predict == nullptr || rows <= 0 || cols <= 5)
        return box_result;

    const int max_objects     = 1000;
    const int NUM_BOX_ELEMENT = 7;     // left, top, right, bottom, confidence, class, keepflag

    // One float for the box counter, then max_objects records.
    const size_t predict_bytes = static_cast<size_t>(rows) * cols * sizeof(float);
    const size_t output_bytes  = (1 + static_cast<size_t>(max_objects) * NUM_BOX_ELEMENT) * sizeof(float);

    cudaStream_t stream   = nullptr;
    float* predict_device = nullptr;
    float* output_device  = nullptr;
    float* output_host    = nullptr;

    // Each step runs only if all previous ones succeeded, so a failure skips
    // straight to the cleanup at the bottom instead of using invalid resources.
    bool ok = checkRuntime(cudaStreamCreate(&stream));
    ok = ok && checkRuntime(cudaMalloc(&predict_device, predict_bytes));
    ok = ok && checkRuntime(cudaMalloc(&output_device, output_bytes));
    ok = ok && checkRuntime(cudaMallocHost(&output_host, output_bytes));
    ok = ok && checkRuntime(cudaMemcpyAsync(predict_device, predict, predict_bytes, cudaMemcpyHostToDevice, stream));

    // cudaMalloc does not clear memory and the decode kernel only increments the
    // counter, so it has to start at zero (all-zero bytes are 0.0f).
    ok = ok && checkRuntime(cudaMemsetAsync(output_device, 0, sizeof(float), stream));

    if(ok){
        decode_kernel_invoker(
            predict_device, rows, cols - 5, confidence_threshold,
            nms_threshold, nullptr, output_device, max_objects, NUM_BOX_ELEMENT, stream
        );
        // Kernel launches return no status; fetch launch errors explicitly.
        ok = checkRuntime(cudaGetLastError());
    }

    ok = ok && checkRuntime(cudaMemcpyAsync(output_host, output_device, output_bytes, cudaMemcpyDeviceToHost, stream));
    ok = ok && checkRuntime(cudaStreamSynchronize(stream));

    if(ok){
        // The counter keeps counting past the capacity, so clamp it.
        const int num_boxes = min((int)output_host[0], max_objects);
        for(int i = 0; i < num_boxes; ++i){
            const float* ptr = output_host + 1 + NUM_BOX_ELEMENT * i;
            const int keep_flag = static_cast<int>(ptr[6]);
            if(keep_flag){
                box_result.emplace_back(
                    ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], (int)ptr[5]
                );
            }
        }
    }

    // Release whatever was actually acquired, on success and on failure alike.
    if(stream != nullptr)         checkRuntime(cudaStreamDestroy(stream));
    if(predict_device != nullptr) checkRuntime(cudaFree(predict_device));
    if(output_device != nullptr)  checkRuntime(cudaFree(output_device));
    if(output_host != nullptr)    checkRuntime(cudaFreeHost(output_host));
    return box_result;
}
// Loads the saved predictions and the input image, runs the GPU post-processing,
// draws the resulting boxes and writes image-draw.jpg.
// Returns 0 on success and 1 when an input is missing/unusable or the output
// image cannot be written.
int main(){
    auto data = load_file("predict.data");
    if(data.empty()){
        printf("Failed to load predict.data\n");
        return 1;
    }

    auto image = cv::imread("input-image.jpg");
    if(image.empty()){
        printf("Failed to load input-image.jpg\n");
        return 1;
    }

    float* ptr = (float*)data.data();
    int nelem  = data.size() / sizeof(float);
    int ncols  = 85;    // 5 box values + 80 class scores per row
    int nrows  = nelem / ncols;
    if(nrows <= 0){
        printf("predict.data holds fewer than %d floats\n", ncols);
        return 1;
    }

    auto boxes = gpu_decode(ptr, nrows, ncols);
    for(auto& box : boxes){
        cv::rectangle(image, cv::Point(box.left, box.top), cv::Point(box.right, box.bottom), cv::Scalar(0, 255, 0), 2);
        // Font face 0 is FONT_HERSHEY_SIMPLEX, line type 16 is LINE_AA.
        cv::putText(image, cv::format("%.2f", box.confidence), cv::Point(box.left, box.top - 7), 0, 0.8, cv::Scalar(0, 0, 255), 2, 16);
    }

    if(!cv::imwrite("image-draw.jpg", image)){
        printf("Failed to write image-draw.jpg\n");
        return 1;
    }
    return 0;
}
makefile
# Host C++ compiler; also handed to nvcc through -ccbin below.
cc := g++
# Name of the binary that gets linked.
name := pro
# Directory the linked binary is written to.
workdir := workspace
# Directory searched for .cpp and .cu sources.
srcdir := src
# Directory for object files and generated dependency (.mk) files.
objdir := objs
# C++ standard passed via -std=.
stdcpp := c++11
# Machine-specific install locations; adjust them to the local setup.
cuda_home := /datav/software/anaconda3/lib/python3.9/site-packages/trtpy/trt8cuda112cudnn8
syslib := /datav/software/anaconda3/lib/python3.9/site-packages/trtpy/lib
cpp_pkg := /datav/software/anaconda3/lib/python3.9/site-packages/trtpy/cpp-packages
# Extra architecture flags added to the nvcc command line; empty here.
cuda_arch :=
nvcc := $(cuda_home)/bin/nvcc -ccbin=$(cc)
# Find the .cpp sources and derive the matching object and dependency (.mk) file names
cpp_srcs := $(shell find $(srcdir) -name "*.cpp")
cpp_objs := $(cpp_srcs:.cpp=.cpp.o)
cpp_objs := $(cpp_objs:$(srcdir)/%=$(objdir)/%)
cpp_mk := $(cpp_objs:.cpp.o=.cpp.mk)
# Find the .cu sources and derive the matching object and dependency (.mk) file names
cu_srcs := $(shell find $(srcdir) -name "*.cu")
cu_objs := $(cu_srcs:.cu=.cu.o)
cu_objs := $(cu_objs:$(srcdir)/%=$(objdir)/%)
cu_mk := $(cu_objs:.cu.o=.cu.mk)
# Libraries needed for OpenCV and CUDA
link_cuda := cudart cublas
link_trtpro :=
link_tensorRT :=
link_opencv := opencv_core opencv_imgcodecs opencv_imgproc
link_sys := stdc++ dl
link_librarys := $(link_cuda) $(link_tensorRT) $(link_sys) $(link_opencv)
# Header search paths; note that nothing (not even a space) may follow the trailing backslash
# List bare paths only, without -I
include_paths := src \
$(cuda_home)/include/cuda \
$(cuda_home)/include/tensorRT \
$(cpp_pkg)/opencv4.2/include
# Library search paths; list bare paths only, without -L
library_paths := $(cuda_home)/lib64 $(syslib) $(cpp_pkg)/opencv4.2/lib
# Join the library paths into one string, e.g. a b c => a:b:c
# so that LD_LIBRARY_PATH=a:b:c can be exported
empty :=
library_path_export := $(subst $(empty) $(empty),:,$(library_paths))
# Turn the path and library lists into compiler flags by prefixing -I, -L and -l in bulk
run_paths := $(foreach item,$(library_paths),-Wl,-rpath=$(item))
include_paths := $(foreach item,$(include_paths),-I$(item))
library_paths := $(foreach item,$(library_paths),-L$(item))
link_librarys := $(foreach item,$(link_librarys),-l$(item))
# For a different GPU, change -gencode=arch=compute_75,code=sm_75 to that card's compute capability
# Compute capability numbers are listed here:https://developer.nvidia.com/zh-cn/cuda-gpus#compute
# On a Jetson Nano, if the -m64 option is reported as unknown, remove -m64; the result is unaffected
cpp_compile_flags := -std=$(stdcpp) -w -g -O0 -m64 -fPIC -fopenmp -pthread
cu_compile_flags := -std=$(stdcpp) -w -g -O0 -m64 $(cuda_arch) -Xcompiler "$(cpp_compile_flags)"
link_flags := -pthread -fopenmp -Wl,-rpath='$$ORIGIN'
cpp_compile_flags += $(include_paths)
cu_compile_flags += $(include_paths)
link_flags += $(library_paths) $(link_librarys) $(run_paths)
# The included dependency files list rules of their own, so name the default
# goal explicitly instead of relying on "first target in the makefile".
.DEFAULT_GOAL := $(name)

# When a header changes, the generated dependency files included here make the
# .cpp/.cu files that use it rebuild automatically.
ifneq ($(MAKECMDGOALS), clean)
-include $(cpp_mk) $(cu_mk)
endif

$(name) : $(workdir)/$(name)

all : $(name)

# Run the binary from inside the work directory; extra arguments go in run_args.
run : $(name)
	@cd $(workdir) && ./$(name) $(run_args)

# Link all objects into the final binary.
$(workdir)/$(name) : $(cpp_objs) $(cu_objs)
	@echo Link $@
	@mkdir -p $(dir $@)
	@$(cc) $^ -o $@ $(link_flags)

# Compile a C++ source.
$(objdir)/%.cpp.o : $(srcdir)/%.cpp
	@echo Compile CXX $<
	@mkdir -p $(dir $@)
	@$(cc) -c $< -o $@ $(cpp_compile_flags)

# Compile a CUDA source.
$(objdir)/%.cu.o : $(srcdir)/%.cu
	@echo Compile CUDA $<
	@mkdir -p $(dir $@)
	@$(nvcc) -c $< -o $@ $(cu_compile_flags)

# Generate the dependency (.mk) file of a C++ source.
$(objdir)/%.cpp.mk : $(srcdir)/%.cpp
	@echo Compile depends C++ $<
	@mkdir -p $(dir $@)
	@$(cc) -M $< -MF $@ -MT $(@:.cpp.mk=.cpp.o) $(cpp_compile_flags)

# Generate the dependency (.mk) file of a CUDA source.
$(objdir)/%.cu.mk : $(srcdir)/%.cu
	@echo Compile depends CUDA $<
	@mkdir -p $(dir $@)
	@$(nvcc) -M $< -MF $@ -MT $(@:.cu.mk=.cu.o) $(cu_compile_flags)

# Remove build products and generated images.
clean :
	@rm -rf $(objdir) $(workdir)/$(name) $(workdir)/input-image-pytorch.jpg $(workdir)/image-draw.jpg

# Keep these target names from being mistaken for files.
.PHONY : clean run all $(name)

# Export the library search path so the built binary can be run.
export LD_LIBRARY_PATH:=$(library_path_export)
边栏推荐
- Inside the hard core of LAN SDN technology - evpn implementation of 16 three from thing to person user roaming in the park
- LAN SDN hard core technology insider 18 beautiful new world
- 无代码生产新模式探索
- 记一次线上SQL死锁事故:如何避免死锁?
- UE4引擎的CopyTexture, CopyToResolveTarget
- 避错,常见Appium相关问题及解决方案
- 开发过程中的总结 BaseService 为所有的 Controller或Service 提供一个公共获取 Service 的文件,减少重复注入
- Copytexture, copytoresolvetarget of UE4 engine
- Scala Generic 泛型类详解 - T
- 局域网SDN硬核技术内幕 18 美丽新世界
猜你喜欢

聊聊并发编程的12种业务场景

Application of workflow engine in vivo marketing automation

11.37万的星瑞是怎样一个产品和表现力?一起来看看吧

FastAPI学习(二)——FastAPI+Jinjia2模板渲染网页(跳转返回渲染页面)

remove函数的实现

Wechat hotel reservation applet graduation project (7) Interim inspection report

开幕在即 | “万物互联,使能千行百业”2022开放原子全球开源峰会OpenAtom OpenHarmony分论坛

第二篇如何使用SourceTree更新代码到本地

微信小程序项目实战

第一篇sourcetree安装
随机推荐
remove函数的实现
Uniapp switches the tab bar to display different pages, remembers the page location and pulls up to get new data
沃尔沃xc90的安全性如何?一起来看看吧
11.37万的星瑞是怎样一个产品和表现力?一起来看看吧
LAN SDN technology hard core insider 4 from computing virtualization to network virtualization
Wechat hotel reservation applet graduation project (5) assignment
【刷题记录】18. 四数之和
局域网SDN技术硬核内幕 8 从二层交换到三层路由
Talk about 12 business scenarios of concurrent programming
Implementation of remove function
6-15漏洞利用-smb-RCE远程命令执行
【翻译】宣布Krius--加速你对Kubernetes的监控采用
百度搜索打击盗版网文站点:互联网内容侵权现象为何屡禁不止
我是如何在一周内拿到4份offer的?
URL的结构解读
正版Adobe软件来了!Adobe全球唯一正版全家桶订阅只需0元/年
《postgresql指南--内幕探索》第一章 数据库集簇、数据库和数据表
自定义flink es source
Wechat hotel reservation applet graduation project (7) Interim inspection report
Redis——JedisConnectionException Could not get a resource from the pool