diff --git a/CMakeLists.txt b/CMakeLists.txt index f62802a..54e54b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,8 +4,8 @@ project(YOLOv8_TensorRT_CPP) # Use ccache to speed up rebuilds include(cmake/ccache.cmake) -# Set C++ version and optimization level -set(CMAKE_CXX_STANDARD 17) +# Set C++ version and optimization level (tensorrt_cpp_api v7 requires C++20) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Ofast -DNDEBUG -Wno-deprecated-declarations") # CMake Options @@ -19,14 +19,20 @@ find_package(OpenCV REQUIRED) # TODO: Specify the path to TensorRT root dir set(TensorRT_DIR /home/cyrus/work/libs/TensorRT-10.0.0.6/) -# Build the TensorRT inference engine library -# TensorRT is found and linked by the tensorrt-cpp-api +# Build the tensorrt_cpp_api v7 library (git submodule). Enable its OpenCV interop so we can hand +# cv::cuda::GpuMat frames to the fused preprocessing kernel zero-copy (trtcpp::opencv::viewOf). +# NOTE: the libs/tensorrt-cpp-api submodule must be at tensorrt_cpp_api v7.0.0+ (see MIGRATION.md). +set(TRT_CPP_API_WITH_OPENCV ON CACHE BOOL "" FORCE) +set(TRT_CPP_API_BUILD_PREPROC ON CACHE BOOL "" FORCE) add_subdirectory(libs/tensorrt-cpp-api) -# Build the YoloV8 library +# Build the YoloV8 library. v7 exposes namespaced targets and propagates its own include dirs +# (the tensorrt_cpp_api/ root), so the old libs/.../src include is gone. 
add_library(YoloV8_TRT SHARED src/yolov8.cpp) -target_link_libraries(YoloV8_TRT PUBLIC tensorrt_cpp_api ${OpenCV_LIBS}) -target_include_directories(YoloV8_TRT PUBLIC libs/tensorrt-cpp-api/src) +target_link_libraries(YoloV8_TRT PUBLIC + tensorrt_cpp_api::tensorrt_cpp_api + tensorrt_cpp_api::preproc + ${OpenCV_LIBS}) # Build and link the executables add_executable(detect_object_image src/object_detection_image.cpp) diff --git a/MIGRATION.md b/MIGRATION.md new file mode 100644 index 0000000..49c99d8 --- /dev/null +++ b/MIGRATION.md @@ -0,0 +1,64 @@ +# Migration to tensorrt_cpp_api v7 + +This branch (`v7-migration`) ports YOLOv8-TensorRT-CPP from the v6 `tensorrt-cpp-api` to **v7**, +which is a clean break (new namespace `trtcpp`, no-throw `Status`/`Result`, name-keyed tensors, +PImpl headers with no OpenCV/TensorRT leakage). See the library's `docs/upgrading_from_v6.md`. + +## Verification status + +Built and run on an RTX 3080 Laptop GPU against a CUDA-12.6 OpenCV-CUDA build and the v7 library: +`detect_object_image` on `images/team.jpg` (FP16 YOLOv8n) runs end-to-end and detects the people in +the frame and writes the annotated image (~10 objects; the exact count shifts by one or two across +FP16 engine rebuilds, as TensorRT's tactic selection moves borderline detections across the +confidence threshold). The library, `opencv_interop`, the preproc sublib, and the YoloV8 detection +code compile and run. The video/CSI demo targets additionally need an OpenCV built with +`highgui`/`videoio`. + +## Required after pulling: init the submodule + +`libs/tensorrt-cpp-api` is a git submodule pinned to a v7 release; populate it before building: + +```sh +git submodule update --init +``` + +## What changed + +**Build (`CMakeLists.txt`)** +- C++17 → **C++20** (v7 requirement). +- Enable the library's OpenCV interop and preprocessing before `add_subdirectory`: + `TRT_CPP_API_WITH_OPENCV=ON`, `TRT_CPP_API_BUILD_PREPROC=ON`. 
+- Link the namespaced v7 targets `tensorrt_cpp_api::tensorrt_cpp_api` + `tensorrt_cpp_api::preproc` + (was `tensorrt_cpp_api`); dropped the `libs/.../src` include — v7 propagates its own include root. + +**Inference layer (`src/yolov8.{h,cpp}`)** +- `Engine<float>` → `trtcpp::Engine` (non-templated; runtime `DType`). IO is now name-keyed; the + class caches `m_inputName`/`m_outputNames`/`m_inputShape`/`m_outputShapes` and a reusable + NCHW-float input `Tensor` plus a `Stream`. +- `Options` + `buildLoadNetwork(onnx, SUB, DIV, NORMALIZE)` → `BuildOptions` + + `EngineBuilder::buildAndLoad(onnx, opts)`. +- Preprocessing: the v6 OpenCV `cvtColor` + `resizeKeepAspectRatioPadRightBottom` + in-engine + HWC→NCHW/normalize is replaced by **one fused kernel**, `preproc::letterboxToTensor`, fed a + zero-copy `trtcpp::opencv::viewOf(gpuImg)` device view (BGR→RGB via `swapRB`, letterbox pad + right/bottom, `scale = 1/255`). Box-mapping ratio is unchanged. +- `runInference(GpuMat, nested-vectors)` + `Engine<float>::transformOutput` → `engine.infer(...)` + returning name-keyed owning `Tensor`s; each output is read back with `toHost(stream)` (explicit + D2H + sync) into a flat `std::vector<float>`. The detect/pose/seg **post-processing math is + unchanged** — only how it obtains dims (`getOutputDims().d[i]` → cached `Shape[i]`) and the flat + output buffers. +- Errors: v6 `bool`/exception checks → unwrap `Result`/`Status` (throwing a `std::runtime_error` + with `.status().message()` to preserve this app's exception-based control flow). + +**Precision (`src/cmd_line_util.h`)** +- `Precision::FP32/FP16/INT8` → `trtcpp::Precision::kFp32/kFp16/kInt8Qdq`. +- **INT8 caveat:** `kInt8Qdq` expects an explicit Q/DQ ONNX (no calibration data). The v6 flow + (a calibration-image directory) maps to `kInt8CalibLegacy`, which is only available when the + library is built against **TensorRT < 11** and requires constructing an `ICalibrator` + (`tensorrt_cpp_api/calibrator.h`) and setting `BuildOptions.calibrator`.
That wiring is **not** + ported here — quantize to a QDQ ONNX, or restore a calibrator if you need legacy PTQ. + +**Misc** +- Added `src/stopwatch.h` (a small `std::chrono` `preciseStopwatch`) to replace the timing utility + v6 shipped in the engine library and v7 does not. +- The OpenCV modules the v6 `engine.h` pulled in transitively (`imgcodecs`, `videoio`, `highgui`) + are now included explicitly where used. diff --git a/libs/tensorrt-cpp-api b/libs/tensorrt-cpp-api index f93f973..166ce91 160000 --- a/libs/tensorrt-cpp-api +++ b/libs/tensorrt-cpp-api @@ -1 +1 @@ -Subproject commit f93f973cd03f1caa710c60d60c0d6feb9ed79e4a +Subproject commit 166ce91ce71b4ac7c39611796ced4a8bf20fcab9 diff --git a/src/benchmark.cpp b/src/benchmark.cpp index b7cf8ce..d96f420 100644 --- a/src/benchmark.cpp +++ b/src/benchmark.cpp @@ -1,6 +1,7 @@ #include "cmd_line_util.h" +#include "stopwatch.h" #include "yolov8.h" -#include +#include // imread (was pulled in transitively by the v6 engine.h) // Benchmarks the specified model int main(int argc, char *argv[]) { diff --git a/src/cmd_line_util.h b/src/cmd_line_util.h index 9aed68d..e6a4dc1 100644 --- a/src/cmd_line_util.h +++ b/src/cmd_line_util.h @@ -191,11 +191,14 @@ inline bool parseArguments(int argc, char *argv[], YoloV8Config &config, std::st return false; if (nextArgument == "FP32") { - config.precision = Precision::FP32; + config.precision = trtcpp::Precision::kFp32; } else if (nextArgument == "FP16") { - config.precision = Precision::FP16; + config.precision = trtcpp::Precision::kFp16; } else if (nextArgument == "INT8") { - config.precision = Precision::INT8; + // v7: kInt8Qdq is the forward-compatible explicit-QDQ path. For v6-style + // calibration-directory PTQ, use kInt8CalibLegacy + an ICalibrator (only when + // the library is built against TensorRT < 11). 
+ config.precision = trtcpp::Precision::kInt8Qdq; } else { std::cout << "Error: Unexpected precision value: " << nextArgument << ", options are FP32, FP16, INT8" << std::endl; return false; @@ -379,11 +382,14 @@ inline bool parseArgumentsVideo(int argc, char *argv[], YoloV8Config &config, st return false; if (nextArgument == "FP32") { - config.precision = Precision::FP32; + config.precision = trtcpp::Precision::kFp32; } else if (nextArgument == "FP16") { - config.precision = Precision::FP16; + config.precision = trtcpp::Precision::kFp16; } else if (nextArgument == "INT8") { - config.precision = Precision::INT8; + // v7: kInt8Qdq is the forward-compatible explicit-QDQ path. For v6-style + // calibration-directory PTQ, use kInt8CalibLegacy + an ICalibrator (only when + // the library is built against TensorRT < 11). + config.precision = trtcpp::Precision::kInt8Qdq; } else { std::cout << "Error: Unexpected precision value: " << nextArgument << ", options are FP32, FP16, INT8" << std::endl; return false; diff --git a/src/object_detection_csi_jetson.cpp b/src/object_detection_csi_jetson.cpp index 63682a3..5f94c27 100644 --- a/src/object_detection_csi_jetson.cpp +++ b/src/object_detection_csi_jetson.cpp @@ -1,6 +1,10 @@ #include "cmd_line_util.h" #include "yolov8.h" -#include +// Under tensorrt_cpp_api v7 the public headers pull in no OpenCV, so the modules this tool needs +// (VideoCapture from videoio, imshow/waitKey from highgui) must be included explicitly rather than +// transitively via the old v6 "engine.h". +#include +#include // Runs object detection on video stream then displays annotated results. 
int main(int argc, char *argv[]) { diff --git a/src/object_detection_image.cpp b/src/object_detection_image.cpp index 4271ad1..9f1cc06 100644 --- a/src/object_detection_image.cpp +++ b/src/object_detection_image.cpp @@ -1,5 +1,6 @@ #include "cmd_line_util.h" #include "yolov8.h" +#include // imread/imwrite (was pulled in transitively by the v6 engine.h) // Runs object detection on an input image then saves the annotated image to disk. int main(int argc, char *argv[]) { diff --git a/src/object_detection_video_stream.cpp b/src/object_detection_video_stream.cpp index 3f53af7..9348107 100644 --- a/src/object_detection_video_stream.cpp +++ b/src/object_detection_video_stream.cpp @@ -1,6 +1,7 @@ #include "cmd_line_util.h" #include "yolov8.h" -#include +#include // imshow/waitKey (was pulled in transitively by the v6 engine.h) +#include // VideoCapture / CAP_PROP_* // Runs object detection on video stream then displays annotated results. int main(int argc, char *argv[]) { diff --git a/src/stopwatch.h b/src/stopwatch.h new file mode 100644 index 0000000..04af649 --- /dev/null +++ b/src/stopwatch.h @@ -0,0 +1,17 @@ +#pragma once +// Minimal stopwatch for the ENABLE_BENCHMARKS timing and the benchmark executable. v6 got this +// from the engine library's util/Stopwatch.h, which v7 does not ship; this is a drop-in local +// replacement with the same preciseStopwatch interface. 
+#include + +template class Stopwatch { +public: + template T elapsedTime() const { + return static_cast(std::chrono::duration_cast(Clock::now() - start_).count()); + } + +private: + typename Clock::time_point start_ = Clock::now(); +}; + +using preciseStopwatch = Stopwatch; diff --git a/src/yolov8.cpp b/src/yolov8.cpp index cbd315c..a2bff04 100644 --- a/src/yolov8.cpp +++ b/src/yolov8.cpp @@ -1,77 +1,102 @@ #include "yolov8.h" -#include +#include "stopwatch.h" +#include // std::cout in the ENABLE_BENCHMARKS timing (was transitive via the v6 engine.h) +#include + +namespace { +// Unwrap a v7 Result, throwing on error (this app uses exceptions). Calling .value() directly +// would assert in debug builds and be undefined behavior in -DNDEBUG release builds when the +// Result holds an error (e.g. a dynamic/oversized shape, OOM, or a non-float output dtype). +template T must(trtcpp::Result r, const char *what) { + if (!r) { + throw std::runtime_error(std::string("Error: ") + what + ": " + r.status().message()); + } + return std::move(r).value(); +} +} // namespace YoloV8::YoloV8(const std::string &onnxModelPath, const std::string &trtModelPath, const YoloV8Config &config) : PROBABILITY_THRESHOLD(config.probabilityThreshold), NMS_THRESHOLD(config.nmsThreshold), TOP_K(config.topK), SEG_CHANNELS(config.segChannels), SEG_H(config.segH), SEG_W(config.segW), SEGMENTATION_THRESHOLD(config.segmentationThreshold), CLASS_NAMES(config.classNames), NUM_KPS(config.numKPS), KPS_THRESHOLD(config.kpsThreshold) { - // Specify options for GPU inference - Options options; - options.optBatchSize = 1; - options.maxBatchSize = 1; - + // Specify build options for the v7 engine builder. (Batch knobs are now expressed as + // optimization profiles; this detector uses the model's static 1x3xHxW input.) 
+ trtcpp::BuildOptions options; options.precision = config.precision; - options.calibrationDataDirectoryPath = config.calibrationDataDirectory; + options.engineCacheDir = "."; // build-or-load caches next to the working dir; v7 detects staleness - if (options.precision == Precision::INT8) { - if (options.calibrationDataDirectoryPath.empty()) { - throw std::runtime_error("Error: Must supply calibration data path for INT8 calibration"); - } + // v7 INT8: prefer an explicit-QDQ ONNX with Precision::kInt8Qdq (no calibration data). Legacy + // calibrator PTQ (kInt8CalibLegacy) is only available when the library is built against + // TensorRT < 11 and is wired via BuildOptions.calibrator (see tensorrt_cpp_api/calibrator.h). + if (options.precision == trtcpp::Precision::kInt8CalibLegacy && config.calibrationDataDirectory.empty()) { + throw std::runtime_error("Error: Must supply calibration data path for legacy INT8 calibration"); } - // Create our TensorRT inference engine - m_trtEngine = std::make_unique>(options); - - // Build the onnx model into a TensorRT engine file, cache the file to disk, and then load the TensorRT engine file into memory. - // If the engine file already exists on disk, this function will not rebuild but only load into memory. - // The engine file is rebuilt any time the above Options are changed. - if (!onnxModelPath.empty()) { - // Build the ONNX model into a TensorRT engine file - auto succ = m_trtEngine->buildLoadNetwork(onnxModelPath, SUB_VALS, DIV_VALS, NORMALIZE); - if (!succ) { - const std::string errMsg = "Error: Unable to build or load the TensorRT engine from ONNX model. " - "Try increasing TensorRT log severity to kVERBOSE (in /libs/tensorrt-cpp-api/engine.cpp)."; - throw std::runtime_error(errMsg); + // Obtain a ready-to-run v7 Engine, either by building the ONNX into a TensorRT engine (caching + // it next to the working dir, rebuilding only when stale) or by loading a prebuilt .trt/.engine + // file directly. 
Preprocessing (BGR->RGB, letterbox, 1/255 scale) is fused on the GPU in + // preprocess(), so the v6 SUB_VALS/DIV_VALS/NORMALIZE are no longer passed at build/load time. + auto loadEngine = [&]() -> trtcpp::Result { + if (!onnxModelPath.empty()) { + // Build the ONNX model into a TensorRT engine (or load a fresh cached one) and deserialize it. + return trtcpp::EngineBuilder{}.buildAndLoad(onnxModelPath, options); } - } else if (!trtModelPath.empty()) { // If no ONNX model, check for TRT model - // Load the TensorRT engine file directly - bool succ = m_trtEngine->loadNetwork(trtModelPath, SUB_VALS, DIV_VALS, NORMALIZE); - if (!succ) { - throw std::runtime_error("Error: Unable to load TensorRT engine from " + trtModelPath); + if (!trtModelPath.empty()) { + // No ONNX model: deserialize a prebuilt TensorRT engine file directly. + return trtcpp::Engine::loadFromFile(trtModelPath); } - } else { - throw std::runtime_error("Error: Neither ONNX model nor TensorRT engine path provided."); + return trtcpp::Status{trtcpp::StatusCode::kInvalidArgument, "Neither ONNX model nor TensorRT engine path provided."}; + }; + auto engine = loadEngine(); + if (!engine) { + throw std::runtime_error("Error: Unable to build or load the TensorRT engine: " + engine.status().message()); } -} - -std::vector> YoloV8::preprocess(const cv::cuda::GpuMat &gpuImg) { - // Populate the input vectors - const auto &inputDims = m_trtEngine->getInputDims(); - - // Convert the image from BGR to RGB - cv::cuda::GpuMat rgbMat; - cv::cuda::cvtColor(gpuImg, rgbMat, cv::COLOR_BGR2RGB); - - auto resized = rgbMat; - - // Resize to the model expected input size while maintaining the aspect ratio with the use of padding - if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) { - // Only resize if not already the right size to avoid unecessary copy - resized = Engine::resizeKeepAspectRatioPadRightBottom(rgbMat, inputDims[0].d[1], inputDims[0].d[2]); + m_engine = 
std::make_unique(std::move(engine).value()); + + // Cache IO metadata once (v7 is name-keyed and non-templated). + m_inputName = m_engine->inputNames().front(); + m_outputNames = m_engine->outputNames(); + m_inputShape = must(m_engine->tensorShape(m_inputName), "query input shape"); // [1,3,H,W] + for (const auto &name : m_outputNames) { + m_outputShapes.push_back(must(m_engine->tensorShape(name), "query output shape")); } - // Convert to format expected by our inference engine - // The reason for the strange format is because it supports models with multiple inputs as well as batching - // In our case though, the model only has a single input and we are using a batch size of 1. - std::vector input{std::move(resized)}; - std::vector> inputs{std::move(input)}; - - // These params will be used in the post-processing stage - m_imgHeight = rgbMat.rows; - m_imgWidth = rgbMat.cols; - m_ratio = 1.f / std::min(inputDims[0].d[2] / static_cast(rgbMat.cols), inputDims[0].d[1] / static_cast(rgbMat.rows)); + // Pre-allocate the NCHW float input tensor. allocate() errors (and we throw) on a dynamic + // input shape or a CUDA OOM rather than crashing on an unchecked .value(). + m_input = must(trtcpp::Tensor::allocate(trtcpp::DType::kFloat32, m_inputShape, trtcpp::Device::kCuda), "allocate input tensor"); +} - return inputs; +void YoloV8::preprocess(const cv::cuda::GpuMat &gpuImg) { + // Record original dims + the letterbox ratio used by post-processing to map boxes back to the + // source image. inputShape is [1, 3, H, W]. 
+ m_imgHeight = static_cast(gpuImg.rows); + m_imgWidth = static_cast(gpuImg.cols); + const int inH = static_cast(m_inputShape[2]); + const int inW = static_cast(m_inputShape[3]); + m_ratio = 1.f / std::min(inW / m_imgWidth, inH / m_imgHeight); + + // One fused GPU kernel replaces the v6 cvtColor + resizeKeepAspectRatioPadRightBottom and the + // in-engine HWC->NCHW + normalize: BGR->RGB, letterbox-resize (pad right/bottom), scale by + // 1/255 (SUB_VALS=0, DIV_VALS=1, NORMALIZE), and write the NCHW float input tensor in place. + trtcpp::preproc::PreprocSpec spec; + spec.swapRB = true; // OpenCV GpuMat is BGR; the model expects RGB + spec.keepAspectRatioPad = true; // letterbox, pad right/bottom (matches v6) + spec.scale = {1.f / 255.f, 1.f / 255.f, 1.f / 255.f, 1.f}; + + // cv::cuda::GpuMat rows are typically pitched (padded for alignment) and a TensorView is + // contiguous, so make a continuous copy when the upload isn't already continuous. + cv::cuda::GpuMat continuous = gpuImg; + if (!gpuImg.isContinuous()) { + cv::cuda::createContinuous(gpuImg.rows, gpuImg.cols, gpuImg.type(), continuous); + gpuImg.copyTo(continuous); + } + auto src = trtcpp::opencv::viewOf(continuous); // zero-copy HWC-uint8 device view + if (!src) { + throw std::runtime_error("Error: could not view the input GpuMat: " + src.status().message()); + } + if (auto s = trtcpp::preproc::letterboxToTensor(src.value(), m_input.view(), spec, m_stream); !s) { + throw std::runtime_error("Error: preprocessing failed: " + s.message()); + } } std::vector YoloV8::detectObjects(const cv::cuda::GpuMat &inputImageBGR) { @@ -80,7 +105,7 @@ std::vector YoloV8::detectObjects(const cv::cuda::GpuMat &inputImageBGR) static int numIts = 1; preciseStopwatch s1; #endif - const auto input = preprocess(inputImageBGR); + preprocess(inputImageBGR); // fills m_input #ifdef ENABLE_BENCHMARKS static long long t1 = 0; t1 += s1.elapsedTime(); @@ -90,10 +115,21 @@ std::vector YoloV8::detectObjects(const cv::cuda::GpuMat 
&inputImageBGR) #ifdef ENABLE_BENCHMARKS preciseStopwatch s2; #endif - std::vector>> featureVectors; - auto succ = m_trtEngine->runInference(input, featureVectors); - if (!succ) { - throw std::runtime_error("Error: Unable to run inference."); + auto outputs = m_engine->infer({{m_inputName, m_input.view()}}, m_stream); + if (!outputs) { + throw std::runtime_error("Error: Unable to run inference: " + outputs.status().message()); + } + // Read each output back to a flat host float vector, in output-binding order. (v7 returns + // name-keyed owning Tensors; toHost performs the D2H copy AND synchronizes the stream.) + std::vector> featureVectors; + featureVectors.reserve(m_outputNames.size()); + for (const auto &name : m_outputNames) { + auto host = outputs->at(name).toHost(m_stream); + if (!host) { + throw std::runtime_error("Error: output readback failed: " + host.status().message()); + } + const auto span = must(host->as(), "output tensor is not float32 (rebuild the engine with a float output)"); + featureVectors.emplace_back(span.begin(), span.end()); } #ifdef ENABLE_BENCHMARKS static long long t2 = 0; @@ -102,32 +138,28 @@ std::vector YoloV8::detectObjects(const cv::cuda::GpuMat &inputImageBGR) preciseStopwatch s3; #endif // Check if our model does only object detection or also supports segmentation + // v7 already gives one flat host vector per output (batch size 1), so the v6 transformOutput + // 3D->1D/2D flattening is no longer needed. std::vector ret; - const auto &numOutputs = m_trtEngine->getOutputDims().size(); - if (numOutputs == 1) { - // Object detection or pose estimation - // Since we have a batch size of 1 and only 1 output, we must convert the output from a 3D array to a 1D array. 
- std::vector featureVector; - Engine::transformOutput(featureVectors, featureVector); - - const auto &outputDims = m_trtEngine->getOutputDims(); - size_t numChannels = outputDims[outputDims.size() - 1].d[1]; + if (m_outputShapes.size() == 1) { + // Object detection or pose estimation. Output shape is [1, C, anchors]; the channel count C + // distinguishes the two: pose adds NUM_KPS*3 keypoint values on top of (4 box + classes), + // while plain detection is just (4 box + classes). No magic number; works with Ultralytics + // pretrained models. + const size_t numChannels = static_cast(m_outputShapes[0][1]); if (numChannels == 4 + CLASS_NAMES.size() + NUM_KPS * 3) { // Pose estimation - ret = postprocessPose(featureVector); - } else if (numChannels == 4 + CLASS_NAMES.size()){ + ret = postprocessPose(featureVectors[0]); + } else if (numChannels == 4 + CLASS_NAMES.size()) { // Object detection - ret = postprocessDetect(featureVector); + ret = postprocessDetect(featureVectors[0]); } else { throw std::runtime_error("Error: Unable to identify whether the model is for Pose estimation or Object detection."); } } else { - // Segmentation - // Since we have a batch size of 1 and 2 outputs, we must convert the output from a 3D array to a 2D array. - std::vector> featureVector; - Engine::transformOutput(featureVectors, featureVector); - ret = postProcessSegmentation(featureVector); + // Instance segmentation (detections + mask prototypes). 
+ ret = postProcessSegmentation(featureVectors); } #ifdef ENABLE_BENCHMARKS static long long t3 = 0; @@ -147,10 +179,8 @@ std::vector YoloV8::detectObjects(const cv::Mat &inputImageBGR) { } std::vector YoloV8::postProcessSegmentation(std::vector> &featureVectors) { - const auto &outputDims = m_trtEngine->getOutputDims(); - - int numChannels = outputDims[0].d[1]; - int numAnchors = outputDims[0].d[2]; + int numChannels = static_cast(m_outputShapes[0][1]); + int numAnchors = static_cast(m_outputShapes[0][2]); const auto numClasses = numChannels - SEG_CHANNELS - 4; @@ -237,7 +267,6 @@ std::vector YoloV8::postProcessSegmentation(std::vector maskChannels; cv::split(maskMat, maskChannels); - const auto inputDims = m_trtEngine->getInputDims(); cv::Rect roi; if (m_imgHeight > m_imgWidth) { @@ -260,9 +289,8 @@ std::vector YoloV8::postProcessSegmentation(std::vector YoloV8::postprocessPose(std::vector &featureVector) { - const auto &outputDims = m_trtEngine->getOutputDims(); - auto numChannels = outputDims[0].d[1]; - auto numAnchors = outputDims[0].d[2]; + const auto numChannels = static_cast(m_outputShapes[0][1]); + const auto numAnchors = static_cast(m_outputShapes[0][2]); std::vector bboxes; std::vector scores; @@ -342,9 +370,8 @@ std::vector YoloV8::postprocessPose(std::vector &featureVector) { } std::vector YoloV8::postprocessDetect(std::vector &featureVector) { - const auto &outputDims = m_trtEngine->getOutputDims(); - auto numChannels = outputDims[0].d[1]; - auto numAnchors = outputDims[0].d[2]; + const auto numChannels = static_cast(m_outputShapes[0][1]); + const auto numAnchors = static_cast(m_outputShapes[0][2]); auto numClasses = CLASS_NAMES.size(); diff --git a/src/yolov8.h b/src/yolov8.h index 366b370..88452b2 100644 --- a/src/yolov8.h +++ b/src/yolov8.h @@ -1,6 +1,20 @@ #pragma once -#include "engine.h" +// Migrated to tensorrt_cpp_api v7. 
The v7 public headers deliberately pull in no OpenCV, so the +// OpenCV headers this class needs (cv::Mat / GpuMat / dnn::NMSBoxesBatched) are now included +// explicitly here rather than transitively via the old "engine.h". +#include +#include +#include + +#include +#include +#include +#include + #include +#include +#include +#include // Utility method for checking if a file exists on disk inline bool doesFileExist(const std::string &name) { @@ -25,7 +39,7 @@ struct Object { // Can pass these arguments as command line parameters. struct YoloV8Config { // The precision to be used for inference - Precision precision = Precision::FP16; + trtcpp::Precision precision = trtcpp::Precision::kFp16; // Calibration data directory. Must be specified when using INT8 precision. std::string calibrationDataDirectory; // Probability threshold used to filter detected objects @@ -71,8 +85,8 @@ class YoloV8 { void drawObjectLabels(cv::Mat &image, const std::vector &objects, unsigned int scale = 2); private: - // Preprocess the input - std::vector> preprocess(const cv::cuda::GpuMat &gpuImg); + // Preprocess the input (fills m_input + m_ratio/m_imgWidth/m_imgHeight) + void preprocess(const cv::cuda::GpuMat &gpuImg); // Postprocess the output std::vector postprocessDetect(std::vector &featureVector); @@ -83,7 +97,15 @@ class YoloV8 { // Postprocess the output for segmentation model std::vector postProcessSegmentation(std::vector> &featureVectors); - std::unique_ptr> m_trtEngine = nullptr; + // v7 engine + cached IO metadata (v7 is name-keyed and non-templated). The owning input + // Tensor is reused across frames; the caller-owned stream drives async work. 
+ std::unique_ptr m_engine = nullptr; + trtcpp::Stream m_stream; + std::string m_inputName; + std::vector m_outputNames; + trtcpp::Shape m_inputShape; // [1,3,H,W] + std::vector m_outputShapes; // build-time shapes, in output-binding order + trtcpp::Tensor m_input; // NCHW float device tensor fed to the engine // Used for image preprocessing // YoloV8 model expects values between [0.f, 1.f] so we use the following params