cyrusbehr · cyrusbehr · May 30, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -4,8 +4,10 @@ project(YOLOv8_TensorRT_CPP)
 # Use ccache to speed up rebuilds
 include(cmake/ccache.cmake)
 
-# Set C++ version and optimization level
-set(CMAKE_CXX_STANDARD 17)
+# Set C++ version and optimization level (tensorrt_cpp_api v7 requires C++20)
+set(CMAKE_CXX_STANDARD 20)
+# Make C++20 a hard requirement so an unsupported compiler fails at configure time instead of
+# CMake silently falling back to an older standard.
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Ofast -DNDEBUG -Wno-deprecated-declarations")
 
 # CMake Options
@@ -19,14 +21,20 @@ find_package(OpenCV REQUIRED)
 # TODO: Specify the path to TensorRT root dir
 set(TensorRT_DIR /home/cyrus/work/libs/TensorRT-10.0.0.6/)
 
-# Build the TensorRT inference engine library
-# TensorRT is found and linked by the tensorrt-cpp-api
+# Build the tensorrt_cpp_api v7 library (git submodule). Enable its OpenCV interop so we can hand
+# cv::cuda::GpuMat frames to the fused preprocessing kernel zero-copy (trtcpp::opencv::viewOf).
+# NOTE: the libs/tensorrt-cpp-api submodule must be at tensorrt_cpp_api v7.0.0+ (see MIGRATION.md).
+set(TRT_CPP_API_WITH_OPENCV ON CACHE BOOL "" FORCE)
+set(TRT_CPP_API_BUILD_PREPROC ON CACHE BOOL "" FORCE)
 add_subdirectory(libs/tensorrt-cpp-api)
 
-# Build the YoloV8 library
+# Build the YoloV8 library. v7 exposes namespaced targets and propagates its own include dirs
+# (the tensorrt_cpp_api/ root), so the old libs/.../src include is gone.
 add_library(YoloV8_TRT SHARED src/yolov8.cpp)
-target_link_libraries(YoloV8_TRT PUBLIC tensorrt_cpp_api ${OpenCV_LIBS})
-target_include_directories(YoloV8_TRT PUBLIC libs/tensorrt-cpp-api/src)
+target_link_libraries(YoloV8_TRT PUBLIC
+    tensorrt_cpp_api::tensorrt_cpp_api
+    tensorrt_cpp_api::preproc
+    ${OpenCV_LIBS})
 
 # Build and link the executables
 add_executable(detect_object_image src/object_detection_image.cpp)

diff --git a/MIGRATION.md b/MIGRATION.md
@@ -0,0 +1,64 @@
+# Migration to tensorrt_cpp_api v7
+
+This branch (`v7-migration`) ports YOLOv8-TensorRT-CPP from the v6 `tensorrt-cpp-api` to **v7**,
+which is a clean break (new namespace `trtcpp`, no-throw `Status`/`Result`, name-keyed tensors,
+PImpl headers with no OpenCV/TensorRT leakage). See the library's `docs/upgrading_from_v6.md`.
+
+## Verification status
+
+Built and run on an RTX 3080 Laptop GPU against a CUDA-12.6 OpenCV-CUDA build and the v7 library:
+`detect_object_image` on `images/team.jpg` (FP16 YOLOv8n) runs end-to-end, detects the people in
+the frame, and writes the annotated image (~10 objects; the exact count shifts by one or two across
+FP16 engine rebuilds, as TensorRT's tactic selection moves borderline detections across the
+confidence threshold). The library, `opencv_interop`, the preproc sublib, and the YoloV8 detection
+code compile and run. The video/CSI demo targets additionally need an OpenCV built with
+`highgui`/`videoio`.
+
+## Required after pulling: init the submodule
+
+`libs/tensorrt-cpp-api` is a git submodule pinned to a v7 release; populate it before building:
+
+```sh
+git submodule update --init
+```
+
+## What changed
+
+**Build (`CMakeLists.txt`)**
+- C++17 → **C++20** (v7 requirement).
+- Enable the library's OpenCV interop and preprocessing before `add_subdirectory`:
+  `TRT_CPP_API_WITH_OPENCV=ON`, `TRT_CPP_API_BUILD_PREPROC=ON`.
+- Link the namespaced v7 targets `tensorrt_cpp_api::tensorrt_cpp_api` + `tensorrt_cpp_api::preproc`
+  (was `tensorrt_cpp_api`); dropped the `libs/.../src` include — v7 propagates its own include root.
+
+**Inference layer (`src/yolov8.{h,cpp}`)**
+- `Engine<float>` → `trtcpp::Engine` (non-templated; runtime `DType`). IO is now name-keyed; the
+  class caches `m_inputName`/`m_outputNames`/`m_inputShape`/`m_outputShapes` and a reusable
+  NCHW-float input `Tensor` plus a `Stream`.
+- `Options` + `buildLoadNetwork(onnx, SUB, DIV, NORMALIZE)` → `BuildOptions` +
+  `EngineBuilder::buildAndLoad(onnx, opts)`.
+- Preprocessing: the v6 OpenCV `cvtColor` + `resizeKeepAspectRatioPadRightBottom` + in-engine
+  HWC→NCHW/normalize is replaced by **one fused kernel**, `preproc::letterboxToTensor`, fed a
+  zero-copy `trtcpp::opencv::viewOf(gpuImg)` device view (BGR→RGB via `swapRB`, letterbox pad
+  right/bottom, `scale = 1/255`). Box-mapping ratio is unchanged.
+- `runInference(GpuMat, nested-vectors)` + `Engine<float>::transformOutput` → `engine.infer(...)`
+  returning name-keyed owning `Tensor`s; each output is read back with `toHost(stream)` (explicit
+  D2H + sync) into a flat `std::vector<float>`. The detect/pose/seg **post-processing math is
+  unchanged**; the only differences are how it obtains dims (`getOutputDims().d[i]` → cached
+  `Shape[i]`) and that it now works on the flat output buffers.
+- Errors: v6 `bool`/exception checks → unwrap `Result`/`Status` (throwing a `std::runtime_error`
+  with `.status().message()` to preserve this app's exception-based control flow).
+
+**Precision (`src/cmd_line_util.h`)**
+- `Precision::FP32/FP16/INT8` → `trtcpp::Precision::kFp32/kFp16/kInt8Qdq`.
+- **INT8 caveat:** `kInt8Qdq` expects an explicit Q/DQ ONNX (no calibration data). The v6 flow
+  (a calibration-image directory) maps to `kInt8CalibLegacy`, which is only available when the
+  library is built against **TensorRT < 11** and requires constructing an `ICalibrator`
+  (`tensorrt_cpp_api/calibrator.h`) and setting `BuildOptions.calibrator`. That wiring is **not**
+  ported here — quantize to a QDQ ONNX, or restore a calibrator if you need legacy PTQ.
+
+**Misc**
+- Added `src/stopwatch.h` (a small `std::chrono` `preciseStopwatch`) to replace the timing utility
+  v6 shipped in the engine library and v7 does not.
+- The OpenCV modules the v6 `engine.h` pulled in transitively (`imgcodecs`, `videoio`, `highgui`)
+  are now included explicitly where used.
diff --git a/libs/tensorrt-cpp-api b/libs/tensorrt-cpp-api
diff --git a/src/benchmark.cpp b/src/benchmark.cpp
@@ -1,6 +1,7 @@
 #include "cmd_line_util.h"
+#include "stopwatch.h"
 #include "yolov8.h"
-#include <opencv2/cudaimgproc.hpp>
+#include <opencv2/imgcodecs.hpp> // imread (was pulled in transitively by the v6 engine.h)
 
 // Benchmarks the specified model
 int main(int argc, char *argv[]) {

diff --git a/src/cmd_line_util.h b/src/cmd_line_util.h
@@ -191,11 +191,14 @@ inline bool parseArguments(int argc, char *argv[], YoloV8Config &config, std::st
                     return false;
 
                 if (nextArgument == "FP32") {
-                    config.precision = Precision::FP32;
+                    config.precision = trtcpp::Precision::kFp32;
                 } else if (nextArgument == "FP16") {
-                    config.precision = Precision::FP16;
+                    config.precision = trtcpp::Precision::kFp16;
                 } else if (nextArgument == "INT8") {
-                    config.precision = Precision::INT8;
+                    // v7: kInt8Qdq is the forward-compatible explicit-QDQ path. For v6-style
+                    // calibration-directory PTQ, use kInt8CalibLegacy + an ICalibrator (only when
+                    // the library is built against TensorRT < 11).
+                    config.precision = trtcpp::Precision::kInt8Qdq;
                 } else {
                     std::cout << "Error: Unexpected precision value: " << nextArgument << ", options are FP32, FP16, INT8" << std::endl;
                     return false;
@@ -379,11 +382,14 @@ inline bool parseArgumentsVideo(int argc, char *argv[], YoloV8Config &config, st
                     return false;
 
                 if (nextArgument == "FP32") {
-                    config.precision = Precision::FP32;
+                    config.precision = trtcpp::Precision::kFp32;
                 } else if (nextArgument == "FP16") {
-                    config.precision = Precision::FP16;
+                    config.precision = trtcpp::Precision::kFp16;
                 } else if (nextArgument == "INT8") {
-                    config.precision = Precision::INT8;
+                    // v7: kInt8Qdq is the forward-compatible explicit-QDQ path. For v6-style
+                    // calibration-directory PTQ, use kInt8CalibLegacy + an ICalibrator (only when
+                    // the library is built against TensorRT < 11).
+                    config.precision = trtcpp::Precision::kInt8Qdq;
                 } else {
                     std::cout << "Error: Unexpected precision value: " << nextArgument << ", options are FP32, FP16, INT8" << std::endl;
                     return false;

diff --git a/src/object_detection_csi_jetson.cpp b/src/object_detection_csi_jetson.cpp
@@ -1,6 +1,10 @@
 #include "cmd_line_util.h"
 #include "yolov8.h"
-#include <opencv2/cudaimgproc.hpp>
+// Under tensorrt_cpp_api v7 the public headers pull in no OpenCV, so the modules this tool needs
+// (VideoCapture from videoio, imshow/waitKey from highgui) must be included explicitly rather than
+// transitively via the old v6 "engine.h".
+#include <opencv2/highgui.hpp>
+#include <opencv2/videoio.hpp>
 
 // Runs object detection on video stream then displays annotated results.
 int main(int argc, char *argv[]) {

diff --git a/src/object_detection_image.cpp b/src/object_detection_image.cpp
@@ -1,5 +1,6 @@
 #include "cmd_line_util.h"
 #include "yolov8.h"
+#include <opencv2/imgcodecs.hpp> // imread/imwrite (was pulled in transitively by the v6 engine.h)
 
 // Runs object detection on an input image then saves the annotated image to disk.
 int main(int argc, char *argv[]) {

diff --git a/src/object_detection_video_stream.cpp b/src/object_detection_video_stream.cpp
@@ -1,6 +1,7 @@
 #include "cmd_line_util.h"
 #include "yolov8.h"
-#include <opencv2/cudaimgproc.hpp>
+#include <opencv2/highgui.hpp> // imshow/waitKey (was pulled in transitively by the v6 engine.h)
+#include <opencv2/videoio.hpp> // VideoCapture / CAP_PROP_*
 
 // Runs object detection on video stream then displays annotated results.
 int main(int argc, char *argv[]) {

diff --git a/src/stopwatch.h b/src/stopwatch.h
@@ -0,0 +1,29 @@
+#pragma once
+// Minimal stopwatch for the ENABLE_BENCHMARKS timing and the benchmark executable. v6 got this
+// from the engine library's util/Stopwatch.h, which v7 does not ship; this is a drop-in local
+// replacement with the same preciseStopwatch interface.
+#include <chrono>
+
+// Measures the time elapsed since the Stopwatch object was constructed, using Clock as the
+// time source. The start point is captured once at construction and is never reset.
+template <typename Clock = std::chrono::steady_clock> class Stopwatch {
+public:
+    // Returns the time elapsed since construction, expressed in units of Duration and converted
+    // to T. The interval is first duration_cast to Duration, so when Duration has an integral
+    // representation (e.g. std::chrono::milliseconds) the fractional part is truncated before
+    // the conversion to T.
+    template <typename T, typename Duration> T elapsedTime() const {
+        const auto elapsed = Clock::now() - start_;
+        return static_cast<T>(std::chrono::duration_cast<Duration>(elapsed).count());
+    }
+
+private:
+    // Captured when the object is constructed.
+    typename Clock::time_point start_ = Clock::now();
+};
+
+// Interval timing must use a monotonic clock. std::chrono::high_resolution_clock is allowed to
+// be an alias of the adjustable system_clock (it is in libstdc++), in which case a wall-clock
+// adjustment during a measurement would skew or even negate the result; steady_clock (the
+// Stopwatch default) never goes backwards.
+using preciseStopwatch = Stopwatch<>;