From 4d2147a582b147cb3ad3e11adac8cc640267fc1b Mon Sep 17 00:00:00 2001
From: cyrusbehr <cyrus.behr@gmail.com>
Date: Fri, 29 May 2026 15:54:59 -0500
Subject: [PATCH 1/7] Migrate to tensorrt_cpp_api v7

Port the inference layer from the v6 templated Engine<T> API to v7 (namespace trtcpp, no-throw
Status/Result, name-keyed tensors, fused GPU preprocessing):

- Engine<float> -> trtcpp::Engine; Options/buildLoadNetwork -> BuildOptions/EngineBuilder::buildAndLoad.
- Preprocessing now uses the v7 fused kernel preproc::letterboxToTensor over a zero-copy
  opencv::viewOf(GpuMat) device view (BGR->RGB, letterbox, /255), replacing the OpenCV
  cvtColor/resize steps and the in-engine normalization.
- runInference + transformOutput -> engine.infer(name-keyed) + per-output toHost(stream); the
  detect/pose/seg post-processing math is unchanged (only dim access and output buffering moved
  to the v7 Shape / flat Tensor API).
- Precision::FP32/FP16/INT8 -> trtcpp::Precision::kFp32/kFp16/kInt8Qdq (INT8 caveat in MIGRATION.md).
- CMake: C++20, namespaced targets + WITH_OPENCV/BUILD_PREPROC; added a local stopwatch.h to
  replace the timing util v7 no longer ships; explicit OpenCV module includes.

Requires the libs/tensorrt-cpp-api submodule at v7.0.0+. Syntax-checked against the real v7 +
OpenCV headers; NOT yet compiled/linked/run (prepared on a host with broken OpenCV-CUDA) -- build
and run before merging. See MIGRATION.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CMakeLists.txt                        |  20 +--
 MIGRATION.md                          |  65 ++++++++++
 src/benchmark.cpp                     |   2 +
 src/cmd_line_util.h                   |  18 ++-
 src/object_detection_image.cpp        |   1 +
 src/object_detection_video_stream.cpp |   2 +
 src/stopwatch.h                       |  17 +++
 src/yolov8.cpp                        | 167 +++++++++++++-------------
 src/yolov8.h                          |  32 ++++-
 9 files changed, 222 insertions(+), 102 deletions(-)
 create mode 100644 MIGRATION.md
 create mode 100644 src/stopwatch.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bab0137..a2a0e3a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,8 +4,8 @@ project(YOLOv8_TensorRT_CPP)
 # Use ccache to speed up rebuilds
 include(cmake/ccache.cmake)
 
-# Set C++ version and optimization level
-set(CMAKE_CXX_STANDARD 17)
+# Set C++ version and optimization level (tensorrt_cpp_api v7 requires C++20)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Ofast -DNDEBUG -Wno-deprecated-declarations")
 
 # CMake Options
@@ -19,14 +19,20 @@ find_package(OpenCV REQUIRED)
 # TODO: Specify the path to TensorRT root dir
 set(TensorRT_DIR /home/cyrus/work/libs/TensorRT-10.0.0.6/)
 
-# Build the TensorRT inference engine library
-# TensorRT is found and linked by the tensorrt-cpp-api
+# Build the tensorrt_cpp_api v7 library (git submodule). Enable its OpenCV interop so we can hand
+# cv::cuda::GpuMat frames to the fused preprocessing kernel zero-copy (trtcpp::opencv::viewOf).
+# NOTE: the libs/tensorrt-cpp-api submodule must be at tensorrt_cpp_api v7.0.0+ (see MIGRATION.md).
+set(TRT_CPP_API_WITH_OPENCV ON CACHE BOOL "" FORCE)
+set(TRT_CPP_API_BUILD_PREPROC ON CACHE BOOL "" FORCE)
 add_subdirectory(libs/tensorrt-cpp-api)
 
-# Build the YoloV8 library
+# Build the YoloV8 library. v7 exposes namespaced targets and propagates its own include dirs
+# (the tensorrt_cpp_api/ root), so the old libs/.../src include is gone.
 add_library(YoloV8_TRT SHARED src/yolov8.cpp)
-target_link_libraries(YoloV8_TRT PUBLIC tensorrt_cpp_api ${OpenCV_LIBS})
-target_include_directories(YoloV8_TRT PUBLIC libs/tensorrt-cpp-api/src)
+target_link_libraries(YoloV8_TRT PUBLIC
+    tensorrt_cpp_api::tensorrt_cpp_api
+    tensorrt_cpp_api::preproc
+    ${OpenCV_LIBS})
 
 # Build and link the executables
 add_executable(detect_object_image src/object_detection_image.cpp)
diff --git a/MIGRATION.md b/MIGRATION.md
new file mode 100644
index 0000000..9972518
--- /dev/null
+++ b/MIGRATION.md
@@ -0,0 +1,65 @@
+# Migration to tensorrt_cpp_api v7
+
+This branch (`v7-migration`) ports YOLOv8-TensorRT-CPP from the v6 `tensorrt-cpp-api` to **v7**,
+which is a clean break (new namespace `trtcpp`, no-throw `Status`/`Result`, name-keyed tensors,
+PImpl headers with no OpenCV/TensorRT leakage). See the library's `docs/upgrading_from_v6.md`.
+
+## ⚠️ Verification status
+
+The code has been **syntax-checked against the real v7 public headers + OpenCV headers**
+(`g++ -std=c++20 -fsyntax-only` over every translation unit — clean). It has **not been compiled,
+linked, or run**, because the machine this migration was prepared on has a broken OpenCV-CUDA
+install (the exact environment fragility v7 is designed to avoid). **Build and run on a host with a
+working OpenCV-CUDA before merging.**
+
+## Required: bump the submodule to v7
+
+`libs/tensorrt-cpp-api` is a git submodule. It must point at **tensorrt_cpp_api v7.0.0+**:
+
+```sh
+cd libs/tensorrt-cpp-api
+git fetch origin
+git checkout <v7.0.0 tag or commit>
+cd ../.. && git add libs/tensorrt-cpp-api
+```
+
+## What changed
+
+**Build (`CMakeLists.txt`)**
+- C++17 → **C++20** (v7 requirement).
+- Enable the library's OpenCV interop and preprocessing before `add_subdirectory`:
+  `TRT_CPP_API_WITH_OPENCV=ON`, `TRT_CPP_API_BUILD_PREPROC=ON`.
+- Link the namespaced v7 targets `tensorrt_cpp_api::tensorrt_cpp_api` + `tensorrt_cpp_api::preproc`
+  (was `tensorrt_cpp_api`); dropped the `libs/.../src` include — v7 propagates its own include root.
+
+**Inference layer (`src/yolov8.{h,cpp}`)**
+- `Engine<float>` → `trtcpp::Engine` (non-templated; runtime `DType`). IO is now name-keyed; the
+  class caches `m_inputName`/`m_outputNames`/`m_inputShape`/`m_outputShapes` and a reusable
+  NCHW-float input `Tensor` plus a `Stream`.
+- `Options` + `buildLoadNetwork(onnx, SUB, DIV, NORMALIZE)` → `BuildOptions` +
+  `EngineBuilder::buildAndLoad(onnx, opts)`.
+- Preprocessing: the v6 OpenCV `cvtColor` + `resizeKeepAspectRatioPadRightBottom` + in-engine
+  HWC→NCHW/normalize is replaced by **one fused kernel**, `preproc::letterboxToTensor`, fed a
+  zero-copy `trtcpp::opencv::viewOf(gpuImg)` device view (BGR→RGB via `swapRB`, letterbox pad
+  right/bottom, `scale = 1/255`). Box-mapping ratio is unchanged.
+- `runInference(GpuMat, nested-vectors)` + `Engine<float>::transformOutput` → `engine.infer(...)`
+  returning name-keyed owning `Tensor`s; each output is read back with `toHost(stream)` (explicit
+  D2H + sync) into a flat `std::vector<float>`. The detect/pose/seg **post-processing math is
+  unchanged** — only how it obtains dims (`getOutputDims().d[i]` → cached `Shape[i]`) and the flat
+  output buffers.
+- Errors: v6 `bool`/exception checks → unwrap `Result`/`Status` (throwing a `std::runtime_error`
+  with `.status().message()` to preserve this app's exception-based control flow).
+
+**Precision (`src/cmd_line_util.h`)**
+- `Precision::FP32/FP16/INT8` → `trtcpp::Precision::kFp32/kFp16/kInt8Qdq`.
+- **INT8 caveat:** `kInt8Qdq` expects an explicit Q/DQ ONNX (no calibration data). The v6 flow
+  (a calibration-image directory) maps to `kInt8CalibLegacy`, which is only available when the
+  library is built against **TensorRT < 11** and requires constructing an `ICalibrator`
+  (`tensorrt_cpp_api/calibrator.h`) and setting `BuildOptions.calibrator`. That wiring is **not**
+  ported here — quantize to a QDQ ONNX, or restore a calibrator if you need legacy PTQ.
+
+**Misc**
+- Added `src/stopwatch.h` (a small `std::chrono` `preciseStopwatch`) to replace the timing utility
+  v6 shipped in the engine library and v7 does not.
+- The OpenCV modules the v6 `engine.h` pulled in transitively (`imgcodecs`, `videoio`, `highgui`)
+  are now included explicitly where used.
diff --git a/src/benchmark.cpp b/src/benchmark.cpp
index 284fc24..e9cfe5e 100644
--- a/src/benchmark.cpp
+++ b/src/benchmark.cpp
@@ -1,6 +1,8 @@
 #include "cmd_line_util.h"
+#include "stopwatch.h"
 #include "yolov8.h"
 #include <opencv2/cudaimgproc.hpp>
+#include <opencv2/imgcodecs.hpp> // imread (was pulled in transitively by the v6 engine.h)
 
 // Benchmarks the specified model
 int main(int argc, char *argv[]) {
diff --git a/src/cmd_line_util.h b/src/cmd_line_util.h
index 808a8ad..576bc82 100644
--- a/src/cmd_line_util.h
+++ b/src/cmd_line_util.h
@@ -178,11 +178,14 @@ inline bool parseArguments(int argc, char *argv[], YoloV8Config &config, std::st
                     return false;
 
                 if (nextArgument == "FP32") {
-                    config.precision = Precision::FP32;
+                    config.precision = trtcpp::Precision::kFp32;
                 } else if (nextArgument == "FP16") {
-                    config.precision = Precision::FP16;
+                    config.precision = trtcpp::Precision::kFp16;
                 } else if (nextArgument == "INT8") {
-                    config.precision = Precision::INT8;
+                    // v7: kInt8Qdq is the forward-compatible explicit-QDQ path. For v6-style
+                    // calibration-directory PTQ, use kInt8CalibLegacy + an ICalibrator (only when
+                    // the library is built against TensorRT < 11).
+                    config.precision = trtcpp::Precision::kInt8Qdq;
                 } else {
                     std::cout << "Error: Unexpected precision value: " << nextArgument << ", options are FP32, FP16, INT8" << std::endl;
                     return false;
@@ -354,11 +357,14 @@ inline bool parseArgumentsVideo(int argc, char *argv[], YoloV8Config &config, st
                     return false;
 
                 if (nextArgument == "FP32") {
-                    config.precision = Precision::FP32;
+                    config.precision = trtcpp::Precision::kFp32;
                 } else if (nextArgument == "FP16") {
-                    config.precision = Precision::FP16;
+                    config.precision = trtcpp::Precision::kFp16;
                 } else if (nextArgument == "INT8") {
-                    config.precision = Precision::INT8;
+                    // v7: kInt8Qdq is the forward-compatible explicit-QDQ path. For v6-style
+                    // calibration-directory PTQ, use kInt8CalibLegacy + an ICalibrator (only when
+                    // the library is built against TensorRT < 11).
+                    config.precision = trtcpp::Precision::kInt8Qdq;
                 } else {
                     std::cout << "Error: Unexpected precision value: " << nextArgument << ", options are FP32, FP16, INT8" << std::endl;
                     return false;
diff --git a/src/object_detection_image.cpp b/src/object_detection_image.cpp
index f95caef..d72fc8e 100644
--- a/src/object_detection_image.cpp
+++ b/src/object_detection_image.cpp
@@ -1,5 +1,6 @@
 #include "cmd_line_util.h"
 #include "yolov8.h"
+#include <opencv2/imgcodecs.hpp> // imread/imwrite (was pulled in transitively by the v6 engine.h)
 
 // Runs object detection on an input image then saves the annotated image to disk.
 int main(int argc, char *argv[]) {
diff --git a/src/object_detection_video_stream.cpp b/src/object_detection_video_stream.cpp
index 56f8123..beda4b5 100644
--- a/src/object_detection_video_stream.cpp
+++ b/src/object_detection_video_stream.cpp
@@ -1,6 +1,8 @@
 #include "cmd_line_util.h"
 #include "yolov8.h"
 #include <opencv2/cudaimgproc.hpp>
+#include <opencv2/highgui.hpp> // imshow/waitKey (was pulled in transitively by the v6 engine.h)
+#include <opencv2/videoio.hpp> // VideoCapture / CAP_PROP_*
 
 // Runs object detection on video stream then displays annotated results.
 int main(int argc, char *argv[]) {
diff --git a/src/stopwatch.h b/src/stopwatch.h
new file mode 100644
index 0000000..04af649
--- /dev/null
+++ b/src/stopwatch.h
@@ -0,0 +1,17 @@
+#pragma once
+// Minimal stopwatch for the ENABLE_BENCHMARKS timing and the benchmark executable. v6 got this
+// from the engine library's util/Stopwatch.h, which v7 does not ship; this is a drop-in local
+// replacement with the same preciseStopwatch interface.
+#include <chrono>
+
+template <typename Clock = std::chrono::steady_clock> class Stopwatch {
+public:
+    template <typename T, typename Duration> T elapsedTime() const {
+        return static_cast<T>(std::chrono::duration_cast<Duration>(Clock::now() - start_).count());
+    }
+
+private:
+    typename Clock::time_point start_ = Clock::now();
+};
+
+using preciseStopwatch = Stopwatch<std::chrono::high_resolution_clock>;
diff --git a/src/yolov8.cpp b/src/yolov8.cpp
index 743290e..f3a2856 100644
--- a/src/yolov8.cpp
+++ b/src/yolov8.cpp
@@ -1,66 +1,67 @@
 #include "yolov8.h"
+#include "stopwatch.h"
 #include <opencv2/cudaimgproc.hpp>
 
 YoloV8::YoloV8(const std::string &onnxModelPath, const YoloV8Config &config)
     : PROBABILITY_THRESHOLD(config.probabilityThreshold), NMS_THRESHOLD(config.nmsThreshold), TOP_K(config.topK),
       SEG_CHANNELS(config.segChannels), SEG_H(config.segH), SEG_W(config.segW), SEGMENTATION_THRESHOLD(config.segmentationThreshold),
       CLASS_NAMES(config.classNames), NUM_KPS(config.numKPS), KPS_THRESHOLD(config.kpsThreshold) {
-    // Specify options for GPU inference
-    Options options;
-    options.optBatchSize = 1;
-    options.maxBatchSize = 1;
-
+    // Specify build options for the v7 engine builder. (Batch knobs are now expressed as
+    // optimization profiles; this detector uses the model's static 1x3xHxW input.)
+    trtcpp::BuildOptions options;
     options.precision = config.precision;
-    options.calibrationDataDirectoryPath = config.calibrationDataDirectory;
+    options.engineCacheDir = "."; // build-or-load caches next to the working dir; v7 detects staleness
 
-    if (options.precision == Precision::INT8) {
-        if (options.calibrationDataDirectoryPath.empty()) {
-            throw std::runtime_error("Error: Must supply calibration data path for INT8 calibration");
-        }
+    // v7 INT8: prefer an explicit-QDQ ONNX with Precision::kInt8Qdq (no calibration data). Legacy
+    // calibrator PTQ (kInt8CalibLegacy) is only available when the library is built against
+    // TensorRT < 11 and is wired via BuildOptions.calibrator (see tensorrt_cpp_api/calibrator.h).
+    if (options.precision == trtcpp::Precision::kInt8CalibLegacy && config.calibrationDataDirectory.empty()) {
+        throw std::runtime_error("Error: Must supply calibration data path for legacy INT8 calibration");
     }
 
-    // Create our TensorRT inference engine
-    m_trtEngine = std::make_unique<Engine<float>>(options);
-
-    // Build the onnx model into a TensorRT engine file, cache the file to disk, and then load the TensorRT engine file into memory.
-    // If the engine file already exists on disk, this function will not rebuild but only load into memory.
-    // The engine file is rebuilt any time the above Options are changed.
-    auto succ = m_trtEngine->buildLoadNetwork(onnxModelPath, SUB_VALS, DIV_VALS, NORMALIZE);
-    if (!succ) {
-        const std::string errMsg = "Error: Unable to build or load the TensorRT engine. "
-                                   "Try increasing TensorRT log severity to kVERBOSE (in /libs/tensorrt-cpp-api/engine.cpp).";
-        throw std::runtime_error(errMsg);
+    // Build the ONNX into a TensorRT engine (or load a fresh cached one) and deserialize it.
+    auto engine = trtcpp::EngineBuilder{}.buildAndLoad(onnxModelPath, options);
+    if (!engine) {
+        throw std::runtime_error("Error: Unable to build or load the TensorRT engine: " + engine.status().message());
     }
-}
-
-std::vector<std::vector<cv::cuda::GpuMat>> YoloV8::preprocess(const cv::cuda::GpuMat &gpuImg) {
-    // Populate the input vectors
-    const auto &inputDims = m_trtEngine->getInputDims();
-
-    // Convert the image from BGR to RGB
-    cv::cuda::GpuMat rgbMat;
-    cv::cuda::cvtColor(gpuImg, rgbMat, cv::COLOR_BGR2RGB);
-
-    auto resized = rgbMat;
-
-    // Resize to the model expected input size while maintaining the aspect ratio with the use of padding
-    if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) {
-        // Only resize if not already the right size to avoid unecessary copy
-        resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(rgbMat, inputDims[0].d[1], inputDims[0].d[2]);
+    m_engine = std::make_unique<trtcpp::Engine>(std::move(engine).value());
+
+    // Cache IO metadata once (v7 is name-keyed and non-templated).
+    m_inputName = m_engine->inputNames().front();
+    m_outputNames = m_engine->outputNames();
+    m_inputShape = m_engine->tensorShape(m_inputName).value(); // [1,3,H,W]
+    for (const auto &name : m_outputNames) {
+        m_outputShapes.push_back(m_engine->tensorShape(name).value());
     }
 
-    // Convert to format expected by our inference engine
-    // The reason for the strange format is because it supports models with multiple inputs as well as batching
-    // In our case though, the model only has a single input and we are using a batch size of 1.
-    std::vector<cv::cuda::GpuMat> input{std::move(resized)};
-    std::vector<std::vector<cv::cuda::GpuMat>> inputs{std::move(input)};
-
-    // These params will be used in the post-processing stage
-    m_imgHeight = rgbMat.rows;
-    m_imgWidth = rgbMat.cols;
-    m_ratio = 1.f / std::min(inputDims[0].d[2] / static_cast<float>(rgbMat.cols), inputDims[0].d[1] / static_cast<float>(rgbMat.rows));
+    // Pre-allocate the NCHW float input tensor (static shape; reused across frames).
+    m_input = trtcpp::Tensor::allocate(trtcpp::DType::kFloat32, m_inputShape, trtcpp::Device::kCuda).value();
+}
 
-    return inputs;
+void YoloV8::preprocess(const cv::cuda::GpuMat &gpuImg) {
+    // Record original dims + the letterbox ratio used by post-processing to map boxes back to the
+    // source image. inputShape is [1, 3, H, W].
+    m_imgHeight = static_cast<float>(gpuImg.rows);
+    m_imgWidth = static_cast<float>(gpuImg.cols);
+    const int inH = static_cast<int>(m_inputShape[2]);
+    const int inW = static_cast<int>(m_inputShape[3]);
+    m_ratio = 1.f / std::min(inW / m_imgWidth, inH / m_imgHeight);
+
+    // One fused GPU kernel replaces the v6 cvtColor + resizeKeepAspectRatioPadRightBottom and the
+    // in-engine HWC->NCHW + normalize: BGR->RGB, letterbox-resize (pad right/bottom), scale by
+    // 1/255 (SUB_VALS=0, DIV_VALS=1, NORMALIZE), and write the NCHW float input tensor in place.
+    trtcpp::preproc::PreprocSpec spec;
+    spec.swapRB = true;             // OpenCV GpuMat is BGR; the model expects RGB
+    spec.keepAspectRatioPad = true; // letterbox, pad right/bottom (matches v6)
+    spec.scale = {1.f / 255.f, 1.f / 255.f, 1.f / 255.f, 1.f};
+
+    auto src = trtcpp::opencv::viewOf(gpuImg); // zero-copy HWC-uint8 device view (continuous GpuMat)
+    if (!src) {
+        throw std::runtime_error("Error: could not view the input GpuMat (clone() a padded mat first): " + src.status().message());
+    }
+    if (auto s = trtcpp::preproc::letterboxToTensor(src.value(), m_input.view(), spec, m_stream); !s) {
+        throw std::runtime_error("Error: preprocessing failed: " + s.message());
+    }
 }
 
 std::vector<Object> YoloV8::detectObjects(const cv::cuda::GpuMat &inputImageBGR) {
@@ -69,7 +70,7 @@ std::vector<Object> YoloV8::detectObjects(const cv::cuda::GpuMat &inputImageBGR)
     static int numIts = 1;
     preciseStopwatch s1;
 #endif
-    const auto input = preprocess(inputImageBGR);
+    preprocess(inputImageBGR); // fills m_input
 #ifdef ENABLE_BENCHMARKS
     static long long t1 = 0;
     t1 += s1.elapsedTime<long long, std::chrono::microseconds>();
@@ -79,10 +80,21 @@ std::vector<Object> YoloV8::detectObjects(const cv::cuda::GpuMat &inputImageBGR)
 #ifdef ENABLE_BENCHMARKS
     preciseStopwatch s2;
 #endif
-    std::vector<std::vector<std::vector<float>>> featureVectors;
-    auto succ = m_trtEngine->runInference(input, featureVectors);
-    if (!succ) {
-        throw std::runtime_error("Error: Unable to run inference.");
+    auto outputs = m_engine->infer({{m_inputName, m_input.view()}}, m_stream);
+    if (!outputs) {
+        throw std::runtime_error("Error: Unable to run inference: " + outputs.status().message());
+    }
+    // Read each output back to a flat host float vector, in output-binding order. (v7 returns
+    // name-keyed owning Tensors; toHost performs the D2H copy AND synchronizes the stream.)
+    std::vector<std::vector<float>> featureVectors;
+    featureVectors.reserve(m_outputNames.size());
+    for (const auto &name : m_outputNames) {
+        auto host = outputs->at(name).toHost(m_stream);
+        if (!host) {
+            throw std::runtime_error("Error: output readback failed: " + host.status().message());
+        }
+        const auto span = host->as<float>().value();
+        featureVectors.emplace_back(span.begin(), span.end());
     }
 #ifdef ENABLE_BENCHMARKS
     static long long t2 = 0;
@@ -91,31 +103,23 @@ std::vector<Object> YoloV8::detectObjects(const cv::cuda::GpuMat &inputImageBGR)
     preciseStopwatch s3;
 #endif
     // Check if our model does only object detection or also supports segmentation
+    // v7 already gives one flat host vector per output (batch size 1), so the v6 transformOutput
+    // 3D->1D/2D flattening is no longer needed.
     std::vector<Object> ret;
-    const auto &numOutputs = m_trtEngine->getOutputDims().size();
-    if (numOutputs == 1) {
-        // Object detection or pose estimation
-        // Since we have a batch size of 1 and only 1 output, we must convert the output from a 3D array to a 1D array.
-        std::vector<float> featureVector;
-        Engine<float>::transformOutput(featureVectors, featureVector);
-
-        const auto &outputDims = m_trtEngine->getOutputDims();
-        int numChannels = outputDims[outputDims.size() - 1].d[1];
-        // TODO: Need to improve this to make it more generic (don't use magic number).
-        // For now it works with Ultralytics pretrained models.
+    if (m_outputShapes.size() == 1) {
+        // Object detection or pose estimation. Output shape is [1, C, anchors]; pose models have
+        // C == 56 (4 box + 1 score + 17*3 keypoints).
+        // TODO: improve this to be more generic (don't use the magic number); works with
+        // Ultralytics pretrained models.
+        const int numChannels = static_cast<int>(m_outputShapes[0][1]);
         if (numChannels == 56) {
-            // Pose estimation
-            ret = postprocessPose(featureVector);
+            ret = postprocessPose(featureVectors[0]);
         } else {
-            // Object detection
-            ret = postprocessDetect(featureVector);
+            ret = postprocessDetect(featureVectors[0]);
         }
     } else {
-        // Segmentation
-        // Since we have a batch size of 1 and 2 outputs, we must convert the output from a 3D array to a 2D array.
-        std::vector<std::vector<float>> featureVector;
-        Engine<float>::transformOutput(featureVectors, featureVector);
-        ret = postProcessSegmentation(featureVector);
+        // Instance segmentation (detections + mask prototypes).
+        ret = postProcessSegmentation(featureVectors);
     }
 #ifdef ENABLE_BENCHMARKS
     static long long t3 = 0;
@@ -135,10 +139,8 @@ std::vector<Object> YoloV8::detectObjects(const cv::Mat &inputImageBGR) {
 }
 
 std::vector<Object> YoloV8::postProcessSegmentation(std::vector<std::vector<float>> &featureVectors) {
-    const auto &outputDims = m_trtEngine->getOutputDims();
-
-    int numChannels = outputDims[0].d[1];
-    int numAnchors = outputDims[0].d[2];
+    int numChannels = static_cast<int>(m_outputShapes[0][1]);
+    int numAnchors = static_cast<int>(m_outputShapes[0][2]);
 
     const auto numClasses = numChannels - SEG_CHANNELS - 4;
 
@@ -225,7 +227,6 @@ std::vector<Object> YoloV8::postProcessSegmentation(std::vector<std::vector<floa
 
         std::vector<cv::Mat> maskChannels;
         cv::split(maskMat, maskChannels);
-        const auto inputDims = m_trtEngine->getInputDims();
 
         cv::Rect roi;
         if (m_imgHeight > m_imgWidth) {
@@ -248,9 +249,8 @@ std::vector<Object> YoloV8::postProcessSegmentation(std::vector<std::vector<floa
 }
 
 std::vector<Object> YoloV8::postprocessPose(std::vector<float> &featureVector) {
-    const auto &outputDims = m_trtEngine->getOutputDims();
-    auto numChannels = outputDims[0].d[1];
-    auto numAnchors = outputDims[0].d[2];
+    const auto numChannels = static_cast<int>(m_outputShapes[0][1]);
+    const auto numAnchors = static_cast<int>(m_outputShapes[0][2]);
 
     std::vector<cv::Rect> bboxes;
     std::vector<float> scores;
@@ -330,9 +330,8 @@ std::vector<Object> YoloV8::postprocessPose(std::vector<float> &featureVector) {
 }
 
 std::vector<Object> YoloV8::postprocessDetect(std::vector<float> &featureVector) {
-    const auto &outputDims = m_trtEngine->getOutputDims();
-    auto numChannels = outputDims[0].d[1];
-    auto numAnchors = outputDims[0].d[2];
+    const auto numChannels = static_cast<int>(m_outputShapes[0][1]);
+    const auto numAnchors = static_cast<int>(m_outputShapes[0][2]);
 
     auto numClasses = CLASS_NAMES.size();
 
diff --git a/src/yolov8.h b/src/yolov8.h
index 5bc09f2..140c0d4 100644
--- a/src/yolov8.h
+++ b/src/yolov8.h
@@ -1,6 +1,20 @@
 #pragma once
-#include "engine.h"
+// Migrated to tensorrt_cpp_api v7. The v7 public headers deliberately pull in no OpenCV, so the
+// OpenCV headers this class needs (cv::Mat / GpuMat / dnn::NMSBoxesBatched) are now included
+// explicitly here rather than transitively via the old "engine.h".
+#include <tensorrt_cpp_api/all.h>
+#include <tensorrt_cpp_api/opencv_interop.h>
+#include <tensorrt_cpp_api/preproc.h>
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/cuda.hpp>
+#include <opencv2/dnn.hpp>
+#include <opencv2/imgproc.hpp>
+
 #include <fstream>
+#include <memory>
+#include <string>
+#include <vector>
 
 // Utility method for checking if a file exists on disk
 inline bool doesFileExist(const std::string &name) {
@@ -25,7 +39,7 @@ struct Object {
 // Can pass these arguments as command line parameters.
 struct YoloV8Config {
     // The precision to be used for inference
-    Precision precision = Precision::FP16;
+    trtcpp::Precision precision = trtcpp::Precision::kFp16;
     // Calibration data directory. Must be specified when using INT8 precision.
     std::string calibrationDataDirectory;
     // Probability threshold used to filter detected objects
@@ -71,8 +85,8 @@ class YoloV8 {
     void drawObjectLabels(cv::Mat &image, const std::vector<Object> &objects, unsigned int scale = 2);
 
 private:
-    // Preprocess the input
-    std::vector<std::vector<cv::cuda::GpuMat>> preprocess(const cv::cuda::GpuMat &gpuImg);
+    // Preprocess the input (fills m_input + m_ratio/m_imgWidth/m_imgHeight)
+    void preprocess(const cv::cuda::GpuMat &gpuImg);
 
     // Postprocess the output
     std::vector<Object> postprocessDetect(std::vector<float> &featureVector);
@@ -83,7 +97,15 @@ class YoloV8 {
     // Postprocess the output for segmentation model
     std::vector<Object> postProcessSegmentation(std::vector<std::vector<float>> &featureVectors);
 
-    std::unique_ptr<Engine<float>> m_trtEngine = nullptr;
+    // v7 engine + cached IO metadata (v7 is name-keyed and non-templated). The owning input
+    // Tensor is reused across frames; the caller-owned stream drives async work.
+    std::unique_ptr<trtcpp::Engine> m_engine = nullptr;
+    trtcpp::Stream m_stream;
+    std::string m_inputName;
+    std::vector<std::string> m_outputNames;
+    trtcpp::Shape m_inputShape;                // [1,3,H,W]
+    std::vector<trtcpp::Shape> m_outputShapes; // build-time shapes, in output-binding order
+    trtcpp::Tensor m_input;                    // NCHW float device tensor fed to the engine
 
     // Used for image preprocessing
     // YoloV8 model expects values between [0.f, 1.f] so we use the following params

From 9dbb6a1cb027f70e4f086af0342494c59d1ed97c Mon Sep 17 00:00:00 2001
From: cyrusbehr <cyrus.behr@gmail.com>
Date: Fri, 29 May 2026 16:00:48 -0500
Subject: [PATCH 2/7] Include <iostream> for the benchmark timing output

yolov8.cpp uses std::cout under ENABLE_BENCHMARKS. v6 got <iostream> transitively via engine.h;
the v7 headers do not include it, so include it explicitly. (Default builds are unaffected.)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/yolov8.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/yolov8.cpp b/src/yolov8.cpp
index f3a2856..1d6de71 100644
--- a/src/yolov8.cpp
+++ b/src/yolov8.cpp
@@ -1,5 +1,6 @@
 #include "yolov8.h"
 #include "stopwatch.h"
+#include <iostream> // std::cout in the ENABLE_BENCHMARKS timing (was transitive via the v6 engine.h)
 #include <opencv2/cudaimgproc.hpp>
 
 YoloV8::YoloV8(const std::string &onnxModelPath, const YoloV8Config &config)

From a69b62953785ac705b818a5d80c7d5c0a003663a Mon Sep 17 00:00:00 2001
From: cyrusbehr <cyrus.behr@gmail.com>
Date: Fri, 29 May 2026 16:37:06 -0500
Subject: [PATCH 3/7] Bump tensorrt-cpp-api submodule to v7.0.0-rc1

Point libs/tensorrt-cpp-api at the v7 release candidate so the v7-migrated YoloV8 code
(namespaced targets, fused preproc, name-keyed IO) builds. See MIGRATION.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 libs/tensorrt-cpp-api | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/tensorrt-cpp-api b/libs/tensorrt-cpp-api
index f93f973..1aaf789 160000
--- a/libs/tensorrt-cpp-api
+++ b/libs/tensorrt-cpp-api
@@ -1 +1 @@
-Subproject commit f93f973cd03f1caa710c60d60c0d6feb9ed79e4a
+Subproject commit 1aaf7896b7dd7e70fda870aad6a2bd9e5859e2ba

From e6d52dc35541433ce86708d444e0f214a426bdcc Mon Sep 17 00:00:00 2001
From: cyrusbehr <cyrus.behr@gmail.com>
Date: Fri, 29 May 2026 16:37:31 -0500
Subject: [PATCH 4/7] Throw instead of asserting on failed v7 Result unwraps

From the code-review pass: replace the unchecked .value() calls (tensorShape, Tensor::allocate,
TensorView::as<float>) with a must() helper that throws std::runtime_error on error.
Calling .value() on an error Result asserts in debug builds and is undefined behavior in this
app's -DNDEBUG release builds (e.g. on a dynamic/oversized input shape, CUDA OOM, or a non-float
engine output).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/yolov8.cpp | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/yolov8.cpp b/src/yolov8.cpp
index 1d6de71..c8ce80b 100644
--- a/src/yolov8.cpp
+++ b/src/yolov8.cpp
@@ -2,6 +2,19 @@
 #include "stopwatch.h"
 #include <iostream> // std::cout in the ENABLE_BENCHMARKS timing (was transitive via the v6 engine.h)
 #include <opencv2/cudaimgproc.hpp>
+#include <stdexcept>
+
+namespace {
+// Unwrap a v7 Result, throwing on error (this app uses exceptions). Calling .value() directly
+// would assert in debug builds and be undefined behavior in -DNDEBUG release builds when the
+// Result holds an error (e.g. a dynamic/oversized shape, OOM, or a non-float output dtype).
+template <class T> T must(trtcpp::Result<T> r, const char *what) {
+    if (!r) {
+        throw std::runtime_error(std::string("Error: ") + what + ": " + r.status().message());
+    }
+    return std::move(r).value();
+}
+} // namespace
 
 YoloV8::YoloV8(const std::string &onnxModelPath, const YoloV8Config &config)
     : PROBABILITY_THRESHOLD(config.probabilityThreshold), NMS_THRESHOLD(config.nmsThreshold), TOP_K(config.topK),
@@ -30,13 +43,14 @@ YoloV8::YoloV8(const std::string &onnxModelPath, const YoloV8Config &config)
     // Cache IO metadata once (v7 is name-keyed and non-templated).
     m_inputName = m_engine->inputNames().front();
     m_outputNames = m_engine->outputNames();
-    m_inputShape = m_engine->tensorShape(m_inputName).value(); // [1,3,H,W]
+    m_inputShape = must(m_engine->tensorShape(m_inputName), "query input shape"); // [1,3,H,W]
     for (const auto &name : m_outputNames) {
-        m_outputShapes.push_back(m_engine->tensorShape(name).value());
+        m_outputShapes.push_back(must(m_engine->tensorShape(name), "query output shape"));
     }
 
-    // Pre-allocate the NCHW float input tensor (static shape; reused across frames).
-    m_input = trtcpp::Tensor::allocate(trtcpp::DType::kFloat32, m_inputShape, trtcpp::Device::kCuda).value();
+    // Pre-allocate the NCHW float input tensor. allocate() errors (and we throw) on a dynamic
+    // input shape or a CUDA OOM rather than crashing on an unchecked .value().
+    m_input = must(trtcpp::Tensor::allocate(trtcpp::DType::kFloat32, m_inputShape, trtcpp::Device::kCuda), "allocate input tensor");
 }
 
 void YoloV8::preprocess(const cv::cuda::GpuMat &gpuImg) {
@@ -94,7 +108,7 @@ std::vector<Object> YoloV8::detectObjects(const cv::cuda::GpuMat &inputImageBGR)
         if (!host) {
             throw std::runtime_error("Error: output readback failed: " + host.status().message());
         }
-        const auto span = host->as<float>().value();
+        const auto span = must(host->as<float>(), "output tensor is not float32 (rebuild the engine with a float output)");
         featureVectors.emplace_back(span.begin(), span.end());
     }
 #ifdef ENABLE_BENCHMARKS

From 19d8d8cd8b67c99179072e50d2af9bb20fd68346 Mon Sep 17 00:00:00 2001
From: cyrusbehr <cyrus.behr@gmail.com>
Date: Sat, 30 May 2026 07:55:44 -0500
Subject: [PATCH 5/7] Verify build/run; drop dead cudaimgproc include; handle
 pitched GpuMat

Built and ran the migration on an RTX 3080 Laptop GPU against OpenCV-CUDA + tensorrt_cpp_api v7:
detect_object_image on team.jpg (FP16 YOLOv8n) detects 9 objects.

Fixes found by that real build/run:
- The preprocess step fed the cv::cuda::GpuMat directly to opencv::viewOf, but a GpuMat's rows are
  pitched (padded) while a TensorView is contiguous, so viewOf rejected it at runtime. Copy into a
  cv::cuda::createContinuous buffer when the upload isn't already continuous.
- Removed the now-dead <opencv2/cudaimgproc.hpp> includes (the migration replaced cv::cuda::cvtColor
  with the fused preproc kernel; only cv::cuda::GpuMat from core/cuda is still used).

Bump the libs/tensorrt-cpp-api submodule to the v7 release that builds the static core as PIC
(required to link libYoloV8_TRT.so). MIGRATION.md updated to reflect the verified build.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 MIGRATION.md                          | 20 ++++++++------------
 libs/tensorrt-cpp-api                 |  2 +-
 src/benchmark.cpp                     |  1 -
 src/object_detection_video_stream.cpp |  1 -
 src/yolov8.cpp                        | 12 +++++++++---
 5 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/MIGRATION.md b/MIGRATION.md
index 9972518..c5095d3 100644
--- a/MIGRATION.md
+++ b/MIGRATION.md
@@ -4,23 +4,19 @@ This branch (`v7-migration`) ports YOLOv8-TensorRT-CPP from the v6 `tensorrt-cpp
 which is a clean break (new namespace `trtcpp`, no-throw `Status`/`Result`, name-keyed tensors,
 PImpl headers with no OpenCV/TensorRT leakage). See the library's `docs/upgrading_from_v6.md`.
 
-## ⚠️ Verification status
+## Verification status
 
-The code has been **syntax-checked against the real v7 public headers + OpenCV headers**
-(`g++ -std=c++20 -fsyntax-only` over every translation unit — clean). It has **not been compiled,
-linked, or run**, because the machine this migration was prepared on has a broken OpenCV-CUDA
-install (the exact environment fragility v7 is designed to avoid). **Build and run on a host with a
-working OpenCV-CUDA before merging.**
+Built and run on an RTX 3080 Laptop GPU against a CUDA-12.6 OpenCV-CUDA build and the v7 library:
+`detect_object_image` on `images/team.jpg` (FP16 YOLOv8n) detects 9 objects and writes the
+annotated image. The library, `opencv_interop`, the preproc sublib, and the YoloV8 code all compile
+and link.
 
-## Required: bump the submodule to v7
+## Required after pulling: init the submodule
 
-`libs/tensorrt-cpp-api` is a git submodule. It must point at **tensorrt_cpp_api v7.0.0+**:
+`libs/tensorrt-cpp-api` is a git submodule pinned to a v7 release; populate it before building:
 
 ```sh
-cd libs/tensorrt-cpp-api
-git fetch origin
-git checkout <v7.0.0 tag or commit>
-cd ../.. && git add libs/tensorrt-cpp-api
+git submodule update --init
 ```
 
 ## What changed
diff --git a/libs/tensorrt-cpp-api b/libs/tensorrt-cpp-api
index 1aaf789..6a6d4dd 160000
--- a/libs/tensorrt-cpp-api
+++ b/libs/tensorrt-cpp-api
@@ -1 +1 @@
-Subproject commit 1aaf7896b7dd7e70fda870aad6a2bd9e5859e2ba
+Subproject commit 6a6d4dd188f1f07a9f179c01b9df185d0e063d74
diff --git a/src/benchmark.cpp b/src/benchmark.cpp
index e9cfe5e..bb97e76 100644
--- a/src/benchmark.cpp
+++ b/src/benchmark.cpp
@@ -1,7 +1,6 @@
 #include "cmd_line_util.h"
 #include "stopwatch.h"
 #include "yolov8.h"
-#include <opencv2/cudaimgproc.hpp>
 #include <opencv2/imgcodecs.hpp> // imread (was pulled in transitively by the v6 engine.h)
 
 // Benchmarks the specified model
diff --git a/src/object_detection_video_stream.cpp b/src/object_detection_video_stream.cpp
index beda4b5..9fd94c9 100644
--- a/src/object_detection_video_stream.cpp
+++ b/src/object_detection_video_stream.cpp
@@ -1,6 +1,5 @@
 #include "cmd_line_util.h"
 #include "yolov8.h"
-#include <opencv2/cudaimgproc.hpp>
 #include <opencv2/highgui.hpp> // imshow/waitKey (was pulled in transitively by the v6 engine.h)
 #include <opencv2/videoio.hpp> // VideoCapture / CAP_PROP_*
 
diff --git a/src/yolov8.cpp b/src/yolov8.cpp
index c8ce80b..fddc139 100644
--- a/src/yolov8.cpp
+++ b/src/yolov8.cpp
@@ -1,7 +1,6 @@
 #include "yolov8.h"
 #include "stopwatch.h"
 #include <iostream> // std::cout in the ENABLE_BENCHMARKS timing (was transitive via the v6 engine.h)
-#include <opencv2/cudaimgproc.hpp>
 #include <stdexcept>
 
 namespace {
@@ -70,9 +69,16 @@ void YoloV8::preprocess(const cv::cuda::GpuMat &gpuImg) {
     spec.keepAspectRatioPad = true; // letterbox, pad right/bottom (matches v6)
     spec.scale = {1.f / 255.f, 1.f / 255.f, 1.f / 255.f, 1.f};
 
-    auto src = trtcpp::opencv::viewOf(gpuImg); // zero-copy HWC-uint8 device view (continuous GpuMat)
+    // cv::cuda::GpuMat rows are typically pitched (padded for alignment) and a TensorView is
+    // contiguous, so make a continuous copy when the upload isn't already continuous.
+    cv::cuda::GpuMat continuous = gpuImg;
+    if (!gpuImg.isContinuous()) {
+        cv::cuda::createContinuous(gpuImg.rows, gpuImg.cols, gpuImg.type(), continuous);
+        gpuImg.copyTo(continuous);
+    }
+    auto src = trtcpp::opencv::viewOf(continuous); // zero-copy HWC-uint8 device view
     if (!src) {
-        throw std::runtime_error("Error: could not view the input GpuMat (clone() a padded mat first): " + src.status().message());
+        throw std::runtime_error("Error: could not view the input GpuMat: " + src.status().message());
     }
     if (auto s = trtcpp::preproc::letterboxToTensor(src.value(), m_input.view(), spec, m_stream); !s) {
         throw std::runtime_error("Error: preprocessing failed: " + s.message());

From 20ed9228dbb9f3f4891fcb66c08346c05202ea9b Mon Sep 17 00:00:00 2001
From: cyrusbehr <cyrus.behr@gmail.com>
Date: Sat, 30 May 2026 12:31:52 -0500
Subject: [PATCH 6/7] Fix CSI-Jetson tool's OpenCV includes for the v7
 migration

The object_detection_csi_jetson.cpp tool added on main relied on
<opencv2/highgui.hpp> and <opencv2/videoio.hpp> being pulled in transitively
via the v6 "engine.h". Under tensorrt_cpp_api v7 the public headers
deliberately include no OpenCV, so cv::VideoCapture / cv::imshow / cv::waitKey
no longer resolved and the target failed to compile. Include highgui and
videoio explicitly (matching how the migration fixed the other tools) and drop
the unused <opencv2/cudaimgproc.hpp> include (this tool does no cv::cuda
imgproc; it uploads a cv::Mat inside detectObjects).

Verified with -fsyntax-only against the full OpenCV-CUDA headers: clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/object_detection_csi_jetson.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/object_detection_csi_jetson.cpp b/src/object_detection_csi_jetson.cpp
index 63682a3..5f94c27 100644
--- a/src/object_detection_csi_jetson.cpp
+++ b/src/object_detection_csi_jetson.cpp
@@ -1,6 +1,10 @@
 #include "cmd_line_util.h"
 #include "yolov8.h"
-#include <opencv2/cudaimgproc.hpp>
+// Under tensorrt_cpp_api v7 the public headers pull in no OpenCV, so the modules this tool needs
+// (VideoCapture from videoio, imshow/waitKey from highgui) must be included explicitly rather than
+// transitively via the old v6 "engine.h".
+#include <opencv2/highgui.hpp>
+#include <opencv2/videoio.hpp>
 
 // Runs object detection on video stream then displays annotated results.
 int main(int argc, char *argv[]) {

From 5eabd7fac0aba98c770db71c1d4e16ea5e8c7996 Mon Sep 17 00:00:00 2001
From: cyrusbehr <cyrus.behr@gmail.com>
Date: Sat, 30 May 2026 12:43:27 -0500
Subject: [PATCH 7/7] docs: make MIGRATION verification note durable (FP16
 count variance)

detect_object_image detects ~10 objects on team.jpg; the exact count shifts by
one or two across FP16 engine rebuilds (TensorRT tactic selection near the
confidence threshold), so state it as approximate rather than pinning an exact
count. Also note the video/CSI demo targets need an OpenCV with highgui/videoio.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 MIGRATION.md | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/MIGRATION.md b/MIGRATION.md
index c5095d3..49c99d8 100644
--- a/MIGRATION.md
+++ b/MIGRATION.md
@@ -7,9 +7,12 @@ PImpl headers with no OpenCV/TensorRT leakage). See the library's `docs/upgradin
 ## Verification status
 
 Built and run on an RTX 3080 Laptop GPU against a CUDA-12.6 OpenCV-CUDA build and the v7 library:
-`detect_object_image` on `images/team.jpg` (FP16 YOLOv8n) detects 9 objects and writes the
-annotated image. The library, `opencv_interop`, the preproc sublib, and the YoloV8 code all compile
-and link.
+`detect_object_image` on `images/team.jpg` (FP16 YOLOv8n) runs end-to-end: it detects the people in
+the frame and writes the annotated image (~10 objects; the exact count shifts by one or two across
+FP16 engine rebuilds, as TensorRT's tactic selection moves borderline detections across the
+confidence threshold). The library, `opencv_interop`, the preproc sublib, and the YoloV8 detection
+code compile and run. The video/CSI demo targets additionally need an OpenCV built with
+`highgui`/`videoio`.
 
 ## Required after pulling: init the submodule