diff --git a/CMakeLists.txt b/CMakeLists.txt
index f62802a..54e54b3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,8 +4,9 @@ project(YOLOv8_TensorRT_CPP)
 # Use ccache to speed up rebuilds
 include(cmake/ccache.cmake)
 
-# Set C++ version and optimization level
-set(CMAKE_CXX_STANDARD 17)
+# Set C++ version and optimization level (tensorrt_cpp_api v7 requires C++20)
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON) # fail at configure time instead of silently decaying to an older standard
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Ofast -DNDEBUG -Wno-deprecated-declarations")
 
 # CMake Options
@@ -19,14 +20,21 @@ find_package(OpenCV REQUIRED)
 # TODO: Specify the path to TensorRT root dir
 set(TensorRT_DIR /home/cyrus/work/libs/TensorRT-10.0.0.6/)
 
-# Build the TensorRT inference engine library
-# TensorRT is found and linked by the tensorrt-cpp-api
+# Build the tensorrt_cpp_api v7 library (git submodule). Enable its OpenCV interop so we can hand
+# cv::cuda::GpuMat frames to the fused preprocessing kernel zero-copy (trtcpp::opencv::viewOf).
+# NOTE: the libs/tensorrt-cpp-api submodule must be at tensorrt_cpp_api v7.0.0+ (see MIGRATION.md).
+# FORCE is deliberate: YoloV8_TRT needs both components, so a stale OFF in the cache must not win.
+set(TRT_CPP_API_WITH_OPENCV ON CACHE BOOL "tensorrt_cpp_api: build the OpenCV interop (required by YoloV8_TRT)" FORCE)
+set(TRT_CPP_API_BUILD_PREPROC ON CACHE BOOL "tensorrt_cpp_api: build the preproc library (required by YoloV8_TRT)" FORCE)
 add_subdirectory(libs/tensorrt-cpp-api)
 
-# Build the YoloV8 library
+# Build the YoloV8 library. v7 exposes namespaced targets and propagates its own include dirs
+# (the tensorrt_cpp_api/ root), so the old libs/.../src include is gone.
 add_library(YoloV8_TRT SHARED src/yolov8.cpp)
-target_link_libraries(YoloV8_TRT PUBLIC tensorrt_cpp_api ${OpenCV_LIBS})
-target_include_directories(YoloV8_TRT PUBLIC libs/tensorrt-cpp-api/src)
+target_link_libraries(YoloV8_TRT PUBLIC
+    tensorrt_cpp_api::tensorrt_cpp_api
+    tensorrt_cpp_api::preproc
+    ${OpenCV_LIBS})
 
 # Build and link the executables
 add_executable(detect_object_image src/object_detection_image.cpp)
diff --git a/MIGRATION.md b/MIGRATION.md
new file mode 100644
index 0000000..49c99d8
--- /dev/null
+++ b/MIGRATION.md
@@ -0,0 +1,64 @@
+# Migration to tensorrt_cpp_api v7
+
+This branch (`v7-migration`) ports YOLOv8-TensorRT-CPP from the v6 `tensorrt-cpp-api` to **v7**,
+which is a clean break (new namespace `trtcpp`, no-throw `Status`/`Result`, name-keyed tensors,
+PImpl headers with no OpenCV/TensorRT leakage). See the library's `docs/upgrading_from_v6.md`.
+
+## Verification status
+
+Built and run on an RTX 3080 Laptop GPU against a CUDA-12.6 OpenCV-CUDA build and the v7 library:
+`detect_object_image` on `images/team.jpg` (FP16 YOLOv8n) runs end-to-end: it detects the people in
+the frame and writes the annotated image (~10 objects; the exact count shifts by one or two across
+FP16 engine rebuilds, as TensorRT's tactic selection moves borderline detections across the
+confidence threshold). The library, `opencv_interop`, the preproc sublib, and the YoloV8 detection
+code compile and run. The video/CSI demo targets additionally need an OpenCV built with
+`highgui`/`videoio`.
+
+## Required after pulling: init the submodule
+
+`libs/tensorrt-cpp-api` is a git submodule pinned to a v7 release; populate it before building:
+
+```sh
+git submodule update --init
+```
+
+## What changed
+
+**Build (`CMakeLists.txt`)**
+- C++17 → **C++20** (v7 requirement).
+- Enable the library's OpenCV interop and preprocessing before `add_subdirectory`:
+  `TRT_CPP_API_WITH_OPENCV=ON`, `TRT_CPP_API_BUILD_PREPROC=ON`.
+- Link the namespaced v7 targets `tensorrt_cpp_api::tensorrt_cpp_api` + `tensorrt_cpp_api::preproc`
+  (was `tensorrt_cpp_api`); dropped the `libs/.../src` include — v7 propagates its own include root.
+
+**Inference layer (`src/yolov8.{h,cpp}`)**
+- `Engine<float>` → `trtcpp::Engine` (non-templated; runtime `DType`). IO is now name-keyed; the
+  class caches `m_inputName`/`m_outputNames`/`m_inputShape`/`m_outputShapes` and a reusable
+  NCHW-float input `Tensor` plus a `Stream`.
+- `Options` + `buildLoadNetwork(onnx, SUB, DIV, NORMALIZE)` → `BuildOptions` +
+  `EngineBuilder::buildAndLoad(onnx, opts)`.
+- Preprocessing: the v6 OpenCV `cvtColor` + `resizeKeepAspectRatioPadRightBottom` + in-engine
+  HWC→NCHW/normalize is replaced by **one fused kernel**, `preproc::letterboxToTensor`, fed a
+  zero-copy `trtcpp::opencv::viewOf(gpuImg)` device view (BGR→RGB via `swapRB`, letterbox pad
+  right/bottom, `scale = 1/255`). Box-mapping ratio is unchanged.
+- `runInference(GpuMat, nested-vectors)` + `Engine<float>::transformOutput` → `engine.infer(...)`
+  returning name-keyed owning `Tensor`s; each output is read back with `toHost(stream)` (explicit
+  D2H + sync) into a flat `std::vector<float>`. The detect/pose/seg **post-processing math is
+  unchanged**; only the way it obtains dims (`getOutputDims().d[i]` → cached `Shape[i]`) and its
+  flat output buffers differ.
+- Errors: v6 `bool`/exception checks → unwrap `Result`/`Status` (throwing a `std::runtime_error`
+  with `.status().message()` to preserve this app's exception-based control flow).
+
+**Precision (`src/cmd_line_util.h`)**
+- `Precision::FP32/FP16/INT8` → `trtcpp::Precision::kFp32/kFp16/kInt8Qdq`.
+- **INT8 caveat:** `kInt8Qdq` expects an explicit Q/DQ ONNX (no calibration data). The v6 flow
+  (a calibration-image directory) maps to `kInt8CalibLegacy`, which is only available when the
+  library is built against **TensorRT < 11** and requires constructing an `ICalibrator`
+  (`tensorrt_cpp_api/calibrator.h`) and setting `BuildOptions.calibrator`. That wiring is **not**
+  ported here — quantize to a QDQ ONNX, or restore a calibrator if you need legacy PTQ.
+
+**Misc**
+- Added `src/stopwatch.h` (a small `std::chrono` `preciseStopwatch`) to replace the timing utility
+  that v6 shipped in the engine library and v7 no longer provides.
+- The OpenCV modules the v6 `engine.h` pulled in transitively (`imgcodecs`, `videoio`, `highgui`)
+  are now included explicitly where used.
diff --git a/libs/tensorrt-cpp-api b/libs/tensorrt-cpp-api
index f93f973..166ce91 160000
--- a/libs/tensorrt-cpp-api
+++ b/libs/tensorrt-cpp-api
@@ -1 +1 @@
-Subproject commit f93f973cd03f1caa710c60d60c0d6feb9ed79e4a
+Subproject commit 166ce91ce71b4ac7c39611796ced4a8bf20fcab9
diff --git a/src/benchmark.cpp b/src/benchmark.cpp
index b7cf8ce..d96f420 100644
--- a/src/benchmark.cpp
+++ b/src/benchmark.cpp
@@ -1,6 +1,7 @@
 #include "cmd_line_util.h"
+#include "stopwatch.h"
 #include "yolov8.h"
-#include <opencv2/cudaimgproc.hpp>
+#include <opencv2/imgcodecs.hpp> // imread (was pulled in transitively by the v6 engine.h)
 
 // Benchmarks the specified model
 int main(int argc, char *argv[]) {
diff --git a/src/cmd_line_util.h b/src/cmd_line_util.h
index 9aed68d..e6a4dc1 100644
--- a/src/cmd_line_util.h
+++ b/src/cmd_line_util.h
@@ -191,11 +191,14 @@ inline bool parseArguments(int argc, char *argv[], YoloV8Config &config, std::st
                     return false;
 
                 if (nextArgument == "FP32") {
-                    config.precision = Precision::FP32;
+                    config.precision = trtcpp::Precision::kFp32;
                 } else if (nextArgument == "FP16") {
-                    config.precision = Precision::FP16;
+                    config.precision = trtcpp::Precision::kFp16;
                 } else if (nextArgument == "INT8") {
-                    config.precision = Precision::INT8;
+                    // v7: kInt8Qdq is the forward-compatible explicit-QDQ path. For v6-style
+                    // calibration-directory PTQ, use kInt8CalibLegacy + an ICalibrator (only when
+                    // the library is built against TensorRT < 11).
+                    config.precision = trtcpp::Precision::kInt8Qdq;
                 } else {
                     std::cout << "Error: Unexpected precision value: " << nextArgument << ", options are FP32, FP16, INT8" << std::endl;
                     return false;
@@ -379,11 +382,14 @@ inline bool parseArgumentsVideo(int argc, char *argv[], YoloV8Config &config, st
                     return false;
 
                 if (nextArgument == "FP32") {
-                    config.precision = Precision::FP32;
+                    config.precision = trtcpp::Precision::kFp32;
                 } else if (nextArgument == "FP16") {
-                    config.precision = Precision::FP16;
+                    config.precision = trtcpp::Precision::kFp16;
                 } else if (nextArgument == "INT8") {
-                    config.precision = Precision::INT8;
+                    // v7: kInt8Qdq is the forward-compatible explicit-QDQ path. For v6-style
+                    // calibration-directory PTQ, use kInt8CalibLegacy + an ICalibrator (only when
+                    // the library is built against TensorRT < 11).
+                    config.precision = trtcpp::Precision::kInt8Qdq;
                 } else {
                     std::cout << "Error: Unexpected precision value: " << nextArgument << ", options are FP32, FP16, INT8" << std::endl;
                     return false;
diff --git a/src/object_detection_csi_jetson.cpp b/src/object_detection_csi_jetson.cpp
index 63682a3..5f94c27 100644
--- a/src/object_detection_csi_jetson.cpp
+++ b/src/object_detection_csi_jetson.cpp
@@ -1,6 +1,10 @@
 #include "cmd_line_util.h"
 #include "yolov8.h"
-#include <opencv2/cudaimgproc.hpp>
+// Under tensorrt_cpp_api v7 the public headers pull in no OpenCV, so the modules this tool needs
+// (VideoCapture from videoio, imshow/waitKey from highgui) must be included explicitly rather than
+// transitively via the old v6 "engine.h".
+#include <opencv2/highgui.hpp>
+#include <opencv2/videoio.hpp>
 
 // Runs object detection on video stream then displays annotated results.
 int main(int argc, char *argv[]) {
diff --git a/src/object_detection_image.cpp b/src/object_detection_image.cpp
index 4271ad1..9f1cc06 100644
--- a/src/object_detection_image.cpp
+++ b/src/object_detection_image.cpp
@@ -1,5 +1,6 @@
 #include "cmd_line_util.h"
 #include "yolov8.h"
+#include <opencv2/imgcodecs.hpp> // imread/imwrite (was pulled in transitively by the v6 engine.h)
 
 // Runs object detection on an input image then saves the annotated image to disk.
 int main(int argc, char *argv[]) {
diff --git a/src/object_detection_video_stream.cpp b/src/object_detection_video_stream.cpp
index 3f53af7..9348107 100644
--- a/src/object_detection_video_stream.cpp
+++ b/src/object_detection_video_stream.cpp
@@ -1,6 +1,7 @@
 #include "cmd_line_util.h"
 #include "yolov8.h"
-#include <opencv2/cudaimgproc.hpp>
+#include <opencv2/highgui.hpp> // imshow/waitKey (was pulled in transitively by the v6 engine.h)
+#include <opencv2/videoio.hpp> // VideoCapture / CAP_PROP_*
 
 // Runs object detection on video stream then displays annotated results.
 int main(int argc, char *argv[]) {
diff --git a/src/stopwatch.h b/src/stopwatch.h
new file mode 100644
index 0000000..04af649
--- /dev/null
+++ b/src/stopwatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// Minimal stopwatch for the ENABLE_BENCHMARKS timing and the benchmark executable. v6 got this
+// from the engine library's util/Stopwatch.h, which v7 does not ship; this is a drop-in local
+// replacement with the same preciseStopwatch interface.
+#include <chrono>
+
+// Measures the time elapsed since the object was constructed, using Clock as the time source.
+template <typename Clock = std::chrono::steady_clock> class Stopwatch {
+public:
+    // Returns the time since construction, expressed in units of Duration (e.g.
+    // std::chrono::microseconds) and converted to the arithmetic type T.
+    template <typename T, typename Duration> T elapsedTime() const {
+        return static_cast<T>(std::chrono::duration_cast<Duration>(Clock::now() - start_).count());
+    }
+
+private:
+    // Captured once at construction; elapsedTime() never modifies it.
+    typename Clock::time_point start_ = Clock::now();
+};
+
+// Intervals must be timed with a monotonic clock. std::chrono::high_resolution_clock may alias
+// the wall clock (it does in libstdc++), so the stopwatch is pinned to steady_clock instead.
+using preciseStopwatch = Stopwatch<std::chrono::steady_clock>;
diff --git a/src/yolov8.cpp b/src/yolov8.cpp
index cbd315c..a2bff04 100644
--- a/src/yolov8.cpp
+++ b/src/yolov8.cpp
@@ -1,77 +1,102 @@
 #include "yolov8.h"
-#include <opencv2/cudaimgproc.hpp>
+#include "stopwatch.h"
+#include <iostream> // std::cout in the ENABLE_BENCHMARKS timing (was transitive via the v6 engine.h)
+#include <stdexcept>
+
+namespace {
+// Converts a trtcpp::Result<T> into its payload for this exception-based app: on success the
+// value is moved out and returned; on failure a std::runtime_error is thrown whose text is
+// "Error: <what>: <status message>". Callers therefore never touch .value() on a failed Result.
+template <class T> T must(trtcpp::Result<T> result, const char *what) {
+    if (result) {
+        return std::move(result).value();
+    }
+    throw std::runtime_error(std::string("Error: ") + what + ": " + result.status().message());
+}
+} // namespace
 
 YoloV8::YoloV8(const std::string &onnxModelPath, const std::string &trtModelPath, const YoloV8Config &config)
     : PROBABILITY_THRESHOLD(config.probabilityThreshold), NMS_THRESHOLD(config.nmsThreshold), TOP_K(config.topK),
       SEG_CHANNELS(config.segChannels), SEG_H(config.segH), SEG_W(config.segW), SEGMENTATION_THRESHOLD(config.segmentationThreshold),
       CLASS_NAMES(config.classNames), NUM_KPS(config.numKPS), KPS_THRESHOLD(config.kpsThreshold) {
-    // Specify options for GPU inference
-    Options options;
-    options.optBatchSize = 1;
-    options.maxBatchSize = 1;
-
+    // Specify build options for the v7 engine builder. (Batch knobs are now expressed as
+    // optimization profiles; this detector uses the model's static 1x3xHxW input.)
+    trtcpp::BuildOptions options;
     options.precision = config.precision;
-    options.calibrationDataDirectoryPath = config.calibrationDataDirectory;
+    options.engineCacheDir = "."; // build-or-load caches next to the working dir; v7 detects staleness
 
-    if (options.precision == Precision::INT8) {
-        if (options.calibrationDataDirectoryPath.empty()) {
-            throw std::runtime_error("Error: Must supply calibration data path for INT8 calibration");
-        }
+    // v7 INT8: only the explicit-QDQ path (Precision::kInt8Qdq, no calibration data) is wired up.
+    // Legacy calibrator PTQ (kInt8CalibLegacy) needs BuildOptions.calibrator, which this
+    // constructor never sets, so reject it here rather than build without a calibrator.
+    if (options.precision == trtcpp::Precision::kInt8CalibLegacy) {
+        throw std::runtime_error("Error: legacy INT8 calibration (kInt8CalibLegacy) is not wired up in this port; use a Q/DQ ONNX with INT8");
     }
 
-    // Create our TensorRT inference engine
-    m_trtEngine = std::make_unique<Engine<float>>(options);
-
-    // Build the onnx model into a TensorRT engine file, cache the file to disk, and then load the TensorRT engine file into memory.
-    // If the engine file already exists on disk, this function will not rebuild but only load into memory.
-    // The engine file is rebuilt any time the above Options are changed.
-    if (!onnxModelPath.empty()) {
-        // Build the ONNX model into a TensorRT engine file
-        auto succ = m_trtEngine->buildLoadNetwork(onnxModelPath, SUB_VALS, DIV_VALS, NORMALIZE);
-        if (!succ) {
-            const std::string errMsg = "Error: Unable to build or load the TensorRT engine from ONNX model. "
-                                       "Try increasing TensorRT log severity to kVERBOSE (in /libs/tensorrt-cpp-api/engine.cpp).";
-            throw std::runtime_error(errMsg);
+    // Obtain a ready-to-run v7 Engine, either by building the ONNX into a TensorRT engine (caching
+    // it next to the working dir, rebuilding only when stale) or by loading a prebuilt .trt/.engine
+    // file directly. Preprocessing (BGR->RGB, letterbox, 1/255 scale) is fused on the GPU in
+    // preprocess(), so the v6 SUB_VALS/DIV_VALS/NORMALIZE are no longer passed at build/load time.
+    auto loadEngine = [&]() -> trtcpp::Result<trtcpp::Engine> {
+        if (!onnxModelPath.empty()) {
+            // Build the ONNX model into a TensorRT engine (or load a fresh cached one) and deserialize it.
+            return trtcpp::EngineBuilder{}.buildAndLoad(onnxModelPath, options);
         }
-    } else if (!trtModelPath.empty()) { // If no ONNX model, check for TRT model
-        // Load the TensorRT engine file directly
-        bool succ = m_trtEngine->loadNetwork(trtModelPath, SUB_VALS, DIV_VALS, NORMALIZE);
-        if (!succ) {
-            throw std::runtime_error("Error: Unable to load TensorRT engine from " + trtModelPath);
+        if (!trtModelPath.empty()) {
+            // No ONNX model: deserialize a prebuilt TensorRT engine file directly.
+            return trtcpp::Engine::loadFromFile(trtModelPath);
         }
-    } else {
-        throw std::runtime_error("Error: Neither ONNX model nor TensorRT engine path provided.");
+        return trtcpp::Status{trtcpp::StatusCode::kInvalidArgument, "Neither ONNX model nor TensorRT engine path provided."};
+    };
+    auto engine = loadEngine();
+    if (!engine) {
+        throw std::runtime_error("Error: Unable to build or load the TensorRT engine: " + engine.status().message());
     }
-}
-
-std::vector<std::vector<cv::cuda::GpuMat>> YoloV8::preprocess(const cv::cuda::GpuMat &gpuImg) {
-    // Populate the input vectors
-    const auto &inputDims = m_trtEngine->getInputDims();
-
-    // Convert the image from BGR to RGB
-    cv::cuda::GpuMat rgbMat;
-    cv::cuda::cvtColor(gpuImg, rgbMat, cv::COLOR_BGR2RGB);
-
-    auto resized = rgbMat;
-
-    // Resize to the model expected input size while maintaining the aspect ratio with the use of padding
-    if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) {
-        // Only resize if not already the right size to avoid unecessary copy
-        resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(rgbMat, inputDims[0].d[1], inputDims[0].d[2]);
+    m_engine = std::make_unique<trtcpp::Engine>(std::move(engine).value());
+
+    // Cache IO metadata once (v7 is name-keyed and non-templated).
+    m_inputName = m_engine->inputNames().front();
+    m_outputNames = m_engine->outputNames();
+    m_inputShape = must(m_engine->tensorShape(m_inputName), "query input shape"); // [1,3,H,W]
+    for (const auto &name : m_outputNames) {
+        m_outputShapes.push_back(must(m_engine->tensorShape(name), "query output shape"));
     }
 
-    // Convert to format expected by our inference engine
-    // The reason for the strange format is because it supports models with multiple inputs as well as batching
-    // In our case though, the model only has a single input and we are using a batch size of 1.
-    std::vector<cv::cuda::GpuMat> input{std::move(resized)};
-    std::vector<std::vector<cv::cuda::GpuMat>> inputs{std::move(input)};
-
-    // These params will be used in the post-processing stage
-    m_imgHeight = rgbMat.rows;
-    m_imgWidth = rgbMat.cols;
-    m_ratio = 1.f / std::min(inputDims[0].d[2] / static_cast<float>(rgbMat.cols), inputDims[0].d[1] / static_cast<float>(rgbMat.rows));
+    // Pre-allocate the NCHW float input tensor. allocate() errors (and we throw) on a dynamic
+    // input shape or a CUDA OOM rather than crashing on an unchecked .value().
+    m_input = must(trtcpp::Tensor::allocate(trtcpp::DType::kFloat32, m_inputShape, trtcpp::Device::kCuda), "allocate input tensor");
+}
 
-    return inputs;
+void YoloV8::preprocess(const cv::cuda::GpuMat &gpuImg) {
+    if (gpuImg.empty()) { // a 0x0 frame would make the letterbox ratio below divide by zero
+        throw std::runtime_error("Error: cannot preprocess an empty image");
+    }
+    // Record the source dims and the letterbox ratio post-processing uses to map boxes back.
+    m_imgHeight = static_cast<float>(gpuImg.rows);
+    m_imgWidth = static_cast<float>(gpuImg.cols);
+    const int inH = static_cast<int>(m_inputShape[2]); // m_inputShape is [1, 3, H, W]
+    const int inW = static_cast<int>(m_inputShape[3]);
+    m_ratio = 1.f / std::min(inW / m_imgWidth, inH / m_imgHeight);
+
+    // One fused kernel (BGR->RGB, letterbox pad right/bottom, 1/255 scale) fills m_input as NCHW float.
+    trtcpp::preproc::PreprocSpec spec;
+    spec.swapRB = true;             // OpenCV GpuMat is BGR; the model expects RGB
+    spec.keepAspectRatioPad = true; // letterbox, pad right/bottom (matches v6)
+    spec.scale = {1.f / 255.f, 1.f / 255.f, 1.f / 255.f, 1.f};
+
+    // Pitched (non-continuous) GpuMats are repacked first, so the view is zero-copy only for
+    // continuous input. NOTE(review): confirm the repack copy is ordered before work on m_stream.
+    cv::cuda::GpuMat continuous = gpuImg;
+    if (!gpuImg.isContinuous()) {
+        cv::cuda::createContinuous(gpuImg.rows, gpuImg.cols, gpuImg.type(), continuous);
+        gpuImg.copyTo(continuous);
+    }
+    auto src = trtcpp::opencv::viewOf(continuous); // HWC-uint8 device view
+    if (!src) {
+        throw std::runtime_error("Error: could not view the input GpuMat: " + src.status().message());
+    }
+    if (auto s = trtcpp::preproc::letterboxToTensor(src.value(), m_input.view(), spec, m_stream); !s) {
+        throw std::runtime_error("Error: preprocessing failed: " + s.message());
+    }
+}
 
 std::vector<Object> YoloV8::detectObjects(const cv::cuda::GpuMat &inputImageBGR) {
@@ -80,7 +105,7 @@ std::vector<Object> YoloV8::detectObjects(const cv::cuda::GpuMat &inputImageBGR)
     static int numIts = 1;
     preciseStopwatch s1;
 #endif
-    const auto input = preprocess(inputImageBGR);
+    preprocess(inputImageBGR); // fills m_input
 #ifdef ENABLE_BENCHMARKS
     static long long t1 = 0;
     t1 += s1.elapsedTime<long long, std::chrono::microseconds>();
@@ -90,10 +115,21 @@ std::vector<Object> YoloV8::detectObjects(const cv::cuda::GpuMat &inputImageBGR)
 #ifdef ENABLE_BENCHMARKS
     preciseStopwatch s2;
 #endif
-    std::vector<std::vector<std::vector<float>>> featureVectors;
-    auto succ = m_trtEngine->runInference(input, featureVectors);
-    if (!succ) {
-        throw std::runtime_error("Error: Unable to run inference.");
+    auto outputs = m_engine->infer({{m_inputName, m_input.view()}}, m_stream);
+    if (!outputs) {
+        throw std::runtime_error("Error: Unable to run inference: " + outputs.status().message());
+    }
+    // Read each output back to a flat host float vector, in output-binding order. (v7 returns
+    // name-keyed owning Tensors; toHost performs the D2H copy AND synchronizes the stream.)
+    std::vector<std::vector<float>> featureVectors;
+    featureVectors.reserve(m_outputNames.size());
+    for (const auto &name : m_outputNames) {
+        auto host = outputs->at(name).toHost(m_stream);
+        if (!host) {
+            throw std::runtime_error("Error: output readback failed: " + host.status().message());
+        }
+        const auto span = must(host->as<float>(), "output tensor is not float32 (rebuild the engine with a float output)");
+        featureVectors.emplace_back(span.begin(), span.end());
     }
 #ifdef ENABLE_BENCHMARKS
     static long long t2 = 0;
@@ -102,32 +138,28 @@ std::vector<Object> YoloV8::detectObjects(const cv::cuda::GpuMat &inputImageBGR)
     preciseStopwatch s3;
 #endif
     // Check if our model does only object detection or also supports segmentation
+    // v7 already gives one flat host vector per output (batch size 1), so the v6 transformOutput
+    // 3D->1D/2D flattening is no longer needed.
     std::vector<Object> ret;
-    const auto &numOutputs = m_trtEngine->getOutputDims().size();
-    if (numOutputs == 1) {
-        // Object detection or pose estimation
-        // Since we have a batch size of 1 and only 1 output, we must convert the output from a 3D array to a 1D array.
-        std::vector<float> featureVector;
-        Engine<float>::transformOutput(featureVectors, featureVector);
-
-        const auto &outputDims = m_trtEngine->getOutputDims();
-        size_t numChannels = outputDims[outputDims.size() - 1].d[1];
+    if (m_outputShapes.size() == 1) {
+        // Object detection or pose estimation. Output shape is [1, C, anchors]; the channel count C
+        // distinguishes the two: pose adds NUM_KPS*3 keypoint values on top of (4 box + classes),
+        // while plain detection is just (4 box + classes). No magic number; works with Ultralytics
+        // pretrained models.
+        const size_t numChannels = static_cast<size_t>(m_outputShapes[0][1]);
         if (numChannels == 4 + CLASS_NAMES.size() + NUM_KPS * 3) {
             // Pose estimation
-            ret = postprocessPose(featureVector);
-        } else if (numChannels == 4 + CLASS_NAMES.size()){
+            ret = postprocessPose(featureVectors[0]);
+        } else if (numChannels == 4 + CLASS_NAMES.size()) {
             // Object detection
-            ret = postprocessDetect(featureVector);
+            ret = postprocessDetect(featureVectors[0]);
         }
         else {
             throw std::runtime_error("Error: Unable to identify whether the model is for Pose estimation or Object detection.");
         }
     } else {
-        // Segmentation
-        // Since we have a batch size of 1 and 2 outputs, we must convert the output from a 3D array to a 2D array.
-        std::vector<std::vector<float>> featureVector;
-        Engine<float>::transformOutput(featureVectors, featureVector);
-        ret = postProcessSegmentation(featureVector);
+        // Instance segmentation (detections + mask prototypes).
+        ret = postProcessSegmentation(featureVectors);
     }
 #ifdef ENABLE_BENCHMARKS
     static long long t3 = 0;
@@ -147,10 +179,8 @@ std::vector<Object> YoloV8::detectObjects(const cv::Mat &inputImageBGR) {
 }
 
 std::vector<Object> YoloV8::postProcessSegmentation(std::vector<std::vector<float>> &featureVectors) {
-    const auto &outputDims = m_trtEngine->getOutputDims();
-
-    int numChannels = outputDims[0].d[1];
-    int numAnchors = outputDims[0].d[2];
+    int numChannels = static_cast<int>(m_outputShapes[0][1]);
+    int numAnchors = static_cast<int>(m_outputShapes[0][2]);
 
     const auto numClasses = numChannels - SEG_CHANNELS - 4;
 
@@ -237,7 +267,6 @@ std::vector<Object> YoloV8::postProcessSegmentation(std::vector<std::vector<floa
 
         std::vector<cv::Mat> maskChannels;
         cv::split(maskMat, maskChannels);
-        const auto inputDims = m_trtEngine->getInputDims();
 
         cv::Rect roi;
         if (m_imgHeight > m_imgWidth) {
@@ -260,9 +289,8 @@ std::vector<Object> YoloV8::postProcessSegmentation(std::vector<std::vector<floa
 }
 
 std::vector<Object> YoloV8::postprocessPose(std::vector<float> &featureVector) {
-    const auto &outputDims = m_trtEngine->getOutputDims();
-    auto numChannels = outputDims[0].d[1];
-    auto numAnchors = outputDims[0].d[2];
+    const auto numChannels = static_cast<int>(m_outputShapes[0][1]);
+    const auto numAnchors = static_cast<int>(m_outputShapes[0][2]);
 
     std::vector<cv::Rect> bboxes;
     std::vector<float> scores;
@@ -342,9 +370,8 @@ std::vector<Object> YoloV8::postprocessPose(std::vector<float> &featureVector) {
 }
 
 std::vector<Object> YoloV8::postprocessDetect(std::vector<float> &featureVector) {
-    const auto &outputDims = m_trtEngine->getOutputDims();
-    auto numChannels = outputDims[0].d[1];
-    auto numAnchors = outputDims[0].d[2];
+    const auto numChannels = static_cast<int>(m_outputShapes[0][1]);
+    const auto numAnchors = static_cast<int>(m_outputShapes[0][2]);
 
     auto numClasses = CLASS_NAMES.size();
 
diff --git a/src/yolov8.h b/src/yolov8.h
index 366b370..88452b2 100644
--- a/src/yolov8.h
+++ b/src/yolov8.h
@@ -1,6 +1,20 @@
 #pragma once
-#include "engine.h"
+// Migrated to tensorrt_cpp_api v7. The v7 public headers deliberately pull in no OpenCV, so the
+// OpenCV headers this class needs (cv::Mat / GpuMat / dnn::NMSBoxesBatched) are now included
+// explicitly here rather than transitively via the old "engine.h".
+#include <tensorrt_cpp_api/all.h>
+#include <tensorrt_cpp_api/opencv_interop.h>
+#include <tensorrt_cpp_api/preproc.h>
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/cuda.hpp>
+#include <opencv2/dnn.hpp>
+#include <opencv2/imgproc.hpp>
+
 #include <fstream>
+#include <memory>
+#include <string>
+#include <vector>
 
 // Utility method for checking if a file exists on disk
 inline bool doesFileExist(const std::string &name) {
@@ -25,7 +39,7 @@ struct Object {
 // Can pass these arguments as command line parameters.
 struct YoloV8Config {
     // The precision to be used for inference
-    Precision precision = Precision::FP16;
+    trtcpp::Precision precision = trtcpp::Precision::kFp16;
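-    // Calibration data directory. Must be specified when using INT8 precision.
+    // Calibration data directory (v6-style calibrator PTQ only). Not passed to the v7 builder; kInt8Qdq needs none.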
     std::string calibrationDataDirectory;
     // Probability threshold used to filter detected objects
@@ -71,8 +85,8 @@ class YoloV8 {
     void drawObjectLabels(cv::Mat &image, const std::vector<Object> &objects, unsigned int scale = 2);
 
 private:
-    // Preprocess the input
-    std::vector<std::vector<cv::cuda::GpuMat>> preprocess(const cv::cuda::GpuMat &gpuImg);
+    // Preprocess the input (fills m_input + m_ratio/m_imgWidth/m_imgHeight)
+    void preprocess(const cv::cuda::GpuMat &gpuImg);
 
     // Postprocess the output
     std::vector<Object> postprocessDetect(std::vector<float> &featureVector);
@@ -83,7 +97,15 @@ class YoloV8 {
     // Postprocess the output for segmentation model
     std::vector<Object> postProcessSegmentation(std::vector<std::vector<float>> &featureVectors);
 
-    std::unique_ptr<Engine<float>> m_trtEngine = nullptr;
+    // v7 engine + cached IO metadata (v7 is name-keyed and non-templated). The owning input
+    // Tensor is reused across frames; the caller-owned stream drives async work.
+    std::unique_ptr<trtcpp::Engine> m_engine = nullptr;
+    trtcpp::Stream m_stream;
+    std::string m_inputName;
+    std::vector<std::string> m_outputNames;
+    trtcpp::Shape m_inputShape;                // [1,3,H,W]
+    std::vector<trtcpp::Shape> m_outputShapes; // build-time shapes, in output-binding order
+    trtcpp::Tensor m_input;                    // NCHW float device tensor fed to the engine
 
     // Used for image preprocessing
     // YoloV8 model expects values between [0.f, 1.f] so we use the following params