From 138d41ec75a824a7a73b601393bc6cf32abbe391 Mon Sep 17 00:00:00 2001 From: Jan Solanti Date: Mon, 24 Nov 2025 18:20:06 +0200 Subject: [PATCH 1/5] hacky patches to make it work on pocl-cuda --- ggml/src/ggml-opencl/ggml-opencl.cpp | 59 +++++++++++++------ ggml/src/ggml-opencl/kernels/argsort.cl | 4 +- ggml/src/ggml-opencl/kernels/exp.cl | 6 +- ggml/src/ggml-opencl/kernels/expm1.cl | 6 +- ggml/src/ggml-opencl/kernels/glu.cl | 2 +- .../kernels/mul_mv_id_mxfp4_f32.cl | 4 ++ .../kernels/mul_mv_id_mxfp4_f32_flat.cl | 4 ++ .../kernels/mul_mv_id_q4_0_f32_8x_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl | 4 ++ .../kernels/mul_mv_id_q8_0_f32_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl | 4 ++ .../kernels/mul_mv_iq4_nl_f32_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_mxfp4_f32.cl | 4 ++ .../kernels/mul_mv_mxfp4_f32_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_q4_0_f32.cl | 4 ++ .../kernels/mul_mv_q4_0_f32_1d_16x_flat.cl | 4 ++ .../kernels/mul_mv_q4_0_f32_1d_8x_flat.cl | 4 ++ .../kernels/mul_mv_q4_0_f32_8x_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_q4_1_f32.cl | 4 ++ .../kernels/mul_mv_q4_1_f32_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_q4_k_f32.cl | 6 ++ .../kernels/mul_mv_q4_k_f32_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_q5_k_f32.cl | 4 ++ .../kernels/mul_mv_q5_k_f32_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_q6_k_f32.cl | 4 ++ .../kernels/mul_mv_q6_k_f32_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_q8_0_f32.cl | 4 ++ .../kernels/mul_mv_q8_0_f32_flat.cl | 4 ++ ggml/src/ggml-opencl/kernels/rope.cl | 4 +- ggml/src/ggml-opencl/kernels/sigmoid.cl | 2 +- ggml/src/ggml-opencl/kernels/softplus.cl | 6 +- ggml/src/ggml-opencl/kernels/tanh.cl | 6 +- 33 files changed, 157 insertions(+), 36 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index d344bde0fe3..3e688134af8 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -779,7 +779,8 @@ inline std::string read_file(const std::string &path) { return text; } -static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) { +#define build_program_from_source(ctx, dev, program_buffer, compile_opts) build_program_from_source_impl(__LINE__, ctx, dev, program_buffer, compile_opts) +static cl_program build_program_from_source_impl(size_t line, cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) { cl_program p; char *program_log; size_t program_size; @@ -800,7 +801,7 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co program_log = (char*) malloc(log_size + 1); program_log[log_size] = '\0'; clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); - GGML_LOG_ERROR("ggml_opencl: kernel compile error:\n\n%s\n", program_log); + GGML_LOG_ERROR("ggml_opencl:%lu: kernel compile error:\n\n%s\n", (unsigned long)line, program_log); free(program_log); exit(1); } @@ -3313,7 +3314,6 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { } else { GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str()); backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; - return nullptr; } #ifdef GGML_OPENCL_USE_ADRENO_KERNELS @@ -8259,7 +8259,7 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c } else if (backend_ctx->gpu_family == INTEL) { sgs = 32; } else { - 
GGML_ASSERT(false && "Unsupported GPU"); + sgs = 32; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); @@ -8346,7 +8346,7 @@ static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * } else if (backend_ctx->gpu_family == INTEL) { sgs = 32; } else { - GGML_ASSERT(false && "Unsupported GPU"); + sgs = 32; } cl_kernel kernel = backend_ctx->kernel_rms_norm_mul; @@ -8425,7 +8425,7 @@ static void ggml_opencl_op_norm_fused(ggml_backend_t backend, ggml_tensor * norm size_t sgs; if (backend_ctx->gpu_family == ADRENO) sgs = 64; else if (backend_ctx->gpu_family == INTEL) sgs = 32; - else GGML_ASSERT(false && "Unsupported GPU"); + else sgs = 32; cl_kernel kernel = backend_ctx->kernel_norm_mul_add; @@ -8558,7 +8558,7 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, } else if (backend_ctx->gpu_family == INTEL) { sgs = 32; } else { - GGML_ASSERT(false && "Unsupported GPU"); + sgs = 32; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); @@ -11845,7 +11845,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth0 = 32; + nth1 = 1; + + kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); @@ -11903,7 +11906,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co nth0 = 64; nth1 = 1; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth0 = 32; + nth1 = 1; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); @@ -11940,7 +11944,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co nth0 = 64; nth1 = 1; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth0 = 32; + nth1 = 1; } if (src1t == GGML_TYPE_F32) { @@ -12002,7 +12007,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat; ndst =8; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth0 = 32; + nth1 = 1; + kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat; + ndst = 8; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); @@ -12139,7 +12147,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co nth1 = 2; ndst = nth1*4; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth0 = 32; + nth1 = 2; + ndst = nth1*4; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q)); @@ -12461,7 +12471,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co nth1 = 2; ndst = 1; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth0 = 2; + nth1 = 32; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); @@ -12499,7 +12510,11 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co q = extra0_mxfp4->q_img; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth0 = 32; + nth1 = 2; + ndst = nth1*2; + + q = extra0_mxfp4->q; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q)); @@ -12808,7 +12823,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, nsg = 1; ndst = 8; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + sgs = 32; + nsg = 1; + ndst = 8; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); @@ -12852,7 +12869,9 @@ static void 
ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, nsg = 2; ndst = 4; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + sgs = 32; + nsg = 2; + ndst = 4; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q)); @@ -13107,7 +13126,11 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, q = extra0_mxfp4->q_img; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + sgs = 32; + nsg = 2; + ndst = 2; + + q = extra0_mxfp4->q; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q)); @@ -13606,7 +13629,7 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c else if (backend_ctx->gpu_family == ADRENO) { nth = 64; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth = MIN(32, ne00); } cl_kernel kernel; diff --git a/ggml/src/ggml-opencl/kernels/argsort.cl b/ggml/src/ggml-opencl/kernels/argsort.cl index af4adc7b83f..ab24d1dfd2b 100644 --- a/ggml/src/ggml-opencl/kernels/argsort.cl +++ b/ggml/src/ggml-opencl/kernels/argsort.cl @@ -43,8 +43,8 @@ kernel void kernel_argsort_f32_i32( return; } - src0 = (global char *)((global char *)src0 + offset0); - dst = (global float *)((global char *)dst + offsetd); + src0 = (global float *)((global char *)src0 + offset0); + dst = (global int *)((global char *)dst + offsetd); global float * x_row = src0 + row * ne00; diff --git a/ggml/src/ggml-opencl/kernels/exp.cl b/ggml/src/ggml-opencl/kernels/exp.cl index a2458b6579c..c70a2ad5fd2 100644 --- a/ggml/src/ggml-opencl/kernels/exp.cl +++ b/ggml/src/ggml-opencl/kernels/exp.cl @@ -45,7 +45,7 @@ kernel void kernel_exp_f16( src0 = (global half*)((global char*)src0 + offset0); dst = (global half*)((global char*)dst + offsetd); - dst[get_global_id(0)] = exp(src0[get_global_id(0)]); + dst[get_global_id(0)] = convert_half(exp(convert_float(src0[get_global_id(0)]))); } kernel void kernel_exp_f16_4( @@ -61,7 +61,7 @@ kernel void kernel_exp_f16_4( src0 = (global half4*)((global char*)src0 + offset0); dst = (global half4*)((global char*)dst + offsetd); - dst[get_global_id(0)] = exp(src0[get_global_id(0)]); + dst[get_global_id(0)] = convert_half4(exp(convert_float4(src0[get_global_id(0)]))); } kernel void kernel_exp_f32_nc( @@ -120,6 +120,6 @@ kernel void kernel_exp_f16_nc( global const half * x = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); global half * y = (global half *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - *y = exp(*x); + *y = convert_half(exp(convert_float(*x))); } } diff --git a/ggml/src/ggml-opencl/kernels/expm1.cl b/ggml/src/ggml-opencl/kernels/expm1.cl index 05442ac2043..1434447a438 100644 --- a/ggml/src/ggml-opencl/kernels/expm1.cl +++ b/ggml/src/ggml-opencl/kernels/expm1.cl @@ -37,7 +37,7 @@ kernel void kernel_expm1_f16( src0 = (global half*)((global char*)src0 + offset0); dst = (global half*)((global char*)dst + offsetd); - dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0h; + dst[get_global_id(0)] = convert_half(exp(convert_float(src0[get_global_id(0)]))) - 1.0h; } kernel void kernel_expm1_f16_4( @@ -49,7 +49,7 @@ kernel void kernel_expm1_f16_4( src0 = (global half4*)((global char*)src0 + offset0); dst = (global half4*)((global char*)dst + offsetd); - dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0h; + dst[get_global_id(0)] = convert_half4(exp(convert_float4(src0[get_global_id(0)]))) - 1.0h; } kernel void kernel_expm1_f32_nc( @@ -108,6 +108,6 @@ kernel void kernel_expm1_f16_nc( global const half * x = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + 
i0*nb00); global half * y = (global half *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - *y = exp(*x) - 1.0f; + *y = convert_half(exp(convert_float(*x)) - 1.0f); } } diff --git a/ggml/src/ggml-opencl/kernels/glu.cl b/ggml/src/ggml-opencl/kernels/glu.cl index 059a4bbf1ba..aaefa68a2b8 100644 --- a/ggml/src/ggml-opencl/kernels/glu.cl +++ b/ggml/src/ggml-opencl/kernels/glu.cl @@ -196,7 +196,7 @@ kernel void kernel_swiglu_f16( const half x0 = src0_row[i0]; const half x1 = src1_row[i0]; - const half silu = x0 / (1.0f + exp(-x0)); + const half silu = x0 / (1.0f + exp((float)-x0)); dst_row[i0] = silu*x1; } diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl index d50bd1fc428..8ec5140ea15 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl @@ -48,6 +48,10 @@ static inline float e8m0_to_fp32(uchar x) { #define N_R0_MXFP4 2 #define N_SG_MXFP4 2 #define N_SIMDWIDTH 64 +#else +#define N_R0_MXFP4 4 +#define N_SG_MXFP4 1 +#define N_SIMDWIDTH 32 #endif inline void mul_mv_mxfp4_f32( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl index f65e86ed6a2..87fb3d0f51c 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl @@ -63,6 +63,10 @@ static inline float e8m0_to_fp32(uchar x) { #define N_SG_MXFP4 1 #define N_SIMDWIDTH 64 #define SRC0Q_IMG +#else +#define N_R0_MXFP4 4 +#define N_SG_MXFP4 1 +#define N_SIMDWIDTH 32 #endif kernel void kernel_mul_mv_id_mxfp4_f32_flat( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl index 7ccf41efbe9..7dc55de72df 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl @@ -88,6 +88,10 @@ inline float block_q_4_0_dot_y_flat( #define N_DST 8 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 8 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif inline void mul_vec_q_n_f32_8x_flat( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl index f37e83ee8aa..4c98bfc5485 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl @@ -34,6 +34,10 @@ typedef struct { #define N_R0_Q8_0 4 #define N_SG_Q8_0 2 #define N_SIMDWIDTH 64 +#else +#define N_R0_Q8_0 8 +#define N_SG_Q8_0 1 +#define N_SIMDWIDTH 32 #endif #ifdef INTEL_GPU diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl index fd3a0710f5c..2ef8df57909 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl @@ -34,6 +34,10 @@ typedef struct { #define N_R0_Q8_0 4 #define N_SG_Q8_0 2 #define N_SIMDWIDTH 64 +#else +#define N_R0_Q8_0 8 +#define N_SG_Q8_0 1 +#define N_SIMDWIDTH 32 #endif #ifdef INTEL_GPU diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl index a6a325cd729..85bd5234525 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl @@ -69,6 +69,10 @@ inline float block_iq4_nl_dot_y( #define N_DST 4 #define N_SUBGROUP 1 #define N_SUBGROUP_SIZE 64 +#else +#define N_DST 4 
+#define N_SUBGROUP 1 +#define N_SUBGROUP_SIZE 32 #endif inline void mul_vec_q_n_f32( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl index 8c5b3f52e42..d83c1fb5b71 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl @@ -73,6 +73,10 @@ inline float block_iq4_nl_dot_y_flat( #define N_DST 8 #define N_SUBGROUP 1 #define N_SUBGROUP_SIZE 64 +#else +#define N_DST 4 +#define N_SUBGROUP 1 +#define N_SUBGROUP_SIZE 32 #endif inline void mul_vec_q_n_f32_8x_flat( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl index 9a4d4b9bad1..b06e4ed4800 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl @@ -48,6 +48,10 @@ static inline float e8m0_to_fp32(uchar x) { #define N_R0_MXFP4 2 #define N_SG_MXFP4 2 #define N_SIMDWIDTH 64 +#else +#define N_R0_MXFP4 4 +#define N_SG_MXFP4 1 +#define N_SIMDWIDTH 32 #endif #ifdef INTEL_GPU diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl index 3d5a923eee0..b2fcb1dd3cd 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl @@ -63,6 +63,10 @@ static inline float e8m0_to_fp32(uchar x) { #define N_SG_MXFP4 2 #define N_SIMDWIDTH 64 #define SRC0Q_IMG +#else +#define N_R0_MXFP4 4 +#define N_SG_MXFP4 1 +#define N_SIMDWIDTH 32 #endif #ifdef INTEL_GPU diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl index 52141e0ed55..8fdfc5fb757 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl @@ -80,6 +80,10 @@ inline float block_q_4_0_dot_y( #define N_DST 4 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif inline void mul_vec_q_n_f32( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl index 3eebab8f0f2..1c3b65d70ad 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl @@ -89,6 +89,10 @@ inline float mm_block_q_4_0_dot_y_flat( #define N_DST 16 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 16 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif // // This variant performs 1d blocking with 16x output. diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl index 38024d00ad5..2578d71d369 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl @@ -89,6 +89,10 @@ inline float mm_block_q_4_0_dot_y_flat( #define N_DST 8 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 8 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif // // This variant performs 1d blocking with 8x output. 
diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl index aed1ce7b260..79882d27e23 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl @@ -99,6 +99,10 @@ inline float block_q_4_0_dot_y_flat( #define N_DST 8 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 8 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif inline void mul_vec_q_n_f32_8x_flat( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl index 92955217971..ce83a292cc1 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl @@ -96,6 +96,10 @@ inline float block_q_4_0_dot_y_v( #define N_DST 4 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif inline void mul_vec_q_n_f32_v( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl index 6fe828f20e7..c68e7f73c34 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl @@ -74,6 +74,10 @@ inline float block_q4_1_dot_y( #define N_DST 4 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif inline void mul_vec_q_n_f32( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl index d7c4645d675..047a154d64e 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl @@ -75,6 +75,10 @@ inline float block_q4_1_dot_y_flat( #define N_DST 4 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif inline void mul_vec_q_n_f32_flat( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl index 71ab9898213..3d94d985002 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl @@ -1,3 +1,5 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + #ifdef cl_intel_required_subgroup_size #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable #define INTEL_GPU 1 @@ -38,6 +40,10 @@ typedef struct { #define N_DST 4 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif #undef BLOCK_STRIDE diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl index d92fb968904..511b73bdf61 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl @@ -47,6 +47,10 @@ typedef struct { #define N_DST 16 #define N_SIMDGROUP 2 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif #undef BLOCK_STRIDE diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl index b2058abc1b6..f3c01969671 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl @@ -41,6 +41,10 @@ typedef struct { #define N_DST 4 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif #define 
BLOCK_STRIDE (N_SIMDWIDTH/8) diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl index e353a72be70..ed96570ed1b 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl @@ -45,6 +45,10 @@ typedef struct { #define N_DST 16 #define N_SIMDGROUP 2 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif #undef BLOCK_STRIDE diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl index 819e5192e35..d3c12d474e2 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl @@ -68,6 +68,10 @@ typedef struct { #define N_DST 1 #define N_SIMDGROUP 2 #define N_SIMDWIDTH 64 +#else +#define N_DST 1 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif #define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl index 86fe09c6dd6..61f59fcac28 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl @@ -92,6 +92,10 @@ inline float block_q_6_K_dot_y_flat( #define N_DST 4 #define N_SIMDGROUP 2 #define N_SIMDWIDTH 64 +#else +#define N_DST 1 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif #define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl index 7e88c7494de..356c087fb77 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl @@ -34,6 +34,10 @@ typedef struct { #define N_R0_Q8_0 4 #define N_SG_Q8_0 2 #define N_SIMDWIDTH 64 +#else +#define N_R0_Q8_0 8 +#define N_SG_Q8_0 1 +#define N_SIMDWIDTH 32 #endif #ifdef INTEL_GPU diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl index 71d159fd521..d0f7644d912 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl @@ -34,6 +34,10 @@ typedef struct { #define N_R0_Q8_0 4 #define N_SG_Q8_0 2 #define N_SIMDWIDTH 64 +#else +#define N_R0_Q8_0 8 +#define N_SG_Q8_0 1 +#define N_SIMDWIDTH 32 #endif #ifdef INTEL_GPU diff --git a/ggml/src/ggml-opencl/kernels/rope.cl b/ggml/src/ggml-opencl/kernels/rope.cl index 82f4cd87407..a713cdafe0f 100644 --- a/ggml/src/ggml-opencl/kernels/rope.cl +++ b/ggml/src/ggml-opencl/kernels/rope.cl @@ -509,7 +509,7 @@ kernel void kernel_rope_multi_f16( src0 = (global void*)((global char*)src0 + offset0); src1 = (global int*)((global char*)src1 + offset1); src2 = (global float*)((global char*)src2 + offset2); - dst = (global float*)((global char*)dst + offsetd); + dst = (global half*)((global char*)dst + offsetd); int i3 = get_group_id(2); int i2 = get_group_id(1); @@ -702,7 +702,7 @@ kernel void kernel_rope_vision_f16( src0 = (global void*)((global char*)src0 + offset0); src1 = (global int*)((global char*)src1 + offset1); src2 = (global float*)((global char*)src2 + offset2); - dst = (global float*)((global char*)dst + offsetd); + dst = (global half*)((global char*)dst + offsetd); int i3 = get_group_id(2); int i2 = get_group_id(1); diff --git a/ggml/src/ggml-opencl/kernels/sigmoid.cl b/ggml/src/ggml-opencl/kernels/sigmoid.cl 
index e3f669dde83..1337902a1a7 100644
--- a/ggml/src/ggml-opencl/kernels/sigmoid.cl
+++ b/ggml/src/ggml-opencl/kernels/sigmoid.cl
@@ -25,5 +25,5 @@ kernel void kernel_sigmoid_f16(
     src0 = (global half*)((global char*)src0 + offset0);
     dst = (global half*)((global char*)dst + offsetd);
 
-    dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
+    dst[get_global_id(0)] = 1.0f / (1.0f + exp((float)-src0[get_global_id(0)]));
 }
diff --git a/ggml/src/ggml-opencl/kernels/softplus.cl b/ggml/src/ggml-opencl/kernels/softplus.cl
index 6f8b7474165..91fab7ec997 100644
--- a/ggml/src/ggml-opencl/kernels/softplus.cl
+++ b/ggml/src/ggml-opencl/kernels/softplus.cl
@@ -38,7 +38,7 @@ kernel void kernel_softplus_f16(
     dst = (global half*)((global char*)dst + offsetd);
 
     const float x = convert_float(src0[get_global_id(0)]);
-    dst[get_global_id(0)] = convert_half_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
+    dst[get_global_id(0)] = convert_half((x > 20.0f) ? x : log(1.0f + exp(x)));
 }
 
 kernel void kernel_softplus_f16_4(
@@ -51,7 +51,7 @@ kernel void kernel_softplus_f16_4(
     dst = (global half4*)((global char*)dst + offsetd);
 
     const float4 x = convert_float4(src0[get_global_id(0)]);
-    dst[get_global_id(0)] = convert_half4_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
+    dst[get_global_id(0)] = convert_half4((x > 20.0f) ? x : log(1.0f + exp(x)));
 }
 
 kernel void kernel_softplus_f32_nc(
@@ -111,6 +111,6 @@ kernel void kernel_softplus_f16_nc(
         global half * hy = (global half *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
         const float x = convert_float(*hx);
-        *hy = convert_half_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
+        *hy = convert_half((x > 20.0f) ? x : log(1.0f + exp(x)));
     }
 }
diff --git a/ggml/src/ggml-opencl/kernels/tanh.cl b/ggml/src/ggml-opencl/kernels/tanh.cl
index 2c4887ad3e0..ff173793a76 100644
--- a/ggml/src/ggml-opencl/kernels/tanh.cl
+++ b/ggml/src/ggml-opencl/kernels/tanh.cl
@@ -33,7 +33,7 @@ kernel void kernel_tanh_f16(
     src0 = (global half*)((global char*)src0 + offset0);
     dst = (global half*)((global char*)dst + offsetd);
 
-    dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
+    dst[get_global_id(0)] = convert_half(tanh(convert_float(src0[get_global_id(0)])));
 }
 
 kernel void kernel_tanh_f16_4(
@@ -45,7 +45,7 @@ kernel void kernel_tanh_f16_4(
     src0 = (global half4*)((global char*)src0 + offset0);
     dst = (global half4*)((global char*)dst + offsetd);
 
-    dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
+    dst[get_global_id(0)] = convert_half4(tanh(convert_float4(src0[get_global_id(0)])));
 }
 
 kernel void kernel_tanh_f32_nc(
@@ -104,6 +104,6 @@ kernel void kernel_tanh_f16_nc(
         global const half * x = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
         global half * y = (global half *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
-        *y = tanh(*x);
+        *y = convert_half(tanh(convert_float(*x)));
     }
 }
From 9abbfe0a971988665cdadf3ac05c2c2b4f986f84 Mon Sep 17 00:00:00 2001
From: Xiangyan Sun
Date: Tue, 7 Apr 2026 19:25:39 +0300
Subject: [PATCH 2/5] Implement and use cuda graph plans.
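
Backends that can record a compute graph once and replay it (such as CUDA
graphs or OpenCL command buffers) can expose this through the existing
graph-plan callbacks. This change adds the scheduler-side plumbing:
ggml_backend_sched keeps one plan per split, creates the plans on the first
evaluation, replays them on subsequent evaluations, and recreates them (or
updates them in place, for backends that implement graph_plan_update) when
the splits change.

A minimal sketch of how the plan API is meant to be driven, assuming
`backend` and `cgraph` are already set up (`n_iter` is illustrative):

    if (ggml_backend_supports_graph_plan(backend)) {
        // record the graph once ...
        ggml_backend_graph_plan_t plan = ggml_backend_graph_plan_create(backend, cgraph);
        for (int i = 0; i < n_iter; i++) {
            // ... then replay it without re-dispatching every node
            ggml_backend_graph_plan_compute(backend, plan);
        }
        ggml_backend_graph_plan_free(backend, plan);
    } else {
        // no plan support: fall back to regular graph execution
        ggml_backend_graph_compute(backend, cgraph);
    }

Backends that also report ggml_backend_supports_graph_plan_update() can
refresh an existing plan in place via ggml_backend_graph_plan_update();
for the rest, the scheduler falls back to freeing and recreating the plan.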
--- ggml/include/ggml-backend.h | 3 + ggml/src/ggml-backend.cpp | 112 +++++++++++++++++++++++++++++++++++- 2 files changed, 114 insertions(+), 1 deletion(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index d0c7e5a1be0..3acc7e2e9d3 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -97,8 +97,11 @@ extern "C" { GGML_API void ggml_backend_synchronize(ggml_backend_t backend); + GGML_API bool ggml_backend_supports_graph_plan(ggml_backend_t backend); + GGML_API bool ggml_backend_supports_graph_plan_update(ggml_backend_t backend); GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API void ggml_backend_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index d9f8aaec52f..58564962500 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -420,6 +420,18 @@ void ggml_backend_synchronize(ggml_backend_t backend) { backend->iface.synchronize(backend); } +bool ggml_backend_supports_graph_plan(ggml_backend_t backend) { + GGML_ASSERT(backend); + + return (bool) backend->iface.graph_plan_create; +} + +bool ggml_backend_supports_graph_plan_update(ggml_backend_t backend) { + GGML_ASSERT(backend); + + return (bool) backend->iface.graph_plan_update; +} + ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { GGML_ASSERT(backend); GGML_ASSERT(backend->iface.graph_plan_create != NULL); @@ -434,6 +446,13 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla backend->iface.graph_plan_free(backend, plan); } +void ggml_backend_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph* cgraph) { + GGML_ASSERT(backend); + GGML_ASSERT(backend->iface.graph_plan_update != NULL); + + backend->iface.graph_plan_update(backend, plan, cgraph); +} + enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(backend); GGML_ASSERT(backend->iface.graph_plan_compute != NULL); @@ -771,6 +790,11 @@ struct ggml_backend_sched_split { struct ggml_cgraph graph; }; +struct ggml_backend_sched_plan { + int backend_id; + ggml_backend_graph_plan_t plan; +}; + struct ggml_backend_sched { bool is_reset; // true if the scheduler has been reset since the last graph split bool is_alloc; @@ -800,6 +824,12 @@ struct ggml_backend_sched { int n_splits; int splits_capacity; + // graph plans + struct ggml_backend_sched_plan * plans; + int n_plans; + int plans_capacity; + bool plan_needs_update; + // pipeline parallelism support int n_copies; int cur_copy; @@ -1010,6 +1040,16 @@ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, stru } } +static void ggml_backend_sched_free_plans(ggml_backend_sched_t sched) { + for (int i = 0; i < sched->n_plans; i++) { + ggml_backend_t backend = sched->backends[sched->plans[i].backend_id]; + if (ggml_backend_supports_graph_plan(backend)) { + ggml_backend_graph_plan_free(backend, sched->plans[i].plan); + } + } + 
sched->n_plans = 0; +} + // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { // reset splits @@ -1484,6 +1524,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra for (int i = 0; i < sched->n_splits; ++i) { sched->splits[i].graph.uid = ggml_graph_next_uid(); } + sched->plan_needs_update = true; } static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { @@ -1538,6 +1579,62 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { return true; } +static void ggml_backend_sched_update_plans(ggml_backend_sched_t sched) { + // create graph plans + if (sched->plan_needs_update) { + bool create_new_plans; + if (sched->n_plans == sched->n_splits) { + create_new_plans = false; + for (int i = 0; i < sched->n_splits; i++) { + if (sched->splits[i].backend_id != sched->plans[i].backend_id) { + create_new_plans = true; + break; + } + } + } else { + create_new_plans = true; + } + if (create_new_plans) { + // free previous and recreate new plans + ggml_backend_sched_free_plans(sched); + if (sched->plans_capacity < sched->n_splits) { + while (sched->plans_capacity < sched->n_splits) { + sched->plans_capacity *= 2; + } + sched->plans = (ggml_backend_sched_plan *) realloc( + sched->plans, sched->plans_capacity * sizeof(struct ggml_backend_sched_plan)); + GGML_ASSERT(sched->plans); + } + sched->n_plans = sched->n_splits; + for (int i = 0; i < sched->n_splits; i++) { + ggml_backend_t backend = sched->backends[sched->splits[i].backend_id]; + sched->plans[i].backend_id = sched->splits[i].backend_id; + if (ggml_backend_supports_graph_plan(backend)) { + sched->plans[i].plan = ggml_backend_graph_plan_create(backend, &sched->splits[i].graph); + } else { + sched->plans[i].plan = nullptr; + } + } + } else { + // update existing plans + for (int i = 0; i < sched->n_splits; i++) { + ggml_backend_t backend = sched->backends[sched->splits[i].backend_id]; + if (ggml_backend_supports_graph_plan(backend)) { + if (ggml_backend_supports_graph_plan_update(backend)) { + ggml_backend_graph_plan_update(backend, sched->plans[i].plan, &sched->splits[i].graph); + } else { + ggml_backend_graph_plan_free(backend, sched->plans[i].plan); + sched->plans[i].plan = ggml_backend_graph_plan_create(backend, &sched->splits[i].graph); + } + } else { + sched->plans[i].plan = nullptr; + } + } + } + sched->plan_needs_update = false; + } +} + static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { GGML_ASSERT(sched); struct ggml_backend_sched_split * splits = sched->splits; @@ -1546,6 +1643,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s std::vector ids; std::vector used_ids; + ggml_backend_sched_update_plans(sched); + for (int split_id = 0; split_id < sched->n_splits; split_id++) { struct ggml_backend_sched_split * split = &splits[split_id]; int split_backend_id = split->backend_id; @@ -1675,7 +1774,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } if (!sched->callback_eval) { - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); + enum ggml_status ec; + if (ggml_backend_supports_graph_plan(split_backend) && sched->plans[split_id].plan) { + ec = ggml_backend_graph_plan_compute(split_backend, sched->plans[split_id].plan); + } else { + ec = ggml_backend_graph_compute_async(split_backend, 
&split->graph); + } if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1773,6 +1877,10 @@ ggml_backend_sched_t ggml_backend_sched_new( sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0])); sched->splits_capacity = initial_splits_capacity; + const int initial_plans_capacity = 16; + sched->plans = (ggml_backend_sched_plan *) calloc(initial_plans_capacity, sizeof(sched->plans[0])); + sched->plans_capacity = initial_plans_capacity; + for (int b = 0; b < n_backends; b++) { sched->backends[b] = backends[b]; sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]); @@ -1806,6 +1914,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { ggml_free(sched->ctx); ggml_hash_set_free(&sched->hash_set); free(sched->splits); + ggml_backend_sched_free_plans(sched); + free(sched->plans); free(sched->hv_tensor_backend_ids); free(sched->hv_tensor_copies); free(sched->node_backend_ids); From 03423817aa61a4a637461f9e87efb44eef67fc8d Mon Sep 17 00:00:00 2001 From: Jan Solanti Date: Sat, 11 Apr 2026 00:56:27 +0300 Subject: [PATCH 3/5] opencl: use command buffers when available --- ggml/src/ggml-opencl/ggml-opencl.cpp | 700 +++++++++++++++++++-------- 1 file changed, 488 insertions(+), 212 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 3e688134af8..7b159dad21a 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -1,5 +1,6 @@ #define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION #define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#define CL_ENABLE_BETA_EXTENSIONS // suppress warnings in CL headers for GCC and Clang #pragma GCC diagnostic ignored "-Woverlength-strings" @@ -14,6 +15,7 @@ #include "ggml.h" #include +#include #include #include @@ -51,7 +53,7 @@ // OpenCL //------------------------------------------------------------------------------ -bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor); +bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor, cl_command_buffer_khr recording = NULL); // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. 
// Precompute mp (m' in the paper) and L such that division @@ -395,6 +397,7 @@ struct ggml_backend_opencl_context { bool fp16_support; bool has_vector_subgroup_broadcast; bool disable_fusion; + bool cmdbuf_support; bool adreno_has_large_buffer; bool adreno_use_large_buffer; @@ -706,17 +709,22 @@ struct ggml_backend_opencl_context { return workgroup_size; } - void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) { + void enqueue_ndrange_kernel(cl_command_buffer_khr recording, cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) { + if (recording) { + CL_CHECK(this->clCommandNDRangeKernelKHR(recording, NULL, NULL, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL, NULL)); + return; + } else { #ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - profiling_info.emplace_back(); - populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor); + profiling_info.emplace_back(); + populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor); #else - GGML_UNUSED(tensor); - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + GGML_UNUSED(tensor); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL)); #endif + } } #ifdef GGML_OPENCL_USE_ADRENO_KERNELS @@ -752,6 +760,17 @@ struct ggml_backend_opencl_context { cl_kernel kernel_gemm_noshuffle_iq4_nl_f32; #endif // GGML_OPENCL_USE_ADRENO_KERNELS + clCreateCommandBufferKHR_fn clCreateCommandBufferKHR = NULL; + clCommandCopyBufferKHR_fn clCommandCopyBufferKHR = NULL; + clCommandCopyBufferRectKHR_fn clCommandCopyBufferRectKHR = NULL; + clCommandFillBufferKHR_fn clCommandFillBufferKHR = NULL; + clCommandNDRangeKernelKHR_fn clCommandNDRangeKernelKHR = NULL; + clCommandBarrierWithWaitListKHR_fn clCommandBarrierWithWaitListKHR = NULL; + clFinalizeCommandBufferKHR_fn clFinalizeCommandBufferKHR = NULL; + clEnqueueCommandBufferKHR_fn clEnqueueCommandBufferKHR = NULL; + clReleaseCommandBufferKHR_fn clReleaseCommandBufferKHR = NULL; + clGetCommandBufferInfoKHR_fn clGetCommandBufferInfoKHR = NULL; + void free() { ref_count--; if (ref_count == 0) { @@ -3425,6 +3444,25 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { backend_ctx->non_uniform_workgroups = true; } + // Check command buffer support. 
+ backend_ctx->cmdbuf_support = strstr(ext_buffer, "cl_khr_command_buffer"); + if (backend_ctx->cmdbuf_support) { + backend_ctx->clCreateCommandBufferKHR = (clCreateCommandBufferKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clCreateCommandBufferKHR"); + backend_ctx->clCommandCopyBufferKHR = (clCommandCopyBufferKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clCommandCopyBufferKHR"); + backend_ctx->clCommandCopyBufferRectKHR = (clCommandCopyBufferRectKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clCommandCopyBufferRectKHR"); + backend_ctx->clCommandFillBufferKHR = (clCommandFillBufferKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clCommandFillBufferKHR"); + backend_ctx->clCommandNDRangeKernelKHR = (clCommandNDRangeKernelKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clCommandNDRangeKernelKHR"); + backend_ctx->clCommandBarrierWithWaitListKHR = (clCommandBarrierWithWaitListKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clCommandBarrierWithWaitListKHR"); + backend_ctx->clFinalizeCommandBufferKHR = (clFinalizeCommandBufferKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clFinalizeCommandBufferKHR"); + backend_ctx->clEnqueueCommandBufferKHR = (clEnqueueCommandBufferKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clEnqueueCommandBufferKHR"); + backend_ctx->clReleaseCommandBufferKHR = (clReleaseCommandBufferKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clReleaseCommandBufferKHR"); + backend_ctx->clGetCommandBufferInfoKHR = (clGetCommandBufferInfoKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clGetCommandBufferInfoKHR"); + } + char * cmdbuf_env = getenv("GGML_OPENCL_USE_CMDBUF"); + bool cmdbuf_enable = (cmdbuf_env && strlen(cmdbuf_env) > 0) ? (bool) atoi(cmdbuf_env) : true; + GGML_LOG_INFO("ggml_opencl: command buffer support: %s%s\n", backend_ctx->cmdbuf_support ? "true" : "false", backend_ctx->cmdbuf_support ? (cmdbuf_enable ? 
" (enabled)" : " (disabled by env)") : ""); + backend_ctx->cmdbuf_support &= cmdbuf_enable; + // Print out configurations #ifdef GGML_OPENCL_SOA_Q GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n"); @@ -4090,13 +4128,13 @@ static bool ggml_opencl_can_fuse(const struct ggml_cgraph * cgraph, int node_idx return true; } -static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor); -static void ggml_opencl_op_norm_fused(ggml_backend_t backend, ggml_tensor * norm_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor); -static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, ggml_tensor * gn_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor); +static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, cl_command_buffer_khr recording, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor); +static void ggml_opencl_op_norm_fused(ggml_backend_t backend, cl_command_buffer_khr recording, ggml_tensor * norm_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor); +static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, cl_command_buffer_khr recording, ggml_tensor * gn_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor); -static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; +static ggml_status ggml_backend_opencl_graph_eval(ggml_backend_t backend, const ggml_cgraph * cgraph, cl_command_buffer_khr recording) { + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -4114,22 +4152,22 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm } if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_NORM, GGML_OP_MUL, GGML_OP_ADD })) { - ggml_opencl_op_norm_fused(backend, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); + ggml_opencl_op_norm_fused(backend, recording, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); i += 2; continue; } if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_GROUP_NORM, GGML_OP_MUL, GGML_OP_ADD })) { - ggml_opencl_op_group_norm_fused(backend, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); + ggml_opencl_op_group_norm_fused(backend, recording, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); i += 2; continue; } if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { - ggml_opencl_op_rms_norm_fused(backend, node, cgraph->nodes[i+1]); + ggml_opencl_op_rms_norm_fused(backend, recording, node, cgraph->nodes[i+1]); i++; continue; } - bool ok = ggml_cl_compute_forward(backend, node); + bool ok = ggml_cl_compute_forward(backend, node, recording); if (!ok) { GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -4139,6 +4177,47 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm return GGML_STATUS_SUCCESS; } +static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + return ggml_backend_opencl_graph_eval(backend, cgraph, NULL); +} + +static ggml_backend_graph_plan_t ggml_backend_opencl_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) +{ + ggml_backend_opencl_context * 
backend_ctx = (ggml_backend_opencl_context *) backend->context; + if (!backend_ctx->cmdbuf_support) { + return NULL; + } + + cl_int error; + cl_command_buffer_khr cbuf = backend_ctx->clCreateCommandBufferKHR(1, &backend_ctx->queue, NULL, &error); + CL_CHECK(error); + + if (ggml_backend_opencl_graph_eval(backend, cgraph, cbuf) != GGML_STATUS_SUCCESS) { + return NULL; + } + + CL_CHECK(backend_ctx->clFinalizeCommandBufferKHR(cbuf)); + return cbuf; +} + +static void ggml_backend_opencl_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan) +{ + ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) backend->context; + if (!backend_ctx->cmdbuf_support) { + return; + } + + cl_command_buffer_khr cmdbuf = (cl_command_buffer_khr)plan; + + backend_ctx->clReleaseCommandBufferKHR(cmdbuf); +} + +static enum ggml_status ggml_backend_opencl_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) backend->context; + CL_CHECK(backend_ctx->clEnqueueCommandBufferKHR(1, &backend_ctx->queue, (cl_command_buffer_khr)plan, 0, NULL, NULL)); + return GGML_STATUS_SUCCESS; +} + static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *)dev->context; ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx; @@ -4437,10 +4516,10 @@ static ggml_backend_i ggml_backend_opencl_i = { /* .get_tensor_2d_async = */ NULL, /* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */ /* .synchronize = */ ggml_backend_opencl_synchronize, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, + /* .graph_plan_create = */ ggml_backend_opencl_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_opencl_graph_plan_free, /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, + /* .graph_plan_compute = */ ggml_backend_opencl_graph_plan_compute, /* .graph_compute = */ ggml_backend_opencl_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, @@ -6689,8 +6768,9 @@ static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct gg // Copy a noncontiguous tensor to contiguous tensor. ne[] remains the same but // nb[] is recalculated such that tensor is contiguous. 
-static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor * src, cl_mem dst, - cl_ulong &nb0, cl_ulong &nb1, cl_ulong &nb2, cl_ulong &nb3) { +static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, cl_command_buffer_khr recording, + const ggml_tensor * src, cl_mem dst, cl_ulong &nb0, + cl_ulong &nb1, cl_ulong &nb2, cl_ulong &nb3) { ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; const int tensor_type_size = ggml_type_size(src->type); @@ -6759,17 +6839,18 @@ static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src); + + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, src); } -static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_nop(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { UNUSED(backend); UNUSED(src0); UNUSED(src1); UNUSED(dst); } -static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_get_rows(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -6837,10 +6918,10 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_set_rows(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -6954,10 +7035,10 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c (size_t)ne03}; size_t local_work_size[] = {(size_t)nth0, (size_t)rows_per_workgroup, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_add(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -7125,17 +7206,17 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 1, global_work_size, local_work_size_ptr, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {(size_t)ne01*nth, 
(size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_add_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_add_id(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -7201,10 +7282,10 @@ static void ggml_cl_add_id(ggml_backend_t backend, const ggml_tensor * src0, con size_t global_work_size[] = { (size_t)ne01*nth, (size_t)ne02, 1 }; size_t local_work_size[] = { (size_t)nth, 1, 1 }; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -7330,17 +7411,17 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_div(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -7451,17 +7532,17 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_sub(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -7572,17 +7653,17 @@ static void 
ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_sqr(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_sqr(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7629,10 +7710,10 @@ static void ggml_cl_sqr(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_sqrt(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_sqrt(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7679,10 +7760,10 @@ static void ggml_cl_sqrt(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mean(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7739,10 +7820,10 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {64 * (size_t)ne01, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_ssm_conv(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_ssm_conv(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -7803,10 +7884,10 @@ static void ggml_cl_ssm_conv(ggml_backend_t backend, const ggml_tensor * src0, c local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_gelu(ggml_backend_t backend, 
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_gelu(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7841,10 +7922,10 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_gelu_erf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_gelu_erf(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7879,10 +7960,10 @@ static void ggml_cl_gelu_erf(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_gelu_quick(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7917,10 +7998,10 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_silu(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7960,10 +8041,10 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_relu(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7996,10 +8077,10 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
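[Note on the two sizing strategies that recur through these element-wise ops: either local_work_size_ptr is left null so the driver picks a work-group size (legal for clEnqueueNDRangeKernel), or an explicit size such as 64 is used and the global size is rounded up to a multiple of it with the (size_t)CEIL_DIV(n, 64)*64 pattern, since core OpenCL requires the global size to divide evenly by an explicit local size. The rounded-up launch relies on each kernel guarding the tail; a hypothetical kernel showing the guard (ggml_sketch_neg is a made-up name, not one of the kernels in this patch):

    kernel void ggml_sketch_neg(global const float * src, global float * dst, int n) {
        const int i = (int)get_global_id(0);
        if (i >= n) {
            return; // work-items in the rounded-up tail do nothing
        }
        dst[i] = -src[i];
    }
]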
} - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_sigmoid(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8039,10 +8120,10 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_tri(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_tri(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8077,10 +8158,10 @@ static void ggml_cl_tri(ggml_backend_t backend, const ggml_tensor * src0, const size_t local_work_size[1] = { 256 }; size_t global_work_size[1] = { ((size_t)n + local_work_size[0] - 1) / local_work_size[0] * local_work_size[0] }; - backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 1, global_work_size, local_work_size, dst); } -static void ggml_cl_fill(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_fill(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(dst); GGML_ASSERT(dst->extra); @@ -8107,10 +8188,10 @@ static void ggml_cl_fill(ggml_backend_t backend, const ggml_tensor * src0, const size_t local_work_size[1] = { 256 }; size_t global_work_size[1] = { ((size_t)n + local_work_size[0] - 1) / local_work_size[0] * local_work_size[0] }; - backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 1, global_work_size, local_work_size, dst); } -static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_clamp(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8150,10 +8231,10 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
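[Note: every operator in this patch now takes a leading cl_command_buffer_khr handle and forwards it to enqueue_ndrange_kernel. A minimal sketch of what that wrapper presumably does, assuming the cl_khr_command_buffer extension is available (its entry point is obtained via clGetExtensionFunctionAddressForPlatform in practice) and that a null handle means "submit immediately"; the real implementation in ggml-opencl.cpp may also handle profiling and sync points:

    void ggml_backend_opencl_context::enqueue_ndrange_kernel(
            cl_command_buffer_khr recording, cl_kernel kernel, cl_uint work_dim,
            const size_t * gws, const size_t * lws, const ggml_tensor * tensor) {
        if (recording != nullptr) {
            // record the dispatch into the command buffer for later replay;
            // the queue argument is null for a single-queue command buffer
            CL_CHECK(clCommandNDRangeKernelKHR(recording, nullptr, nullptr,
                kernel, work_dim, nullptr, gws, lws, 0, nullptr, nullptr, nullptr));
        } else {
            // not recording: submit to the backend queue right away
            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr,
                gws, lws, 0, nullptr, nullptr));
        }
        UNUSED(tensor); // profiling hooks elided in this sketch
    }
]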
} - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_norm(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8202,10 +8283,10 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_rms_norm(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8277,10 +8358,10 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c // This is local memory - the size depends on subgroup size. CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL)); - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor) { +static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, cl_command_buffer_khr recording, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor) { GGML_ASSERT(mul_tensor); GGML_ASSERT(rms_norm_tensor); @@ -8388,10 +8469,10 @@ static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float), &eps)); CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs, NULL)); - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_opencl_op_norm_fused(ggml_backend_t backend, ggml_tensor * norm_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor) { +static void ggml_opencl_op_norm_fused(ggml_backend_t backend, cl_command_buffer_khr recording, ggml_tensor * norm_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor) { GGML_ASSERT(norm_tensor && mul_tensor && add_tensor); const ggml_tensor * src0 = norm_tensor->src[0]; @@ -8474,10 +8555,10 @@ static void ggml_opencl_op_norm_fused(ggml_backend_t backend, ggml_tensor * norm CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &eps)); CL_CHECK(clSetKernelArg(kernel, 33, sizeof(cl_float2) * num_subgroups, NULL)); - backend_ctx->enqueue_ndrange_kernel(kernel, 3, gws, lws, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, gws, lws, dst); } -static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, ggml_tensor * gn_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor) { +static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, cl_command_buffer_khr 
recording, ggml_tensor * gn_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor) { GGML_ASSERT(gn_tensor && mul_tensor && add_tensor); const ggml_tensor * src0 = gn_tensor->src[0]; @@ -8522,10 +8603,10 @@ static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, ggml_tensor CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &group_size)); CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float), &eps)); - backend_ctx->enqueue_ndrange_kernel(kernel, 1, gws, lws, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 1, gws, lws, dst); } -static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_group_norm(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8572,10 +8653,10 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1}; size_t local_work_size[] = {(size_t)sgs, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_l2_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_l2_norm(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8630,10 +8711,10 @@ static void ggml_cl_l2_norm(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_tanh(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8697,7 +8778,7 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } else { // Handle non-contiguous input if (src0->type == GGML_TYPE_F32) { @@ -8725,11 +8806,11 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_neg(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_neg(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { 
GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8779,7 +8860,7 @@ static void ggml_cl_neg(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)CEIL_DIV(n, 64)*64, 1, 1}; size_t local_work_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else { // Handle non-contiguous input if (src0->type == GGML_TYPE_F32) { @@ -8807,11 +8888,11 @@ static void ggml_cl_neg(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_exp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_exp(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8861,7 +8942,7 @@ static void ggml_cl_exp(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)CEIL_DIV(n, 64)*64, 1, 1}; size_t local_work_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else { // Handle non-contiguous input if (src0->type == GGML_TYPE_F32) { @@ -8889,11 +8970,11 @@ static void ggml_cl_exp(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_expm1(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8957,7 +9038,7 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } else { // Handle non-contiguous input if (src0->type == GGML_TYPE_F32) { @@ -8985,11 +9066,11 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_softplus(ggml_backend_t backend, 
cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -9053,7 +9134,7 @@ static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, c local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } else { // Handle non-contiguous input if (src0->type == GGML_TYPE_F32) { @@ -9081,11 +9162,11 @@ static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) { +static void ggml_cl_repeat(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -9147,10 +9228,10 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_cl_pad(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -9242,10 +9323,10 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_cl_upscale(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -9356,10 +9437,10 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_concat(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -9440,10 +9521,10 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con size_t global_work_size[] = {(size_t)ne1*nth, 
(size_t)ne2, (size_t)ne3}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_cl_timestep_embedding(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -9484,10 +9565,10 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor size_t global_work_size[] = {gws0, gws1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, NULL, dst); } -static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, const ggml_tensor * k, ggml_tensor * dst) { +static void ggml_cl_flash_attn(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * q, const ggml_tensor * k, ggml_tensor * dst) { const ggml_tensor * v = dst->src[2]; const ggml_tensor * mask = dst->src[3]; const ggml_tensor * sinks = dst->src[4]; @@ -9613,17 +9694,17 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co const size_t wg_size = 64; size_t local_work_size[] = { wg_size, 1 }; size_t global_work_size[] = { wg_size, (size_t)(n_head * n_batch) }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size, local_work_size, dst); } else { const int block_m = backend_ctx->kernels_flash_attn_bm.at(dk_dv); const size_t wg_size = block_m; size_t local_work_size[] = { wg_size, 1 }; size_t global_work_size[] = { (size_t)((n_q + block_m - 1) / block_m) * wg_size, (size_t)(n_head * n_batch) }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size, local_work_size, dst); } } -static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; @@ -9672,10 +9753,10 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten (size_t) ((N + OPWN - 1) / OPWN) * TPWN, }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size, local_work_size, dst); } -static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_conv_2d(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_TENSOR_BINARY_OP_LOCALS; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; @@ -9748,10 +9829,10 @@ static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = { 
(size_t)NB_K * WG_K, (size_t)NB_NPQ * WG_NPQ, 1 }; size_t local_work_size[] = { (size_t)WG_K, (size_t)WG_NPQ, 1 }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size, local_work_size, dst); } -static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; @@ -9880,7 +9961,7 @@ static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_ten size_t global_work_size[3] = {64, static_cast<size_t>(((M+63)/64)), static_cast<size_t>(((N+31)/32)*ne12)}; size_t local_work_size[3] = {64, 1, 2}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); // deallocate sub buffers and images // <--------------------------------------------> // @@ -9992,7 +10073,7 @@ static void ggml_cl_mul_mat_q4_0_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size[3] = {64, 4, 1}; size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(q_img)); CL_CHECK(clReleaseMemObject(b_sub_buf)); @@ -10073,7 +10154,7 @@ static void ggml_cl_mul_mat_q4_0_f32_adreno(ggml_backend_t backend, const ggml_t local_work_size_t[0]=2; local_work_size_t[1]=8; } - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size_t, local_work_size_t, dst); // gemm kernel = backend_ctx->kernel_gemm_noshuffle_q4_0_f32; @@ -10104,7 +10185,7 @@ static void ggml_cl_mul_mat_q4_0_f32_adreno(ggml_backend_t backend, const ggml_t local_work_size[1] = 64; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(b_sub_buf)); CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); @@ -10120,7 +10201,7 @@ static void ggml_cl_mul_mat_q4_0_f32_adreno(ggml_backend_t backend, const ggml_t #endif } -static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -10197,7 +10278,7 @@ static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size[3] = {64, 4, 1}; size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
CL_CHECK(clReleaseMemObject(q_img)); CL_CHECK(clReleaseMemObject(b_sub_buf)); @@ -10259,7 +10340,7 @@ static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size_t[2] = { 1, 16 }; size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size_t, local_work_size_t, dst); // gemm kernel = backend_ctx->kernel_gemm_noshuffle_q4_1_f32; @@ -10279,7 +10360,7 @@ static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_t size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1}; size_t local_work_size[3] = {1, 128, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(b_sub_buf)); CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); @@ -10370,7 +10451,7 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml size_t local_work_size[3] = {64, 4, 1}; size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(q_img)); CL_CHECK(clReleaseMemObject(b_sub_buf)); @@ -10432,7 +10513,7 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml size_t local_work_size_t[2] = { 1, 16 }; size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size_t, local_work_size_t, dst); // gemm kernel = backend_ctx->kernel_gemm_noshuffle_iq4_nl_f32; @@ -10451,7 +10532,7 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1}; size_t local_work_size[3] = {1, 128, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(b_sub_buf)); CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); @@ -10466,7 +10547,7 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml #endif } -static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -10567,7 +10648,7 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size[] = { wavesize, 4, 1 }; size_t global_work_size[] = { CEIL_DIV(M, wavesize)*wavesize, 4, 1 }; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(q_img)); 
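[Note: the Adreno-specific paths above keep aliasing plain cl_mem buffers as 1D image buffers so the kernels can read them through the texture path. The recurring pattern, using placeholder names (buf, n_texels) rather than identifiers from the patch:

    cl_image_format fmt = { CL_RGBA, CL_FLOAT };      // four floats per texel
    cl_image_desc desc;
    memset(&desc, 0, sizeof(desc));
    desc.image_type  = CL_MEM_OBJECT_IMAGE1D_BUFFER;  // image view backed by a buffer
    desc.image_width = n_texels;                      // width in texels, not bytes
    desc.buffer      = buf;                           // the existing cl_mem to alias
    cl_int status;
    cl_mem img = clCreateImage(context, CL_MEM_READ_ONLY, &fmt, &desc, NULL, &status);
    CL_CHECK(status);
    // set as a kernel argument, launch, then release the view:
    CL_CHECK(clReleaseMemObject(img));
]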
CL_CHECK(clReleaseMemObject(b_img)); @@ -10627,9 +10708,10 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B)); + size_t local_work_size_t[2] = { 1, 16 }; size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size_t, local_work_size_t, dst); // gemm kernel = backend_ctx->kernel_gemm_noshuffle_q8_0_f32; @@ -10648,7 +10730,7 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t size_t global_work_size[] = { (size_t)CEIL_DIV(N, 8), (size_t)CEIL_DIV(M, 4), 1 }; size_t local_work_size[] = { 2, 128, 1 }; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(b_img_trans)); CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); @@ -10663,7 +10745,7 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t #endif } -static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -10748,7 +10830,7 @@ static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size[3] = {64, 4, 1}; size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(q_img)); CL_CHECK(clReleaseMemObject(b_sub_buf)); @@ -10811,7 +10893,7 @@ static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size_t[2] = { 1, 16 }; size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size_t, local_work_size_t, dst); // gemm kernel = backend_ctx->kernel_gemm_noshuffle_q4_k_f32; @@ -10835,7 +10917,7 @@ static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, const ggml_t size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1}; size_t local_work_size[3] = {1, 128, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(b_sub_buf)); CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); CL_CHECK(clReleaseMemObject(b_img)); @@ -10849,7 +10931,7 @@ static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, const ggml_t #endif } -static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, cl_command_buffer_khr recording, 
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -10934,7 +11016,7 @@ static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size[3] = {64, 4, 1}; size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(ql_img)); CL_CHECK(clReleaseMemObject(qh_img)); @@ -10999,7 +11081,7 @@ static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_size_t[2] = { 1, 16 }; size_t global_size_t[2] = { (size_t)width_B, (size_t)padded_height_B }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_size_t, local_size_t, dst); // gemm kernel = backend_ctx->kernel_gemm_noshuffle_q6_K_f32; @@ -11024,7 +11106,7 @@ static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_t size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1}; size_t local_work_size[3] = {2, 128, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(b_sub_buf)); CL_CHECK(clReleaseMemObject(b_img)); @@ -11039,7 +11121,7 @@ static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_t #endif } -static void ggml_cl_mul_mat_q5_K_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_q5_K_f32_adreno(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -11133,7 +11215,7 @@ static void ggml_cl_mul_mat_q5_K_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size[3] = {64, 4, 1}; size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(q_img)); CL_CHECK(clReleaseMemObject(qh_img)); @@ -11194,7 +11276,7 @@ static void ggml_cl_mul_mat_q5_K_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size_t[2] = {1, 16}; size_t global_work_size_t[2] = {(size_t)width_B, (size_t)padded_height_B}; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size_t, local_work_size_t, dst); // gemm kernel = backend_ctx->kernel_gemm_noshuffle_q5_k_f32; @@ -11219,7 +11301,7 @@ static void ggml_cl_mul_mat_q5_K_f32_adreno(ggml_backend_t backend, const ggml_t size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1}; size_t local_work_size[3] = {1, 128, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(b_sub_buf)); 
CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); @@ -11234,7 +11316,7 @@ static void ggml_cl_mul_mat_q5_K_f32_adreno(ggml_backend_t backend, const ggml_t #endif } -static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -11346,6 +11428,169 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32) { ggml_cl_mul_mat_q4_k_f32_adreno(backend, src0, src1, dst); return; + } + + // q6_K x fp32 + if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) { + ggml_cl_mul_mat_q6_K_f32_adreno(backend, recording, src0, src1, dst); + return; + } + + // q5_K x fp32 + if (src0t == GGML_TYPE_Q5_K && src1t == GGML_TYPE_F32) { + ggml_cl_mul_mat_q5_K_f32_adreno(backend, recording, src0, src1, dst); + return; + } + + // q4_0 x fp32 + if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) { + // TODO: remove duplicate definitions of image description + format -- move to top + + // create an image for A + // <--------------------------------------------> // + if (N == 1) { + img_fmt_1d = { CL_R, CL_UNSIGNED_INT32}; + } else { + img_fmt_1d = { CL_R, CL_FLOAT}; + } + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = M * K / 2 / 4; // Divide by 4 for char -> float + img_desc_1d.buffer = extra0_q4_0->q; + A_image1d = clCreateImage( + context, + CL_MEM_READ_ONLY, + &img_fmt_1d, + &img_desc_1d, + NULL, + &status); + CL_CHECK(status); + // <--------------------------------------------> // + + + // create a sub_buffer for B + // <--------------------------------------------> // + region.origin = (extra1->offset); + region.size = K * N * sizeof(float); + B_sub_buffer = clCreateSubBuffer( + extra1->data_device, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + &region, + &status); + CL_CHECK(status); + // <--------------------------------------------> // + + // transpose activation for Skyler's gemm + if (N != 1) { + //how many extra elements beyond multiple of 8 + int extra_elements = N % 8; + + //how much padding to add + padding = 0; + if (extra_elements > 0){ + padding = 8 - extra_elements; + } + + // Specify the starting offset (in bytes) + region.origin = 0; + // Specify the size of the sub-buffer (divide by 2 for FP16) + region.size = K * (N + padding) * sizeof(float)/2; + backend_ctx->prealloc_act_trans.allocate(context, region.size); + + B_d = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + &region, + &status); + CL_CHECK(status); + + cl_image_format image_format_B_d_input = { CL_RGBA, CL_FLOAT }; + cl_image_desc image_desc_B_d_input = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast<size_t>(K * N / 4), + 0, 0, 0, 0, 0, 0, 0, { B_sub_buffer } + }; + B_d_input_image = clCreateImage( + context, + 0, + &image_format_B_d_input, + &image_desc_B_d_input, + NULL, + &status); + CL_CHECK(status); + + cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16) + cl_image_desc image_desc_B_d_output = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast<size_t>(K * (N + padding)/4), + 0, 0, 0, 0, 0, 0, 0, { B_d } + }; + B_image1d = clCreateImage( + context, + 0, + &image_format_B_d_output, + &image_desc_B_d_output, + NULL, + &status); + CL_CHECK(status);
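+ // Worked example of the padding arithmetic above, with hypothetical values
+ // (N = 13 is purely illustrative, nothing in the patch fixes it):
+ //   extra_elements = 13 % 8 = 5
+ //   padding        = 8 - 5  = 3   -> the transposed copy spans K x 16 columns
+ //   region.size    = K * (13 + 3) * sizeof(float) / 2  // halved: output is FP16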
+ + int height_B = N/4; + if (height_B == 0) { + height_B = 1; + } + int width_B = K/4; + int padded_height_B = (N + padding)/4; + + kernel = backend_ctx->kernel_transpose_32_16; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_d_input_image)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B)); + + size_t local_size_t[2] = { 1, 16 }; + //WGS tuning + if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) { + local_size_t[0]=4; + local_size_t[1]=8; + } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) { + local_size_t[0]=2; + local_size_t[1]=8; + } else if(ne0 == 4096 && ne1 == 128 && ne10 == 11008) { + local_size_t[0]=1; + local_size_t[1]=8; + } else if(ne0 == 32000 && ne1 == 128 && ne10 == 4096) { + local_size_t[0]=2; + local_size_t[1]=8; + } + + size_t global_size_t[2] = { + static_cast<size_t>(width_B), + static_cast<size_t>(padded_height_B) + }; + + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_size_t, local_size_t, dst); + } else { + // no need to transpose B in other cases + // create an image for B from sub_buffer + // <--------------------------------------------> // + img_fmt_1d = {CL_RGBA, CL_FLOAT}; + + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_width = K * N / 4; + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.buffer = B_sub_buffer; + B_image1d = clCreateImage( + context, + CL_MEM_READ_ONLY, + &img_fmt_1d, + &img_desc_1d, + NULL, + &status); + CL_CHECK(status); + // <--------------------------------------------> // } // q6_K x fp32 @@ -11359,6 +11604,28 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst); return; } + // <--------------------------------------------> // + + // enqueue kernel with profiling + // <--------------------------------------------> // + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); + // <--------------------------------------------> // + + // deallocate sub buffers and images + // <--------------------------------------------> // + CL_CHECK(clReleaseMemObject(A_image1d)); + CL_CHECK(clReleaseMemObject(B_sub_buffer)); + CL_CHECK(clReleaseMemObject(B_image1d)); + + if (N != 1) { + CL_CHECK(clReleaseMemObject(B_d)); + CL_CHECK(clReleaseMemObject(B_d_input_image)); + CL_CHECK(clReleaseMemObject(C_d)); + } + // <--------------------------------------------> // + + return; + } } // if (ne01 && ne1) #endif // GGML_OPENCL_USE_ADRENO_KERNELS @@ -11394,7 +11661,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co if (!ggml_is_contiguous(src0)) { backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0)); - ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer, + ggml_cl_copy_to_contiguous(backend, recording, src0, backend_ctx->prealloc_src0.buffer, nb00_cont, nb01_cont, nb02_cont, nb03_cont); mem_src0 = backend_ctx->prealloc_src0.buffer; offset0_cont = 0; @@ -11402,7 +11669,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co if (!ggml_is_contiguous(src1)) { backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1)); - ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer, + ggml_cl_copy_to_contiguous(backend, recording,
src1, backend_ctx->prealloc_src1.buffer, nb10_cont, nb11_cont, nb12_cont, nb13_cont); mem_src1 = backend_ctx->prealloc_src1.buffer; offset1_cont = 0; @@ -11432,7 +11699,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } case GGML_TYPE_F16: { @@ -11461,7 +11728,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co if (!ggml_is_contiguous(src0)) { backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0)); - ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer, + ggml_cl_copy_to_contiguous(backend, recording, src0, backend_ctx->prealloc_src0.buffer, nb00_cont, nb01_cont, nb02_cont, nb03_cont); mem_src0 = backend_ctx->prealloc_src0.buffer; offset0_cont = 0; @@ -11469,7 +11736,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co if (!ggml_is_contiguous(src1)) { backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1)); - ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer, + ggml_cl_copy_to_contiguous(backend, recording, src1, backend_ctx->prealloc_src1.buffer, nb10_cont, nb11_cont, nb12_cont, nb13_cont); mem_src1 = backend_ctx->prealloc_src1.buffer; offset1_cont = 0; @@ -11499,7 +11766,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } case GGML_TYPE_Q4_0: { @@ -11541,7 +11808,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } case GGML_TYPE_Q4_1: { @@ -11584,7 +11851,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } case GGML_TYPE_Q8_0: { @@ -11626,7 +11893,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, 
local_work_size, dst); return; } case GGML_TYPE_IQ4_NL: { @@ -11668,7 +11935,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } case GGML_TYPE_Q4_K: { @@ -11712,7 +11979,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } case GGML_TYPE_Q5_K: { @@ -11757,7 +12024,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } case GGML_TYPE_Q6_K: { @@ -11801,7 +12068,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } default: @@ -11817,7 +12084,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co src1->ne[2] == 1 && src1->ne[3] == 1 && ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) { - ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst); + ggml_cl_mul_mat_f16_f32_tiled(backend, recording, src0, src1, dst); return; } @@ -11883,7 +12150,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co global_work_size[2] = (size_t)ne12*ne13; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } #else // GGML_OPENCL_SOA_Q @@ -12590,35 +12857,35 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else if (src0t == GGML_TYPE_Q4_K) { size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + 
backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else if (src0t == GGML_TYPE_Q3_K) { GGML_ASSERT(false && "not implemented"); } else if (src0t == GGML_TYPE_Q5_K) { size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else if (src0t == GGML_TYPE_Q6_K) { size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else { int64_t ny = (ne11 + nrows - 1)/nrows; size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void moe_router_reoerder(ggml_backend_t backend, const ggml_tensor * src, int ne20) { +static void moe_router_reoerder(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src, int ne20) { cl_int err; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; @@ -12683,7 +12950,7 @@ static void moe_router_reoerder(ggml_backend_t backend, const ggml_tensor * src, size_t histogram_global_size[] = {(size_t)(((ne21 + 63) / 64) * 64), static_cast(ne20), 1}; size_t histogram_local_size[] = {64, static_cast(ne20), 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, histogram_global_size, histogram_local_size, src); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, histogram_global_size, histogram_local_size, src); // Scan kernel = backend_ctx->kernel_moe_scan; @@ -12696,7 +12963,7 @@ static void moe_router_reoerder(ggml_backend_t backend, const ggml_tensor * src, size_t scan_global_size[] = {1}; size_t scan_local_size[] = {1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 1, scan_global_size, scan_local_size, src); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 1, scan_global_size, scan_local_size, src); // Fill kernel = backend_ctx->kernel_moe_fill; @@ -12706,7 +12973,7 @@ static void moe_router_reoerder(ggml_backend_t backend, const ggml_tensor * src, size_t fill_global_size[] = {(size_t)(((max_post_router_tile + 63) / 64) * 64), n_tile_size, 1}; size_t fill_local_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, fill_global_size, fill_local_size, src); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, fill_global_size, fill_local_size, src); // Scatter kernel = backend_ctx->kernel_moe_scatter; @@ -12719,7 +12986,7 @@ static void moe_router_reoerder(ggml_backend_t backend, const ggml_tensor * src, CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne20)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne02)); - backend_ctx->enqueue_ndrange_kernel(kernel, 3, histogram_global_size, histogram_local_size, src); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, histogram_global_size, histogram_local_size, src); 
CL_CHECK(clReleaseMemObject(original_router_buf)); CL_CHECK(clReleaseMemObject(hist_buf)); @@ -12730,7 +12997,7 @@ static void moe_router_reoerder(ggml_backend_t backend, const ggml_tensor * src, CL_CHECK(clReleaseMemObject(emap_buf)); } -static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_id(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -12985,7 +13252,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11)); // launch kernel - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_size, local_size, dst); // deallocate sub buffers and images CL_CHECK(clReleaseMemObject(src1_sub_buffer)); @@ -13057,7 +13324,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, size_t reorder_b_global_size[3] = {static_cast<size_t>(((ne00 / 4) + 255) / 256 * 256), static_cast<size_t>(max_post_router_tile * n_tile_size), 1}; // Dispatch reorder kernel - backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); // MoE kernel prepare // Create sub buffer for dst @@ -13095,7 +13362,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, local_size[2] = 1; // Dispatch kernel - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_size, local_size, dst); clReleaseMemObject(sub_buf_src1_pre); clReleaseMemObject(buf_src1_reordered); @@ -13105,6 +13372,14 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, clReleaseMemObject(sub_buf_dst); clReleaseMemObject(buf_dst_image); } + + // launch kernel + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_size, local_size, dst); + + // deallocate sub buffers and images + CL_CHECK(clReleaseMemObject(src1_sub_buffer)); + CL_CHECK(clReleaseMemObject(buf_src1_image)); + CL_CHECK(clReleaseMemObject(buf_src2)); return; } // fallback to generic MoE mxfp4 kernel #endif // GGML_OPENCL_USE_ADRENO_KERNELS @@ -13210,10 +13485,10 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123}; size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_scale(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -13261,10 +13536,10 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons local_work_size_ptr = nullptr; // Let driver choose the
     }

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst);
 }

-static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_cpy(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(src1);
@@ -13356,15 +13631,15 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, src1);
 }

-static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cl_cpy(backend, src0, dst, nullptr);
+static void ggml_cl_dup(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cl_cpy(backend, recording, src0, dst, nullptr);
     UNUSED(src1);
 }

-static void ggml_cl_set(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_set(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(src1);
@@ -13401,7 +13676,7 @@ static void ggml_cl_set(ggml_backend_t backend, const ggml_tensor * src0, const
     // for inplace case, dst is a view of src0 and is updated on top of it
     // so for non-inplace case, copy src0 to dst first
     if (!inplace) {
-        ggml_cl_cpy(backend, src0, dst, nullptr);
+        ggml_cl_cpy(backend, recording, src0, dst, nullptr);
     }

     // then copy src1 to dst with specified offset
@@ -13444,10 +13719,10 @@ static void ggml_cl_set(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)ne11*nth, (size_t)ne12, (size_t)ne13};
     size_t local_work_size[] = {(size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_diag_mask_inf(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -13485,7 +13760,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
         size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
         size_t local_work_size[] = {64, 1, 1};

-        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
     } else {
         kernel = backend_ctx->kernel_diag_mask_inf;

@@ -13505,11 +13780,11 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
             local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
         }

-        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+        backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst);
     }
 }

-static void ggml_cl_diag(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_diag(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -13549,10 +13824,10 @@ static void ggml_cl_diag(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
     size_t local_work_size[] = {(size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_soft_max(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -13677,10 +13952,10 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_rope(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(src1);
@@ -13860,10 +14135,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_solve_tri(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_solve_tri(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(src1);
@@ -13926,10 +14201,10 @@ static void ggml_cl_solve_tri(ggml_backend_t backend, const ggml_tensor * src0,
     size_t global_work_size[3]= { (size_t)k, (size_t)dst->ne[2], (size_t)dst->ne[3]};
     size_t local_work_size[] = {16, 4, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_im2col(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src1);
     GGML_ASSERT(src1->extra);
@@ -14009,10 +14284,10 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
     size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
     size_t local_work_size[] = {256, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_argsort(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -14055,7 +14330,8 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
     size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
     size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);

 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     const int ne21 = dst->ne[1];
@@ -14065,7 +14341,7 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 }

-static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_sum_rows(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -14122,10 +14398,10 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
     size_t global_work_size[] = {64 * (size_t)ne01, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)64, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_cumsum(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_cumsum(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -14189,7 +14465,7 @@ static void ggml_cl_cumsum(ggml_backend_t backend, const ggml_tensor * src0, con
     size_t global_work_size[] = { (size_t)(nth*net0*ne01), (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = { (size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);

     if(ne00 > nth) {
         // if a single workgroup cannot handle an entire row, each workgroup
@@ -14216,7 +14492,7 @@ static void ggml_cl_cumsum(ggml_backend_t backend, const ggml_tensor * src0, con
     size_t global_work_size_1[] = { (size_t)net1*nth, (size_t)net2, (size_t)net3};
     size_t local_work_size_1[] = { (size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_1, local_work_size_1, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size_1, local_work_size_1, dst);

     kernel = backend_ctx->kernel_cumsum_add;
     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &tmp_buffer.buffer));
@@ -14233,11 +14509,11 @@ static void ggml_cl_cumsum(ggml_backend_t backend, const ggml_tensor * src0, con
     size_t global_work_size_2[] = { (size_t)(nth*net0*ne01), (size_t)ne02, (size_t)ne03};
     size_t local_work_size_2[] = { (size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_2, local_work_size_2, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size_2, local_work_size_2, dst);
     }
 }

-static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_glu(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -14344,16 +14620,16 @@ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {nrows*nth, 1, 1};
     size_t local_work_size[] = {nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

 //------------------------------------------------------------------------------
 // Op offloading
 //------------------------------------------------------------------------------

-typedef void (*ggml_cl_func_t)(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+typedef void (*ggml_cl_func_t)(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);

-bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor) {
+bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor, cl_command_buffer_khr recording) {
     ggml_cl_func_t func = nullptr;

     ggml_tensor * src0 = tensor->src[0];
@@ -14572,13 +14848,13 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             if (!any_on_device) {
                 return false;
             }
-            ggml_cl_pad(backend, tensor->src[0], tensor);
+            ggml_cl_pad(backend, recording, tensor->src[0], tensor);
             return true;
         case GGML_OP_UPSCALE:
             if (!any_on_device) {
                 return false;
             }
-            ggml_cl_upscale(backend, tensor->src[0], tensor);
+            ggml_cl_upscale(backend, recording, tensor->src[0], tensor);
             return true;
         case GGML_OP_CONV_2D:
             if (!any_on_device) {
@@ -14602,7 +14878,7 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             if (!any_on_device) {
                 return false;
             }
-            ggml_cl_timestep_embedding(backend, tensor->src[0], tensor);
+            ggml_cl_timestep_embedding(backend, recording, tensor->src[0], tensor);
             return true;
         case GGML_OP_MUL_MAT:
             if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
@@ -14689,12 +14965,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             if (!any_on_device) {
                 return false;
             }
-            ggml_cl_flash_attn(backend, tensor->src[0], tensor->src[1], tensor);
+            ggml_cl_flash_attn(backend, recording, tensor->src[0], tensor->src[1], tensor);
             return true;
         default:
             return false;
     }

-    func(backend, tensor->src[0], tensor->src[1], tensor);
+    func(backend, recording, tensor->src[0], tensor->src[1], tensor);

     return true;
 }
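Note: every op handler in this patch now threads the same `cl_command_buffer_khr`
handle down to a single dispatch point. A minimal sketch of that pattern, assuming
the cl_khr_command_buffer entry points have already been resolved at runtime (the
helper name here is illustrative, not the backend's exact member function):

    // Record into the command buffer while one is being built; otherwise submit
    // to the queue immediately. Recorded commands only execute when the buffer
    // is later replayed with clEnqueueCommandBufferKHR().
    static void dispatch_ndrange(cl_command_buffer_khr recording, cl_command_queue queue,
                                 cl_kernel kernel, cl_uint work_dim,
                                 const size_t *gws, const size_t *lws) {
        if (recording) {
            CL_CHECK(clCommandNDRangeKernelKHR(recording, NULL, NULL, kernel,
                work_dim, NULL, gws, lws, 0, NULL, NULL, NULL));
        } else {
            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL,
                gws, lws, 0, NULL, NULL));
        }
    }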
From 6049b971f8a845090fa67791ea37825b2b1056ee Mon Sep 17 00:00:00 2001
From: Jan Solanti
Date: Fri, 17 Apr 2026 20:42:24 +0300
Subject: [PATCH 4/5] opencl: async-ify tensor I/O a bit

---
 ggml/src/ggml-opencl/ggml-opencl.cpp | 327 +++++++++++++++++++--------
 1 file changed, 228 insertions(+), 99 deletions(-)
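The gist of this patch: tensor reads and writes stop blocking on every staging
buffer. Each temporary cl_mem is instead parked on a per-context release list
together with the last event that uses it, and the list is drained at
synchronization points. A minimal usage sketch of the pattern introduced below
(`staging`, `nbytes` and `host_src` are illustrative names):

    cl_mem staging;
    CL_CHECK((staging = clCreateBuffer(context, CL_MEM_READ_WRITE, nbytes, NULL, &err), err));
    CL_CHECK(clEnqueueWriteBuffer(queue, staging, CL_FALSE, 0, nbytes, host_src, 0, NULL, NULL));

    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, gws, lws, 0, NULL, &evt));

    // Instead of clWaitForEvents + clReleaseMemObject here, park the buffer;
    // ggml_backend_opencl_do_deferred_memobj_release() waits on the event and
    // releases both the event and the buffer later.
    RELEASE_LATER(staging, evt, backend_ctx->release_list);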
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 7b159dad21a..463694ef333 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -378,6 +378,39 @@ struct ggml_backend_opencl_device_context {
     cl_context context = nullptr;
 };

+struct DeferredMemobjReleaseInfo {
+    cl_mem mem;
+    cl_event last_event;
+    unsigned long line;
+    const char *name;
+};
+
+#define DEFERRED_CL_RELEASE_CHECK(err) \
+do { \
+    cl_int err_ = (err); \
+    if (err_ != CL_SUCCESS) { \
+        GGML_LOG_ERROR("ggml_opencl: %s error %d during deferred release of %s"\
+            " from %s:%lu\n", #err, err_, rel.name, __FILE__, rel.line); \
+        GGML_ASSERT(0); \
+    } \
+} while(0)
+
+static void ggml_backend_opencl_do_deferred_memobj_release(std::vector<DeferredMemobjReleaseInfo> &release_list) {
+    for (DeferredMemobjReleaseInfo rel: release_list) {
+        DEFERRED_CL_RELEASE_CHECK(clWaitForEvents(1, &rel.last_event));
+        DEFERRED_CL_RELEASE_CHECK(clReleaseEvent(rel.last_event));
+        DEFERRED_CL_RELEASE_CHECK(clReleaseMemObject(rel.mem));
+    }
+    release_list.clear();
+}
+
+#undef DEFERRED_CL_RELEASE_CHECK
+
+#define RELEASE_LATER(mem, evt, container) \
+do { \
+    (container).push_back({mem, evt, __LINE__, #mem}); \
+} while(0)
+
 // backend context
 struct ggml_backend_opencl_context {
     int ref_count;
@@ -410,6 +443,7 @@ struct ggml_backend_opencl_context {

     cl_context context;
     cl_command_queue queue;
+    std::vector<DeferredMemobjReleaseInfo> release_list;

     // prealloc buffers for transposing weights and activations
     ggml_cl_buffer prealloc_quant_trans;
@@ -709,22 +743,25 @@ struct ggml_backend_opencl_context {
         return workgroup_size;
     }

-    void enqueue_ndrange_kernel(cl_command_buffer_khr recording, cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
+    void enqueue_ndrange_kernel(cl_command_buffer_khr recording, cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor, cl_event *out_evt = NULL) {
         if (recording) {
             CL_CHECK(this->clCommandNDRangeKernelKHR(recording, NULL, NULL, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL, NULL));
             return;
-        } else {
+        }
 #ifdef GGML_OPENCL_PROFILING
-            cl_event evt;
-            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-            profiling_info.emplace_back();
-            populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        if (out_evt) {
+            CL_CHECK(clRetainEvent(evt));
+            *out_evt = evt;
+        }
+        profiling_info.emplace_back();
+        populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
 #else
-            GGML_UNUSED(tensor);
-            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        GGML_UNUSED(tensor);
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, out_evt));
 #endif
-        }
+    }

 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
@@ -3998,36 +4035,49 @@ static void ggml_backend_opencl_free(ggml_backend_t backend) {
     ggml_cl2_free(backend);
 }

-static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_UNUSED(backend);
-    GGML_UNUSED(tensor);
-    GGML_UNUSED(data);
-    GGML_UNUSED(offset);
-    GGML_UNUSED(size);
+namespace {
+static void free_tmp_tensor_buf(cl_event evt, cl_int status, void *userdata) {
+    delete[] (char*)userdata;
 }
-
-static void ggml_backend_opencl_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_UNUSED(backend);
-    GGML_UNUSED(tensor);
-    GGML_UNUSED(data);
-    GGML_UNUSED(offset);
-    GGML_UNUSED(size);
 }
+static ggml_guid_t ggml_backend_opencl_guid();
+static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t src_backend, ggml_backend_t dst_backend, const ggml_tensor * src, ggml_tensor * dst) {
+    if(src_backend == dst_backend) {
+        ggml_backend_opencl_context *dst_ctx = (ggml_backend_opencl_context *)dst_backend->context;
+        ggml_tensor_extra_cl * src_extra = (ggml_tensor_extra_cl *) src->extra;
+        ggml_tensor_extra_cl * dst_extra = (ggml_tensor_extra_cl *) dst->extra;
+        CL_CHECK(clEnqueueCopyBuffer(dst_ctx->queue, src_extra->data_device, dst_extra->data_device, 0, 0, ggml_nbytes(src), 0, NULL, NULL));
+        return true;
+    }

-static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
-    GGML_UNUSED(backend);
-    GGML_UNUSED(src);
-    GGML_UNUSED(dst);
+    if (src_backend->guid == ggml_backend_opencl_guid()) {
+        ggml_backend_opencl_context *src_ctx = (ggml_backend_opencl_context *)src_backend->context;
+        ggml_tensor_extra_cl * src_extra = (ggml_tensor_extra_cl *) src->extra;
+        void *tmp = new char[ggml_nbytes(src)];
+        CL_CHECK(clEnqueueReadBuffer(src_ctx->queue, src_extra->data_device, CL_TRUE, 0, ggml_nbytes(src), tmp, 0, NULL, NULL));
+        ggml_backend_tensor_set(dst, tmp, 0, ggml_nbytes(src));
+        delete[] (char*)tmp;
+        return true;
+    }
+
+    if (dst_backend->guid == ggml_backend_opencl_guid()) {
+        ggml_backend_opencl_context *dst_ctx = (ggml_backend_opencl_context *)dst_backend->context;
+        ggml_tensor_extra_cl * dst_extra = (ggml_tensor_extra_cl *) dst->extra;
+        void *tmp = new char[ggml_nbytes(src)];
+        cl_event evt;
+        ggml_backend_tensor_get(src, tmp, 0, ggml_nbytes(src));
+        CL_CHECK(clEnqueueWriteBuffer(dst_ctx->queue, dst_extra->data_device, CL_FALSE, 0, ggml_nbytes(src), tmp, 0, NULL, &evt));
+        CL_CHECK(clSetEventCallback(evt, CL_COMPLETE, free_tmp_tensor_buf, tmp));
+        CL_CHECK(clReleaseEvent(evt));
+        return true;
+    }
     return false;
 }

 static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
     auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
-
-    cl_event evt;
-    CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, 0, nullptr, &evt));
-    CL_CHECK(clWaitForEvents(1, &evt));
-    CL_CHECK(clReleaseEvent(evt));
+    ggml_backend_opencl_do_deferred_memobj_release(backend_ctx->release_list);
+    CL_CHECK(clFinish(backend_ctx->queue));
 }

 // Synchronizes the 'backend_ctx's device with others so that commands
@@ -4507,14 +4557,17 @@ static ggml_guid_t ggml_backend_opencl_guid() {
     return &guid;
 }

+static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+static void ggml_backend_opencl_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size);
+
 static ggml_backend_i ggml_backend_opencl_i = {
     /* .get_name             = */ ggml_backend_opencl_name,
     /* .free                 = */ ggml_backend_opencl_free,
-    /* .set_tensor_async     = */ NULL, /* ggml_backend_opencl_set_tensor_async */
-    /* .get_tensor_async     = */ NULL, /* ggml_backend_opencl_get_tensor_async */
-    /* .set_tensor_2d_async  = */ NULL,
+    /* .set_tensor_async     = */ ggml_backend_opencl_set_tensor_async,
+    /* .get_tensor_async     = */ ggml_backend_opencl_get_tensor_async,
+    /* .set_tensor_2d_async  = */ NULL,
     /* .get_tensor_2d_async  = */ NULL,
-    /* .cpy_tensor_async     = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
+    /* .cpy_tensor_async     = */ ggml_backend_opencl_cpy_tensor_async,
     /* .synchronize          = */ ggml_backend_opencl_synchronize,
     /* .graph_plan_create    = */ ggml_backend_opencl_graph_plan_create,
     /* .graph_plan_free      = */ ggml_backend_opencl_graph_plan_free,
@@ -4921,7 +4974,7 @@ inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backen
     return ((elem_num < 128 * 1024 * 1024) && adreno_kernel); // max element num: 2**27
 }

-static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+static void ggml_backend_opencl_buffer_set_tensor_impl(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size, std::vector<DeferredMemobjReleaseInfo> &release_list, cl_event *last_evt) {
     ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);

     cl_context context = backend_ctx->context;
@@ -4952,7 +5005,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
             ggml_nbytes(tensor), NULL, &err);
         CL_CHECK(err);
         CL_CHECK(clEnqueueWriteBuffer(
-            queue, data_device, CL_TRUE, 0,
+            queue, data_device, CL_FALSE, 0,
             ggml_nbytes(tensor), data, 0, NULL, NULL));

         // We consider the specified offset arg as always, although For weights
@@ -5011,8 +5064,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
-        CL_CHECK(clReleaseMemObject(data_device));
+        RELEASE_LATER(data_device, evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(evt));
+            *last_evt = evt;
+        }

         tensor->extra = extra;

@@ -5054,7 +5110,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
             ggml_nbytes(tensor), NULL, &err);
         CL_CHECK(err);
         CL_CHECK(clEnqueueWriteBuffer(
-            queue, data_device, CL_TRUE, 0,
+            queue, data_device, CL_FALSE, 0,
             ggml_nbytes(tensor), data, 0, NULL, NULL));

         cl_buffer_region region;
@@ -5106,8 +5162,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
-        CL_CHECK(clReleaseMemObject(data_device));
+        RELEASE_LATER(data_device, evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(evt));
+            *last_evt = evt;
+        }

         tensor->extra = extra;

@@ -5146,7 +5205,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
             ggml_nbytes(tensor), NULL, &err);
         CL_CHECK(err);
         CL_CHECK(clEnqueueWriteBuffer(
-            queue, data_device, CL_TRUE, 0,
+            queue, data_device, CL_FALSE, 0,
             ggml_nbytes(tensor), data, 0, NULL, NULL));

         // The original tensor memory is divided into scales and quants, i.e.,
@@ -5219,8 +5278,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
-        CL_CHECK(clReleaseMemObject(data_device));
+        RELEASE_LATER(data_device, evt, release_list);

         // Create image for Q
         cl_image_format img_format_q = {CL_RG, CL_UNSIGNED_INT32};
@@ -5252,7 +5310,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
             ggml_nbytes(tensor), NULL, &err);
         CL_CHECK(err);
         CL_CHECK(clEnqueueWriteBuffer(
-            queue, data_device, CL_TRUE, 0,
+            queue, data_device, CL_FALSE, 0,
             ggml_nbytes(tensor), data, 0, NULL, NULL));

         // The original tensor memory is divided into scales and quants, i.e.,
@@ -5287,8 +5345,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
-        CL_CHECK(clReleaseMemObject(data_device));
+        RELEASE_LATER(data_device, evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(evt));
+            *last_evt = evt;
+        }

         tensor->extra = extra;

@@ -5411,7 +5472,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
             ggml_nbytes(tensor), NULL, &err);
         CL_CHECK(err);
         CL_CHECK(clEnqueueWriteBuffer(
-            queue, data_device, CL_TRUE, 0,
+            queue, data_device, CL_FALSE, 0,
             ggml_nbytes(tensor), data, 0, NULL, NULL));

         cl_buffer_region region;
@@ -5476,8 +5537,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
-        CL_CHECK(clReleaseMemObject(data_device));
+        RELEASE_LATER(data_device, evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(evt));
+            *last_evt = evt;
+        }

         tensor->extra = extra;
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
@@ -5632,7 +5696,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         cl_int err;
         cl_mem data_device;
         CL_CHECK((data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, ggml_nbytes(tensor), NULL, &err), err));
-        CL_CHECK(clEnqueueWriteBuffer(queue, data_device, CL_TRUE, 0, ggml_nbytes(tensor), data, 0, NULL, NULL));
+        CL_CHECK(clEnqueueWriteBuffer(queue, data_device, CL_FALSE, 0, ggml_nbytes(tensor), data, 0, NULL, NULL));

         cl_buffer_region region;
@@ -5686,8 +5750,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
-        CL_CHECK(clReleaseMemObject(data_device));
+        RELEASE_LATER(data_device, evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(evt));
+            *last_evt = evt;
+        }

         extra->size_ql = size_ql;
         extra->size_qh = size_qh;
@@ -5726,13 +5793,32 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     GGML_ASSERT(extra);

     CL_CHECK(clEnqueueWriteBuffer(
-        queue, extra->data_device, CL_TRUE, extra->offset + offset,
-        size, data, 0, NULL, NULL));
+        queue, extra->data_device, CL_FALSE, extra->offset + offset,
+        size, data, 0, NULL, last_evt));

     GGML_UNUSED(buffer);
 }

-static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
+    // Make sure all previously submitted commands in other devices are finished.
+    sync_with_other_backends(backend_ctx);
+    cl_event evt;
+    ggml_backend_opencl_buffer_set_tensor_impl(buffer, tensor, data, offset, size, backend_ctx->release_list, &evt);
+    CL_CHECK(clWaitForEvents(1, &evt));
+    CL_CHECK(clReleaseEvent(evt));
+    ggml_backend_opencl_do_deferred_memobj_release(backend_ctx->release_list);
+}
+
+static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    sync_with_other_backends(backend_ctx);
+    ggml_backend_opencl_buffer_set_tensor_impl(buf, tensor, data, offset, size, backend_ctx->release_list, nullptr);
+}
+
+static void ggml_backend_opencl_buffer_get_tensor_impl(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size, std::vector<DeferredMemobjReleaseInfo> &release_list, cl_event *last_evt) {
     GGML_ASSERT(tensor->extra);

     ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
@@ -5740,9 +5826,6 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
     cl_context context = backend_ctx->context;
     cl_command_queue queue = backend_ctx->queue;

-    // Make sure all previously submitted commands in other devices are finished.
-    sync_with_other_backends(backend_ctx);
-
 #ifdef GGML_OPENCL_SOA_Q
     // In end-to-end runs, get_tensor is usually used to get back the logits,
     // where we can simply do clEnqueueReadBuffer since they are f32.
@@ -5808,14 +5891,19 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};

-        cl_event evt;
+        cl_event kern_evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
-            global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
+            global_work_size, local_work_size, 0, NULL, &kern_evt));
+        cl_event read_evt;
         CL_CHECK(clEnqueueReadBuffer(
-            queue, data_device, CL_TRUE, offset,
-            size, data, 0, NULL, NULL));
-        CL_CHECK(clReleaseMemObject(data_device));
+            queue, data_device, CL_FALSE, offset,
+            size, data, 1, &kern_evt, &read_evt));
+        CL_CHECK(clReleaseEvent(kern_evt));
+        RELEASE_LATER(data_device, read_evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(read_evt));
+            *last_evt = read_evt;
+        }
         return;
     }
     if (tensor->type == GGML_TYPE_Q4_1) {
@@ -5882,14 +5970,19 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};

-        cl_event evt;
+        cl_event kern_evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
-            global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
+            global_work_size, local_work_size, 0, NULL, &kern_evt));
+        cl_event read_evt;
         CL_CHECK(clEnqueueReadBuffer(
-            queue, data_device, CL_TRUE, offset,
-            size, data, 0, NULL, NULL));
-        CL_CHECK(clReleaseMemObject(data_device));
+            queue, data_device, CL_FALSE, offset,
+            size, data, 1, &kern_evt, &read_evt));
+        CL_CHECK(clReleaseEvent(kern_evt));
+        RELEASE_LATER(data_device, read_evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(read_evt));
+            *last_evt = read_evt;
+        }
         return;
     }
     if (tensor->type == GGML_TYPE_MXFP4) {
@@ -5936,14 +6029,19 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};

-        cl_event evt;
+        cl_event kern_evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
-            global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
+            global_work_size, local_work_size, 0, NULL, &kern_evt));
+        cl_event read_evt;
         CL_CHECK(clEnqueueReadBuffer(
-            queue, data_device, CL_TRUE, offset,
-            size, data, 0, NULL, NULL));
-        CL_CHECK(clReleaseMemObject(data_device));
+            queue, data_device, CL_FALSE, offset,
+            size, data, 1, &kern_evt, &read_evt));
+        CL_CHECK(clReleaseEvent(kern_evt));
+        RELEASE_LATER(data_device, read_evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(read_evt));
+            *last_evt = read_evt;
+        }
         return;
     }
     if (tensor->type == GGML_TYPE_Q8_0) {
@@ -5992,14 +6090,19 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};

-        cl_event evt;
+        cl_event kern_evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
-            global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
+            global_work_size, local_work_size, 0, NULL, &kern_evt));
+        cl_event read_evt;
         CL_CHECK(clEnqueueReadBuffer(
-            queue, data_device, CL_TRUE, offset,
-            size, data, 0, NULL, NULL));
-        CL_CHECK(clReleaseMemObject(data_device));
+            queue, data_device, CL_FALSE, offset,
+            size, data, 1, &kern_evt, &read_evt));
+        CL_CHECK(clReleaseEvent(kern_evt));
+        RELEASE_LATER(data_device, read_evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(read_evt));
+            *last_evt = read_evt;
+        }
         return;
     }
     if (tensor->type == GGML_TYPE_IQ4_NL) {
@@ -6140,14 +6243,19 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};

-        cl_event evt;
+        cl_event kern_evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
-            global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
+            global_work_size, local_work_size, 0, NULL, &kern_evt));
+        cl_event read_evt;
         CL_CHECK(clEnqueueReadBuffer(
-            queue, data_device, CL_TRUE, offset,
-            size, data, 0, NULL, NULL));
-        CL_CHECK(clReleaseMemObject(data_device));
+            queue, data_device, CL_FALSE, offset,
+            size, data, 1, &kern_evt, &read_evt));
+        CL_CHECK(clReleaseEvent(kern_evt));
+        RELEASE_LATER(data_device, read_evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(read_evt));
+            *last_evt = read_evt;
+        }
         return;
     }
     if (tensor->type == GGML_TYPE_Q5_K) {
@@ -6309,14 +6417,19 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)n_blk, 1, 1};
         size_t local_work_size[] = {1, 1, 1};

-        cl_event evt;
+        cl_event kern_evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
-            global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
+            global_work_size, local_work_size, 0, NULL, &kern_evt));
+        cl_event read_evt;
         CL_CHECK(clEnqueueReadBuffer(
-            queue, data_device, CL_TRUE, offset,
-            size, data, 0, NULL, NULL));
-        CL_CHECK(clReleaseMemObject(data_device));
+            queue, data_device, CL_FALSE, offset,
+            size, data, 1, &kern_evt, &read_evt));
+        CL_CHECK(clReleaseEvent(kern_evt));
+        RELEASE_LATER(data_device, read_evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(read_evt));
+            *last_evt = read_evt;
+        }
         return;
     }
 #endif // GGML_OPENCL_SOA_Q
@@ -6324,12 +6437,28 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
     ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;

     CL_CHECK(clEnqueueReadBuffer(
-        queue, extra->data_device, CL_TRUE, extra->offset + tensor->view_offs + offset,
-        size, data, 0, NULL, NULL));
+        queue, extra->data_device, CL_FALSE, extra->offset + tensor->view_offs + offset,
+        size, data, 0, NULL, last_evt));

     GGML_UNUSED(buffer);
 }

+static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
+    cl_event evt;
+    ggml_backend_opencl_buffer_get_tensor_impl(buffer, tensor, data, offset, size, backend_ctx->release_list, &evt);
+    CL_CHECK(clWaitForEvents(1, &evt));
+    CL_CHECK(clReleaseEvent(evt));
+    ggml_backend_opencl_do_deferred_memobj_release(backend_ctx->release_list);
+}
+
+static void ggml_backend_opencl_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    ggml_backend_opencl_buffer_get_tensor_impl(buf, tensor, data, offset, size, backend_ctx->release_list, NULL);
+}
+
 static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_dev_t dev = buffer->buft->device;
     ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
@@ -6463,7 +6592,7 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
     props->type = ggml_backend_opencl_device_get_type(dev);
     ggml_backend_opencl_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = ggml_backend_dev_caps {
-        /* .async                 = */ false,
+        /* .async                 = */ true,
         /* .host_buffer           = */ false,
         /* .buffer_from_host_ptr  = */ false,
         /* .events                = */ false,
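One more pattern worth calling out from this patch: the cross-backend copy path
stages through a host buffer that is freed from an event callback instead of
blocking the queue. Reduced to its essentials (`nbytes` and `dst_mem` are
illustrative names, mirroring free_tmp_tensor_buf above):

    static void free_staging(cl_event evt, cl_int status, void *userdata) {
        delete[] (char *)userdata;  // runs once the associated write completes
    }

    char *tmp = new char[nbytes];
    // ... fill tmp from the source backend ...
    cl_event evt;
    CL_CHECK(clEnqueueWriteBuffer(queue, dst_mem, CL_FALSE, 0, nbytes, tmp, 0, NULL, &evt));
    CL_CHECK(clSetEventCallback(evt, CL_COMPLETE, free_staging, tmp));
    CL_CHECK(clReleaseEvent(evt));  // drop our reference; the registered callback still fires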
From 5b85b9bf77bfb4a971396fb95e6850c54097214d Mon Sep 17 00:00:00 2001
From: Jan Solanti
Date: Tue, 28 Apr 2026 15:37:18 +0300
Subject: [PATCH 5/5] hacks to make the ROCm compiler happy(ier)

---
 ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl | 2 +-
 ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl | 2 +-
 ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)
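All three hunks below make the same one-word change: the `inline` qualifier is
dropped from non-kernel helper functions, which the ROCm OpenCL compiler
apparently mishandles (inferred from the subject line; the exact failure mode
is not stated here). An illustrative OpenCL C sketch of the before/after shape,
with made-up helper names:

    // before: `inline void helper(...)` tripped up the problematic compiler;
    // after: a plain function definition with the same body
    void helper_scale_add(global float *dst, global const float *src, float s) {
        int i = get_global_id(0);
        dst[i] += s * src[i];
    }

    kernel void k_scale_add(global float *dst, global const float *src, float s) {
        helper_scale_add(dst, src, s);
    }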
diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl
index 7dc55de72df..782b04d5b34 100644
--- a/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl
@@ -94,7 +94,7 @@ inline float block_q_4_0_dot_y_flat(
 #define N_SIMDWIDTH 32
 #endif

-inline void mul_vec_q_n_f32_8x_flat(
+void mul_vec_q_n_f32_8x_flat(
     global char  * src0_q,
     global half  * src0_d,
     global float * src1,
diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl
index 2578d71d369..d8314512bc9 100644
--- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl
@@ -98,7 +98,7 @@ inline float mm_block_q_4_0_dot_y_flat(
 // This variant performs 1d blocking with 8x output.
 // Eeach simdgroup outputs 8 values on `n0` dim (row in the output matrix).
 //
-inline void mul_mat_q_n_f32_1d_8x_flat(
+void mul_mat_q_n_f32_1d_8x_flat(
     global uchar * src0_q,
     global half  * src0_d,
     global float * src1,
diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl
index 79882d27e23..ee1dfe97866 100644
--- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl
@@ -105,7 +105,7 @@ inline float block_q_4_0_dot_y_flat(
 #define N_SIMDWIDTH 32
 #endif

-inline void mul_vec_q_n_f32_8x_flat(
+void mul_vec_q_n_f32_8x_flat(
     global uchar * src0_q,
     global half  * src0_d,
     global float * src1,