From 138d41ec75a824a7a73b601393bc6cf32abbe391 Mon Sep 17 00:00:00 2001 From: Jan Solanti Date: Mon, 24 Nov 2025 18:20:06 +0200 Subject: [PATCH 1/5] hacky patches to make it work on pocl-cuda --- ggml/src/ggml-opencl/ggml-opencl.cpp | 59 +++++++++++++------ ggml/src/ggml-opencl/kernels/argsort.cl | 4 +- ggml/src/ggml-opencl/kernels/exp.cl | 6 +- ggml/src/ggml-opencl/kernels/expm1.cl | 6 +- ggml/src/ggml-opencl/kernels/glu.cl | 2 +- .../kernels/mul_mv_id_mxfp4_f32.cl | 4 ++ .../kernels/mul_mv_id_mxfp4_f32_flat.cl | 4 ++ .../kernels/mul_mv_id_q4_0_f32_8x_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl | 4 ++ .../kernels/mul_mv_id_q8_0_f32_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl | 4 ++ .../kernels/mul_mv_iq4_nl_f32_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_mxfp4_f32.cl | 4 ++ .../kernels/mul_mv_mxfp4_f32_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_q4_0_f32.cl | 4 ++ .../kernels/mul_mv_q4_0_f32_1d_16x_flat.cl | 4 ++ .../kernels/mul_mv_q4_0_f32_1d_8x_flat.cl | 4 ++ .../kernels/mul_mv_q4_0_f32_8x_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_q4_1_f32.cl | 4 ++ .../kernels/mul_mv_q4_1_f32_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_q4_k_f32.cl | 6 ++ .../kernels/mul_mv_q4_k_f32_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_q5_k_f32.cl | 4 ++ .../kernels/mul_mv_q5_k_f32_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_q6_k_f32.cl | 4 ++ .../kernels/mul_mv_q6_k_f32_flat.cl | 4 ++ .../ggml-opencl/kernels/mul_mv_q8_0_f32.cl | 4 ++ .../kernels/mul_mv_q8_0_f32_flat.cl | 4 ++ ggml/src/ggml-opencl/kernels/rope.cl | 4 +- ggml/src/ggml-opencl/kernels/sigmoid.cl | 2 +- ggml/src/ggml-opencl/kernels/softplus.cl | 6 +- ggml/src/ggml-opencl/kernels/tanh.cl | 6 +- 33 files changed, 157 insertions(+), 36 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index d344bde0fe3..3e688134af8 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -779,7 +779,8 @@ inline std::string read_file(const std::string &path) { return text; } -static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) { +#define build_program_from_source(ctx, dev, program_buffer, compile_opts) build_program_from_source_impl(__LINE__, ctx, dev, program_buffer, compile_opts) +static cl_program build_program_from_source_impl(size_t line, cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) { cl_program p; char *program_log; size_t program_size; @@ -800,7 +801,7 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co program_log = (char*) malloc(log_size + 1); program_log[log_size] = '\0'; clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); - GGML_LOG_ERROR("ggml_opencl: kernel compile error:\n\n%s\n", program_log); + GGML_LOG_ERROR("ggml_opencl:%lu: kernel compile error:\n\n%s\n", (unsigned long)line, program_log); free(program_log); exit(1); } @@ -3313,7 +3314,6 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { } else { GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str()); backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; - return nullptr; } #ifdef GGML_OPENCL_USE_ADRENO_KERNELS @@ -8259,7 +8259,7 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c } else if (backend_ctx->gpu_family == INTEL) { sgs = 32; } else { - 
GGML_ASSERT(false && "Unsupported GPU"); + sgs = 32; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); @@ -8346,7 +8346,7 @@ static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * } else if (backend_ctx->gpu_family == INTEL) { sgs = 32; } else { - GGML_ASSERT(false && "Unsupported GPU"); + sgs = 32; } cl_kernel kernel = backend_ctx->kernel_rms_norm_mul; @@ -8425,7 +8425,7 @@ static void ggml_opencl_op_norm_fused(ggml_backend_t backend, ggml_tensor * norm size_t sgs; if (backend_ctx->gpu_family == ADRENO) sgs = 64; else if (backend_ctx->gpu_family == INTEL) sgs = 32; - else GGML_ASSERT(false && "Unsupported GPU"); + else sgs = 32; cl_kernel kernel = backend_ctx->kernel_norm_mul_add; @@ -8558,7 +8558,7 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, } else if (backend_ctx->gpu_family == INTEL) { sgs = 32; } else { - GGML_ASSERT(false && "Unsupported GPU"); + sgs = 32; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); @@ -11845,7 +11845,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth0 = 32; + nth1 = 1; + + kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); @@ -11903,7 +11906,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co nth0 = 64; nth1 = 1; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth0 = 32; + nth1 = 1; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); @@ -11940,7 +11944,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co nth0 = 64; nth1 = 1; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth0 = 32; + nth1 = 1; } if (src1t == GGML_TYPE_F32) { @@ -12002,7 +12007,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat; ndst =8; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth0 = 32; + nth1 = 1; + kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat; + ndst = 8; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); @@ -12139,7 +12147,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co nth1 = 2; ndst = nth1*4; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth0 = 32; + nth1 = 2; + ndst = nth1*4; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q)); @@ -12461,7 +12471,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co nth1 = 2; ndst = 1; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth0 = 2; + nth1 = 32; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); @@ -12499,7 +12510,11 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co q = extra0_mxfp4->q_img; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth0 = 32; + nth1 = 2; + ndst = nth1*2; + + q = extra0_mxfp4->q; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q)); @@ -12808,7 +12823,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, nsg = 1; ndst = 8; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + sgs = 32; + nsg = 1; + ndst = 8; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); @@ -12852,7 +12869,9 @@ static void 
ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, nsg = 2; ndst = 4; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + sgs = 32; + nsg = 2; + ndst = 4; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q)); @@ -13107,7 +13126,11 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, q = extra0_mxfp4->q_img; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + sgs = 32; + nsg = 2; + ndst = 2; + + q = extra0_mxfp4->q; } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q)); @@ -13606,7 +13629,7 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c else if (backend_ctx->gpu_family == ADRENO) { nth = 64; } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); + nth = MIN(32, ne00); } cl_kernel kernel; diff --git a/ggml/src/ggml-opencl/kernels/argsort.cl b/ggml/src/ggml-opencl/kernels/argsort.cl index af4adc7b83f..ab24d1dfd2b 100644 --- a/ggml/src/ggml-opencl/kernels/argsort.cl +++ b/ggml/src/ggml-opencl/kernels/argsort.cl @@ -43,8 +43,8 @@ kernel void kernel_argsort_f32_i32( return; } - src0 = (global char *)((global char *)src0 + offset0); - dst = (global float *)((global char *)dst + offsetd); + src0 = (global float *)((global char *)src0 + offset0); + dst = (global int *)((global char *)dst + offsetd); global float * x_row = src0 + row * ne00; diff --git a/ggml/src/ggml-opencl/kernels/exp.cl b/ggml/src/ggml-opencl/kernels/exp.cl index a2458b6579c..c70a2ad5fd2 100644 --- a/ggml/src/ggml-opencl/kernels/exp.cl +++ b/ggml/src/ggml-opencl/kernels/exp.cl @@ -45,7 +45,7 @@ kernel void kernel_exp_f16( src0 = (global half*)((global char*)src0 + offset0); dst = (global half*)((global char*)dst + offsetd); - dst[get_global_id(0)] = exp(src0[get_global_id(0)]); + dst[get_global_id(0)] = convert_half(exp(convert_float(src0[get_global_id(0)]))); } kernel void kernel_exp_f16_4( @@ -61,7 +61,7 @@ kernel void kernel_exp_f16_4( src0 = (global half4*)((global char*)src0 + offset0); dst = (global half4*)((global char*)dst + offsetd); - dst[get_global_id(0)] = exp(src0[get_global_id(0)]); + dst[get_global_id(0)] = convert_half4(exp(convert_float4(src0[get_global_id(0)]))); } kernel void kernel_exp_f32_nc( @@ -120,6 +120,6 @@ kernel void kernel_exp_f16_nc( global const half * x = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); global half * y = (global half *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - *y = exp(*x); + *y = convert_half(exp(convert_float(*x))); } } diff --git a/ggml/src/ggml-opencl/kernels/expm1.cl b/ggml/src/ggml-opencl/kernels/expm1.cl index 05442ac2043..1434447a438 100644 --- a/ggml/src/ggml-opencl/kernels/expm1.cl +++ b/ggml/src/ggml-opencl/kernels/expm1.cl @@ -37,7 +37,7 @@ kernel void kernel_expm1_f16( src0 = (global half*)((global char*)src0 + offset0); dst = (global half*)((global char*)dst + offsetd); - dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0h; + dst[get_global_id(0)] = convert_half(exp(convert_float(src0[get_global_id(0)]))) - 1.0h; } kernel void kernel_expm1_f16_4( @@ -49,7 +49,7 @@ kernel void kernel_expm1_f16_4( src0 = (global half4*)((global char*)src0 + offset0); dst = (global half4*)((global char*)dst + offsetd); - dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0h; + dst[get_global_id(0)] = convert_half4(exp(convert_float4(src0[get_global_id(0)]))) - 1.0h; } kernel void kernel_expm1_f32_nc( @@ -108,6 +108,6 @@ kernel void kernel_expm1_f16_nc( global const half * x = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + 
i0*nb00); global half * y = (global half *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - *y = exp(*x) - 1.0f; + *y = convert_half(exp(convert_float(*x)) - 1.0f); } } diff --git a/ggml/src/ggml-opencl/kernels/glu.cl b/ggml/src/ggml-opencl/kernels/glu.cl index 059a4bbf1ba..aaefa68a2b8 100644 --- a/ggml/src/ggml-opencl/kernels/glu.cl +++ b/ggml/src/ggml-opencl/kernels/glu.cl @@ -196,7 +196,7 @@ kernel void kernel_swiglu_f16( const half x0 = src0_row[i0]; const half x1 = src1_row[i0]; - const half silu = x0 / (1.0f + exp(-x0)); + const half silu = x0 / (1.0f + exp((float)-x0)); dst_row[i0] = silu*x1; } diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl index d50bd1fc428..8ec5140ea15 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl @@ -48,6 +48,10 @@ static inline float e8m0_to_fp32(uchar x) { #define N_R0_MXFP4 2 #define N_SG_MXFP4 2 #define N_SIMDWIDTH 64 +#else +#define N_R0_MXFP4 4 +#define N_SG_MXFP4 1 +#define N_SIMDWIDTH 32 #endif inline void mul_mv_mxfp4_f32( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl index f65e86ed6a2..87fb3d0f51c 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl @@ -63,6 +63,10 @@ static inline float e8m0_to_fp32(uchar x) { #define N_SG_MXFP4 1 #define N_SIMDWIDTH 64 #define SRC0Q_IMG +#else +#define N_R0_MXFP4 4 +#define N_SG_MXFP4 1 +#define N_SIMDWIDTH 32 #endif kernel void kernel_mul_mv_id_mxfp4_f32_flat( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl index 7ccf41efbe9..7dc55de72df 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl @@ -88,6 +88,10 @@ inline float block_q_4_0_dot_y_flat( #define N_DST 8 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 8 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif inline void mul_vec_q_n_f32_8x_flat( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl index f37e83ee8aa..4c98bfc5485 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl @@ -34,6 +34,10 @@ typedef struct { #define N_R0_Q8_0 4 #define N_SG_Q8_0 2 #define N_SIMDWIDTH 64 +#else +#define N_R0_Q8_0 8 +#define N_SG_Q8_0 1 +#define N_SIMDWIDTH 32 #endif #ifdef INTEL_GPU diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl index fd3a0710f5c..2ef8df57909 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl @@ -34,6 +34,10 @@ typedef struct { #define N_R0_Q8_0 4 #define N_SG_Q8_0 2 #define N_SIMDWIDTH 64 +#else +#define N_R0_Q8_0 8 +#define N_SG_Q8_0 1 +#define N_SIMDWIDTH 32 #endif #ifdef INTEL_GPU diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl index a6a325cd729..85bd5234525 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl @@ -69,6 +69,10 @@ inline float block_iq4_nl_dot_y( #define N_DST 4 #define N_SUBGROUP 1 #define N_SUBGROUP_SIZE 64 +#else +#define N_DST 4 
+#define N_SUBGROUP 1 +#define N_SUBGROUP_SIZE 32 #endif inline void mul_vec_q_n_f32( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl index 8c5b3f52e42..d83c1fb5b71 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl @@ -73,6 +73,10 @@ inline float block_iq4_nl_dot_y_flat( #define N_DST 8 #define N_SUBGROUP 1 #define N_SUBGROUP_SIZE 64 +#else +#define N_DST 4 +#define N_SUBGROUP 1 +#define N_SUBGROUP_SIZE 32 #endif inline void mul_vec_q_n_f32_8x_flat( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl index 9a4d4b9bad1..b06e4ed4800 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl @@ -48,6 +48,10 @@ static inline float e8m0_to_fp32(uchar x) { #define N_R0_MXFP4 2 #define N_SG_MXFP4 2 #define N_SIMDWIDTH 64 +#else +#define N_R0_MXFP4 4 +#define N_SG_MXFP4 1 +#define N_SIMDWIDTH 32 #endif #ifdef INTEL_GPU diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl index 3d5a923eee0..b2fcb1dd3cd 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl @@ -63,6 +63,10 @@ static inline float e8m0_to_fp32(uchar x) { #define N_SG_MXFP4 2 #define N_SIMDWIDTH 64 #define SRC0Q_IMG +#else +#define N_R0_MXFP4 4 +#define N_SG_MXFP4 1 +#define N_SIMDWIDTH 32 #endif #ifdef INTEL_GPU diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl index 52141e0ed55..8fdfc5fb757 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl @@ -80,6 +80,10 @@ inline float block_q_4_0_dot_y( #define N_DST 4 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif inline void mul_vec_q_n_f32( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl index 3eebab8f0f2..1c3b65d70ad 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl @@ -89,6 +89,10 @@ inline float mm_block_q_4_0_dot_y_flat( #define N_DST 16 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 16 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif // // This variant performs 1d blocking with 16x output. diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl index 38024d00ad5..2578d71d369 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl @@ -89,6 +89,10 @@ inline float mm_block_q_4_0_dot_y_flat( #define N_DST 8 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 8 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif // // This variant performs 1d blocking with 8x output. 
diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl index aed1ce7b260..79882d27e23 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl @@ -99,6 +99,10 @@ inline float block_q_4_0_dot_y_flat( #define N_DST 8 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 8 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif inline void mul_vec_q_n_f32_8x_flat( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl index 92955217971..ce83a292cc1 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl @@ -96,6 +96,10 @@ inline float block_q_4_0_dot_y_v( #define N_DST 4 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif inline void mul_vec_q_n_f32_v( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl index 6fe828f20e7..c68e7f73c34 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl @@ -74,6 +74,10 @@ inline float block_q4_1_dot_y( #define N_DST 4 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif inline void mul_vec_q_n_f32( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl index d7c4645d675..047a154d64e 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl @@ -75,6 +75,10 @@ inline float block_q4_1_dot_y_flat( #define N_DST 4 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif inline void mul_vec_q_n_f32_flat( diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl index 71ab9898213..3d94d985002 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl @@ -1,3 +1,5 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + #ifdef cl_intel_required_subgroup_size #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable #define INTEL_GPU 1 @@ -38,6 +40,10 @@ typedef struct { #define N_DST 4 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif #undef BLOCK_STRIDE diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl index d92fb968904..511b73bdf61 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl @@ -47,6 +47,10 @@ typedef struct { #define N_DST 16 #define N_SIMDGROUP 2 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif #undef BLOCK_STRIDE diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl index b2058abc1b6..f3c01969671 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl @@ -41,6 +41,10 @@ typedef struct { #define N_DST 4 #define N_SIMDGROUP 1 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif #define 
BLOCK_STRIDE (N_SIMDWIDTH/8) diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl index e353a72be70..ed96570ed1b 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl @@ -45,6 +45,10 @@ typedef struct { #define N_DST 16 #define N_SIMDGROUP 2 #define N_SIMDWIDTH 64 +#else +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif #undef BLOCK_STRIDE diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl index 819e5192e35..d3c12d474e2 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl @@ -68,6 +68,10 @@ typedef struct { #define N_DST 1 #define N_SIMDGROUP 2 #define N_SIMDWIDTH 64 +#else +#define N_DST 1 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif #define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl index 86fe09c6dd6..61f59fcac28 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl @@ -92,6 +92,10 @@ inline float block_q_6_K_dot_y_flat( #define N_DST 4 #define N_SIMDGROUP 2 #define N_SIMDWIDTH 64 +#else +#define N_DST 1 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 32 #endif #define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl index 7e88c7494de..356c087fb77 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl @@ -34,6 +34,10 @@ typedef struct { #define N_R0_Q8_0 4 #define N_SG_Q8_0 2 #define N_SIMDWIDTH 64 +#else +#define N_R0_Q8_0 8 +#define N_SG_Q8_0 1 +#define N_SIMDWIDTH 32 #endif #ifdef INTEL_GPU diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl index 71d159fd521..d0f7644d912 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl @@ -34,6 +34,10 @@ typedef struct { #define N_R0_Q8_0 4 #define N_SG_Q8_0 2 #define N_SIMDWIDTH 64 +#else +#define N_R0_Q8_0 8 +#define N_SG_Q8_0 1 +#define N_SIMDWIDTH 32 #endif #ifdef INTEL_GPU diff --git a/ggml/src/ggml-opencl/kernels/rope.cl b/ggml/src/ggml-opencl/kernels/rope.cl index 82f4cd87407..a713cdafe0f 100644 --- a/ggml/src/ggml-opencl/kernels/rope.cl +++ b/ggml/src/ggml-opencl/kernels/rope.cl @@ -509,7 +509,7 @@ kernel void kernel_rope_multi_f16( src0 = (global void*)((global char*)src0 + offset0); src1 = (global int*)((global char*)src1 + offset1); src2 = (global float*)((global char*)src2 + offset2); - dst = (global float*)((global char*)dst + offsetd); + dst = (global half*)((global char*)dst + offsetd); int i3 = get_group_id(2); int i2 = get_group_id(1); @@ -702,7 +702,7 @@ kernel void kernel_rope_vision_f16( src0 = (global void*)((global char*)src0 + offset0); src1 = (global int*)((global char*)src1 + offset1); src2 = (global float*)((global char*)src2 + offset2); - dst = (global float*)((global char*)dst + offsetd); + dst = (global half*)((global char*)dst + offsetd); int i3 = get_group_id(2); int i2 = get_group_id(1); diff --git a/ggml/src/ggml-opencl/kernels/sigmoid.cl b/ggml/src/ggml-opencl/kernels/sigmoid.cl 
index e3f669dde83..1337902a1a7 100644
--- a/ggml/src/ggml-opencl/kernels/sigmoid.cl
+++ b/ggml/src/ggml-opencl/kernels/sigmoid.cl
@@ -25,5 +25,5 @@ kernel void kernel_sigmoid_f16(
     src0 = (global half*)((global char*)src0 + offset0);
     dst = (global half*)((global char*)dst + offsetd);
 
-    dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
+    dst[get_global_id(0)] = 1.0f / (1.0f + exp((float)-src0[get_global_id(0)]));
 }
diff --git a/ggml/src/ggml-opencl/kernels/softplus.cl b/ggml/src/ggml-opencl/kernels/softplus.cl
index 6f8b7474165..91fab7ec997 100644
--- a/ggml/src/ggml-opencl/kernels/softplus.cl
+++ b/ggml/src/ggml-opencl/kernels/softplus.cl
@@ -38,7 +38,7 @@ kernel void kernel_softplus_f16(
     dst = (global half*)((global char*)dst + offsetd);
 
     const float x = convert_float(src0[get_global_id(0)]);
-    dst[get_global_id(0)] = convert_half_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
+    dst[get_global_id(0)] = convert_half((x > 20.0f) ? x : log(1.0f + exp(x)));
 }
 
 kernel void kernel_softplus_f16_4(
@@ -51,7 +51,7 @@ kernel void kernel_softplus_f16_4(
     dst = (global half4*)((global char*)dst + offsetd);
 
     const float4 x = convert_float4(src0[get_global_id(0)]);
-    dst[get_global_id(0)] = convert_half4_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
+    dst[get_global_id(0)] = convert_half4((x > 20.0f) ? x : log(1.0f + exp(x)));
 }
 
 kernel void kernel_softplus_f32_nc(
@@ -111,6 +111,6 @@ kernel void kernel_softplus_f16_nc(
         global half * hy = (global half *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
         const float x = convert_float(*hx);
-        *hy = convert_half_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
+        *hy = convert_half((x > 20.0f) ? x : log(1.0f + exp(x)));
     }
 }
diff --git a/ggml/src/ggml-opencl/kernels/tanh.cl b/ggml/src/ggml-opencl/kernels/tanh.cl
index 2c4887ad3e0..ff173793a76 100644
--- a/ggml/src/ggml-opencl/kernels/tanh.cl
+++ b/ggml/src/ggml-opencl/kernels/tanh.cl
@@ -33,7 +33,7 @@ kernel void kernel_tanh_f16(
     src0 = (global half*)((global char*)src0 + offset0);
     dst = (global half*)((global char*)dst + offsetd);
 
-    dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
+    dst[get_global_id(0)] = convert_half(tanh(convert_float(src0[get_global_id(0)])));
 }
 
 kernel void kernel_tanh_f16_4(
@@ -45,7 +45,7 @@ kernel void kernel_tanh_f16_4(
     src0 = (global half4*)((global char*)src0 + offset0);
     dst = (global half4*)((global char*)dst + offsetd);
 
-    dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
+    dst[get_global_id(0)] = convert_half4(tanh(convert_float4(src0[get_global_id(0)])));
 }
 
 kernel void kernel_tanh_f32_nc(
@@ -104,6 +104,6 @@ kernel void kernel_tanh_f16_nc(
         global const half * x = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
         global half * y = (global half *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
-        *y = tanh(*x);
+        *y = convert_half(tanh(convert_float(*x)));
     }
 }
From 9abbfe0a971988665cdadf3ac05c2c2b4f986f84 Mon Sep 17 00:00:00 2001
From: Xiangyan Sun
Date: Tue, 7 Apr 2026 19:25:39 +0300
Subject: [PATCH 2/5] Implement and use cuda graph plans.
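
Backends that can record a compute graph once and replay it (such as CUDA
graphs or OpenCL command buffers) can expose this through the existing
graph-plan callbacks. This change adds the scheduler-side plumbing:
ggml_backend_sched keeps one plan per split, creates the plans on the first
evaluation, replays them on subsequent evaluations, and recreates them (or
updates them in place, for backends that implement graph_plan_update) when
the splits change.

A minimal sketch of how the plan API is meant to be driven, assuming
`backend` and `cgraph` are already set up (`n_iter` is illustrative):

    if (ggml_backend_supports_graph_plan(backend)) {
        // record the graph once ...
        ggml_backend_graph_plan_t plan = ggml_backend_graph_plan_create(backend, cgraph);
        for (int i = 0; i < n_iter; i++) {
            // ... then replay it without re-dispatching every node
            ggml_backend_graph_plan_compute(backend, plan);
        }
        ggml_backend_graph_plan_free(backend, plan);
    } else {
        // no plan support: fall back to regular graph execution
        ggml_backend_graph_compute(backend, cgraph);
    }

Backends that also report ggml_backend_supports_graph_plan_update() can
refresh an existing plan in place via ggml_backend_graph_plan_update();
for the rest, the scheduler falls back to freeing and recreating the plan.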
--- ggml/include/ggml-backend.h | 3 + ggml/src/ggml-backend.cpp | 112 +++++++++++++++++++++++++++++++++++- 2 files changed, 114 insertions(+), 1 deletion(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index d0c7e5a1be0..3acc7e2e9d3 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -97,8 +97,11 @@ extern "C" { GGML_API void ggml_backend_synchronize(ggml_backend_t backend); + GGML_API bool ggml_backend_supports_graph_plan(ggml_backend_t backend); + GGML_API bool ggml_backend_supports_graph_plan_update(ggml_backend_t backend); GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API void ggml_backend_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index d9f8aaec52f..58564962500 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -420,6 +420,18 @@ void ggml_backend_synchronize(ggml_backend_t backend) { backend->iface.synchronize(backend); } +bool ggml_backend_supports_graph_plan(ggml_backend_t backend) { + GGML_ASSERT(backend); + + return (bool) backend->iface.graph_plan_create; +} + +bool ggml_backend_supports_graph_plan_update(ggml_backend_t backend) { + GGML_ASSERT(backend); + + return (bool) backend->iface.graph_plan_update; +} + ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { GGML_ASSERT(backend); GGML_ASSERT(backend->iface.graph_plan_create != NULL); @@ -434,6 +446,13 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla backend->iface.graph_plan_free(backend, plan); } +void ggml_backend_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph* cgraph) { + GGML_ASSERT(backend); + GGML_ASSERT(backend->iface.graph_plan_update != NULL); + + backend->iface.graph_plan_update(backend, plan, cgraph); +} + enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(backend); GGML_ASSERT(backend->iface.graph_plan_compute != NULL); @@ -771,6 +790,11 @@ struct ggml_backend_sched_split { struct ggml_cgraph graph; }; +struct ggml_backend_sched_plan { + int backend_id; + ggml_backend_graph_plan_t plan; +}; + struct ggml_backend_sched { bool is_reset; // true if the scheduler has been reset since the last graph split bool is_alloc; @@ -800,6 +824,12 @@ struct ggml_backend_sched { int n_splits; int splits_capacity; + // graph plans + struct ggml_backend_sched_plan * plans; + int n_plans; + int plans_capacity; + bool plan_needs_update; + // pipeline parallelism support int n_copies; int cur_copy; @@ -1010,6 +1040,16 @@ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, stru } } +static void ggml_backend_sched_free_plans(ggml_backend_sched_t sched) { + for (int i = 0; i < sched->n_plans; i++) { + ggml_backend_t backend = sched->backends[sched->plans[i].backend_id]; + if (ggml_backend_supports_graph_plan(backend)) { + ggml_backend_graph_plan_free(backend, sched->plans[i].plan); + } + } + 
sched->n_plans = 0; +} + // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { // reset splits @@ -1484,6 +1524,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra for (int i = 0; i < sched->n_splits; ++i) { sched->splits[i].graph.uid = ggml_graph_next_uid(); } + sched->plan_needs_update = true; } static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { @@ -1538,6 +1579,62 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { return true; } +static void ggml_backend_sched_update_plans(ggml_backend_sched_t sched) { + // create graph plans + if (sched->plan_needs_update) { + bool create_new_plans; + if (sched->n_plans == sched->n_splits) { + create_new_plans = false; + for (int i = 0; i < sched->n_splits; i++) { + if (sched->splits[i].backend_id != sched->plans[i].backend_id) { + create_new_plans = true; + break; + } + } + } else { + create_new_plans = true; + } + if (create_new_plans) { + // free previous and recreate new plans + ggml_backend_sched_free_plans(sched); + if (sched->plans_capacity < sched->n_splits) { + while (sched->plans_capacity < sched->n_splits) { + sched->plans_capacity *= 2; + } + sched->plans = (ggml_backend_sched_plan *) realloc( + sched->plans, sched->plans_capacity * sizeof(struct ggml_backend_sched_plan)); + GGML_ASSERT(sched->plans); + } + sched->n_plans = sched->n_splits; + for (int i = 0; i < sched->n_splits; i++) { + ggml_backend_t backend = sched->backends[sched->splits[i].backend_id]; + sched->plans[i].backend_id = sched->splits[i].backend_id; + if (ggml_backend_supports_graph_plan(backend)) { + sched->plans[i].plan = ggml_backend_graph_plan_create(backend, &sched->splits[i].graph); + } else { + sched->plans[i].plan = nullptr; + } + } + } else { + // update existing plans + for (int i = 0; i < sched->n_splits; i++) { + ggml_backend_t backend = sched->backends[sched->splits[i].backend_id]; + if (ggml_backend_supports_graph_plan(backend)) { + if (ggml_backend_supports_graph_plan_update(backend)) { + ggml_backend_graph_plan_update(backend, sched->plans[i].plan, &sched->splits[i].graph); + } else { + ggml_backend_graph_plan_free(backend, sched->plans[i].plan); + sched->plans[i].plan = ggml_backend_graph_plan_create(backend, &sched->splits[i].graph); + } + } else { + sched->plans[i].plan = nullptr; + } + } + } + sched->plan_needs_update = false; + } +} + static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { GGML_ASSERT(sched); struct ggml_backend_sched_split * splits = sched->splits; @@ -1546,6 +1643,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s std::vector ids; std::vector used_ids; + ggml_backend_sched_update_plans(sched); + for (int split_id = 0; split_id < sched->n_splits; split_id++) { struct ggml_backend_sched_split * split = &splits[split_id]; int split_backend_id = split->backend_id; @@ -1675,7 +1774,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } if (!sched->callback_eval) { - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); + enum ggml_status ec; + if (ggml_backend_supports_graph_plan(split_backend) && sched->plans[split_id].plan) { + ec = ggml_backend_graph_plan_compute(split_backend, sched->plans[split_id].plan); + } else { + ec = ggml_backend_graph_compute_async(split_backend, 
&split->graph); + } if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1773,6 +1877,10 @@ ggml_backend_sched_t ggml_backend_sched_new( sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0])); sched->splits_capacity = initial_splits_capacity; + const int initial_plans_capacity = 16; + sched->plans = (ggml_backend_sched_plan *) calloc(initial_plans_capacity, sizeof(sched->plans[0])); + sched->plans_capacity = initial_plans_capacity; + for (int b = 0; b < n_backends; b++) { sched->backends[b] = backends[b]; sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]); @@ -1806,6 +1914,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { ggml_free(sched->ctx); ggml_hash_set_free(&sched->hash_set); free(sched->splits); + ggml_backend_sched_free_plans(sched); + free(sched->plans); free(sched->hv_tensor_backend_ids); free(sched->hv_tensor_copies); free(sched->node_backend_ids); From 03423817aa61a4a637461f9e87efb44eef67fc8d Mon Sep 17 00:00:00 2001 From: Jan Solanti Date: Sat, 11 Apr 2026 00:56:27 +0300 Subject: [PATCH 3/5] opencl: use command buffers when available --- ggml/src/ggml-opencl/ggml-opencl.cpp | 700 +++++++++++++++++++-------- 1 file changed, 488 insertions(+), 212 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 3e688134af8..7b159dad21a 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -1,5 +1,6 @@ #define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION #define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#define CL_ENABLE_BETA_EXTENSIONS // suppress warnings in CL headers for GCC and Clang #pragma GCC diagnostic ignored "-Woverlength-strings" @@ -14,6 +15,7 @@ #include "ggml.h" #include +#include #include #include @@ -51,7 +53,7 @@ // OpenCL //------------------------------------------------------------------------------ -bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor); +bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor, cl_command_buffer_khr recording = NULL); // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. 
// Precompute mp (m' in the paper) and L such that division @@ -395,6 +397,7 @@ struct ggml_backend_opencl_context { bool fp16_support; bool has_vector_subgroup_broadcast; bool disable_fusion; + bool cmdbuf_support; bool adreno_has_large_buffer; bool adreno_use_large_buffer; @@ -706,17 +709,22 @@ struct ggml_backend_opencl_context { return workgroup_size; } - void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) { + void enqueue_ndrange_kernel(cl_command_buffer_khr recording, cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) { + if (recording) { + CL_CHECK(this->clCommandNDRangeKernelKHR(recording, NULL, NULL, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL, NULL)); + return; + } else { #ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - profiling_info.emplace_back(); - populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor); + profiling_info.emplace_back(); + populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor); #else - GGML_UNUSED(tensor); - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + GGML_UNUSED(tensor); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL)); #endif + } } #ifdef GGML_OPENCL_USE_ADRENO_KERNELS @@ -752,6 +760,17 @@ struct ggml_backend_opencl_context { cl_kernel kernel_gemm_noshuffle_iq4_nl_f32; #endif // GGML_OPENCL_USE_ADRENO_KERNELS + clCreateCommandBufferKHR_fn clCreateCommandBufferKHR = NULL; + clCommandCopyBufferKHR_fn clCommandCopyBufferKHR = NULL; + clCommandCopyBufferRectKHR_fn clCommandCopyBufferRectKHR = NULL; + clCommandFillBufferKHR_fn clCommandFillBufferKHR = NULL; + clCommandNDRangeKernelKHR_fn clCommandNDRangeKernelKHR = NULL; + clCommandBarrierWithWaitListKHR_fn clCommandBarrierWithWaitListKHR = NULL; + clFinalizeCommandBufferKHR_fn clFinalizeCommandBufferKHR = NULL; + clEnqueueCommandBufferKHR_fn clEnqueueCommandBufferKHR = NULL; + clReleaseCommandBufferKHR_fn clReleaseCommandBufferKHR = NULL; + clGetCommandBufferInfoKHR_fn clGetCommandBufferInfoKHR = NULL; + void free() { ref_count--; if (ref_count == 0) { @@ -3425,6 +3444,25 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { backend_ctx->non_uniform_workgroups = true; } + // Check command buffer support. 
+ backend_ctx->cmdbuf_support = strstr(ext_buffer, "cl_khr_command_buffer"); + if (backend_ctx->cmdbuf_support) { + backend_ctx->clCreateCommandBufferKHR = (clCreateCommandBufferKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clCreateCommandBufferKHR"); + backend_ctx->clCommandCopyBufferKHR = (clCommandCopyBufferKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clCommandCopyBufferKHR"); + backend_ctx->clCommandCopyBufferRectKHR = (clCommandCopyBufferRectKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clCommandCopyBufferRectKHR"); + backend_ctx->clCommandFillBufferKHR = (clCommandFillBufferKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clCommandFillBufferKHR"); + backend_ctx->clCommandNDRangeKernelKHR = (clCommandNDRangeKernelKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clCommandNDRangeKernelKHR"); + backend_ctx->clCommandBarrierWithWaitListKHR = (clCommandBarrierWithWaitListKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clCommandBarrierWithWaitListKHR"); + backend_ctx->clFinalizeCommandBufferKHR = (clFinalizeCommandBufferKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clFinalizeCommandBufferKHR"); + backend_ctx->clEnqueueCommandBufferKHR = (clEnqueueCommandBufferKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clEnqueueCommandBufferKHR"); + backend_ctx->clReleaseCommandBufferKHR = (clReleaseCommandBufferKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clReleaseCommandBufferKHR"); + backend_ctx->clGetCommandBufferInfoKHR = (clGetCommandBufferInfoKHR_fn) clGetExtensionFunctionAddressForPlatform (dev_ctx->platform, "clGetCommandBufferInfoKHR"); + } + char * cmdbuf_env = getenv("GGML_OPENCL_USE_CMDBUF"); + bool cmdbuf_enable = (cmdbuf_env && strlen(cmdbuf_env) > 0) ? (bool) atoi(cmdbuf_env) : true; + GGML_LOG_INFO("ggml_opencl: command buffer support: %s%s\n", backend_ctx->cmdbuf_support ? "true" : "false", backend_ctx->cmdbuf_support ? (cmdbuf_enable ? 
" (enabled)" : " (disabled by env)") : ""); + backend_ctx->cmdbuf_support &= cmdbuf_enable; + // Print out configurations #ifdef GGML_OPENCL_SOA_Q GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n"); @@ -4090,13 +4128,13 @@ static bool ggml_opencl_can_fuse(const struct ggml_cgraph * cgraph, int node_idx return true; } -static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor); -static void ggml_opencl_op_norm_fused(ggml_backend_t backend, ggml_tensor * norm_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor); -static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, ggml_tensor * gn_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor); +static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, cl_command_buffer_khr recording, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor); +static void ggml_opencl_op_norm_fused(ggml_backend_t backend, cl_command_buffer_khr recording, ggml_tensor * norm_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor); +static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, cl_command_buffer_khr recording, ggml_tensor * gn_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor); -static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; +static ggml_status ggml_backend_opencl_graph_eval(ggml_backend_t backend, const ggml_cgraph * cgraph, cl_command_buffer_khr recording) { + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -4114,22 +4152,22 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm } if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_NORM, GGML_OP_MUL, GGML_OP_ADD })) { - ggml_opencl_op_norm_fused(backend, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); + ggml_opencl_op_norm_fused(backend, recording, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); i += 2; continue; } if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_GROUP_NORM, GGML_OP_MUL, GGML_OP_ADD })) { - ggml_opencl_op_group_norm_fused(backend, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); + ggml_opencl_op_group_norm_fused(backend, recording, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); i += 2; continue; } if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { - ggml_opencl_op_rms_norm_fused(backend, node, cgraph->nodes[i+1]); + ggml_opencl_op_rms_norm_fused(backend, recording, node, cgraph->nodes[i+1]); i++; continue; } - bool ok = ggml_cl_compute_forward(backend, node); + bool ok = ggml_cl_compute_forward(backend, node, recording); if (!ok) { GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -4139,6 +4177,47 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm return GGML_STATUS_SUCCESS; } +static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + return ggml_backend_opencl_graph_eval(backend, cgraph, NULL); +} + +static ggml_backend_graph_plan_t ggml_backend_opencl_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) +{ + ggml_backend_opencl_context * 
backend_ctx = (ggml_backend_opencl_context *) backend->context; + if (!backend_ctx->cmdbuf_support) { + return NULL; + } + + cl_int error; + cl_command_buffer_khr cbuf = backend_ctx->clCreateCommandBufferKHR(1, &backend_ctx->queue, NULL, &error); + CL_CHECK(error); + + if (ggml_backend_opencl_graph_eval(backend, cgraph, cbuf) != GGML_STATUS_SUCCESS) { + return NULL; + } + + CL_CHECK(backend_ctx->clFinalizeCommandBufferKHR(cbuf)); + return cbuf; +} + +static void ggml_backend_opencl_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan) +{ + ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) backend->context; + if (!backend_ctx->cmdbuf_support) { + return; + } + + cl_command_buffer_khr cmdbuf = (cl_command_buffer_khr)plan; + + backend_ctx->clReleaseCommandBufferKHR(cmdbuf); +} + +static enum ggml_status ggml_backend_opencl_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) backend->context; + CL_CHECK(backend_ctx->clEnqueueCommandBufferKHR(1, &backend_ctx->queue, (cl_command_buffer_khr)plan, 0, NULL, NULL)); + return GGML_STATUS_SUCCESS; +} + static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *)dev->context; ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx; @@ -4437,10 +4516,10 @@ static ggml_backend_i ggml_backend_opencl_i = { /* .get_tensor_2d_async = */ NULL, /* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */ /* .synchronize = */ ggml_backend_opencl_synchronize, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, + /* .graph_plan_create = */ ggml_backend_opencl_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_opencl_graph_plan_free, /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, + /* .graph_plan_compute = */ ggml_backend_opencl_graph_plan_compute, /* .graph_compute = */ ggml_backend_opencl_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, @@ -6689,8 +6768,9 @@ static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct gg // Copy a noncontiguous tensor to contiguous tensor. ne[] remains the same but // nb[] is recalculated such that tensor is contiguous. 
-static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor * src, cl_mem dst, - cl_ulong &nb0, cl_ulong &nb1, cl_ulong &nb2, cl_ulong &nb3) { +static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, cl_command_buffer_khr recording, + const ggml_tensor * src, cl_mem dst, cl_ulong &nb0, + cl_ulong &nb1, cl_ulong &nb2, cl_ulong &nb3) { ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; const int tensor_type_size = ggml_type_size(src->type); @@ -6759,17 +6839,18 @@ static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src); + + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, src); } -static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_nop(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { UNUSED(backend); UNUSED(src0); UNUSED(src1); UNUSED(dst); } -static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_get_rows(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -6837,10 +6918,10 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_set_rows(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -6954,10 +7035,10 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c (size_t)ne03}; size_t local_work_size[] = {(size_t)nth0, (size_t)rows_per_workgroup, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_add(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -7125,17 +7206,17 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 1, global_work_size, local_work_size_ptr, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {(size_t)ne01*nth, 
(size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_add_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_add_id(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -7201,10 +7282,10 @@ static void ggml_cl_add_id(ggml_backend_t backend, const ggml_tensor * src0, con size_t global_work_size[] = { (size_t)ne01*nth, (size_t)ne02, 1 }; size_t local_work_size[] = { (size_t)nth, 1, 1 }; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -7330,17 +7411,17 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_div(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -7451,17 +7532,17 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_sub(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -7572,17 +7653,17 @@ static void 
ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_sqr(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_sqr(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7629,10 +7710,10 @@ static void ggml_cl_sqr(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_sqrt(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_sqrt(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7679,10 +7760,10 @@ static void ggml_cl_sqrt(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mean(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7739,10 +7820,10 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {64 * (size_t)ne01, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_ssm_conv(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_ssm_conv(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -7803,10 +7884,10 @@ static void ggml_cl_ssm_conv(ggml_backend_t backend, const ggml_tensor * src0, c local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_gelu(ggml_backend_t backend, 
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_gelu(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7841,10 +7922,10 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_gelu_erf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_gelu_erf(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7879,10 +7960,10 @@ static void ggml_cl_gelu_erf(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_gelu_quick(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7917,10 +7998,10 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_silu(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7960,10 +8041,10 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_relu(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -7996,10 +8077,10 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
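[Note on the two sizing strategies that recur through these element-wise ops: either local_work_size_ptr is left null so the driver picks a work-group size (legal for clEnqueueNDRangeKernel), or an explicit size such as 64 is used and the global size is rounded up to a multiple of it with the (size_t)CEIL_DIV(n, 64)*64 pattern, since core OpenCL requires the global size to divide evenly by an explicit local size. The rounded-up launch relies on each kernel guarding the tail; a hypothetical kernel showing the guard (ggml_sketch_neg is a made-up name, not one of the kernels in this patch):

    kernel void ggml_sketch_neg(global const float * src, global float * dst, int n) {
        const int i = (int)get_global_id(0);
        if (i >= n) {
            return; // work-items in the rounded-up tail do nothing
        }
        dst[i] = -src[i];
    }
]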
} - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_sigmoid(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8039,10 +8120,10 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_tri(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_tri(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8077,10 +8158,10 @@ static void ggml_cl_tri(ggml_backend_t backend, const ggml_tensor * src0, const size_t local_work_size[1] = { 256 }; size_t global_work_size[1] = { ((size_t)n + local_work_size[0] - 1) / local_work_size[0] * local_work_size[0] }; - backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 1, global_work_size, local_work_size, dst); } -static void ggml_cl_fill(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_fill(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(dst); GGML_ASSERT(dst->extra); @@ -8107,10 +8188,10 @@ static void ggml_cl_fill(ggml_backend_t backend, const ggml_tensor * src0, const size_t local_work_size[1] = { 256 }; size_t global_work_size[1] = { ((size_t)n + local_work_size[0] - 1) / local_work_size[0] * local_work_size[0] }; - backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 1, global_work_size, local_work_size, dst); } -static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_clamp(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8150,10 +8231,10 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
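[Note: every operator in this patch now takes a leading cl_command_buffer_khr handle and forwards it to enqueue_ndrange_kernel. A minimal sketch of what that wrapper presumably does, assuming the cl_khr_command_buffer extension is available (its entry point is obtained via clGetExtensionFunctionAddressForPlatform in practice) and that a null handle means "submit immediately"; the real implementation in ggml-opencl.cpp may also handle profiling and sync points:

    void ggml_backend_opencl_context::enqueue_ndrange_kernel(
            cl_command_buffer_khr recording, cl_kernel kernel, cl_uint work_dim,
            const size_t * gws, const size_t * lws, const ggml_tensor * tensor) {
        if (recording != nullptr) {
            // record the dispatch into the command buffer for later replay;
            // the queue argument is null for a single-queue command buffer
            CL_CHECK(clCommandNDRangeKernelKHR(recording, nullptr, nullptr,
                kernel, work_dim, nullptr, gws, lws, 0, nullptr, nullptr, nullptr));
        } else {
            // not recording: submit to the backend queue right away
            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr,
                gws, lws, 0, nullptr, nullptr));
        }
        UNUSED(tensor); // profiling hooks elided in this sketch
    }
]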
} - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_norm(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8202,10 +8283,10 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_rms_norm(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8277,10 +8358,10 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c // This is local memory - the size depends on subgroup size. CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL)); - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor) { +static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, cl_command_buffer_khr recording, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor) { GGML_ASSERT(mul_tensor); GGML_ASSERT(rms_norm_tensor); @@ -8388,10 +8469,10 @@ static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float), &eps)); CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs, NULL)); - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_opencl_op_norm_fused(ggml_backend_t backend, ggml_tensor * norm_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor) { +static void ggml_opencl_op_norm_fused(ggml_backend_t backend, cl_command_buffer_khr recording, ggml_tensor * norm_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor) { GGML_ASSERT(norm_tensor && mul_tensor && add_tensor); const ggml_tensor * src0 = norm_tensor->src[0]; @@ -8474,10 +8555,10 @@ static void ggml_opencl_op_norm_fused(ggml_backend_t backend, ggml_tensor * norm CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &eps)); CL_CHECK(clSetKernelArg(kernel, 33, sizeof(cl_float2) * num_subgroups, NULL)); - backend_ctx->enqueue_ndrange_kernel(kernel, 3, gws, lws, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, gws, lws, dst); } -static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, ggml_tensor * gn_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor) { +static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, cl_command_buffer_khr 
recording, ggml_tensor * gn_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor) { GGML_ASSERT(gn_tensor && mul_tensor && add_tensor); const ggml_tensor * src0 = gn_tensor->src[0]; @@ -8522,10 +8603,10 @@ static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, ggml_tensor CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &group_size)); CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float), &eps)); - backend_ctx->enqueue_ndrange_kernel(kernel, 1, gws, lws, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 1, gws, lws, dst); } -static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_group_norm(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8572,10 +8653,10 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1}; size_t local_work_size[] = {(size_t)sgs, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_l2_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_l2_norm(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8630,10 +8711,10 @@ static void ggml_cl_l2_norm(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_tanh(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8697,7 +8778,7 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } else { // Handle non-contiguous input if (src0->type == GGML_TYPE_F32) { @@ -8725,11 +8806,11 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_neg(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_neg(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { 
GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8779,7 +8860,7 @@ static void ggml_cl_neg(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)CEIL_DIV(n, 64)*64, 1, 1}; size_t local_work_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else { // Handle non-contiguous input if (src0->type == GGML_TYPE_F32) { @@ -8807,11 +8888,11 @@ static void ggml_cl_neg(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_exp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_exp(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8861,7 +8942,7 @@ static void ggml_cl_exp(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)CEIL_DIV(n, 64)*64, 1, 1}; size_t local_work_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else { // Handle non-contiguous input if (src0->type == GGML_TYPE_F32) { @@ -8889,11 +8970,11 @@ static void ggml_cl_exp(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_expm1(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -8957,7 +9038,7 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } else { // Handle non-contiguous input if (src0->type == GGML_TYPE_F32) { @@ -8985,11 +9066,11 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_softplus(ggml_backend_t backend, 
cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -9053,7 +9134,7 @@ static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, c local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } else { // Handle non-contiguous input if (src0->type == GGML_TYPE_F32) { @@ -9081,11 +9162,11 @@ static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) { +static void ggml_cl_repeat(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -9147,10 +9228,10 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_cl_pad(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -9242,10 +9323,10 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_cl_upscale(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -9356,10 +9437,10 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg local_work_size_ptr = nullptr; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst); } -static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_concat(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -9440,10 +9521,10 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con size_t global_work_size[] = {(size_t)ne1*nth, 
(size_t)ne2, (size_t)ne3}; size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_cl_timestep_embedding(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -9484,10 +9565,10 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor size_t global_work_size[] = {gws0, gws1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, NULL, dst); } -static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, const ggml_tensor * k, ggml_tensor * dst) { +static void ggml_cl_flash_attn(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * q, const ggml_tensor * k, ggml_tensor * dst) { const ggml_tensor * v = dst->src[2]; const ggml_tensor * mask = dst->src[3]; const ggml_tensor * sinks = dst->src[4]; @@ -9613,17 +9694,17 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co const size_t wg_size = 64; size_t local_work_size[] = { wg_size, 1 }; size_t global_work_size[] = { wg_size, (size_t)(n_head * n_batch) }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size, local_work_size, dst); } else { const int block_m = backend_ctx->kernels_flash_attn_bm.at(dk_dv); const size_t wg_size = block_m; size_t local_work_size[] = { wg_size, 1 }; size_t global_work_size[] = { (size_t)((n_q + block_m - 1) / block_m) * wg_size, (size_t)(n_head * n_batch) }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size, local_work_size, dst); } } -static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; @@ -9672,10 +9753,10 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten (size_t) ((N + OPWN - 1) / OPWN) * TPWN, }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size, local_work_size, dst); } -static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_conv_2d(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_TENSOR_BINARY_OP_LOCALS; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; @@ -9748,10 +9829,10 @@ static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = { 
(size_t)NB_K * WG_K, (size_t)NB_NPQ * WG_NPQ, 1 }; size_t local_work_size[] = { (size_t)WG_K, (size_t)WG_NPQ, 1 }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size, local_work_size, dst); } -static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; @@ -9880,7 +9961,7 @@ static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_ten size_t global_work_size[3] = {64, static_cast<size_t>(((M+63)/64)), static_cast<size_t>(((N+31)/32)*ne12)}; size_t local_work_size[3] = {64, 1, 2}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); // deallocate sub buffers and images // <--------------------------------------------> // @@ -9992,7 +10073,7 @@ static void ggml_cl_mul_mat_q4_0_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size[3] = {64, 4, 1}; size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(q_img)); CL_CHECK(clReleaseMemObject(b_sub_buf)); @@ -10073,7 +10154,7 @@ static void ggml_cl_mul_mat_q4_0_f32_adreno(ggml_backend_t backend, const ggml_t local_work_size_t[0]=2; local_work_size_t[1]=8; } - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size_t, local_work_size_t, dst); // gemm kernel = backend_ctx->kernel_gemm_noshuffle_q4_0_f32; @@ -10104,7 +10185,7 @@ static void ggml_cl_mul_mat_q4_0_f32_adreno(ggml_backend_t backend, const ggml_t local_work_size[1] = 64; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(b_sub_buf)); CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); @@ -10120,7 +10201,7 @@ static void ggml_cl_mul_mat_q4_0_f32_adreno(ggml_backend_t backend, const ggml_t #endif } -static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -10197,7 +10278,7 @@ static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size[3] = {64, 4, 1}; size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
CL_CHECK(clReleaseMemObject(q_img)); CL_CHECK(clReleaseMemObject(b_sub_buf)); @@ -10259,7 +10340,7 @@ static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size_t[2] = { 1, 16 }; size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size_t, local_work_size_t, dst); // gemm kernel = backend_ctx->kernel_gemm_noshuffle_q4_1_f32; @@ -10279,7 +10360,7 @@ static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_t size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1}; size_t local_work_size[3] = {1, 128, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(b_sub_buf)); CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); @@ -10370,7 +10451,7 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml size_t local_work_size[3] = {64, 4, 1}; size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(q_img)); CL_CHECK(clReleaseMemObject(b_sub_buf)); @@ -10432,7 +10513,7 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml size_t local_work_size_t[2] = { 1, 16 }; size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size_t, local_work_size_t, dst); // gemm kernel = backend_ctx->kernel_gemm_noshuffle_iq4_nl_f32; @@ -10451,7 +10532,7 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1}; size_t local_work_size[3] = {1, 128, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(b_sub_buf)); CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); @@ -10466,7 +10547,7 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml #endif } -static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -10567,7 +10648,7 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size[] = { wavesize, 4, 1 }; size_t global_work_size[] = { CEIL_DIV(M, wavesize)*wavesize, 4, 1 }; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(q_img)); 
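[Note: the Adreno-specific paths above keep aliasing plain cl_mem buffers as 1D image buffers so the kernels can read them through the texture path. The recurring pattern, using placeholder names (buf, n_texels) rather than identifiers from the patch:

    cl_image_format fmt = { CL_RGBA, CL_FLOAT };      // four floats per texel
    cl_image_desc desc;
    memset(&desc, 0, sizeof(desc));
    desc.image_type  = CL_MEM_OBJECT_IMAGE1D_BUFFER;  // image view backed by a buffer
    desc.image_width = n_texels;                      // width in texels, not bytes
    desc.buffer      = buf;                           // the existing cl_mem to alias
    cl_int status;
    cl_mem img = clCreateImage(context, CL_MEM_READ_ONLY, &fmt, &desc, NULL, &status);
    CL_CHECK(status);
    // set as a kernel argument, launch, then release the view:
    CL_CHECK(clReleaseMemObject(img));
]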
CL_CHECK(clReleaseMemObject(b_img)); @@ -10627,9 +10708,10 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B)); + size_t local_work_size_t[2] = { 1, 16 }; size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size_t, local_work_size_t, dst); // gemm kernel = backend_ctx->kernel_gemm_noshuffle_q8_0_f32; @@ -10648,7 +10730,7 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t size_t global_work_size[] = { (size_t)CEIL_DIV(N, 8), (size_t)CEIL_DIV(M, 4), 1 }; size_t local_work_size[] = { 2, 128, 1 }; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(b_img_trans)); CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); @@ -10663,7 +10745,7 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t #endif } -static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -10748,7 +10830,7 @@ static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size[3] = {64, 4, 1}; size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(q_img)); CL_CHECK(clReleaseMemObject(b_sub_buf)); @@ -10811,7 +10893,7 @@ static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size_t[2] = { 1, 16 }; size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size_t, local_work_size_t, dst); // gemm kernel = backend_ctx->kernel_gemm_noshuffle_q4_k_f32; @@ -10835,7 +10917,7 @@ static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, const ggml_t size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1}; size_t local_work_size[3] = {1, 128, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(b_sub_buf)); CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); CL_CHECK(clReleaseMemObject(b_img)); @@ -10849,7 +10931,7 @@ static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, const ggml_t #endif } -static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, cl_command_buffer_khr recording, 
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -10934,7 +11016,7 @@ static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size[3] = {64, 4, 1}; size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(ql_img)); CL_CHECK(clReleaseMemObject(qh_img)); @@ -10999,7 +11081,7 @@ static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_size_t[2] = { 1, 16 }; size_t global_size_t[2] = { (size_t)width_B, (size_t)padded_height_B }; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_size_t, local_size_t, dst); // gemm kernel = backend_ctx->kernel_gemm_noshuffle_q6_K_f32; @@ -11024,7 +11106,7 @@ static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_t size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1}; size_t local_work_size[3] = {2, 128, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(b_sub_buf)); CL_CHECK(clReleaseMemObject(b_img)); @@ -11039,7 +11121,7 @@ static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_t #endif } -static void ggml_cl_mul_mat_q5_K_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_q5_K_f32_adreno(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -11133,7 +11215,7 @@ static void ggml_cl_mul_mat_q5_K_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size[3] = {64, 4, 1}; size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(q_img)); CL_CHECK(clReleaseMemObject(qh_img)); @@ -11194,7 +11276,7 @@ static void ggml_cl_mul_mat_q5_K_f32_adreno(ggml_backend_t backend, const ggml_t size_t local_work_size_t[2] = {1, 16}; size_t global_work_size_t[2] = {(size_t)width_B, (size_t)padded_height_B}; - backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_work_size_t, local_work_size_t, dst); // gemm kernel = backend_ctx->kernel_gemm_noshuffle_q5_k_f32; @@ -11219,7 +11301,7 @@ static void ggml_cl_mul_mat_q5_K_f32_adreno(ggml_backend_t backend, const ggml_t size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1}; size_t local_work_size[3] = {1, 128, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); CL_CHECK(clReleaseMemObject(b_sub_buf)); 
CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); @@ -11234,7 +11316,7 @@ static void ggml_cl_mul_mat_q5_K_f32_adreno(ggml_backend_t backend, const ggml_t #endif } -static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -11346,6 +11428,169 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32) { ggml_cl_mul_mat_q4_k_f32_adreno(backend, src0, src1, dst); return; + } + + // q6_K x fp32 + if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) { + ggml_cl_mul_mat_q6_K_f32_adreno(backend, recording, src0, src1, dst); + return; + } + + // q5_K x fp32 + if (src0t == GGML_TYPE_Q5_K && src1t == GGML_TYPE_F32) { + ggml_cl_mul_mat_q5_K_f32_adreno(backend, recording, src0, src1, dst); + return; + } + + // q4_0 x fp32 + if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) { + // TODO: remove duplicate definitions of image description + format -- move to top + + // create an image for A + // <--------------------------------------------> // + if (N == 1) { + img_fmt_1d = { CL_R, CL_UNSIGNED_INT32}; + } else { + img_fmt_1d = { CL_R, CL_FLOAT}; + } + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = M * K / 2 / 4; // Divide by 4 for char -> float + img_desc_1d.buffer = extra0_q4_0->q; + A_image1d = clCreateImage( + context, + CL_MEM_READ_ONLY, + &img_fmt_1d, + &img_desc_1d, + NULL, + &status); + CL_CHECK(status); + // <--------------------------------------------> // + + + // create a sub_buffer for B + // <--------------------------------------------> // + region.origin = (extra1->offset); + region.size = K * N * sizeof(float); + B_sub_buffer = clCreateSubBuffer( + extra1->data_device, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + &region, + &status); + CL_CHECK(status); + // <--------------------------------------------> // + + // transpose activation for Skyler's gemm + if (N != 1) { + //how many extra elements beyond multiple of 8 + int extra_elements = N % 8; + + //how much padding to add + padding = 0; + if (extra_elements > 0){ + padding = 8 - extra_elements; + } + + // Specify the starting offset (in bytes) + region.origin = 0; + // Specify the size of the sub-buffer (divide by 2 for FP16) + region.size = K * (N + padding) * sizeof(float)/2; + backend_ctx->prealloc_act_trans.allocate(context, region.size); + + B_d = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + &region, + &status); + CL_CHECK(status); + + cl_image_format image_format_B_d_input = { CL_RGBA, CL_FLOAT }; + cl_image_desc image_desc_B_d_input = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast<size_t>(K * N / 4), + 0, 0, 0, 0, 0, 0, 0, { B_sub_buffer } + }; + B_d_input_image = clCreateImage( + context, + 0, + &image_format_B_d_input, + &image_desc_B_d_input, + NULL, + &status); + CL_CHECK(status); + + cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16) + cl_image_desc image_desc_B_d_output = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast<size_t>(K * (N + padding)/4), + 0, 0, 0, 0, 0, 0, 0, { B_d } + }; + B_image1d = clCreateImage( + context, + 0, + &image_format_B_d_output, + &image_desc_B_d_output, + NULL, + &status); + CL_CHECK(status);
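+ // Worked example of the padding arithmetic above, with hypothetical values
+ // (N = 13 is purely illustrative, nothing in the patch fixes it):
+ //   extra_elements = 13 % 8 = 5
+ //   padding        = 8 - 5  = 3   -> the transposed copy spans K x 16 columns
+ //   region.size    = K * (13 + 3) * sizeof(float) / 2  // halved: output is FP16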
+ + int height_B = N/4; + if (height_B == 0) { + height_B = 1; + } + int width_B = K/4; + int padded_height_B = (N + padding)/4; + + kernel = backend_ctx->kernel_transpose_32_16; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_d_input_image)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B)); + + size_t local_size_t[2] = { 1, 16 }; + //WGS tuning + if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) { + local_size_t[0]=4; + local_size_t[1]=8; + } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) { + local_size_t[0]=2; + local_size_t[1]=8; + } else if(ne0 == 4096 && ne1 == 128 && ne10 == 11008) { + local_size_t[0]=1; + local_size_t[1]=8; + } else if(ne0 == 32000 && ne1 == 128 && ne10 == 4096) { + local_size_t[0]=2; + local_size_t[1]=8; + } + + size_t global_size_t[2] = { + static_cast<size_t>(width_B), + static_cast<size_t>(padded_height_B) + }; + + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 2, global_size_t, local_size_t, dst); + } else { + // no need to transpose B in other cases + // create an image for B from sub_buffer + // <--------------------------------------------> // + img_fmt_1d = {CL_RGBA, CL_FLOAT}; + + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_width = K * N / 4; + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.buffer = B_sub_buffer; + B_image1d = clCreateImage( + context, + CL_MEM_READ_ONLY, + &img_fmt_1d, + &img_desc_1d, + NULL, + &status); + CL_CHECK(status); + // <--------------------------------------------> // } // q6_K x fp32 @@ -11359,6 +11604,28 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst); return; } + // <--------------------------------------------> // + + // enqueue kernel with profiling + // <--------------------------------------------> // + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); + // <--------------------------------------------> // + + // deallocate sub buffers and images + // <--------------------------------------------> // + CL_CHECK(clReleaseMemObject(A_image1d)); + CL_CHECK(clReleaseMemObject(B_sub_buffer)); + CL_CHECK(clReleaseMemObject(B_image1d)); + + if (N != 1) { + CL_CHECK(clReleaseMemObject(B_d)); + CL_CHECK(clReleaseMemObject(B_d_input_image)); + CL_CHECK(clReleaseMemObject(C_d)); + } + // <--------------------------------------------> // + + return; + } } // if (ne01 && ne1) #endif // GGML_OPENCL_USE_ADRENO_KERNELS @@ -11394,7 +11661,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co if (!ggml_is_contiguous(src0)) { backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0)); - ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer, + ggml_cl_copy_to_contiguous(backend, recording, src0, backend_ctx->prealloc_src0.buffer, nb00_cont, nb01_cont, nb02_cont, nb03_cont); mem_src0 = backend_ctx->prealloc_src0.buffer; offset0_cont = 0; @@ -11402,7 +11669,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co if (!ggml_is_contiguous(src1)) { backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1)); - ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer, + ggml_cl_copy_to_contiguous(backend, recording,
src1, backend_ctx->prealloc_src1.buffer, nb10_cont, nb11_cont, nb12_cont, nb13_cont); mem_src1 = backend_ctx->prealloc_src1.buffer; offset1_cont = 0; @@ -11432,7 +11699,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } case GGML_TYPE_F16: { @@ -11461,7 +11728,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co if (!ggml_is_contiguous(src0)) { backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0)); - ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer, + ggml_cl_copy_to_contiguous(backend, recording, src0, backend_ctx->prealloc_src0.buffer, nb00_cont, nb01_cont, nb02_cont, nb03_cont); mem_src0 = backend_ctx->prealloc_src0.buffer; offset0_cont = 0; @@ -11469,7 +11736,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co if (!ggml_is_contiguous(src1)) { backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1)); - ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer, + ggml_cl_copy_to_contiguous(backend, recording, src1, backend_ctx->prealloc_src1.buffer, nb10_cont, nb11_cont, nb12_cont, nb13_cont); mem_src1 = backend_ctx->prealloc_src1.buffer; offset1_cont = 0; @@ -11499,7 +11766,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } case GGML_TYPE_Q4_0: { @@ -11541,7 +11808,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } case GGML_TYPE_Q4_1: { @@ -11584,7 +11851,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } case GGML_TYPE_Q8_0: { @@ -11626,7 +11893,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, 
local_work_size, dst); return; } case GGML_TYPE_IQ4_NL: { @@ -11668,7 +11935,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } case GGML_TYPE_Q4_K: { @@ -11712,7 +11979,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } case GGML_TYPE_Q5_K: { @@ -11757,7 +12024,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } case GGML_TYPE_Q6_K: { @@ -11801,7 +12068,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } default: @@ -11817,7 +12084,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co src1->ne[2] == 1 && src1->ne[3] == 1 && ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) { - ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst); + ggml_cl_mul_mat_f16_f32_tiled(backend, recording, src0, src1, dst); return; } @@ -11883,7 +12150,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co global_work_size[2] = (size_t)ne12*ne13; } - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); return; } #else // GGML_OPENCL_SOA_Q @@ -12590,35 +12857,35 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else if (src0t == GGML_TYPE_Q4_K) { size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + 
backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else if (src0t == GGML_TYPE_Q3_K) { GGML_ASSERT(false && "not implemented"); } else if (src0t == GGML_TYPE_Q5_K) { size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else if (src0t == GGML_TYPE_Q6_K) { size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } else { int64_t ny = (ne11 + nrows - 1)/nrows; size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } } -static void moe_router_reoerder(ggml_backend_t backend, const ggml_tensor * src, int ne20) { +static void moe_router_reoerder(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src, int ne20) { cl_int err; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; @@ -12683,7 +12950,7 @@ static void moe_router_reoerder(ggml_backend_t backend, const ggml_tensor * src, size_t histogram_global_size[] = {(size_t)(((ne21 + 63) / 64) * 64), static_cast(ne20), 1}; size_t histogram_local_size[] = {64, static_cast(ne20), 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, histogram_global_size, histogram_local_size, src); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, histogram_global_size, histogram_local_size, src); // Scan kernel = backend_ctx->kernel_moe_scan; @@ -12696,7 +12963,7 @@ static void moe_router_reoerder(ggml_backend_t backend, const ggml_tensor * src, size_t scan_global_size[] = {1}; size_t scan_local_size[] = {1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 1, scan_global_size, scan_local_size, src); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 1, scan_global_size, scan_local_size, src); // Fill kernel = backend_ctx->kernel_moe_fill; @@ -12706,7 +12973,7 @@ static void moe_router_reoerder(ggml_backend_t backend, const ggml_tensor * src, size_t fill_global_size[] = {(size_t)(((max_post_router_tile + 63) / 64) * 64), n_tile_size, 1}; size_t fill_local_size[] = {64, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, fill_global_size, fill_local_size, src); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, fill_global_size, fill_local_size, src); // Scatter kernel = backend_ctx->kernel_moe_scatter; @@ -12719,7 +12986,7 @@ static void moe_router_reoerder(ggml_backend_t backend, const ggml_tensor * src, CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne20)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne02)); - backend_ctx->enqueue_ndrange_kernel(kernel, 3, histogram_global_size, histogram_local_size, src); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, histogram_global_size, histogram_local_size, src); 
CL_CHECK(clReleaseMemObject(original_router_buf)); CL_CHECK(clReleaseMemObject(hist_buf)); @@ -12730,7 +12997,7 @@ static void moe_router_reoerder(ggml_backend_t backend, const ggml_tensor * src, CL_CHECK(clReleaseMemObject(emap_buf)); } -static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_id(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(src1); @@ -12985,7 +13252,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11)); // launch kernel - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_size, local_size, dst); // deallocate sub buffers and images CL_CHECK(clReleaseMemObject(src1_sub_buffer)); @@ -13057,7 +13324,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, size_t reorder_b_global_size[3] = {static_cast<size_t>(((ne00 / 4) + 255) / 256 * 256), static_cast<size_t>(max_post_router_tile * n_tile_size), 1}; // Dispatch reorder kernel - backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); // MoE kernel prepare // Create sub buffer for dst @@ -13095,7 +13362,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, local_size[2] = 1; // Dispatch kernel - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_size, local_size, dst); clReleaseMemObject(sub_buf_src1_pre); clReleaseMemObject(buf_src1_reordered); @@ -13105,6 +13372,14 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, clReleaseMemObject(sub_buf_dst); clReleaseMemObject(buf_dst_image); } + + // launch kernel + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_size, local_size, dst); + + // deallocate sub buffers and images + CL_CHECK(clReleaseMemObject(src1_sub_buffer)); + CL_CHECK(clReleaseMemObject(buf_src1_image)); + CL_CHECK(clReleaseMemObject(buf_src2)); return; } // fallback to generic MoE mxfp4 kernel #endif // GGML_OPENCL_USE_ADRENO_KERNELS @@ -13210,10 +13485,10 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123}; size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst); } -static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_scale(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); GGML_ASSERT(dst); @@ -13261,10 +13536,10 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons local_work_size_ptr = nullptr; // Let driver choose the
     }

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst);
 }

-static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_cpy(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(src1);
@@ -13356,15 +13631,15 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, src1);
 }

-static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cl_cpy(backend, src0, dst, nullptr);
+static void ggml_cl_dup(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cl_cpy(backend, recording, src0, dst, nullptr);
     UNUSED(src1);
 }

-static void ggml_cl_set(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_set(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(src1);
@@ -13401,7 +13676,7 @@ static void ggml_cl_set(ggml_backend_t backend, const ggml_tensor * src0, const
     // for inplace case, dst is a view of src0 and is updated on top of it
     // so for non-inplace case, copy src0 to dst first
     if (!inplace) {
-        ggml_cl_cpy(backend, src0, dst, nullptr);
+        ggml_cl_cpy(backend, recording, src0, dst, nullptr);
     }

     // then copy src1 to dst with specified offset
@@ -13444,10 +13719,10 @@ static void ggml_cl_set(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)ne11*nth, (size_t)ne12, (size_t)ne13};
     size_t local_work_size[] = {(size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_diag_mask_inf(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -13485,7 +13760,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
         size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
         size_t local_work_size[] = {64, 1, 1};

-        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
     } else {
         kernel = backend_ctx->kernel_diag_mask_inf;

@@ -13505,11 +13780,11 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
             local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
         }

-        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+        backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size_ptr, dst);
     }
 }

-static void ggml_cl_diag(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_diag(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -13549,10 +13824,10 @@ static void ggml_cl_diag(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
     size_t local_work_size[] = {(size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_soft_max(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -13677,10 +13952,10 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_rope(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(src1);
@@ -13860,10 +14135,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_solve_tri(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_solve_tri(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(src1);
@@ -13926,10 +14201,10 @@ static void ggml_cl_solve_tri(ggml_backend_t backend, const ggml_tensor * src0,
     size_t global_work_size[3]= { (size_t)k, (size_t)dst->ne[2], (size_t)dst->ne[3]};
     size_t local_work_size[] = {16, 4, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_im2col(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src1);
     GGML_ASSERT(src1->extra);
@@ -14009,10 +14284,10 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
     size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
     size_t local_work_size[] = {256, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_argsort(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -14055,7 +14330,8 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
     size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
     size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);

 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     const int ne21 = dst->ne[1];
@@ -14065,7 +14341,7 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 }

-static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_sum_rows(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -14122,10 +14398,10 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
     size_t global_work_size[] = {64 * (size_t)ne01, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)64, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_cumsum(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_cumsum(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -14189,7 +14465,7 @@ static void ggml_cl_cumsum(ggml_backend_t backend, const ggml_tensor * src0, con
     size_t global_work_size[] = { (size_t)(nth*net0*ne01), (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = { (size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);

     if(ne00 > nth) {
         // if a single workgroup cannot handle an entire row, each workgroup
@@ -14216,7 +14492,7 @@ static void ggml_cl_cumsum(ggml_backend_t backend, const ggml_tensor * src0, con
     size_t global_work_size_1[] = { (size_t)net1*nth, (size_t)net2, (size_t)net3};
     size_t local_work_size_1[] = { (size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_1, local_work_size_1, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size_1, local_work_size_1, dst);

     kernel = backend_ctx->kernel_cumsum_add;
     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &tmp_buffer.buffer));
@@ -14233,11 +14509,11 @@ static void ggml_cl_cumsum(ggml_backend_t backend, const ggml_tensor * src0, con
     size_t global_work_size_2[] = { (size_t)(nth*net0*ne01), (size_t)ne02, (size_t)ne03};
     size_t local_work_size_2[] = { (size_t)nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_2, local_work_size_2, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size_2, local_work_size_2, dst);
     }
 }

-static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_glu(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -14344,16 +14620,16 @@ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {nrows*nth, 1, 1};
     size_t local_work_size[] = {nth, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    backend_ctx->enqueue_ndrange_kernel(recording, kernel, 3, global_work_size, local_work_size, dst);
 }

 //------------------------------------------------------------------------------
 // Op offloading
 //------------------------------------------------------------------------------

-typedef void (*ggml_cl_func_t)(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+typedef void (*ggml_cl_func_t)(ggml_backend_t backend, cl_command_buffer_khr recording, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);

-bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor) {
+bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor, cl_command_buffer_khr recording) {
     ggml_cl_func_t func = nullptr;

     ggml_tensor * src0 = tensor->src[0];
@@ -14572,13 +14848,13 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             if (!any_on_device) {
                 return false;
             }
-            ggml_cl_pad(backend, tensor->src[0], tensor);
+            ggml_cl_pad(backend, recording, tensor->src[0], tensor);
             return true;
         case GGML_OP_UPSCALE:
             if (!any_on_device) {
                 return false;
             }
-            ggml_cl_upscale(backend, tensor->src[0], tensor);
+            ggml_cl_upscale(backend, recording, tensor->src[0], tensor);
             return true;
         case GGML_OP_CONV_2D:
             if (!any_on_device) {
@@ -14602,7 +14878,7 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             if (!any_on_device) {
                 return false;
             }
-            ggml_cl_timestep_embedding(backend, tensor->src[0], tensor);
+            ggml_cl_timestep_embedding(backend, recording, tensor->src[0], tensor);
             return true;
         case GGML_OP_MUL_MAT:
             if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
@@ -14689,12 +14965,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             if (!any_on_device) {
                 return false;
             }
-            ggml_cl_flash_attn(backend, tensor->src[0], tensor->src[1], tensor);
+            ggml_cl_flash_attn(backend, recording, tensor->src[0], tensor->src[1], tensor);
             return true;
         default:
             return false;
     }

-    func(backend, tensor->src[0], tensor->src[1], tensor);
+    func(backend, recording, tensor->src[0], tensor->src[1], tensor);

     return true;
 }
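Note: every op handler in this patch now threads the same `cl_command_buffer_khr`
handle down to a single dispatch point. A minimal sketch of that pattern, assuming
the cl_khr_command_buffer entry points have already been resolved at runtime (the
helper name here is illustrative, not the backend's exact member function):

    // Record into the command buffer while one is being built; otherwise submit
    // to the queue immediately. Recorded commands only execute when the buffer
    // is later replayed with clEnqueueCommandBufferKHR().
    static void dispatch_ndrange(cl_command_buffer_khr recording, cl_command_queue queue,
                                 cl_kernel kernel, cl_uint work_dim,
                                 const size_t *gws, const size_t *lws) {
        if (recording) {
            CL_CHECK(clCommandNDRangeKernelKHR(recording, NULL, NULL, kernel,
                work_dim, NULL, gws, lws, 0, NULL, NULL, NULL));
        } else {
            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL,
                gws, lws, 0, NULL, NULL));
        }
    }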
From 6049b971f8a845090fa67791ea37825b2b1056ee Mon Sep 17 00:00:00 2001
From: Jan Solanti
Date: Fri, 17 Apr 2026 20:42:24 +0300
Subject: [PATCH 4/5] opencl: async-ify tensor I/O a bit

---
 ggml/src/ggml-opencl/ggml-opencl.cpp | 327 +++++++++++++++++++--------
 1 file changed, 228 insertions(+), 99 deletions(-)
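The gist of this patch: tensor reads and writes stop blocking on every staging
buffer. Each temporary cl_mem is instead parked on a per-context release list
together with the last event that uses it, and the list is drained at
synchronization points. A minimal usage sketch of the pattern introduced below
(`staging`, `nbytes` and `host_src` are illustrative names):

    cl_mem staging;
    CL_CHECK((staging = clCreateBuffer(context, CL_MEM_READ_WRITE, nbytes, NULL, &err), err));
    CL_CHECK(clEnqueueWriteBuffer(queue, staging, CL_FALSE, 0, nbytes, host_src, 0, NULL, NULL));

    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, gws, lws, 0, NULL, &evt));

    // Instead of clWaitForEvents + clReleaseMemObject here, park the buffer;
    // ggml_backend_opencl_do_deferred_memobj_release() waits on the event and
    // releases both the event and the buffer later.
    RELEASE_LATER(staging, evt, backend_ctx->release_list);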
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 7b159dad21a..463694ef333 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -378,6 +378,39 @@ struct ggml_backend_opencl_device_context {
     cl_context context = nullptr;
 };

+struct DeferredMemobjReleaseInfo {
+    cl_mem mem;
+    cl_event last_event;
+    unsigned long line;
+    const char *name;
+};
+
+#define DEFERRED_CL_RELEASE_CHECK(err) \
+do { \
+    cl_int err_ = (err); \
+    if (err_ != CL_SUCCESS) { \
+        GGML_LOG_ERROR("ggml_opencl: %s error %d during deferred release of %s"\
+            " from %s:%lu\n", #err, err_, rel.name, __FILE__, rel.line); \
+        GGML_ASSERT(0); \
+    } \
+} while(0)
+
+static void ggml_backend_opencl_do_deferred_memobj_release(std::vector<DeferredMemobjReleaseInfo> &release_list) {
+    for (DeferredMemobjReleaseInfo rel: release_list) {
+        DEFERRED_CL_RELEASE_CHECK(clWaitForEvents(1, &rel.last_event));
+        DEFERRED_CL_RELEASE_CHECK(clReleaseEvent(rel.last_event));
+        DEFERRED_CL_RELEASE_CHECK(clReleaseMemObject(rel.mem));
+    }
+    release_list.clear();
+}
+
+#undef DEFERRED_CL_RELEASE_CHECK
+
+#define RELEASE_LATER(mem, evt, container) \
+do { \
+    (container).push_back({mem, evt, __LINE__, #mem}); \
+} while(0)
+
 // backend context
 struct ggml_backend_opencl_context {
     int ref_count;
@@ -410,6 +443,7 @@ struct ggml_backend_opencl_context {

     cl_context context;
     cl_command_queue queue;
+    std::vector<DeferredMemobjReleaseInfo> release_list;

     // prealloc buffers for transposing weights and activations
     ggml_cl_buffer prealloc_quant_trans;
@@ -709,22 +743,25 @@ struct ggml_backend_opencl_context {
         return workgroup_size;
     }

-    void enqueue_ndrange_kernel(cl_command_buffer_khr recording, cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
+    void enqueue_ndrange_kernel(cl_command_buffer_khr recording, cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor, cl_event *out_evt = NULL) {
         if (recording) {
             CL_CHECK(this->clCommandNDRangeKernelKHR(recording, NULL, NULL, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL, NULL));
             return;
-        } else {
+        }
 #ifdef GGML_OPENCL_PROFILING
-            cl_event evt;
-            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-            profiling_info.emplace_back();
-            populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        if (out_evt) {
+            CL_CHECK(clRetainEvent(evt));
+            *out_evt = evt;
+        }
+        profiling_info.emplace_back();
+        populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
 #else
-            GGML_UNUSED(tensor);
-            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        GGML_UNUSED(tensor);
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, out_evt));
 #endif
-        }
+    }

 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
@@ -3998,36 +4035,49 @@ static void ggml_backend_opencl_free(ggml_backend_t backend) {
     ggml_cl2_free(backend);
 }

-static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_UNUSED(backend);
-    GGML_UNUSED(tensor);
-    GGML_UNUSED(data);
-    GGML_UNUSED(offset);
-    GGML_UNUSED(size);
+namespace {
+static void free_tmp_tensor_buf(cl_event evt, cl_int status, void *userdata) {
+    delete[] (char*)userdata;
 }
-
-static void ggml_backend_opencl_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_UNUSED(backend);
-    GGML_UNUSED(tensor);
-    GGML_UNUSED(data);
-    GGML_UNUSED(offset);
-    GGML_UNUSED(size);
 }
+static ggml_guid_t ggml_backend_opencl_guid();
+static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t src_backend, ggml_backend_t dst_backend, const ggml_tensor * src, ggml_tensor * dst) {
+    if(src_backend == dst_backend) {
+        ggml_backend_opencl_context *dst_ctx = (ggml_backend_opencl_context *)dst_backend->context;
+        ggml_tensor_extra_cl * src_extra = (ggml_tensor_extra_cl *) src->extra;
+        ggml_tensor_extra_cl * dst_extra = (ggml_tensor_extra_cl *) dst->extra;
+        CL_CHECK(clEnqueueCopyBuffer(dst_ctx->queue, src_extra->data_device, dst_extra->data_device, 0, 0, ggml_nbytes(src), 0, NULL, NULL));
+        return true;
+    }

-static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
-    GGML_UNUSED(backend);
-    GGML_UNUSED(src);
-    GGML_UNUSED(dst);
+    if (src_backend->guid == ggml_backend_opencl_guid()) {
+        ggml_backend_opencl_context *src_ctx = (ggml_backend_opencl_context *)src_backend->context;
+        ggml_tensor_extra_cl * src_extra = (ggml_tensor_extra_cl *) src->extra;
+        void *tmp = new char[ggml_nbytes(src)];
+        CL_CHECK(clEnqueueReadBuffer(src_ctx->queue, src_extra->data_device, CL_TRUE, 0, ggml_nbytes(src), tmp, 0, NULL, NULL));
+        ggml_backend_tensor_set(dst, tmp, 0, ggml_nbytes(src));
+        delete[] (char*)tmp;
+        return true;
+    }
+
+    if (dst_backend->guid == ggml_backend_opencl_guid()) {
+        ggml_backend_opencl_context *dst_ctx = (ggml_backend_opencl_context *)dst_backend->context;
+        ggml_tensor_extra_cl * dst_extra = (ggml_tensor_extra_cl *) dst->extra;
+        void *tmp = new char[ggml_nbytes(src)];
+        cl_event evt;
+        ggml_backend_tensor_get(src, tmp, 0, ggml_nbytes(src));
+        CL_CHECK(clEnqueueWriteBuffer(dst_ctx->queue, dst_extra->data_device, CL_FALSE, 0, ggml_nbytes(src), tmp, 0, NULL, &evt));
+        CL_CHECK(clSetEventCallback(evt, CL_COMPLETE, free_tmp_tensor_buf, tmp));
+        CL_CHECK(clReleaseEvent(evt));
+        return true;
+    }
     return false;
 }

 static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
     auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
-
-    cl_event evt;
-    CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, 0, nullptr, &evt));
-    CL_CHECK(clWaitForEvents(1, &evt));
-    CL_CHECK(clReleaseEvent(evt));
+    ggml_backend_opencl_do_deferred_memobj_release(backend_ctx->release_list);
+    CL_CHECK(clFinish(backend_ctx->queue));
 }

 // Synchronizes the 'backend_ctx's device with others so that commands
@@ -4507,14 +4557,17 @@ static ggml_guid_t ggml_backend_opencl_guid() {
     return &guid;
 }

+static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+static void ggml_backend_opencl_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size);
+
 static ggml_backend_i ggml_backend_opencl_i = {
     /* .get_name             = */ ggml_backend_opencl_name,
     /* .free                 = */ ggml_backend_opencl_free,
-    /* .set_tensor_async     = */ NULL, /* ggml_backend_opencl_set_tensor_async */
-    /* .get_tensor_async     = */ NULL, /* ggml_backend_opencl_get_tensor_async */
-    /* .set_tensor_2d_async  = */ NULL,
+    /* .set_tensor_async     = */ ggml_backend_opencl_set_tensor_async,
+    /* .get_tensor_async     = */ ggml_backend_opencl_get_tensor_async,
+    /* .set_tensor_2d_async  = */ NULL,
     /* .get_tensor_2d_async  = */ NULL,
-    /* .cpy_tensor_async     = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
+    /* .cpy_tensor_async     = */ ggml_backend_opencl_cpy_tensor_async,
     /* .synchronize          = */ ggml_backend_opencl_synchronize,
     /* .graph_plan_create    = */ ggml_backend_opencl_graph_plan_create,
     /* .graph_plan_free      = */ ggml_backend_opencl_graph_plan_free,
@@ -4921,7 +4974,7 @@ inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backen
     return ((elem_num < 128 * 1024 * 1024) && adreno_kernel); // max element num: 2**27
 }

-static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+static void ggml_backend_opencl_buffer_set_tensor_impl(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size, std::vector<DeferredMemobjReleaseInfo> &release_list, cl_event *last_evt) {
     ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);

     cl_context context = backend_ctx->context;
@@ -4952,7 +5005,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
             ggml_nbytes(tensor), NULL, &err);
         CL_CHECK(err);
         CL_CHECK(clEnqueueWriteBuffer(
-            queue, data_device, CL_TRUE, 0,
+            queue, data_device, CL_FALSE, 0,
             ggml_nbytes(tensor), data, 0, NULL, NULL));

         // We consider the specified offset arg as always, although For weights
@@ -5011,8 +5064,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
-        CL_CHECK(clReleaseMemObject(data_device));
+        RELEASE_LATER(data_device, evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(evt));
+            *last_evt = evt;
+        }

         tensor->extra = extra;

@@ -5054,7 +5110,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
             ggml_nbytes(tensor), NULL, &err);
         CL_CHECK(err);
         CL_CHECK(clEnqueueWriteBuffer(
-            queue, data_device, CL_TRUE, 0,
+            queue, data_device, CL_FALSE, 0,
             ggml_nbytes(tensor), data, 0, NULL, NULL));

         cl_buffer_region region;
@@ -5106,8 +5162,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
-        CL_CHECK(clReleaseMemObject(data_device));
+        RELEASE_LATER(data_device, evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(evt));
+            *last_evt = evt;
+        }

         tensor->extra = extra;

@@ -5146,7 +5205,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
             ggml_nbytes(tensor), NULL, &err);
         CL_CHECK(err);
         CL_CHECK(clEnqueueWriteBuffer(
-            queue, data_device, CL_TRUE, 0,
+            queue, data_device, CL_FALSE, 0,
             ggml_nbytes(tensor), data, 0, NULL, NULL));

         // The original tensor memory is divided into scales and quants, i.e.,
@@ -5219,8 +5278,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
-        CL_CHECK(clReleaseMemObject(data_device));
+        RELEASE_LATER(data_device, evt, release_list);

         // Create image for Q
         cl_image_format img_format_q = {CL_RG, CL_UNSIGNED_INT32};
@@ -5252,7 +5310,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
             ggml_nbytes(tensor), NULL, &err);
         CL_CHECK(err);
         CL_CHECK(clEnqueueWriteBuffer(
-            queue, data_device, CL_TRUE, 0,
+            queue, data_device, CL_FALSE, 0,
             ggml_nbytes(tensor), data, 0, NULL, NULL));

         // The original tensor memory is divided into scales and quants, i.e.,
@@ -5287,8 +5345,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
-        CL_CHECK(clReleaseMemObject(data_device));
+        RELEASE_LATER(data_device, evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(evt));
+            *last_evt = evt;
+        }

         tensor->extra = extra;

@@ -5411,7 +5472,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
             ggml_nbytes(tensor), NULL, &err);
         CL_CHECK(err);
         CL_CHECK(clEnqueueWriteBuffer(
-            queue, data_device, CL_TRUE, 0,
+            queue, data_device, CL_FALSE, 0,
             ggml_nbytes(tensor), data, 0, NULL, NULL));

         cl_buffer_region region;
@@ -5476,8 +5537,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
-        CL_CHECK(clReleaseMemObject(data_device));
+        RELEASE_LATER(data_device, evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(evt));
+            *last_evt = evt;
+        }

         tensor->extra = extra;
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
@@ -5632,7 +5696,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         cl_int err;
         cl_mem data_device;
         CL_CHECK((data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, ggml_nbytes(tensor), NULL, &err), err));
-        CL_CHECK(clEnqueueWriteBuffer(queue, data_device, CL_TRUE, 0, ggml_nbytes(tensor), data, 0, NULL, NULL));
+        CL_CHECK(clEnqueueWriteBuffer(queue, data_device, CL_FALSE, 0, ggml_nbytes(tensor), data, 0, NULL, NULL));

         cl_buffer_region region;
@@ -5686,8 +5750,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
-        CL_CHECK(clReleaseMemObject(data_device));
+        RELEASE_LATER(data_device, evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(evt));
+            *last_evt = evt;
+        }

         extra->size_ql = size_ql;
         extra->size_qh = size_qh;
@@ -5726,13 +5793,32 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     GGML_ASSERT(extra);

     CL_CHECK(clEnqueueWriteBuffer(
-        queue, extra->data_device, CL_TRUE, extra->offset + offset,
-        size, data, 0, NULL, NULL));
+        queue, extra->data_device, CL_FALSE, extra->offset + offset,
+        size, data, 0, NULL, last_evt));

     GGML_UNUSED(buffer);
 }

-static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
+    // Make sure all previously submitted commands in other devices are finished.
+    sync_with_other_backends(backend_ctx);
+    cl_event evt;
+    ggml_backend_opencl_buffer_set_tensor_impl(buffer, tensor, data, offset, size, backend_ctx->release_list, &evt);
+    CL_CHECK(clWaitForEvents(1, &evt));
+    CL_CHECK(clReleaseEvent(evt));
+    ggml_backend_opencl_do_deferred_memobj_release(backend_ctx->release_list);
+}
+
+static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    sync_with_other_backends(backend_ctx);
+    ggml_backend_opencl_buffer_set_tensor_impl(buf, tensor, data, offset, size, backend_ctx->release_list, nullptr);
+}
+
+static void ggml_backend_opencl_buffer_get_tensor_impl(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size, std::vector<DeferredMemobjReleaseInfo> &release_list, cl_event *last_evt) {
     GGML_ASSERT(tensor->extra);

     ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
@@ -5740,9 +5826,6 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
     cl_context context = backend_ctx->context;
     cl_command_queue queue = backend_ctx->queue;

-    // Make sure all previously submitted commands in other devices are finished.
-    sync_with_other_backends(backend_ctx);
-
 #ifdef GGML_OPENCL_SOA_Q
     // In end-to-end runs, get_tensor is usually used to get back the logits,
     // where we can simply do clEnqueueReadBuffer since they are f32.
@@ -5808,14 +5891,19 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};

-        cl_event evt;
+        cl_event kern_evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
-            global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
+            global_work_size, local_work_size, 0, NULL, &kern_evt));
+        cl_event read_evt;
         CL_CHECK(clEnqueueReadBuffer(
-            queue, data_device, CL_TRUE, offset,
-            size, data, 0, NULL, NULL));
-        CL_CHECK(clReleaseMemObject(data_device));
+            queue, data_device, CL_FALSE, offset,
+            size, data, 1, &kern_evt, &read_evt));
+        CL_CHECK(clReleaseEvent(kern_evt));
+        RELEASE_LATER(data_device, read_evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(read_evt));
+            *last_evt = read_evt;
+        }
         return;
     }
     if (tensor->type == GGML_TYPE_Q4_1) {
@@ -5882,14 +5970,19 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};

-        cl_event evt;
+        cl_event kern_evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
-            global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
+            global_work_size, local_work_size, 0, NULL, &kern_evt));
+        cl_event read_evt;
         CL_CHECK(clEnqueueReadBuffer(
-            queue, data_device, CL_TRUE, offset,
-            size, data, 0, NULL, NULL));
-        CL_CHECK(clReleaseMemObject(data_device));
+            queue, data_device, CL_FALSE, offset,
+            size, data, 1, &kern_evt, &read_evt));
+        CL_CHECK(clReleaseEvent(kern_evt));
+        RELEASE_LATER(data_device, read_evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(read_evt));
+            *last_evt = read_evt;
+        }
         return;
     }
     if (tensor->type == GGML_TYPE_MXFP4) {
@@ -5936,14 +6029,19 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};

-        cl_event evt;
+        cl_event kern_evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
-            global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
+            global_work_size, local_work_size, 0, NULL, &kern_evt));
+        cl_event read_evt;
         CL_CHECK(clEnqueueReadBuffer(
-            queue, data_device, CL_TRUE, offset,
-            size, data, 0, NULL, NULL));
-        CL_CHECK(clReleaseMemObject(data_device));
+            queue, data_device, CL_FALSE, offset,
+            size, data, 1, &kern_evt, &read_evt));
+        CL_CHECK(clReleaseEvent(kern_evt));
+        RELEASE_LATER(data_device, read_evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(read_evt));
+            *last_evt = read_evt;
+        }
         return;
     }
     if (tensor->type == GGML_TYPE_Q8_0) {
@@ -5992,14 +6090,19 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};

-        cl_event evt;
+        cl_event kern_evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
-            global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
+            global_work_size, local_work_size, 0, NULL, &kern_evt));
+        cl_event read_evt;
         CL_CHECK(clEnqueueReadBuffer(
-            queue, data_device, CL_TRUE, offset,
-            size, data, 0, NULL, NULL));
-        CL_CHECK(clReleaseMemObject(data_device));
+            queue, data_device, CL_FALSE, offset,
+            size, data, 1, &kern_evt, &read_evt));
+        CL_CHECK(clReleaseEvent(kern_evt));
+        RELEASE_LATER(data_device, read_evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(read_evt));
+            *last_evt = read_evt;
+        }
         return;
     }
     if (tensor->type == GGML_TYPE_IQ4_NL) {
@@ -6140,14 +6243,19 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};

-        cl_event evt;
+        cl_event kern_evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
-            global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
+            global_work_size, local_work_size, 0, NULL, &kern_evt));
+        cl_event read_evt;
         CL_CHECK(clEnqueueReadBuffer(
-            queue, data_device, CL_TRUE, offset,
-            size, data, 0, NULL, NULL));
-        CL_CHECK(clReleaseMemObject(data_device));
+            queue, data_device, CL_FALSE, offset,
+            size, data, 1, &kern_evt, &read_evt));
+        CL_CHECK(clReleaseEvent(kern_evt));
+        RELEASE_LATER(data_device, read_evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(read_evt));
+            *last_evt = read_evt;
+        }
         return;
     }
     if (tensor->type == GGML_TYPE_Q5_K) {
@@ -6309,14 +6417,19 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
         size_t global_work_size[] = {(size_t)n_blk, 1, 1};
         size_t local_work_size[] = {1, 1, 1};

-        cl_event evt;
+        cl_event kern_evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
-            global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
+            global_work_size, local_work_size, 0, NULL, &kern_evt));
+        cl_event read_evt;
         CL_CHECK(clEnqueueReadBuffer(
-            queue, data_device, CL_TRUE, offset,
-            size, data, 0, NULL, NULL));
-        CL_CHECK(clReleaseMemObject(data_device));
+            queue, data_device, CL_FALSE, offset,
+            size, data, 1, &kern_evt, &read_evt));
+        CL_CHECK(clReleaseEvent(kern_evt));
+        RELEASE_LATER(data_device, read_evt, release_list);
+        if (last_evt) {
+            CL_CHECK(clRetainEvent(read_evt));
+            *last_evt = read_evt;
+        }
         return;
     }
 #endif // GGML_OPENCL_SOA_Q
@@ -6324,12 +6437,28 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
     ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;

     CL_CHECK(clEnqueueReadBuffer(
-        queue, extra->data_device, CL_TRUE, extra->offset + tensor->view_offs + offset,
-        size, data, 0, NULL, NULL));
+        queue, extra->data_device, CL_FALSE, extra->offset + tensor->view_offs + offset,
+        size, data, 0, NULL, last_evt));

     GGML_UNUSED(buffer);
 }

+static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
+    cl_event evt;
+    ggml_backend_opencl_buffer_get_tensor_impl(buffer, tensor, data, offset, size, backend_ctx->release_list, &evt);
+    CL_CHECK(clWaitForEvents(1, &evt));
+    CL_CHECK(clReleaseEvent(evt));
+    ggml_backend_opencl_do_deferred_memobj_release(backend_ctx->release_list);
+}
+
+static void ggml_backend_opencl_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    ggml_backend_opencl_buffer_get_tensor_impl(buf, tensor, data, offset, size, backend_ctx->release_list, NULL);
+}
+
 static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_dev_t dev = buffer->buft->device;
     ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
@@ -6463,7 +6592,7 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
     props->type = ggml_backend_opencl_device_get_type(dev);
     ggml_backend_opencl_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = ggml_backend_dev_caps {
-        /* .async                 = */ false,
+        /* .async                 = */ true,
         /* .host_buffer           = */ false,
         /* .buffer_from_host_ptr  = */ false,
         /* .events                = */ false,
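One more pattern worth calling out from this patch: the cross-backend copy path
stages through a host buffer that is freed from an event callback instead of
blocking the queue. Reduced to its essentials (`nbytes` and `dst_mem` are
illustrative names, mirroring free_tmp_tensor_buf above):

    static void free_staging(cl_event evt, cl_int status, void *userdata) {
        delete[] (char *)userdata;  // runs once the associated write completes
    }

    char *tmp = new char[nbytes];
    // ... fill tmp from the source backend ...
    cl_event evt;
    CL_CHECK(clEnqueueWriteBuffer(queue, dst_mem, CL_FALSE, 0, nbytes, tmp, 0, NULL, &evt));
    CL_CHECK(clSetEventCallback(evt, CL_COMPLETE, free_staging, tmp));
    CL_CHECK(clReleaseEvent(evt));  // drop our reference; the registered callback still fires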
From 5b85b9bf77bfb4a971396fb95e6850c54097214d Mon Sep 17 00:00:00 2001
From: Jan Solanti
Date: Tue, 28 Apr 2026 15:37:18 +0300
Subject: [PATCH 5/5] hacks to make the ROCm compiler happy(ier)

---
 ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl | 2 +-
 ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl | 2 +-
 ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)
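All three hunks below make the same one-word change: the `inline` qualifier is
dropped from non-kernel helper functions, which the ROCm OpenCL compiler
apparently mishandles (inferred from the subject line; the exact failure mode
is not stated here). An illustrative OpenCL C sketch of the before/after shape,
with made-up helper names:

    // before: `inline void helper(...)` tripped up the problematic compiler;
    // after: a plain function definition with the same body
    void helper_scale_add(global float *dst, global const float *src, float s) {
        int i = get_global_id(0);
        dst[i] += s * src[i];
    }

    kernel void k_scale_add(global float *dst, global const float *src, float s) {
        helper_scale_add(dst, src, s);
    }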
diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl
index 7dc55de72df..782b04d5b34 100644
--- a/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl
@@ -94,7 +94,7 @@ inline float block_q_4_0_dot_y_flat(
 #define N_SIMDWIDTH 32
 #endif

-inline void mul_vec_q_n_f32_8x_flat(
+void mul_vec_q_n_f32_8x_flat(
     global char  * src0_q,
     global half  * src0_d,
     global float * src1,
diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl
index 2578d71d369..d8314512bc9 100644
--- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl
@@ -98,7 +98,7 @@ inline float mm_block_q_4_0_dot_y_flat(
 // This variant performs 1d blocking with 8x output.
 // Eeach simdgroup outputs 8 values on `n0` dim (row in the output matrix).
 //
-inline void mul_mat_q_n_f32_1d_8x_flat(
+void mul_mat_q_n_f32_1d_8x_flat(
     global uchar * src0_q,
     global half  * src0_d,
     global float * src1,
diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl
index 79882d27e23..ee1dfe97866 100644
--- a/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl
@@ -105,7 +105,7 @@ inline float block_q_4_0_dot_y_flat(
 #define N_SIMDWIDTH 32
 #endif

-inline void mul_vec_q_n_f32_8x_flat(
+void mul_vec_q_n_f32_8x_flat(
     global uchar * src0_q,
     global half  * src0_d,
     global float * src1,