diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 7ebb4ec0297..f66facc856a 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -737,6 +737,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer |
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
+## Compile-time Flags
+
+Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spot.
+
+| Name | Function |
+|-----------------|----------------------------------------------------------------------------------|
+| DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
+
## Design Rule
- Open to all contributors.
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 5abf2290651..eec36e8db9a 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -25,6 +25,7 @@
#include "presets.hpp"
#include "type.hpp"
#include "sycl_hw.hpp"
+#include "fattn-buffers.hpp"
namespace syclexp = sycl::ext::oneapi::experimental;
@@ -404,12 +405,16 @@ struct ggml_backend_sycl_context {
std::unique_ptr pools[GGML_SYCL_MAX_DEVICES];
std::unordered_map>> scratchpad_map;
+ std::unique_ptr fattn_bufs[GGML_SYCL_MAX_DEVICES];
+
std::unique_ptr host_pools[GGML_SYCL_MAX_DEVICES];
static std::unique_ptr new_pool_for_device(queue_ptr qptr, int device);
static std::unique_ptr new_pool_for_host(queue_ptr qptr, int device);
+ static std::unique_ptr new_fattn_kv_buffers(queue_ptr qptr, int device);
+
ggml_sycl_pool & pool(int device) {
if (pools[device] == nullptr) {
pools[device] = new_pool_for_device(stream(device,0), device);
@@ -421,6 +426,17 @@ struct ggml_backend_sycl_context {
return pool(device);
}
+ ggml_sycl_fattn_kv_buffers & fattn_buffers(int device) {
+ if (fattn_bufs[device] == nullptr) {
+ fattn_bufs[device] = new_fattn_kv_buffers(stream(device, 0), device);
+ }
+ return *fattn_bufs[device];
+ }
+
+ ggml_sycl_fattn_kv_buffers & fattn_buffers() {
+ return fattn_buffers(device);
+ }
+
#ifdef GGML_SYCL_GRAPH
std::unique_ptr> exec_graph = nullptr;
#endif
diff --git a/ggml/src/ggml-sycl/fattn-buffers.cpp b/ggml/src/ggml-sycl/fattn-buffers.cpp
new file mode 100644
index 00000000000..46cf6d551f1
--- /dev/null
+++ b/ggml/src/ggml-sycl/fattn-buffers.cpp
@@ -0,0 +1,56 @@
+//
+// MIT license
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "common.hpp"
+
+sycl::half * ggml_sycl_fattn_kv_buffers::kv_buffer::ensure_half(size_t n_elems) {
+ const size_t need_bytes = n_elems * sizeof(sycl::half);
+
+ if (capacity >= need_bytes) {
+ return ptr;
+ }
+
+ if (ptr) {
+ SYCL_CHECK(CHECK_TRY_ERROR(qptr->wait()));
+ SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
+ ptr = nullptr;
+ capacity = 0;
+ }
+
+ size_t cap = 0;
+ while (cap < need_bytes) {
+ cap += CHUNK_SIZE;
+ }
+
+ void * dev_ptr;
+ SYCL_CHECK(
+ CHECK_TRY_ERROR(dev_ptr = sycl::malloc_device(
+ cap, *qptr)));
+
+ if (!dev_ptr) {
+ GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, cap);
+ GGML_ABORT("fattn buffer alloc failed");
+ }
+
+ ptr = static_cast(dev_ptr);
+ capacity = cap;
+ return ptr;
+}
+
+ggml_sycl_fattn_kv_buffers::kv_buffer::~kv_buffer() {
+#ifdef DEBUG_SYCL_POOL
+ GGML_LOG_INFO("ggml_sycl_fattn_kv_buffer[%d]: %.2f MiB\n", device, capacity / 1024.0 / 1024.0);
+#endif
+ if (ptr) {
+ SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
+ }
+}
diff --git a/ggml/src/ggml-sycl/fattn-buffers.hpp b/ggml/src/ggml-sycl/fattn-buffers.hpp
new file mode 100644
index 00000000000..c00461de620
--- /dev/null
+++ b/ggml/src/ggml-sycl/fattn-buffers.hpp
@@ -0,0 +1,63 @@
+//
+// MIT license
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_FATTN_BUFFERS_HPP
+#define GGML_SYCL_FATTN_BUFFERS_HPP
+
+#include
+
+typedef sycl::queue *queue_ptr;
+
+struct ggml_sycl_fattn_kv_buffers {
+ // buffers grow in chunks of this size
+ static constexpr size_t CHUNK_SIZE = 16ull << 20; // 16 MiB
+
+ struct kv_buffer {
+ kv_buffer(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {}
+ ~kv_buffer();
+
+ kv_buffer(const kv_buffer &) = delete;
+ kv_buffer & operator=(const kv_buffer &) = delete;
+
+ sycl::half * ensure_half(size_t n_elems);
+
+ private:
+ sycl::half * ptr = nullptr;
+ size_t capacity = 0;
+ queue_ptr qptr = nullptr;
+ [[maybe_unused]] int device = 0;
+ };
+
+ kv_buffer K;
+ kv_buffer V;
+
+ ggml_sycl_fattn_kv_buffers(queue_ptr qptr, int device) : K(qptr, device), V(qptr, device) {}
+
+ ggml_sycl_fattn_kv_buffers(const ggml_sycl_fattn_kv_buffers &) = delete;
+ ggml_sycl_fattn_kv_buffers & operator=(const ggml_sycl_fattn_kv_buffers &) = delete;
+};
+
+/**
+ * Imitates `ggml_sycl_pool_alloc` to keep the code calling alloc unchanged.
+ */
+struct ggml_sycl_fattn_alloc {
+ ggml_sycl_fattn_kv_buffers::kv_buffer & buf;
+ sycl::half * ptr = nullptr;
+
+ explicit ggml_sycl_fattn_alloc(ggml_sycl_fattn_kv_buffers::kv_buffer & buf_) : buf(buf_) {}
+
+ sycl::half * alloc(size_t n_elems) {
+ ptr = buf.ensure_half(n_elems);
+ return ptr;
+ }
+};
+#endif
diff --git a/ggml/src/ggml-sycl/fattn-common.hpp b/ggml/src/ggml-sycl/fattn-common.hpp
index ed00d03c3b6..03f0c2623c8 100644
--- a/ggml/src/ggml-sycl/fattn-common.hpp
+++ b/ggml/src/ggml-sycl/fattn-common.hpp
@@ -5,6 +5,7 @@
#include "common.hpp"
#include "convert.hpp"
#include "vecdotq.hpp"
+#include "fattn-buffers.hpp"
#include "ggml.h"
@@ -918,12 +919,13 @@ void launch_fattn(
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
ggml_sycl_pool & pool = ctx.pool();
+ ggml_sycl_fattn_kv_buffers & fbuf = ctx.fattn_buffers();
dpct::queue_ptr main_stream = ctx.stream();
const int id = ggml_sycl_get_device();
const int nsm = ggml_sycl_info().devices[id].nsm;
- ggml_sycl_pool_alloc K_f16(pool);
- ggml_sycl_pool_alloc V_f16(pool);
+ ggml_sycl_fattn_alloc K_f16(fbuf.K);
+ ggml_sycl_fattn_alloc V_f16(fbuf.V);
ggml_sycl_pool_alloc KV_max(pool);
ggml_sycl_pool_alloc dst_tmp(pool);
ggml_sycl_pool_alloc dst_tmp_meta(pool);
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index f06147eeeb8..19afe0a6c25 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -1281,6 +1281,23 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}
~ggml_sycl_pool_leg() {
+#ifdef DEBUG_SYCL_POOL
+ int n_cached = 0;
+ size_t bytes_cached = 0;
+ for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
+ if (buffer_pool[i].ptr != nullptr) {
+ ++n_cached;
+ bytes_cached += buffer_pool[i].size;
+ }
+ }
+ GGML_LOG_INFO("%s: %d buffers, cached = %.2f MiB\n", __func__,
+ n_cached, bytes_cached / 1024.0 / 1024.0);
+ const auto slots = format_slots_in_alloc_order();
+ if (!slots.empty()) {
+ GGML_LOG_INFO("%s: slots MiB: %s\n", __func__, slots.c_str());
+ }
+#endif
+
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
ggml_sycl_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) {
@@ -1291,6 +1308,26 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
GGML_ASSERT(pool_size == 0);
}
+#ifdef DEBUG_SYCL_POOL
+ std::string format_slots_in_alloc_order() const {
+ std::string line;
+ char buf[32];
+ bool first = true;
+ for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
+ if (buffer_pool[i].ptr == nullptr) {
+ continue;
+ }
+ if (!first) {
+ line += '/';
+ }
+ first = false;
+ snprintf(buf, sizeof(buf), "%.2f", buffer_pool[i].size / 1024.0 / 1024.0);
+ line += buf;
+ }
+ return line;
+ }
+#endif
+
void * alloc(size_t size, size_t * actual_size) override {
#ifdef DEBUG_sycl_MALLOC
int nnz = 0;
@@ -1454,6 +1491,10 @@ std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(q
return std::unique_ptr(new ggml_sycl_pool_leg(qptr, device));
}
+std::unique_ptr ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
+ return std::unique_ptr(new ggml_sycl_fattn_kv_buffers(qptr, device));
+}
+
// TBD pool with virtual memory management
// struct ggml_sycl_pool_vmm : public ggml_sycl_pool