Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions ggml/src/ggml-sycl/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,50 @@ struct ggml_sycl_pool {
virtual void free(void * ptr, size_t size) = 0;
};

struct ggml_sycl_fattn_kv_buffers {
// buffers grow in chunks of this size
static constexpr size_t CHUNK_SIZE = 16ull << 20; // 16 MiB

struct kv_buffer {
kv_buffer(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {}
~kv_buffer();

kv_buffer(const kv_buffer &) = delete;
kv_buffer & operator=(const kv_buffer &) = delete;

sycl::half * ensure_half(size_t n_elems);

private:
sycl::half * ptr = nullptr;
size_t capacity = 0;
queue_ptr qptr = nullptr;
int device = 0;
};

kv_buffer K;
kv_buffer V;

ggml_sycl_fattn_kv_buffers(queue_ptr qptr, int device) : K(qptr, device), V(qptr, device) {}

ggml_sycl_fattn_kv_buffers(const ggml_sycl_fattn_kv_buffers &) = delete;
ggml_sycl_fattn_kv_buffers & operator=(const ggml_sycl_fattn_kv_buffers &) = delete;
};

/**
* Imitates `ggml_sycl_pool_alloc` to keep the code calling alloc unchanged.
*/
struct ggml_sycl_fattn_alloc {
ggml_sycl_fattn_kv_buffers::kv_buffer & buf;
sycl::half * ptr = nullptr;

explicit ggml_sycl_fattn_alloc(ggml_sycl_fattn_kv_buffers::kv_buffer & buf_) : buf(buf_) {}

sycl::half * alloc(size_t n_elems) {
ptr = buf.ensure_half(n_elems);
return ptr;
Copy link
Copy Markdown
Author

@sanmai sanmai May 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the calling code does not use the return value but pool_alloc::alloc returns so keep doing that too to reduce the surprise if someone changes the old code to use the return value later

}
};

template<typename T>
struct ggml_sycl_pool_alloc {
ggml_sycl_pool * pool = nullptr;
Expand Down Expand Up @@ -404,12 +448,16 @@ struct ggml_backend_sycl_context {
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
std::unordered_map<sycl::queue *, std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>>> scratchpad_map;

std::unique_ptr<ggml_sycl_fattn_kv_buffers> fattn_bufs[GGML_SYCL_MAX_DEVICES];

std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];

static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);

static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);

static std::unique_ptr<ggml_sycl_fattn_kv_buffers> new_fattn_kv_buffers(queue_ptr qptr, int device);

ggml_sycl_pool & pool(int device) {
if (pools[device] == nullptr) {
pools[device] = new_pool_for_device(stream(device,0), device);
Expand All @@ -421,6 +469,17 @@ struct ggml_backend_sycl_context {
return pool(device);
}

ggml_sycl_fattn_kv_buffers & fattn_buffers(int device) {
if (fattn_bufs[device] == nullptr) {
fattn_bufs[device] = new_fattn_kv_buffers(stream(device, 0), device);
}
return *fattn_bufs[device];
}

ggml_sycl_fattn_kv_buffers & fattn_buffers() {
return fattn_buffers(device);
}

#ifdef GGML_SYCL_GRAPH
std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
#endif
Expand Down
5 changes: 3 additions & 2 deletions ggml/src/ggml-sycl/fattn-common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -918,12 +918,13 @@ void launch_fattn(
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);

ggml_sycl_pool & pool = ctx.pool();
ggml_sycl_fattn_kv_buffers & fbuf = ctx.fattn_buffers();
dpct::queue_ptr main_stream = ctx.stream();
const int id = ggml_sycl_get_device();
const int nsm = ggml_sycl_info().devices[id].nsm;

ggml_sycl_pool_alloc<sycl::half> K_f16(pool);
ggml_sycl_pool_alloc<sycl::half> V_f16(pool);
ggml_sycl_fattn_alloc K_f16(fbuf.K);
ggml_sycl_fattn_alloc V_f16(fbuf.V);
Copy link
Copy Markdown
Author

@sanmai sanmai May 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I considered adding a no-op template to make it look more uniform:

ggml_sycl_fattn_alloc<sycl::half>   K_f16(fbuf.K);
ggml_sycl_fattn_alloc<sycl::half>   V_f16(fbuf.V);

ggml_sycl_pool_alloc<int> KV_max(pool);
ggml_sycl_pool_alloc<float> dst_tmp(pool);
ggml_sycl_pool_alloc<sycl::float2> dst_tmp_meta(pool);
Expand Down
82 changes: 82 additions & 0 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1281,6 +1281,23 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}

~ggml_sycl_pool_leg() {
#ifdef DEBUG_SYCL_POOL
int n_cached = 0;
size_t bytes_cached = 0;
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
if (buffer_pool[i].ptr != nullptr) {
++n_cached;
bytes_cached += buffer_pool[i].size;
}
}
GGML_LOG_INFO("%s: %d buffers, cached = %.2f MiB\n", __func__,
n_cached, bytes_cached / 1024.0 / 1024.0);
const auto slots = format_slots_in_alloc_order();
if (!slots.empty()) {
GGML_LOG_INFO("%s: slots MiB: %s\n", __func__, slots.c_str());
}
#endif

for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
ggml_sycl_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) {
Expand All @@ -1291,6 +1308,26 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
GGML_ASSERT(pool_size == 0);
}

#ifdef DEBUG_SYCL_POOL
std::string format_slots_in_alloc_order() const {
std::string line;
char buf[32];
bool first = true;
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
if (buffer_pool[i].ptr == nullptr) {
continue;
}
if (!first) {
line += '/';
}
first = false;
snprintf(buf, sizeof(buf), "%.2f", buffer_pool[i].size / 1024.0 / 1024.0);
line += buf;
}
return line;
}
#endif

void * alloc(size_t size, size_t * actual_size) override {
#ifdef DEBUG_sycl_MALLOC
int nnz = 0;
Expand Down Expand Up @@ -1454,6 +1491,51 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(q
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
}

std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
return std::unique_ptr<ggml_sycl_fattn_kv_buffers>(new ggml_sycl_fattn_kv_buffers(qptr, device));
}

sycl::half * ggml_sycl_fattn_kv_buffers::kv_buffer::ensure_half(size_t n_elems) {
const size_t need_bytes = n_elems * sizeof(sycl::half);
if (capacity >= need_bytes) {
return ptr;
}
if (ptr) {
SYCL_CHECK(CHECK_TRY_ERROR(qptr->wait()));
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
ptr = nullptr;
capacity = 0;
}

size_t cap = 0;
while (cap < need_bytes) {
cap += CHUNK_SIZE;
}

void * dev_ptr;
SYCL_CHECK(
CHECK_TRY_ERROR(dev_ptr = sycl::malloc_device(
cap, *qptr)));

if (!dev_ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, cap);
GGML_ABORT("fattn buffer alloc failed");
}

ptr = static_cast<sycl::half *>(dev_ptr);
capacity = cap;
return ptr;
}

ggml_sycl_fattn_kv_buffers::kv_buffer::~kv_buffer() {
#ifdef DEBUG_SYCL_POOL
GGML_LOG_INFO("ggml_sycl_fattn_kv_buffer[%d]: %.2f MiB\n", device, capacity / 1024.0 / 1024.0);
#endif
if (ptr) {
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
}
}

// TBD pool with virtual memory management
// struct ggml_sycl_pool_vmm : public ggml_sycl_pool

Expand Down