Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/backend/SYCL.md
Original file line number Diff line number Diff line change
Expand Up @@ -737,6 +737,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|

## Compile-time Flags

Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spot.

| Name | Function |
|-----------------|----------------------------------------------------------------------------------|
| DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |

## Design Rule

- Open to all contributors.
Expand Down
16 changes: 16 additions & 0 deletions ggml/src/ggml-sycl/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "presets.hpp"
#include "type.hpp"
#include "sycl_hw.hpp"
#include "fattn-buffers.hpp"

namespace syclexp = sycl::ext::oneapi::experimental;

Expand Down Expand Up @@ -404,12 +405,16 @@ struct ggml_backend_sycl_context {
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
std::unordered_map<sycl::queue *, std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>>> scratchpad_map;

std::unique_ptr<ggml_sycl_fattn_kv_buffers> fattn_bufs[GGML_SYCL_MAX_DEVICES];

std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];

static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);

static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);

static std::unique_ptr<ggml_sycl_fattn_kv_buffers> new_fattn_kv_buffers(queue_ptr qptr, int device);

ggml_sycl_pool & pool(int device) {
if (pools[device] == nullptr) {
pools[device] = new_pool_for_device(stream(device,0), device);
Expand All @@ -421,6 +426,17 @@ struct ggml_backend_sycl_context {
return pool(device);
}

ggml_sycl_fattn_kv_buffers & fattn_buffers(int device) {
if (fattn_bufs[device] == nullptr) {
fattn_bufs[device] = new_fattn_kv_buffers(stream(device, 0), device);
}
return *fattn_bufs[device];
}

ggml_sycl_fattn_kv_buffers & fattn_buffers() {
return fattn_buffers(device);
}

#ifdef GGML_SYCL_GRAPH
std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
#endif
Expand Down
60 changes: 60 additions & 0 deletions ggml/src/ggml-sycl/fattn-buffers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
//
// MIT license
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: MIT
//

//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//

#include "common.hpp"
Comment thread
arthw marked this conversation as resolved.

std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
return std::unique_ptr<ggml_sycl_fattn_kv_buffers>(new ggml_sycl_fattn_kv_buffers(qptr, device));
}

sycl::half * ggml_sycl_fattn_kv_buffers::kv_buffer::ensure_half(size_t n_elems) {
const size_t need_bytes = n_elems * sizeof(sycl::half);

if (capacity >= need_bytes) {
return ptr;
}

if (ptr) {
SYCL_CHECK(CHECK_TRY_ERROR(qptr->wait()));
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
ptr = nullptr;
capacity = 0;
}

size_t cap = 0;
while (cap < need_bytes) {
cap += CHUNK_SIZE;
}

void * dev_ptr;
SYCL_CHECK(
CHECK_TRY_ERROR(dev_ptr = sycl::malloc_device(
cap, *qptr)));

if (!dev_ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, cap);
GGML_ABORT("fattn buffer alloc failed");
}

ptr = static_cast<sycl::half *>(dev_ptr);
capacity = cap;
return ptr;
}

ggml_sycl_fattn_kv_buffers::kv_buffer::~kv_buffer() {
#ifdef DEBUG_SYCL_POOL
GGML_LOG_INFO("ggml_sycl_fattn_kv_buffer[%d]: %.2f MiB\n", device, capacity / 1024.0 / 1024.0);
#endif
if (ptr) {
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
}
}
63 changes: 63 additions & 0 deletions ggml/src/ggml-sycl/fattn-buffers.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
//
// MIT license
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: MIT
//

//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//

#ifndef GGML_SYCL_FATTN_BUFFERS_HPP
#define GGML_SYCL_FATTN_BUFFERS_HPP

#include <sycl/sycl.hpp>

typedef sycl::queue *queue_ptr;

struct ggml_sycl_fattn_kv_buffers {
// buffers grow in chunks of this size
static constexpr size_t CHUNK_SIZE = 16ull << 20; // 16 MiB

struct kv_buffer {
kv_buffer(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {}
~kv_buffer();

kv_buffer(const kv_buffer &) = delete;
kv_buffer & operator=(const kv_buffer &) = delete;

sycl::half * ensure_half(size_t n_elems);

private:
sycl::half * ptr = nullptr;
size_t capacity = 0;
queue_ptr qptr = nullptr;
[[maybe_unused]] int device = 0;
};

kv_buffer K;
kv_buffer V;

ggml_sycl_fattn_kv_buffers(queue_ptr qptr, int device) : K(qptr, device), V(qptr, device) {}

ggml_sycl_fattn_kv_buffers(const ggml_sycl_fattn_kv_buffers &) = delete;
ggml_sycl_fattn_kv_buffers & operator=(const ggml_sycl_fattn_kv_buffers &) = delete;
};

/**
* Imitates `ggml_sycl_pool_alloc` to keep the code calling alloc unchanged.
*/
struct ggml_sycl_fattn_alloc {
ggml_sycl_fattn_kv_buffers::kv_buffer & buf;
sycl::half * ptr = nullptr;

explicit ggml_sycl_fattn_alloc(ggml_sycl_fattn_kv_buffers::kv_buffer & buf_) : buf(buf_) {}

sycl::half * alloc(size_t n_elems) {
ptr = buf.ensure_half(n_elems);
return ptr;
}
};
#endif
6 changes: 4 additions & 2 deletions ggml/src/ggml-sycl/fattn-common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "common.hpp"
#include "convert.hpp"
#include "vecdotq.hpp"
#include "fattn-buffers.hpp"

#include "ggml.h"

Expand Down Expand Up @@ -918,12 +919,13 @@ void launch_fattn(
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);

ggml_sycl_pool & pool = ctx.pool();
ggml_sycl_fattn_kv_buffers & fbuf = ctx.fattn_buffers();
dpct::queue_ptr main_stream = ctx.stream();
const int id = ggml_sycl_get_device();
const int nsm = ggml_sycl_info().devices[id].nsm;

ggml_sycl_pool_alloc<sycl::half> K_f16(pool);
ggml_sycl_pool_alloc<sycl::half> V_f16(pool);
ggml_sycl_fattn_alloc K_f16(fbuf.K);
ggml_sycl_fattn_alloc V_f16(fbuf.V);
Copy link
Copy Markdown
Contributor Author

@sanmai sanmai May 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I considered adding a no-op template to make it look more uniform:

ggml_sycl_fattn_alloc<sycl::half>   K_f16(fbuf.K);
ggml_sycl_fattn_alloc<sycl::half>   V_f16(fbuf.V);

ggml_sycl_pool_alloc<int> KV_max(pool);
ggml_sycl_pool_alloc<float> dst_tmp(pool);
ggml_sycl_pool_alloc<sycl::float2> dst_tmp_meta(pool);
Expand Down
37 changes: 37 additions & 0 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1281,6 +1281,23 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}

~ggml_sycl_pool_leg() {
#ifdef DEBUG_SYCL_POOL
int n_cached = 0;
size_t bytes_cached = 0;
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
if (buffer_pool[i].ptr != nullptr) {
++n_cached;
bytes_cached += buffer_pool[i].size;
}
}
GGML_LOG_INFO("%s: %d buffers, cached = %.2f MiB\n", __func__,
n_cached, bytes_cached / 1024.0 / 1024.0);
const auto slots = format_slots_in_alloc_order();
if (!slots.empty()) {
GGML_LOG_INFO("%s: slots MiB: %s\n", __func__, slots.c_str());
}
#endif

for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
ggml_sycl_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) {
Expand All @@ -1291,6 +1308,26 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
GGML_ASSERT(pool_size == 0);
}

#ifdef DEBUG_SYCL_POOL
std::string format_slots_in_alloc_order() const {
std::string line;
char buf[32];
bool first = true;
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
if (buffer_pool[i].ptr == nullptr) {
continue;
}
if (!first) {
line += '/';
}
first = false;
snprintf(buf, sizeof(buf), "%.2f", buffer_pool[i].size / 1024.0 / 1024.0);
line += buf;
}
return line;
}
#endif

void * alloc(size_t size, size_t * actual_size) override {
#ifdef DEBUG_sycl_MALLOC
int nnz = 0;
Expand Down