diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 5abf2290651..8b78c37a8d1 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -224,6 +224,7 @@ struct sycl_device_info {
     // cudaOccupancyMaxActiveBlocksPerMultiprocessor
     bool    vmm;                 // virtual memory support
     size_t  total_vram;
+    bool    host_unified_memory; // true for integrated GPUs (CPU and GPU share the same physical memory)
     sycl_hw_info hw_info;
     optimize_feature opt_feature;
 };
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index f06147eeeb8..bba4a449c30 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -105,6 +105,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
         info.max_work_group_sizes[i] = prop.get_max_work_group_size();
         info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
         info.devices[i].hw_info = get_device_hw_info(&device);
+        info.devices[i].host_unified_memory = device.get_info<sycl::info::device::host_unified_memory>();
     }
@@ -352,12 +353,13 @@ struct ggml_backend_sycl_buffer_context {
     int device;
     void * dev_ptr = nullptr;
     queue_ptr stream;
+    bool is_shared; // true when malloc_shared was used (integrated/UMA GPU)
     std::string name;
     optimize_feature opt_feature;
     std::vector<ggml_tensor_extra_gpu *> tensor_extras;

-    ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
-        device(device), dev_ptr(dev_ptr), stream(stream) {
+    ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream, bool is_shared = false) :
+        device(device), dev_ptr(dev_ptr), stream(stream), is_shared(is_shared) {
         check_allow_gpu_index(device);
         name = (GGML_SYCL_NAME + std::to_string(device));
         opt_feature = ggml_sycl_info().devices[device].opt_feature;
@@ -457,6 +459,13 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
     GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
+
+    if (ctx->is_shared) {
+        // Shared (UMA) buffer: CPU can access directly, no DMA needed
+        memcpy((char *)tensor->data + offset, data, size);
+        return;
+    }
+
     ggml_sycl_set_device(ctx->device);
     auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
@@ -486,6 +495,12 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
     GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;

+    if (ctx->is_shared) {
+        // Shared (UMA) buffer: CPU can access directly, no DMA needed
+        memcpy(data, (const char *)tensor->data + offset, size);
+        return;
+    }
+
     ggml_sycl_set_device(ctx->device);
     auto stream = dpct::dev_mgr::instance().get_device(ctx->device).default_queue();
@@ -650,6 +665,7 @@ static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
 struct ggml_backend_sycl_buffer_type_context {
     int device;
     std::string name;
+    bool use_shared_buffers; // true for integrated/UMA GPUs (host_unified_memory)

     // each buffer type has its own stream
     queue_ptr stream = nullptr;
@@ -670,13 +686,16 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
     void * dev_ptr;
-    SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
-        size, *stream)));
+    if (buft_ctx->use_shared_buffers) {
+        SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_shared(size, *stream)));
+    } else {
+        SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(size, *stream)));
+    }
     if (!dev_ptr) {
         GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
         return nullptr;
     }
-    ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
+    ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream, buft_ctx->use_shared_buffers);
     return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
 } catch (sycl::exception const &exc) {
@@ -740,10 +759,12 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
         for (int i = 0; i < dev_count; i++) {
             auto & device_i = dpct::dev_mgr::instance().get_device(i);
             queue_ptr stream = &(device_i.default_queue());
+            bool use_shared = ggml_sycl_info().devices[i].host_unified_memory &&
+                              getenv("GGML_SYCL_NO_SHARED") == nullptr;
             ggml_backend_sycl_buffer_types[i] = {
                 /* .iface = */ ggml_backend_sycl_buffer_type_interface,
                 /* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), i),
-                /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream},
+                /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), use_shared, stream},
             };
         }
         ggml_backend_sycl_buffer_type_initialized = true;
@@ -766,10 +787,12 @@ static ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_syc
     if (!ggml_backend_sycl_buffer_type_initialized) {
         for (int i = 0; i < ggml_sycl_info().device_count; i++) {
+            bool use_shared = ggml_sycl_info().devices[i].host_unified_memory &&
+                              getenv("GGML_SYCL_NO_SHARED") == nullptr;
             ggml_backend_sycl_buffer_types[i] = {
                 /* .iface = */ ggml_backend_sycl_buffer_type_interface,
                 /* .device = */ nullptr,
-                /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), ctx->stream(i, 0)},
+                /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), use_shared, ctx->stream(i, 0)},
             };
         }
         ggml_backend_sycl_buffer_type_initialized = true;
@@ -5304,6 +5327,12 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
     check_allow_gpu_index(device);

+    if (ggml_sycl_info().devices[device].host_unified_memory &&
+        getenv("GGML_SYCL_NO_SHARED") == nullptr) {
+        GGML_LOG_INFO("%s: device %d has unified memory - using sycl::malloc_shared for zero-copy tensor access\n",
+                      __func__, device);
+    }
+
     ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(device);
     if (ctx == nullptr) {
         GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
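
For reference, the detect-then-allocate policy the patch wires through the buffer-type context can be reduced to the standalone sketch below. It assumes a SYCL 2020 toolchain (e.g. oneAPI DPC++); everything outside the SYCL API itself (the main() scaffolding, buffer size, printf) is illustrative and not part of the patch. Note that sycl::info::device::host_unified_memory is deprecated in SYCL 2020 but still available in DPC++, and it is the query the patch uses.

// Minimal sketch of the unified-memory allocation policy above.
#include <sycl/sycl.hpp>

#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <vector>

int main() {
    sycl::queue  q{sycl::gpu_selector_v};
    sycl::device dev = q.get_device();

    // Same query the patch adds in ggml_sycl_init(): true on integrated
    // GPUs, where CPU and GPU share the same physical memory.
    bool unified = dev.get_info<sycl::info::device::host_unified_memory>();

    // The patch gates the shared path behind the GGML_SYCL_NO_SHARED
    // environment variable (only its presence is checked, not its value).
    bool use_shared = unified && std::getenv("GGML_SYCL_NO_SHARED") == nullptr;

    const size_t size = 1024;
    void * ptr = use_shared ? sycl::malloc_shared(size, q)
                            : sycl::malloc_device(size, q);

    if (use_shared) {
        // Zero-copy path: the host touches the buffer directly, as the
        // patched ggml_backend_sycl_buffer_set_tensor() does via memcpy.
        std::memset(ptr, 0, size);
    } else {
        // Discrete-GPU path: an explicit copy through the queue is needed.
        std::vector<char> host(size, 0);
        q.memcpy(ptr, host.data(), size).wait();
    }

    std::printf("unified=%d use_shared=%d\n", (int) unified, (int) use_shared);
    sycl::free(ptr, q);
    return 0;
}

Since only the presence of GGML_SYCL_NO_SHARED is checked, running with GGML_SYCL_NO_SHARED=1 (or any value) restores the previous malloc_device-only behavior even on integrated GPUs, which gives an easy A/B switch when benchmarking the zero-copy path.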