Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 50 additions & 46 deletions wgpu-hal/src/metal/adapter.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use objc2::rc::autoreleasepool;
use objc2::runtime::{AnyObject, ProtocolObject, Sel};
use objc2::{available, sel};
use objc2_foundation::{NSOperatingSystemVersion, NSProcessInfo};
Expand All @@ -9,6 +10,7 @@ use objc2_metal::{
use wgt::{AstcBlock, AstcChannel};

use alloc::{string::ToString as _, sync::Arc, vec::Vec};
use core::mem::ManuallyDrop;
use core::sync::atomic;

use crate::metal::QueueShared;
Expand Down Expand Up @@ -79,52 +81,54 @@ impl crate::Adapter for super::Adapter {
limits: &wgt::Limits,
_memory_hints: &wgt::MemoryHints,
) -> Result<crate::OpenDevice<super::Api>, crate::DeviceError> {
let queue = self
.shared
.device
.newCommandQueueWithMaxCommandBufferCount(MAX_COMMAND_BUFFERS)
.unwrap();

// Acquiring the meaning of timestamp ticks is hard with Metal!
// The only thing there is a method correlating cpu & gpu timestamps (`device.sample_timestamps`).
// Users are supposed to call this method twice and calculate the difference,
// see "Converting GPU Timestamps into CPU Time":
// https://developer.apple.com/documentation/metal/gpu_counters_and_counter_sample_buffers/converting_gpu_timestamps_into_cpu_time
// Not only does this mean we get an approximate value, this is as also *very slow*!
// Chromium opted to solve this using a linear regression that they stop at some point
// https://source.chromium.org/chromium/chromium/src/+/refs/heads/main:third_party/dawn/src/dawn/native/metal/DeviceMTL.mm;drc=76be2f9f117654f3fe4faa477b0445114fccedda;bpv=0;bpt=1;l=46
// Generally, the assumption is that timestamp values aren't changing over time, after all all other APIs provide stable values.
//
// We should do as Chromium does for the general case, but this requires quite some state tracking
// and doesn't even provide perfectly accurate values, especially at the start of the application when
// we didn't have the chance to sample a lot of values just yet.
//
// So instead, we're doing the dangerous but easy thing and use our "knowledge" of timestamps
// conversions on different devices, after all Metal isn't supported on that many ;)
// Based on:
// * https://github.com/gfx-rs/wgpu/pull/2528
// * https://github.com/gpuweb/gpuweb/issues/1325#issuecomment-761041326
let timestamp_period = if self.shared.device.name().to_string().starts_with("Intel") {
83.333
} else {
// Known for Apple Silicon (at least M1 & M2, iPad Pro 2018) and AMD GPUs.
1.0
};

Ok(crate::OpenDevice {
device: super::Device {
shared: Arc::clone(&self.shared),
features,
counters: Default::default(),
limits: limits.clone(),
},
queue: super::Queue {
shared: Arc::new(QueueShared {
raw: queue,
command_buffer_created_not_submitted: atomic::AtomicUsize::new(0),
}),
timestamp_period,
},
autoreleasepool(|_| {
let queue = self
.shared
.device
.newCommandQueueWithMaxCommandBufferCount(MAX_COMMAND_BUFFERS)
.unwrap();

// Acquiring the meaning of timestamp ticks is hard with Metal!
// The only thing there is a method correlating cpu & gpu timestamps (`device.sample_timestamps`).
// Users are supposed to call this method twice and calculate the difference,
// see "Converting GPU Timestamps into CPU Time":
// https://developer.apple.com/documentation/metal/gpu_counters_and_counter_sample_buffers/converting_gpu_timestamps_into_cpu_time
// Not only does this mean we get an approximate value, this is as also *very slow*!
// Chromium opted to solve this using a linear regression that they stop at some point
// https://source.chromium.org/chromium/chromium/src/+/refs/heads/main:third_party/dawn/src/dawn/native/metal/DeviceMTL.mm;drc=76be2f9f117654f3fe4faa477b0445114fccedda;bpv=0;bpt=1;l=46
// Generally, the assumption is that timestamp values aren't changing over time, after all all other APIs provide stable values.
//
// We should do as Chromium does for the general case, but this requires quite some state tracking
// and doesn't even provide perfectly accurate values, especially at the start of the application when
// we didn't have the chance to sample a lot of values just yet.
//
// So instead, we're doing the dangerous but easy thing and use our "knowledge" of timestamps
// conversions on different devices, after all Metal isn't supported on that many ;)
// Based on:
// * https://github.com/gfx-rs/wgpu/pull/2528
// * https://github.com/gpuweb/gpuweb/issues/1325#issuecomment-761041326
let timestamp_period = if self.shared.device.name().to_string().starts_with("Intel") {
83.333
} else {
// Known for Apple Silicon (at least M1 & M2, iPad Pro 2018) and AMD GPUs.
1.0
};

Ok(crate::OpenDevice {
device: super::Device {
shared: Arc::clone(&self.shared),
features,
counters: Default::default(),
limits: limits.clone(),
},
queue: super::Queue {
shared: Arc::new(QueueShared {
raw: ManuallyDrop::new(queue),
command_buffer_created_not_submitted: atomic::AtomicUsize::new(0),
}),
timestamp_period,
},
})
})
}

Expand Down
5 changes: 4 additions & 1 deletion wgpu-hal/src/metal/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -449,8 +449,11 @@ impl crate::Device for super::Device {
})
})
}
unsafe fn destroy_buffer(&self, _buffer: super::Buffer) {
unsafe fn destroy_buffer(&self, buffer: super::Buffer) {
self.counters.buffers.sub(1);
autoreleasepool(|_| {
drop(buffer);
});
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain in more detail how you found this (and the similar handling for QueueShared) to be necessary? Is the idea that releasing these objects transfers ownership of some member(s) to the autorelease pool, rather than directly releasing them?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from my testings, it doesn't directly release them since there's no main release pool and that's because the code is not running inside Cocoa event loop so you have to manually autoreleasepool by yourself , there's probably a lot of places like this in the metal backend but this is what i currently found to be leaking

}

unsafe fn add_raw_buffer(&self, _buffer: &super::Buffer) {
Expand Down
58 changes: 34 additions & 24 deletions wgpu-hal/src/metal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ mod surface;
mod time;

use alloc::{string::ToString as _, sync::Arc, vec::Vec};
use core::{fmt, iter, ops, ptr::NonNull, sync::atomic};
use core::{fmt, iter, mem::ManuallyDrop, ops, ptr::NonNull, sync::atomic};

use bitflags::bitflags;
use hashbrown::HashMap;
Expand Down Expand Up @@ -411,27 +411,29 @@ impl AdapterShared {
}

fn expose(device: Retained<ProtocolObject<dyn MTLDevice>>) -> crate::ExposedAdapter<Api> {
let name = device.name().to_string();
let capabilities_query = CapabilitiesQuery::new(&device);
let shared = AdapterShared::new(device, &capabilities_query);
let features = capabilities_query.features();
let capabilities = capabilities_query.capabilities();
crate::ExposedAdapter {
info: wgt::AdapterInfo {
name,
// These are hardcoded based on typical values for Metal devices
//
// See <https://github.com/gpuweb/gpuweb/blob/main/proposals/subgroups.md#adapter-info>
// for more information.
subgroup_min_size: 4,
subgroup_max_size: 64,
transient_saves_memory: shared.private_caps.supports_memoryless_storage,
..wgt::AdapterInfo::new(shared.private_caps.device_type(), wgt::Backend::Metal)
},
features,
capabilities,
adapter: Adapter::new(Arc::new(shared)),
}
autoreleasepool(|_| {
let name = device.name().to_string();
let capabilities_query = CapabilitiesQuery::new(&device);
let shared = AdapterShared::new(device, &capabilities_query);
let features = capabilities_query.features();
let capabilities = capabilities_query.capabilities();
crate::ExposedAdapter {
info: wgt::AdapterInfo {
name,
// These are hardcoded based on typical values for Metal devices
//
// See <https://github.com/gpuweb/gpuweb/blob/main/proposals/subgroups.md#adapter-info>
// for more information.
subgroup_min_size: 4,
subgroup_max_size: 64,
transient_saves_memory: shared.private_caps.supports_memoryless_storage,
..wgt::AdapterInfo::new(shared.private_caps.device_type(), wgt::Backend::Metal)
},
features,
capabilities,
adapter: Adapter::new(Arc::new(shared)),
}
})
}
}

Expand All @@ -454,7 +456,7 @@ impl Queue {
) -> Self {
Self {
shared: Arc::new(QueueShared {
raw,
raw: ManuallyDrop::new(raw),
command_buffer_created_not_submitted: atomic::AtomicUsize::new(0),
}),
timestamp_period,
Expand All @@ -464,7 +466,7 @@ impl Queue {

#[derive(Debug)]
pub struct QueueShared {
raw: Retained<ProtocolObject<dyn MTLCommandQueue>>,
raw: ManuallyDrop<Retained<ProtocolObject<dyn MTLCommandQueue>>>,
// Tracks command buffers created via `CommandEncoder::begin_encoding` that
// have not yet been submitted or discarded. Used to proactively fail
// before hitting Metal's `maxCommandBufferCount`.
Expand All @@ -475,6 +477,14 @@ pub struct QueueShared {
command_buffer_created_not_submitted: atomic::AtomicUsize,
}

impl Drop for QueueShared {
fn drop(&mut self) {
autoreleasepool(|_| unsafe {
ManuallyDrop::drop(&mut self.raw);
});
}
}

pub struct Device {
shared: Arc<AdapterShared>,
features: wgt::Features,
Expand Down