diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs
index 253540c6dbcb9..696d1b873d43e 100644
--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@@ -1641,7 +1641,6 @@ impl Default for MeshCullingDataBuffer {
     fn default() -> Self {
         Self(AtomicSparseBufferVec::new(
             BufferUsages::STORAGE,
-            8,
             Arc::from("mesh culling data buffer"),
         ))
     }
diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs
index 95e0b3d804efc..90f29ccf9c934 100644
--- a/crates/bevy_render/src/batching/gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@@ -307,7 +307,6 @@ where
         InstanceInputUniformBuffer {
             buffer: AtomicSparseBufferVec::new(
                 BufferUsages::STORAGE,
-                8,
                 Arc::from("instance input uniform buffer"),
             ),
             free_uniform_indices: vec![],
diff --git a/crates/bevy_render/src/render_resource/sparse_buffer_update.wgsl b/crates/bevy_render/src/render_resource/sparse_buffer_update.wgsl
index 0ef601e946d4e..4dd53ea1bf1c2 100644
--- a/crates/bevy_render/src/render_resource/sparse_buffer_update.wgsl
+++ b/crates/bevy_render/src/render_resource/sparse_buffer_update.wgsl
@@ -12,10 +12,8 @@ struct SparseBufferUpdateMetadata {
     // The size of a single element in words.
     element_size: u32,
-    // The total number of pages to be updated.
-    updated_page_count: u32,
-    // The base-2 logarithm of the page size.
-    page_size_log2: u32,
+    // The total number of elements to be updated.
+    updated_element_count: u32,
 };
 
 // The buffer we're copying to.
@@ -34,8 +32,7 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
     // Calculate which word we are. Remember that this shader executes with one
     // thread per word.
     let invocation_index = global_id.x;
-    let total_word_count = (metadata.updated_page_count << metadata.page_size_log2) *
-        metadata.element_size;
+    let total_word_count = metadata.updated_element_count * metadata.element_size;
     if (invocation_index >= total_word_count) {
         return;
     }
@@ -44,16 +41,11 @@
     // Calculate which element we're copying.
     let element_index = invocation_index / metadata.element_size;
     // Calculate which word *within* that element we're looking at.
     let word_index = invocation_index % metadata.element_size;
-    // Calculate which page we're copying.
-    let update_index = element_index >> metadata.page_size_log2;
-    // Determine which element we're copying within that page.
-    let element_index_in_page = element_index & ((1u << metadata.page_size_log2) - 1u);
-    // Look up our destination page.
-    let page_index = indices[update_index];
+    // Look up our destination element.
+    let dest_element_index = indices[element_index];
     // Calculate where we should write our word.
-    let dest_index = ((page_index << metadata.page_size_log2) + element_index_in_page) *
-        metadata.element_size + word_index;
+    let dest_index = dest_element_index * metadata.element_size + word_index;
     if (dest_index >= arrayLength(&dest_buffer)) {
         return;
     }
diff --git a/crates/bevy_render/src/render_resource/sparse_buffer_vec.rs b/crates/bevy_render/src/render_resource/sparse_buffer_vec.rs
index cc73847a5b3ed..e3e157668f294 100644
--- a/crates/bevy_render/src/render_resource/sparse_buffer_vec.rs
+++ b/crates/bevy_render/src/render_resource/sparse_buffer_vec.rs
@@ -3,7 +3,7 @@
 use alloc::sync::{Arc, Weak};
 use core::{
-    iter, slice,
+    slice,
     sync::atomic::{AtomicU64, Ordering},
 };
 
@@ -16,7 +16,7 @@ use bevy_ecs::{
     system::{Res, ResMut},
     world::{FromWorld, World},
 };
-use bevy_log::{error, info};
+use bevy_log::{debug, error, info};
 use bevy_material::{
     bind_group_layout_entries::{
         binding_types::{storage_buffer, storage_buffer_read_only, uniform_buffer},
@@ -104,8 +104,8 @@ const REALLOCATION_FACTOR: f64 = 1.5;
 /// We round all allocations up to the nearest multiple of this.
 const REALLOCATION_SIZE_MULTIPLE: usize = 256;
 
-/// The number of dirty-page bits packed into each [`AtomicU64`] word.
-const PAGES_PER_DIRTY_WORD: u32 = 64;
+/// The number of bits packed into each [`AtomicU64`] word.
+const BITS_PER_WORD: u32 = 64;
 
 /// Pipelines for the sparse buffer update shader.
 ///
@@ -149,12 +145,8 @@ pub struct SparseBufferUpdateJobs(pub Vec<SparseBufferUpdateJob>);
 pub struct SparseBufferUpdateJob {
     /// A handle to the buffer to be updated.
     sparse_buffer_handle: SparseBufferHandle,
-    /// The number of pages to update.
-    updated_page_count: u32,
-    /// The base-2 logarithm of the size of a page for the buffer.
-    ///
-    /// The actual page size can be computed as `1 << page_size_log2`.
-    page_size_log2: u32,
+    /// The number of elements to update.
+    updated_element_count: u32,
     /// The size of each element in 32-bit words.
     element_word_size: u32,
     /// A debugging label for the buffer.
@@ -162,14 +158,9 @@
 }
 
 impl SparseBufferUpdateJob {
-    /// The number of elements per page.
-    fn page_size(&self) -> u32 {
-        1 << self.page_size_log2
-    }
-
     /// Calculates the number of words that need to be updated.
     fn words_to_update(&self) -> u32 {
-        self.updated_page_count * self.page_size() * self.element_word_size
+        self.updated_element_count * self.element_word_size
     }
 
     /// Calculates the number of workgroups that need to be dispatched.
@@ -185,12 +176,8 @@ struct GpuSparseBufferUpdateMetadata {
     /// The size of a single element in 32-bit words.
     element_size: u32,
-    /// The number of pages that need to be updated.
-    updated_page_count: u32,
-    /// The base-2 logarithm of the page size.
-    ///
-    /// That is, the page size is `1 << page_size_log2`.
-    page_size_log2: u32,
+    /// The number of elements that need to be updated.
+    updated_element_count: u32,
 }
 
 /// A system, part of the render graph, that performs sparse buffer updates to
@@ -334,32 +321,22 @@ impl SpecializedComputePipeline for SparseBufferUpdatePipelines {
 ///
 /// There's one such set of buffers per sparse buffer vector.
 struct SparseBufferStagingBuffers {
-    /// All pages that have changed and need to be updated.
+    /// All elements that have changed and need to be updated.
     source_data: RawBufferVec<u32>,
-    /// The index at which we write each page in [`Self::source_data`].
+    /// The index at which we write each element in [`Self::source_data`].
     ///
     /// The length of this buffer is equal to [`Self::source_data`] divided by
-    /// 2^[`Self::page_size_log2`].
+    /// [`Self::element_word_size`].
     indices: RawBufferVec<u32>,
 
     /// The size of each element in 32-bit words.
     element_word_size: u32,
-
-    /// The base-2 logarithm of the page size in elements.
-    ///
-    /// That is, the page size in elements is `1 << page_size_log2`.
-    page_size_log2: u32,
 }
 
 impl SparseBufferStagingBuffers {
-    /// The number of elements per page.
-    fn page_size(&self) -> usize {
-        1 << self.page_size_log2
-    }
-
     /// Creates a new set of staging buffers for a sparse buffer vector.
-    fn new(label: &str, element_word_size: u32, page_size_log2: u32) -> SparseBufferStagingBuffers {
+    fn new(label: &str, element_word_size: u32) -> SparseBufferStagingBuffers {
         let mut source_data_buffer =
             RawBufferVec::new(BufferUsages::COPY_DST | BufferUsages::STORAGE);
         source_data_buffer.set_label(Some(&*format!("{} staging buffer", label)));
@@ -371,16 +348,12 @@ impl SparseBufferStagingBuffers {
             source_data: source_data_buffer,
             indices: indices_buffer,
             element_word_size,
-            page_size_log2,
         }
     }
 
-    /// Returns the number of updated pages.
-    fn updated_page_count(&self) -> u32 {
-        // Note that we don't have to round up here because data is always
-        // uploaded in increments of a whole page.
-        let element_count = self.source_data.len() / self.element_word_size as usize;
-        (element_count / self.page_size()) as u32
+    /// Returns the number of updated elements.
+    fn updated_element_count(&self) -> u32 {
+        (self.source_data.len() / self.element_word_size as usize) as u32
     }
 
     /// Writes the buffers that contain all the data necessary to perform a
@@ -394,7 +367,7 @@ impl SparseBufferStagingBuffers {
         render_device: &RenderDevice,
         render_queue: &RenderQueue,
     ) {
-        metadata_uniform.get_mut().updated_page_count = self.updated_page_count();
+        metadata_uniform.get_mut().updated_element_count = self.updated_element_count();
         metadata_uniform.write_buffer(render_device, render_queue);
 
         self.source_data.write_buffer(render_device, render_queue);
@@ -403,21 +376,38 @@ impl SparseBufferStagingBuffers {
     /// Returns true if a sparse buffer update should *not* be performed because
     /// too many words changed.
-    fn should_perform_full_reupload(&self, changed_page_count: u32, buffer_length: usize) -> bool {
+    fn should_perform_full_reupload(
+        &self,
+        changed_element_count: u32,
+        buffer_length: usize,
+    ) -> bool {
         // Calculate the number of changed words. If it's greater than the
         // maximum number of workgroups as defined by `wgpu`, we must perform a
         // full reupload.
-        let total_changed_word_count =
-            changed_page_count * self.page_size() as u32 * self.element_word_size;
+        //
+        // FIXME: This degrades performance in exactly the scenarios where we
+        // need it most. We should fall back to doing multiple rounds of
+        // uploads in this case.
+        let total_changed_word_count = changed_element_count * self.element_word_size;
         if total_changed_word_count > MAX_WORKGROUPS * SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE {
             return true;
         }
 
         // Don't perform a sparse upload if too many words changed, as it'll end
         // up being slower than just uploading the whole buffer afresh.
-        let sparse_upload_fraction =
-            changed_page_count as f64 / buffer_length.div_ceil(self.page_size()) as f64;
-        sparse_upload_fraction > SPARSE_UPLOAD_THRESHOLD
+        let sparse_upload_fraction = changed_element_count as f64 / buffer_length as f64;
+        let should_reupload = sparse_upload_fraction > SPARSE_UPLOAD_THRESHOLD;
+
+        debug!(
+            "Sparse buffer changed {}/{} elements ({:.3}, threshold {:.3}): performing {} upload",
+            changed_element_count,
+            buffer_length,
+            sparse_upload_fraction,
+            SPARSE_UPLOAD_THRESHOLD,
+            if should_reupload { "full" } else { "sparse" }
+        );
+
+        should_reupload
     }
 }
 
@@ -428,9 +418,9 @@
 /// This type is similar to
 /// [`crate::render_resource::buffer_vec::AtomicRawBufferVec`], but instead of
 /// reuploading the entire buffer to the GPU when it's changed, it tracks
-/// changes on a per-page level and uploads only the pages that changed if the
-/// number of such pages is small. It uses a compute shader to scatter the
-/// changed pages.
+/// changes on a per-element level and uploads only the elements that changed if
+/// the number of such elements is small. It uses a compute shader to scatter the
+/// changed elements.
 ///
 /// As the stored data is [`AtomicPod`], multiple threads may update the buffer
 /// simultaneously. Note that, like
@@ -464,12 +454,23 @@ where
     buffer_usages: BufferUsages,
     /// An optional debug label to identify this buffer.
     label: Arc<str>,
-    /// A bit set of dirty pages.
+    /// A bit set of dirty blocks.
+    ///
+    /// The size of this vector in bits is the number of elements divided
+    /// (rounded up) by 64: in other words, the size of this vector in *bits* is
+    /// the size of the [`Self::dirty_bits`] vector in *words*. A 1 in a bit
+    /// indicates that the block has changed since the last upload, while a 0
+    /// indicates that the block hasn't changed.
+    summary: Vec<AtomicU64>,
+    /// A bit set of dirty elements.
+    ///
+    /// The size of this vector in bits is the number of elements, rounded up to
+    /// the nearest 64. A 1 in a bit indicates that the element has changed since
+    /// the last upload, while a 0 indicates that the element hasn't changed.
     ///
-    /// The size of this vector in bits is the number of elements divided by the
-    /// page size, rounded up. A 1 in a bit indicates that the page has changed
-    /// since the last upload, while a 0 indicates that the page hasn't changed.
-    dirty_pages: Vec<AtomicU64>,
+    /// Each group of 64 elements, corresponding to a single word in this array,
+    /// is known as a *block*.
+    dirty_bits: Vec<AtomicU64>,
     /// True if the entire buffer needs to be reuploaded because it resized.
     needs_full_reupload: bool,
     /// True if a sparse update is to be performed.
@@ -480,21 +481,13 @@ impl<T> AtomicSparseBufferVec<T>
 where
     T: AtomicPod,
 {
-    /// The number of elements per page.
-    fn page_size(&self) -> u32 {
-        1 << self.staging_buffers.page_size_log2
-    }
-
     /// Creates a new [`AtomicSparseBufferVec`] with the given set of buffer
-    /// usages, page size, and label.
+    /// usages and label.
     ///
     /// `buffer_usages` specifies the set of allowed `wgpu` buffer usages for
     /// the buffer that [`AtomicSparseBufferVec`] manages.
     /// `BufferUsages::COPY_DST` is automatically added to this set.
-    ///
-    /// The `page_size_log2` parameter is the base-2 logarithm of the page size.
-    /// That is, the page size is `1 << page_size_log2`.
-    pub fn new(buffer_usages: BufferUsages, page_size_log2: u32, label: Arc<str>) -> Self {
+    pub fn new(buffer_usages: BufferUsages, label: Arc<str>) -> Self {
         // Make sure the value is word-aligned.
         debug_assert_eq!(size_of::<T>() % 4, 0);
         let element_word_size = size_of::<T>() / 4;
@@ -508,18 +501,13 @@ where
             handle: id,
             values: vec![],
             data_buffer: None,
-            staging_buffers: SparseBufferStagingBuffers::new(
-                &label,
-                element_word_size as u32,
-                page_size_log2,
-            ),
-            metadata_uniform: UniformBuffer::from(GpuSparseBufferUpdateMetadata::new::<T>(
-                page_size_log2,
-            )),
+            staging_buffers: SparseBufferStagingBuffers::new(&label, element_word_size as u32),
+            metadata_uniform: UniformBuffer::from(GpuSparseBufferUpdateMetadata::new::<T>()),
             capacity: 0,
             buffer_usages: buffer_usages | BufferUsages::COPY_DST,
             label,
-            dirty_pages: vec![],
+            summary: vec![],
+            dirty_bits: vec![],
             needs_full_reupload: false,
             sparse_update_scheduled: false,
         }
@@ -542,7 +530,9 @@ where
     /// Removes all elements from the buffer.
     pub fn clear(&mut self) {
-        self.truncate(0);
+        self.values.clear();
+        self.summary.clear();
+        self.dirty_bits.clear();
     }
 
     /// Copies a value out of the buffer.
@@ -572,26 +562,23 @@ where
         self.values.push(T::Blob::default());
         value.write_to_blob(&self.values[index as usize]);
 
-        let page_word = (self.index_to_page(index) / PAGES_PER_DIRTY_WORD) as usize;
-        while self.dirty_pages.len() < page_word + 1 {
-            self.dirty_pages.push(AtomicU64::default());
+        let dirty_word_index = (index / BITS_PER_WORD) as usize;
+        let summary_word_index = dirty_word_index / BITS_PER_WORD as usize;
+        while self.summary.len() < summary_word_index + 1 {
+            self.summary.push(AtomicU64::default());
+        }
+        while self.dirty_bits.len() < dirty_word_index + 1 {
+            self.dirty_bits.push(AtomicU64::default());
         }
 
-        self.note_changed_index(index);
+        self.note_changed_index(index);
 
         index
     }
 
-    /// Marks the page corresponding to the given element index as dirty so that
-    /// we know that we need to upload it.
+    /// Marks the given element index as dirty so that we know that we need to
+    /// upload it.
     fn note_changed_index(&self, index: u32) {
-        let page = self.index_to_page(index);
-        let (page_word, page_in_word) = (page / PAGES_PER_DIRTY_WORD, page % PAGES_PER_DIRTY_WORD);
-        self.dirty_pages[page_word as usize].fetch_or(1 << page_in_word, Ordering::Relaxed);
-    }
-
-    /// Returns the page corresponding to the given element index.
-    fn index_to_page(&self, index: u32) -> u32 {
-        index / self.page_size()
+        note_changed_index(index, &self.summary, &self.dirty_bits);
     }
 
     /// Ensures that the backing buffer for this buffer vector is present and
@@ -612,6 +599,8 @@ where
     /// Grows the buffer by adding default values so that it's at least the
     /// given size.
     ///
+    /// This method sets all the newly-added values to dirty.
+    ///
     /// If the buffer is already large enough, this method does nothing.
     pub fn grow(&mut self, new_len: u32) {
         let old_len = self.values.len() as u32;
@@ -622,41 +611,7 @@ where
         self.values.reserve(new_len as usize - old_len as usize);
         self.values.resize_with(new_len as usize, T::Blob::default);
 
-        // This is a bit tricky. We want to set the dirty bits corresponding to
-        // all pages that we added, if any. First, we compute the index of the
-        // last page word before the append operation.
-        let old_final_page = self.index_to_page(old_len);
-        let old_final_page_word_index = old_final_page / PAGES_PER_DIRTY_WORD;
-        let old_final_page_in_word = old_final_page % PAGES_PER_DIRTY_WORD;
-
-        // Next, we set the bits corresponding to every page that we added to
-        // that final page word. Note that this might set bits corresponding to
-        // pages past the end of our buffer; that's OK as we ignore them.
-        if old_final_page_in_word != 0
-            && let Some(ref mut old_final_atomic_page_word) =
-                self.dirty_pages.get_mut(old_final_page_word_index as usize)
-        {
-            *old_final_atomic_page_word.get_mut() |= !((1u64 << old_final_page_in_word) - 1);
-        }
-
-        // Finally, we add any new page words, with all bits set.
-        let new_page_count = self.index_to_page(new_len);
-        self.dirty_pages.resize_with(
-            (new_page_count as usize).div_ceil(PAGES_PER_DIRTY_WORD as usize),
-            || AtomicU64::new(u64::MAX),
-        );
-    }
-
-    /// Truncates the buffer to the given length.
-    ///
-    /// If the buffer is already that length or shorter, this method does
-    /// nothing.
-    pub fn truncate(&mut self, len: u32) {
-        self.values.truncate(len as usize);
-
-        let page = self.index_to_page(len);
-        self.dirty_pages
-            .truncate(page.div_ceil(PAGES_PER_DIRTY_WORD) as usize);
+        set_dirty_bits_for_vector_growth(old_len, new_len, &mut self.summary, &mut self.dirty_bits);
     }
 
     /// Writes the data to the GPU, either via a sparse upload or a bulk data
@@ -682,23 +637,15 @@ where
     /// because it was resized or because too much data changed for a sparse
     /// update to be worthwhile.
     fn should_perform_full_reupload(&self, render_device: &RenderDevice) -> bool {
-        if self.needs_full_reupload {
-            return true;
-        }
-
-        if render_device.limits().max_storage_buffers_per_shader_stage < 3 {
+        if self.needs_full_reupload
+            || render_device.limits().max_storage_buffers_per_shader_stage < 3
+        {
            return true;
        }

-        // Calculate the number of changed pages via population count.
-        let changed_page_count: u32 = self
-            .dirty_pages
-            .iter()
-            .map(|atomic_page_word| atomic_page_word.load(Ordering::Relaxed).count_ones())
-            .sum();
-
+        let changed_element_count = count_dirty_elements(&self.summary, &self.dirty_bits);
         self.staging_buffers
-            .should_perform_full_reupload(changed_page_count, self.values.len())
+            .should_perform_full_reupload(changed_element_count, self.values.len())
     }
 
     /// Writes the entire buffer in bulk.
@@ -727,56 +674,60 @@ where
         }
 
-        // Mark all pages as clean.
-        for atomic_page_word in self.dirty_pages.iter() {
-            atomic_page_word.store(0, Ordering::Relaxed);
+        // Mark all blocks and elements as clean.
+        for atomic_summary_word in self.summary.iter() {
+            atomic_summary_word.store(0, Ordering::Relaxed);
+        }
+        for atomic_dirty_word in self.dirty_bits.iter() {
+            atomic_dirty_word.store(0, Ordering::Relaxed);
         }
 
         self.sparse_update_scheduled = false;
     }
 
-    /// Schedules a sparse upload of only the pages that changed.
+    /// Schedules a sparse upload of only the elements that changed.
    fn prepare_sparse_upload(&mut self, render_device: &RenderDevice, render_queue: &RenderQueue) {
-        // Iterate over all dirty pages.
-        for (page_word_index, atomic_page_word) in self.dirty_pages.iter().enumerate() {
-            let page_word = atomic_page_word.load(Ordering::Relaxed);
-            for page_index_in_word in BitIter::new(page_word) {
-                let page = page_word_index as u32 * PAGES_PER_DIRTY_WORD + page_index_in_word;
-
-                // Write the index of the page so the shader will know where to
-                // scatter the data to.
-                self.staging_buffers.indices.push(page);
-
-                // Copy the page to the GPU staging buffer.
-                let page_size = self.staging_buffers.page_size();
-                let page_start = page as usize * page_size;
-                let page_end = page_start + page_size;
-                for value_index in page_start..page_end {
-                    match self.values.get(value_index) {
-                        Some(blob) => {
-                            let value = T::read_from_blob(blob);
-                            self.staging_buffers
-                                .source_data
-                                .extend(bytemuck::cast_slice(&[value]).iter().copied());
-                        }
-                        None => {
-                            self.staging_buffers.source_data.extend(iter::repeat_n(
-                                0,
-                                self.staging_buffers.element_word_size as usize,
-                            ));
-                        }
-                    }
+        // Iterate over all dirty elements, using the summary to accelerate the
+        // search.
+        for (summary_word_index, atomic_summary_word) in self.summary.iter().enumerate() {
+            let summary_word = atomic_summary_word.load(Ordering::Relaxed);
+            for summary_bit_offset in BitIter::new(summary_word) {
+                let dirty_word_index =
+                    summary_word_index * BITS_PER_WORD as usize + summary_bit_offset as usize;
+
+                // Iterate over all dirty elements in each dirty block.
+                let atomic_dirty_word = &self.dirty_bits[dirty_word_index];
+                let dirty_word = atomic_dirty_word.load(Ordering::Relaxed);
+                for dirty_bit_offset in BitIter::new(dirty_word) {
+                    let element_index =
+                        dirty_word_index * BITS_PER_WORD as usize + dirty_bit_offset as usize;
+
+                    let Some(blob) = self.values.get(element_index) else {
+                        continue;
+                    };
+
+                    // Write the index of the element so the shader will know where to
+                    // scatter the data to.
+                    self.staging_buffers.indices.push(element_index as u32);
+
+                    // Copy the element to the GPU staging buffer.
+                    let value = T::read_from_blob(blob);
+                    self.staging_buffers
+                        .source_data
+                        .extend(bytemuck::cast_slice(&[value]).iter().copied());
+
+                    // Make sure we're aligned up to a full element.
+                    debug_assert_eq!(
+                        self.staging_buffers.source_data.len()
+                            % self.staging_buffers.element_word_size as usize,
+                        0
+                    );
                 }
 
-                // Make sure we're aligned up to a full page.
-                debug_assert_eq!(
-                    self.staging_buffers.source_data.len()
-                        % (self.staging_buffers.element_word_size as usize
-                            * self.staging_buffers.page_size()),
-                    0
-                );
+                // Mark all of this word's elements as clean.
+                atomic_dirty_word.store(0, Ordering::Relaxed);
            }
 
-            // Mark the page as clean.
-            atomic_page_word.store(0, Ordering::Relaxed);
+            // Mark the block as clean.
+            atomic_summary_word.store(0, Ordering::Relaxed);
        }
 
        // Schedule a sparse update if there was something to do.
@@ -856,6 +807,116 @@ impl FromWorld for SparseBufferUpdateBindGroups {
     }
 }
 
+/// Marks elements within the range `old_len..new_len` as dirty, under the
+/// assumption that the vector is being resized from a length of `old_len` to a
+/// length of `new_len`.
+///
+/// This is more efficient than marking elements one at a time. It also resizes
+/// the `summary` and `dirty_bits` bitfields as necessary.
+///
+/// `new_len` must be greater than or equal to `old_len`.
+fn set_dirty_bits_for_vector_growth(
+    old_len: u32,
+    new_len: u32,
+    summary: &mut Vec<AtomicU64>,
+    dirty_bits: &mut Vec<AtomicU64>,
+) {
+    debug_assert!(new_len >= old_len);
+    if new_len == old_len {
+        return;
+    }
+
+    if old_len > 0 {
+        // Compute the index of the bit corresponding to the final existing
+        // element. We're going to set every bit *after* that bit.
+        let old_final_dirty_word_index = (old_len - 1) / BITS_PER_WORD;
+        let old_final_dirty_bit_offset = (old_len - 1) % BITS_PER_WORD;
+        if old_final_dirty_bit_offset < BITS_PER_WORD - 1
+            && let Some(ref mut old_final_atomic_dirty_word) =
+                dirty_bits.get_mut(old_final_dirty_word_index as usize)
+        {
+            // We add one here because we want to set every bit *after*, but not
+            // including, the index we computed above.
+            *old_final_atomic_dirty_word.get_mut() |=
+                !((1u64 << (old_final_dirty_bit_offset + 1)).wrapping_sub(1));
+        }
+
+        // Now set all the blocks from the block corresponding to `old_len - 1`
+        // onward to dirty. Note that this is an inclusive range, because we
+        // want to include the block that `old_len - 1` is on.
+        let old_final_summary_word_index = old_final_dirty_word_index / BITS_PER_WORD;
+        let mut old_final_summary_bit_offset = old_final_dirty_word_index % BITS_PER_WORD;
+        // This is a tricky exception. If `old_len` was precisely aligned on a
+        // block boundary, then we *don't* include the block that `old_len - 1`
+        // is on.
+        if old_final_dirty_bit_offset == BITS_PER_WORD - 1 {
+            old_final_summary_bit_offset += 1;
+        }
+        if let Some(ref mut old_final_atomic_summary_word) =
+            summary.get_mut(old_final_summary_word_index as usize)
+        {
+            // We don't add one to `old_final_summary_bit_offset` here because
+            // we want to include the block that `old_len - 1` is on.
+            *old_final_atomic_summary_word.get_mut() |=
+                !((1u64 << old_final_summary_bit_offset).wrapping_sub(1));
+        }
+    }
+
+    // Add any new summary and dirty words, with all bits set.
+    let new_dirty_word_count = (new_len as usize).div_ceil(BITS_PER_WORD as usize);
+    let new_summary_word_count = new_dirty_word_count.div_ceil(BITS_PER_WORD as usize);
+    summary.resize_with(new_summary_word_count, || AtomicU64::new(u64::MAX));
+    dirty_bits.resize_with(new_dirty_word_count, || AtomicU64::new(u64::MAX));
+
+    // Clear all bits past the last valid element index in `dirty_bits`.
+    let last_dirty_bit_offset = new_len % BITS_PER_WORD;
+    if last_dirty_bit_offset != 0 {
+        let mut final_dirty_word = dirty_bits[new_dirty_word_count - 1].load(Ordering::Relaxed);
+        final_dirty_word &= (1u64 << last_dirty_bit_offset) - 1;
+        dirty_bits[new_dirty_word_count - 1].store(final_dirty_word, Ordering::Relaxed);
+    }
+
+    // Clear all bits past the last valid summary bit in `summary`.
+    let last_summary_bit_offset = new_dirty_word_count % BITS_PER_WORD as usize;
+    if last_summary_bit_offset != 0 {
+        let mut final_summary_word = summary[new_summary_word_count - 1].load(Ordering::Relaxed);
+        final_summary_word &= (1u64 << last_summary_bit_offset) - 1;
+        summary[new_summary_word_count - 1].store(final_summary_word, Ordering::Relaxed);
+    }
+}
+
+/// Marks the given element index as dirty so that we know that we need to
+/// upload it.
+///
+/// This is a separate function so we can unit test it easily (i.e., without
+/// needing a `RenderDevice`).
+fn note_changed_index(index: u32, summary: &[AtomicU64], dirty_bits: &[AtomicU64]) {
+    let dirty_word_index = index / BITS_PER_WORD;
+    let (summary_word_index, summary_bit_offset) = (
+        dirty_word_index / BITS_PER_WORD,
+        dirty_word_index % BITS_PER_WORD,
+    );
+    summary[summary_word_index as usize].fetch_or(1 << summary_bit_offset, Ordering::Relaxed);
+    let (element_word, element_in_word) = (index / BITS_PER_WORD, index % BITS_PER_WORD);
+    dirty_bits[element_word as usize].fetch_or(1 << element_in_word, Ordering::Relaxed);
+}
+
+/// Returns the total number of bits set in `dirty_bits`, using the given
+/// `summary` to accelerate the count.
+fn count_dirty_elements(summary: &[AtomicU64], dirty_bits: &[AtomicU64]) -> u32 {
+    let mut changed_element_count = 0u32;
+    for (summary_word_index, summary_word) in summary.iter().enumerate() {
+        for summary_bit_offset in BitIter::new(summary_word.load(Ordering::Relaxed)) {
+            let dirty_word_index =
+                summary_word_index * BITS_PER_WORD as usize + summary_bit_offset as usize;
+            let dirty_word = dirty_bits[dirty_word_index].load(Ordering::Relaxed);
+            changed_element_count += dirty_word.count_ones();
+        }
+    }
+
+    changed_element_count
+}
+
 /// Prepares all GPU resources necessary to perform a sparse buffer update,
 /// other than updating the metadata uniform.
 ///
@@ -888,8 +949,7 @@ fn prepare_to_populate_buffers(
         // Record the update job.
         sparse_buffer_update_jobs.push(SparseBufferUpdateJob {
             sparse_buffer_handle: sparse_buffer_handle.clone(),
-            page_size_log2: staging_buffers.page_size_log2,
-            updated_page_count: staging_buffers.updated_page_count(),
+            updated_element_count: staging_buffers.updated_element_count(),
             element_word_size: staging_buffers.element_word_size,
             label: (*label).clone(),
         });
@@ -949,14 +1009,12 @@ fn reserve(
 }
 
 impl GpuSparseBufferUpdateMetadata {
-    /// Returns a new [`GpuSparseBufferUpdateMetadata`] for the given type and
-    /// page size.
-    fn new<T>(page_size_log2: u32) -> GpuSparseBufferUpdateMetadata {
+    /// Returns a new [`GpuSparseBufferUpdateMetadata`] for the given type.
+    fn new<T>() -> GpuSparseBufferUpdateMetadata {
         assert_eq!(size_of::<T>() % 4, 0);
         GpuSparseBufferUpdateMetadata {
             element_size: (size_of::<T>() / 4) as u32,
-            updated_page_count: 0,
-            page_size_log2,
+            updated_element_count: 0,
         }
     }
 }
@@ -991,3 +1049,156 @@ fn calculate_allocation_size(length: usize) -> usize {
     let size = REALLOCATION_FACTOR.powf(exponent) as usize;
     size.next_multiple_of(REALLOCATION_SIZE_MULTIPLE)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::{BitIter, BITS_PER_WORD};
+    use core::{
+        iter,
+        sync::atomic::{AtomicU64, Ordering},
+    };
+    use proptest::prelude::proptest;
+
+    proptest! {
+        // Ensures that `BitIter` is correct.
+        #[test]
+        fn bit_iter(bits: u64) {
+            let mut bits_reference: Vec<_> = (0u32..64u32).filter(|bit_pos| {
+                (bits & (1 << bit_pos)) != 0
+            }).collect();
+            bits_reference.sort_unstable();
+
+            let mut bits_iter_results: Vec<_> = BitIter::new(bits).collect();
+            bits_iter_results.sort_unstable();
+
+            assert_eq!(bits_iter_results, bits_reference);
+        }
+
+        // Ensures that `set_dirty_bits_for_vector_growth` is correct.
+        #[test]
+        fn set_dirty_bits_for_vector_growth(
+            old_len in 0u32..16384u32,
+            new_element_count in 0u32..16384u32,
+            start_dirty: bool
+        ) {
+            // Initialize the dirty bits.
+            let new_len = old_len + new_element_count;
+            let mut dirty_bits: Vec<_> = iter::repeat_with(|| {
+                AtomicU64::new(if start_dirty { u64::MAX } else { 0 })
+            }).take(old_len.div_ceil(BITS_PER_WORD) as usize).collect();
+            let mut summary: Vec<_> = iter::repeat_with(|| {
+                AtomicU64::new(if start_dirty { u64::MAX } else { 0 })
+            }).take(dirty_bits.len().div_ceil(BITS_PER_WORD as usize)).collect();
+
+            super::set_dirty_bits_for_vector_growth(
+                old_len,
+                new_len,
+                &mut summary,
+                &mut dirty_bits
+            );
+
+            // Check dirty flags for elements.
+            // Bits in the range [0, old_len) should be unchanged.
+            for element_index in 0..old_len {
+                check_element_dirty(element_index, &dirty_bits, start_dirty);
+            }
+            // Bits in the range [old_len, new_len) should be dirty.
+            for element_index in old_len..new_len {
+                check_element_dirty(element_index, &dirty_bits, true);
+            }
+            // Bits in the range [new_len, end) should be clean.
+            for element_index in (new_len..).take_while(|element_index| {
+                element_index % BITS_PER_WORD != 0
+            }) {
+                check_element_dirty(element_index, &dirty_bits, false);
+            }
+
+            // Check the dirty flag for each block to ensure that it precisely
+            // corresponds to the logical *or* of the dirty flags for all
+            // elements in that block.
+            for (dirty_word_index, atomic_dirty_word) in dirty_bits.iter().enumerate() {
+                // Determine the range of elements that this block encompasses.
+                let element_start = dirty_word_index * BITS_PER_WORD as usize;
+                let element_end =
+                    ((dirty_word_index + 1) * BITS_PER_WORD as usize).min(new_len as usize);
+                assert!(element_start <= element_end);
+
+                // Determine whether the block should be dirty.
+                let dirty_word = atomic_dirty_word.load(Ordering::Relaxed);
+                let block_is_dirty = (element_start..element_end).any(|element_index| {
+                    (dirty_word & (1 << (element_index % (BITS_PER_WORD as usize)))) != 0
+                });
+
+                // Check to make sure that the block has the correct dirty state.
+                check_block_dirty(dirty_word_index as u32, &summary, block_is_dirty);
+            }
+
+            // Make sure that all dirty block bits past the last valid dirty
+            // block bit are clear.
+            if !summary.is_empty() {
+                let last_summary_word_index = summary.len() - 1;
+                let last_padding_block_index = last_summary_word_index * BITS_PER_WORD as usize;
+                let last_dirty_word_index = (new_len as usize - 1) / BITS_PER_WORD as usize;
+                for padding_block_index in (last_dirty_word_index + 1)..last_padding_block_index {
+                    check_block_dirty(padding_block_index as u32, &summary, false);
+                }
+            }
+
+            // Asserts that the dirty status of the element at `element_index`
+            // matches the expected dirty status.
+            fn check_element_dirty(
+                element_index: u32,
+                dirty_bits: &[AtomicU64],
+                expect_dirty: bool
+            ) {
+                let expected = if expect_dirty { 1 } else { 0 };
+
+                let dirty_word_index = element_index / BITS_PER_WORD;
+                let dirty_bit_offset = element_index % BITS_PER_WORD;
+                let dirty_word = dirty_bits[dirty_word_index as usize].load(Ordering::Relaxed);
+                assert_eq!((dirty_word >> dirty_bit_offset) & 1, expected);
+            }
+
+            // Asserts that the dirty status of the block at `block_index`
+            // matches the expected dirty status in the summary.
+            //
+            // This is actually the same code as `check_element_dirty`, but is
+            // duplicated for clarity.
+            fn check_block_dirty(block_index: u32, summary: &[AtomicU64], expect_dirty: bool) {
+                let expected = if expect_dirty { 1 } else { 0 };
+
+                let summary_word_index = block_index / BITS_PER_WORD;
+                let summary_bit_offset = block_index % BITS_PER_WORD;
+                let summary_word = summary[summary_word_index as usize].load(Ordering::Relaxed);
+                assert_eq!((summary_word >> summary_bit_offset) & 1, expected);
+            }
+        }
+
+        // Ensures that the population-count-based `count_dirty_elements` code
+        // correctly calculates the number of changed elements.
+        //
+        // The input `dirty_flags` is an array of booleans, one for each
+        // element, in which `false` represents "not changed" and `true`
+        // represents "changed".
+        #[test]
+        fn dirty_element_count(dirty_flags: Vec<bool>) {
+            let dirty_word_count = dirty_flags.len().div_ceil(BITS_PER_WORD as usize);
+            let summary_word_count = dirty_word_count.div_ceil(BITS_PER_WORD as usize);
+
+            let dirty_bits: Vec<_> = (0..dirty_word_count).map(|_| AtomicU64::new(0)).collect();
+            let summary: Vec<_> = (0..summary_word_count).map(|_| AtomicU64::new(0)).collect();
+
+            let mut true_dirty_element_count = 0;
+            for (element_index, _) in dirty_flags.iter().enumerate().filter(|(_, element)| **element) {
+                super::note_changed_index(element_index as u32, &summary, &dirty_bits);
+                true_dirty_element_count += 1;
+            }
+
+            let calculated_dirty_element_count = super::count_dirty_elements(
+                &summary,
+                &dirty_bits
+            );
+            assert_eq!(calculated_dirty_element_count, true_dirty_element_count);
+        }
+    }
+}
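
For intuition, the two-level dirty-bit scheme this patch introduces can be sketched standalone: one `summary` bit covers one 64-element word of `dirty_bits`, so a scan can skip 4096 clean elements per clean summary word. This is a minimal non-atomic sketch with illustrative names (`mark_dirty`, `count_dirty`), not the crate's API:

```rust
const BITS_PER_WORD: u32 = 64;

fn mark_dirty(index: u32, summary: &mut [u64], dirty_bits: &mut [u64]) {
    let dirty_word = (index / BITS_PER_WORD) as usize;
    // Set the element's bit...
    dirty_bits[dirty_word] |= 1 << (index % BITS_PER_WORD);
    // ...and the bit for its block in the summary.
    summary[dirty_word / BITS_PER_WORD as usize] |= 1 << (dirty_word as u32 % BITS_PER_WORD);
}

fn count_dirty(summary: &[u64], dirty_bits: &[u64]) -> u32 {
    let mut count = 0;
    for (summary_index, &summary_word) in summary.iter().enumerate() {
        let mut word = summary_word;
        // Visit only blocks whose summary bit is set; clean blocks are
        // skipped 64 at a time.
        while word != 0 {
            let bit = word.trailing_zeros();
            count += dirty_bits[summary_index * 64 + bit as usize].count_ones();
            word &= word - 1; // clear the lowest set bit
        }
    }
    count
}

fn main() {
    // 4096 elements -> 64 dirty words -> 1 summary word.
    let mut dirty_bits = vec![0u64; 64];
    let mut summary = vec![0u64; 1];
    for index in [0, 63, 64, 4095] {
        mark_dirty(index, &mut summary, &mut dirty_bits);
    }
    assert_eq!(count_dirty(&summary, &dirty_bits), 4);
}
```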
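The boundary masks in `set_dirty_bits_for_vector_growth` are the subtle part: one mask dirties the tail of the last pre-existing word, another clears padding bits past `new_len`. A worked numeric check, assuming the same 64-bit words (illustrative code, not taken from the patch):

```rust
fn main() {
    const BITS_PER_WORD: u32 = 64;
    let (old_len, new_len) = (100u32, 300u32);

    // The last pre-existing element is index 99: word 1, bit 35. Every bit
    // *after* bit 35 in word 1 must become dirty.
    let old_final_word = (old_len - 1) / BITS_PER_WORD; // 1
    let old_final_bit = (old_len - 1) % BITS_PER_WORD; // 35
    let grow_mask = !((1u64 << (old_final_bit + 1)).wrapping_sub(1));
    assert_eq!(old_final_word, 1);
    assert_eq!(grow_mask.count_ones(), 64 - 36); // bits 36..=63

    // After growth there are ceil(300 / 64) = 5 dirty words. The final word
    // only covers elements 256..300, so bits at offset 44 and above must be
    // cleared to avoid uploading nonexistent elements.
    let word_count = new_len.div_ceil(BITS_PER_WORD); // 5
    let tail_mask = (1u64 << (new_len % BITS_PER_WORD)) - 1;
    assert_eq!(word_count, 5);
    assert_eq!(tail_mask.count_ones(), 300 - 256); // bits 0..44
}
```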
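The scatter in `sparse_buffer_update.wgsl` now boils down to one division, one modulo, and one indexed lookup per thread. A CPU-side mirror of that per-invocation math (hypothetical helper, for illustration only):

```rust
// `indices` plays the role of the staging index buffer; `element_size` is
// the element size in 32-bit words.
fn scatter_target(invocation_index: u32, element_size: u32, indices: &[u32]) -> (usize, u32) {
    // Which staged element this thread belongs to, and which word within it.
    let element_index = invocation_index / element_size;
    let word_index = invocation_index % element_size;
    // The staging index buffer says where that element lives in the
    // destination buffer.
    let dest_element_index = indices[element_index as usize];
    let dest_word = dest_element_index * element_size + word_index;
    (dest_word as usize, word_index)
}

fn main() {
    // Two staged 4-word elements, destined for destination slots 7 and 2.
    let indices = [7u32, 2u32];
    // Thread 5 copies word 1 of the second staged element into slot 2.
    assert_eq!(scatter_target(5, 4, &indices), (2 * 4 + 1, 1));
}
```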
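The `MAX_WORKGROUPS` guard in `should_perform_full_reupload` exists because the scatter dispatches one thread per changed word. A sketch of the sizing logic; the workgroup size here is an assumed value, and the `MAX_WORKGROUPS` figure is `wgpu`'s default per-dimension dispatch limit (the real constants live in the crate):

```rust
const SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE: u32 = 64; // assumption
const MAX_WORKGROUPS: u32 = 65_535; // wgpu's default per-dimension limit

fn workgroup_count(updated_element_count: u32, element_word_size: u32) -> Option<u32> {
    // One thread per 32-bit word, rounded up to whole workgroups.
    let words = updated_element_count * element_word_size;
    let groups = words.div_ceil(SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE);
    // Past the dispatch limit, the code above falls back to a full reupload.
    (groups <= MAX_WORKGROUPS).then_some(groups)
}

fn main() {
    assert_eq!(workgroup_count(1000, 4), Some(4000u32.div_ceil(64)));
    assert_eq!(workgroup_count(u32::MAX / 4, 4), None);
}
```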
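Both the upload path and the tests lean on `BitIter` to walk the set bits of a word. A hypothetical stand-in showing the standard `trailing_zeros` trick it presumably relies on (not the crate's implementation):

```rust
struct SetBits(u64);

impl Iterator for SetBits {
    type Item = u32;

    fn next(&mut self) -> Option<u32> {
        if self.0 == 0 {
            return None;
        }
        let bit = self.0.trailing_zeros();
        self.0 &= self.0 - 1; // clear the lowest set bit
        Some(bit)
    }
}

fn main() {
    // Yields set-bit positions in ascending order.
    let bits: Vec<u32> = SetBits(0b1010_0001).collect();
    assert_eq!(bits, vec![0, 5, 7]);
}
```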