diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs
index 253540c6dbcb9..696d1b873d43e 100644
--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@@ -1641,7 +1641,6 @@ impl Default for MeshCullingDataBuffer {
     fn default() -> Self {
         Self(AtomicSparseBufferVec::new(
             BufferUsages::STORAGE,
-            8,
             Arc::from("mesh culling data buffer"),
         ))
     }
diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs
index 95e0b3d804efc..90f29ccf9c934 100644
--- a/crates/bevy_render/src/batching/gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@@ -307,7 +307,6 @@ where
         InstanceInputUniformBuffer {
             buffer: AtomicSparseBufferVec::new(
                 BufferUsages::STORAGE,
-                8,
                 Arc::from("instance input uniform buffer"),
             ),
             free_uniform_indices: vec![],
diff --git a/crates/bevy_render/src/render_resource/sparse_buffer_update.wgsl b/crates/bevy_render/src/render_resource/sparse_buffer_update.wgsl
index 0ef601e946d4e..4dd53ea1bf1c2 100644
--- a/crates/bevy_render/src/render_resource/sparse_buffer_update.wgsl
+++ b/crates/bevy_render/src/render_resource/sparse_buffer_update.wgsl
@@ -12,10 +12,8 @@ struct SparseBufferUpdateMetadata {
     // The size of a single element in words.
     element_size: u32,
-    // The total number of pages to be updated.
-    updated_page_count: u32,
-    // The base-2 logarithm of the page size.
-    page_size_log2: u32,
+    // The total number of elements to be updated.
+    updated_element_count: u32,
 };
 
 // The buffer we're copying to.
@@ -34,8 +32,7 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
     // Calculate which word we are. Remember that this shader executes with one
     // thread per word.
     let invocation_index = global_id.x;
-    let total_word_count = (metadata.updated_page_count << metadata.page_size_log2) *
-        metadata.element_size;
+    let total_word_count = metadata.updated_element_count * metadata.element_size;
     if (invocation_index >= total_word_count) {
         return;
     }
@@ -44,16 +41,11 @@
     // Calculate which element we're copying.
     let element_index = invocation_index / metadata.element_size;
     // Calculate which word *within* that element we're looking at.
     let word_index = invocation_index % metadata.element_size;
-    // Calculate which page we're copying.
-    let update_index = element_index >> metadata.page_size_log2;
-    // Determine which element we're copying within that page.
-    let element_index_in_page = element_index & ((1u << metadata.page_size_log2) - 1u);
-    // Look up our destination page.
-    let page_index = indices[update_index];
+    // Look up our destination element.
+    let dest_element_index = indices[element_index];
     // Calculate where we should write our word.
-    let dest_index = ((page_index << metadata.page_size_log2) + element_index_in_page) *
-        metadata.element_size + word_index;
+    let dest_index = dest_element_index * metadata.element_size + word_index;
     if (dest_index >= arrayLength(&dest_buffer)) {
         return;
     }
diff --git a/crates/bevy_render/src/render_resource/sparse_buffer_vec.rs b/crates/bevy_render/src/render_resource/sparse_buffer_vec.rs
index cc73847a5b3ed..e3e157668f294 100644
--- a/crates/bevy_render/src/render_resource/sparse_buffer_vec.rs
+++ b/crates/bevy_render/src/render_resource/sparse_buffer_vec.rs
@@ -3,7 +3,7 @@
 use alloc::sync::{Arc, Weak};
 use core::{
-    iter, slice,
+    slice,
     sync::atomic::{AtomicU64, Ordering},
 };
 
@@ -16,7 +16,7 @@ use bevy_ecs::{
     system::{Res, ResMut},
     world::{FromWorld, World},
 };
-use bevy_log::{error, info};
+use bevy_log::{debug, error, info};
 use bevy_material::{
     bind_group_layout_entries::{
         binding_types::{storage_buffer, storage_buffer_read_only, uniform_buffer},
@@ -104,8 +104,8 @@ const REALLOCATION_FACTOR: f64 = 1.5;
 /// We round all allocations up to the nearest multiple of this.
 const REALLOCATION_SIZE_MULTIPLE: usize = 256;
 
-/// The number of dirty-page bits packed into each [`AtomicU64`] word.
-const PAGES_PER_DIRTY_WORD: u32 = 64;
+/// The number of bits packed into each [`AtomicU64`] word.
+const BITS_PER_WORD: u32 = 64;
 
 /// Pipelines for the sparse buffer update shader.
 ///
@@ -149,12 +145,8 @@ pub struct SparseBufferUpdateJobs(pub Vec<SparseBufferUpdateJob>);
 pub struct SparseBufferUpdateJob {
     /// A handle to the buffer to be updated.
     sparse_buffer_handle: SparseBufferHandle,
-    /// The number of pages to update.
-    updated_page_count: u32,
-    /// The base-2 logarithm of the size of a page for the buffer.
-    ///
-    /// The actual page size can be computed as `1 << page_size_log2`.
-    page_size_log2: u32,
+    /// The number of elements to update.
+    updated_element_count: u32,
     /// The size of each element in 32-bit words.
     element_word_size: u32,
     /// A debugging label for the buffer.
@@ -162,14 +158,9 @@
 }
 
 impl SparseBufferUpdateJob {
-    /// The number of elements per page.
-    fn page_size(&self) -> u32 {
-        1 << self.page_size_log2
-    }
-
     /// Calculates the number of words that need to be updated.
     fn words_to_update(&self) -> u32 {
-        self.updated_page_count * self.page_size() * self.element_word_size
+        self.updated_element_count * self.element_word_size
     }
 
     /// Calculates the number of workgroups that need to be dispatched.
@@ -185,12 +176,8 @@ struct GpuSparseBufferUpdateMetadata {
     /// The size of a single element in 32-bit words.
     element_size: u32,
-    /// The number of pages that need to be updated.
-    updated_page_count: u32,
-    /// The base-2 logarithm of the page size.
-    ///
-    /// That is, the page size is `1 << page_size_log2`.
-    page_size_log2: u32,
+    /// The number of elements that need to be updated.
+    updated_element_count: u32,
 }
 
 /// A system, part of the render graph, that performs sparse buffer updates to
@@ -334,32 +321,22 @@ impl SpecializedComputePipeline for SparseBufferUpdatePipelines {
 ///
 /// There's one such set of buffers per sparse buffer vector.
 struct SparseBufferStagingBuffers {
-    /// All pages that have changed and need to be updated.
+    /// All elements that have changed and need to be updated.
     source_data: RawBufferVec<u32>,
-    /// The index at which we write each page in [`Self::source_data`].
+    /// The index at which we write each element in [`Self::source_data`].
     ///
     /// The length of this buffer is equal to [`Self::source_data`] divided by
-    /// 2^[`Self::page_size_log2`].
+    /// [`Self::element_word_size`].
     indices: RawBufferVec<u32>,
 
     /// The size of each element in 32-bit words.
     element_word_size: u32,
-
-    /// The base-2 logarithm of the page size in elements.
-    ///
-    /// That is, the page size in elements is `1 << page_size_log2`.
-    page_size_log2: u32,
 }
 
 impl SparseBufferStagingBuffers {
-    /// The number of elements per page.
-    fn page_size(&self) -> usize {
-        1 << self.page_size_log2
-    }
-
     /// Creates a new set of staging buffers for a sparse buffer vector.
-    fn new(label: &str, element_word_size: u32, page_size_log2: u32) -> SparseBufferStagingBuffers {
+    fn new(label: &str, element_word_size: u32) -> SparseBufferStagingBuffers {
         let mut source_data_buffer =
             RawBufferVec::new(BufferUsages::COPY_DST | BufferUsages::STORAGE);
         source_data_buffer.set_label(Some(&*format!("{} staging buffer", label)));
@@ -371,16 +348,12 @@ impl SparseBufferStagingBuffers {
             source_data: source_data_buffer,
             indices: indices_buffer,
             element_word_size,
-            page_size_log2,
         }
     }
 
-    /// Returns the number of updated pages.
-    fn updated_page_count(&self) -> u32 {
-        // Note that we don't have to round up here because data is always
-        // uploaded in increments of a whole page.
-        let element_count = self.source_data.len() / self.element_word_size as usize;
-        (element_count / self.page_size()) as u32
+    /// Returns the number of updated elements.
+    fn updated_element_count(&self) -> u32 {
+        (self.source_data.len() / self.element_word_size as usize) as u32
     }
 
     /// Writes the buffers that contain all the data necessary to perform a
@@ -394,7 +367,7 @@ impl SparseBufferStagingBuffers {
         render_device: &RenderDevice,
         render_queue: &RenderQueue,
     ) {
-        metadata_uniform.get_mut().updated_page_count = self.updated_page_count();
+        metadata_uniform.get_mut().updated_element_count = self.updated_element_count();
         metadata_uniform.write_buffer(render_device, render_queue);
 
         self.source_data.write_buffer(render_device, render_queue);
@@ -403,21 +376,38 @@ impl SparseBufferStagingBuffers {
     /// Returns true if a sparse buffer update should *not* be performed because
     /// too many words changed.
-    fn should_perform_full_reupload(&self, changed_page_count: u32, buffer_length: usize) -> bool {
+    fn should_perform_full_reupload(
+        &self,
+        changed_element_count: u32,
+        buffer_length: usize,
+    ) -> bool {
         // Calculate the number of changed words. If it's greater than the
         // maximum number of workgroups as defined by `wgpu`, we must perform a
         // full reupload.
-        let total_changed_word_count =
-            changed_page_count * self.page_size() as u32 * self.element_word_size;
+        //
+        // FIXME: This degrades performance in exactly the scenarios where we
+        // need it most. We should fall back to doing multiple rounds of
+        // uploads in this case.
+        let total_changed_word_count = changed_element_count * self.element_word_size;
         if total_changed_word_count > MAX_WORKGROUPS * SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE {
             return true;
         }
 
         // Don't perform a sparse upload if too many words changed, as it'll end
         // up being slower than just uploading the whole buffer afresh.
-        let sparse_upload_fraction =
-            changed_page_count as f64 / buffer_length.div_ceil(self.page_size()) as f64;
-        sparse_upload_fraction > SPARSE_UPLOAD_THRESHOLD
+        let sparse_upload_fraction = changed_element_count as f64 / buffer_length as f64;
+        let should_reupload = sparse_upload_fraction > SPARSE_UPLOAD_THRESHOLD;
+
+        debug!(
+            "Sparse buffer changed {}/{} elements ({:.3}, threshold {:.3}): performing {} upload",
+            changed_element_count,
+            buffer_length,
+            sparse_upload_fraction,
+            SPARSE_UPLOAD_THRESHOLD,
+            if should_reupload { "full" } else { "sparse" }
+        );
+
+        should_reupload
     }
 }
 
@@ -428,9 +418,9 @@
 /// This type is similar to
 /// [`crate::render_resource::buffer_vec::AtomicRawBufferVec`], but instead of
 /// reuploading the entire buffer to the GPU when it's changed, it tracks
-/// changes on a per-page level and uploads only the pages that changed if the
-/// number of such pages is small. It uses a compute shader to scatter the
-/// changed pages.
+/// changes on a per-element level and uploads only the elements that changed if
+/// the number of such elements is small. It uses a compute shader to scatter the
+/// changed elements.
 ///
 /// As the stored data is [`AtomicPod`], multiple threads may update the buffer
 /// simultaneously. Note that, like
@@ -464,12 +454,23 @@ where
     buffer_usages: BufferUsages,
     /// An optional debug label to identify this buffer.
     label: Arc<str>,
-    /// A bit set of dirty pages.
+    /// A bit set of dirty blocks.
+    ///
+    /// The size of this vector in bits is the number of elements divided
+    /// (rounded up) by 64: in other words, the size of this vector in *bits* is
+    /// the size of the [`Self::dirty_bits`] vector in *words*. A 1 in a bit
+    /// indicates that the block has changed since the last upload, while a 0
+    /// indicates that the block hasn't changed.
+    summary: Vec<AtomicU64>,
+    /// A bit set of dirty elements.
+    ///
+    /// The size of this vector in bits is the number of elements, rounded up to
+    /// the nearest 64. A 1 in a bit indicates that the element has changed since
+    /// the last upload, while a 0 indicates that the element hasn't changed.
     ///
-    /// The size of this vector in bits is the number of elements divided by the
-    /// page size, rounded up. A 1 in a bit indicates that the page has changed
-    /// since the last upload, while a 0 indicates that the page hasn't changed.
-    dirty_pages: Vec<AtomicU64>,
+    /// Each group of 64 elements, corresponding to a single word in this array,
+    /// is known as a *block*.
+    dirty_bits: Vec<AtomicU64>,
     /// True if the entire buffer needs to be reuploaded because it resized.
     needs_full_reupload: bool,
     /// True if a sparse update is to be performed.
@@ -480,21 +481,13 @@ impl<T> AtomicSparseBufferVec<T>
 where
     T: AtomicPod,
 {
-    /// The number of elements per page.
-    fn page_size(&self) -> u32 {
-        1 << self.staging_buffers.page_size_log2
-    }
-
     /// Creates a new [`AtomicSparseBufferVec`] with the given set of buffer
-    /// usages, page size, and label.
+    /// usages and label.
     ///
     /// `buffer_usages` specifies the set of allowed `wgpu` buffer usages for
     /// the buffer that [`AtomicSparseBufferVec`] manages.
     /// `BufferUsages::COPY_DST` is automatically added to this set.
-    ///
-    /// The `page_size_log2` parameter is the base-2 logarithm of the page size.
-    /// That is, the page size is `1 << page_size_log2`.
-    pub fn new(buffer_usages: BufferUsages, page_size_log2: u32, label: Arc<str>) -> Self {
+    pub fn new(buffer_usages: BufferUsages, label: Arc<str>) -> Self {
         // Make sure the value is word-aligned.
         debug_assert_eq!(size_of::<T>() % 4, 0);
         let element_word_size = size_of::<T>() / 4;
@@ -508,18 +501,13 @@ where
             handle: id,
             values: vec![],
             data_buffer: None,
-            staging_buffers: SparseBufferStagingBuffers::new(
-                &label,
-                element_word_size as u32,
-                page_size_log2,
-            ),
-            metadata_uniform: UniformBuffer::from(GpuSparseBufferUpdateMetadata::new::<T>(
-                page_size_log2,
-            )),
+            staging_buffers: SparseBufferStagingBuffers::new(&label, element_word_size as u32),
+            metadata_uniform: UniformBuffer::from(GpuSparseBufferUpdateMetadata::new::<T>()),
             capacity: 0,
             buffer_usages: buffer_usages | BufferUsages::COPY_DST,
             label,
-            dirty_pages: vec![],
+            summary: vec![],
+            dirty_bits: vec![],
             needs_full_reupload: false,
             sparse_update_scheduled: false,
         }
@@ -542,7 +530,9 @@ where
     /// Removes all elements from the buffer.
     pub fn clear(&mut self) {
-        self.truncate(0);
+        self.values.clear();
+        self.summary.clear();
+        self.dirty_bits.clear();
     }
 
     /// Copies a value out of the buffer.
@@ -572,26 +562,23 @@ where
         self.values.push(T::Blob::default());
         value.write_to_blob(&self.values[index as usize]);
 
-        let page_word = (self.index_to_page(index) / PAGES_PER_DIRTY_WORD) as usize;
-        while self.dirty_pages.len() < page_word + 1 {
-            self.dirty_pages.push(AtomicU64::default());
+        let dirty_word_index = (index / BITS_PER_WORD) as usize;
+        let summary_word_index = dirty_word_index / BITS_PER_WORD as usize;
+        while self.summary.len() < summary_word_index + 1 {
+            self.summary.push(AtomicU64::default());
+        }
+        while self.dirty_bits.len() < dirty_word_index + 1 {
+            self.dirty_bits.push(AtomicU64::default());
         }
 
-        self.note_changed_index(index);
+        self.note_changed_index(index);
 
         index
     }
 
-    /// Marks the page corresponding to the given element index as dirty so that
-    /// we know that we need to upload it.
+    /// Marks the given element index as dirty so that we know that we need to
+    /// upload it.
     fn note_changed_index(&self, index: u32) {
-        let page = self.index_to_page(index);
-        let (page_word, page_in_word) = (page / PAGES_PER_DIRTY_WORD, page % PAGES_PER_DIRTY_WORD);
-        self.dirty_pages[page_word as usize].fetch_or(1 << page_in_word, Ordering::Relaxed);
-    }
-
-    /// Returns the page corresponding to the given element index.
-    fn index_to_page(&self, index: u32) -> u32 {
-        index / self.page_size()
+        note_changed_index(index, &self.summary, &self.dirty_bits);
     }
 
     /// Ensures that the backing buffer for this buffer vector is present and
@@ -612,6 +599,8 @@ where
     /// Grows the buffer by adding default values so that it's at least the
     /// given size.
     ///
+    /// This method sets all the newly-added values to dirty.
+    ///
     /// If the buffer is already large enough, this method does nothing.
     pub fn grow(&mut self, new_len: u32) {
         let old_len = self.values.len() as u32;
@@ -622,41 +611,7 @@ where
         self.values.reserve(new_len as usize - old_len as usize);
         self.values.resize_with(new_len as usize, T::Blob::default);
 
-        // This is a bit tricky. We want to set the dirty bits corresponding to
-        // all pages that we added, if any. First, we compute the index of the
-        // last page word before the append operation.
-        let old_final_page = self.index_to_page(old_len);
-        let old_final_page_word_index = old_final_page / PAGES_PER_DIRTY_WORD;
-        let old_final_page_in_word = old_final_page % PAGES_PER_DIRTY_WORD;
-
-        // Next, we set the bits corresponding to every page that we added to
-        // that final page word. Note that this might set bits corresponding to
-        // pages past the end of our buffer; that's OK as we ignore them.
-        if old_final_page_in_word != 0
-            && let Some(ref mut old_final_atomic_page_word) =
-                self.dirty_pages.get_mut(old_final_page_word_index as usize)
-        {
-            *old_final_atomic_page_word.get_mut() |= !((1u64 << old_final_page_in_word) - 1);
-        }
-
-        // Finally, we add any new page words, with all bits set.
-        let new_page_count = self.index_to_page(new_len);
-        self.dirty_pages.resize_with(
-            (new_page_count as usize).div_ceil(PAGES_PER_DIRTY_WORD as usize),
-            || AtomicU64::new(u64::MAX),
-        );
-    }
-
-    /// Truncates the buffer to the given length.
-    ///
-    /// If the buffer is already that length or shorter, this method does
-    /// nothing.
-    pub fn truncate(&mut self, len: u32) {
-        self.values.truncate(len as usize);
-
-        let page = self.index_to_page(len);
-        self.dirty_pages
-            .truncate(page.div_ceil(PAGES_PER_DIRTY_WORD) as usize);
+        set_dirty_bits_for_vector_growth(old_len, new_len, &mut self.summary, &mut self.dirty_bits);
     }
 
     /// Writes the data to the GPU, either via a sparse upload or a bulk data
@@ -682,23 +637,15 @@ where
     /// because it was resized or because too much data changed for a sparse
     /// update to be worthwhile.
     fn should_perform_full_reupload(&self, render_device: &RenderDevice) -> bool {
-        if self.needs_full_reupload {
-            return true;
-        }
-
-        if render_device.limits().max_storage_buffers_per_shader_stage < 3 {
+        if self.needs_full_reupload
+            || render_device.limits().max_storage_buffers_per_shader_stage < 3
+        {
            return true;
        }

-        // Calculate the number of changed pages via population count.
-        let changed_page_count: u32 = self
-            .dirty_pages
-            .iter()
-            .map(|atomic_page_word| atomic_page_word.load(Ordering::Relaxed).count_ones())
-            .sum();
-
+        let changed_element_count = count_dirty_elements(&self.summary, &self.dirty_bits);
         self.staging_buffers
-            .should_perform_full_reupload(changed_page_count, self.values.len())
+            .should_perform_full_reupload(changed_element_count, self.values.len())
     }
 
     /// Writes the entire buffer in bulk.
@@ -727,56 +674,60 @@ where
         }
 
-        // Mark all pages as clean.
-        for atomic_page_word in self.dirty_pages.iter() {
-            atomic_page_word.store(0, Ordering::Relaxed);
+        // Mark all blocks and elements as clean.
+        for atomic_summary_word in self.summary.iter() {
+            atomic_summary_word.store(0, Ordering::Relaxed);
+        }
+        for atomic_dirty_word in self.dirty_bits.iter() {
+            atomic_dirty_word.store(0, Ordering::Relaxed);
         }
 
         self.sparse_update_scheduled = false;
     }
 
-    /// Schedules a sparse upload of only the pages that changed.
+    /// Schedules a sparse upload of only the elements that changed.
    fn prepare_sparse_upload(&mut self, render_device: &RenderDevice, render_queue: &RenderQueue) {
-        // Iterate over all dirty pages.
-        for (page_word_index, atomic_page_word) in self.dirty_pages.iter().enumerate() {
-            let page_word = atomic_page_word.load(Ordering::Relaxed);
-            for page_index_in_word in BitIter::new(page_word) {
-                let page = page_word_index as u32 * PAGES_PER_DIRTY_WORD + page_index_in_word;
-
-                // Write the index of the page so the shader will know where to
-                // scatter the data to.
-                self.staging_buffers.indices.push(page);
-
-                // Copy the page to the GPU staging buffer.
-                let page_size = self.staging_buffers.page_size();
-                let page_start = page as usize * page_size;
-                let page_end = page_start + page_size;
-                for value_index in page_start..page_end {
-                    match self.values.get(value_index) {
-                        Some(blob) => {
-                            let value = T::read_from_blob(blob);
-                            self.staging_buffers
-                                .source_data
-                                .extend(bytemuck::cast_slice(&[value]).iter().copied());
-                        }
-                        None => {
-                            self.staging_buffers.source_data.extend(iter::repeat_n(
-                                0,
-                                self.staging_buffers.element_word_size as usize,
-                            ));
-                        }
-                    }
+        // Iterate over all dirty elements, using the summary to accelerate the
+        // search.
+        for (summary_word_index, atomic_summary_word) in self.summary.iter().enumerate() {
+            let summary_word = atomic_summary_word.load(Ordering::Relaxed);
+            for summary_bit_offset in BitIter::new(summary_word) {
+                let dirty_word_index =
+                    summary_word_index * BITS_PER_WORD as usize + summary_bit_offset as usize;
+
+                // Iterate over all dirty elements in each dirty block.
+                let atomic_dirty_word = &self.dirty_bits[dirty_word_index];
+                let dirty_word = atomic_dirty_word.load(Ordering::Relaxed);
+                for dirty_bit_offset in BitIter::new(dirty_word) {
+                    let element_index =
+                        dirty_word_index * BITS_PER_WORD as usize + dirty_bit_offset as usize;
+
+                    let Some(blob) = self.values.get(element_index) else {
+                        continue;
+                    };
+
+                    // Write the index of the element so the shader will know where to
+                    // scatter the data to.
+                    self.staging_buffers.indices.push(element_index as u32);
+
+                    // Copy the element to the GPU staging buffer.
+                    let value = T::read_from_blob(blob);
+                    self.staging_buffers
+                        .source_data
+                        .extend(bytemuck::cast_slice(&[value]).iter().copied());
+
+                    // Make sure we're aligned up to a full element.
+                    debug_assert_eq!(
+                        self.staging_buffers.source_data.len()
+                            % self.staging_buffers.element_word_size as usize,
+                        0
+                    );
                 }
 
-                // Make sure we're aligned up to a full page.
-                debug_assert_eq!(
-                    self.staging_buffers.source_data.len()
-                        % (self.staging_buffers.element_word_size as usize
-                            * self.staging_buffers.page_size()),
-                    0
-                );
+                // Mark all of this word's elements as clean.
+                atomic_dirty_word.store(0, Ordering::Relaxed);
            }
 
-            // Mark the page as clean.
-            atomic_page_word.store(0, Ordering::Relaxed);
+            // Mark the block as clean.
+            atomic_summary_word.store(0, Ordering::Relaxed);
        }
 
        // Schedule a sparse update if there was something to do.
@@ -856,6 +807,116 @@ impl FromWorld for SparseBufferUpdateBindGroups {
     }
 }
 
+/// Marks elements within the range `old_len..new_len` as dirty, under the
+/// assumption that the vector is being resized from a length of `old_len` to a
+/// length of `new_len`.
+///
+/// This is more efficient than marking elements one at a time. It also resizes
+/// the `summary` and `dirty_bits` bitfields as necessary.
+///
+/// `new_len` must be greater than or equal to `old_len`.
+fn set_dirty_bits_for_vector_growth(
+    old_len: u32,
+    new_len: u32,
+    summary: &mut Vec<AtomicU64>,
+    dirty_bits: &mut Vec<AtomicU64>,
+) {
+    debug_assert!(new_len >= old_len);
+    if new_len == old_len {
+        return;
+    }
+
+    if old_len > 0 {
+        // Compute the index of the bit corresponding to the final existing
+        // element. We're going to set every bit *after* that bit.
+        let old_final_dirty_word_index = (old_len - 1) / BITS_PER_WORD;
+        let old_final_dirty_bit_offset = (old_len - 1) % BITS_PER_WORD;
+        if old_final_dirty_bit_offset < BITS_PER_WORD - 1
+            && let Some(ref mut old_final_atomic_dirty_word) =
+                dirty_bits.get_mut(old_final_dirty_word_index as usize)
+        {
+            // We add one here because we want to set every bit *after*, but not
+            // including, the index we computed above.
+            *old_final_atomic_dirty_word.get_mut() |=
+                !((1u64 << (old_final_dirty_bit_offset + 1)).wrapping_sub(1));
+        }
+
+        // Now set all the blocks from the block corresponding to `old_len - 1`
+        // onward to dirty. Note that this is an inclusive range, because we
+        // want to include the block that `old_len - 1` is on.
+        let old_final_summary_word_index = old_final_dirty_word_index / BITS_PER_WORD;
+        let mut old_final_summary_bit_offset = old_final_dirty_word_index % BITS_PER_WORD;
+        // This is a tricky exception. If `old_len` was precisely aligned on a
+        // block boundary, then we *don't* include the block that `old_len - 1`
+        // is on.
+        if old_final_dirty_bit_offset == BITS_PER_WORD - 1 {
+            old_final_summary_bit_offset += 1;
+        }
+        if let Some(ref mut old_final_atomic_summary_word) =
+            summary.get_mut(old_final_summary_word_index as usize)
+        {
+            // We don't add one to `old_final_summary_bit_offset` here because
+            // we want to include the block that `old_len - 1` is on.
+            *old_final_atomic_summary_word.get_mut() |=
+                !((1u64 << old_final_summary_bit_offset).wrapping_sub(1));
+        }
+    }
+
+    // Add any new summary and dirty words, with all bits set.
+    let new_dirty_word_count = (new_len as usize).div_ceil(BITS_PER_WORD as usize);
+    let new_summary_word_count = new_dirty_word_count.div_ceil(BITS_PER_WORD as usize);
+    summary.resize_with(new_summary_word_count, || AtomicU64::new(u64::MAX));
+    dirty_bits.resize_with(new_dirty_word_count, || AtomicU64::new(u64::MAX));
+
+    // Clear all bits past the last valid element index in `dirty_bits`.
+    let last_dirty_bit_offset = new_len % BITS_PER_WORD;
+    if last_dirty_bit_offset != 0 {
+        let mut final_dirty_word = dirty_bits[new_dirty_word_count - 1].load(Ordering::Relaxed);
+        final_dirty_word &= (1u64 << last_dirty_bit_offset) - 1;
+        dirty_bits[new_dirty_word_count - 1].store(final_dirty_word, Ordering::Relaxed);
+    }
+
+    // Clear all bits past the last valid summary bit in `summary`.
+    let last_summary_bit_offset = new_dirty_word_count % BITS_PER_WORD as usize;
+    if last_summary_bit_offset != 0 {
+        let mut final_summary_word = summary[new_summary_word_count - 1].load(Ordering::Relaxed);
+        final_summary_word &= (1u64 << last_summary_bit_offset) - 1;
+        summary[new_summary_word_count - 1].store(final_summary_word, Ordering::Relaxed);
+    }
+}
+
+/// Marks the given element index as dirty so that we know that we need to
+/// upload it.
+///
+/// This is a separate function so we can unit test it easily (i.e., without
+/// needing a `RenderDevice`).
+fn note_changed_index(index: u32, summary: &[AtomicU64], dirty_bits: &[AtomicU64]) {
+    let dirty_word_index = index / BITS_PER_WORD;
+    let (summary_word_index, summary_bit_offset) = (
+        dirty_word_index / BITS_PER_WORD,
+        dirty_word_index % BITS_PER_WORD,
+    );
+    summary[summary_word_index as usize].fetch_or(1 << summary_bit_offset, Ordering::Relaxed);
+    let (element_word, element_in_word) = (index / BITS_PER_WORD, index % BITS_PER_WORD);
+    dirty_bits[element_word as usize].fetch_or(1 << element_in_word, Ordering::Relaxed);
+}
+
+/// Returns the total number of bits set in `dirty_bits`, using the given
+/// `summary` to accelerate the count.
+fn count_dirty_elements(summary: &[AtomicU64], dirty_bits: &[AtomicU64]) -> u32 {
+    let mut changed_element_count = 0u32;
+    for (summary_word_index, summary_word) in summary.iter().enumerate() {
+        for summary_bit_offset in BitIter::new(summary_word.load(Ordering::Relaxed)) {
+            let dirty_word_index =
+                summary_word_index * BITS_PER_WORD as usize + summary_bit_offset as usize;
+            let dirty_word = dirty_bits[dirty_word_index].load(Ordering::Relaxed);
+            changed_element_count += dirty_word.count_ones();
+        }
+    }
+
+    changed_element_count
+}
+
 /// Prepares all GPU resources necessary to perform a sparse buffer update,
 /// other than updating the metadata uniform.
 ///
@@ -888,8 +949,7 @@ fn prepare_to_populate_buffers(
         // Record the update job.
         sparse_buffer_update_jobs.push(SparseBufferUpdateJob {
             sparse_buffer_handle: sparse_buffer_handle.clone(),
-            page_size_log2: staging_buffers.page_size_log2,
-            updated_page_count: staging_buffers.updated_page_count(),
+            updated_element_count: staging_buffers.updated_element_count(),
             element_word_size: staging_buffers.element_word_size,
             label: (*label).clone(),
         });
@@ -949,14 +1009,12 @@ fn reserve(
 }
 
 impl GpuSparseBufferUpdateMetadata {
-    /// Returns a new [`GpuSparseBufferUpdateMetadata`] for the given type and
-    /// page size.
-    fn new<T>(page_size_log2: u32) -> GpuSparseBufferUpdateMetadata {
+    /// Returns a new [`GpuSparseBufferUpdateMetadata`] for the given type.
+    fn new<T>() -> GpuSparseBufferUpdateMetadata {
         assert_eq!(size_of::<T>() % 4, 0);
         GpuSparseBufferUpdateMetadata {
             element_size: (size_of::<T>() / 4) as u32,
-            updated_page_count: 0,
-            page_size_log2,
+            updated_element_count: 0,
         }
     }
 }
@@ -991,3 +1049,156 @@ fn calculate_allocation_size(length: usize) -> usize {
     let size = REALLOCATION_FACTOR.powf(exponent) as usize;
     size.next_multiple_of(REALLOCATION_SIZE_MULTIPLE)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::{BitIter, BITS_PER_WORD};
+    use core::{
+        iter,
+        sync::atomic::{AtomicU64, Ordering},
+    };
+    use proptest::prelude::proptest;
+
+    proptest! {
+        // Ensures that `BitIter` is correct.
+        #[test]
+        fn bit_iter(bits: u64) {
+            let mut bits_reference: Vec<_> = (0u32..64u32).filter(|bit_pos| {
+                (bits & (1 << bit_pos)) != 0
+            }).collect();
+            bits_reference.sort_unstable();
+
+            let mut bits_iter_results: Vec<_> = BitIter::new(bits).collect();
+            bits_iter_results.sort_unstable();
+
+            assert_eq!(bits_iter_results, bits_reference);
+        }
+
+        // Ensures that `set_dirty_bits_for_vector_growth` is correct.
+        #[test]
+        fn set_dirty_bits_for_vector_growth(
+            old_len in 0u32..16384u32,
+            new_element_count in 0u32..16384u32,
+            start_dirty: bool
+        ) {
+            // Initialize the dirty bits.
+            let new_len = old_len + new_element_count;
+            let mut dirty_bits: Vec<_> = iter::repeat_with(|| {
+                AtomicU64::new(if start_dirty { u64::MAX } else { 0 })
+            }).take(old_len.div_ceil(BITS_PER_WORD) as usize).collect();
+            let mut summary: Vec<_> = iter::repeat_with(|| {
+                AtomicU64::new(if start_dirty { u64::MAX } else { 0 })
+            }).take(dirty_bits.len().div_ceil(BITS_PER_WORD as usize)).collect();
+
+            super::set_dirty_bits_for_vector_growth(
+                old_len,
+                new_len,
+                &mut summary,
+                &mut dirty_bits
+            );
+
+            // Check dirty flags for elements.
+            // Bits in the range [0, old_len) should be unchanged.
+            for element_index in 0..old_len {
+                check_element_dirty(element_index, &dirty_bits, start_dirty);
+            }
+            // Bits in the range [old_len, new_len) should be dirty.
+            for element_index in old_len..new_len {
+                check_element_dirty(element_index, &dirty_bits, true);
+            }
+            // Bits in the range [new_len, end) should be clean.
+            for element_index in (new_len..).take_while(|element_index| {
+                element_index % BITS_PER_WORD != 0
+            }) {
+                check_element_dirty(element_index, &dirty_bits, false);
+            }
+
+            // Check the dirty flag for each block to ensure that it precisely
+            // corresponds to the logical *or* of the dirty flags for all
+            // elements in that block.
+            for (dirty_word_index, atomic_dirty_word) in dirty_bits.iter().enumerate() {
+                // Determine the range of elements that this block encompasses.
+                let element_start = dirty_word_index * BITS_PER_WORD as usize;
+                let element_end =
+                    ((dirty_word_index + 1) * BITS_PER_WORD as usize).min(new_len as usize);
+                assert!(element_start <= element_end);
+
+                // Determine whether the block should be dirty.
+                let dirty_word = atomic_dirty_word.load(Ordering::Relaxed);
+                let block_is_dirty = (element_start..element_end).any(|element_index| {
+                    (dirty_word & (1 << (element_index % (BITS_PER_WORD as usize)))) != 0
+                });
+
+                // Check to make sure that the block has the correct dirty state.
+                check_block_dirty(dirty_word_index as u32, &summary, block_is_dirty);
+            }
+
+            // Make sure that all dirty block bits past the last valid dirty
+            // block bit are clear.
+            if !summary.is_empty() {
+                let last_summary_word_index = summary.len() - 1;
+                let last_padding_block_index = last_summary_word_index * BITS_PER_WORD as usize;
+                let last_dirty_word_index = (new_len as usize - 1) / BITS_PER_WORD as usize;
+                for padding_block_index in (last_dirty_word_index + 1)..last_padding_block_index {
+                    check_block_dirty(padding_block_index as u32, &summary, false);
+                }
+            }
+
+            // Asserts that the dirty status of the element at `element_index`
+            // matches the expected dirty status.
+            fn check_element_dirty(
+                element_index: u32,
+                dirty_bits: &[AtomicU64],
+                expect_dirty: bool
+            ) {
+                let expected = if expect_dirty { 1 } else { 0 };
+
+                let dirty_word_index = element_index / BITS_PER_WORD;
+                let dirty_bit_offset = element_index % BITS_PER_WORD;
+                let dirty_word = dirty_bits[dirty_word_index as usize].load(Ordering::Relaxed);
+                assert_eq!((dirty_word >> dirty_bit_offset) & 1, expected);
+            }
+
+            // Asserts that the dirty status of the block at `block_index`
+            // matches the expected dirty status in the summary.
+            //
+            // This is actually the same code as `check_element_dirty`, but is
+            // duplicated for clarity.
+            fn check_block_dirty(block_index: u32, summary: &[AtomicU64], expect_dirty: bool) {
+                let expected = if expect_dirty { 1 } else { 0 };
+
+                let summary_word_index = block_index / BITS_PER_WORD;
+                let summary_bit_offset = block_index % BITS_PER_WORD;
+                let summary_word = summary[summary_word_index as usize].load(Ordering::Relaxed);
+                assert_eq!((summary_word >> summary_bit_offset) & 1, expected);
+            }
+        }
+
+        // Ensures that the population-count-based `count_dirty_elements` code
+        // correctly calculates the number of changed elements.
+        //
+        // The input `dirty_flags` is an array of booleans, one for each
+        // element, in which `false` represents "not changed" and `true`
+        // represents "changed".
+        #[test]
+        fn dirty_element_count(dirty_flags: Vec<bool>) {
+            let dirty_word_count = dirty_flags.len().div_ceil(BITS_PER_WORD as usize);
+            let summary_word_count = dirty_word_count.div_ceil(BITS_PER_WORD as usize);
+
+            let dirty_bits: Vec<_> = (0..dirty_word_count).map(|_| AtomicU64::new(0)).collect();
+            let summary: Vec<_> = (0..summary_word_count).map(|_| AtomicU64::new(0)).collect();
+
+            let mut true_dirty_element_count = 0;
+            for (element_index, _) in dirty_flags.iter().enumerate().filter(|(_, element)| **element) {
+                super::note_changed_index(element_index as u32, &summary, &dirty_bits);
+                true_dirty_element_count += 1;
+            }
+
+            let calculated_dirty_element_count = super::count_dirty_elements(
+                &summary,
+                &dirty_bits
+            );
+            assert_eq!(calculated_dirty_element_count, true_dirty_element_count);
+        }
+    }
+}
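
For intuition, the two-level dirty-bit scheme this patch introduces can be sketched standalone: one `summary` bit covers one 64-element word of `dirty_bits`, so a scan can skip 4096 clean elements per clean summary word. This is a minimal non-atomic sketch with illustrative names (`mark_dirty`, `count_dirty`), not the crate's API:

```rust
const BITS_PER_WORD: u32 = 64;

fn mark_dirty(index: u32, summary: &mut [u64], dirty_bits: &mut [u64]) {
    let dirty_word = (index / BITS_PER_WORD) as usize;
    // Set the element's bit...
    dirty_bits[dirty_word] |= 1 << (index % BITS_PER_WORD);
    // ...and the bit for its block in the summary.
    summary[dirty_word / BITS_PER_WORD as usize] |= 1 << (dirty_word as u32 % BITS_PER_WORD);
}

fn count_dirty(summary: &[u64], dirty_bits: &[u64]) -> u32 {
    let mut count = 0;
    for (summary_index, &summary_word) in summary.iter().enumerate() {
        let mut word = summary_word;
        // Visit only blocks whose summary bit is set; clean blocks are
        // skipped 64 at a time.
        while word != 0 {
            let bit = word.trailing_zeros();
            count += dirty_bits[summary_index * 64 + bit as usize].count_ones();
            word &= word - 1; // clear the lowest set bit
        }
    }
    count
}

fn main() {
    // 4096 elements -> 64 dirty words -> 1 summary word.
    let mut dirty_bits = vec![0u64; 64];
    let mut summary = vec![0u64; 1];
    for index in [0, 63, 64, 4095] {
        mark_dirty(index, &mut summary, &mut dirty_bits);
    }
    assert_eq!(count_dirty(&summary, &dirty_bits), 4);
}
```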
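The boundary masks in `set_dirty_bits_for_vector_growth` are the subtle part: one mask dirties the tail of the last pre-existing word, another clears padding bits past `new_len`. A worked numeric check, assuming the same 64-bit words (illustrative code, not taken from the patch):

```rust
fn main() {
    const BITS_PER_WORD: u32 = 64;
    let (old_len, new_len) = (100u32, 300u32);

    // The last pre-existing element is index 99: word 1, bit 35. Every bit
    // *after* bit 35 in word 1 must become dirty.
    let old_final_word = (old_len - 1) / BITS_PER_WORD; // 1
    let old_final_bit = (old_len - 1) % BITS_PER_WORD; // 35
    let grow_mask = !((1u64 << (old_final_bit + 1)).wrapping_sub(1));
    assert_eq!(old_final_word, 1);
    assert_eq!(grow_mask.count_ones(), 64 - 36); // bits 36..=63

    // After growth there are ceil(300 / 64) = 5 dirty words. The final word
    // only covers elements 256..300, so bits at offset 44 and above must be
    // cleared to avoid uploading nonexistent elements.
    let word_count = new_len.div_ceil(BITS_PER_WORD); // 5
    let tail_mask = (1u64 << (new_len % BITS_PER_WORD)) - 1;
    assert_eq!(word_count, 5);
    assert_eq!(tail_mask.count_ones(), 300 - 256); // bits 0..44
}
```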
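The scatter in `sparse_buffer_update.wgsl` now boils down to one division, one modulo, and one indexed lookup per thread. A CPU-side mirror of that per-invocation math (hypothetical helper, for illustration only):

```rust
// `indices` plays the role of the staging index buffer; `element_size` is
// the element size in 32-bit words.
fn scatter_target(invocation_index: u32, element_size: u32, indices: &[u32]) -> (usize, u32) {
    // Which staged element this thread belongs to, and which word within it.
    let element_index = invocation_index / element_size;
    let word_index = invocation_index % element_size;
    // The staging index buffer says where that element lives in the
    // destination buffer.
    let dest_element_index = indices[element_index as usize];
    let dest_word = dest_element_index * element_size + word_index;
    (dest_word as usize, word_index)
}

fn main() {
    // Two staged 4-word elements, destined for destination slots 7 and 2.
    let indices = [7u32, 2u32];
    // Thread 5 copies word 1 of the second staged element into slot 2.
    assert_eq!(scatter_target(5, 4, &indices), (2 * 4 + 1, 1));
}
```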
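The `MAX_WORKGROUPS` guard in `should_perform_full_reupload` exists because the scatter dispatches one thread per changed word. A sketch of the sizing logic; the workgroup size here is an assumed value, and the `MAX_WORKGROUPS` figure is `wgpu`'s default per-dimension dispatch limit (the real constants live in the crate):

```rust
const SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE: u32 = 64; // assumption
const MAX_WORKGROUPS: u32 = 65_535; // wgpu's default per-dimension limit

fn workgroup_count(updated_element_count: u32, element_word_size: u32) -> Option<u32> {
    // One thread per 32-bit word, rounded up to whole workgroups.
    let words = updated_element_count * element_word_size;
    let groups = words.div_ceil(SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE);
    // Past the dispatch limit, the code above falls back to a full reupload.
    (groups <= MAX_WORKGROUPS).then_some(groups)
}

fn main() {
    assert_eq!(workgroup_count(1000, 4), Some(4000u32.div_ceil(64)));
    assert_eq!(workgroup_count(u32::MAX / 4, 4), None);
}
```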
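Both the upload path and the tests lean on `BitIter` to walk the set bits of a word. A hypothetical stand-in showing the standard `trailing_zeros` trick it presumably relies on (not the crate's implementation):

```rust
struct SetBits(u64);

impl Iterator for SetBits {
    type Item = u32;

    fn next(&mut self) -> Option<u32> {
        if self.0 == 0 {
            return None;
        }
        let bit = self.0.trailing_zeros();
        self.0 &= self.0 - 1; // clear the lowest set bit
        Some(bit)
    }
}

fn main() {
    // Yields set-bit positions in ascending order.
    let bits: Vec<u32> = SetBits(0b1010_0001).collect();
    assert_eq!(bits, vec![0, 5, 7]);
}
```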