Skip to main content

bevy_render/render_resource/
sparse_buffer_vec.rs

1//! GPU buffers that support sparse updates if only a small number of elements
2//! have changed.
3
4use alloc::sync::{Arc, Weak};
5use core::{
6    iter, slice,
7    sync::atomic::{AtomicU64, Ordering},
8};
9
10use bevy_app::{App, Plugin};
11use bevy_asset::{embedded_asset, load_embedded_asset, Handle};
12use bevy_derive::{Deref, DerefMut};
13use bevy_ecs::{
14    resource::Resource,
15    schedule::IntoScheduleConfigs as _,
16    system::{Res, ResMut},
17    world::{FromWorld, World},
18};
19use bevy_log::{error, info};
20use bevy_material::{
21    bind_group_layout_entries::{
22        binding_types::{storage_buffer, storage_buffer_read_only, uniform_buffer},
23        BindGroupLayoutEntries,
24    },
25    descriptor::{BindGroupLayoutDescriptor, CachedComputePipelineId, ComputePipelineDescriptor},
26};
27use bevy_shader::Shader;
28use bytemuck::{Pod, Zeroable};
29use encase::ShaderType;
30use weak_table::WeakKeyHashMap;
31use wgpu::{BufferDescriptor, BufferUsages, ComputePassDescriptor, ShaderStages};
32
33use crate::{
34    diagnostic::{DiagnosticsRecorder, RecordDiagnostics as _},
35    render_resource::{
36        AtomicPod, BindGroup, BindGroupEntries, Buffer, PipelineCache, RawBufferVec,
37        SpecializedComputePipeline, SpecializedComputePipelines, UniformBuffer,
38    },
39    renderer::{RenderDevice, RenderGraph, RenderGraphSystems, RenderQueue},
40    ExtractSchedule, RenderApp,
41};
42
43/// A plugin that allows sparse updates of GPU buffers if only a small number of
44/// elements have changed.
45pub struct SparseBufferPlugin;
46
47impl Plugin for SparseBufferPlugin {
48    fn build(&self, app: &mut App) {
49        embedded_asset!(app, "sparse_buffer_update.wgsl");
50    }
51
52    fn finish(&self, app: &mut App) {
53        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
54            return;
55        };
56
57        render_app
58            .init_resource::<SparseBufferUpdateJobs>()
59            .init_resource::<SparseBufferUpdatePipelines>()
60            .init_resource::<SpecializedComputePipelines<SparseBufferUpdatePipelines>>()
61            .init_resource::<SparseBufferUpdateBindGroups>()
62            .add_systems(ExtractSchedule, clear_sparse_buffer_jobs)
63            .add_systems(
64                RenderGraph,
65                // We perform sparse buffer updates very early so that sparse
66                // buffers can be used in any render pass.
67                update_sparse_buffers.in_set(RenderGraphSystems::Begin),
68            );
69    }
70}
71
72/// A globally-unique ID that identifies this sparse buffer.
73#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Deref, DerefMut)]
74pub struct SparseBufferId(pub u64);
75
76/// An object that allows the sparse buffer ID to be query and holds the bind
77/// group for that sparse buffer alive.
78///
79/// Each sparse buffer holds a strong reference to this handle, and the
80/// [`SparseBufferUpdateBindGroups`] resource contains a weak map from this
81/// handle to the bind group. This setup ensures that, when the sparse buffer is
82/// freed, the bind groups for that sparse buffer are freed as well.
83pub type SparseBufferHandle = Arc<SparseBufferId>;
84
/// Counter supplying the ID for the next sparse buffer to be created.
static NEXT_SPARSE_BUFFER_ID: AtomicU64 = AtomicU64::new(0);

/// Number of invocations in one workgroup of the sparse buffer shader.
const SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE: u32 = 256;

/// Largest fraction of a buffer's pages that may be dirty before we give up
/// on a sparse update and reupload the whole buffer instead.
///
/// The default of 15% was found experimentally on very large scenes and is
/// roughly in line with what other engines that perform sparse buffer updates
/// use.
const SPARSE_UPLOAD_THRESHOLD: f64 = 0.15;

/// The WebGPU limit on how many workgroups a single dispatch may launch.
const MAX_WORKGROUPS: u32 = 65_535;

/// Allocations are rounded up to the nearest power of this factor.
const REALLOCATION_FACTOR: f64 = 1.5;
/// Allocations are rounded up to the nearest multiple of this size.
const REALLOCATION_SIZE_MULTIPLE: usize = 256;

/// How many dirty-page bits fit into one [`AtomicU64`] word.
const PAGES_PER_DIRTY_WORD: u32 = 64;
109
110/// Pipelines for the sparse buffer update shader.
111///
112/// This shader is shared among all sparse buffer vectors.
113#[derive(Resource)]
114pub struct SparseBufferUpdatePipelines {
115    /// The bind group layout.
116    ///
117    /// We only have one bind group layout shared among all sparse buffer
118    /// vectors.
119    bind_group_layout: Option<BindGroupLayoutDescriptor>,
120    /// The shader that performs the scatter operation.
121    shader: Option<Handle<Shader>>,
122}
123
124/// A resource, part of the render world, that stores the bind groups for each
125/// sparse buffer.
126#[derive(Resource)]
127pub struct SparseBufferUpdateBindGroups {
128    /// The bind groups for each sparse buffer.
129    ///
130    /// These are stored in a weak map so that when the sparse buffer goes away,
131    /// the bind group for that buffer goes away as well.
132    bind_groups: WeakKeyHashMap<Weak<SparseBufferId>, SparseBufferUpdateBindGroup>,
133    /// The ID of the update shader pipeline shared among all sparse buffers.
134    pipeline_id: CachedComputePipelineId,
135}
136
137/// A single bind group for the sparse buffer update shader.
138pub struct SparseBufferUpdateBindGroup {
139    /// The actual bind group.
140    bind_group: BindGroup,
141}
142
143/// A resource, part of the render world, that stores all pending sparse updates
144/// to buffers.
145#[derive(Resource, Default, Deref, DerefMut)]
146pub struct SparseBufferUpdateJobs(pub Vec<SparseBufferUpdateJob>);
147
148/// Describes a sparse update operation for a buffer.
149pub struct SparseBufferUpdateJob {
150    /// A handle to the buffer to be updated.
151    sparse_buffer_handle: SparseBufferHandle,
152    /// The number of pages to update.
153    updated_page_count: u32,
154    /// The base-2 logarithm of the size of a page for the buffer.
155    ///
156    /// The actual page size can be computed as `1 << page_size_log2`.
157    page_size_log2: u32,
158    /// The size of each element in 32-bit words.
159    element_word_size: u32,
160    /// A debugging label for the buffer.
161    label: Arc<str>,
162}
163
164impl SparseBufferUpdateJob {
165    /// The number of elements per page.
166    fn page_size(&self) -> u32 {
167        1 << self.page_size_log2
168    }
169
170    /// Calculates the number of words that need to be updated.
171    fn words_to_update(&self) -> u32 {
172        self.updated_page_count * self.page_size() * self.element_word_size
173    }
174
175    /// Calculates the number of workgroups that need to be dispatched.
176    fn workgroup_count(&self) -> u32 {
177        self.words_to_update()
178            .div_ceil(SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE)
179    }
180}
181
182/// A GPU type that describes a sparse update that is to be performed.
183#[derive(Clone, Copy, Default, ShaderType, Pod, Zeroable)]
184#[repr(C)]
185struct GpuSparseBufferUpdateMetadata {
186    /// The size of a single element in 32-bit words.
187    element_size: u32,
188    /// The number of pages that need to be updated.
189    updated_page_count: u32,
190    /// The base-2 logarithm of the page size.
191    ///
192    /// That is, the page size is `1 << page_size_log2`.
193    page_size_log2: u32,
194}
195
196/// A system, part of the render graph, that performs sparse buffer updates to
197/// buffers for which only a small number of elements have changed.
198///
199/// This runs as early in the pipeline as possible so that sparse buffers can be
200/// used for any subsequent pass.
201fn update_sparse_buffers(
202    sparse_buffer_update_jobs: Res<SparseBufferUpdateJobs>,
203    sparse_buffer_update_bind_groups: Res<SparseBufferUpdateBindGroups>,
204    pipeline_cache: Res<PipelineCache>,
205    mut diagnostics: Option<ResMut<DiagnosticsRecorder>>,
206    render_device: Res<RenderDevice>,
207    render_queue: Res<RenderQueue>,
208) {
209    // Bail if we have nothing to do.
210    if sparse_buffer_update_jobs.is_empty() {
211        return;
212    }
213
214    // We need to create a command encoder since this pass isn't associated with
215    // a view.
216    let mut command_encoder =
217        render_device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
218            label: Some("sparse buffer update"),
219        });
220
221    let time_span = diagnostics
222        .as_mut()
223        .map(|diagnostics| diagnostics.time_span(&mut command_encoder, "sparse buffer update"));
224
225    command_encoder.push_debug_group("sparse buffer update");
226
227    let Some(compute_pipeline) =
228        pipeline_cache.get_compute_pipeline(sparse_buffer_update_bind_groups.pipeline_id)
229    else {
230        return;
231    };
232
233    // Process each sparse buffer update job.
234    for sparse_buffer_update_job in sparse_buffer_update_jobs.iter() {
235        let Some(sparse_buffer_update_bind_group) = sparse_buffer_update_bind_groups
236            .bind_groups
237            .get(&sparse_buffer_update_job.sparse_buffer_handle)
238        else {
239            continue;
240        };
241
242        let mut sparse_buffer_update_pass =
243            command_encoder.begin_compute_pass(&ComputePassDescriptor {
244                label: Some(&*format!(
245                    "sparse buffer update ({})",
246                    &sparse_buffer_update_job.label
247                )),
248                timestamp_writes: None,
249            });
250        sparse_buffer_update_pass.set_pipeline(compute_pipeline);
251        sparse_buffer_update_pass.set_bind_group(
252            0,
253            &sparse_buffer_update_bind_group.bind_group,
254            &[],
255        );
256        sparse_buffer_update_pass.dispatch_workgroups(
257            sparse_buffer_update_job.workgroup_count(),
258            1,
259            1,
260        );
261    }
262
263    command_encoder.pop_debug_group();
264    if let Some(time_span) = time_span {
265        time_span.end(&mut command_encoder);
266    }
267
268    render_queue.submit([command_encoder.finish()]);
269}
270
271/// A system that clears out the sparse buffer update jobs in preparation for a
272/// new frame.
273fn clear_sparse_buffer_jobs(mut sparse_buffer_update_jobs: ResMut<SparseBufferUpdateJobs>) {
274    sparse_buffer_update_jobs.clear();
275}
276
277impl FromWorld for SparseBufferUpdatePipelines {
278    fn from_world(world: &mut World) -> Self {
279        let render_device = world.resource::<RenderDevice>();
280        let limit = render_device.limits().max_storage_buffers_per_shader_stage;
281
282        if limit < 3 {
283            info!(
284                "Sparse buffer updates disabled. RenderDevice lacks support: max_storage_buffers_per_shader_stage ({}) < 3.",
285                limit
286            );
287
288            return SparseBufferUpdatePipelines {
289                bind_group_layout: None,
290                shader: None,
291            };
292        }
293
294        let bind_group_layout = BindGroupLayoutDescriptor::new(
295            "sparse buffer update bind group layout",
296            &BindGroupLayoutEntries::sequential(
297                ShaderStages::COMPUTE,
298                (
299                    // @group(0) @binding(0) var<storage, read_write> dest_buffer: array<u32>;
300                    storage_buffer::<u32>(false),
301                    // @group(0) @binding(1) var<storage> src_buffer: array<u32>;
302                    storage_buffer_read_only::<u32>(false),
303                    // @group(0) @binding(2) var<storage> indices: array<u32>;
304                    storage_buffer_read_only::<u32>(false),
305                    // @group(0) @binding(3) var<uniform> metadata:
306                    // SparseBufferUpdateMetadata;
307                    uniform_buffer::<GpuSparseBufferUpdateMetadata>(false),
308                ),
309            ),
310        );
311
312        SparseBufferUpdatePipelines {
313            bind_group_layout: Some(bind_group_layout),
314            shader: Some(load_embedded_asset!(world, "sparse_buffer_update.wgsl")),
315        }
316    }
317}
318
319impl SpecializedComputePipeline for SparseBufferUpdatePipelines {
320    type Key = ();
321
322    fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor {
323        ComputePipelineDescriptor {
324            label: Some("sparse buffer update pipeline".into()),
325            layout: self.bind_group_layout.clone().into_iter().collect(),
326            shader: self.shader.clone().unwrap_or_default(),
327            shader_defs: vec![],
328            ..ComputePipelineDescriptor::default()
329        }
330    }
331}
332
333/// The buffers that we use to sparsely scatter new data to the GPU.
334///
335/// There's one such set of buffers per sparse buffer vector.
336struct SparseBufferStagingBuffers {
337    /// All pages that have changed and need to be updated.
338    source_data: RawBufferVec<u32>,
339
340    /// The index at which we write each page in [`Self::source_data`].
341    ///
342    /// The length of this buffer is equal to [`Self::source_data`] divided by
343    /// 2^[`Self::page_size_log2`].
344    indices: RawBufferVec<u32>,
345
346    /// The size of each element in 32-bit words.
347    element_word_size: u32,
348
349    /// The base-2 logarithm of the page size in elements.
350    ///
351    /// That is, the page size in elements is `1 << page_size_log2`.
352    page_size_log2: u32,
353}
354
355impl SparseBufferStagingBuffers {
356    /// The number of elements per page.
357    fn page_size(&self) -> usize {
358        1 << self.page_size_log2
359    }
360
361    /// Creates a new set of staging buffers for a sparse buffer vector.
362    fn new(label: &str, element_word_size: u32, page_size_log2: u32) -> SparseBufferStagingBuffers {
363        let mut source_data_buffer =
364            RawBufferVec::new(BufferUsages::COPY_DST | BufferUsages::STORAGE);
365        source_data_buffer.set_label(Some(&*format!("{} staging buffer", label)));
366
367        let mut indices_buffer = RawBufferVec::new(BufferUsages::COPY_DST | BufferUsages::STORAGE);
368        indices_buffer.set_label(Some(&*format!("{} index buffer", label)));
369
370        SparseBufferStagingBuffers {
371            source_data: source_data_buffer,
372            indices: indices_buffer,
373            element_word_size,
374            page_size_log2,
375        }
376    }
377
378    /// Returns the number of updated pages.
379    fn updated_page_count(&self) -> u32 {
380        // Note that we don't have to round up here because data is always
381        // uploaded in increments of a whole page.
382        let element_count = self.source_data.len() / self.element_word_size as usize;
383        (element_count / self.page_size()) as u32
384    }
385
386    /// Writes the buffers that contain all the data necessary to perform a
387    /// sparse upload to the GPU.
388    ///
389    /// This includes the buffer associated with the supplied
390    /// `metadata_uniform`.
391    fn write_buffers(
392        &mut self,
393        metadata_uniform: &mut UniformBuffer<GpuSparseBufferUpdateMetadata>,
394        render_device: &RenderDevice,
395        render_queue: &RenderQueue,
396    ) {
397        metadata_uniform.get_mut().updated_page_count = self.updated_page_count();
398        metadata_uniform.write_buffer(render_device, render_queue);
399
400        self.source_data.write_buffer(render_device, render_queue);
401        self.indices.write_buffer(render_device, render_queue);
402    }
403
404    /// Returns true if a sparse buffer update should *not* be performed because
405    /// too many words changed.
406    fn should_perform_full_reupload(&self, changed_page_count: u32, buffer_length: usize) -> bool {
407        // Calculate the number of changed words. If it's greater than the
408        // maximum number of workgroups as defined by `wgpu`, we must perform a
409        // full reupload.
410        let total_changed_word_count =
411            changed_page_count * self.page_size() as u32 * self.element_word_size;
412        if total_changed_word_count > MAX_WORKGROUPS * SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE {
413            return true;
414        }
415
416        // Don't perform a sparse upload if too many words changed, as it'll end
417        // up being slower than just uploading the whole buffer afresh.
418        let sparse_upload_fraction =
419            changed_page_count as f64 / buffer_length.div_ceil(self.page_size()) as f64;
420        sparse_upload_fraction > SPARSE_UPLOAD_THRESHOLD
421    }
422}
423
424/// A GPU buffer that can grow, can be updated atomically from multiple threads
425/// on the CPU, and is sparsely updated on the GPU if only a small number of
426/// elements have changed.
427///
428/// This type is similar to
429/// [`crate::render_resource::buffer_vec::AtomicRawBufferVec`], but instead of
430/// reuploading the entire buffer to the GPU when it's changed, it tracks
431/// changes on a per-page level and uploads only the pages that changed if the
432/// number of such pages is small. It uses a compute shader to scatter the
433/// changed pages.
434///
435/// As the stored data is [`AtomicPod`], multiple threads may update the buffer
436/// simultaneously. Note that, like
437/// [`crate::render_resource::buffer_vec::AtomicRawBufferVec`], only existing
438/// elements may be updated from multiple threads; new data still requires
439/// exclusive access.
440///
441/// `T` must have a size that's a multiple of 4.
442pub struct AtomicSparseBufferVec<T>
443where
444    T: AtomicPod,
445{
446    /// An ID that uniquely identifies this [`AtomicSparseBufferVec`].
447    handle: SparseBufferHandle,
448    /// The underlying values.
449    ///
450    /// These are stored as their blob representation to allow for thread-safe
451    /// update.
452    values: Vec<T::Blob>,
453    /// The GPU buffer, if allocated.
454    data_buffer: Option<Buffer>,
455    /// The GPU buffers that data is copied into in preparation to be scattered
456    /// to the [`Self::data_buffer`].
457    staging_buffers: SparseBufferStagingBuffers,
458    /// A GPU buffer that stores information such as the element size and stride
459    /// that's needed to perform sparse updates.
460    metadata_uniform: UniformBuffer<GpuSparseBufferUpdateMetadata>,
461    /// The capacity of the GPU buffer in elements.
462    capacity: usize,
463    /// The allowed `wgpu` buffer usages for the GPU buffer.
464    buffer_usages: BufferUsages,
465    /// An optional debug label to identify this buffer.
466    label: Arc<str>,
467    /// A bit set of dirty pages.
468    ///
469    /// The size of this vector in bits is the number of elements divided by the
470    /// page size, rounded up. A 1 in a bit indicates that the page has changed
471    /// since the last upload, while a 0 indicates that the page hasn't changed.
472    dirty_pages: Vec<AtomicU64>,
473    /// True if the entire buffer needs to be reuploaded because it resized.
474    needs_full_reupload: bool,
475    /// True if a sparse update is to be performed.
476    sparse_update_scheduled: bool,
477}
478
479impl<T> AtomicSparseBufferVec<T>
480where
481    T: AtomicPod,
482{
483    /// The number of elements per page.
484    fn page_size(&self) -> u32 {
485        1 << self.staging_buffers.page_size_log2
486    }
487
488    /// Creates a new [`AtomicSparseBufferVec`] with the given set of buffer
489    /// usages, page size, and label.
490    ///
491    /// `buffer_usages` specifies the set of allowed `wgpu` buffer usages for
492    /// the buffer that [`AtomicSparseBufferVec`] manages.
493    /// `BufferUsages::COPY_DST` is automatically added to this set.
494    ///
495    /// The `page_size_log2` parameter is the base-2 logarithm of the page size.
496    /// That is, the page size is `1 << page_size_log2`.
497    pub fn new(buffer_usages: BufferUsages, page_size_log2: u32, label: Arc<str>) -> Self {
498        // Make sure the value is word-aligned.
499        debug_assert_eq!(size_of::<T>() % 4, 0);
500        let element_word_size = size_of::<T>() / 4;
501
502        // Create a unique ID.
503        let id = Arc::new(SparseBufferId(
504            NEXT_SPARSE_BUFFER_ID.fetch_add(1, Ordering::Relaxed),
505        ));
506
507        Self {
508            handle: id,
509            values: vec![],
510            data_buffer: None,
511            staging_buffers: SparseBufferStagingBuffers::new(
512                &label,
513                element_word_size as u32,
514                page_size_log2,
515            ),
516            metadata_uniform: UniformBuffer::from(GpuSparseBufferUpdateMetadata::new::<T>(
517                page_size_log2,
518            )),
519            capacity: 0,
520            buffer_usages: buffer_usages | BufferUsages::COPY_DST,
521            label,
522            dirty_pages: vec![],
523            needs_full_reupload: false,
524            sparse_update_scheduled: false,
525        }
526    }
527
528    /// Returns the number of elements in the CPU side copy of the buffer.
529    pub fn len(&self) -> u32 {
530        self.values.len() as u32
531    }
532
533    /// Returns true if there are no elements in the CPU side copy of the buffer.
534    pub fn is_empty(&self) -> bool {
535        self.values.is_empty()
536    }
537
538    /// Returns a handle to the buffer, if the data has been uploaded.
539    pub fn buffer(&self) -> Option<&Buffer> {
540        self.data_buffer.as_ref()
541    }
542
543    /// Removes all elements from the buffer.
544    pub fn clear(&mut self) {
545        self.truncate(0);
546    }
547
548    /// Copies a value out of the buffer.
549    pub fn get(&self, index: u32) -> T {
550        T::read_from_blob(&self.values[index as usize])
551    }
552
553    /// Sets the value at the given index.
554    ///
555    /// If the index isn't in range of the buffer, this method panics.
556    ///
557    /// Internally, the value is converted to its blob representation.
558    ///
559    /// Note that this method is thread-safe and doesn't require `&mut self`.
560    /// It's your responsibility, however, to ensure synchronization; though
561    /// this method is memory-safe, it's possible for other threads to observe
562    /// partially-overwritten values if [`Self::get`] or similar methods are
563    /// called while the write operation is occurring.
564    pub fn set(&self, index: u32, value: T) {
565        value.write_to_blob(&self.values[index as usize]);
566        self.note_changed_index(index);
567    }
568
569    /// Adds a new value and returns its index.
570    pub fn push(&mut self, value: T) -> u32 {
571        let index = self.values.len() as u32;
572        self.values.push(T::Blob::default());
573        value.write_to_blob(&self.values[index as usize]);
574
575        let page_word = (self.index_to_page(index) / PAGES_PER_DIRTY_WORD) as usize;
576        while self.dirty_pages.len() < page_word + 1 {
577            self.dirty_pages.push(AtomicU64::default());
578        }
579        self.note_changed_index(index);
580
581        index
582    }
583
584    /// Marks the page corresponding to the given element index as dirty so that
585    /// we know that we need to upload it.
586    fn note_changed_index(&self, index: u32) {
587        let page = self.index_to_page(index);
588        let (page_word, page_in_word) = (page / PAGES_PER_DIRTY_WORD, page % PAGES_PER_DIRTY_WORD);
589        self.dirty_pages[page_word as usize].fetch_or(1 << page_in_word, Ordering::Relaxed);
590    }
591
592    /// Returns the page corresponding to the given element index.
593    fn index_to_page(&self, index: u32) -> u32 {
594        index / self.page_size()
595    }
596
597    /// Ensures that the backing buffer for this buffer vector is present and
598    /// appropriately sized on the GPU.
599    pub fn reserve(&mut self, new_capacity: usize, render_device: &RenderDevice) {
600        reserve(
601            new_capacity,
602            &mut self.capacity,
603            &self.label,
604            &mut self.data_buffer,
605            self.buffer_usages,
606            &mut self.needs_full_reupload,
607            size_of::<T::Blob>(),
608            render_device,
609        );
610    }
611
612    /// Grows the buffer by adding default values so that it's at least the
613    /// given size.
614    ///
615    /// If the buffer is already large enough, this method does nothing.
616    pub fn grow(&mut self, new_len: u32) {
617        let old_len = self.values.len() as u32;
618        if old_len >= new_len {
619            return;
620        }
621
622        self.values.reserve(new_len as usize - old_len as usize);
623        self.values.resize_with(new_len as usize, T::Blob::default);
624
625        // This is a bit tricky. We want to set the dirty bits corresponding to
626        // all pages that we added, if any. First, we compute the index of the
627        // last page word before the append operation.
628        let old_final_page = self.index_to_page(old_len);
629        let old_final_page_word_index = old_final_page / PAGES_PER_DIRTY_WORD;
630        let old_final_page_in_word = old_final_page % PAGES_PER_DIRTY_WORD;
631
632        // Next, we set the bits corresponding to every page that we added to
633        // that final page word. Note that this might set bits corresponding to
634        // pages past the end of our buffer; that's OK as we ignore them.
635        if old_final_page_in_word != 0
636            && let Some(ref mut old_final_atomic_page_word) =
637                self.dirty_pages.get_mut(old_final_page_word_index as usize)
638        {
639            *old_final_atomic_page_word.get_mut() |= !((1u64 << old_final_page_in_word) - 1);
640        }
641
642        // Finally, we add any new page words, with all bits set.
643        let new_page_count = self.index_to_page(new_len);
644        self.dirty_pages.resize_with(
645            (new_page_count as usize).div_ceil(PAGES_PER_DIRTY_WORD as usize),
646            || AtomicU64::new(u64::MAX),
647        );
648    }
649
650    /// Truncates the buffer to the given length.
651    ///
652    /// If the buffer is already that length or shorter, this method does
653    /// nothing.
654    pub fn truncate(&mut self, len: u32) {
655        self.values.truncate(len as usize);
656
657        let page = self.index_to_page(len);
658        self.dirty_pages
659            .truncate(page.div_ceil(PAGES_PER_DIRTY_WORD) as usize);
660    }
661
662    /// Writes the data to the GPU, either via a sparse upload or a bulk data
663    /// upload.
664    pub fn write_buffers(&mut self, render_device: &RenderDevice, render_queue: &RenderQueue) {
665        if self.values.is_empty() {
666            return;
667        }
668
669        // Round up the size to a good value to balance reallocation frequency
670        // against memory waste.
671        let good_size = calculate_allocation_size(self.values.len());
672        self.reserve(good_size, render_device);
673
674        if self.should_perform_full_reupload(render_device) {
675            self.write_entire_buffer(render_queue);
676        } else {
677            self.prepare_sparse_upload(render_device, render_queue);
678        }
679    }
680
681    /// Returns true if the sparse buffer should perform a full reupload, either
682    /// because it was resized or because too much data changed for a sparse
683    /// update to be worthwhile.
684    fn should_perform_full_reupload(&self, render_device: &RenderDevice) -> bool {
685        if self.needs_full_reupload {
686            return true;
687        }
688
689        if render_device.limits().max_storage_buffers_per_shader_stage < 3 {
690            return true;
691        }
692
693        // Calculate the number of changed pages via population count.
694        let changed_page_count: u32 = self
695            .dirty_pages
696            .iter()
697            .map(|atomic_page_word| atomic_page_word.load(Ordering::Relaxed).count_ones())
698            .sum();
699
700        self.staging_buffers
701            .should_perform_full_reupload(changed_page_count, self.values.len())
702    }
703
704    /// Writes the entire buffer in bulk.
705    ///
706    /// This is the method used when a sparse update is not used, either because
707    /// the buffer resized or because too much data changed for a sparse update
708    /// to be worthwhile.
709    fn write_entire_buffer(&mut self, render_queue: &RenderQueue) {
710        let Some(ref mut data_buffer) = self.data_buffer else {
711            error!("Dirty sparse buffer should have created a data buffer by now");
712            return;
713        };
714
715        // SAFETY: We're just writing atomic data to the GPU. The worst that
716        // can happen is that we race with somebody, which is unfortunate
717        // but not memory-unsafe.
718        unsafe {
719            render_queue.write_buffer(
720                data_buffer,
721                0,
722                slice::from_raw_parts(
723                    self.values.as_ptr().cast::<u8>(),
724                    self.values.len() * size_of::<T::Blob>(),
725                ),
726            );
727        }
728
729        // Mark all pages as clean.
730        for atomic_page_word in self.dirty_pages.iter() {
731            atomic_page_word.store(0, Ordering::Relaxed);
732        }
733        self.sparse_update_scheduled = false;
734    }
735
736    /// Schedules a sparse upload of only the pages that changed.
737    fn prepare_sparse_upload(&mut self, render_device: &RenderDevice, render_queue: &RenderQueue) {
738        // Iterate over all dirty pages.
739        for (page_word_index, atomic_page_word) in self.dirty_pages.iter().enumerate() {
740            let page_word = atomic_page_word.load(Ordering::Relaxed);
741            for page_index_in_word in BitIter::new(page_word) {
742                let page = page_word_index as u32 * PAGES_PER_DIRTY_WORD + page_index_in_word;
743
744                // Write the index of the page so the shader will know where to
745                // scatter the data to.
746                self.staging_buffers.indices.push(page);
747
748                // Copy the page to the GPU staging buffer.
749                let page_size = self.staging_buffers.page_size();
750                let page_start = page as usize * page_size;
751                let page_end = page_start + page_size;
752                for value_index in page_start..page_end {
753                    match self.values.get(value_index) {
754                        Some(blob) => {
755                            let value = T::read_from_blob(blob);
756                            self.staging_buffers
757                                .source_data
758                                .extend(bytemuck::cast_slice(&[value]).iter().copied());
759                        }
760                        None => {
761                            self.staging_buffers.source_data.extend(iter::repeat_n(
762                                0,
763                                self.staging_buffers.element_word_size as usize,
764                            ));
765                        }
766                    }
767                }
768
769                // Make sure we're aligned up to a full page.
770                debug_assert_eq!(
771                    self.staging_buffers.source_data.len()
772                        % (self.staging_buffers.element_word_size as usize
773                            * self.staging_buffers.page_size()),
774                    0
775                );
776            }
777
778            // Mark the page as clean.
779            atomic_page_word.store(0, Ordering::Relaxed);
780        }
781
782        // Schedule a sparse update if there was something to do.
783        self.sparse_update_scheduled = !self.staging_buffers.source_data.is_empty();
784        if self.sparse_update_scheduled {
785            self.staging_buffers.write_buffers(
786                &mut self.metadata_uniform,
787                render_device,
788                render_queue,
789            );
790        }
791    }
792
793    /// If a sparse update has been scheduled, prepares all GPU resources
794    /// necessary to perform a sparse buffer update, other than updating the
795    /// metadata uniform.
796    pub fn prepare_to_populate_buffers(
797        &mut self,
798        render_device: &RenderDevice,
799        pipeline_cache: &PipelineCache,
800        sparse_buffer_update_jobs: &mut SparseBufferUpdateJobs,
801        sparse_buffer_update_bind_groups: &mut SparseBufferUpdateBindGroups,
802        sparse_buffer_update_pipelines: &SparseBufferUpdatePipelines,
803    ) {
804        if self.sparse_update_scheduled {
805            match (&self.data_buffer, self.metadata_uniform.buffer()) {
806                (Some(data_buffer), Some(metadata_buffer)) => {
807                    prepare_to_populate_buffers(
808                        self.handle.clone(),
809                        &self.label,
810                        data_buffer,
811                        &mut self.staging_buffers,
812                        metadata_buffer,
813                        render_device,
814                        pipeline_cache,
815                        sparse_buffer_update_jobs,
816                        sparse_buffer_update_bind_groups,
817                        sparse_buffer_update_pipelines,
818                    );
819                }
820                _ => {
821                    error!("Buffers should have been created by now");
822                }
823            }
824        }
825
826        // Clear out the staging buffers, now that we know the data is already
827        // on the GPU.
828        self.staging_buffers.source_data.clear();
829        self.staging_buffers.indices.clear();
830
831        // Reset the `needs_full_reupload` and `needs_sparse_update` flags.
832        self.needs_full_reupload = false;
833        self.sparse_update_scheduled = false;
834    }
835}
836
837impl FromWorld for SparseBufferUpdateBindGroups {
838    fn from_world(world: &mut World) -> Self {
839        world.resource_scope::<SpecializedComputePipelines<SparseBufferUpdatePipelines>, _>(
840            |world, mut specialized_sparse_buffer_update_pipelines| {
841                let pipeline_cache = world.resource::<PipelineCache>();
842                let sparse_buffer_update_pipelines =
843                    world.resource::<SparseBufferUpdatePipelines>();
844                let pipeline_id = specialized_sparse_buffer_update_pipelines.specialize(
845                    pipeline_cache,
846                    sparse_buffer_update_pipelines,
847                    (),
848                );
849
850                SparseBufferUpdateBindGroups {
851                    bind_groups: WeakKeyHashMap::default(),
852                    pipeline_id,
853                }
854            },
855        )
856    }
857}
858
859/// Prepares all GPU resources necessary to perform a sparse buffer update,
860/// other than updating the metadata uniform.
861///
862/// This function creates the [`SparseBufferUpdateJob`] and ensures the bind
863/// group and pipeline are up to date.
864fn prepare_to_populate_buffers(
865    sparse_buffer_handle: SparseBufferHandle,
866    label: &Arc<str>,
867    data_buffer: &Buffer,
868    staging_buffers: &mut SparseBufferStagingBuffers,
869    metadata_buffer: &Buffer,
870    render_device: &RenderDevice,
871    pipeline_cache: &PipelineCache,
872    sparse_buffer_update_jobs: &mut SparseBufferUpdateJobs,
873    sparse_buffer_update_bind_groups: &mut SparseBufferUpdateBindGroups,
874    sparse_buffer_update_pipelines: &SparseBufferUpdatePipelines,
875) {
876    let (Some(source_data_staging_buffer), Some(indices_staging_buffer)) = (
877        staging_buffers.source_data.buffer(),
878        staging_buffers.indices.buffer(),
879    ) else {
880        error!("Staging buffers should have been created by now");
881        return;
882    };
883
884    let Some(bind_group_layout) = &sparse_buffer_update_pipelines.bind_group_layout else {
885        return;
886    };
887
888    // Record the update job.
889    sparse_buffer_update_jobs.push(SparseBufferUpdateJob {
890        sparse_buffer_handle: sparse_buffer_handle.clone(),
891        page_size_log2: staging_buffers.page_size_log2,
892        updated_page_count: staging_buffers.updated_page_count(),
893        element_word_size: staging_buffers.element_word_size,
894        label: (*label).clone(),
895    });
896
897    // Create the bind group.
898    let bind_group = render_device.create_bind_group(
899        Some(&*format!("{} bind group", label)),
900        &pipeline_cache.get_bind_group_layout(bind_group_layout),
901        &BindGroupEntries::sequential((
902            // @group(0) @binding(0) var<storage, read_write> dest_buffer: array<u32>;
903            data_buffer.as_entire_binding(),
904            // @group(0) @binding(1) var<storage> src_buffer: array<u32>;
905            source_data_staging_buffer.as_entire_binding(),
906            // @group(0) @binding(2) var<storage> indices: array<u32>;
907            indices_staging_buffer.as_entire_binding(),
908            // @group(0) @binding(3) var<uniform> metadata:
909            // SparseBufferUpdateMetadata;
910            metadata_buffer.as_entire_binding(),
911        )),
912    );
913    sparse_buffer_update_bind_groups.bind_groups.insert(
914        sparse_buffer_handle,
915        SparseBufferUpdateBindGroup { bind_group },
916    );
917}
918
919/// Ensures that the backing buffer for an [`AtomicSparseBufferVec`] is present
920/// on the GPU.
921///
922/// The `capacity`, `data_buffer`, and `needs_full_reupload` fields are updated
923/// to reflect the new buffer.
924fn reserve(
925    new_capacity: usize,
926    capacity: &mut usize,
927    label: &str,
928    data_buffer: &mut Option<Buffer>,
929    buffer_usages: BufferUsages,
930    needs_full_reupload: &mut bool,
931    element_size: usize,
932    render_device: &RenderDevice,
933) {
934    // If the buffer is already big enough, do nothing.
935    if new_capacity == 0 || new_capacity <= *capacity {
936        return;
937    }
938
939    *capacity = new_capacity;
940    *data_buffer = Some(render_device.create_buffer(&BufferDescriptor {
941        label: Some(label),
942        size: element_size as u64 * new_capacity as u64,
943        usage: buffer_usages,
944        mapped_at_creation: false,
945    }));
946
947    // Since we resized the buffer, we need to reupload it.
948    *needs_full_reupload = true;
949}
950
951impl GpuSparseBufferUpdateMetadata {
952    /// Returns a new [`GpuSparseBufferUpdateMetadata`] for the given type and
953    /// page size.
954    fn new<T>(page_size_log2: u32) -> GpuSparseBufferUpdateMetadata {
955        assert_eq!(size_of::<T>() % 4, 0);
956        GpuSparseBufferUpdateMetadata {
957            element_size: (size_of::<T>() / 4) as u32,
958            updated_page_count: 0,
959            page_size_log2,
960        }
961    }
962}
963
/// An iterator over the positions of the set bits in a single `u64`, ordered
/// from the least significant bit to the most significant bit.
struct BitIter(u64);

impl BitIter {
    /// Creates an iterator over the set bits of `bits`.
    fn new(bits: u64) -> BitIter {
        Self(bits)
    }
}

impl Iterator for BitIter {
    /// The zero-based position of a set bit.
    type Item = u32;

    fn next(&mut self) -> Option<Self::Item> {
        // No bits left means the iterator is exhausted.
        if self.0 == 0 {
            return None;
        }

        let lowest_set_bit = self.0.trailing_zeros();
        // `x & (x - 1)` clears the lowest set bit of a nonzero `x`, so the
        // next call moves on to the following set bit.
        self.0 &= self.0 - 1;
        Some(lowest_set_bit)
    }
}
986
987/// Calculates the size that a buffer should be in order to balance reallocation
988/// frequency against memory waste.
989fn calculate_allocation_size(length: usize) -> usize {
990    let exponent = (length as f64).log(REALLOCATION_FACTOR).ceil();
991    let size = REALLOCATION_FACTOR.powf(exponent) as usize;
992    size.next_multiple_of(REALLOCATION_SIZE_MULTIPLE)
993}