Skip to content

Don't reallocate work item buffers every frame. #17684

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 20 additions & 8 deletions crates/bevy_pbr/src/render/gpu_preprocess.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ use bevy_render::{
},
renderer::{RenderContext, RenderDevice, RenderQueue},
settings::WgpuFeatures,
view::{NoIndirectDrawing, ViewUniform, ViewUniformOffset, ViewUniforms},
view::{ExtractedView, NoIndirectDrawing, ViewUniform, ViewUniformOffset, ViewUniforms},
Render, RenderApp, RenderSet,
};
use bevy_utils::TypeIdMap;
Expand Down Expand Up @@ -100,7 +100,7 @@ pub struct GpuMeshPreprocessPlugin {
pub struct EarlyGpuPreprocessNode {
view_query: QueryState<
(
Entity,
Read<ExtractedView>,
Option<Read<PreprocessBindGroups>>,
Option<Read<ViewUniformOffset>>,
Has<NoIndirectDrawing>,
Expand All @@ -120,7 +120,11 @@ pub struct EarlyGpuPreprocessNode {
/// metadata for the subsequent [`LatePrepassBuildIndirectParametersNode`].
pub struct LateGpuPreprocessNode {
view_query: QueryState<
(Entity, Read<PreprocessBindGroups>, Read<ViewUniformOffset>),
(
Read<ExtractedView>,
Read<PreprocessBindGroups>,
Read<ViewUniformOffset>,
),
(
Without<SkipGpuPreprocess>,
Without<NoIndirectDrawing>,
Expand Down Expand Up @@ -580,7 +584,8 @@ impl Node for EarlyGpuPreprocessNode {
};

// Grab the work item buffers for this view.
let Some(phase_work_item_buffers) = index_buffers.get(&view) else {
let Some(phase_work_item_buffers) = index_buffers.get(&view.retained_view_entity)
else {
warn!("The preprocessing index buffer wasn't present");
continue;
};
Expand Down Expand Up @@ -791,7 +796,8 @@ impl Node for LateGpuPreprocessNode {
// Run the compute passes.
for (view, bind_groups, view_uniform_offset) in self.view_query.iter_manual(world) {
// Grab the work item buffers for this view.
let Some(phase_work_item_buffers) = work_item_buffers.get(&view) else {
let Some(phase_work_item_buffers) = work_item_buffers.get(&view.retained_view_entity)
else {
warn!("The preprocessing index buffer wasn't present");
continue;
};
Expand Down Expand Up @@ -1619,6 +1625,7 @@ impl BuildIndirectParametersPipeline {
)]
pub fn prepare_preprocess_bind_groups(
mut commands: Commands,
views: Query<(Entity, &ExtractedView)>,
view_depth_pyramids: Query<(&ViewDepthPyramid, &PreviousViewUniformOffset)>,
render_device: Res<RenderDevice>,
batched_instance_buffers: Res<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>,
Expand Down Expand Up @@ -1651,14 +1658,19 @@ pub fn prepare_preprocess_bind_groups(
let mut any_indirect = false;

// Loop over each view.
for (view, phase_work_item_buffers) in work_item_buffers {
for (view_entity, view) in &views {
let Some(phase_work_item_buffers) = work_item_buffers.get(&view.retained_view_entity)
else {
continue;
};

let mut bind_groups = TypeIdMap::default();

// Loop over each phase.
for (&phase_id, work_item_buffers) in phase_work_item_buffers {
// Create the `PreprocessBindGroupBuilder`.
let preprocess_bind_group_builder = PreprocessBindGroupBuilder {
view: *view,
view: view_entity,
late_indexed_indirect_parameters_buffer,
late_non_indexed_indirect_parameters_buffer,
render_device: &render_device,
Expand Down Expand Up @@ -1719,7 +1731,7 @@ pub fn prepare_preprocess_bind_groups(

// Save the bind groups.
commands
.entity(*view)
.entity(view_entity)
.insert(PreprocessBindGroups(bind_groups));
}

Expand Down
62 changes: 46 additions & 16 deletions crates/bevy_render/src/batching/gpu_preprocessing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ use core::any::TypeId;

use bevy_app::{App, Plugin};
use bevy_ecs::{
entity::{hash_map::EntityHashMap, Entity},
query::{Has, With},
resource::Resource,
schedule::IntoSystemConfigs as _,
Expand All @@ -13,7 +12,7 @@ use bevy_ecs::{
};
use bevy_encase_derive::ShaderType;
use bevy_math::UVec4;
use bevy_platform_support::collections::hash_map::Entry;
use bevy_platform_support::collections::{hash_map::Entry, HashMap, HashSet};
use bevy_utils::{default, TypeIdMap};
use bytemuck::{Pod, Zeroable};
use nonmax::NonMaxU32;
Expand All @@ -30,7 +29,7 @@ use crate::{
},
render_resource::{Buffer, BufferVec, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
renderer::{RenderAdapter, RenderDevice, RenderQueue},
view::{ExtractedView, NoIndirectDrawing},
view::{ExtractedView, NoIndirectDrawing, RetainedViewEntity},
Render, RenderApp, RenderSet,
};

Expand Down Expand Up @@ -157,7 +156,7 @@ where
/// corresponds to each instance.
///
/// This is keyed off each view. Each view has a separate buffer.
pub work_item_buffers: EntityHashMap<TypeIdMap<PreprocessWorkItemBuffers>>,
pub work_item_buffers: HashMap<RetainedViewEntity, TypeIdMap<PreprocessWorkItemBuffers>>,

/// The uniform data inputs for the current frame.
///
Expand Down Expand Up @@ -409,8 +408,8 @@ impl Default for LatePreprocessWorkItemIndirectParameters {
/// You may need to call this function if you're implementing your own custom
/// render phases. See the `specialized_mesh_pipeline` example.
pub fn get_or_create_work_item_buffer<'a, I>(
work_item_buffers: &'a mut EntityHashMap<TypeIdMap<PreprocessWorkItemBuffers>>,
view: Entity,
work_item_buffers: &'a mut HashMap<RetainedViewEntity, TypeIdMap<PreprocessWorkItemBuffers>>,
view: RetainedViewEntity,
no_indirect_drawing: bool,
gpu_occlusion_culling: bool,
late_indexed_indirect_parameters_buffer: &'_ mut RawBufferVec<
Expand Down Expand Up @@ -493,6 +492,28 @@ impl PreprocessWorkItemBuffers {
}
}
}

/// Clears out the GPU work item buffers in preparation for a new frame.
pub fn clear(&mut self) {
match *self {
PreprocessWorkItemBuffers::Direct(ref mut buffer) => {
buffer.clear();
}
PreprocessWorkItemBuffers::Indirect {
indexed: ref mut indexed_buffer,
non_indexed: ref mut non_indexed_buffer,
ref mut gpu_occlusion_culling,
} => {
indexed_buffer.clear();
non_indexed_buffer.clear();

if let Some(ref mut gpu_occlusion_culling) = *gpu_occlusion_culling {
gpu_occlusion_culling.late_indexed.clear();
gpu_occlusion_culling.late_non_indexed.clear();
}
}
}
}
}

/// One invocation of the preprocessing shader: i.e. one mesh instance in a
Expand Down Expand Up @@ -942,7 +963,7 @@ where
pub fn new() -> Self {
BatchedInstanceBuffers {
data_buffer: UninitBufferVec::new(BufferUsages::STORAGE),
work_item_buffers: EntityHashMap::default(),
work_item_buffers: HashMap::default(),
current_input_buffer: InstanceInputUniformBuffer::new(),
previous_input_buffer: InstanceInputUniformBuffer::new(),
late_indexed_indirect_parameters_buffer: RawBufferVec::new(
Expand All @@ -968,7 +989,14 @@ where
self.data_buffer.clear();
self.late_indexed_indirect_parameters_buffer.clear();
self.late_non_indexed_indirect_parameters_buffer.clear();
self.work_item_buffers.clear();

// Clear each individual set of buffers, but don't depopulate the hash
// table. We want to avoid reallocating these vectors every frame.
for view_work_item_buffers in self.work_item_buffers.values_mut() {
for phase_work_item_buffers in view_work_item_buffers.values_mut() {
phase_work_item_buffers.clear();
}
}
}
}

Expand Down Expand Up @@ -1074,13 +1102,17 @@ pub fn delete_old_work_item_buffers<GFBD>(
mut gpu_batched_instance_buffers: ResMut<
BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
>,
extracted_views: Query<Entity, With<ExtractedView>>,
extracted_views: Query<&ExtractedView>,
) where
GFBD: GetFullBatchData,
{
let retained_view_entities: HashSet<_> = extracted_views
.iter()
.map(|extracted_view| extracted_view.retained_view_entity)
.collect();
gpu_batched_instance_buffers
.work_item_buffers
.retain(|entity, _| extracted_views.contains(*entity));
.retain(|retained_view_entity, _| retained_view_entities.contains(retained_view_entity));
}

/// Batch the items in a sorted render phase, when GPU instance buffer building
Expand All @@ -1091,7 +1123,6 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
mut sorted_render_phases: ResMut<ViewSortedRenderPhases<I>>,
mut views: Query<(
Entity,
&ExtractedView,
Has<NoIndirectDrawing>,
Has<OcclusionCulling>,
Expand All @@ -1110,15 +1141,15 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
..
} = gpu_array_buffer.into_inner();

for (view, extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
for (extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
let Some(phase) = sorted_render_phases.get_mut(&extracted_view.retained_view_entity) else {
continue;
};

// Create the work item buffer if necessary.
let work_item_buffer = get_or_create_work_item_buffer::<I>(
work_item_buffers,
view,
extracted_view.retained_view_entity,
no_indirect_drawing,
gpu_occlusion_culling,
late_indexed_indirect_parameters_buffer,
Expand Down Expand Up @@ -1248,7 +1279,6 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
mut binned_render_phases: ResMut<ViewBinnedRenderPhases<BPI>>,
mut views: Query<
(
Entity,
&ExtractedView,
Has<NoIndirectDrawing>,
Has<OcclusionCulling>,
Expand All @@ -1270,7 +1300,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
..
} = gpu_array_buffer.into_inner();

for (view, extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
for (extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
let Some(phase) = binned_render_phases.get_mut(&extracted_view.retained_view_entity) else {
continue;
};
Expand All @@ -1279,7 +1309,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
// used this frame.
let work_item_buffer = get_or_create_work_item_buffer::<BPI>(
work_item_buffers,
view,
extracted_view.retained_view_entity,
no_indirect_drawing,
gpu_occlusion_culling,
late_indexed_indirect_parameters_buffer,
Expand Down
13 changes: 3 additions & 10 deletions examples/shader/specialized_mesh_pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,6 @@ fn queue_custom_mesh_pipeline(
),
mut specialized_mesh_pipelines: ResMut<SpecializedMeshPipelines<CustomMeshPipeline>>,
views: Query<(
Entity,
&RenderVisibleEntities,
&ExtractedView,
&Msaa,
Expand Down Expand Up @@ -318,14 +317,8 @@ fn queue_custom_mesh_pipeline(
// Render phases are per-view, so we need to iterate over all views so that
// the entity appears in them. (In this example, we have only one view, but
// it's good practice to loop over all views anyway.)
for (
view_entity,
view_visible_entities,
view,
msaa,
no_indirect_drawing,
gpu_occlusion_culling,
) in views.iter()
for (view_visible_entities, view, msaa, no_indirect_drawing, gpu_occlusion_culling) in
views.iter()
{
let Some(opaque_phase) = opaque_render_phases.get_mut(&view.retained_view_entity) else {
continue;
Expand All @@ -336,7 +329,7 @@ fn queue_custom_mesh_pipeline(
// enabled.
let work_item_buffer = gpu_preprocessing::get_or_create_work_item_buffer::<Opaque3d>(
work_item_buffers,
view_entity,
view.retained_view_entity,
no_indirect_drawing,
gpu_occlusion_culling,
late_indexed_indirect_parameters_buffer,
Expand Down