Skip to content
This repository was archived by the owner on Feb 25, 2025. It is now read-only.

[Impeller] Fix 1-d grid computation for compute #42516

Merged
merged 11 commits into from
Jun 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ci/licenses_golden/licenses_flutter
Original file line number Diff line number Diff line change
Expand Up @@ -1601,6 +1601,7 @@ ORIGIN: ../../../flutter/impeller/renderer/snapshot.h + ../../../flutter/LICENSE
ORIGIN: ../../../flutter/impeller/renderer/stroke.comp + ../../../flutter/LICENSE
ORIGIN: ../../../flutter/impeller/renderer/surface.cc + ../../../flutter/LICENSE
ORIGIN: ../../../flutter/impeller/renderer/surface.h + ../../../flutter/LICENSE
ORIGIN: ../../../flutter/impeller/renderer/threadgroup_sizing_test.comp + ../../../flutter/LICENSE
ORIGIN: ../../../flutter/impeller/renderer/vertex_buffer_builder.cc + ../../../flutter/LICENSE
ORIGIN: ../../../flutter/impeller/renderer/vertex_buffer_builder.h + ../../../flutter/LICENSE
ORIGIN: ../../../flutter/impeller/renderer/vertex_descriptor.cc + ../../../flutter/LICENSE
Expand Down Expand Up @@ -4272,6 +4273,7 @@ FILE: ../../../flutter/impeller/renderer/snapshot.h
FILE: ../../../flutter/impeller/renderer/stroke.comp
FILE: ../../../flutter/impeller/renderer/surface.cc
FILE: ../../../flutter/impeller/renderer/surface.h
FILE: ../../../flutter/impeller/renderer/threadgroup_sizing_test.comp
FILE: ../../../flutter/impeller/renderer/vertex_buffer_builder.cc
FILE: ../../../flutter/impeller/renderer/vertex_buffer_builder.h
FILE: ../../../flutter/impeller/renderer/vertex_descriptor.cc
Expand Down
1 change: 1 addition & 0 deletions impeller/renderer/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ if (impeller_enable_compute) {
"stroke.comp",
"path_polyline.comp",
"prefix_sum_test.comp",
"threadgroup_sizing_test.comp",
]
}

Expand Down
6 changes: 4 additions & 2 deletions impeller/renderer/backend/metal/compute_pass_mtl.mm
Original file line number Diff line number Diff line change
Expand Up @@ -258,8 +258,10 @@ static bool Bind(ComputePassBindingsCache& pass,

// Special case for linear processing.
if (height == 1) {
int64_t threadGroups =
std::max(width / maxTotalThreadsPerThreadgroup, 1LL);
int64_t threadGroups = std::max(
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Forgot these were ints so we were capped at 1.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a test for this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Doing so required me to update the vulkan compute shaders to provide a specialization constant which contains the actual wg size, as these need to agree with the value in compute_pass.vk for anything to work correctly.

static_cast<int64_t>(
std::ceil(width * 1.0 / maxTotalThreadsPerThreadgroup * 1.0)),
1LL);
[encoder dispatchThreadgroups:MTLSizeMake(threadGroups, 1, 1)
threadsPerThreadgroup:MTLSizeMake(maxTotalThreadsPerThreadgroup,
1, 1)];
Expand Down
22 changes: 15 additions & 7 deletions impeller/renderer/backend/vulkan/compute_pass_vk.cc
Original file line number Diff line number Diff line change
Expand Up @@ -252,14 +252,22 @@ bool ComputePassVK::OnEncodeCommands(const Context& context,
int64_t width = grid_size.width;
int64_t height = grid_size.height;

while (width > max_wg_size[0]) {
width = std::max(static_cast<int64_t>(1), width / 2);
// Special case for linear processing.
if (height == 1) {
int64_t minimum = 1;
int64_t threadGroups = std::max(
static_cast<int64_t>(std::ceil(width * 1.0 / max_wg_size[0] * 1.0)),
minimum);
cmd_buffer.dispatch(threadGroups, 1, 1);
} else {
while (width > max_wg_size[0]) {
width = std::max(static_cast<int64_t>(1), width / 2);
}
while (height > max_wg_size[1]) {
height = std::max(static_cast<int64_t>(1), height / 2);
}
cmd_buffer.dispatch(width, height, 1);
}
while (height > max_wg_size[1]) {
height = std::max(static_cast<int64_t>(1), height / 2);
}

cmd_buffer.dispatch(width, height, 1);
}
}

Expand Down
5 changes: 5 additions & 0 deletions impeller/renderer/backend/vulkan/context_vk.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,11 @@ class ContextVK final : public Context,
struct DeviceHolderImpl : public DeviceHolder {
// |DeviceHolder|
const vk::Device& GetDevice() const override { return device.get(); }
// |DeviceHolder|
const vk::PhysicalDevice& GetPhysicalDevice() const override {
return physical_device;
}

vk::UniqueInstance instance;
vk::PhysicalDevice physical_device;
vk::UniqueDevice device;
Expand Down
1 change: 1 addition & 0 deletions impeller/renderer/backend/vulkan/device_holder.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class DeviceHolder {
public:
virtual ~DeviceHolder() = default;
virtual const vk::Device& GetDevice() const = 0;
virtual const vk::PhysicalDevice& GetPhysicalDevice() const = 0;
};

} // namespace impeller
28 changes: 23 additions & 5 deletions impeller/renderer/backend/vulkan/pipeline_library_vk.cc
Original file line number Diff line number Diff line change
Expand Up @@ -357,17 +357,35 @@ std::unique_ptr<ComputePipelineVK> PipelineLibraryVK::CreateComputePipeline(
return nullptr;
}

std::shared_ptr<DeviceHolder> strong_device = device_holder_.lock();
if (!strong_device) {
return nullptr;
}
auto device_properties = strong_device->GetPhysicalDevice().getProperties();
auto max_wg_size = device_properties.limits.maxComputeWorkGroupSize;

// Give all compute shaders a specialization constant entry for the
// workgroup/threadgroup size.
vk::SpecializationMapEntry specialization_map_entry[1];

uint32_t workgroup_size_x = max_wg_size[0];
specialization_map_entry[0].constantID = 0;
specialization_map_entry[0].offset = 0;
specialization_map_entry[0].size = sizeof(uint32_t);

vk::SpecializationInfo specialization_info;
specialization_info.mapEntryCount = 1;
specialization_info.pMapEntries = &specialization_map_entry[0];
specialization_info.dataSize = sizeof(uint32_t);
specialization_info.pData = &workgroup_size_x;

vk::PipelineShaderStageCreateInfo info;
info.setStage(vk::ShaderStageFlagBits::eCompute);
info.setPName("main");
info.setModule(ShaderFunctionVK::Cast(entrypoint.get())->GetModule());
info.setPSpecializationInfo(&specialization_info);
pipeline_info.setStage(info);

std::shared_ptr<DeviceHolder> strong_device = device_holder_.lock();
if (!strong_device) {
return nullptr;
}

//----------------------------------------------------------------------------
/// Pipeline Layout a.k.a the descriptor sets and uniforms.
///
Expand Down
3 changes: 1 addition & 2 deletions impeller/renderer/compute_pipeline_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,8 @@ struct ComputePipelineBuilder {
ComputePipelineDescriptor desc;
if (InitializePipelineDescriptorDefaults(context, desc)) {
return {std::move(desc)};
} else {
return std::nullopt;
}
return std::nullopt;
}

[[nodiscard]] static bool InitializePipelineDescriptorDefaults(
Expand Down
54 changes: 54 additions & 0 deletions impeller/renderer/compute_unittests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "impeller/renderer/compute_pipeline_builder.h"
#include "impeller/renderer/pipeline_library.h"
#include "impeller/renderer/prefix_sum_test.comp.h"
#include "impeller/renderer/threadgroup_sizing_test.comp.h"

namespace impeller {
namespace testing {
Expand Down Expand Up @@ -176,6 +177,59 @@ TEST_P(ComputeTest, CanComputePrefixSum) {
latch.Wait();
}

TEST_P(ComputeTest, 1DThreadgroupSizingIsCorrect) {
using CS = ThreadgroupSizingTestComputeShader;
auto context = GetContext();
ASSERT_TRUE(context);
ASSERT_TRUE(context->GetCapabilities()->SupportsCompute());

using SamplePipelineBuilder = ComputePipelineBuilder<CS>;
auto pipeline_desc =
SamplePipelineBuilder::MakeDefaultPipelineDescriptor(*context);
ASSERT_TRUE(pipeline_desc.has_value());
auto compute_pipeline =
context->GetPipelineLibrary()->GetPipeline(pipeline_desc).Get();
ASSERT_TRUE(compute_pipeline);

auto cmd_buffer = context->CreateCommandBuffer();
auto pass = cmd_buffer->CreateComputePass();
ASSERT_TRUE(pass && pass->IsValid());

static constexpr size_t kCount = 2048;

pass->SetGridSize(ISize(kCount, 1));
pass->SetThreadGroupSize(ISize(kCount, 1));

ComputeCommand cmd;
cmd.label = "Compute";
cmd.pipeline = compute_pipeline;

auto output_buffer = CreateHostVisibleDeviceBuffer<CS::OutputData<kCount>>(
context, "Output Buffer");

CS::BindOutputData(cmd, output_buffer->AsBufferView());

ASSERT_TRUE(pass->AddCommand(std::move(cmd)));
ASSERT_TRUE(pass->EncodeCommands());

fml::AutoResetWaitableEvent latch;
ASSERT_TRUE(cmd_buffer->SubmitCommands(
[&latch, output_buffer](CommandBuffer::Status status) {
EXPECT_EQ(status, CommandBuffer::Status::kCompleted);

auto view = output_buffer->AsBufferView();
EXPECT_EQ(view.range.length, sizeof(CS::OutputData<kCount>));

CS::OutputData<kCount>* output =
reinterpret_cast<CS::OutputData<kCount>*>(view.contents);
EXPECT_TRUE(output);
EXPECT_EQ(output->data[kCount - 1], kCount - 1);
latch.Signal();
}));

latch.Wait();
}

TEST_P(ComputeTest, CanComputePrefixSumLargeInteractive) {
using CS = PrefixSumTestComputeShader;

Expand Down
4 changes: 1 addition & 3 deletions impeller/renderer/prefix_sum_test.comp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// TODO(dnfield): This should not need to be so small,
// https://github.com/flutter/flutter/issues/119357
layout(local_size_x = 256, local_size_y = 1) in;
layout(local_size_x_id = 0) in;
layout(std430) buffer;

#include <impeller/prefix_sum.glsl>
Expand Down
18 changes: 18 additions & 0 deletions impeller/renderer/threadgroup_sizing_test.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Copyright 2013 The Flutter Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Size is passed in via specialization constant.
layout(local_size_x_id = 0) in;

layout(std430) buffer;

layout(binding = 1) writeonly buffer OutputData {
uint data[];
}
output_data;

void main() {
uint ident = gl_GlobalInvocationID.x;
output_data.data[ident] = ident;
}
76 changes: 69 additions & 7 deletions impeller/tools/malioc.json
Original file line number Diff line number Diff line change
Expand Up @@ -13569,9 +13569,9 @@
"load_store"
],
"longest_path_cycles": [
2.65625,
2.450000047683716,
0.0,
2.65625,
2.450000047683716,
1.0,
72.0,
0.0
Expand All @@ -13589,9 +13589,9 @@
"arith_cvt"
],
"shortest_path_cycles": [
0.9375,
0.762499988079071,
0.0,
0.9375,
0.762499988079071,
0.0,
0.0,
0.0
Expand All @@ -13600,9 +13600,9 @@
"load_store"
],
"total_cycles": [
2.65625,
2.46875,
0.0,
2.65625,
2.46875,
1.0,
72.0,
0.0
Expand All @@ -13612,7 +13612,7 @@
"stack_spill_bytes": 0,
"thread_occupancy": 100,
"uniform_registers_used": 8,
"work_registers_used": 17
"work_registers_used": 18
}
}
}
Expand Down Expand Up @@ -13680,6 +13680,68 @@
}
}
},
"flutter/impeller/renderer/threadgroup_sizing_test.comp.vkspv": {
"Mali-G78": {
"core": "Mali-G78",
"filename": "flutter/impeller/renderer/threadgroup_sizing_test.comp.vkspv",
"has_uniform_computation": true,
"type": "Compute",
"variants": {
"Main": {
"fp16_arithmetic": null,
"has_stack_spilling": false,
"performance": {
"longest_path_bound_pipelines": [
"load_store"
],
"longest_path_cycles": [
0.03125,
0.0,
0.03125,
0.0,
1.0,
0.0
],
"pipelines": [
"arith_total",
"arith_fma",
"arith_cvt",
"arith_sfu",
"load_store",
"texture"
],
"shortest_path_bound_pipelines": [
"load_store"
],
"shortest_path_cycles": [
0.03125,
0.0,
0.03125,
0.0,
1.0,
0.0
],
"total_bound_pipelines": [
"load_store"
],
"total_cycles": [
0.03125,
0.0,
0.03125,
0.0,
1.0,
0.0
]
},
"shared_storage_used": 0,
"stack_spill_bytes": 0,
"thread_occupancy": 100,
"uniform_registers_used": 2,
"work_registers_used": 4
}
}
}
},
"flutter/impeller/scene/shaders/gles/skinned.vert.gles": {
"Mali-G78": {
"core": "Mali-G78",
Expand Down