Browse Source

Attempt to optimize SwizzleImpl

Try to fix GPU unswizzle causing a crash
Added Steam Deck detection to disable costly effects in MP4
pull/3246/head
Forrest Keller 3 months ago
committed by crueter
parent
commit
dc1d1fbf67
  1. 84
      src/CMakeLists.txt
  2. 38
      src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
  3. 4
      src/video_core/renderer_opengl/gl_device.cpp
  4. 2
      src/video_core/renderer_opengl/gl_device.h
  5. 4
      src/video_core/renderer_opengl/gl_texture_cache.cpp
  6. 2
      src/video_core/renderer_opengl/gl_texture_cache.h
  7. 25
      src/video_core/renderer_vulkan/vk_compute_pass.cpp
  8. 4
      src/video_core/renderer_vulkan/vk_texture_cache.cpp
  9. 2
      src/video_core/renderer_vulkan/vk_texture_cache.h
  10. 19
      src/video_core/texture_cache/texture_cache.h
  11. 7
      src/video_core/vulkan_common/vulkan_device.cpp
  12. 1
      src/video_core/vulkan_common/vulkan_device.h

84
src/CMakeLists.txt

@ -15,6 +15,90 @@ endif()
# CMake seems to only define _DEBUG on Windows # CMake seems to only define _DEBUG on Windows
set_property(DIRECTORY APPEND PROPERTY set_property(DIRECTORY APPEND PROPERTY
COMPILE_DEFINITIONS $<$<CONFIG:Debug>:_DEBUG> $<$<NOT:$<CONFIG:Debug>>:NDEBUG>) COMPILE_DEFINITIONS $<$<CONFIG:Debug>:_DEBUG> $<$<NOT:$<CONFIG:Debug>>:NDEBUG>)
# Platform detection macros
if(WIN32)
add_compile_definitions(PLATFORM_WINDOWS)
elseif(ANDROID)
add_compile_definitions(PLATFORM_ANDROID)
elseif(APPLE)
add_compile_definitions(PLATFORM_MACOS)
elseif(CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")
add_compile_definitions(PLATFORM_FREEBSD)
elseif(UNIX)
add_compile_definitions(PLATFORM_LINUX)
endif()
# Architecture-specific intrinsics flags
if(ARCHITECTURE_x86_64)
add_compile_definitions(ARCH_X86_64)
if(MSVC AND NOT CXX_CLANG)
# MSVC: Enable AVX2 (includes BMI2 support)
add_compile_options(/arch:AVX2)
else()
# GCC/Clang: Enable SSE4.2, AVX2, and BMI2
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)
check_cxx_compiler_flag("-mbmi2" COMPILER_SUPPORTS_BMI2)
if(COMPILER_SUPPORTS_AVX2)
add_compile_options(-mavx2)
endif()
if(COMPILER_SUPPORTS_BMI2)
add_compile_options(-mbmi2)
add_compile_definitions(HAS_BMI2)
endif()
endif()
elseif(ARCHITECTURE_arm64)
add_compile_definitions(ARCH_ARM64)
# ARM64 has NEON by default, but enable explicitly for some compilers
if(NOT MSVC)
add_compile_options(-march=armv8-a)
endif()
# Android-specific ARM64 configuration
if(ANDROID AND ANDROID_ABI STREQUAL "arm64-v8a")
add_compile_options(-march=armv8-a+simd)
endif()
elseif(ARCHITECTURE_arm)
add_compile_definitions(ARCH_ARM32)
# Check for NEON support on ARM32
if(NOT MSVC)
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-mfpu=neon" COMPILER_SUPPORTS_NEON)
if(COMPILER_SUPPORTS_NEON)
add_compile_options(-mfpu=neon)
add_compile_definitions(HAS_NEON)
# Android ARM32 specific
if(ANDROID)
add_compile_options(-mfloat-abi=softfp)
endif()
endif()
endif()
endif()
# Android-specific ABI configurations
if(ANDROID)
if(ANDROID_ABI STREQUAL "x86_64")
add_compile_definitions(ARCH_X86_64)
if(NOT MSVC)
add_compile_options(-msse4.2 -mavx2 -mbmi2)
endif()
elseif(ANDROID_ABI STREQUAL "armeabi-v7a")
add_compile_definitions(ARCH_ARM32)
add_compile_options(-mfpu=neon -mfloat-abi=softfp)
add_compile_definitions(HAS_NEON)
endif()
endif()
# Set compilation flags # Set compilation flags
if (MSVC AND NOT CXX_CLANG) if (MSVC AND NOT CXX_CLANG)

38
src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp

@ -81,12 +81,12 @@ layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; }; layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; }; layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
layout(binding = BINDING_OUTPUT_BUFFER, std430) buffer OutputBuffer {
layout(binding = BINDING_OUTPUT_BUFFER, std430) writeonly buffer OutputBuffer {
uint out_u32[]; uint out_u32[];
}; };
// --- Constants --- // --- Constants ---
layout(local_size_x = 32, local_size_y = 8, local_size_z = 1) in;
layout(local_size_x = 8, local_size_y = 8, local_size_z = 4) in;
const uint GOB_SIZE_X = 64; const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8; const uint GOB_SIZE_Y = 8;
@ -100,9 +100,18 @@ const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_S
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u); const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u);
// --- Helpers --- // --- Helpers ---
shared uint swizzle_cache[64 * 8]; // Cache 64x8 GOB worth of swizzle data
uint SwizzleOffset(uvec2 pos) { uint SwizzleOffset(uvec2 pos) {
pos &= SWIZZLE_MASK; pos &= SWIZZLE_MASK;
return swizzle_table[pos.y * 64u + pos.x];
uint local_id = gl_LocalInvocationIndex;
if (local_id < 512u) {
swizzle_cache[local_id] = swizzle_table[local_id];
}
barrier();
return swizzle_cache[pos.y * 64u + pos.x];
} }
uvec4 ReadTexel(uint offset) { uvec4 ReadTexel(uint offset) {
@ -152,12 +161,23 @@ void main() {
uint block_index = block_coord.x + uint block_index = block_coord.x +
(block_coord.y * pc.blocks_dim.x) + (block_coord.y * pc.blocks_dim.x) +
(block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y); (block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y);
uint out_idx = block_index * (bytes_per_block >> 2u);
out_u32[out_idx] = texel.x;
out_u32[out_idx + 1] = texel.y;
if (pc.bytes_per_block_log2 == 4u) {
out_u32[out_idx + 2] = texel.z;
out_u32[out_idx + 3] = texel.w;
if (bytes_per_block == 16u) {
// BC6H/BC7
uvec4 out_data[1];
out_data[0] = texel;
out_u32[block_index * 4u] = texel.x;
out_u32[block_index * 4u + 1u] = texel.y;
out_u32[block_index * 4u + 2u] = texel.z;
out_u32[block_index * 4u + 3u] = texel.w;
} else if (bytes_per_block == 8u) {
// BC1/BC4
uint out_idx = block_index * 2u;
out_u32[out_idx] = texel.x;
out_u32[out_idx + 1u] = texel.y;
} else {
uint out_idx = block_index * (bytes_per_block >> 2u);
out_u32[out_idx] = texel.x;
if (bytes_per_block > 4u) out_u32[out_idx + 1u] = texel.y;
} }
} }

4
src/video_core/renderer_opengl/gl_device.cpp

@ -350,4 +350,8 @@ u64 Device::GetCurrentDedicatedVideoMemory() const {
return static_cast<u64>(cur_avail_mem_kb) * 1_KiB; return static_cast<u64>(cur_avail_mem_kb) * 1_KiB;
} }
// Steam Deck detection for the OpenGL backend.
// No PCI vendor/device query is performed here, so this backend always
// reports false; the Vulkan backend's Device::IsSteamDeck does the real
// vendor/device-ID check. NOTE(review): if an OpenGL path ever needs the
// detection, GL has no portable PCI-ID query — verify an alternative exists.
bool Device::IsSteamDeck() const {
return false;
}
} // namespace OpenGL } // namespace OpenGL

2
src/video_core/renderer_opengl/gl_device.h

@ -195,6 +195,8 @@ public:
bool HasLmemPerfBug() const { bool HasLmemPerfBug() const {
return has_lmem_perf_bug; return has_lmem_perf_bug;
} }
bool IsSteamDeck() const;
private: private:
static bool TestVariableAoffi(); static bool TestVariableAoffi();

4
src/video_core/renderer_opengl/gl_texture_cache.cpp

@ -693,6 +693,10 @@ bool TextureCacheRuntime::HasNativeASTC() const noexcept {
return device.HasASTC(); return device.HasASTC();
} }
// Forwards the Steam Deck query from the OpenGL device to the
// backend-agnostic texture cache (mirrors the Vulkan runtime's method of
// the same name). Always false on this backend — see Device::IsSteamDeck.
bool TextureCacheRuntime::IsSteamDeck() const {
return device.IsSteamDeck();
}
Image::Image(TextureCacheRuntime& runtime_, const VideoCommon::ImageInfo& info_, GPUVAddr gpu_addr_, Image::Image(TextureCacheRuntime& runtime_, const VideoCommon::ImageInfo& info_, GPUVAddr gpu_addr_,
VAddr cpu_addr_) VAddr cpu_addr_)
: VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), runtime{&runtime_} { : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), runtime{&runtime_} {

2
src/video_core/renderer_opengl/gl_texture_cache.h

@ -109,6 +109,8 @@ public:
void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view) { void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view) {
UNIMPLEMENTED(); UNIMPLEMENTED();
} }
bool IsSteamDeck() const;
bool CanImageBeCopied(const Image& dst, const Image& src); bool CanImageBeCopied(const Image& dst, const Image& src);

25
src/video_core/renderer_vulkan/vk_compute_pass.cpp

@ -775,6 +775,7 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
constexpr u32 SLICES_PER_CHUNK = 64; constexpr u32 SLICES_PER_CHUNK = 64;
scheduler.RequestOutsideRenderPassOperationContext();
for (u32 z_offset = 0; z_offset < z_count; z_offset += SLICES_PER_CHUNK) { for (u32 z_offset = 0; z_offset < z_count; z_offset += SLICES_PER_CHUNK) {
const u32 current_chunk_slices = std::min(SLICES_PER_CHUNK, z_count - z_offset); const u32 current_chunk_slices = std::min(SLICES_PER_CHUNK, z_count - z_offset);
const u32 current_z_start = z_start + z_offset; const u32 current_z_start = z_start + z_offset;
@ -826,9 +827,9 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk(
const void* descriptor_data = compute_pass_descriptor_queue.UpdateData(); const void* descriptor_data = compute_pass_descriptor_queue.UpdateData();
const VkDescriptorSet set = descriptor_allocator.Commit(); const VkDescriptorSet set = descriptor_allocator.Commit();
const u32 gx = Common::DivCeil(blocks_x, 32u);
const u32 gx = Common::DivCeil(blocks_x, 8u);
const u32 gy = Common::DivCeil(blocks_y, 8u); const u32 gy = Common::DivCeil(blocks_y, 8u);
const u32 gz = Common::DivCeil(z_count, 1u);
const u32 gz = Common::DivCeil(z_count, 4u);
const u32 bytes_per_block = 1u << pc.bytes_per_block_log2; const u32 bytes_per_block = 1u << pc.bytes_per_block_log2;
const VkDeviceSize output_slice_size = const VkDeviceSize output_slice_size =
@ -836,14 +837,18 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk(
const VkDeviceSize barrier_size = output_slice_size * z_count; const VkDeviceSize barrier_size = output_slice_size * z_count;
const bool is_first_chunk = (z_start == 0); const bool is_first_chunk = (z_start == 0);
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count,
barrier_size, is_first_chunk](vk::CommandBuffer cmdbuf) {
const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
const VkImage dst_image = image.Handle();
const VkImageAspectFlags aspect = image.AspectMask();
const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
const VkImage dst_image = image.Handle();
const VkImageAspectFlags aspect = image.AspectMask();
const u32 image_width = image.info.size.width;
const u32 image_height = image.info.size.height;
scheduler.Record([this, set, descriptor_data, pc, gx, gy, gz, z_start, z_count,
barrier_size, is_first_chunk, out_buffer, dst_image, aspect,
image_width, image_height
](vk::CommandBuffer cmdbuf) {
device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
@ -893,7 +898,7 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk(
.bufferImageHeight = 0, .bufferImageHeight = 0,
.imageSubresource = {aspect, 0, 0, 1}, .imageSubresource = {aspect, 0, 0, 1},
.imageOffset = {0, 0, static_cast<s32>(z_start)}, // Write to correct Z .imageOffset = {0, 0, static_cast<s32>(z_start)}, // Write to correct Z
.imageExtent = {image.info.size.width, image.info.size.height, z_count},
.imageExtent = {image_width, image_height, z_count},
}; };
cmdbuf.CopyBufferToImage(out_buffer, dst_image, cmdbuf.CopyBufferToImage(out_buffer, dst_image,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy); VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);

4
src/video_core/renderer_vulkan/vk_texture_cache.cpp

@ -1398,6 +1398,10 @@ void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, Im
} }
} }
// Forwards the Steam Deck query from the Vulkan device to the
// backend-agnostic texture cache; the shared texture_cache.h uses it to
// flag low-memory devices. The actual PCI vendor/device-ID check lives in
// Vulkan::Device::IsSteamDeck.
bool TextureCacheRuntime::IsSteamDeck() const {
return device.IsSteamDeck();
}
VkFormat TextureCacheRuntime::GetSupportedFormat(VkFormat requested_format, VkFormat TextureCacheRuntime::GetSupportedFormat(VkFormat requested_format,
VkFormatFeatureFlags required_features) const { VkFormatFeatureFlags required_features) const {
if (requested_format == VK_FORMAT_A8B8G8R8_SRGB_PACK32 && if (requested_format == VK_FORMAT_A8B8G8R8_SRGB_PACK32 &&

2
src/video_core/renderer_vulkan/vk_texture_cache.h

@ -81,6 +81,8 @@ public:
void ReinterpretImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies); void ReinterpretImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view); void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view);
bool IsSteamDeck() const;
bool CanAccelerateImageUpload(Image&) const noexcept { bool CanAccelerateImageUpload(Image&) const noexcept {
return false; return false;

19
src/video_core/texture_cache/texture_cache.h

@ -71,21 +71,10 @@ TextureCache<P>::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManag
DEFAULT_CRITICAL_MEMORY)); DEFAULT_CRITICAL_MEMORY));
minimum_memory = static_cast<u64>((device_local_memory - mem_threshold) / 2); minimum_memory = static_cast<u64>((device_local_memory - mem_threshold) / 2);
const u64 device_memory_u64 = static_cast<u64>(device_local_memory);
if (device_memory_u64 <= 4_GiB) {
chunk_size = 16_MiB;
slices_per_batch = 16;
} else if (device_memory_u64 <= 8_GiB) {
chunk_size = 32_MiB;
slices_per_batch = 32;
} else {
chunk_size = 64_MiB;
slices_per_batch = 64;
}
if (device_memory_u64 <= 8_GiB) {
lowmemorydevice = true;
}
chunk_size = 64_MiB;
slices_per_batch = 64;
lowmemorydevice = runtime.IsSteamDeck();
} else { } else {
expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB; expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB;
critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB; critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB;

7
src/video_core/vulkan_common/vulkan_device.cpp

@ -935,6 +935,13 @@ bool Device::ShouldBoostClocks() const {
return validated_driver && !is_steam_deck && !is_debugging; return validated_driver && !is_steam_deck && !is_debugging;
} }
// Detects a Valve Steam Deck from the physical device's PCI IDs so callers
// (e.g. the texture cache) can scale back costly work on that hardware.
// NOTE(review): ShouldBoostClocks() already consults an is_steam_deck
// member — consider unifying the two detections; TODO confirm.
bool Device::IsSteamDeck() const {
    const auto& props = properties.properties;
    // AMD's PCI vendor ID; both Deck APU revisions are AMD parts.
    const bool is_amd_gpu = props.vendorID == 0x1002;
    // 0x163F: original (LCD) Steam Deck APU; 0x1435: OLED-model APU
    // — presumably "Van Gogh"/"Sephiroth"; verify against the PCI ID database.
    const bool is_deck_apu = props.deviceID == 0x163F || props.deviceID == 0x1435;
    return is_amd_gpu && is_deck_apu;
}
bool Device::HasTimelineSemaphore() const { bool Device::HasTimelineSemaphore() const {
if (GetDriverID() == VK_DRIVER_ID_QUALCOMM_PROPRIETARY || if (GetDriverID() == VK_DRIVER_ID_QUALCOMM_PROPRIETARY ||
GetDriverID() == VK_DRIVER_ID_MESA_TURNIP) { GetDriverID() == VK_DRIVER_ID_MESA_TURNIP) {

1
src/video_core/vulkan_common/vulkan_device.h

@ -301,6 +301,7 @@ public:
} }
bool ShouldBoostClocks() const; bool ShouldBoostClocks() const;
bool IsSteamDeck() const;
/// Returns uniform buffer alignment requirement. /// Returns uniform buffer alignment requirement.
VkDeviceSize GetUniformBufferAlignment() const { VkDeviceSize GetUniformBufferAlignment() const {

Loading…
Cancel
Save