diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c06ed56255..058bca7c00 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -15,6 +15,90 @@ endif()
 # CMake seems to only define _DEBUG on Windows
 set_property(DIRECTORY APPEND PROPERTY
     COMPILE_DEFINITIONS $<$<CONFIG:Debug>:_DEBUG> $<$<NOT:$<CONFIG:Debug>>:NDEBUG>)
+
+# Platform detection macros
+if(WIN32)
+    add_compile_definitions(PLATFORM_WINDOWS)
+elseif(ANDROID)
+    add_compile_definitions(PLATFORM_ANDROID)
+elseif(APPLE)
+    add_compile_definitions(PLATFORM_MACOS)
+elseif(CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")
+    add_compile_definitions(PLATFORM_FREEBSD)
+elseif(UNIX)
+    add_compile_definitions(PLATFORM_LINUX)
+endif()
+
+# Architecture-specific intrinsics flags
+if(ARCHITECTURE_x86_64)
+    add_compile_definitions(ARCH_X86_64)
+
+    if(MSVC AND NOT CXX_CLANG)
+        # MSVC: Enable AVX2 (includes BMI2 support)
+        add_compile_options(/arch:AVX2)
+    else()
+        # GCC/Clang: enable AVX2 and BMI2, each only if the compiler accepts the flag
+        include(CheckCXXCompilerFlag)
+
+        check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)
+        check_cxx_compiler_flag("-mbmi2" COMPILER_SUPPORTS_BMI2)
+
+        if(COMPILER_SUPPORTS_AVX2)
+            add_compile_options(-mavx2)
+        endif()
+
+        if(COMPILER_SUPPORTS_BMI2)
+            add_compile_options(-mbmi2)
+            add_compile_definitions(HAS_BMI2)
+        endif()
+    endif()
+
+elseif(ARCHITECTURE_arm64)
+    add_compile_definitions(ARCH_ARM64)
+
+    # ARM64 has NEON by default, but enable explicitly for some compilers
+    if(NOT MSVC)
+        add_compile_options(-march=armv8-a)
+    endif()
+
+    # Android-specific ARM64 configuration
+    if(ANDROID AND ANDROID_ABI STREQUAL "arm64-v8a")
+        add_compile_options(-march=armv8-a+simd)
+    endif()
+
+elseif(ARCHITECTURE_arm)
+    add_compile_definitions(ARCH_ARM32)
+
+    # Check for NEON support on ARM32
+    if(NOT MSVC)
+        include(CheckCXXCompilerFlag)
+        check_cxx_compiler_flag("-mfpu=neon" COMPILER_SUPPORTS_NEON)
+
+        if(COMPILER_SUPPORTS_NEON)
+            add_compile_options(-mfpu=neon)
+            add_compile_definitions(HAS_NEON)
+
+            # Android ARM32 specific
+            if(ANDROID)
+                add_compile_options(-mfloat-abi=softfp)
+            endif()
+        endif()
+    endif()
+endif()
+
+# Android-specific ABI configurations
+if(ANDROID)
+    if(ANDROID_ABI STREQUAL "x86_64")
+        add_compile_definitions(ARCH_X86_64)
+        if(NOT MSVC)
+            add_compile_options(-msse4.2 -mavx2 -mbmi2)
+        endif()
+    elseif(ANDROID_ABI STREQUAL "armeabi-v7a")
+        add_compile_definitions(ARCH_ARM32)
+        add_compile_options(-mfpu=neon -mfloat-abi=softfp)
+        add_compile_definitions(HAS_NEON)
+    endif()
+endif()
 
 # Set compilation flags
 if (MSVC AND NOT CXX_CLANG)
diff --git a/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp b/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
index 72e0969cf1..19969688ab 100644
--- a/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
+++ b/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
@@ -81,12 +81,12 @@ layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u
 layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
 layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
 
-layout(binding = BINDING_OUTPUT_BUFFER, std430) buffer OutputBuffer {
+layout(binding = BINDING_OUTPUT_BUFFER, std430) writeonly buffer OutputBuffer {
     uint out_u32[];
 };
 
 // --- Constants ---
-layout(local_size_x = 32, local_size_y = 8, local_size_z = 1) in;
+layout(local_size_x = 8, local_size_y = 8, local_size_z = 4) in;
 
 const uint GOB_SIZE_X = 64;
 const uint GOB_SIZE_Y = 8;
@@ -100,9 +100,22 @@ const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_S
 const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u);
 
 // --- Helpers ---
+shared uint swizzle_cache[64 * 8]; // Cache 64x8 GOB worth of swizzle data
+
 uint SwizzleOffset(uvec2 pos) {
     pos &= SWIZZLE_MASK;
-    return swizzle_table[pos.y * 64u + pos.x];
+
+    // Cooperative fill: stride by the workgroup size so that all 64 * 8 entries
+    // are written even though the 8x8x4 workgroup has only 256 invocations.
+    uint group_size = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
+    for (uint i = gl_LocalInvocationIndex; i < 64u * 8u; i += group_size) {
+        swizzle_cache[i] = swizzle_table[i];
+    }
+    // NOTE(review): barrier() must be reached by every invocation of the
+    // workgroup; confirm no caller returns early before calling this helper.
+    barrier();
+
+    return swizzle_cache[pos.y * 64u + pos.x];
 }
 
 uvec4 ReadTexel(uint offset) {
@@ -152,12 +165,21 @@ void main() {
     uint block_index = block_coord.x + (block_coord.y * pc.blocks_dim.x) +
                        (block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y);
 
-    uint out_idx = block_index * (bytes_per_block >> 2u);
-    out_u32[out_idx] = texel.x;
-    out_u32[out_idx + 1] = texel.y;
-    if (pc.bytes_per_block_log2 == 4u) {
-        out_u32[out_idx + 2] = texel.z;
-        out_u32[out_idx + 3] = texel.w;
+    if (bytes_per_block == 16u) {
+        // 16-byte blocks (BC2/BC3/BC5/BC6H/BC7): four words per block
+        out_u32[block_index * 4u] = texel.x;
+        out_u32[block_index * 4u + 1u] = texel.y;
+        out_u32[block_index * 4u + 2u] = texel.z;
+        out_u32[block_index * 4u + 3u] = texel.w;
+    } else if (bytes_per_block == 8u) {
+        // 8-byte blocks (BC1/BC4): two words per block
+        uint out_idx = block_index * 2u;
+        out_u32[out_idx] = texel.x;
+        out_u32[out_idx + 1u] = texel.y;
+    } else {
+        uint out_idx = block_index * (bytes_per_block >> 2u);
+        out_u32[out_idx] = texel.x;
+        if (bytes_per_block > 4u) out_u32[out_idx + 1u] = texel.y;
     }
 
 }
\ No newline at end of file
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index f5bf995d00..0579a3abf1 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -350,4 +350,8 @@ u64 Device::GetCurrentDedicatedVideoMemory() const {
     return static_cast<u64>(cur_avail_mem_kb) * 1_KiB;
 }
 
+bool Device::IsSteamDeck() const {
+    return false;
+}
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index a5a6bbbba7..c11a382382 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -195,6 +195,9 @@ public:
     bool HasLmemPerfBug() const {
         return has_lmem_perf_bug;
     }
+
+    /// Always returns false on the OpenGL backend.
+    bool IsSteamDeck() const;
 
 private:
     static bool TestVariableAoffi();
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 75254049a6..a327e4b879 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -693,6 +693,10 @@ bool TextureCacheRuntime::HasNativeASTC() const noexcept {
     return device.HasASTC();
 }
 
+bool TextureCacheRuntime::IsSteamDeck() const {
+    return device.IsSteamDeck();
+}
+
 Image::Image(TextureCacheRuntime& runtime_, const VideoCommon::ImageInfo& info_, GPUVAddr gpu_addr_,
              VAddr cpu_addr_)
     : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), runtime{&runtime_} {
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 9693a97954..bcb559e145 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -109,6 +109,8 @@ public:
     void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view) {
         UNIMPLEMENTED();
     }
+
+    bool IsSteamDeck() const;
 
     bool CanImageBeCopied(const Image& dst, const Image& src);
 
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
index 411e33f06a..a83d2931fd 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -775,6 +775,7 @@ void BlockLinearUnswizzle3DPass::Unswizzle(
 
     constexpr u32 SLICES_PER_CHUNK = 64;
 
+    scheduler.RequestOutsideRenderPassOperationContext();
     for (u32 z_offset = 0; z_offset < z_count; z_offset += SLICES_PER_CHUNK) {
         const u32 current_chunk_slices = std::min(SLICES_PER_CHUNK, z_count - z_offset);
         const u32 current_z_start = z_start + z_offset;
@@ -826,9 +827,9 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk(
     const void* descriptor_data = compute_pass_descriptor_queue.UpdateData();
     const VkDescriptorSet set = descriptor_allocator.Commit();
 
-    const u32 gx = Common::DivCeil(blocks_x, 32u);
+    const u32 gx = Common::DivCeil(blocks_x, 8u);
     const u32 gy = Common::DivCeil(blocks_y, 8u);
-    const u32 gz = Common::DivCeil(z_count, 1u);
+    const u32 gz = Common::DivCeil(z_count, 4u);
 
     const u32 bytes_per_block = 1u << pc.bytes_per_block_log2;
     const VkDeviceSize output_slice_size =
@@ -836,14 +837,18 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk(
     const VkDeviceSize barrier_size = output_slice_size * z_count;
 
     const bool is_first_chunk = (z_start == 0);
-
-    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([this, &image, set, descriptor_data, pc, gx, gy, gz, z_start, z_count,
-                      barrier_size, is_first_chunk](vk::CommandBuffer cmdbuf) {
-        const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
-        const VkImage dst_image = image.Handle();
-        const VkImageAspectFlags aspect = image.AspectMask();
+    const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
+    const VkImage dst_image = image.Handle();
+    const VkImageAspectFlags aspect = image.AspectMask();
+    const u32 image_width = image.info.size.width;
+    const u32 image_height = image.info.size.height;
+
+    scheduler.Record([this, set, descriptor_data, pc, gx, gy, gz, z_start, z_count,
+                      barrier_size, is_first_chunk, out_buffer, dst_image, aspect,
+                      image_width, image_height
+                      ](vk::CommandBuffer cmdbuf) {
+
         device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
         cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
         cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
 
@@ -893,7 +898,7 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk(
             .bufferImageHeight = 0,
             .imageSubresource = {aspect, 0, 0, 1},
             .imageOffset = {0, 0, static_cast<s32>(z_start)}, // Write to correct Z
-            .imageExtent = {image.info.size.width, image.info.size.height, z_count},
+            .imageExtent = {image_width, image_height, z_count},
         };
 
         cmdbuf.CopyBufferToImage(out_buffer, dst_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 6437cc4c5b..4f24cc13cd 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -1398,6 +1398,10 @@ void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, Im
     }
 }
 
+bool TextureCacheRuntime::IsSteamDeck() const {
+    return device.IsSteamDeck();
+}
+
 VkFormat TextureCacheRuntime::GetSupportedFormat(VkFormat requested_format,
                                                  VkFormatFeatureFlags required_features) const {
     if (requested_format == VK_FORMAT_A8B8G8R8_SRGB_PACK32 &&
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index b545ba8669..cc8b888d57 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -81,6 +81,8 @@ public:
     void ReinterpretImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
 
     void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view);
+
+    bool IsSteamDeck() const;
 
     bool CanAccelerateImageUpload(Image&) const noexcept {
         return false;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 79285ec707..f07699ea70 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -71,21 +71,10 @@ TextureCache<P>::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManag
                                            DEFAULT_CRITICAL_MEMORY));
         minimum_memory = static_cast<u64>((device_local_memory - mem_threshold) / 2);
 
-        const u64 device_memory_u64 = static_cast<u64>(device_local_memory);
-        if (device_memory_u64 <= 4_GiB) {
-            chunk_size = 16_MiB;
-            slices_per_batch = 16;
-        } else if (device_memory_u64 <= 8_GiB) {
-            chunk_size = 32_MiB;
-            slices_per_batch = 32;
-        } else {
-            chunk_size = 64_MiB;
-            slices_per_batch = 64;
-        }
-
-        if (device_memory_u64 <= 8_GiB) {
-            lowmemorydevice = true;
-        }
+        chunk_size = 64_MiB;
+        slices_per_batch = 64;
+
+        lowmemorydevice = runtime.IsSteamDeck();
     } else {
         expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB;
         critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB;
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 2ae5052640..dfb35c8789 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -935,6 +935,13 @@ bool Device::ShouldBoostClocks() const {
     return validated_driver && !is_steam_deck && !is_debugging;
 }
 
+bool Device::IsSteamDeck() const {
+    const auto vendor_id = properties.properties.vendorID;
+    const auto device_id = properties.properties.deviceID;
+    // Vendor 0x1002 with either of the two device IDs below is reported as a Steam Deck.
+    return vendor_id == 0x1002 && (device_id == 0x163F || device_id == 0x1435);
+}
+
 bool Device::HasTimelineSemaphore() const {
     if (GetDriverID() == VK_DRIVER_ID_QUALCOMM_PROPRIETARY ||
         GetDriverID() == VK_DRIVER_ID_MESA_TURNIP) {
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index 9b08dc2926..15ed89ba0d 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -301,6 +301,8 @@ public:
     }
 
     bool ShouldBoostClocks() const;
+    /// Returns true when the physical device's vendor/device ID pair matches a Steam Deck.
+    bool IsSteamDeck() const;
 
     /// Returns uniform buffer alignment requirement.
     VkDeviceSize GetUniformBufferAlignment() const {