From 440ee4916dd618cea27040fdb9e1c185ac3494f8 Mon Sep 17 00:00:00 2001 From: lizzie Date: Fri, 17 Oct 2025 05:08:51 +0200 Subject: [PATCH] [nca] Use better tight loop allocation schemes (none at all) for AES decrypt/encrypt and force MbedTLS to use AES x86_64 instructions (#2750) Uses stack instead of allocating stuff haphazardly (16 bytes and 512 bytes respectively) - removes malloc() pollution and all that nasty stuff from tight loops Original work by Ribbit but edited by me. Will NOT bring a massive speedup since the main bottleneck is mbedtls itself, but may bring nice oddities to STARTUP TIMES nonetheless. AES instructions being forced wont affect CPUs without them since there is always a runtime check for them. Signed-off-by: lizzie lizzie@eden-emu.dev Co-authored-by: Ribbit Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/2750 Reviewed-by: CamilleLaVey Co-authored-by: lizzie Co-committed-by: lizzie --- .patch/mbedtls/0002-aesni-fix.patch | 13 +++ .patch/mbedtls/0003-aesni-fix.patch | 22 +++++ CMakeLists.txt | 4 + externals/cpmfile.json | 6 +- src/core/crypto/aes_util.cpp | 58 ++++++------ src/core/crypto/ctr_encryption_layer.cpp | 49 ++++++---- src/core/crypto/xts_encryption_layer.cpp | 91 ++++++++++++------- .../fssystem/fssystem_aes_ctr_storage.cpp | 29 +++--- .../fssystem/fssystem_aes_xts_storage.cpp | 58 +++++------- 9 files changed, 200 insertions(+), 130 deletions(-) create mode 100644 .patch/mbedtls/0002-aesni-fix.patch create mode 100644 .patch/mbedtls/0003-aesni-fix.patch diff --git a/.patch/mbedtls/0002-aesni-fix.patch b/.patch/mbedtls/0002-aesni-fix.patch new file mode 100644 index 0000000000..dc5d3153b7 --- /dev/null +++ b/.patch/mbedtls/0002-aesni-fix.patch @@ -0,0 +1,13 @@ +diff --git a/library/aesni.h b/library/aesni.h +index 754c984c79..59e27afd3e 100644 +--- a/library/aesni.h ++++ b/library/aesni.h +@@ -35,7 +35,7 @@ + /* GCC-like compilers: currently, we only support intrinsics if the requisite + * target flag is enabled when building the library (e.g. `gcc -mpclmul -msse2` + * or `clang -maes -mpclmul`). */ +-#if (defined(__GNUC__) || defined(__clang__)) && defined(__AES__) && defined(__PCLMUL__) ++#if defined(__GNUC__) || defined(__clang__) + #define MBEDTLS_AESNI_HAVE_INTRINSICS + #endif + /* For 32-bit, we only support intrinsics */ diff --git a/.patch/mbedtls/0003-aesni-fix.patch b/.patch/mbedtls/0003-aesni-fix.patch new file mode 100644 index 0000000000..c620b42554 --- /dev/null +++ b/.patch/mbedtls/0003-aesni-fix.patch @@ -0,0 +1,22 @@ +diff --git a/library/aesni.c b/library/aesni.c +index 2857068..3e104ab 100644 +--- a/library/aesni.c ++++ b/library/aesni.c +@@ -31,16 +31,14 @@ + #include + #endif + +-#if defined(MBEDTLS_ARCH_IS_X86) + #if defined(MBEDTLS_COMPILER_IS_GCC) + #pragma GCC push_options + #pragma GCC target ("pclmul,sse2,aes") + #define MBEDTLS_POP_TARGET_PRAGMA +-#elif defined(__clang__) && (__clang_major__ >= 5) ++#elif defined(__clang__) + #pragma clang attribute push (__attribute__((target("pclmul,sse2,aes"))), apply_to=function) + #define MBEDTLS_POP_TARGET_PRAGMA + #endif +-#endif + + #if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) + /* diff --git a/CMakeLists.txt b/CMakeLists.txt index e0466da3bd..fa35ca3d16 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,10 @@ if (PLATFORM_SUN) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") endif() + if (CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") + endif() endif() # Needed for FFmpeg w/ VAAPI and DRM diff --git a/externals/cpmfile.json b/externals/cpmfile.json index 669357588a..9025ec2787 100644 --- a/externals/cpmfile.json +++ b/externals/cpmfile.json @@ -97,7 +97,11 @@ "version": "3", "git_version": "3.6.4", "artifact": "%TAG%.tar.bz2", - "skip_updates": true + "skip_updates": true, + "patches": [ + "0002-aesni-fix.patch", + "0003-aesni-fix.patch" + ] }, "enet": { "repo": "lsalzman/enet", diff --git a/src/core/crypto/aes_util.cpp b/src/core/crypto/aes_util.cpp index cd7e15a582..5c31eff3e1 100644 --- a/src/core/crypto/aes_util.cpp +++ b/src/core/crypto/aes_util.cpp @@ -1,7 +1,11 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later #include +#include #include #include "common/assert.h" #include "common/logging/log.h" @@ -71,37 +75,37 @@ void AESCipher::Transcode(const u8* src, std::size_t size, u8* des mbedtls_cipher_reset(context); + // Only ECB strictly requires block sized chunks. std::size_t written = 0; - if (mbedtls_cipher_get_cipher_mode(context) == MBEDTLS_MODE_XTS) { + if (mbedtls_cipher_get_cipher_mode(context) != MBEDTLS_MODE_ECB) { mbedtls_cipher_update(context, src, size, dest, &written); - if (written != size) { - LOG_WARNING(Crypto, "Not all data was decrypted requested={:016X}, actual={:016X}.", - size, written); - } - } else { - const auto block_size = mbedtls_cipher_get_block_size(context); - if (size < block_size) { - std::vector block(block_size); - std::memcpy(block.data(), src, size); - Transcode(block.data(), block.size(), block.data(), op); - std::memcpy(dest, block.data(), size); - return; - } + if (written != size) + LOG_WARNING(Crypto, "Not all data was processed requested={:016X}, actual={:016X}.", size, written); + return; + } + + // ECB path: operate in block sized chunks and mirror previous behavior. + const auto block_size = mbedtls_cipher_get_block_size(context); + if (size < block_size) { + std::vector block(block_size); + std::memcpy(block.data(), src, size); + Transcode(block.data(), block.size(), block.data(), op); + std::memcpy(dest, block.data(), size); + return; + } - for (std::size_t offset = 0; offset < size; offset += block_size) { - auto length = std::min(block_size, size - offset); - mbedtls_cipher_update(context, src + offset, length, dest + offset, &written); - if (written != length) { - if (length < block_size) { - std::vector block(block_size); - std::memcpy(block.data(), src + offset, length); - Transcode(block.data(), block.size(), block.data(), op); - std::memcpy(dest + offset, block.data(), length); - return; - } - LOG_WARNING(Crypto, "Not all data was decrypted requested={:016X}, actual={:016X}.", - length, written); + for (std::size_t offset = 0; offset < size; offset += block_size) { + const auto length = std::min(block_size, size - offset); + mbedtls_cipher_update(context, src + offset, length, dest + offset, &written); + if (written != length) { + if (length < block_size) { + std::vector block(block_size); + std::memcpy(block.data(), src + offset, length); + Transcode(block.data(), block.size(), block.data(), op); + std::memcpy(dest + offset, block.data(), length); + return; } + LOG_WARNING(Crypto, "Not all data was processed requested={:016X}, actual={:016X}.", length, written); } } } diff --git a/src/core/crypto/ctr_encryption_layer.cpp b/src/core/crypto/ctr_encryption_layer.cpp index b48c3f041f..6769754413 100644 --- a/src/core/crypto/ctr_encryption_layer.cpp +++ b/src/core/crypto/ctr_encryption_layer.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later @@ -15,26 +18,36 @@ std::size_t CTREncryptionLayer::Read(u8* data, std::size_t length, std::size_t o if (length == 0) return 0; - const auto sector_offset = offset & 0xF; - if (sector_offset == 0) { - UpdateIV(base_offset + offset); - std::vector raw = base->ReadBytes(length, offset); - cipher.Transcode(raw.data(), raw.size(), data, Op::Decrypt); - return length; + std::size_t total_read = 0; + // Handle an initial misaligned portion if needed. + if (auto const sector_offset = offset & 0xF; sector_offset != 0) { + const std::size_t aligned_off = offset - sector_offset; + std::array block{}; + if (auto const got = base->Read(block.data(), block.size(), aligned_off); got != 0) { + UpdateIV(base_offset + aligned_off); + cipher.Transcode(block.data(), got, block.data(), Op::Decrypt); + auto const to_copy = std::min(length, got > sector_offset ? got - sector_offset : 0); + if (to_copy > 0) { + std::memcpy(data, block.data() + sector_offset, to_copy); + data += to_copy; + offset += to_copy; + length -= to_copy; + total_read += to_copy; + } + } else { + return 0; + } } - - // offset does not fall on block boundary (0x10) - std::vector block = base->ReadBytes(0x10, offset - sector_offset); - UpdateIV(base_offset + offset - sector_offset); - cipher.Transcode(block.data(), block.size(), block.data(), Op::Decrypt); - std::size_t read = 0x10 - sector_offset; - - if (length + sector_offset < 0x10) { - std::memcpy(data, block.data() + sector_offset, std::min(length, read)); - return std::min(length, read); + if (length > 0) { + // Now aligned to 0x10 + UpdateIV(base_offset + offset); + const std::size_t got = base->Read(data, length, offset); + if (got > 0) { + cipher.Transcode(data, got, data, Op::Decrypt); + total_read += got; + } } - std::memcpy(data, block.data() + sector_offset, read); - return read + Read(data + read, length - read, offset + read); + return total_read; } void CTREncryptionLayer::SetIV(const IVData& iv_) { diff --git a/src/core/crypto/xts_encryption_layer.cpp b/src/core/crypto/xts_encryption_layer.cpp index 36cc501b90..240f930c57 100644 --- a/src/core/crypto/xts_encryption_layer.cpp +++ b/src/core/crypto/xts_encryption_layer.cpp @@ -5,12 +5,13 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include #include #include "core/crypto/xts_encryption_layer.h" namespace Core::Crypto { -constexpr u64 XTS_SECTOR_SIZE = 0x4000; +constexpr std::size_t XTS_SECTOR_SIZE = 0x4000; XTSEncryptionLayer::XTSEncryptionLayer(FileSys::VirtualFile base_, Key256 key_) : EncryptionLayer(std::move(base_)), cipher(key_, Mode::XTS) {} @@ -19,41 +20,67 @@ std::size_t XTSEncryptionLayer::Read(u8* data, std::size_t length, std::size_t o if (length == 0) return 0; - const auto sector_offset = offset & 0x3FFF; - if (sector_offset == 0) { - if (length % XTS_SECTOR_SIZE == 0) { - std::vector raw = base->ReadBytes(length, offset); - cipher.XTSTranscode(raw.data(), raw.size(), data, offset / XTS_SECTOR_SIZE, + std::size_t total_read = 0; + // Handle initial unaligned part within a sector. + if (auto const sector_offset = offset % XTS_SECTOR_SIZE; sector_offset != 0) { + const std::size_t aligned_off = offset - sector_offset; + std::array block{}; + if (auto const got = base->Read(block.data(), XTS_SECTOR_SIZE, aligned_off); got > 0) { + if (got < XTS_SECTOR_SIZE) + std::memset(block.data() + got, 0, XTS_SECTOR_SIZE - got); + cipher.XTSTranscode(block.data(), XTS_SECTOR_SIZE, block.data(), aligned_off / XTS_SECTOR_SIZE, XTS_SECTOR_SIZE, Op::Decrypt); - return raw.size(); - } - if (length > XTS_SECTOR_SIZE) { - const auto rem = length % XTS_SECTOR_SIZE; - const auto read = length - rem; - return Read(data, read, offset) + Read(data + read, rem, offset + read); + + auto const to_copy = std::min(length, got > sector_offset ? got - sector_offset : 0); + if (to_copy > 0) { + std::memcpy(data, block.data() + sector_offset, to_copy); + data += to_copy; + offset += to_copy; + length -= to_copy; + total_read += to_copy; + } + } else { + return 0; } - std::vector buffer = base->ReadBytes(XTS_SECTOR_SIZE, offset); - if (buffer.size() < XTS_SECTOR_SIZE) - buffer.resize(XTS_SECTOR_SIZE); - cipher.XTSTranscode(buffer.data(), buffer.size(), buffer.data(), offset / XTS_SECTOR_SIZE, - XTS_SECTOR_SIZE, Op::Decrypt); - std::memcpy(data, buffer.data(), (std::min)(buffer.size(), length)); - return (std::min)(buffer.size(), length); } - // offset does not fall on block boundary (0x4000) - std::vector block = base->ReadBytes(0x4000, offset - sector_offset); - if (block.size() < XTS_SECTOR_SIZE) - block.resize(XTS_SECTOR_SIZE); - cipher.XTSTranscode(block.data(), block.size(), block.data(), - (offset - sector_offset) / XTS_SECTOR_SIZE, XTS_SECTOR_SIZE, Op::Decrypt); - const std::size_t read = XTS_SECTOR_SIZE - sector_offset; - - if (length + sector_offset < XTS_SECTOR_SIZE) { - std::memcpy(data, block.data() + sector_offset, std::min(length, read)); - return std::min(length, read); + if (length > 0) { + // Process aligned middle inplace, in sector sized multiples. + while (length >= XTS_SECTOR_SIZE) { + const std::size_t req = (length / XTS_SECTOR_SIZE) * XTS_SECTOR_SIZE; + const std::size_t got = base->Read(data, req, offset); + if (got == 0) { + return total_read; + } + const std::size_t got_rounded = got - (got % XTS_SECTOR_SIZE); + if (got_rounded > 0) { + cipher.XTSTranscode(data, got_rounded, data, offset / XTS_SECTOR_SIZE, XTS_SECTOR_SIZE, Op::Decrypt); + data += got_rounded; + offset += got_rounded; + length -= got_rounded; + total_read += got_rounded; + } + // If we didn't get a full sector next, break to handle tail. + if (got_rounded != got) { + break; + } + } + // Handle tail within a sector, if any. + if (length > 0) { + std::array block{}; + const std::size_t got = base->Read(block.data(), XTS_SECTOR_SIZE, offset); + if (got > 0) { + if (got < XTS_SECTOR_SIZE) { + std::memset(block.data() + got, 0, XTS_SECTOR_SIZE - got); + } + cipher.XTSTranscode(block.data(), XTS_SECTOR_SIZE, block.data(), + offset / XTS_SECTOR_SIZE, XTS_SECTOR_SIZE, Op::Decrypt); + const std::size_t to_copy = std::min(length, got); + std::memcpy(data, block.data(), to_copy); + total_read += to_copy; + } + } } - std::memcpy(data, block.data() + sector_offset, read); - return read + Read(data + read, length - read, offset + read); + return total_read; } } // namespace Core::Crypto diff --git a/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp b/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp index aaf7788801..1e11d70d8d 100644 --- a/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp +++ b/src/core/file_sys/fssystem/fssystem_aes_ctr_storage.cpp @@ -4,6 +4,7 @@ // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include #include "common/alignment.h" #include "common/swap.h" #include "core/file_sys/fssystem/fssystem_aes_ctr_storage.h" @@ -83,32 +84,24 @@ size_t AesCtrStorage::Write(const u8* buffer, size_t size, size_t offset) { std::memcpy(ctr.data(), m_iv.data(), IvSize); AddCounter(ctr.data(), IvSize, offset / BlockSize); - // Loop until all data is written. - size_t remaining = size; - s64 cur_offset = 0; - - // Get a pooled buffer. - std::vector pooled_buffer(BlockSize); - while (remaining > 0) { + // Loop until all data is written using a pooled buffer residing on the stack (blocksize = 0x10) + boost::container::static_vector pooled_buffer; + for (size_t remaining = size; remaining > 0; ) { // Determine data we're writing and where. - const size_t write_size = std::min(pooled_buffer.size(), remaining); - u8* write_buf = reinterpret_cast(pooled_buffer.data()); + auto const write_size = (std::min)(pooled_buffer.size(), remaining); + u8* write_buf = pooled_buffer.data(); - // Encrypt the data. + // Encrypt the data and then write it. m_cipher->SetIV(ctr); m_cipher->Transcode(buffer, write_size, write_buf, Core::Crypto::Op::Encrypt); + m_base_storage->Write(write_buf, write_size, offset); - // Write the encrypted data. - m_base_storage->Write(write_buf, write_size, offset + cur_offset); - - // Advance. - cur_offset += write_size; + // Advance next write chunk + offset += write_size; remaining -= write_size; - if (remaining > 0) { + if (remaining > 0) AddCounter(ctr.data(), IvSize, write_size / BlockSize); - } } - return size; } diff --git a/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp b/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp index 9e7a104c89..6e07d44cde 100644 --- a/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp +++ b/src/core/file_sys/fssystem/fssystem_aes_xts_storage.cpp @@ -4,9 +4,13 @@ // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include #include "common/alignment.h" #include "common/swap.h" #include "core/file_sys/fssystem/fssystem_aes_xts_storage.h" +#include "core/file_sys/fssystem/fssystem_nca_header.h" #include "core/file_sys/fssystem/fssystem_utility.h" namespace FileSys { @@ -41,18 +45,12 @@ AesXtsStorage::AesXtsStorage(VirtualFile base, const void* key1, const void* key size_t AesXtsStorage::Read(u8* buffer, size_t size, size_t offset) const { // Allow zero-size reads. - if (size == 0) { + if (size == 0) return size; - } - // Ensure buffer is valid. + // Ensure buffer is valid and we can only read at block aligned offsets. ASSERT(buffer != nullptr); - - // We can only read at block aligned offsets. - ASSERT(Common::IsAligned(offset, AesBlockSize)); - ASSERT(Common::IsAligned(size, AesBlockSize)); - - // Read the data. + ASSERT(Common::IsAligned(offset, AesBlockSize) && Common::IsAligned(size, AesBlockSize)); m_base_storage->Read(buffer, size, offset); // Setup the counter. @@ -60,25 +58,21 @@ size_t AesXtsStorage::Read(u8* buffer, size_t size, size_t offset) const { std::memcpy(ctr.data(), m_iv.data(), IvSize); AddCounter(ctr.data(), IvSize, offset / m_block_size); - // Handle any unaligned data before the start. + // Handle any unaligned data before the start; then read said data into a local pooled + // buffer that resides on the stack, do not use the global memory allocator this is a + // very tiny (512 bytes) buffer so should be fine to keep on the stack (Nca::XtsBlockSize wide buffer) size_t processed_size = 0; if ((offset % m_block_size) != 0) { + // Decrypt into our pooled stack buffer (max bound = NCA::XtsBlockSize) + boost::container::static_vector tmp_buf; // Determine the size of the pre-data read. - const size_t skip_size = - static_cast(offset - Common::AlignDown(offset, m_block_size)); - const size_t data_size = (std::min)(size, m_block_size - skip_size); - - // Decrypt into a pooled buffer. - { - std::vector tmp_buf(m_block_size, 0); - std::memcpy(tmp_buf.data() + skip_size, buffer, data_size); - - m_cipher->SetIV(ctr); - m_cipher->Transcode(tmp_buf.data(), m_block_size, tmp_buf.data(), - Core::Crypto::Op::Decrypt); - - std::memcpy(buffer, tmp_buf.data() + skip_size, data_size); - } + auto const skip_size = size_t(offset - Common::AlignDown(offset, m_block_size)); + auto const data_size = (std::min)(size, m_block_size - skip_size); + std::fill_n(tmp_buf.begin(), skip_size, u8{0}); + std::memcpy(tmp_buf.data() + skip_size, buffer, data_size); + m_cipher->SetIV(ctr); + m_cipher->Transcode(tmp_buf.data(), m_block_size, tmp_buf.data(), Core::Crypto::Op::Decrypt); + std::memcpy(buffer, tmp_buf.data() + skip_size, data_size); AddCounter(ctr.data(), IvSize, 1); processed_size += data_size; @@ -86,20 +80,16 @@ size_t AesXtsStorage::Read(u8* buffer, size_t size, size_t offset) const { } // Decrypt aligned chunks. - char* cur = reinterpret_cast(buffer) + processed_size; - size_t remaining = size - processed_size; - while (remaining > 0) { - const size_t cur_size = (std::min)(m_block_size, remaining); - + auto* cur = buffer + processed_size; + for (size_t remaining = size - processed_size; remaining > 0; ) { + auto const cur_size = (std::min)(m_block_size, remaining); m_cipher->SetIV(ctr); - m_cipher->Transcode(cur, cur_size, cur, Core::Crypto::Op::Decrypt); - + auto* char_cur = reinterpret_cast(cur); //same repr cur - diff signedness + m_cipher->Transcode(char_cur, cur_size, char_cur, Core::Crypto::Op::Decrypt); remaining -= cur_size; cur += cur_size; - AddCounter(ctr.data(), IvSize, 1); } - return size; }