@@ -35,115 +35,12 @@
 
 namespace Core::Memory {
 
 namespace {
 
-inline void FastMemcpy(void* dst, const void* src, std::size_t size) {
-    // Fast path for small copies
-    switch (size) {
-    case 1:
-        *static_cast<u8*>(dst) = *static_cast<const u8*>(src);
-        break;
-    case 2:
-        *static_cast<u16*>(dst) = *static_cast<const u16*>(src);
-        break;
-    case 4:
-        *static_cast<u32*>(dst) = *static_cast<const u32*>(src);
-        break;
-    case 8:
-        *static_cast<u64*>(dst) = *static_cast<const u64*>(src);
-        break;
-    case 16: {
-        // Optimize for 16-byte copy (common case for SIMD registers)
-        const u64* src_64 = static_cast<const u64*>(src);
-        u64* dst_64 = static_cast<u64*>(dst);
-        dst_64[0] = src_64[0];
-        dst_64[1] = src_64[1];
-        break;
-    }
-    case 32: {
-        // Optimize for 32-byte copy
-        const u64* src_64 = static_cast<const u64*>(src);
-        u64* dst_64 = static_cast<u64*>(dst);
-        dst_64[0] = src_64[0];
-        dst_64[1] = src_64[1];
-        dst_64[2] = src_64[2];
-        dst_64[3] = src_64[3];
-        break;
-    }
-    case 64: {
-        // Optimize for 64-byte copy
-        const u64* src_64 = static_cast<const u64*>(src);
-        u64* dst_64 = static_cast<u64*>(dst);
-        dst_64[0] = src_64[0];
-        dst_64[1] = src_64[1];
-        dst_64[2] = src_64[2];
-        dst_64[3] = src_64[3];
-        dst_64[4] = src_64[4];
-        dst_64[5] = src_64[5];
-        dst_64[6] = src_64[6];
-        dst_64[7] = src_64[7];
-        break;
-    }
-    default:
-        // For larger sizes, use standard memcpy which is usually optimized by the compiler
-        std::memcpy(dst, src, size);
-        break;
-    }
-}
-
-inline void FastMemset(void* dst, int value, std::size_t size) {
-    // Fast path for small fills
-    switch (size) {
-    case 1:
-        *static_cast<u8*>(dst) = static_cast<u8>(value);
-        break;
-    case 2:
-        *static_cast<u16*>(dst) = static_cast<u16>(value);
-        break;
-    case 4:
-        *static_cast<u32*>(dst) = static_cast<u32>(value);
-        break;
-    case 8:
-        *static_cast<u64*>(dst) = static_cast<u64>(value);
-        break;
-    case 16: {
-        // Optimize for 16-byte fill (common case for SIMD registers)
-        u64* dst_64 = static_cast<u64*>(dst);
-        const u64 val64 = static_cast<u8>(value) * 0x0101010101010101ULL;
-        dst_64[0] = val64;
-        dst_64[1] = val64;
-        break;
-    }
-    default:
-        if (size <= 128 && value == 0) {
-            // Fast path for small zero-fills
-            u8* dst_bytes = static_cast<u8*>(dst);
-            for (std::size_t i = 0; i < size; i += 8) {
-                if (i + 8 <= size) {
-                    *reinterpret_cast<u64*>(dst_bytes + i) = 0;
-                } else {
-                    // Handle remaining bytes (less than 8)
-                    for (std::size_t j = i; j < size; j++) {
-                        dst_bytes[j] = 0;
-                    }
-                }
-            }
-        } else {
-            // For larger sizes, use standard memset which is usually optimized by the compiler
-            std::memset(dst, value, size);
-        }
-        break;
-    }
-}
-
-bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr,
+static inline bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr,
                           const std::size_t size) {
     const Common::ProcessAddress max_addr = 1ULL << table.GetAddressSpaceBits();
     return addr + size >= addr && addr + size <= max_addr;
 }
 
 } // namespace
 
 // Implementation class used to keep the specifics of the memory subsystem hidden
 // from outside classes. This also allows modification to the internals of the memory
 // subsystem without needing to rebuild all files that make use of the memory interface.
@@ -416,70 +313,28 @@ struct Memory::Impl {
                 LOG_ERROR(HW_Memory,
                           "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
                           GetInteger(current_vaddr), GetInteger(src_addr), size);
-                FastMemset(dest_buffer, 0, copy_amount);
+                std::memset(dest_buffer, 0, copy_amount);
             },
             [&](const std::size_t copy_amount, const u8* const src_ptr) {
-                FastMemcpy(dest_buffer, src_ptr, copy_amount);
+                std::memcpy(dest_buffer, src_ptr, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 const u8* const host_ptr) {
                 if constexpr (!UNSAFE) {
                     HandleRasterizerDownload(GetInteger(current_vaddr), copy_amount);
                 }
-                FastMemcpy(dest_buffer, host_ptr, copy_amount);
+                std::memcpy(dest_buffer, host_ptr, copy_amount);
             },
             [&](const std::size_t copy_amount) {
                 dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
             });
     }
 
-    bool ReadBlockParallel(const Common::ProcessAddress src_addr, void* dest_buffer,
-                           const std::size_t size) {
-        // Calculate chunk size based on thread count
-        const size_t chunk_size = (size + thread_count - 1) / thread_count;
-
-        // Create threads for parallel processing
-        std::vector<std::thread> threads;
-        threads.reserve(thread_count);
-
-        // Create a vector to store the results of each thread
-        std::vector<bool> results(thread_count, true);
-
-        // Split the work among threads
-        for (unsigned int i = 0; i < thread_count; ++i) {
-            const size_t offset = i * chunk_size;
-            if (offset >= size) {
-                break;
-            }
-
-            const size_t current_chunk_size = std::min(chunk_size, size - offset);
-            const Common::ProcessAddress current_addr = src_addr + offset;
-            void* current_dest = static_cast<u8*>(dest_buffer) + offset;
-
-            // Launch thread
-            threads.emplace_back([this, i, current_addr, current_dest, current_chunk_size, &results] {
-                results[i] = ReadBlockImpl<false>(current_addr, current_dest, current_chunk_size);
-            });
-        }
-
-        // Wait for all threads to complete
-        for (auto& thread : threads) {
-            thread.join();
-        }
-
-        // Check if all operations succeeded
-        return std::all_of(results.begin(), results.end(), [](bool result) { return result; });
-    }
-
     bool ReadBlock(const Common::ProcessAddress src_addr, void* dest_buffer,
                    const std::size_t size) {
-        // For small reads, use the regular implementation
-        if (size < PARALLEL_THRESHOLD) {
-            return ReadBlockImpl<false>(src_addr, dest_buffer, size);
-        }
-
-        // For large reads, use parallel implementation
-        return ReadBlockParallel(src_addr, dest_buffer, size);
+        // TODO: If you want a proper multithreaded implementation (w/o cache coherency fights)
+        // use TBB or something that splits the job properly
+        return ReadBlockImpl<false>(src_addr, dest_buffer, size);
     }
 
     bool ReadBlockUnsafe(const Common::ProcessAddress src_addr, void* dest_buffer,
@@ -515,67 +370,25 @@ struct Memory::Impl {
                           GetInteger(current_vaddr), GetInteger(dest_addr), size);
             },
             [&](const std::size_t copy_amount, u8* const dest_ptr) {
-                FastMemcpy(dest_ptr, src_buffer, copy_amount);
+                std::memcpy(dest_ptr, src_buffer, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 u8* const host_ptr) {
                 if constexpr (!UNSAFE) {
                     HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount);
                 }
-                FastMemcpy(host_ptr, src_buffer, copy_amount);
+                std::memcpy(host_ptr, src_buffer, copy_amount);
             },
             [&](const std::size_t copy_amount) {
                 src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
             });
     }
 
-    bool WriteBlockParallel(const Common::ProcessAddress dest_addr, const void* src_buffer,
-                            const std::size_t size) {
-        // Calculate chunk size based on thread count
-        const size_t chunk_size = (size + thread_count - 1) / thread_count;
-
-        // Create threads for parallel processing
-        std::vector<std::thread> threads;
-        threads.reserve(thread_count);
-
-        // Create a vector to store the results of each thread
-        std::vector<bool> results(thread_count, true);
-
-        // Split the work among threads
-        for (unsigned int i = 0; i < thread_count; ++i) {
-            const size_t offset = i * chunk_size;
-            if (offset >= size) {
-                break;
-            }
-
-            const size_t current_chunk_size = std::min(chunk_size, size - offset);
-            const Common::ProcessAddress current_addr = dest_addr + offset;
-            const void* current_src = static_cast<const u8*>(src_buffer) + offset;
-
-            // Launch thread
-            threads.emplace_back([this, i, current_addr, current_src, current_chunk_size, &results] {
-                results[i] = WriteBlockImpl<false>(current_addr, current_src, current_chunk_size);
-            });
-        }
-
-        // Wait for all threads to complete
-        for (auto& thread : threads) {
-            thread.join();
-        }
-
-        // Check if all operations succeeded
-        return std::all_of(results.begin(), results.end(), [](bool result) { return result; });
-    }
-
     bool WriteBlock(const Common::ProcessAddress dest_addr, const void* src_buffer,
                     const std::size_t size) {
-        // For small writes, use the regular implementation
-        if (size < PARALLEL_THRESHOLD) {
-            return WriteBlockImpl<false>(dest_addr, src_buffer, size);
-        }
-
-        // For large writes, use parallel implementation
-        return WriteBlockParallel(dest_addr, src_buffer, size);
+        // TODO: If you want a proper multithreaded implementation (w/o cache coherency fights)
+        // use TBB or something that splits the job properly
+        return WriteBlockImpl<false>(dest_addr, src_buffer, size);
     }
 
     bool WriteBlockUnsafe(const Common::ProcessAddress dest_addr, const void* src_buffer,
@@ -593,12 +406,12 @@ struct Memory::Impl {
                           GetInteger(current_vaddr), GetInteger(dest_addr), size);
             },
             [](const std::size_t copy_amount, u8* const dest_ptr) {
-                FastMemset(dest_ptr, 0, copy_amount);
+                std::memset(dest_ptr, 0, copy_amount);
             },
             [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount,
                 u8* const host_ptr) {
                 HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount);
-                FastMemset(host_ptr, 0, copy_amount);
+                std::memset(host_ptr, 0, copy_amount);
             },
             [](const std::size_t copy_amount) {});
     }
@@ -993,7 +806,7 @@ struct Memory::Impl {
             },
             [&]() { HandleRasterizerDownload(addr, sizeof(T)); });
         if (ptr) {
-            FastMemcpy(&result, ptr, sizeof(T));
+            std::memcpy(&result, ptr, sizeof(T));
         }
         return result;
     }
@@ -1080,7 +893,7 @@ struct Memory::Impl {
             },
             [&]() { HandleRasterizerWrite(addr, sizeof(T)); });
         if (ptr) {
-            FastMemcpy(ptr, &data, sizeof(T));
+            std::memcpy(ptr, &data, sizeof(T));
         }
     }
 
@@ -1203,7 +1016,7 @@ struct Memory::Impl {
     unsigned int thread_count = 2;
 
     // Minimum size in bytes for which parallel processing is beneficial
-    static constexpr size_t PARALLEL_THRESHOLD = 64 * 1024; // 64 KB
+    //size_t PARALLEL_THRESHOLD = (L3 CACHE * NUM PHYSICAL CORES); // 64 KB
     std::array<VideoCore::RasterizerDownloadArea, Core::Hardware::NUM_CPU_CORES>
         rasterizer_read_areas{};
     std::array<GPUDirtyState, Core::Hardware::NUM_CPU_CORES> rasterizer_write_areas{};