Browse Source

Attempt to make GPU ASTC decoding faster

Added device-aware memory saving for MP4
pull/3246/head
Forrest Keller 4 weeks ago
committed by crueter
parent
commit
44c12fa216
  1. 51
      src/video_core/host_shaders/astc_decoder.comp
  2. 5
      src/video_core/renderer_vulkan/vk_compute_pass.cpp
  3. 33
      src/video_core/texture_cache/texture_cache.h
  4. 3
      src/video_core/texture_cache/texture_cache_base.h

51
src/video_core/host_shaders/astc_decoder.comp

@ -61,6 +61,13 @@ const uint encoding_values[22] = uint[](
(JUST_BITS | (5u << 8u)), (QUINT | (3u << 8u)), (TRIT | (4u << 8u)), (JUST_BITS | (6u << 8u)),
(QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)),
(TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u)));
// Precomputed weight tables.
// One table per weight bit length (1 to 5 bits). UnquantizeTexelWeight indexes the table
// matching `bitlen` with the raw `bitval` when the encoding is JUST_BITS, and returns the
// entry directly. Every table starts at 0 and ends at 64.
const uint WEIGHT_TABLE_1BIT[2] = uint[](0, 64);
const uint WEIGHT_TABLE_2BIT[4] = uint[](0, 21, 43, 64);
const uint WEIGHT_TABLE_3BIT[8] = uint[](0, 9, 18, 27, 37, 46, 55, 64);
const uint WEIGHT_TABLE_4BIT[16] = uint[](0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64);
const uint WEIGHT_TABLE_5BIT[32] = uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64);
// Input ASTC texture globals
int total_bitsread = 0;
@ -730,18 +737,30 @@ uint UnquantizeTexelWeight(EncodingData val) {
const uint encoding = Encoding(val);
const uint bitlen = NumBits(val);
const uint bitval = BitValue(val);
if (encoding == JUST_BITS) {
switch (bitlen) {
case 1: return WEIGHT_TABLE_1BIT[bitval];
case 2: return WEIGHT_TABLE_2BIT[bitval];
case 3: return WEIGHT_TABLE_3BIT[bitval];
case 4: return WEIGHT_TABLE_4BIT[bitval];
case 5: return WEIGHT_TABLE_5BIT[bitval];
default: return FastReplicateTo6(bitval, bitlen);
}
}
const uint A = ReplicateBitTo7((bitval & 1));
uint B = 0, C = 0, D = 0;
uint result = 0;
const uint bitlen_0_results[5] = {0, 16, 32, 48, 64};
switch (encoding) {
case JUST_BITS:
return FastReplicateTo6(bitval, bitlen);
case TRIT: {
D = QuintTritValue(val);
switch (bitlen) {
case 0:
return bitlen_0_results[D * 2];
case 0: {
const uint trit_base[3] = uint[](0, 32, 64);
return trit_base[D];
}
case 1: {
C = 50;
break;
@ -758,16 +777,16 @@ uint UnquantizeTexelWeight(EncodingData val) {
B = (cb << 5) | cb;
break;
}
default:
break;
}
break;
}
case QUINT: {
D = QuintTritValue(val);
switch (bitlen) {
case 0:
return bitlen_0_results[D];
case 0: {
const uint quint_base[5] = uint[](0, 16, 32, 48, 64);
return quint_base[D];
}
case 1: {
C = 28;
break;
@ -782,14 +801,17 @@ uint UnquantizeTexelWeight(EncodingData val) {
break;
}
}
if (encoding != JUST_BITS && bitlen > 0) {
if (bitlen > 0) {
result = D * C + B;
result ^= A;
result = (A & 0x20) | (result >> 2);
}
if (result > 32) {
result += 1;
}
return result;
}
@ -1159,10 +1181,11 @@ void DecompressBlock(ivec3 coord) {
}
// Packs selected bits of a 2D position into a single linear offset:
//   bit 5 of pos.x      -> bit 8     (contributes 256)
//   bits 1-2 of pos.y   -> bits 6-7  (contribute 64 and 128)
//   bit 4 of pos.x      -> bit 5     (contributes 32)
//   bit 0 of pos.y      -> bit 4     (contributes 16)
//   bits 0-3 of pos.x   -> bits 0-3
uint SwizzleOffset(uvec2 pos) {
const uint x = pos.x;
const uint y = pos.y;
// Modulo/divide form of the offset computation.
return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 +
((x % 32) / 16) * 32 + (y % 2) * 16 + (x % 16);
// NOTE(review): mask/shift form that yields the same value as the return above. As listed
// here it follows an unconditional return. Presumably these are the before/after lines of
// the same change -- confirm against the actual shader file.
return ((pos.x & 32u) << 3u) |
((pos.y & 6u) << 5u) |
((pos.x & 16u) << 1u) |
((pos.y & 1u) << 4u) |
(pos.x & 15u);
}
void main() {

5
src/video_core/renderer_vulkan/vk_compute_pass.cpp

@ -623,7 +623,7 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
@ -638,9 +638,8 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
},
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, image_barrier);
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, image_barrier);
});
scheduler.Finish();
}
constexpr u32 BL3D_BINDING_SWIZZLE_TABLE = 0;

33
src/video_core/texture_cache/texture_cache.h

@ -70,10 +70,29 @@ TextureCache<P>::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManag
(std::max)((std::min)(device_local_memory - min_vacancy_critical, min_spacing_critical),
DEFAULT_CRITICAL_MEMORY));
minimum_memory = static_cast<u64>((device_local_memory - mem_threshold) / 2);
const u64 device_memory = static_cast<u64>(device_local_memory);
if (device_memory <= 4_GiB) {
chunk_size = 16_MiB;
slices_per_batch = 16;
} else if (device_memory <= 8_GiB) {
chunk_size = 32_MiB;
slices_per_batch = 32;
} else {
chunk_size = 64_MiB;
slices_per_batch = 64;
}
lowmemorydevice = True(device_memory <= 4_GiB);
} else {
expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB;
critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB;
minimum_memory = 0;
chunk_size = 32_MiB;
slices_per_batch = 32;
lowmemorydevice = true;
}
}
@ -1131,6 +1150,10 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
}
image.flags &= ~ImageFlagBits::CpuModified;
if( lowmemorydevice && image.info.format == PixelFormat::BC1_RGBA_UNORM && MapSizeBytes(image) >= 256_MiB ) {
return;
}
TrackImage(image, image_id);
if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) {
@ -1465,17 +1488,13 @@ void TextureCache<P>::TickAsyncUnswizzle() {
task.initialized = true;
}
// ToDo: Make these configurable
static constexpr size_t CHUNK_SIZE = 48_MiB;
static constexpr u32 SLICES_PER_BATCH = 48u;
// Read data
if (task.current_offset < task.total_size) {
const size_t remaining = task.total_size - task.current_offset;
size_t copy_amount = std::min(CHUNK_SIZE, remaining);
size_t copy_amount = std::min(chunk_size, remaining);
if (remaining > CHUNK_SIZE) {
if (remaining > chunk_size) {
copy_amount = (copy_amount / task.bytes_per_slice) * task.bytes_per_slice;
if (copy_amount == 0) copy_amount = task.bytes_per_slice;
}
@ -1490,7 +1509,7 @@ void TextureCache<P>::TickAsyncUnswizzle() {
const u32 complete_slices = static_cast<u32>(bytes_ready / task.bytes_per_slice);
const bool is_final_batch = task.current_offset >= task.total_size;
if (complete_slices >= SLICES_PER_BATCH || (is_final_batch && complete_slices > 0)) {
if (complete_slices >= slices_per_batch || (is_final_batch && complete_slices > 0)) {
const u32 z_start = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
const u32 z_count = std::min(complete_slices, image.info.size.depth - z_start);

3
src/video_core/texture_cache/texture_cache_base.h

@ -475,6 +475,9 @@ private:
u64 minimum_memory;
u64 expected_memory;
u64 critical_memory;
size_t chunk_size;
size_t slices_per_batch;
bool lowmemorydevice = false;
struct BufferDownload {
GPUVAddr address;

Loading…
Cancel
Save