[video_core] friendlier ASTC replicate function that doesn't trash cache

Signed-off-by: lizzie <lizzie@eden-emu.dev>
2 weeks ago · 39138fbdd2
1 changed files with 17 additions and 112 deletions
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@ -589,109 +589,13 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {

 // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)]
 // is the same as [(num_bits - 1):0] and repeats all the way down.
-template <typename IntType>
-static constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) {
-    if (num_bits == 0 || to_bit == 0) {
-        return 0;
-    }
-    const IntType v = val & static_cast<IntType>((1 << num_bits) - 1);
-    IntType res = v;
-    u32 reslen = num_bits;
-    while (reslen < to_bit) {
-        u32 comp = 0;
-        if (num_bits > to_bit - reslen) {
-            u32 newshift = to_bit - reslen;
-            comp = num_bits - newshift;
-            num_bits = newshift;
-        }
-        res = static_cast<IntType>(res << num_bits);
-        res = static_cast<IntType>(res | (v >> comp));
-        reslen += num_bits;
-    }
-    return res;
-}
-
-static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
-    return std::size_t(1) << num_bits;
-}
-
-template <typename IntType, u32 num_bits, u32 to_bit>
-static constexpr auto MakeReplicateTable() {
-    std::array<IntType, NumReplicateEntries(num_bits)> table{};
-    for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
-        table[value] = Replicate(value, num_bits, to_bit);
-    }
-    return table;
-}
-
-static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
-static constexpr u32 ReplicateByteTo16(std::size_t value) {
-    return REPLICATE_BYTE_TO_16_TABLE[value];
-}
-
-static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
-static constexpr u32 ReplicateBitTo7(std::size_t value) {
-    return REPLICATE_BIT_TO_7_TABLE[value];
-}
-
-static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
-static constexpr u32 ReplicateBitTo9(std::size_t value) {
-    return REPLICATE_BIT_TO_9_TABLE[value];
-}
-
-static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
-static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
-static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
-static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
-static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
-static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
-static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
-static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
-/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
-/// to the runtime implementation
-static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
-    switch (num_bits) {
-    case 1:
-        return REPLICATE_1_BIT_TO_8_TABLE[value];
-    case 2:
-        return REPLICATE_2_BIT_TO_8_TABLE[value];
-    case 3:
-        return REPLICATE_3_BIT_TO_8_TABLE[value];
-    case 4:
-        return REPLICATE_4_BIT_TO_8_TABLE[value];
-    case 5:
-        return REPLICATE_5_BIT_TO_8_TABLE[value];
-    case 6:
-        return REPLICATE_6_BIT_TO_8_TABLE[value];
-    case 7:
-        return REPLICATE_7_BIT_TO_8_TABLE[value];
-    case 8:
-        return REPLICATE_8_BIT_TO_8_TABLE[value];
-    default:
-        return Replicate(value, num_bits, 8);
-    }
-}
-
-static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
-static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
-static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
-static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
-static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
-static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
-    switch (num_bits) {
-    case 1:
-        return REPLICATE_1_BIT_TO_6_TABLE[value];
-    case 2:
-        return REPLICATE_2_BIT_TO_6_TABLE[value];
-    case 3:
-        return REPLICATE_3_BIT_TO_6_TABLE[value];
-    case 4:
-        return REPLICATE_4_BIT_TO_6_TABLE[value];
-    case 5:
-        return REPLICATE_5_BIT_TO_6_TABLE[value];
-    default:
-        return Replicate(value, num_bits, 6);
-    }
+// Only the low num_bits of v are used. The copies are laid out MSB-first, so when to_bit is
+// not a multiple of num_bits the final copy contributes just its most significant bits
+// (e.g. 0b100 replicated from 3 to 8 bits is 0b10010010). Returns 0 when num_bits or to_bit
+// is 0, and the masked value unchanged when num_bits >= to_bit. Expects to_bit <= 32.
+[[nodiscard]] constexpr u32 Replicate(u32 v, u32 num_bits, u32 to_bit) {
+    if (num_bits == 0 || to_bit == 0) {
+        return 0;
+    }
+    // Shifting a 32-bit value by 32 or more is undefined, so a full-width pattern skips the mask.
+    const u32 pattern = num_bits < 32 ? (v & ((u32{1} << num_bits) - 1)) : v;
+    u32 result = pattern;
+    u32 filled = num_bits;
+    while (filled < to_bit) {
+        // Append as much of the pattern as still fits, taking its top bits first.
+        const u32 remaining = to_bit - filled;
+        const u32 take = num_bits < remaining ? num_bits : remaining;
+        result = (result << take) | (pattern >> (num_bits - take));
+        filled += take;
+    }
+    return result;
 }

 class Pixel {
@ -734,9 +638,9 @@ public:
            // Do nothing
            return val;
        } else if (oldDepth == 0) {
-            return static_cast<ChannelType>((1 << 8) - 1);
+            return ChannelType((1 << 8) - 1);
        } else if (8 > oldDepth) {
-            return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth));
+            return ChannelType(Replicate(u32(val), oldDepth, 8));
        } else {
            // oldDepth > newDepth
            const u8 bitsWasted = static_cast<u8>(oldDepth - 8);
@ -868,14 +772,14 @@ static void DecodeColorValues(u32* out, std::span<u8> data, const u32* modes, co

        assert(bitlen >= 1);

-        u32 A = 0, B = 0, C = 0, D = 0;
        // A is just the lsb replicated 9 times.
-        A = ReplicateBitTo9(bitval & 1);
+        u32 A = (bitval & 1) ? ((1 << 9) - 1) : 0;
+        u32 B = 0, C = 0, D = 0;

        switch (val.encoding) {
        // Replicate bits
        case IntegerEncoding::JustBits:
-            out[outIdx++] = FastReplicateTo8(bitval, bitlen);
+            out[outIdx++] = Replicate(bitval, bitlen, 8);
            break;

        // Use algorithm in C.2.13
@ -993,13 +897,14 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
    u32 bitval = val.bit_value;
    u32 bitlen = val.num_bits;

-    u32 A = ReplicateBitTo7(bitval & 1);
+    // A is just LSB repeated 7 times
+    u32 A = (bitval & 1) ? ((1 << 7) - 1) : 0;
    u32 B = 0, C = 0, D = 0;

    u32 result = 0;
    switch (val.encoding) {
    case IntegerEncoding::JustBits:
-        result = FastReplicateTo6(bitval, bitlen);
+        result = Replicate(bitval, bitlen, 6);
        break;

    case IntegerEncoding::Trit: {
@ -1631,9 +1536,9 @@ static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
            Pixel p;
            for (u32 c = 0; c < 4; c++) {
                u32 C0 = endpoints[partition][0].Component(c);
-                C0 = ReplicateByteTo16(C0);
                u32 C1 = endpoints[partition][1].Component(c);
-                C1 = ReplicateByteTo16(C1);
+                C0 = (C0 & 0xff) | ((C0 & 0xff) << 8);
+                C1 = (C1 & 0xff) | ((C0 & 0xff) << 8);

                u32 plane = 0;
                if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {