|
|
@ -36,6 +36,12 @@ struct EncodingData { |
|
|
uint data; |
|
|
uint data; |
|
|
}; |
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
// Partition-selection data that depends only on the partition seed and the
// partition count. GetPartitionTable() fills it once; SelectPartition() then
// reads it for each texel.
struct PartitionTable {
    // Eight squared 4-bit fields of rnum, stored after the right shift:
    // s1/s3/s5/s7 are shifted by sh1 and s2/s4/s6/s8 by sh2.
    uint s1, s2, s3, s4, s5, s6, s7, s8;
    // Hash52() of the seed after (partition_count - 1) * 1024 has been added.
    uint rnum;
    // True when block_dims.x * block_dims.y < 32; SelectPartition() doubles
    // x and y in that case.
    bool small_block;
};
|
|
|
|
|
|
|
|
// Read-only storage buffer with the compressed input. main() loads one element
// per workgroup (astc_data[offset / 16]) into local_buff.
layout(binding = BINDING_INPUT_BUFFER, std430) readonly restrict buffer InputBufferU32 {
    uvec4 astc_data[];
};
|
|
@ -62,26 +68,40 @@ const uint encoding_values[22] = uint[]( |
|
|
(QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)), |
|
|
(QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)), |
|
|
(TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u))); |
|
|
(TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u))); |
|
|
|
|
|
|
|
|
// Workgroup-shared decode state. Shared variables cannot carry initializers;
// main() has invocation 0 reset the counters and flags before the first
// barrier().

// Input block being decoded and the number of bits consumed from it so far.
shared uvec4 local_buff;
shared int total_bitsread;

// Color data globals
shared uvec4 color_endpoint_data;
shared int color_bitsread;

// Global "vector" to be pushed into when decoding
// NOTE(review): DIVCEIL's arguments and expansion are not parenthesized, so
// it is only safe as a stand-alone expression such as VECTOR_ARRAY_SIZE below.
#define DIVCEIL(number, divisor) (number + divisor - 1) / divisor
#define ARRAY_NUM_ELEMENTS 144
#define VECTOR_ARRAY_SIZE DIVCEIL(ARRAY_NUM_ELEMENTS * 2, 4)
shared uint result_vector[ARRAY_NUM_ELEMENTS * 2];

// Next write position in result_vector, the limit for the current decode
// phase, and a flag that ResultEmplaceBack() raises once the limit is hit.
shared int result_index;
shared uint result_vector_max_index;
shared bool result_limit_reached;

// avoid intermediate result_vector storage during color decode phase
// While write_color_values is true, ResultEmplaceBack() routes values to
// EmitColorValue(), which stores them in color_values_direct.
shared bool write_color_values;
shared uint color_values_direct[32];
shared uint color_out_index;
shared uint color_num_values;

// Shared variables for DecompressBlock interthread communication
shared uvec4 endpoints0[4];
shared uvec4 endpoints1[4];
shared PartitionTable pt;
shared uvec2 size_params;
shared uint num_partitions;
shared uint partition_index;
shared uint plane_index;
shared bool dual_plane;
shared vec4 fill_color;
|
|
|
|
|
|
|
|
// EncodingData helpers |
|
|
// EncodingData helpers |
|
|
uint Encoding(EncodingData val) { |
|
|
uint Encoding(EncodingData val) { |
|
|
@ -114,9 +134,110 @@ EncodingData CreateEncodingData(uint encoding, uint num_bits, uint bit_val, uint |
|
|
return EncodingData(((encoding) << 0u) | ((num_bits) << 8u) | |
|
|
return EncodingData(((encoding) << 0u) | ((num_bits) << 8u) | |
|
|
((bit_val) << 16u) | ((quint_trit_val) << 24u)); |
|
|
((bit_val) << 16u) | ((quint_trit_val) << 24u)); |
|
|
} |
|
|
} |
|
|
|
|
|
// Forward declarations: both helpers are defined later in the file.
uint ReplicateBitTo9(uint bit);
uint FastReplicateTo8(uint value, uint num_bits);

// Converts one decoded EncodingData element into a color value and appends it
// to the shared color_values_direct[] array.
//
// The index is pre-incremented, so the first value after color_out_index is
// reset to 0 lands in slot 1.
// NOTE(review): the caller's guard is color_out_index >= color_num_values. If
// color_num_values can reach 32, the final store would target index 32, one
// past the end of color_values_direct[32]. Confirm the upper bound.
void EmitColorValue(EncodingData val) {
    // write directly to color_values_direct[]
    const uint encoding = Encoding(val);
    const uint bitlen = NumBits(val);
    const uint bitval = BitValue(val);

    // Plain bit payloads only need widening to 8 bits.
    if (encoding == JUST_BITS) {
        color_values_direct[++color_out_index] = FastReplicateTo8(bitval, bitlen);
        return;
    }

    // Trit/quint payloads are combined as T = D * C + B:
    //   A - derived from the lowest payload bit (presumably that bit replicated
    //       across 9 bits, going by the helper's name)
    //   B - bit pattern built from the remaining payload bits
    //   C - constant chosen by encoding kind and bit length
    //   D - the trit or quint value itself
    uint A = ReplicateBitTo9((bitval & 1));
    uint B = 0, C = 0, D = QuintTritValue(val);

    if (encoding == TRIT) {
        switch (bitlen) {
        case 1:
            C = 204;
            break;
        case 2: {
            C = 93;
            const uint b = (bitval >> 1) & 1;
            B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
            break;
        }
        case 3: {
            C = 44;
            const uint cb = (bitval >> 1) & 3;
            B = (cb << 7) | (cb << 2) | cb;
            break;
        }
        case 4: {
            C = 22;
            const uint dcb = (bitval >> 1) & 7;
            B = (dcb << 6) | dcb;
            break;
        }
        case 5: {
            C = 11;
            const uint edcb = (bitval >> 1) & 0xF;
            B = (edcb << 5) | (edcb >> 2);
            break;
        }
        case 6: {
            C = 5;
            const uint fedcb = (bitval >> 1) & 0x1F;
            B = (fedcb << 4) | (fedcb >> 4);
            break;
        }
        }
    } else { // QUINT
        switch (bitlen) {
        case 1:
            C = 113;
            break;
        case 2: {
            C = 54;
            const uint b = (bitval >> 1) & 1;
            B = (b << 8) | (b << 3) | (b << 2);
            break;
        }
        case 3: {
            C = 26;
            const uint cb = (bitval >> 1) & 3;
            B = (cb << 7) | (cb << 1) | (cb >> 1);
            break;
        }
        case 4: {
            C = 13;
            const uint dcb = (bitval >> 1) & 7;
            B = (dcb << 6) | (dcb >> 1);
            break;
        }
        case 5: {
            C = 6;
            const uint edcb = (bitval >> 1) & 0xF;
            B = (edcb << 5) | (edcb >> 3);
            break;
        }
        }
    }

    // Bit lengths not listed above leave B and C at 0, so T starts as 0.
    uint T = (D * C) + B;
    T ^= A;
    T = (A & 0x80) | (T >> 2);
    color_values_direct[++color_out_index] = T;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void ResultEmplaceBack(EncodingData val) { |
|
|
void ResultEmplaceBack(EncodingData val) { |
|
|
|
|
|
if (write_color_values) { |
|
|
|
|
|
if (color_out_index >= color_num_values) { |
|
|
|
|
|
// avoid decoding more than needed by this phase |
|
|
|
|
|
result_limit_reached = true; |
|
|
|
|
|
return; |
|
|
|
|
|
} |
|
|
|
|
|
EmitColorValue(val); |
|
|
|
|
|
return; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
if (result_index >= result_vector_max_index) { |
|
|
if (result_index >= result_vector_max_index) { |
|
|
// Alert callers to avoid decoding more than needed by this phase |
|
|
// Alert callers to avoid decoding more than needed by this phase |
|
|
result_limit_reached = true; |
|
|
result_limit_reached = true; |
|
|
@ -197,32 +318,31 @@ uint Hash52(uint p) { |
|
|
return p; |
|
|
return p; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
uint Select2DPartition(uint seed, uint x, uint y, uint partition_count) { |
|
|
|
|
|
if ((block_dims.y * block_dims.x) < 32) { |
|
|
|
|
|
x <<= 1; |
|
|
|
|
|
y <<= 1; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
seed += (partition_count - 1) * 1024; |
|
|
|
|
|
|
|
|
PartitionTable GetPartitionTable(uint seed, uint partition_count) { |
|
|
|
|
|
PartitionTable pt; |
|
|
|
|
|
pt.small_block = (block_dims.y * block_dims.x) < 32; |
|
|
|
|
|
|
|
|
const uint rnum = Hash52(uint(seed)); |
|
|
|
|
|
uint seed1 = uint(rnum & 0xF); |
|
|
|
|
|
uint seed2 = uint((rnum >> 4) & 0xF); |
|
|
|
|
|
uint seed3 = uint((rnum >> 8) & 0xF); |
|
|
|
|
|
uint seed4 = uint((rnum >> 12) & 0xF); |
|
|
|
|
|
uint seed5 = uint((rnum >> 16) & 0xF); |
|
|
|
|
|
uint seed6 = uint((rnum >> 20) & 0xF); |
|
|
|
|
|
uint seed7 = uint((rnum >> 24) & 0xF); |
|
|
|
|
|
uint seed8 = uint((rnum >> 28) & 0xF); |
|
|
|
|
|
|
|
|
|
|
|
seed1 = (seed1 * seed1); |
|
|
|
|
|
seed2 = (seed2 * seed2); |
|
|
|
|
|
seed3 = (seed3 * seed3); |
|
|
|
|
|
seed4 = (seed4 * seed4); |
|
|
|
|
|
seed5 = (seed5 * seed5); |
|
|
|
|
|
seed6 = (seed6 * seed6); |
|
|
|
|
|
seed7 = (seed7 * seed7); |
|
|
|
|
|
seed8 = (seed8 * seed8); |
|
|
|
|
|
|
|
|
seed += (partition_count - 1) * 1024; |
|
|
|
|
|
uint rnum = Hash52(uint(seed)); |
|
|
|
|
|
pt.rnum = rnum; |
|
|
|
|
|
|
|
|
|
|
|
uint seed1 = (rnum & 0xF); |
|
|
|
|
|
seed1 *= seed1; |
|
|
|
|
|
uint seed2 = (rnum >> 4) & 0xF; |
|
|
|
|
|
seed2 *= seed2; |
|
|
|
|
|
uint seed3 = (rnum >> 8) & 0xF; |
|
|
|
|
|
seed3 *= seed3; |
|
|
|
|
|
uint seed4 = (rnum >> 12) & 0xF; |
|
|
|
|
|
seed4 *= seed4; |
|
|
|
|
|
uint seed5 = (rnum >> 16) & 0xF; |
|
|
|
|
|
seed5 *= seed5; |
|
|
|
|
|
uint seed6 = (rnum >> 20) & 0xF; |
|
|
|
|
|
seed6 *= seed6; |
|
|
|
|
|
uint seed7 = (rnum >> 24) & 0xF; |
|
|
|
|
|
seed7 *= seed7; |
|
|
|
|
|
uint seed8 = (rnum >> 28) & 0xF; |
|
|
|
|
|
seed8 *= seed8; |
|
|
|
|
|
|
|
|
uint sh1, sh2; |
|
|
uint sh1, sh2; |
|
|
if ((seed & 1) > 0) { |
|
|
if ((seed & 1) > 0) { |
|
|
@ -232,31 +352,37 @@ uint Select2DPartition(uint seed, uint x, uint y, uint partition_count) { |
|
|
sh1 = (partition_count == 3) ? 6 : 5; |
|
|
sh1 = (partition_count == 3) ? 6 : 5; |
|
|
sh2 = (seed & 2) > 0 ? 4 : 5; |
|
|
sh2 = (seed & 2) > 0 ? 4 : 5; |
|
|
} |
|
|
} |
|
|
seed1 >>= sh1; |
|
|
|
|
|
seed2 >>= sh2; |
|
|
|
|
|
seed3 >>= sh1; |
|
|
|
|
|
seed4 >>= sh2; |
|
|
|
|
|
seed5 >>= sh1; |
|
|
|
|
|
seed6 >>= sh2; |
|
|
|
|
|
seed7 >>= sh1; |
|
|
|
|
|
seed8 >>= sh2; |
|
|
|
|
|
|
|
|
|
|
|
uint a = seed1 * x + seed2 * y + (rnum >> 14); |
|
|
|
|
|
uint b = seed3 * x + seed4 * y + (rnum >> 10); |
|
|
|
|
|
uint c = seed5 * x + seed6 * y + (rnum >> 6); |
|
|
|
|
|
uint d = seed7 * x + seed8 * y + (rnum >> 2); |
|
|
|
|
|
|
|
|
pt.s1 = seed1 >> sh1; |
|
|
|
|
|
pt.s2 = seed2 >> sh2; |
|
|
|
|
|
pt.s3 = seed3 >> sh1; |
|
|
|
|
|
pt.s4 = seed4 >> sh2; |
|
|
|
|
|
pt.s5 = seed5 >> sh1; |
|
|
|
|
|
pt.s6 = seed6 >> sh2; |
|
|
|
|
|
pt.s7 = seed7 >> sh1; |
|
|
|
|
|
pt.s8 = seed8 >> sh2; |
|
|
|
|
|
|
|
|
|
|
|
return pt; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
uint SelectPartition(PartitionTable pt, uint x, uint y, uint partition_count) { |
|
|
|
|
|
if (pt.small_block) { |
|
|
|
|
|
x <<= 1; |
|
|
|
|
|
y <<= 1; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
uint a = pt.s1 * x + pt.s2 * y + (pt.rnum >> 14); |
|
|
|
|
|
uint b = pt.s3 * x + pt.s4 * y + (pt.rnum >> 10); |
|
|
|
|
|
uint c = pt.s5 * x + pt.s6 * y + (pt.rnum >> 6); |
|
|
|
|
|
uint d = pt.s7 * x + pt.s8 * y + (pt.rnum >> 2); |
|
|
|
|
|
|
|
|
a &= 0x3F; |
|
|
a &= 0x3F; |
|
|
b &= 0x3F; |
|
|
b &= 0x3F; |
|
|
c &= 0x3F; |
|
|
c &= 0x3F; |
|
|
d &= 0x3F; |
|
|
d &= 0x3F; |
|
|
|
|
|
|
|
|
if (partition_count < 4) { |
|
|
|
|
|
d = 0; |
|
|
|
|
|
} |
|
|
|
|
|
if (partition_count < 3) { |
|
|
|
|
|
c = 0; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
if (partition_count < 4) d = 0; |
|
|
|
|
|
if (partition_count < 3) c = 0; |
|
|
|
|
|
|
|
|
if (a >= b && a >= c && a >= d) { |
|
|
if (a >= b && a >= c && a >= d) { |
|
|
return 0; |
|
|
return 0; |
|
|
@ -457,7 +583,7 @@ void DecodeIntegerSequence(uint max_range, uint num_values) { |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits, out uint color_values[32]) { |
|
|
|
|
|
|
|
|
void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { |
|
|
uint num_values = 0; |
|
|
uint num_values = 0; |
|
|
for (uint i = 0; i < num_partitions; i++) { |
|
|
for (uint i = 0; i < num_partitions; i++) { |
|
|
num_values += ((modes[i] >> 2) + 1) << 1; |
|
|
num_values += ((modes[i] >> 2) + 1) << 1; |
|
|
@ -471,104 +597,21 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits, o |
|
|
break; |
|
|
break; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
DecodeIntegerSequence(range - 1, num_values); |
|
|
|
|
|
uint out_index = 0; |
|
|
|
|
|
for (int itr = 0; itr < result_index; ++itr) { |
|
|
|
|
|
if (out_index >= num_values) { |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
const EncodingData val = GetEncodingFromVector(itr); |
|
|
|
|
|
const uint encoding = Encoding(val); |
|
|
|
|
|
const uint bitlen = NumBits(val); |
|
|
|
|
|
const uint bitval = BitValue(val); |
|
|
|
|
|
uint A = 0, B = 0, C = 0, D = 0; |
|
|
|
|
|
A = ReplicateBitTo9((bitval & 1)); |
|
|
|
|
|
switch (encoding) { |
|
|
|
|
|
case JUST_BITS: |
|
|
|
|
|
color_values[++out_index] = FastReplicateTo8(bitval, bitlen); |
|
|
|
|
|
break; |
|
|
|
|
|
case TRIT: { |
|
|
|
|
|
D = QuintTritValue(val); |
|
|
|
|
|
switch (bitlen) { |
|
|
|
|
|
case 1: |
|
|
|
|
|
C = 204; |
|
|
|
|
|
break; |
|
|
|
|
|
case 2: { |
|
|
|
|
|
C = 93; |
|
|
|
|
|
const uint b = (bitval >> 1) & 1; |
|
|
|
|
|
B = (b << 8) | (b << 4) | (b << 2) | (b << 1); |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
case 3: { |
|
|
|
|
|
C = 44; |
|
|
|
|
|
const uint cb = (bitval >> 1) & 3; |
|
|
|
|
|
B = (cb << 7) | (cb << 2) | cb; |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
case 4: { |
|
|
|
|
|
C = 22; |
|
|
|
|
|
const uint dcb = (bitval >> 1) & 7; |
|
|
|
|
|
B = (dcb << 6) | dcb; |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
case 5: { |
|
|
|
|
|
C = 11; |
|
|
|
|
|
const uint edcb = (bitval >> 1) & 0xF; |
|
|
|
|
|
B = (edcb << 5) | (edcb >> 2); |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
case 6: { |
|
|
|
|
|
C = 5; |
|
|
|
|
|
const uint fedcb = (bitval >> 1) & 0x1F; |
|
|
|
|
|
B = (fedcb << 4) | (fedcb >> 4); |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
case QUINT: { |
|
|
|
|
|
D = QuintTritValue(val); |
|
|
|
|
|
switch (bitlen) { |
|
|
|
|
|
case 1: |
|
|
|
|
|
C = 113; |
|
|
|
|
|
break; |
|
|
|
|
|
case 2: { |
|
|
|
|
|
C = 54; |
|
|
|
|
|
const uint b = (bitval >> 1) & 1; |
|
|
|
|
|
B = (b << 8) | (b << 3) | (b << 2); |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
case 3: { |
|
|
|
|
|
C = 26; |
|
|
|
|
|
const uint cb = (bitval >> 1) & 3; |
|
|
|
|
|
B = (cb << 7) | (cb << 1) | (cb >> 1); |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
case 4: { |
|
|
|
|
|
C = 13; |
|
|
|
|
|
const uint dcb = (bitval >> 1) & 7; |
|
|
|
|
|
B = (dcb << 6) | (dcb >> 1); |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
case 5: { |
|
|
|
|
|
C = 6; |
|
|
|
|
|
const uint edcb = (bitval >> 1) & 0xF; |
|
|
|
|
|
B = (edcb << 5) | (edcb >> 3); |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
if (encoding != JUST_BITS) { |
|
|
|
|
|
uint T = (D * C) + B; |
|
|
|
|
|
T ^= A; |
|
|
|
|
|
T = (A & 0x80) | (T >> 2); |
|
|
|
|
|
color_values[++out_index] = T; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
// Decode directly into color_values_direct[] |
|
|
|
|
|
write_color_values = true; |
|
|
|
|
|
color_out_index = 0; |
|
|
|
|
|
color_num_values = num_values; |
|
|
|
|
|
for (uint i = 0; i < 32; ++i) { |
|
|
|
|
|
color_values_direct[i] = 0; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
DecodeIntegerSequence(range - 1, num_values); |
|
|
|
|
|
|
|
|
|
|
|
write_color_values = false; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ivec2 BitTransferSigned(int a, int b) { |
|
|
ivec2 BitTransferSigned(int a, int b) { |
|
|
ivec2 transferred; |
|
|
ivec2 transferred; |
|
|
transferred.y = b >> 1; |
|
|
transferred.y = b >> 1; |
|
|
@ -730,7 +773,7 @@ uint UnquantizeTexelWeight(EncodingData val) { |
|
|
uint encoding = Encoding(val), bitlen = NumBits(val), bitval = BitValue(val); |
|
|
uint encoding = Encoding(val), bitlen = NumBits(val), bitval = BitValue(val); |
|
|
if (encoding == JUST_BITS) { |
|
|
if (encoding == JUST_BITS) { |
|
|
return (bitlen >= 1 && bitlen <= 5) |
|
|
return (bitlen >= 1 && bitlen <= 5) |
|
|
? uint(floor(0.5f + float(bitval) * 64.0f / float((1 << bitlen) - 1))) |
|
|
|
|
|
|
|
|
? ((bitval * 64) + ((1 << bitlen) - 1) / 2) / ((1 << bitlen) - 1) |
|
|
: FastReplicateTo6(bitval, bitlen); |
|
|
: FastReplicateTo6(bitval, bitlen); |
|
|
} else if (encoding == TRIT || encoding == QUINT) { |
|
|
} else if (encoding == TRIT || encoding == QUINT) { |
|
|
uint B = 0, C = 0, D = 0; |
|
|
uint B = 0, C = 0, D = 0; |
|
|
@ -864,27 +907,32 @@ int FindLayout(uint mode) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Writes vec4(0.0) to every texel of the block whose first texel is at `coord`.
//
// The texels are shared out across the workgroup: each invocation starts at
// its gl_LocalInvocationIndex and advances by
// gl_WorkGroupSize.x * gl_WorkGroupSize.y until all
// block_dims.x * block_dims.y texels are covered.
void FillError(ivec3 coord) {
    const uint total_texels = block_dims.x * block_dims.y;
    for (uint tid = gl_LocalInvocationIndex; tid < total_texels; tid += gl_WorkGroupSize.x * gl_WorkGroupSize.y) {
        // Flat texel index -> (x, y) inside the block, row-major.
        uint x = tid % block_dims.x;
        uint y = tid / block_dims.x;
        imageStore(dest_image, coord + ivec3(x, y, 0), vec4(0.0, 0.0, 0.0, 0.0));
    }
}
|
|
|
|
|
|
|
|
// Fills the whole block at `coord` with the single color of a void-extent block.
//
// Invocation 0 skips 52 bits of the stream, reads four 16-bit channels in
// r, g, b, a order, scales each by 1/65535 and publishes the result in the
// shared fill_color. After the barrier every invocation stores that color to
// its share of the block's texels.
//
// This function contains barrier(), so every invocation of the workgroup has
// to call it.
void FillVoidExtentLDR(ivec3 coord) {
    // Thread 0 decodes color

    if (gl_LocalInvocationIndex == 0) {
        SkipBits(52);
        const uint r_u = StreamBits(16);
        const uint g_u = StreamBits(16);
        const uint b_u = StreamBits(16);
        const uint a_u = StreamBits(16);
        fill_color = vec4(float(r_u) / 65535.0f, float(g_u) / 65535.0f, float(b_u) / 65535.0f, float(a_u) / 65535.0f);
    }
    barrier();

    const uint total_texels = block_dims.x * block_dims.y;
    for (uint tid = gl_LocalInvocationIndex; tid < total_texels; tid += gl_WorkGroupSize.x * gl_WorkGroupSize.y) {
        // Flat texel index -> (x, y) inside the block, row-major.
        uint x = tid % block_dims.x;
        uint y = tid / block_dims.x;
        imageStore(dest_image, coord + ivec3(x, y, 0), fill_color);
    }
}
|
|
|
|
|
|
|
|
@ -966,160 +1014,156 @@ uint DecodeMaxWeight(uint mode) { |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
// Decodes the block held in local_buff and writes its texels to dest_image,
// starting at `coord`.
//
// Phase 1 runs on invocation 0 only. It parses the block mode, partition data
// and color endpoint modes, decodes the color values and texel weights, and
// publishes the results in shared variables (size_params, num_partitions,
// dual_plane, partition_index, plane_index, endpoints0/1, pt).
// size_params doubles as a status word:
//   size_params.x == 0          -> invalid block, filled via FillError()
//   size_params.x == 0xFFFFFFFF -> void-extent block, FillVoidExtentLDR()
// Phase 2 follows the barrier: every invocation interpolates and stores the
// texels tid, tid + workgroup size, ... of the block.
//
// This function contains barrier(), so every invocation of the workgroup has
// to call it.
void DecompressBlock(ivec3 coord) {
    if (gl_LocalInvocationIndex == 0) {
        uint mode = StreamBits(11);
        bool early_exit = false;
        if (IsError(mode)) {
            size_params = uvec2(0);
            early_exit = true;
        } else if ((mode & 0x1ff) == 0x1fc) {
            // Void-extent marker; the color is read later by FillVoidExtentLDR().
            size_params = uvec2(0xFFFFFFFF);
            early_exit = true;
        } else {
            size_params = DecodeBlockSize(mode);
            // A weight grid larger than the block is treated as an error.
            if ((size_params.x > block_dims.x) || (size_params.y > block_dims.y)) {
                size_params = uvec2(0);
                early_exit = true;
            }
        }

        if (!early_exit) {
            num_partitions = StreamBits(2) + 1;
            uint mode_layout = FindLayout(mode);
            dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0);
            if (num_partitions > 4 || (num_partitions == 4 && dual_plane)) {
                size_params = uvec2(0);
                early_exit = true;
            }
        }

        if (!early_exit) {
            uint partition_index_local = 1;
            uvec4 color_endpoint_mode = uvec4(0);
            uint ced_pointer = 0;
            uint base_cem = 0;
            if (num_partitions == 1) {
                color_endpoint_mode.x = StreamBits(4);
                partition_index_local = 0;
            } else {
                partition_index_local = StreamBits(10);
                base_cem = StreamBits(6);
            }
            partition_index = partition_index_local; // Store to shared
            const uint base_mode = base_cem & 3;
            const uint max_weight = DecodeMaxWeight(mode);
            const uint weight_bits = GetPackedBitSize(size_params, dual_plane, max_weight);
            uint remaining_bits = 128 - weight_bits - total_bitsread;
            uint extra_cem_bits = 0;
            if (base_mode > 0) {
                switch (num_partitions) {
                case 2: extra_cem_bits += 2; break;
                case 3: extra_cem_bits += 5; break;
                case 4: extra_cem_bits += 8; break;
                }
            }
            remaining_bits -= extra_cem_bits;
            const uint plane_selector_bits = dual_plane ? 2 : 0;
            remaining_bits -= plane_selector_bits;
            // remaining_bits is unsigned, so a subtraction that went below
            // zero shows up here as a value above 128.
            if (remaining_bits > 128) {
                size_params = uvec2(0); // Error
            } else {
                // Copy the color data out of the stream, at most 32 bits at a time.
                const uint color_data_bits = remaining_bits;
                while (remaining_bits > 0) {
                    const int nb = int(min(remaining_bits, 32U));
                    const uint b = StreamBits(nb);
                    color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
                    ++ced_pointer;
                    remaining_bits -= nb;
                }
                plane_index = uint(StreamBits(plane_selector_bits));
                if (base_mode > 0) {
                    // Per-partition modes: one C bit and two M bits each,
                    // taken from the extra CEM bits plus the upper base_cem bits.
                    const uint extra_cem = StreamBits(extra_cem_bits);
                    uint cem = (extra_cem << 6) | base_cem;
                    cem >>= 2;
                    uvec4 C = uvec4(0);
                    for (uint i = 0; i < num_partitions; i++) {
                        C[i] = (cem & 1); cem >>= 1;
                    }
                    uvec4 M = uvec4(0);
                    for (uint i = 0; i < num_partitions; i++) {
                        M[i] = cem & 3; cem >>= 2;
                    }
                    for (uint i = 0; i < num_partitions; i++) {
                        color_endpoint_mode[i] = base_mode;
                        if (C[i] == 0) --color_endpoint_mode[i];
                        color_endpoint_mode[i] <<= 2;
                        color_endpoint_mode[i] |= M[i];
                    }
                } else if (num_partitions > 1) {
                    // All partitions share the same endpoint mode.
                    const uint cem = base_cem >> 2;
                    for (uint i = 0; i < num_partitions; i++) {
                        color_endpoint_mode[i] = cem;
                    }
                }

                // Color phase: values are decoded into color_values_direct[]
                // and consumed by ComputeEndpoints().
                result_limit_reached = false;
                uint colvals_index = 0;
                DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits);
                for (uint i = 0; i < num_partitions; i++) {
                    ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i], color_values_direct, colvals_index);
                }

                // Weight phase: reuse color_endpoint_data as the weight
                // stream. The block is bit-reversed with its words swapped;
                // the byte holding the last weight bits is masked down to
                // weight_bits % 8 bits, and every later byte is cleared.
                color_endpoint_data = local_buff;
                color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx;
                const uint clear_byte_start = (weight_bits >> 3) + 1;
                const uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) & uint(((1 << (weight_bits % 8)) - 1));
                const uint vec_index = (clear_byte_start - 1) >> 2;
                color_endpoint_data[vec_index] = bitfieldInsert(color_endpoint_data[vec_index], byte_insert, int((clear_byte_start - 1) % 4) * 8, 8);
                for (uint i = clear_byte_start; i < 16; ++i) {
                    const uint idx = i >> 2;
                    color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8);
                }

                // Reset the vector state and cap this phase at one weight per
                // grid point (two per point in dual-plane mode).
                result_index = 0;
                color_bitsread = 0;
                result_limit_reached = false;
                result_vector_max_index = size_params.x * size_params.y;
                if (dual_plane) result_vector_max_index *= 2;
                DecodeIntegerSequence(max_weight, GetNumWeightValues(size_params, dual_plane));
                UnquantizeTexelWeights(size_params, dual_plane);

                // The seed-dependent partition data is computed once here
                // rather than once per texel.
                if (num_partitions > 1) {
                    pt = GetPartitionTable(partition_index, num_partitions);
                }
            }
        }
    }

    // Invocations read invocation 0's shared results only past this point.
    barrier();

    // size_params is shared, so all invocations take the same branch here.
    if (size_params.x == 0) {
        FillError(coord);
        return;
    }
    if (size_params.x == 0xFFFFFFFF) {
        FillVoidExtentLDR(coord);
        return;
    }

    const uint total_texels = block_dims.x * block_dims.y;
    for (uint tid = gl_LocalInvocationIndex; tid < total_texels; tid += gl_WorkGroupSize.x * gl_WorkGroupSize.y) {
        // Flat texel index -> (x, y) inside the block, row-major.
        uint x = tid % block_dims.x;
        uint y = tid / block_dims.x;

        uint local_partition = 0;
        if (num_partitions > 1) {
            local_partition = SelectPartition(pt, x, y, num_partitions);
        }
        const uvec4 C0 = ReplicateByteTo16(endpoints0[local_partition]);
        const uvec4 C1 = ReplicateByteTo16(endpoints1[local_partition]);
        const uvec4 weight_vec = GetUnquantizedWeightVector(y, x, size_params, plane_index, dual_plane);
        // Weighted blend of the two endpoints: weights are out of 64 and the
        // +32 rounds the integer division.
        const vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64);
        const vec4 p = (Cf / 65535.0f);
        imageStore(dest_image, coord + ivec3(x, y, 0), p.gbar);
    }
}
|
|
|
|
|
|
|
|
@ -1132,7 +1176,8 @@ uint SwizzleOffset(uvec2 pos) { |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void main() { |
|
|
void main() { |
|
|
uvec3 pos = gl_GlobalInvocationID; |
|
|
|
|
|
|
|
|
uvec3 block_id = gl_WorkGroupID; |
|
|
|
|
|
uvec3 pos = block_id; |
|
|
pos.x <<= BYTES_PER_BLOCK_LOG2; |
|
|
pos.x <<= BYTES_PER_BLOCK_LOG2; |
|
|
const uint swizzle = SwizzleOffset(pos.xy); |
|
|
const uint swizzle = SwizzleOffset(pos.xy); |
|
|
const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT; |
|
|
const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT; |
|
|
@ -1144,10 +1189,21 @@ void main() { |
|
|
offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; |
|
|
offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; |
|
|
offset += swizzle; |
|
|
offset += swizzle; |
|
|
|
|
|
|
|
|
const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1)); |
|
|
|
|
|
|
|
|
if (gl_LocalInvocationIndex == 0) { |
|
|
|
|
|
total_bitsread = 0; |
|
|
|
|
|
result_index = 0; |
|
|
|
|
|
color_bitsread = 0; |
|
|
|
|
|
write_color_values = false; |
|
|
|
|
|
result_limit_reached = false; |
|
|
|
|
|
color_out_index = 0; |
|
|
|
|
|
color_num_values = 0; |
|
|
|
|
|
local_buff = astc_data[offset / 16]; |
|
|
|
|
|
} |
|
|
|
|
|
barrier(); |
|
|
|
|
|
|
|
|
|
|
|
ivec3 coord = ivec3(block_id * uvec3(block_dims, 1)); |
|
|
if (any(greaterThanEqual(coord, imageSize(dest_image)))) { |
|
|
if (any(greaterThanEqual(coord, imageSize(dest_image)))) { |
|
|
return; |
|
|
return; |
|
|
} |
|
|
} |
|
|
local_buff = astc_data[offset / 16]; |
|
|
|
|
|
DecompressBlock(coord); |
|
|
DecompressBlock(coord); |
|
|
} |
|
|
} |