[astc] decode each ASTC block cooperatively across a compute workgroup

2026-02-04 02:51:18 +01:00 · 2026-02-02 00:12:32 +01:00
parent 8ed0ed5828
commit 670764b535
2 changed files with 374 additions and 318 deletions
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -36,6 +36,12 @@ struct EncodingData {
    uint data;
 };

+struct PartitionTable {
+    uint s1, s2, s3, s4, s5, s6, s7, s8;
+    uint rnum;
+    bool small_block;
+};
+
 layout(binding = BINDING_INPUT_BUFFER, std430) readonly restrict buffer InputBufferU32 {
    uvec4 astc_data[];
 };
@@ -62,26 +68,40 @@ const uint encoding_values[22] = uint[](
    (QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u)),
    (TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u)));

-// Input ASTC texture globals
-int total_bitsread = 0;
-uvec4 local_buff;
+// Shared memory for workgroup processing
+shared uvec4 local_buff;
+shared int total_bitsread;

 // Color data globals
-uvec4 color_endpoint_data;
-int color_bitsread = 0;
+shared uvec4 color_endpoint_data;
+shared int color_bitsread;

 // Global "vector" to be pushed into when decoding
-// At most will require BLOCK_WIDTH x BLOCK_HEIGHT in single plane mode
-// At most will require BLOCK_WIDTH x BLOCK_HEIGHT x 2 in dual plane mode
-// So the maximum would be 144 (12 x 12) elements, x 2 for two planes
 #define DIVCEIL(number, divisor) (number + divisor - 1) / divisor
 #define ARRAY_NUM_ELEMENTS 144
 #define VECTOR_ARRAY_SIZE DIVCEIL(ARRAY_NUM_ELEMENTS * 2, 4)
-uint result_vector[ARRAY_NUM_ELEMENTS * 2];
+shared uint result_vector[ARRAY_NUM_ELEMENTS * 2];

-int result_index = 0;
-uint result_vector_max_index;
-bool result_limit_reached = false;
+shared int result_index;
+shared uint result_vector_max_index;
+shared bool result_limit_reached;
+
+// Color decoding writes straight into color_values_direct[], skipping result_vector
+shared bool write_color_values;
+shared uint color_values_direct[32];
+shared uint color_out_index;
+shared uint color_num_values;
+
+// Block state written by invocation 0 and read by the whole workgroup after barrier()
+shared uvec4 endpoints0[4];
+shared uvec4 endpoints1[4];
+shared PartitionTable pt;
+shared uvec2 size_params;
+shared uint num_partitions;
+shared uint partition_index;
+shared uint plane_index;
+shared bool dual_plane;
+shared vec4 fill_color;

 // EncodingData helpers
 uint Encoding(EncodingData val) {
@@ -114,9 +134,110 @@ EncodingData CreateEncodingData(uint encoding, uint num_bits, uint bit_val, uint
    return EncodingData(((encoding) << 0u) | ((num_bits) << 8u) |
                        ((bit_val) << 16u) | ((quint_trit_val) << 24u));
 }
+uint ReplicateBitTo9(uint bit);
+uint FastReplicateTo8(uint value, uint num_bits);
+
+void EmitColorValue(EncodingData val) {
+    // write directly to color_values_direct[]
+    const uint encoding = Encoding(val);
+    const uint bitlen = NumBits(val);
+    const uint bitval = BitValue(val);
+
+    if (encoding == JUST_BITS) {
+        color_values_direct[++color_out_index] = FastReplicateTo8(bitval, bitlen);
+        return;
+    }
+
+    uint A = ReplicateBitTo9((bitval & 1));
+    uint B = 0, C = 0, D = QuintTritValue(val);
+
+    if (encoding == TRIT) {
+        switch (bitlen) {
+            case 1:
+            C = 204;
+            break;
+            case 2: {
+                C = 93;
+                const uint b = (bitval >> 1) & 1;
+                B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
+                break;
+            }
+            case 3: {
+                C = 44;
+                const uint cb = (bitval >> 1) & 3;
+                B = (cb << 7) | (cb << 2) | cb;
+                break;
+            }
+            case 4: {
+                C = 22;
+                const uint dcb = (bitval >> 1) & 7;
+                B = (dcb << 6) | dcb;
+                break;
+            }
+            case 5: {
+                C = 11;
+                const uint edcb = (bitval >> 1) & 0xF;
+                B = (edcb << 5) | (edcb >> 2);
+                break;
+            }
+            case 6: {
+                C = 5;
+                const uint fedcb = (bitval >> 1) & 0x1F;
+                B = (fedcb << 4) | (fedcb >> 4);
+                break;
+            }
+        }
+    } else { // QUINT
+        switch (bitlen) {
+            case 1:
+            C = 113;
+            break;
+            case 2: {
+                C = 54;
+                const uint b = (bitval >> 1) & 1;
+                B = (b << 8) | (b << 3) | (b << 2);
+                break;
+            }
+            case 3: {
+                C = 26;
+                const uint cb = (bitval >> 1) & 3;
+                B = (cb << 7) | (cb << 1) | (cb >> 1);
+                break;
+            }
+            case 4: {
+                C = 13;
+                const uint dcb = (bitval >> 1) & 7;
+                B = (dcb << 6) | (dcb >> 1);
+                break;
+            }
+            case 5: {
+                C = 6;
+                const uint edcb = (bitval >> 1) & 0xF;
+                B = (edcb << 5) | (edcb >> 3);
+                break;
+            }
+        }
+    }
+
+    uint T = (D * C) + B;
+    T ^= A;
+    T = (A & 0x80) | (T >> 2);
+    color_values_direct[++color_out_index] = T;
+}
+


 void ResultEmplaceBack(EncodingData val) {
+    if (write_color_values) {
+        if (color_out_index >= color_num_values) {
+            // avoid decoding more than needed by this phase
+            result_limit_reached = true;
+            return;
+        }
+        EmitColorValue(val);
+        return;
+    }
+
    if (result_index >= result_vector_max_index) {
        // Alert callers to avoid decoding more than needed by this phase
        result_limit_reached = true;
@@ -197,32 +318,31 @@ uint Hash52(uint p) {
    return p;
 }

-uint Select2DPartition(uint seed, uint x, uint y, uint partition_count) {
-    if ((block_dims.y * block_dims.x) < 32) {
-        x <<= 1;
-        y <<= 1;
-    }
+
+PartitionTable GetPartitionTable(uint seed, uint partition_count) {
+    PartitionTable pt;
+    pt.small_block = (block_dims.y * block_dims.x) < 32;

    seed += (partition_count - 1) * 1024;
+    uint rnum = Hash52(uint(seed));
+    pt.rnum = rnum;

-    const uint rnum = Hash52(uint(seed));
-    uint seed1 = uint(rnum & 0xF);
-    uint seed2 = uint((rnum >> 4) & 0xF);
-    uint seed3 = uint((rnum >> 8) & 0xF);
-    uint seed4 = uint((rnum >> 12) & 0xF);
-    uint seed5 = uint((rnum >> 16) & 0xF);
-    uint seed6 = uint((rnum >> 20) & 0xF);
-    uint seed7 = uint((rnum >> 24) & 0xF);
-    uint seed8 = uint((rnum >> 28) & 0xF);
-
-    seed1 = (seed1 * seed1);
-    seed2 = (seed2 * seed2);
-    seed3 = (seed3 * seed3);
-    seed4 = (seed4 * seed4);
-    seed5 = (seed5 * seed5);
-    seed6 = (seed6 * seed6);
-    seed7 = (seed7 * seed7);
-    seed8 = (seed8 * seed8);
+    uint seed1 = (rnum & 0xF);
+    seed1 *= seed1;
+    uint seed2 = (rnum >> 4) & 0xF;
+    seed2 *= seed2;
+    uint seed3 = (rnum >> 8) & 0xF;
+    seed3 *= seed3;
+    uint seed4 = (rnum >> 12) & 0xF;
+    seed4 *= seed4;
+    uint seed5 = (rnum >> 16) & 0xF;
+    seed5 *= seed5;
+    uint seed6 = (rnum >> 20) & 0xF;
+    seed6 *= seed6;
+    uint seed7 = (rnum >> 24) & 0xF;
+    seed7 *= seed7;
+    uint seed8 = (rnum >> 28) & 0xF;
+    seed8 *= seed8;

    uint sh1, sh2;
    if ((seed & 1) > 0) {
@@ -232,31 +352,37 @@ uint Select2DPartition(uint seed, uint x, uint y, uint partition_count) {
        sh1 = (partition_count == 3) ? 6 : 5;
        sh2 = (seed & 2) > 0 ? 4 : 5;
    }
-    seed1 >>= sh1;
-    seed2 >>= sh2;
-    seed3 >>= sh1;
-    seed4 >>= sh2;
-    seed5 >>= sh1;
-    seed6 >>= sh2;
-    seed7 >>= sh1;
-    seed8 >>= sh2;

-    uint a = seed1 * x + seed2 * y + (rnum >> 14);
-    uint b = seed3 * x + seed4 * y + (rnum >> 10);
-    uint c = seed5 * x + seed6 * y + (rnum >> 6);
-    uint d = seed7 * x + seed8 * y + (rnum >> 2);
+    pt.s1 = seed1 >> sh1;
+    pt.s2 = seed2 >> sh2;
+    pt.s3 = seed3 >> sh1;
+    pt.s4 = seed4 >> sh2;
+    pt.s5 = seed5 >> sh1;
+    pt.s6 = seed6 >> sh2;
+    pt.s7 = seed7 >> sh1;
+    pt.s8 = seed8 >> sh2;
+
+    return pt;
+    }
+
+uint SelectPartition(PartitionTable pt, uint x, uint y, uint partition_count) {
+    if (pt.small_block) {
+        x <<= 1;
+        y <<= 1;
+    }
+
+    uint a = pt.s1 * x + pt.s2 * y + (pt.rnum >> 14);
+    uint b = pt.s3 * x + pt.s4 * y + (pt.rnum >> 10);
+    uint c = pt.s5 * x + pt.s6 * y + (pt.rnum >> 6);
+    uint d = pt.s7 * x + pt.s8 * y + (pt.rnum >> 2);

    a &= 0x3F;
    b &= 0x3F;
    c &= 0x3F;
    d &= 0x3F;

-    if (partition_count < 4) {
-        d = 0;
-    }
-    if (partition_count < 3) {
-        c = 0;
-    }
+    if (partition_count < 4) d = 0;
+    if (partition_count < 3) c = 0;

    if (a >= b && a >= c && a >= d) {
        return 0;
@@ -457,7 +583,7 @@ void DecodeIntegerSequence(uint max_range, uint num_values) {
    }
 }

-void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits, out uint color_values[32]) {
+void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
    uint num_values = 0;
    for (uint i = 0; i < num_partitions; i++) {
        num_values += ((modes[i] >> 2) + 1) << 1;
@@ -471,104 +597,21 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits, o
            break;
        }
    }
-    DecodeIntegerSequence(range - 1, num_values);
-    uint out_index = 0;
-    for (int itr = 0; itr < result_index; ++itr) {
-        if (out_index >= num_values) {
-            break;
-        }
-        const EncodingData val = GetEncodingFromVector(itr);
-        const uint encoding = Encoding(val);
-        const uint bitlen = NumBits(val);
-        const uint bitval = BitValue(val);
-        uint A = 0, B = 0, C = 0, D = 0;
-        A = ReplicateBitTo9((bitval & 1));
-        switch (encoding) {
-        case JUST_BITS:
-            color_values[++out_index] = FastReplicateTo8(bitval, bitlen);
-            break;
-        case TRIT: {
-            D = QuintTritValue(val);
-            switch (bitlen) {
-            case 1:
-                C = 204;
-                break;
-            case 2: {
-                C = 93;
-                const uint b = (bitval >> 1) & 1;
-                B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
-                break;
-            }
-            case 3: {
-                C = 44;
-                const uint cb = (bitval >> 1) & 3;
-                B = (cb << 7) | (cb << 2) | cb;
-                break;
-            }
-            case 4: {
-                C = 22;
-                const uint dcb = (bitval >> 1) & 7;
-                B = (dcb << 6) | dcb;
-                break;
-            }
-            case 5: {
-                C = 11;
-                const uint edcb = (bitval >> 1) & 0xF;
-                B = (edcb << 5) | (edcb >> 2);
-                break;
-            }
-            case 6: {
-                C = 5;
-                const uint fedcb = (bitval >> 1) & 0x1F;
-                B = (fedcb << 4) | (fedcb >> 4);
-                break;
-            }
-            }
-            break;
-        }
-        case QUINT: {
-            D = QuintTritValue(val);
-            switch (bitlen) {
-            case 1:
-                C = 113;
-                break;
-            case 2: {
-                C = 54;
-                const uint b = (bitval >> 1) & 1;
-                B = (b << 8) | (b << 3) | (b << 2);
-                break;
-            }
-            case 3: {
-                C = 26;
-                const uint cb = (bitval >> 1) & 3;
-                B = (cb << 7) | (cb << 1) | (cb >> 1);
-                break;
-            }
-            case 4: {
-                C = 13;
-                const uint dcb = (bitval >> 1) & 7;
-                B = (dcb << 6) | (dcb >> 1);
-                break;
-            }
-            case 5: {
-                C = 6;
-                const uint edcb = (bitval >> 1) & 0xF;
-                B = (edcb << 5) | (edcb >> 3);
-                break;
-            }
-            }
-            break;
-        }
-        }
-        if (encoding != JUST_BITS) {
-            uint T = (D * C) + B;
-            T ^= A;
-            T = (A & 0x80) | (T >> 2);
-            color_values[++out_index] = T;
-        }
+    // Decode directly into color_values_direct[]
+    write_color_values = true;
+    color_out_index = 0;
+    color_num_values = num_values;
+    for (uint i = 0; i < 32; ++i) {
+        color_values_direct[i] = 0;
    }
+
+    DecodeIntegerSequence(range - 1, num_values);
+
+    write_color_values = false;
 }

+
+
 ivec2 BitTransferSigned(int a, int b) {
    ivec2 transferred;
    transferred.y = b >> 1;
@@ -730,7 +773,7 @@ uint UnquantizeTexelWeight(EncodingData val) {
    uint encoding = Encoding(val), bitlen = NumBits(val), bitval = BitValue(val);
    if (encoding == JUST_BITS) {
        return (bitlen >= 1 && bitlen <= 5)
-            ? uint(floor(0.5f + float(bitval) * 64.0f / float((1 << bitlen) - 1)))
+            ? ((bitval * 64) + ((1 << bitlen) - 1) / 2) / ((1 << bitlen) - 1)
            : FastReplicateTo6(bitval, bitlen);
    } else if (encoding == TRIT || encoding == QUINT) {
        uint B = 0, C = 0, D = 0;
@@ -864,27 +907,32 @@ int FindLayout(uint mode) {


 void FillError(ivec3 coord) {
-    for (uint j = 0; j < block_dims.y; j++) {
-        for (uint i = 0; i < block_dims.x; i++) {
-            imageStore(dest_image, coord + ivec3(i, j, 0), vec4(0.0, 0.0, 0.0, 0.0));
-        }
+    const uint total_texels = block_dims.x * block_dims.y;
+    for (uint tid = gl_LocalInvocationIndex; tid < total_texels; tid += gl_WorkGroupSize.x * gl_WorkGroupSize.y) {
+        uint x = tid % block_dims.x;
+        uint y = tid / block_dims.x;
+        imageStore(dest_image, coord + ivec3(x, y, 0), vec4(0.0, 0.0, 0.0, 0.0));
    }
 }

 void FillVoidExtentLDR(ivec3 coord) {
-    SkipBits(52);
-    const uint r_u = StreamBits(16);
-    const uint g_u = StreamBits(16);
-    const uint b_u = StreamBits(16);
-    const uint a_u = StreamBits(16);
-    const float a = float(a_u) / 65535.0f;
-    const float r = float(r_u) / 65535.0f;
-    const float g = float(g_u) / 65535.0f;
-    const float b = float(b_u) / 65535.0f;
-    for (uint j = 0; j < block_dims.y; j++) {
-        for (uint i = 0; i < block_dims.x; i++) {
-            imageStore(dest_image, coord + ivec3(i, j, 0), vec4(r, g, b, a));
-        }
+    // Thread 0 decodes color
+
+    if (gl_LocalInvocationIndex == 0) {
+        SkipBits(52);
+        const uint r_u = StreamBits(16);
+        const uint g_u = StreamBits(16);
+        const uint b_u = StreamBits(16);
+        const uint a_u = StreamBits(16);
+        fill_color = vec4(float(r_u) / 65535.0f, float(g_u) / 65535.0f, float(b_u) / 65535.0f, float(a_u) / 65535.0f);
+    }
+    barrier();
+
+    const uint total_texels = block_dims.x * block_dims.y;
+    for (uint tid = gl_LocalInvocationIndex; tid < total_texels; tid += gl_WorkGroupSize.x * gl_WorkGroupSize.y) {
+        uint x = tid % block_dims.x;
+        uint y = tid / block_dims.x;
+        imageStore(dest_image, coord + ivec3(x, y, 0), fill_color);
    }
 }

@@ -966,160 +1014,156 @@ uint DecodeMaxWeight(uint mode) {
 }

 void DecompressBlock(ivec3 coord) {
-    uint mode = StreamBits(11);
-    if (IsError(mode)) {
+    if (gl_LocalInvocationIndex == 0) {
+        uint mode = StreamBits(11);
+        bool early_exit = false;
+        if (IsError(mode)) {
+            size_params = uvec2(0);
+            early_exit = true;
+        } else if ((mode & 0x1ff) == 0x1fc) {
+            size_params = uvec2(0xFFFFFFFF);
+            early_exit = true;
+        } else {
+            size_params = DecodeBlockSize(mode);
+            if ((size_params.x > block_dims.x) || (size_params.y > block_dims.y)) {
+               size_params = uvec2(0);
+               early_exit = true;
+            }
+        }
+
+        if (!early_exit) {
+            num_partitions = StreamBits(2) + 1;
+            uint mode_layout = FindLayout(mode);
+            dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0);
+            if (num_partitions > 4 || (num_partitions == 4 && dual_plane)) {
+                size_params = uvec2(0);
+                early_exit = true;
+            }
+        }
+
+        if (!early_exit) {
+            uint partition_index_local = 1;
+            uvec4 color_endpoint_mode = uvec4(0);
+            uint ced_pointer = 0;
+            uint base_cem = 0;
+            if (num_partitions == 1) {
+                color_endpoint_mode.x = StreamBits(4);
+                partition_index_local = 0;
+            } else {
+                partition_index_local = StreamBits(10);
+                base_cem = StreamBits(6);
+            }
+            partition_index = partition_index_local; // Store to shared
+            const uint base_mode = base_cem & 3;
+            const uint max_weight = DecodeMaxWeight(mode);
+            const uint weight_bits = GetPackedBitSize(size_params, dual_plane, max_weight);
+            uint remaining_bits = 128 - weight_bits - total_bitsread;
+            uint extra_cem_bits = 0;
+            if (base_mode > 0) {
+                switch (num_partitions) {
+                case 2: extra_cem_bits += 2; break;
+                case 3: extra_cem_bits += 5; break;
+                case 4: extra_cem_bits += 8; break;
+                }
+            }
+            remaining_bits -= extra_cem_bits;
+            const uint plane_selector_bits = dual_plane ? 2 : 0;
+            remaining_bits -= plane_selector_bits;
+            if (remaining_bits > 128) {
+                size_params = uvec2(0); // Error
+            } else {
+                const uint color_data_bits = remaining_bits;
+                while (remaining_bits > 0) {
+                    const int nb = int(min(remaining_bits, 32U));
+                    const uint b = StreamBits(nb);
+                    color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
+                    ++ced_pointer;
+                    remaining_bits -= nb;
+                }
+                plane_index = uint(StreamBits(plane_selector_bits));
+                if (base_mode > 0) {
+                    const uint extra_cem = StreamBits(extra_cem_bits);
+                    uint cem = (extra_cem << 6) | base_cem;
+                    cem >>= 2;
+                    uvec4 C = uvec4(0);
+                    for (uint i = 0; i < num_partitions; i++) {
+                        C[i] = (cem & 1); cem >>= 1;
+                    }
+                    uvec4 M = uvec4(0);
+                    for (uint i = 0; i < num_partitions; i++) {
+                        M[i] = cem & 3; cem >>= 2;
+                    }
+                    for (uint i = 0; i < num_partitions; i++) {
+                        color_endpoint_mode[i] = base_mode;
+                        if (C[i] == 0) --color_endpoint_mode[i];
+                        color_endpoint_mode[i] <<= 2;
+                        color_endpoint_mode[i] |= M[i];
+                    }
+                } else if (num_partitions > 1) {
+                    const uint cem = base_cem >> 2;
+                    for (uint i = 0; i < num_partitions; i++) {
+                        color_endpoint_mode[i] = cem;
+                    }
+                }
+
+                result_limit_reached = false;
+                uint colvals_index = 0;
+                DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits);
+                for (uint i = 0; i < num_partitions; i++) {
+                    ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i], color_values_direct, colvals_index);
+                }
+
+                color_endpoint_data = local_buff;
+                color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx;
+                const uint clear_byte_start = (weight_bits >> 3) + 1;
+                const uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) & uint(((1 << (weight_bits % 8)) - 1));
+                const uint vec_index = (clear_byte_start - 1) >> 2;
+                color_endpoint_data[vec_index] = bitfieldInsert(color_endpoint_data[vec_index], byte_insert, int((clear_byte_start - 1) % 4) * 8, 8);
+                for (uint i = clear_byte_start; i < 16; ++i) {
+                    const uint idx = i >> 2;
+                    color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8);
+                }
+
+                result_index = 0;
+                color_bitsread = 0;
+                result_limit_reached = false;
+                result_vector_max_index = size_params.x * size_params.y;
+                if (dual_plane) result_vector_max_index *= 2;
+                DecodeIntegerSequence(max_weight, GetNumWeightValues(size_params, dual_plane));
+                UnquantizeTexelWeights(size_params, dual_plane);
+
+                if (num_partitions > 1) {
+                    pt = GetPartitionTable(partition_index, num_partitions);
+                }
+            }
+        }
+    }
+    barrier();
+
+    if (size_params.x == 0) {
        FillError(coord);
        return;
    }
-    if ((mode & 0x1ff) == 0x1fc) {
-        // params.void_extent_ldr = true;
+    if (size_params.x == 0xFFFFFFFF) {
        FillVoidExtentLDR(coord);
        return;
    }
-    const uvec2 size_params = DecodeBlockSize(mode);
-    if ((size_params.x > block_dims.x) || (size_params.y > block_dims.y)) {
-        FillError(coord);
-        return;
-    }
-    const uint num_partitions = StreamBits(2) + 1;
-    const uint mode_layout = FindLayout(mode);
-    const bool dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0);
-    if (num_partitions > 4 || (num_partitions == 4 && dual_plane)) {
-        FillError(coord);
-        return;
-    }
-    uint partition_index = 1;
-    uvec4 color_endpoint_mode = uvec4(0);
-    uint ced_pointer = 0;
-    uint base_cem = 0;
-    if (num_partitions == 1) {
-        color_endpoint_mode.x = StreamBits(4);
-        partition_index = 0;
-    } else {
-        partition_index = StreamBits(10);
-        base_cem = StreamBits(6);
-    }
-    const uint base_mode = base_cem & 3;
-    const uint max_weight = DecodeMaxWeight(mode);
-    const uint weight_bits = GetPackedBitSize(size_params, dual_plane, max_weight);
-    uint remaining_bits = 128 - weight_bits - total_bitsread;
-    uint extra_cem_bits = 0;
-    if (base_mode > 0) {
-        switch (num_partitions) {
-        case 2:
-            extra_cem_bits += 2;
-            break;
-        case 3:
-            extra_cem_bits += 5;
-            break;
-        case 4:
-            extra_cem_bits += 8;
-            break;
-        default:
-            return;
-        }
-    }
-    remaining_bits -= extra_cem_bits;
-    const uint plane_selector_bits = dual_plane ? 2 : 0;
-    remaining_bits -= plane_selector_bits;
-    if (remaining_bits > 128) {
-        // Bad data, more remaining bits than 4 bytes
-        // return early
-        return;
-    }
-    // Read color data...
-    const uint color_data_bits = remaining_bits;
-    while (remaining_bits > 0) {
-        const int nb = int(min(remaining_bits, 32U));
-        const uint b = StreamBits(nb);
-        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
-        ++ced_pointer;
-        remaining_bits -= nb;
-    }
-    const uint plane_index = uint(StreamBits(plane_selector_bits));
-    if (base_mode > 0) {
-        const uint extra_cem = StreamBits(extra_cem_bits);
-        uint cem = (extra_cem << 6) | base_cem;
-        cem >>= 2;
-        uvec4 C = uvec4(0);
-        for (uint i = 0; i < num_partitions; i++) {
-            C[i] = (cem & 1);
-            cem >>= 1;
-        }
-        uvec4 M = uvec4(0);
-        for (uint i = 0; i < num_partitions; i++) {
-            M[i] = cem & 3;
-            cem >>= 2;
-        }
-        for (uint i = 0; i < num_partitions; i++) {
-            color_endpoint_mode[i] = base_mode;
-            if (C[i] == 0) {
-                --color_endpoint_mode[i];
-            }
-            color_endpoint_mode[i] <<= 2;
-            color_endpoint_mode[i] |= M[i];
-        }
-    } else if (num_partitions > 1) {
-        const uint cem = base_cem >> 2;
-        for (uint i = 0; i < num_partitions; i++) {
-            color_endpoint_mode[i] = cem;
-        }
-    }

-    uvec4 endpoints0[4];
-    uvec4 endpoints1[4];
-    {
-        // This decode phase should at most push 32 elements into the vector
-        result_vector_max_index = 32;
-        uint color_values[32];
-        uint colvals_index = 0;
-        DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits, color_values);
-        for (uint i = 0; i < num_partitions; i++) {
-            ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i], color_values,
-                             colvals_index);
-        }
-    }
-    color_endpoint_data = local_buff;
-    color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx;
-    const uint clear_byte_start = (weight_bits >> 3) + 1;
-
-    const uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) &
-                             uint(((1 << (weight_bits % 8)) - 1));
-    const uint vec_index = (clear_byte_start - 1) >> 2;
-    color_endpoint_data[vec_index] = bitfieldInsert(color_endpoint_data[vec_index], byte_insert,
-                                                    int((clear_byte_start - 1) % 4) * 8, 8);
-    for (uint i = clear_byte_start; i < 16; ++i) {
-        const uint idx = i >> 2;
-        color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8);
-    }
-
-    // Re-init vector variables for next decode phase
-    result_index = 0;
-    color_bitsread = 0;
-    result_limit_reached = false;
-
-    // The limit for the Unquantize phase, avoids decoding more data than needed.
-    result_vector_max_index = size_params.x * size_params.y;
-    if (dual_plane) {
-        result_vector_max_index *= 2;
-    }
-    DecodeIntegerSequence(max_weight, GetNumWeightValues(size_params, dual_plane));
-
-    UnquantizeTexelWeights(size_params, dual_plane);
-    for (uint j = 0; j < block_dims.y; j++) {
-        for (uint i = 0; i < block_dims.x; i++) {
-            uint local_partition = 0;
-            if (num_partitions > 1) {
-                local_partition = Select2DPartition(partition_index, i, j, num_partitions);
-            }
-            const uvec4 C0 = ReplicateByteTo16(endpoints0[local_partition]);
-            const uvec4 C1 = ReplicateByteTo16(endpoints1[local_partition]);
-            const uvec4 weight_vec = GetUnquantizedWeightVector(j, i, size_params, plane_index, dual_plane);
-            const vec4 Cf =
-                vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64);
-            const vec4 p = (Cf / 65535.0f);
-            imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar);
+    const uint total_texels = block_dims.x * block_dims.y;
+    for (uint tid = gl_LocalInvocationIndex; tid < total_texels; tid += gl_WorkGroupSize.x * gl_WorkGroupSize.y) {
+        uint x = tid % block_dims.x;
+        uint y = tid / block_dims.x;
+
+        uint local_partition = 0;
+        if (num_partitions > 1) {
+            local_partition = SelectPartition(pt, x, y, num_partitions);
        }
+        const uvec4 C0 = ReplicateByteTo16(endpoints0[local_partition]);
+        const uvec4 C1 = ReplicateByteTo16(endpoints1[local_partition]);
+        const uvec4 weight_vec = GetUnquantizedWeightVector(y, x, size_params, plane_index, dual_plane);
+        const vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64);
+        const vec4 p = (Cf / 65535.0f);
+        imageStore(dest_image, coord + ivec3(x, y, 0), p.gbar);
    }
 }

@@ -1132,7 +1176,8 @@ uint SwizzleOffset(uvec2 pos) {
 }

 void main() {
-    uvec3 pos = gl_GlobalInvocationID;
+    uvec3 block_id = gl_WorkGroupID;
+    uvec3 pos = block_id;
    pos.x <<= BYTES_PER_BLOCK_LOG2;
    const uint swizzle = SwizzleOffset(pos.xy);
    const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
@@ -1144,10 +1189,21 @@ void main() {
    offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
    offset += swizzle;

-    const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1));
+    if (gl_LocalInvocationIndex == 0) {
+       total_bitsread = 0;
+       result_index = 0;
+       color_bitsread = 0;
+       write_color_values = false;
+       result_limit_reached = false;
+       color_out_index = 0;
+       color_num_values = 0;
+       local_buff = astc_data[offset / 16];
+    }
+    barrier();
+
+    ivec3 coord = ivec3(block_id * uvec3(block_dims, 1));
    if (any(greaterThanEqual(coord, imageSize(dest_image)))) {
        return;
    }
-    local_buff = astc_data[offset / 16];
    DecompressBlock(coord);
 }
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -586,8 +586,8 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
    });
    for (const VideoCommon::SwizzleParameters& swizzle : swizzles) {
        const size_t input_offset = swizzle.buffer_offset + map.offset;
-        const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 8U);
-        const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 8U);
+        const u32 num_dispatches_x = swizzle.num_tiles.width;
+        const u32 num_dispatches_y = swizzle.num_tiles.height;
        const u32 num_dispatches_z = image.info.resources.layers;

        compute_pass_descriptor_queue.Acquire();