minimal device tuning

2026-04-10 06:46:54 +02:00
parent 9403602b19
commit bb254903f9
1 changed files with 3 additions and 1 deletions
@@ -9944,7 +9944,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
            const uint32_t CHW = IC * KH * KW;
            // Cap X workgroups to limit concurrent IC channel reads.
            // The shader loops over X to cover the full CHW dimension.
-            const uint32_t x_elements = std::min(CHW, std::max(512u, OW * KH * KW));
+            // AMD prefers a lower minimum cap: use 512 when subgroup_size > 32 (presumably AMD wave64 - confirm), 4096 otherwise
+            const uint32_t min_cap = ctx->device->subgroup_size > 32 ? 512u : 4096u;
+            const uint32_t x_elements = std::min(CHW, std::max(min_cap, OW * KH * KW));
            elements = { x_elements, OW, OH * batch };
            elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
            elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);