minimal device tuning

This commit is contained in:
Ruben Ortlam
2026-04-10 06:46:54 +02:00
parent 9403602b19
commit bb254903f9
+3 -1
View File
@@ -9944,7 +9944,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
const uint32_t CHW = IC * KH * KW;
// Cap X workgroups to limit concurrent IC channel reads.
// The shader loops over X to cover the full CHW dimension.
const uint32_t x_elements = std::min(CHW, std::max(512u, OW * KH * KW));
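// AMD prefers a lower limit: subgroup_size > 32 is presumably used as the AMD proxy here (TODO confirm),
// so those devices get a minimum cap of 512 and all other devices get 4096.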
const uint32_t min_cap = ctx->device->subgroup_size > 32 ? 512u : 4096u;
const uint32_t x_elements = std::min(CHW, std::max(min_cap, OW * KH * KW));
elements = { x_elements, OW, OH * batch };
elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);