forked from wylab/llama.cpp
minimal device tuning
This commit is contained in:
@@ -9944,7 +9944,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
const uint32_t CHW = IC * KH * KW;
|
||||
// Cap X workgroups to limit concurrent IC channel reads.
|
||||
// The shader loops over X to cover the full CHW dimension.
|
||||
const uint32_t x_elements = std::min(CHW, std::max(512u, OW * KH * KW));
|
||||
// AMD prefers a lower limit: use a floor of 512 when the subgroup size exceeds 32, otherwise 4096.
|
||||
const uint32_t min_cap = ctx->device->subgroup_size > 32 ? 512u : 4096u;
|
||||
const uint32_t x_elements = std::min(CHW, std::max(min_cap, OW * KH * KW));
|
||||
elements = { x_elements, OW, OH * batch };
|
||||
elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
|
||||
elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
|
||||
|
||||
Reference in New Issue
Block a user