diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 1aa64b65d..e9eb92a11 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -703,10 +703,10 @@ static __global__ void dequantize_block_turbo3_0_kernel(const void * __restrict_ // FWHT butterfly stages (7 stages for n=128) for (int h = 1; h < TURBO_HEAD_DIM_GPU; h *= 2) { - if (tid < 64) { // 128/2 = 64 butterflies per stage - int group = tid / h; - int pos = tid % h; - int i = group * h * 2 + pos; + int group = tid / h; + int pos = tid % h; + int i = group * h * 2 + pos; + if (i + h < TURBO_HEAD_DIM_GPU) { float a = smem[i]; float b = smem[i + h]; smem[i] = a + b; @@ -760,10 +760,10 @@ static __global__ void dequantize_block_turbo4_0_kernel(const void * __restrict_ // FWHT butterfly stages (7 stages for n=128) for (int h = 1; h < TURBO_HEAD_DIM_GPU; h *= 2) { - if (tid < 64) { // 128/2 = 64 butterflies per stage - int group = tid / h; - int pos = tid % h; - int i = group * h * 2 + pos; + int group = tid / h; + int pos = tid % h; + int i = group * h * 2 + pos; + if (i + h < TURBO_HEAD_DIM_GPU) { float a = smem[i]; float b = smem[i + h]; smem[i] = a + b; diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index 306660060..a658a568c 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -238,10 +238,10 @@ static __global__ void k_set_rows_turbo3( // Step 3: FWHT butterfly stages (7 stages for n=128) for (int h = 1; h < TURBO_HEAD_DIM_SR; h *= 2) { - if (tid < 64) { - int group = tid / h; - int pos = tid % h; - int i = group * h * 2 + pos; + int group = tid / h; + int pos = tid % h; + int i = group * h * 2 + pos; + if (i + h < TURBO_HEAD_DIM_SR) { float a = smem[i]; float b = smem[i + h]; smem[i] = a + b; @@ -381,10 +381,10 @@ static __global__ void k_set_rows_turbo4( // Step 3: FWHT butterfly stages (7 stages for n=128) for (int h = 1; h < TURBO_HEAD_DIM_SR; h *= 2) { - if (tid < 64) { - int group = tid / h; - int pos = tid % h; - int i = group * h * 2 + pos; + int group = tid / h; + int pos = tid % h; + int i = group * h * 2 + pos; + if (i + h < TURBO_HEAD_DIM_SR) { float a = smem[i]; float b = smem[i + h]; smem[i] = a + b;