diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index 1aa64b65d..e9eb92a11 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -703,10 +703,10 @@ static __global__ void dequantize_block_turbo3_0_kernel(const void * __restrict_
 
     // FWHT butterfly stages (7 stages for n=128)
     for (int h = 1; h < TURBO_HEAD_DIM_GPU; h *= 2) {
-        if (tid < 64) {  // 128/2 = 64 butterflies per stage
-            int group = tid / h;
-            int pos = tid % h;
-            int i = group * h * 2 + pos;
+        int group = tid / h;
+        int pos = tid % h;
+        int i = group * h * 2 + pos;
+        if (i + h < TURBO_HEAD_DIM_GPU) {
             float a = smem[i];
             float b = smem[i + h];
             smem[i]     = a + b;
@@ -760,10 +760,10 @@ static __global__ void dequantize_block_turbo4_0_kernel(const void * __restrict_
 
     // FWHT butterfly stages (7 stages for n=128)
     for (int h = 1; h < TURBO_HEAD_DIM_GPU; h *= 2) {
-        if (tid < 64) {  // 128/2 = 64 butterflies per stage
-            int group = tid / h;
-            int pos = tid % h;
-            int i = group * h * 2 + pos;
+        int group = tid / h;
+        int pos = tid % h;
+        int i = group * h * 2 + pos;
+        if (i + h < TURBO_HEAD_DIM_GPU) {
             float a = smem[i];
             float b = smem[i + h];
             smem[i]     = a + b;
diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu
index 306660060..a658a568c 100644
--- a/ggml/src/ggml-cuda/set-rows.cu
+++ b/ggml/src/ggml-cuda/set-rows.cu
@@ -238,10 +238,10 @@ static __global__ void k_set_rows_turbo3(
 
     // Step 3: FWHT butterfly stages (7 stages for n=128)
     for (int h = 1; h < TURBO_HEAD_DIM_SR; h *= 2) {
-        if (tid < 64) {
-            int group = tid / h;
-            int pos = tid % h;
-            int i = group * h * 2 + pos;
+        int group = tid / h;
+        int pos = tid % h;
+        int i = group * h * 2 + pos;
+        if (i + h < TURBO_HEAD_DIM_SR) {
             float a = smem[i];
             float b = smem[i + h];
             smem[i]     = a + b;
@@ -381,10 +381,10 @@ static __global__ void k_set_rows_turbo4(
 
     // Step 3: FWHT butterfly stages (7 stages for n=128)
     for (int h = 1; h < TURBO_HEAD_DIM_SR; h *= 2) {
-        if (tid < 64) {
-            int group = tid / h;
-            int pos = tid % h;
-            int i = group * h * 2 + pos;
+        int group = tid / h;
+        int pos = tid % h;
+        int i = group * h * 2 + pos;
+        if (i + h < TURBO_HEAD_DIM_SR) {
             float a = smem[i];
             float b = smem[i + h];
             smem[i]     = a + b;