CUDA: better error for FA kernel with 0 occupancy (#16643 )

ggml: add ggml_can_fuse_subgraph (#16662 )
* ggml: add ggml_can_fuse_subgraph * ggml-cuda: use ggml_can_fuse_subgraph for topk-moe * format * 1. remove inputs from signature as they are transient nodes 2. add check for views: view_src should be part of the subgraph * - combine check into one loop - check all view_src parents - other minor review comments * remove redudant if test * - rename and other minor review comments * add assert about count < 32
2025-10-21 15:27:53 +02:00 · 2025-10-21 16:43:14 +08:00 · 2025-10-20 22:26:17 -07:00 · 2025-10-20 22:16:08 -05:00 · 2025-10-21 00:21:12 +02:00
15 changed files with 221 additions and 42 deletions
@@ -72,7 +72,7 @@ Legend:
 |                     OPT_STEP_SGD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
 |                              PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ |
-|                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
 |                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
 |                             RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
@@ -9379,8 +9379,8 @@
 "SYCL0","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","SYCL"
 "SYCL0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","SYCL"
 "SYCL0","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","1","yes","SYCL"
-"SYCL0","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","0","no","SYCL"
-"SYCL0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","no","SYCL"
+"SYCL0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","yes","SYCL"
+"SYCL0","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","0","yes","SYCL"
 "SYCL0","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","0","no","SYCL"
 "SYCL0","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","0","no","SYCL"
 "SYCL0","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","SYCL"
@@ -895,6 +895,7 @@ void launch_fattn(
    const dim3 block_dim(warp_size, nwarps, 1);
    int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
    CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
+    GGML_ASSERT(max_blocks_per_sm > 0);
    int parallel_blocks = max_blocks_per_sm;

    dim3 blocks_num;
@@ -2821,15 +2821,8 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
    std::initializer_list<enum ggml_op> topk_moe_ops           = ggml_cuda_topk_moe_ops(false);
    std::initializer_list<enum ggml_op> topk_moe_ops_with_norm = ggml_cuda_topk_moe_ops(true);

-    if (ops.size() == topk_moe_ops_with_norm.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops_with_norm.begin())) {
-
-        if (node_idx + topk_moe_ops_with_norm.size() > (size_t)cgraph->n_nodes) {
-            return false;
-        }
-
-        for (size_t i = 0; i < topk_moe_ops_with_norm.size(); i++) {
-            if (cgraph->nodes[node_idx + i]->op != topk_moe_ops_with_norm.begin()[i]) return false;
-        }
+    if (ops.size() == topk_moe_ops_with_norm.size() &&
+        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_with_norm, { node_idx + 3, node_idx + 8 })) {
        ggml_tensor * softmax = cgraph->nodes[node_idx];
        ggml_tensor * weights = cgraph->nodes[node_idx+8];

@@ -2838,16 +2831,8 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
        }
    }

-    if (ops.size() == topk_moe_ops.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops.begin())) {
-
-        if (node_idx + topk_moe_ops.size() > (size_t)cgraph->n_nodes) {
-            return false;
-        }
-
-        for (size_t i = 0; i < topk_moe_ops.size(); i++) {
-            if (cgraph->nodes[node_idx + i]->op != topk_moe_ops.begin()[i]) return false;
-        }
-
+    if (ops.size() == topk_moe_ops.size() &&
+        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops, { node_idx + 3, node_idx + 4 })) {
        ggml_tensor * softmax = cgraph->nodes[node_idx];
        ggml_tensor * weights = cgraph->nodes[node_idx+4];
        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
@@ -647,6 +647,36 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
    return ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
 }

+GGML_API bool ggml_can_fuse_subgraph_ext(const struct ggml_cgraph * cgraph,
+                                         const int *                node_idxs,
+                                         int                        count,
+                                         const enum ggml_op *       ops,
+                                         const int *                outputs,
+                                         int                        num_outputs);
+
+// Returns true if the subgraph formed by {node_idxs} can be fused
+// checks whethers all nodes which are not part of outputs can be elided
+// by checking if their num_uses are confined to the subgraph
+static inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
+                                          int                        node_idx,
+                                          int                        count,
+                                          const enum ggml_op *       ops,
+                                          const int *                outputs,
+                                          int                        num_outputs) {
+    GGML_ASSERT(count < 32);
+    if (node_idx + count > cgraph->n_nodes) {
+        return false;
+    }
+
+    int idxs[32];
+
+    for (int i = 0; i < count; ++i) {
+        idxs[i] = node_idx + i;
+    }
+
+    return ggml_can_fuse_subgraph_ext(cgraph, idxs, count, ops, outputs, num_outputs);
+}
+
 #ifdef __cplusplus
 }
 #endif
@@ -660,6 +690,13 @@ inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::
    return ggml_can_fuse(cgraph, node_idx, ops.begin(), (int)ops.size());
 }

+inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph *          cgraph,
+                                   int                                 start_idx,
+                                   std::initializer_list<enum ggml_op> ops,
+                                   std::initializer_list<int>          outputs = {}) {
+    return ggml_can_fuse_subgraph(cgraph, start_idx, ops.size(), ops.begin(), outputs.begin(), outputs.size());
+}
+
 // expose GGUF internals for test code
 GGML_API size_t gguf_type_size(enum gguf_type type);
 GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
@@ -15,13 +15,12 @@

 #include <CL/cl.h>

+#include <inttypes.h>
 #include <string.h>

 #include <cstddef>
 #include <cstdint>
-#include <atomic>
 #include <fstream>
-#include <limits>
 #include <vector>
 #include <string>
 #include <cmath>
@@ -533,25 +532,17 @@ struct ggml_backend_opencl_context {
        }

        // Dump a csv
-        float total_kernel_time = 0;
-        fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
+        fprintf(fperf, "op name, kernel name, exec duration (ms), global size, local size, output size\n");
        for (const ProfilingInfo & info : profiling_info) {
-            total_kernel_time += info.cmd_duration_ns/1.e6f;
-            fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
+            fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
                info.op_name.c_str(), info.kernel_name.c_str(),
-                info.cmd_queued_duration_ns/1.e6f,
-                info.cmd_submit_duration_ns/1.e6f,
                info.cmd_duration_ns/1.e6f,
-                info.cmd_complete_duration_ns/1.e6f,
-                info.cmd_total_duration_ns/1.e6f,
                info.global_size[0], info.global_size[1], info.global_size[2],
                info.local_size[0], info.local_size[1], info.local_size[2],
                info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
        }
        fclose(fperf);

-        GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
-
        // Dump a simple chrome trace
        FILE* ftrace = fopen("cl_trace.json", "w");
        if (!ftrace) {
@@ -561,14 +552,14 @@ struct ggml_backend_opencl_context {

        fprintf(ftrace, "[\n");
        for (const ProfilingInfo & info : profiling_info) {
-            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %llu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
                info.kernel_name.c_str(), info.cmd_queued/1000);
-            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %llu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
                info.kernel_name.c_str(), info.cmd_submit/1000);

-            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %llu, \"pid\": \"\", \"tid\": \"Device\"},\n",
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
                info.kernel_name.c_str(), info.cmd_start/1000);
-            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %llu, \"pid\": \"\", \"tid\": \"Device\"},\n",
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
                info.kernel_name.c_str(), info.cmd_end/1000);
        }
        fclose(ftrace);
@@ -7652,6 +7643,8 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
    const cl_ulong nb21 = src2->nb[1];
    const cl_ulong nb20 = src2->nb[0];

+    UNUSED(nb20);
+
    const int ne0 = dst->ne[0];
    const int ne1 = dst->ne[1];

@@ -37,5 +37,7 @@
 #include "softmax.hpp"
 #include "tsembd.hpp"
 #include "wkv.hpp"
+#include "pad_reflect_1d.hpp"
+

 #endif  // GGML_SYCL_BACKEND_HPP
@@ -3744,6 +3744,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
        case GGML_OP_CONCAT:
            ggml_sycl_op_concat(ctx, dst);
            break;
+        case GGML_OP_PAD_REFLECT_1D:
+            ggml_sycl_op_pad_reflect_1d(ctx,dst);
+            break;
        case GGML_OP_UPSCALE:
            ggml_sycl_upscale(ctx, dst);
            break;
@@ -4455,6 +4458,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_DIV:
        case GGML_OP_REPEAT:
            return true;
+        case GGML_OP_PAD_REFLECT_1D:
+            return ggml_is_contiguous(op->src[0]) && op-> type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_SIN:
@@ -0,0 +1,72 @@
+#include "pad_reflect_1d.hpp"
+
+void pad_reflect_1d_f32(const float* src,float* dst,
+    const int64_t ne0, const int64_t ne02, const int p0, const int p1,
+    const int64_t nb0, const int64_t nb1, const int64_t nb2, const int64_t nb3,
+    const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
+    const sycl::nd_item<3> &item_ct1){
+
+    const int i0 = item_ct1.get_group(0) * SYCL_CONCAT_BLOCK_SIZE + item_ct1.get_local_id(0);
+    const int i1 = item_ct1.get_group(1);
+    const int g2 = item_ct1.get_group(2);
+    const int i2 = g2 % ne02;
+    const int i3 = g2 / ne02;
+
+    if (i0 >= p0 + ne0 + p1) return;
+
+    int t = i0 - p0;
+    int period = 2 * ne0 -2;
+    int m = t % period;
+    m += (m < 0) * period;
+    int center = ne0 -1;
+    int srci0 = center - abs(center - m);
+
+    int offest_src = i3*nb3 + i2*nb2 + i1*nb1 + srci0*nb0;
+    int offest_dst =  i3*nb03 +  i2*nb02 +  i1*nb01 +  i0*nb00;
+    dst[offest_dst] = src[offest_src];
+
+}
+
+void ggml_sycl_op_pad_reflect_1d(ggml_backend_sycl_context& ctx, ggml_tensor* dst){
+
+    const ggml_tensor * src0 = dst->src[0];
+    queue_ptr           stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int32_t * opts = (const int32_t *) dst->op_params;
+    const int p0 = opts[0];
+    const int p1 = opts[1];
+
+    const int64_t ne0 = src0->ne[0];
+
+    const int64_t ne00 = dst->ne[0];
+    const int64_t ne01 = dst->ne[1];
+    const int64_t ne02 = dst->ne[2];
+    const int64_t ne03 = dst->ne[3];
+
+    const int64_t nb00 = dst->nb[0];
+    const int64_t nb01 = dst->nb[1];
+    const int64_t nb02 = dst->nb[2];
+    const int64_t nb03 = dst->nb[3];
+    const int64_t nb0 = src0->nb[0];
+    const int64_t nb1 = src0->nb[1];
+    const int64_t nb2 = src0->nb[2];
+    const int64_t nb3 = src0->nb[3];
+
+    int num_blocks = (ne00 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
+    sycl::range<3> global(num_blocks * SYCL_CONCAT_BLOCK_SIZE, ne01, ne02*ne03);
+    sycl::range<3> local(SYCL_CONCAT_BLOCK_SIZE, 1, 1);
+
+    stream->parallel_for(
+        sycl::nd_range<3>(global,
+                            local),
+        [=](sycl::nd_item<3> item_ct1) { pad_reflect_1d_f32(
+            (const float *) src0->data, (float *) dst->data,
+            ne0, ne02, p0, p1,
+            nb0, nb1, nb2, nb3,
+            nb00, nb01, nb02, nb03
+            , item_ct1);
+         });
+}
@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_PAD_REFLECT_1D_HPP
+#define GGML_SYCL_PAD_REFLECT_1D_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_pad_reflect_1d(ggml_backend_sycl_context& ctx, ggml_tensor* dst);
+
+#endif // GGML_SYCL_PAD_REFLECT_1D_HPP
@@ -345,7 +345,7 @@ void main() {

    float Lfrcp[Br];
    [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
-        Lfrcp[r] = 1.0 / Lf[r];
+        Lfrcp[r] = (Lf[r] == 0.0) ? 0.0 : (1.0 / Lf[r]);
    }

    [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
@@ -380,7 +380,7 @@ void main() {

    float Lfrcp[rows_per_thread];
    [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
-        Lfrcp[r] = 1.0 / Lf[r];
+        Lfrcp[r] = (Lf[r] == 0.0) ? 0.0 : (1.0 / Lf[r]);
    }

    [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
@@ -121,7 +121,11 @@ void main() {
    const float NEG_FLT_MAX_OVER_2 = uintBitsToFloat(0xFEFFFFFF);

    L = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
+#if defined(ACC_TYPE_MAX)
+    M = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(-ACC_TYPE_MAX / ACC_TYPE(2));
+#else
    M = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(NEG_FLT_MAX_OVER_2);
+#endif

    coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> slopeMat = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(1.0);

@@ -294,7 +298,7 @@ void main() {

    [[unroll]]
    for (int k = 0; k < Ldiag.length(); ++k) {
-        Ldiag[k] = ACC_TYPE(1.0) / Ldiag[k];
+        Ldiag[k] = (Ldiag[k] == 0.0) ? ACC_TYPE(0.0) : (ACC_TYPE(1.0) / Ldiag[k]);
    }

    O = Ldiag*O;
@@ -91,7 +91,7 @@ void main() {
        L = L*ms + vs;
    }

-    L = 1.0 / L;
+    L = (L == 0.0) ? 0.0 : 1.0 / L;

    // D dimension is split across workgroups in the y dimension
    uint d = tid + gl_WorkGroupID.y * BLOCK_SIZE;
@@ -6964,6 +6964,78 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
    GGML_LOG_INFO("========================================\n");
 }

+static int ggml_node_list_find_tensor(const struct ggml_cgraph * cgraph,
+                                      const int *                idxs,
+                                      int                        count,
+                                      const struct ggml_tensor * tensor) {
+    GGML_ASSERT(cgraph && idxs);
+    for (int i = 0; i < count; ++i) {
+        const int node_idx = idxs[i];
+
+        if (node_idx >= cgraph->n_nodes) {
+            return -1;
+        }
+        if (cgraph->nodes[node_idx] == tensor) {
+            return i;
+        }
+    }
+    return -1;
+}
+
+bool ggml_can_fuse_subgraph_ext(const struct ggml_cgraph * cgraph,
+                                const int *                node_idxs,
+                                int                        count,
+                                const enum ggml_op *       ops,
+                                const int *                outputs,
+                                int                        num_outputs) {
+    GGML_ASSERT(outputs && num_outputs > 0);
+
+    for (int i = 0; i < count; ++i) {
+        if (node_idxs[i] >= cgraph->n_nodes) {
+            return false;
+        }
+
+        const struct ggml_tensor * node = cgraph->nodes[node_idxs[i]];
+
+        if (node->op != ops[i]) {
+            return false;
+        }
+
+        if (ggml_node_list_find_tensor(cgraph, outputs, num_outputs, node) != -1) {
+            continue;
+        }
+
+        if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
+            return false;
+        }
+
+        int subgraph_uses = 0;
+        for (int j = i + 1; j < count; ++j) {
+            const struct ggml_tensor * other_node = cgraph->nodes[node_idxs[j]];
+            for (int src_idx = 0; src_idx < GGML_MAX_SRC; src_idx++) {
+                if (other_node->src[src_idx] == node) {
+                    subgraph_uses++;
+                }
+            }
+        }
+
+        if (subgraph_uses != ggml_node_get_use_count(cgraph, node_idxs[i])) {
+            return false;
+        }
+
+        // if node is a view, check if the view_src and all it's parent view_srcs are within the subgraph
+        struct ggml_tensor * view_src = node->view_src;
+        while (view_src) {
+            if (ggml_node_list_find_tensor(cgraph, node_idxs, count, view_src) == -1) {
+                return false;
+            }
+            view_src = view_src->view_src;
+        }
+    }
+
+    return true;
+}
+
 // check if node is part of the graph
 static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
    if (cgraph == NULL) {
Author	SHA1	Message	Date
Johannes Gäßler	51d1a8c997	CUDA: better error for FA kernel with 0 occupancy (#16643 )	2025-10-21 15:27:53 +02:00
Aman Gupta	4926419c4d	ggml: add ggml_can_fuse_subgraph (#16662 ) * ggml: add ggml_can_fuse_subgraph * ggml-cuda: use ggml_can_fuse_subgraph for topk-moe * format * 1. remove inputs from signature as they are transient nodes 2. add check for views: view_src should be part of the subgraph * - combine check into one loop - check all view_src parents - other minor review comments * remove redudant if test * - rename and other minor review comments * add assert about count < 32	2025-10-21 16:43:14 +08:00
lhez	6ea37f5739	opencl: fix warnings and clean up profiling (#16688 ) * opencl: remove unused headers, fix warnings * opencl: clean up profiling, only keep kernel time	2025-10-20 22:26:17 -07:00
Jeff Bolz	fb349848f3	vulkan: Handle FA with all -inf mask values (#16447 )	2025-10-20 22:16:08 -05:00
YehuditE	6de8ed7519	sycl : add PAD_REFLECT_D1 operator support (#16145 ) * sycl: add PAD_REFLECT_D1 operator support * docs(ops): regenerate docs/ops.md * remove trailing whitespaces * style: fix editorconfig issues — trim trailing spaces and normalize EOLs * fix: move PAD_REFLECT_1D case outside of fall-through block	2025-10-21 00:21:12 +02:00