sync : ggml

ggml-ci
metal : tune soft_max number of threads (whisper/0)
2026-06-17 19:17:37 +02:00 · 2024-05-14 19:08:09 +03:00 · 2024-05-14 19:08:09 +03:00 · 2024-05-14 19:08:09 +03:00 · 2024-05-14 19:08:09 +03:00 · 2024-05-14 19:08:09 +03:00
5 changed files with 2179 additions and 5 deletions
@@ -120,9 +120,16 @@ extern "C" {
 #ifndef __F16C__
 #define __F16C__
 #endif
+#endif
+
+// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
+#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
 #ifndef __SSE3__
 #define __SSE3__
 #endif
+#ifndef __SSSE3__
+#define __SSSE3__
+#endif
 #endif

 // 16-bit float
@@ -1378,7 +1378,7 @@ static enum ggml_status ggml_metal_graph_compute(
                        const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);

                        if (ne00%4 == 0) {
-                            while (nth < ne00/4 && nth < 256) {
+                            while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
                                nth *= 2;
                            }
                            if (use_f16) {
@@ -1387,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline;
                            }
                        } else {
-                            while (nth < ne00 && nth < 1024) {
+                            while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
                                nth *= 2;
                            }
                            if (use_f16) {
@@ -1306,6 +1306,8 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
 #define GGML_F16_VEC_SET1   GGML_F32x4_SET1
 #define GGML_F16_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F16_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F16_VEC_MUL    GGML_F32x4_MUL
 #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
 // Use vec_xl, not vec_ld, in case the load address is not aligned.
 #define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
@@ -1 +1 @@
-30f54cbb3ada3e4c5bc6924de3e5918e5be4ff11
+fafd5e7f89382b8cfb51e3dac8d4f1500ca44918
Author	SHA1	Message	Date
Georgi Gerganov	a5e3fde857	sync : ggml ggml-ci	2024-05-14 19:08:09 +03:00
Georgi Gerganov	f308ea7059	metal : tune soft_max number of threads (whisper/0)	2024-05-14 19:08:09 +03:00
Georgi Gerganov	c3c88f296a	ggml : try fix ppc64 (whisper/0)	2024-05-14 19:08:09 +03:00
Przemysław Pawełczyk	182adefcf3	ggml : expose SSE3 and SSSE3 for MSVC when AVX is available (whisper/2128)	2024-05-14 19:08:09 +03:00
Hong Bo PENG	0d26d8ccd8	ggml : optimize for ppc64le using VSX intrinsics (ggml/784) * optimize for ppc64le using VSX intrinsics * 1. code clean up by removing comments about overflow concern. 2. fix typo in suffix of scaling. * Continue to fix typo in suffix of scaling for QK_K <> 256 --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-05-14 19:08:09 +03:00