Compare commits

...

6 Commits

Author SHA1 Message Date
Akarshan Biswas 8b69686136 SYCL: fix rms_norm_mul_add for tensor dim not a multiple of sg_size (#15592)
The original implementation unconditionally returned true for this operation, leading to a failure when the tensor's first dimension (ne[0]) was not a multiple of WARP_SIZE. This caused an GGML_ASSERT(ncols % WARP_SIZE == 0) failure in ggml-sycl/norm.cpp.

This change updates the ggml_backend_sycl_device_supports_op check to correctly return true for GGML_OP_RMS_NORM only when the first dimension of the tensor is a multiple of WARP_SIZE, ensuring the operation can be performed without error.
2025-08-27 00:27:49 +05:30
fidoriel 8ce3ff1d91 mtmd : fix mtmd ios build (#15579) 2025-08-26 20:05:50 +02:00
Eve 44b1efa41a tests: add performance test for mul mat id (#15543) 2025-08-26 15:42:49 +00:00
shalinib-ibm a6a58d6478 llamafile: PowerPC Sgemm Optimization (#15558)
This patch improves GEMM for FP32 Data Type on PowerPC

Implements GEMM on large blocks with configurable block size mc, nc, kc
(default: 256, 256, 256).
Packing Function optimized to access blocks as per memory layout.
GEMM Optimized to work on larger blocks.
Isolated Packing from GEMM Operations for better MMA utilization.

Verified functionality and correctness uing llama-cli and stand alone
test case (performs matmul and compares final mattrix C result with base).

Minor code refactoring changes:
Replace macro with inline function
Code Indent made consistent with 4 spaces

Performance Testing:

Observed 50% ~ 70% improvement in Prompt Processing Speed mesured using
llama-bench with Meta-Llama3-8B FP32 Model.  Similar gains observed with
Mistral-7b-Instruct-v0.3 Model.

model                   Size                Params     Backend       Threads   Test    Patch   Base
llama 8B all F32        29.92 GiB           8.03 B      CPU           20       pp512   98.58   60.3
llama 8B all F32        29.92 GiB           8.03 B      CPU           20       pp1024  95.88   57.36
llama 8B all F32        29.92 GiB           8.03 B      CPU           20       pp2048  85.46   53.26
llama 8B all F32        29.92 GiB           8.03 B      CPU           20       pp4096  68.66   45.78
llama 8B all F32        29.92 GiB           8.03 B      CPU           20       pp6144  57.35   40.44

25 ~ 30% improvement in llama-batched-bench with Metla-Llama3-8B in
Prompt Processing Speed for large prompts (256, 512, 1024, 2048, 4096)tokens with various batch
sizes ( 1, 2, 4, 8, 16)

Signed-off-by: Shalini Salomi Bodapati <Shalini.Salomi.Bodapati@ibm.com>
2025-08-26 23:35:25 +08:00
Georgi Gerganov 0373486dbc graph : fix assert in memory-less build_attn (#15590)
ggml-ci
2025-08-26 17:45:17 +03:00
Daniel Bevenius 62cef26ac5 model-conversion : add qat-q4 quantization targets (#15588)
This commit adds two targets to the Makefile for quantizing of
Quantization Aware Trained (QAT) models to Q4_0 format.

The motivation for this is that this sets the token embedding and the
output tensors data types to Q8_0 instead of the default Q6_K. This is
someting that we wish to enforce for QAT Q4_0 models that are to be
uploaded to ggml-org on Huggingface to guarantee the best quality.
2025-08-26 16:12:29 +02:00
8 changed files with 330 additions and 142 deletions
+25 -5
View File
@@ -1,4 +1,5 @@
# Validation functions
MAKEFLAGS += --no-print-directory
define validate_model_path
@if [ -z "$(MODEL_PATH)" ]; then \
echo "Error: MODEL_PATH must be provided either as:"; \
@@ -17,6 +18,13 @@ define validate_embedding_model_path
fi
endef
define quantize_model
@CONVERTED_MODEL="$(1)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" \
TOKEN_EMBD_TYPE="$(TOKEN_EMBD_TYPE)" OUTPUT_TYPE="$(OUTPUT_TYPE)" \
./scripts/utils/quantize.sh "$(1)" "$(QUANTIZED_TYPE)" "$(TOKEN_EMBD_TYPE)" "$(OUTPUT_TYPE)"
@echo "Export the quantized model path to $(2) variable in your environment"
endef
###
### Casual Model targets/recipes
###
@@ -67,9 +75,15 @@ causal-quantize-Q8_0: causal-quantize-model
causal-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
causal-quantize-Q4_0: causal-quantize-model
# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
# token embedding and output types to Q8_0 instead of the default Q6_K.
causal-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
causal-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
causal-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
causal-quantize-qat-Q4_0: causal-quantize-model
causal-quantize-model:
@CONVERTED_MODEL="$(CONVERTED_MODEL)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" ./scripts/utils/quantize.sh ${CONVERTED_MODEL} ${QUANTIZED_TYPE}
@echo "Export the quantized model path to QUANTIZED_MODEL variable in your environment"
$(call quantize_model,$(CONVERTED_MODEL),QUANTIZED_MODEL)
causal-run-quantized-model:
@QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/causal/run-converted-model.sh ${QUANTIZED_MODEL}
@@ -117,9 +131,15 @@ embedding-quantize-Q8_0: embedding-quantize-model
embedding-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
embedding-quantize-Q4_0: embedding-quantize-model
# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
# token embedding and output types to Q8_0 instead of the default Q6_K.
embedding-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
embedding-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
embedding-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
embedding-quantize-qat-Q4_0: embedding-quantize-model
embedding-quantize-model:
@./scripts/utils/quantize.sh ${CONVERTED_EMBEDDING_MODEL} ${QUANTIZED_TYPE}
@echo "Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment"
$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)
embedding-run-quantized-model:
@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
+24
View File
@@ -137,6 +137,18 @@ Then the quantized model can be run using the following command:
(venv) $ make causal-run-quantized-model
```
### Quantizing QAT (Quantization Aware Training) models
When quantizing to `Q4_0`, the default data type for the token embedding weights
will be `Q6_K`. For models that are going to be uploaded to ggml-org it is
recommended to use `Q8_0` instead for the embeddings and output tensors.
The reason is that although `Q6_K` is smaller in size, it requires more compute
to unpack, which can hurt performance during output generation when the entire
embedding matrix must be dequantized to compute vocabulary logits. `Q8_0`
provides practically full quality with better computational efficiency.
```console
(venv) $ make causal-quantize-qat-Q4_0
```
## Embedding Language Model Conversion
@@ -238,6 +250,18 @@ Then the quantized model can be run using the following command:
(venv) $ make embedding-run-quantized-model
```
### Quantizing QAT (Quantization Aware Training) models
When quantizing to `Q4_0`, the default data type for the token embedding weights
will be `Q6_K`. For models that are going to be uploaded to ggml-org it is
recommended to use `Q8_0` instead for the embeddings and output tensors.
The reason is that although `Q6_K` is smaller in size, it requires more compute
to unpack, which can hurt performance during output generation when the entire
embedding matrix must be dequantized to compute vocabulary logits. `Q8_0`
provides practically full quality with better computational efficiency.
```console
(venv) $ make embedding-quantize-qat-Q4_0
```
## Perplexity Evaluation
### Simple perplexity evaluation
@@ -4,6 +4,8 @@ set -e
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}"
TOKEN_EMBD_TYPE="${3:-"${TOKEN_EMBD_TYPE}"}"
OUTPUT_TYPE="${4:-"${OUTPUT_TYPE}"}"
QUANTIZED_MODEL=$CONVERTED_MODEL
# Final check if we have a model path
@@ -14,6 +16,11 @@ if [ -z "$CONVERTED_MODEL" ]; then
exit 1
fi
if [ -z "$QUANTIZED_TYPE" ]; then
echo "Error: QUANTIZED_TYPE is required" >&2
exit 1
fi
echo $CONVERTED_MODEL
# Process the quantized model filename
@@ -26,9 +33,16 @@ else
exit 1
fi
cmake --build ../../build --target llama-quantize -j8
../../build/bin/llama-quantize $CONVERTED_MODEL $QUANTIZED_MODEL $QUANTIZED_TYPE
echo $TOKEN_EMBD_TYPE
echo $OUTPUT_TYPE
CMD_ARGS=("../../build/bin/llama-quantize")
[[ -n "$TOKEN_EMBD_TYPE" ]] && CMD_ARGS+=("--token-embedding-type" "$TOKEN_EMBD_TYPE")
[[ -n "$OUTPUT_TYPE" ]] && CMD_ARGS+=("--output-tensor-type" "$OUTPUT_TYPE")
CMD_ARGS+=("$CONVERTED_MODEL" "$QUANTIZED_MODEL" "$QUANTIZED_TYPE")
"${CMD_ARGS[@]}"
echo "Quantized model saved to: $QUANTIZED_MODEL"
+241 -132
View File
@@ -2169,94 +2169,117 @@ class tinyBLAS_Q0_PPC {
class tinyBLAS_PPC {
public:
tinyBLAS_PPC(int64_t k,
const float *A, int64_t lda,
const float *B, int64_t ldb,
float *C, int64_t ldc,
const float * A, int64_t lda,
const float * B, int64_t ldb,
float * C, int64_t ldc,
int ith, int nth)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
}
void matmul(int64_t m, int64_t n) {
mnpack(0, m, 0, n);
int64_t mc = 256; int64_t nc = 256; int64_t kc = 256;
if (m % mc == 0 && n % nc == 0 && k % kc == 0) {
matmul_tiled(m, n, mc, nc, kc);
} else {
mnpack(0, m, 0, n);
}
}
private:
void (tinyBLAS_PPC::*kernel)(int64_t, int64_t);
inline void vector_permute_store_4(vector float *src, float *vecOffset) {
vector float t1, t2, t3, t4, t5, t6, t7, t8;
t1 = vec_mergeh(src[0], src[1]);
t2 = vec_mergeh(src[2], src[3]);
t3 = vec_mergel(src[0], src[1]);
t4 = vec_mergel(src[2], src[3]);
t5 = vec_xxpermdi(t1, t2, 0);
t6 = vec_xxpermdi(t1, t2, 3);
t7 = vec_xxpermdi(t3, t4, 0);
t8 = vec_xxpermdi(t3, t4, 3);
vec_xst(t5, 0, vecOffset);
vec_xst(t6, 0, vecOffset + 4);
vec_xst(t7, 0, vecOffset + 8);
vec_xst(t8, 0, vecOffset + 12);
}
inline void vector_permute_store_8(vector float *src, float *vecOffset) {
vector float t1, t2, t3, t4, t5, t6, t7, t8;
t1 = vec_mergeh(src[0], src[1]);
t2 = vec_mergeh(src[2], src[3]);
t3 = vec_mergeh(src[4], src[5]);
t4 = vec_mergeh(src[6], src[7]);
t5 = vec_xxpermdi(t1, t2, 0);
t6 = vec_xxpermdi(t3, t4, 0);
t7 = vec_xxpermdi(t1, t2, 3);
t8 = vec_xxpermdi(t3, t4, 3);
vec_xst(t5, 0, vecOffset);
vec_xst(t6, 0, vecOffset + 4);
vec_xst(t7, 0, vecOffset + 8);
vec_xst(t8, 0, vecOffset + 12);
t1 = vec_mergel(src[0], src[1]);
t2 = vec_mergel(src[2], src[3]);
t3 = vec_mergel(src[4], src[5]);
t4 = vec_mergel(src[6], src[7]);
t5 = vec_xxpermdi(t1, t2, 0);
t6 = vec_xxpermdi(t3, t4, 0);
t7 = vec_xxpermdi(t1, t2, 3);
t8 = vec_xxpermdi(t3, t4, 3);
vec_xst(t5, 0, vecOffset + 16);
vec_xst(t6, 0, vecOffset + 20);
vec_xst(t7, 0, vecOffset + 24);
vec_xst(t8, 0, vecOffset + 28);
inline void save_acc(acc_t * ACC, int64_t ii, int64_t jj) {
vec_t vec_C[4];
__builtin_mma_disassemble_acc(vec_C, ACC);
for (int I = 0; I < 4; I++) {
for (int J = 0; J < 4; J++) {
*((float *)(C+ii+((jj+J)*ldc)+I)) = *((float *)&vec_C[I]+J);
}
}
}
void packTranspose(const float* a, int64_t lda, int rows, int cols, float* vec) {
inline void add_save_acc(acc_t * ACC, int64_t ii, int64_t jj) {
vec_t vec_C[4];
__builtin_mma_disassemble_acc(vec_C, ACC);
for (int I = 0; I < 4; I++) {
for (int J = 0; J < 4; J++) {
float * c_ptr = (float *)(C+ii+((jj+J)*ldc)+I);
*c_ptr += *((float *)&vec_C[I]+J);
}
}
}
inline void vector_permute_store_4(vector float * src, float * vecOffset) {
vector float t1, t2, t3, t4, t5, t6, t7, t8;
t1 = vec_mergeh(src[0], src[1]);
t2 = vec_mergeh(src[2], src[3]);
t3 = vec_mergel(src[0], src[1]);
t4 = vec_mergel(src[2], src[3]);
t5 = vec_xxpermdi(t1, t2, 0);
t6 = vec_xxpermdi(t1, t2, 3);
t7 = vec_xxpermdi(t3, t4, 0);
t8 = vec_xxpermdi(t3, t4, 3);
vec_xst(t5, 0, vecOffset);
vec_xst(t6, 0, vecOffset + 4);
vec_xst(t7, 0, vecOffset + 8);
vec_xst(t8, 0, vecOffset + 12);
}
inline void vector_permute_store_8(vector float * src, float * vecOffset) {
vector float t1, t2, t3, t4, t5, t6, t7, t8;
t1 = vec_mergeh(src[0], src[1]);
t2 = vec_mergeh(src[2], src[3]);
t3 = vec_mergeh(src[4], src[5]);
t4 = vec_mergeh(src[6], src[7]);
t5 = vec_xxpermdi(t1, t2, 0);
t6 = vec_xxpermdi(t3, t4, 0);
t7 = vec_xxpermdi(t1, t2, 3);
t8 = vec_xxpermdi(t3, t4, 3);
vec_xst(t5, 0, vecOffset);
vec_xst(t6, 0, vecOffset + 4);
vec_xst(t7, 0, vecOffset + 8);
vec_xst(t8, 0, vecOffset + 12);
t1 = vec_mergel(src[0], src[1]);
t2 = vec_mergel(src[2], src[3]);
t3 = vec_mergel(src[4], src[5]);
t4 = vec_mergel(src[6], src[7]);
t5 = vec_xxpermdi(t1, t2, 0);
t6 = vec_xxpermdi(t3, t4, 0);
t7 = vec_xxpermdi(t1, t2, 3);
t8 = vec_xxpermdi(t3, t4, 3);
vec_xst(t5, 0, vecOffset + 16);
vec_xst(t6, 0, vecOffset + 20);
vec_xst(t7, 0, vecOffset + 24);
vec_xst(t8, 0, vecOffset + 28);
}
void packTranspose(const float * a, int64_t lda, int rows, int cols, float * vec) {
int64_t i, j;
float * aoffsets[8];
float *aoffset = NULL, *boffset = NULL;
float * aoffset = NULL, * boffset = NULL;
__vector_pair arr[8];
vector float c[8][2] = {0};
vector float c1[8] = {0};
vector float c2[8] = {0};
aoffset = const_cast<float*>(a);
aoffset = const_cast<float *>(a);
boffset = vec;
j = (rows >> 3);
if (j > 0) {
do {
aoffsets[0] = aoffset;
for (int it = 1; it< 8; it++)
for (int it = 1; it < 8; it++)
aoffsets[it] = aoffsets[it-1] + lda;
aoffset += 8 * lda;
i = (cols >> 3);
if (i > 0) {
do {
for (int it = 0; it< 8; it++) {
for (int it = 0; it < 8; it++) {
arr[it] = __builtin_vsx_lxvp(0, (__vector_pair*)aoffsets[it]);
__builtin_vsx_disassemble_pair(c[it], &arr[it]);
c1[it] = c[it][0];
@@ -2264,11 +2287,14 @@ class tinyBLAS_PPC {
}
vector_permute_store_8(c1, boffset);
vector_permute_store_8(c2, boffset+32);
for (int it = 0; it < 4; it++)
aoffsets[it] = aoffsets[it] + 8*lda;
vector_permute_store_8(c2, boffset + 32);
boffset += 64;
i--;
if (i > 0) {
for (int it = 0; it < 8; it++) {
aoffsets[it] = aoffsets[it] + 8;
}
}
} while(i > 0);
}
if (cols & 4) {
@@ -2295,9 +2321,9 @@ class tinyBLAS_PPC {
c2[it] = c[it][1];
}
vector_permute_store_4(c1, boffset);
vector_permute_store_4(c2, boffset+16);
vector_permute_store_4(c2, boffset + 16);
for (int it = 0; it < 4; it++)
aoffsets[it] += 8*lda;
aoffsets[it] += 8 * lda;
boffset += 32;
i--;
} while(i > 0);
@@ -2325,15 +2351,15 @@ class tinyBLAS_PPC {
vec_t vec_A[4], vec_B[4], vec_C[4];
acc_t acc_0;
__builtin_mma_xxsetaccz(&acc_0);
for (int l = 0; l < k; l+=4) {
packTranspose(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
packTranspose(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
for (int l = 0; l < k; l += 4) {
packTranspose(A + (ii * lda) + l, lda, 4, 4, (float *)vec_A);
packTranspose(B + (jj * ldb) + l, ldb, 4, 4, (float *)vec_B);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
}
SAVE_ACC(&acc_0, ii, jj);
save_acc(&acc_0, ii, jj);
}
void KERNEL_4x8(int64_t ii, int64_t jj) {
@@ -2341,9 +2367,9 @@ class tinyBLAS_PPC {
acc_t acc_0, acc_1;
__builtin_mma_xxsetaccz(&acc_0);
__builtin_mma_xxsetaccz(&acc_1);
for (int64_t l = 0; l < k; l+=4) {
packTranspose(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
packTranspose(B+(jj*ldb)+l, ldb, 8, 4, (float*)vec_B);
for (int64_t l = 0; l < k; l += 4) {
packTranspose(A + (ii * lda) + l, lda, 4, 4, (float *)vec_A);
packTranspose(B + (jj * ldb) + l, ldb, 8, 4, (float *)vec_B);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]);
__builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]);
@@ -2353,8 +2379,8 @@ class tinyBLAS_PPC {
__builtin_mma_xvf32gerpp(&acc_0, vec_A[3], (vec_t)vec_B[6]);
__builtin_mma_xvf32gerpp(&acc_1, vec_A[3], (vec_t)vec_B[7]);
}
SAVE_ACC(&acc_0, ii, jj);
SAVE_ACC(&acc_1, ii, jj+4);
save_acc(&acc_0, ii, jj);
save_acc(&acc_1, ii, jj + 4);
}
void KERNEL_8x4(int64_t ii, int64_t jj) {
@@ -2362,9 +2388,9 @@ class tinyBLAS_PPC {
acc_t acc_0, acc_1;
__builtin_mma_xxsetaccz(&acc_0);
__builtin_mma_xxsetaccz(&acc_1);
for (int64_t l = 0; l < k; l+=4) {
packTranspose(A+(ii*lda)+l, lda, 8, 4, (float*)vec_A);
packTranspose(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
for (int64_t l = 0; l < k; l += 4) {
packTranspose(A + (ii * lda) + l, lda, 8, 4, (float *)vec_A);
packTranspose(B + (jj * ldb) + l, ldb, 4, 4, (float *)vec_B);
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]);
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]);
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]);
@@ -2374,8 +2400,8 @@ class tinyBLAS_PPC {
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[6], vec_B[3]);
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[7], vec_B[3]);
}
SAVE_ACC(&acc_0, ii, jj);
SAVE_ACC(&acc_1, ii+4, jj);
save_acc(&acc_0, ii, jj);
save_acc(&acc_1, ii + 4, jj);
}
void KERNEL_8x8(int64_t ii, int64_t jj) {
@@ -2386,19 +2412,96 @@ class tinyBLAS_PPC {
__builtin_mma_xxsetaccz(&acc_2);
__builtin_mma_xxsetaccz(&acc_3);
for (int l = 0; l < k; l+=8) {
packTranspose(A+(ii*lda)+l, lda, 8, 8, (float*)vec_A);
packTranspose(B+(jj*ldb)+l, ldb, 8, 8, (float*)vec_B);
packTranspose(A + (ii * lda) + l, lda, 8, 8, (float *)vec_A);
packTranspose(B + (jj * ldb) + l, ldb, 8, 8, (float *)vec_B);
for(int x = 0; x < 16; x+=2) {
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]);
__builtin_mma_xvf32gerpp(&acc_2, (vec_t)vec_A[x+1], vec_B[x]);
__builtin_mma_xvf32gerpp(&acc_3, (vec_t)vec_A[x+1], vec_B[x+1]);
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x + 1]);
__builtin_mma_xvf32gerpp(&acc_2, (vec_t)vec_A[x + 1], vec_B[x]);
__builtin_mma_xvf32gerpp(&acc_3, (vec_t)vec_A[x + 1], vec_B[x + 1]);
}
}
save_acc(&acc_0, ii, jj);
save_acc(&acc_1, ii, jj + 4);
save_acc(&acc_2, ii + 4, jj);
save_acc(&acc_3, ii + 4, jj + 4);
}
inline void MMA_16x8(vec_t * vec_A0, vec_t * vec_A1, vec_t * vec_B, acc_t * acc) {
for (int x = 0; x < 16; x += 2) {
__builtin_mma_xvf32gerpp(&acc[0], vec_A0[x + 0], vec_B[x]);
__builtin_mma_xvf32gerpp(&acc[1], vec_A0[x + 0], vec_B[x + 1]);
__builtin_mma_xvf32gerpp(&acc[2], vec_A0[x + 1], vec_B[x]);
__builtin_mma_xvf32gerpp(&acc[3], vec_A0[x + 1], vec_B[x + 1]);
__builtin_mma_xvf32gerpp(&acc[4], vec_A1[x + 0], vec_B[x]);
__builtin_mma_xvf32gerpp(&acc[5], vec_A1[x + 0], vec_B[x + 1]);
__builtin_mma_xvf32gerpp(&acc[6], vec_A1[x + 1], vec_B[x]);
__builtin_mma_xvf32gerpp(&acc[7], vec_A1[x + 1], vec_B[x + 1]);
}
}
void KERNEL(int64_t ii, int64_t jj, int64_t mc, int64_t nc, int64_t kc, vec_t * vec_A, vec_t * vec_B, int64_t kk) {
for (int64_t i = 0; i < mc; i += 16) {
int A_base_addr = (mc / 8) * (i / 8) * 16;
for (int64_t j = 0; j < nc; j += 8) {
int B_base_addr = (nc / 8) * (j / 8) * 16;
acc_t acc[8];
vec_t A0_block[16]; vec_t A1_block[16];
for (int x = 0; x < 8; x++)
__builtin_mma_xxsetaccz(&acc[x]);
for (int64_t l = 0; l < kc; l += 8) {
int A0_block_idx = A_base_addr + (l / 8) * 16;
int A1_block_idx = A0_block_idx + (mc / 8) * 16;
int B_block_idx = B_base_addr + (l / 8) * 16;
vec_t* A0_block = &vec_A[A0_block_idx];
vec_t* A1_block = &vec_A[A1_block_idx];
vec_t* B_block = &vec_B[B_block_idx];
MMA_16x8(A0_block, A1_block, B_block, acc);
}
if (kk == 0) {
save_acc(&acc[0], ii + i, jj + j);
save_acc(&acc[1], ii + i, jj + j + 4);
save_acc(&acc[2], ii + i + 4, jj + j);
save_acc(&acc[3], ii + i + 4, jj + j + 4);
save_acc(&acc[4], ii + i + 8, jj + j);
save_acc(&acc[5], ii + i + 8, jj + j + 4);
save_acc(&acc[6], ii + i + 12, jj + j);
save_acc(&acc[7], ii + i + 12, jj + j + 4);
} else {
add_save_acc(&acc[0], ii + i, jj + j);
add_save_acc(&acc[1], ii + i, jj + j + 4);
add_save_acc(&acc[2], ii + i + 4, jj + j);
add_save_acc(&acc[3], ii + i + 4, jj + j + 4);
add_save_acc(&acc[4], ii + i + 8, jj + j);
add_save_acc(&acc[5], ii + i + 8, jj + j + 4);
add_save_acc(&acc[6], ii + i + 12, jj + j);
add_save_acc(&acc[7], ii + i + 12, jj + j + 4);
}
}
}
}
void matmul_tiled(int64_t m , int64_t n, int64_t mc, int64_t nc, int64_t kc) {
int64_t ytiles = m / mc;
int64_t xtiles = n / nc;
int64_t tiles = xtiles * ytiles;
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
if (end > tiles) {
end = tiles;
}
for (int64_t job = start; job < end; ++job) {
int64_t ii = (job / xtiles) * mc;
int64_t jj = (job % xtiles) * nc;
for (int64_t kk = 0; kk < k; kk += kc) {
vec_t A_pack[kc * mc / 4];
vec_t B_pack[kc * nc / 4];
packTranspose(A + (ii * lda) + kk, lda, kc, mc, (float *)A_pack);
packTranspose(B + (jj * ldb) + kk, ldb, kc, nc, (float *)B_pack);
KERNEL(ii, jj, mc, nc, kc, A_pack, B_pack, kk);
}
}
SAVE_ACC(&acc_0, ii, jj);
SAVE_ACC(&acc_1, ii, jj+4);
SAVE_ACC(&acc_2, ii+4, jj);
SAVE_ACC(&acc_3, ii+4, jj+4);
}
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
@@ -2406,35 +2509,35 @@ class tinyBLAS_PPC {
int n_rem = MIN(n - n0, 8);
int mc = 0, nc = 0;
if (m_rem >= 8 && n_rem >= 8) {
mc = 8;
nc = 8;
gemm<8, 8>(m0, m, n0, n);
mc = 8;
nc = 8;
gemm<8, 8>(m0, m, n0, n);
} else if (m_rem >= 4 && n_rem >= 8) {
mc = 4;
nc = 8;
gemm<4, 8>(m0, m, n0, n);
mc = 4;
nc = 8;
gemm<4, 8>(m0, m, n0, n);
} else if (m_rem >= 8 && n_rem >= 4) {
mc = 8;
nc = 4;
gemm<8, 4>(m0, m, n0, n);
mc = 8;
nc = 4;
gemm<8, 4>(m0, m, n0, n);
} else if (m_rem >= 4 && n_rem >= 4) {
mc = 4;
nc = 4;
gemm<4, 4>(m0, m, n0, n);
mc = 4;
nc = 4;
gemm<4, 4>(m0, m, n0, n);
} else {
mc = (m_rem >= 4) ? 4 : m_rem;
nc = (n_rem >= 4) ? 4 : n_rem;
if (mc == 0 || nc == 0)
return;
return;
gemm_small(m0, m, n0, n, mc, nc);
}
int64_t mp = m0 + ((m - m0) / mc) * mc;
int64_t np = n0 + ((n - n0) / nc) * nc;
mnpack(mp, m, n0, np);
mnpack(m0, m, np, n);
}
}
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
@@ -2449,30 +2552,30 @@ class tinyBLAS_PPC {
vec_t vec_C[4];
acc_t acc_0;
__builtin_mma_xxsetaccz(&acc_0);
vec_t vec_A[4] {0}, vec_B[4] = {0};
for (int l=0; l<k; l+=4) {
vec_t vec_A[4] = {0}, vec_B[4] = {0};
for (int l = 0; l < k; l += 4) {
/* 'GEMV Forwarding' concept is used in first two conditional loops.
* when one of the matrix has a single row/column, the elements are
* broadcasted, instead of using packing routine to prepack the
* matrix elements.
*/
if (RM == 1) {
float* a = const_cast<float*>(A+(ii)*lda+l);
packTranspose(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B);
float * a = const_cast<float *>(A + (ii) * lda + l);
packTranspose(B + (jj * ldb) + l, ldb, RN, 4, (float *)vec_B);
vec_A[0] = (vec_t)vec_xl(0,a);
vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A+1));
vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A+2));
vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A+3));
vec_A[1] = (vec_t)vec_splats(*((float *)&vec_A+1));
vec_A[2] = (vec_t)vec_splats(*((float *)&vec_A+2));
vec_A[3] = (vec_t)vec_splats(*((float *)&vec_A+3));
} else if (RN == 1) {
packTranspose(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A);
float* b = const_cast<float*>(B+(jj)*ldb+l);
packTranspose(A + (ii * lda) + l, lda, RM, 4, (float *)vec_A);
float * b = const_cast<float *>(B + (jj) * ldb + l);
vec_B[0] = (vec_t)vec_xl(0,b);
vec_B[1] = (vec_t)vec_splats(*((float*)&vec_B+1));
vec_B[2] = (vec_t)vec_splats(*((float*)&vec_B+2));
vec_B[3] = (vec_t)vec_splats(*((float*)&vec_B+3));
vec_B[1] = (vec_t)vec_splats(*((float *)&vec_B+1));
vec_B[2] = (vec_t)vec_splats(*((float *)&vec_B+2));
vec_B[3] = (vec_t)vec_splats(*((float *)&vec_B+3));
} else {
packTranspose(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A);
packTranspose(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B);
packTranspose(A + (ii * lda) + l, lda, RM, 4, (float *)vec_A);
packTranspose(B + (jj * ldb) + l, ldb, RN, 4, (float *)vec_B);
}
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
@@ -2482,12 +2585,27 @@ class tinyBLAS_PPC {
__builtin_mma_disassemble_acc(vec_C, &acc_0);
for (int I = 0; I < RM; I++) {
for (int J = 0; J < RN; J++) {
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J);
*((float *)(C+ii+((jj+J)*ldc)+I)) = *((float *)&vec_C[I]+J);
}
}
}
}
template<int RM, int RN>
inline void kernel(int64_t ii, int64_t jj) {
if constexpr(RM == 4 && RN == 4) {
KERNEL_4x4(ii, jj);
} else if constexpr(RM == 4 && RN == 8) {
KERNEL_4x8(ii, jj);
} else if constexpr(RM == 8 && RN == 4) {
KERNEL_8x4(ii, jj);
} else if constexpr(RM == 8 && RN == 8) {
KERNEL_8x8(ii, jj);
} else {
static_assert(false, "RN/RM values not supported");
}
}
template <int RM, int RN>
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / RM;
@@ -2496,27 +2614,18 @@ class tinyBLAS_PPC {
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
if (RM == 4 && RN == 4) {
kernel = &tinyBLAS_PPC::KERNEL_4x4;
} else if (RM == 4 && RN == 8) {
kernel = &tinyBLAS_PPC::KERNEL_4x8;
} else if (RM == 8 && RN == 4) {
kernel = &tinyBLAS_PPC::KERNEL_8x4;
} else if (RM == 8 && RN == 8) {
kernel = &tinyBLAS_PPC::KERNEL_8x8;
}
if (end > tiles)
end = tiles;
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * RM;
int64_t jj = n0 + job % xtiles * RN;
(this->*kernel)(ii, jj);
kernel<RM, RN>(ii, jj);
}
}
const float *const A;
const float *const B;
float *C;
const float * const A;
const float * const B;
float * C;
const int64_t k;
const int64_t lda;
const int64_t ldb;
+2 -1
View File
@@ -4364,11 +4364,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
return (op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
#endif
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
return true;
case GGML_OP_L2_NORM:
case GGML_OP_GROUP_NORM:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_RMS_NORM:
return ((op->src[0]->ne[0] % WARP_SIZE) == 0);
case GGML_OP_SCALE:
return true;
case GGML_OP_CONT:
+1 -1
View File
@@ -1376,7 +1376,7 @@ ggml_tensor * llm_graph_context::build_attn(
// [TAG_NO_CACHE_PAD]
// TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
assert(!ubatch.equal_seqs());
assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
ggml_tensor * q = q_cur;
ggml_tensor * k = k_cur;
+18
View File
@@ -6400,6 +6400,24 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
}
}
// qwen3-30b-a3b
for (int bs : {1, 4, 8, 512}) {
for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 128, 8, false, 768, bs, 2048, 1));
}
}
}
// gpt-oss-20b
for (int bs : {1, 4, 8, 512}) {
for (ggml_type type_a : {GGML_TYPE_MXFP4}) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 32, 4, false, 2880, bs, 2880, 1));
}
}
}
for (int K : {3, 5}) {
for (int IC : {256, 2560}) {
for (int IW_IH : {32, 64, 256}) {
+3 -1
View File
@@ -55,6 +55,8 @@ add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
set(TARGET llama-mtmd-cli)
add_executable (${TARGET} mtmd-cli.cpp)
set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
install (TARGETS ${TARGET} RUNTIME)
if(NOT CMAKE_SYSTEM_NAME STREQUAL "iOS")
install(TARGETS ${TARGET} RUNTIME)
endif()
target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads)
target_compile_features(${TARGET} PRIVATE cxx_std_17)