correct cmd name

cann: fix buffer_num and slow runtime speed error (#8865)
readme : add ramalama to the available UIs (#8811)
2026-07-02 02:27:41 +02:00 · 2024-08-06 00:15:33 +08:00 · 2024-08-05 21:10:37 +08:00 · 2024-08-05 15:45:01 +03:00
4 changed files with 17 additions and 15 deletions
@@ -153,6 +153,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
 - [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
+- [ramalama](https://github.com/containers/ramalama) (MIT)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
@@ -6,4 +6,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force


-.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
+.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
@@ -1670,10 +1670,6 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
                    // TODO: fix me
                    // Current groupsize should not be greater than k-1 in
                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
-                    if (op->src[0]->ne[0]-1 > QK8_0) {
-                        return true;
-                    }
-                    return false;
                case GGML_TYPE_Q4_0:
                    return true;
                default:
@@ -12,6 +12,9 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
                                int64_t *input_ne_ub, size_t *input_nb_ub,
                                int64_t *output_ne_ub) {
+        // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
+        //                         permute=[0,0,0,0]):
+        // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();

@@ -61,13 +64,13 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
        pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
        pipe.InitBuffer(output_queue, BUFFER_NUM,
                            Group_Size * sizeof(int8_t) / 2);
-        pipe.InitBuffer(cast_queue , BUFFER_NUM, Group_Size * sizeof(float));
-        pipe.InitBuffer(work_queue, BUFFER_NUM, Group_Size*sizeof(float));
-        pipe.InitBuffer(max_queue, BUFFER_NUM, Group_Size*sizeof(float));
-        pipe.InitBuffer(min_queue, BUFFER_NUM, Group_Size*sizeof(float));
-        pipe.InitBuffer(scale_queue, BUFFER_NUM, 16*sizeof(half));
-        pipe.InitBuffer(int8_queue, BUFFER_NUM, Group_Size * sizeof(int8_t));
-        pipe.InitBuffer(half_queue, BUFFER_NUM, Group_Size * sizeof(half));
+        pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
+        pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
+        pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
    }

    __aicore__ inline void copy_in(uint32_t offset) {
@@ -178,13 +181,15 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
            for (int64_t j = 0; j < group_size_in_row; j++) {
                half scale = calculate_group(i, j);
                scale_local.SetValue(scale_local_offset++, scale);
-                if (scale_local_offset == 16) {
+                // Copy Group_Size/2 length data each time.
+                if (scale_local_offset == Group_Size / 2) {
                    scale_local_offset = 0;
                    // TODO: OPTIMIZE ME
                    pipe_barrier(PIPE_ALL);
-                    DataCopy(scale_gm[scale_global_offset], scale_local, 16);
+                    DataCopy(scale_gm[scale_global_offset], scale_local,
+                                      Group_Size / 2);
                    pipe_barrier(PIPE_ALL);
-                    scale_global_offset += 16;
+                    scale_global_offset += Group_Size / 2;
                }
            }
        }
Author	SHA1	Message	Date
Neo Zhang	16dab13bde	correct cmd name	2024-08-06 00:15:33 +08:00
wangshuai09	bc0f887e15	cann: fix buffer_num and slow runtime speed error (#8865)	2024-08-05 21:10:37 +08:00
Eric Curtin	b42978e7e4	readme : add ramalama to the available UIs (#8811) ramalama is a repo-agnostic, boring CLI tool that supports pulling from ollama, huggingface and OCI registries. Signed-off-by: Eric Curtin <ecurtin@redhat.com>	2024-08-05 15:45:01 +03:00