mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-29 17:17:40 +02:00
Compare commits
19 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ff76e18516 | |||
| 39f852f440 | |||
| 2b00fa7997 | |||
| d6a04f872d | |||
| c9c8575a1a | |||
| df4b7945ae | |||
| 449ccfb6f5 | |||
| 1b28061400 | |||
| 8db003a19d | |||
| 0996c5597f | |||
| 5bb2c5dbd2 | |||
| 67155ab7f5 | |||
| 5af118efda | |||
| d2b496bff4 | |||
| b34e023480 | |||
| 51b6038636 | |||
| cb9c933eb2 | |||
| 6cd4e03444 | |||
| 8d300bd35f |
@@ -434,7 +434,7 @@ endif
|
||||
# TODO: probably these flags need to be tweaked on some architectures
|
||||
# feel free to update the Makefile for your architecture and send a pull request or issue
|
||||
|
||||
ifndef RISCV
|
||||
ifndef RISCV_CROSS_COMPILE
|
||||
|
||||
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
|
||||
# Use all CPU extensions that are available:
|
||||
@@ -514,7 +514,12 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),)
|
||||
MK_CXXFLAGS += -mlasx
|
||||
endif
|
||||
|
||||
else
|
||||
ifneq ($(filter riscv64%,$(UNAME_M)),)
|
||||
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
endif
|
||||
|
||||
else # RISC-V CROSS COMPILATION
|
||||
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
endif
|
||||
|
||||
@@ -89,6 +89,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
|
||||
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
|
||||
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
|
||||
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
|
||||
|
||||
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
|
||||
|
||||
|
||||
+14
-16
@@ -818,7 +818,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
[](gpt_params & params) {
|
||||
params.special = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(llama_arg(
|
||||
{"-cnv", "--conversation"},
|
||||
format(
|
||||
@@ -1417,20 +1417,18 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
||||
} else if (arg_next == "layer") {
|
||||
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
}
|
||||
else if (arg_next == "row") {
|
||||
} else if (arg_next == "row") {
|
||||
#ifdef GGML_USE_SYCL
|
||||
fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
|
||||
exit(1);
|
||||
#endif // GGML_USE_SYCL
|
||||
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
throw std::invalid_argument("invalid value");
|
||||
}
|
||||
#ifndef GGML_USE_CUDA_SYCL_VULKAN
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
|
||||
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
|
||||
}
|
||||
}
|
||||
));
|
||||
add_opt(llama_arg(
|
||||
@@ -1450,14 +1448,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
}
|
||||
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
||||
if (i < split_arg.size()) {
|
||||
params.tensor_split[i] = std::stof(split_arg[i]);
|
||||
params.tensor_split[i] = std::stof(split_arg[i]);
|
||||
} else {
|
||||
params.tensor_split[i] = 0.0f;
|
||||
params.tensor_split[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
#ifndef GGML_USE_CUDA_SYCL_VULKAN
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
|
||||
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
|
||||
}
|
||||
}
|
||||
));
|
||||
add_opt(llama_arg(
|
||||
@@ -1465,9 +1463,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
|
||||
[](gpt_params & params, int value) {
|
||||
params.main_gpu = value;
|
||||
#ifndef GGML_USE_CUDA_SYCL_VULKAN
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
|
||||
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
|
||||
}
|
||||
}
|
||||
));
|
||||
add_opt(llama_arg(
|
||||
|
||||
+33
-16
@@ -56,14 +56,6 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
|
||||
#define GGML_USE_CUDA_SYCL
|
||||
#endif
|
||||
|
||||
#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
|
||||
#define GGML_USE_CUDA_SYCL_VULKAN
|
||||
#endif
|
||||
|
||||
#if defined(LLAMA_USE_CURL)
|
||||
#ifdef __linux__
|
||||
#include <linux/limits.h>
|
||||
@@ -949,11 +941,37 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
|
||||
|
||||
#ifdef LLAMA_USE_CURL
|
||||
|
||||
#define CURL_MAX_RETRY 3
|
||||
#define CURL_RETRY_DELAY_SECONDS 2
|
||||
|
||||
|
||||
static bool starts_with(const std::string & str, const std::string & prefix) {
|
||||
// While we wait for C++20's std::string::starts_with...
|
||||
return str.rfind(prefix, 0) == 0;
|
||||
}
|
||||
|
||||
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
|
||||
int remaining_attempts = max_attempts;
|
||||
|
||||
while (remaining_attempts > 0) {
|
||||
fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
if (res == CURLE_OK) {
|
||||
return true;
|
||||
}
|
||||
|
||||
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
|
||||
fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
||||
|
||||
remaining_attempts--;
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
|
||||
|
||||
// Initialize libcurl
|
||||
@@ -1057,9 +1075,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl.get());
|
||||
if (res != CURLE_OK) {
|
||||
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1134,11 +1151,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
||||
};
|
||||
|
||||
// start the download
|
||||
fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
||||
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
||||
auto res = curl_easy_perform(curl.get());
|
||||
if (res != CURLE_OK) {
|
||||
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
|
||||
fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
||||
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1812,6 +1828,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
||||
|
||||
@@ -2771,6 +2771,8 @@ class Rwkv6Model(Model):
|
||||
self.gguf_writer.add_tokenizer_model("rwkv")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["num_hidden_layers"]
|
||||
|
||||
@@ -31,6 +31,7 @@ import re
|
||||
import requests
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
|
||||
from hashlib import sha256
|
||||
from enum import IntEnum, auto
|
||||
@@ -125,12 +126,27 @@ def download_model(model):
|
||||
if tokt == TOKENIZER_TYPE.UGM:
|
||||
files.append("spiece.model")
|
||||
|
||||
for file in files:
|
||||
save_path = f"models/tokenizers/{name}/{file}"
|
||||
if os.path.isfile(save_path):
|
||||
logger.info(f"{name}: File {save_path} already exists - skipping")
|
||||
continue
|
||||
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
|
||||
if os.path.isdir(repo):
|
||||
# If repo is a path on the file system, copy the directory
|
||||
for file in files:
|
||||
src_path = os.path.join(repo, file)
|
||||
dst_path = f"models/tokenizers/{name}/{file}"
|
||||
if os.path.isfile(dst_path):
|
||||
logger.info(f"{name}: File {dst_path} already exists - skipping")
|
||||
continue
|
||||
if os.path.isfile(src_path):
|
||||
shutil.copy2(src_path, dst_path)
|
||||
logger.info(f"{name}: Copied {src_path} to {dst_path}")
|
||||
else:
|
||||
logger.warning(f"{name}: Source file {src_path} does not exist")
|
||||
else:
|
||||
# If repo is a URL, download the files
|
||||
for file in files:
|
||||
save_path = f"models/tokenizers/{name}/{file}"
|
||||
if os.path.isfile(save_path):
|
||||
logger.info(f"{name}: File {save_path} already exists - skipping")
|
||||
continue
|
||||
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
|
||||
|
||||
|
||||
for model in models:
|
||||
|
||||
@@ -3,32 +3,10 @@
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// mutates the input string
|
||||
static std::vector<int> parse_list(char * p) {
|
||||
std::vector<int> ret;
|
||||
|
||||
char * q = p;
|
||||
|
||||
while (*p) {
|
||||
if (*p == ',') {
|
||||
*p = '\0';
|
||||
ret.push_back(std::atoi(q));
|
||||
q = p + 1;
|
||||
}
|
||||
|
||||
++p;
|
||||
}
|
||||
|
||||
ret.push_back(std::atoi(q));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
LOG_TEE("\nexample usage:\n");
|
||||
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
|
||||
|
||||
@@ -183,7 +183,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
|
||||
|
||||
TENSOR_DUMP(gf->nodes[0]);
|
||||
TENSOR_DUMP(ggml_graph_node(gf, 0));
|
||||
|
||||
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
|
||||
|
||||
@@ -224,7 +224,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
|
||||
// Let's use the F32 result from above as a reference for the quantized multiplication
|
||||
float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
|
||||
float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0));
|
||||
|
||||
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
|
||||
printf("=====================================================================================\n");
|
||||
@@ -252,7 +252,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// Check that the matrix multiplication result is in the right ballpark
|
||||
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
|
||||
float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
|
||||
float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0));
|
||||
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
|
||||
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
|
||||
|
||||
|
||||
@@ -226,8 +226,8 @@ static ggml_status compute_piter(
|
||||
result.eigenvectors.resize(params.n_batch);
|
||||
result.distances.resize(params.n_batch);
|
||||
// get output nodes
|
||||
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||
auto node = gf->nodes[i];
|
||||
for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
|
||||
auto node = ggml_graph_node(gf, i);
|
||||
int iter = -1;
|
||||
// find b_tensor (without copying data from device)
|
||||
if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
|
||||
|
||||
@@ -370,7 +370,7 @@ struct lora_merge_ctx {
|
||||
|
||||
// write data to output file
|
||||
{
|
||||
auto result = gf->nodes[gf->n_nodes - 1];
|
||||
auto * result = ggml_graph_node(gf, -1);
|
||||
size_t len = ggml_nbytes(result);
|
||||
if (read_buf.size() < len) {
|
||||
read_buf.resize(len);
|
||||
|
||||
@@ -2449,7 +2449,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
ggml_backend_graph_compute(ctx->backend, gf);
|
||||
|
||||
// the last node is the embedding tensor
|
||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
|
||||
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
|
||||
|
||||
// copy the embeddings to the location passed by the user
|
||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||
|
||||
@@ -184,7 +184,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
|
||||
ggml_build_forward_expand(gf, flatten);
|
||||
ggml_graph_compute_with_ctx(model.ctx, gf, 1);
|
||||
struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
|
||||
struct ggml_tensor* result = ggml_graph_node(gf, -1);
|
||||
|
||||
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
|
||||
// append without newline tokens (default behavior in llava_arch when not using unpad ):
|
||||
|
||||
@@ -18,8 +18,8 @@ struct llava_context {
|
||||
};
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
LOG_TEE("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
||||
@@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
set(TARGET llama-quantize)
|
||||
add_executable(${TARGET} quantize.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -4,33 +4,23 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
GGML_SYCL_SINGLE_GPU=1
|
||||
else
|
||||
GGML_SYCL_DEVICE=0
|
||||
GGML_SYCL_SINGLE_GPU=0
|
||||
fi
|
||||
|
||||
#export GGML_SYCL_DEBUG=1
|
||||
|
||||
|
||||
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
|
||||
|
||||
if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
|
||||
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
MODEL_FILE=llama-2-7b.Q4_0.gguf
|
||||
NGL=33
|
||||
|
||||
if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
echo "use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use signle GPU only
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
|
||||
else
|
||||
#use multiple GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0
|
||||
fi
|
||||
|
||||
#use main GPU only
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
|
||||
#use multiple GPUs with same max compute units
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
|
||||
Generated
+10
-10
@@ -5,11 +5,11 @@
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1725024810,
|
||||
"narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=",
|
||||
"lastModified": 1725234343,
|
||||
"narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "af510d4a62d071ea13925ce41c95e3dec816c01d",
|
||||
"rev": "567b938d64d4b4112ee253b9274472dc3a346eb6",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1724819573,
|
||||
"narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=",
|
||||
"lastModified": 1725634671,
|
||||
"narHash": "sha256-v3rIhsJBOMLR8e/RNWxr828tB+WywYIoajrZKFM+0Gg=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "71e91c409d1e654808b2621f28a327acfdad8dc2",
|
||||
"rev": "574d1eac1c200690e27b8eb4e24887f8df7ac27c",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -36,14 +36,14 @@
|
||||
},
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"lastModified": 1722555339,
|
||||
"narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
|
||||
"lastModified": 1725233747,
|
||||
"narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
|
||||
},
|
||||
"original": {
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
|
||||
+25
-63
@@ -358,6 +358,7 @@ extern "C" {
|
||||
|
||||
struct ggml_object;
|
||||
struct ggml_context;
|
||||
struct ggml_cgraph;
|
||||
|
||||
// NOTE: always add types at the end of the enum to keep backward compatibility
|
||||
enum ggml_type {
|
||||
@@ -575,23 +576,9 @@ extern "C" {
|
||||
GGML_TENSOR_FLAG_PARAM = 4,
|
||||
};
|
||||
|
||||
// ggml object
|
||||
struct ggml_object {
|
||||
size_t offs;
|
||||
size_t size;
|
||||
|
||||
struct ggml_object * next;
|
||||
|
||||
enum ggml_object_type type;
|
||||
|
||||
char padding[4];
|
||||
};
|
||||
|
||||
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
||||
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
enum ggml_type type;
|
||||
|
||||
GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
|
||||
|
||||
@@ -655,7 +642,7 @@ extern "C" {
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
@@ -671,35 +658,6 @@ extern "C" {
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
enum ggml_cgraph_eval_order {
|
||||
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
||||
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
||||
GGML_CGRAPH_EVAL_ORDER_COUNT
|
||||
};
|
||||
|
||||
typedef uint32_t ggml_bitset_t;
|
||||
|
||||
struct ggml_hash_set {
|
||||
size_t size;
|
||||
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
|
||||
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
|
||||
};
|
||||
|
||||
// computation graph
|
||||
struct ggml_cgraph {
|
||||
int size;
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
|
||||
struct ggml_tensor ** nodes;
|
||||
struct ggml_tensor ** grads;
|
||||
struct ggml_tensor ** leafs;
|
||||
|
||||
struct ggml_hash_set visited_hash_set;
|
||||
|
||||
enum ggml_cgraph_eval_order order;
|
||||
};
|
||||
|
||||
// scratch buffer
|
||||
struct ggml_scratch {
|
||||
size_t offs;
|
||||
@@ -2017,8 +1975,6 @@ extern "C" {
|
||||
typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
|
||||
typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
|
||||
|
||||
#define GGML_N_TASKS_MAX -1
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_map_custom1(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -2088,30 +2044,35 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * tensor);
|
||||
|
||||
|
||||
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
|
||||
|
||||
// graph allocation in a context
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
|
||||
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
|
||||
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
||||
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
|
||||
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
||||
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
||||
|
||||
GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
|
||||
GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
|
||||
GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
|
||||
GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
|
||||
|
||||
GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API size_t ggml_graph_overhead(void);
|
||||
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
||||
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
|
||||
GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
@@ -2509,6 +2470,7 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_gpublas (void);
|
||||
GGML_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||
GGML_API int ggml_cpu_has_riscv_v (void);
|
||||
GGML_API int ggml_cpu_has_sycl (void);
|
||||
GGML_API int ggml_cpu_has_rpc (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-blas.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include <cstring>
|
||||
#include <mutex>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-cann/aclnn_ops.h"
|
||||
#include "ggml-cann/common.h"
|
||||
@@ -1942,7 +1943,7 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
|
||||
GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_cann_set_device(ctx->device);
|
||||
ggml_backend_t cann_backend =
|
||||
new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
|
||||
/* .interface = */ ggml_backend_cann_interface,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#include "ggml-cuda.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-cuda/common.cuh"
|
||||
|
||||
@@ -26,7 +26,11 @@ void ggml_cuda_op_mul_mat_q(
|
||||
// nrows_dst == nrows of the matrix that the kernel writes into
|
||||
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
|
||||
|
||||
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
|
||||
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
|
||||
// Also its fixup needs to allocate a temporary buffer in the memory pool.
|
||||
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
|
||||
const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
|
||||
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
|
||||
@@ -2742,6 +2742,7 @@ struct mmq_args {
|
||||
int64_t ne00; int64_t ne01; int64_t stride01;
|
||||
int64_t ne10; int64_t ne11; int64_t stride11;
|
||||
int64_t ne0;
|
||||
bool use_stream_k;
|
||||
};
|
||||
|
||||
template<ggml_type type>
|
||||
@@ -2777,8 +2778,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
|
||||
const dim3 block_nums_xy_tiling(nty, ntx, 1);
|
||||
|
||||
const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
|
||||
if (!use_stream_k) {
|
||||
if (!args.use_stream_k) {
|
||||
if (args.ne01 % mmq_y == 0) {
|
||||
constexpr bool need_check = false;
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
|
||||
|
||||
Vendored
-39
@@ -130,42 +130,3 @@
|
||||
#define cudaKernelNodeParams musaKernelNodeParams
|
||||
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
|
||||
#define cudaStreamEndCapture musaStreamEndCapture
|
||||
|
||||
// XXX: Clang builtins mapping
|
||||
#define __vsub4 __vsub4_musa
|
||||
#define __vcmpeq4 __vcmpeq4_musa
|
||||
#define __vcmpne4 __vcmpne4_musa
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
#endif
|
||||
|
||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||
|
||||
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
|
||||
return __vsubss4(a, b);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
@@ -629,8 +629,16 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
#endif
|
||||
|
||||
enum ggml_cgraph_eval_order {
|
||||
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
||||
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
||||
GGML_CGRAPH_EVAL_ORDER_COUNT
|
||||
};
|
||||
|
||||
// bitset
|
||||
|
||||
typedef uint32_t ggml_bitset_t;
|
||||
|
||||
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
|
||||
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
|
||||
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
|
||||
@@ -656,6 +664,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
|
||||
#define GGML_HASHSET_FULL ((size_t)-1)
|
||||
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
|
||||
|
||||
struct ggml_hash_set {
|
||||
size_t size;
|
||||
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
|
||||
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
|
||||
};
|
||||
|
||||
struct ggml_hash_set ggml_hash_set_new(size_t size);
|
||||
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
|
||||
|
||||
@@ -745,6 +759,24 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
// computation graph
|
||||
|
||||
struct ggml_cgraph {
|
||||
int size;
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
|
||||
struct ggml_tensor ** nodes;
|
||||
struct ggml_tensor ** grads;
|
||||
struct ggml_tensor ** leafs;
|
||||
|
||||
struct ggml_hash_set visited_hash_set;
|
||||
|
||||
enum ggml_cgraph_eval_order order;
|
||||
};
|
||||
|
||||
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-kompute.h"
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#import "ggml-metal.h"
|
||||
|
||||
#import "ggml-impl.h"
|
||||
#import "ggml-backend-impl.h"
|
||||
#import "ggml.h"
|
||||
|
||||
#import <Foundation/Foundation.h>
|
||||
|
||||
@@ -882,7 +882,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
// create multiple command buffers and enqueue them
|
||||
// then, we encode the graph into the command buffers in parallel
|
||||
|
||||
const int n_nodes = gf->n_nodes;
|
||||
const int n_nodes = gf->n_nodes;
|
||||
const int n_cb = ctx->n_cb;
|
||||
const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#include "ggml-rpc.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include <cinttypes>
|
||||
|
||||
@@ -33,7 +33,7 @@
|
||||
#include <sycl/half_type.hpp>
|
||||
|
||||
#include "ggml-sycl.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-sycl/backend.hpp"
|
||||
@@ -5137,13 +5137,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_CLAMP:
|
||||
return true;
|
||||
case GGML_OP_CONT:
|
||||
return op->src[0]->type != GGML_TYPE_BF16;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
return true;
|
||||
case GGML_OP_ROPE:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_IM2COL:
|
||||
// TODO: add support for the new F32 operations
|
||||
return op->src[0]->type == GGML_TYPE_F16;
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_ARGSORT:
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-vulkan-shaders.hpp"
|
||||
|
||||
+87
-33
@@ -287,6 +287,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
||||
#define GGML_DEBUG 0
|
||||
#define GGML_GELU_FP16
|
||||
#define GGML_GELU_QUICK_FP16
|
||||
#define GGML_N_TASKS_MAX (-1)
|
||||
|
||||
#define GGML_SOFT_MAX_UNROLL 4
|
||||
#define GGML_VEC_DOT_UNROLL 2
|
||||
@@ -1120,21 +1121,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
||||
#define GGML_F32x4_ADD vaddq_f32
|
||||
#define GGML_F32x4_MUL vmulq_f32
|
||||
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
|
||||
#define GGML_F32x4_REDUCE(res, x) \
|
||||
{ \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
||||
} \
|
||||
res = GGML_F32x4_REDUCE_ONE(x[0]); \
|
||||
#define GGML_F32x4_REDUCE(res, x) \
|
||||
{ \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
(res) = GGML_F32x4_REDUCE_ONE((x)[0]); \
|
||||
}
|
||||
|
||||
#define GGML_F32_VEC GGML_F32x4
|
||||
@@ -1161,30 +1162,30 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
||||
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
|
||||
#define GGML_F16x8_ADD vaddq_f16
|
||||
#define GGML_F16x8_MUL vmulq_f16
|
||||
#define GGML_F16x8_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F16_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
||||
} \
|
||||
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
|
||||
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
|
||||
res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
|
||||
#define GGML_F16x8_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F16_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
|
||||
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
|
||||
(res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
|
||||
} while (0)
|
||||
|
||||
#define GGML_F16_VEC GGML_F16x8
|
||||
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
|
||||
@@ -1893,6 +1894,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
||||
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
|
||||
#endif
|
||||
|
||||
//
|
||||
// ggml object
|
||||
//
|
||||
|
||||
struct ggml_object {
|
||||
size_t offs;
|
||||
size_t size;
|
||||
|
||||
struct ggml_object * next;
|
||||
|
||||
enum ggml_object_type type;
|
||||
|
||||
char padding[4];
|
||||
};
|
||||
|
||||
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
||||
|
||||
//
|
||||
// ggml context
|
||||
//
|
||||
@@ -19161,6 +19179,34 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
|
||||
ggml_hash_set_reset(&cgraph->visited_hash_set);
|
||||
}
|
||||
|
||||
int ggml_graph_size(struct ggml_cgraph * cgraph) {
|
||||
return cgraph->size;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
|
||||
if (i < 0) {
|
||||
GGML_ASSERT(cgraph->n_nodes + i >= 0);
|
||||
return cgraph->nodes[cgraph->n_nodes + i];
|
||||
}
|
||||
|
||||
GGML_ASSERT(i < cgraph->n_nodes);
|
||||
return cgraph->nodes[i];
|
||||
}
|
||||
|
||||
struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
|
||||
return cgraph->nodes;
|
||||
}
|
||||
|
||||
int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
|
||||
return cgraph->n_nodes;
|
||||
}
|
||||
|
||||
void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
|
||||
GGML_ASSERT(cgraph->size > cgraph->n_nodes);
|
||||
cgraph->nodes[cgraph->n_nodes] = tensor;
|
||||
cgraph->n_nodes++;
|
||||
}
|
||||
|
||||
// Android's libc implementation "bionic" does not support setting affinity
|
||||
#if defined(__gnu_linux__)
|
||||
static void set_numa_thread_affinity(int thread_n) {
|
||||
@@ -23242,6 +23288,14 @@ int ggml_cpu_has_arm_fma(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_riscv_v(void) {
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_metal(void) {
|
||||
#if defined(GGML_USE_METAL)
|
||||
return 1;
|
||||
|
||||
+30
-25
@@ -9877,8 +9877,8 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
|
||||
// find result_norm tensor for input
|
||||
struct ggml_tensor * inp = nullptr;
|
||||
for (int i = gf->n_nodes - 1; i >= 0; --i) {
|
||||
inp = gf->nodes[i];
|
||||
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
|
||||
inp = ggml_graph_node(gf, i);
|
||||
if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
|
||||
break;
|
||||
} else {
|
||||
@@ -16076,19 +16076,21 @@ static int llama_decode_internal(
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
||||
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
const auto & model = lctx.model;
|
||||
const auto & hparams = model.hparams;
|
||||
const auto & cparams = lctx.cparams;
|
||||
|
||||
GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
|
||||
|
||||
if (batch_all.token) {
|
||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
||||
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(n_tokens_all <= cparams.n_batch);
|
||||
|
||||
GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
|
||||
@@ -16205,8 +16207,8 @@ static int llama_decode_internal(
|
||||
ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
|
||||
|
||||
// the output is always the last tensor in the graph
|
||||
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
||||
struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
|
||||
struct ggml_tensor * res = ggml_graph_node(gf, -1);
|
||||
struct ggml_tensor * embd = ggml_graph_node(gf, -2);
|
||||
|
||||
if (lctx.n_outputs == 0) {
|
||||
// no output
|
||||
@@ -16215,9 +16217,9 @@ static int llama_decode_internal(
|
||||
} else if (cparams.embeddings) {
|
||||
res = nullptr; // do not extract logits for embedding case
|
||||
embd = nullptr;
|
||||
for (int i = gf->n_nodes - 1; i >= 0; --i) {
|
||||
if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
|
||||
embd = gf->nodes[i];
|
||||
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
|
||||
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
|
||||
embd = ggml_graph_node(gf, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -16375,19 +16377,21 @@ static int llama_encode_internal(
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
const auto & model = lctx.model;
|
||||
const auto & hparams = model.hparams;
|
||||
const auto & cparams = lctx.cparams;
|
||||
|
||||
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
||||
|
||||
if (batch.token) {
|
||||
for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
|
||||
GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
|
||||
|
||||
@@ -16432,15 +16436,15 @@ static int llama_encode_internal(
|
||||
// there are two cases here
|
||||
if (llama_model_has_decoder(&lctx.model)) {
|
||||
// first case is an encoder-decoder T5 model where embeddings are passed to decoder
|
||||
embd = gf->nodes[gf->n_nodes - 1];
|
||||
embd = ggml_graph_node(gf, -1);
|
||||
GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
|
||||
} else {
|
||||
// second case is an encoder-only T5 model
|
||||
if (cparams.embeddings) {
|
||||
// only output embeddings if required
|
||||
embd = gf->nodes[gf->n_nodes - 1];
|
||||
embd = ggml_graph_node(gf, -1);
|
||||
if (strcmp(embd->name, "result_embd_pooled") != 0) {
|
||||
embd = gf->nodes[gf->n_nodes - 2];
|
||||
embd = ggml_graph_node(gf, -2);
|
||||
}
|
||||
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
|
||||
}
|
||||
@@ -18488,7 +18492,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
|
||||
// note: the number of splits during measure is higher than during inference due to the kv shift
|
||||
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
|
||||
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
|
||||
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, ggml_graph_n_nodes(gf));
|
||||
LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
|
||||
}
|
||||
}
|
||||
@@ -20668,6 +20672,7 @@ const char * llama_print_system_info(void) {
|
||||
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
||||
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
||||
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
||||
s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
|
||||
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
||||
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
||||
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
||||
|
||||
@@ -519,7 +519,7 @@ struct test_case {
|
||||
|
||||
// add sentinels as graph nodes so that they are checked in the callback
|
||||
for (ggml_tensor * sentinel : sentinels) {
|
||||
gf->nodes[gf->n_nodes++] = sentinel;
|
||||
ggml_graph_add_node(gf, sentinel);
|
||||
}
|
||||
|
||||
// randomize tensors
|
||||
@@ -679,9 +679,9 @@ struct test_case {
|
||||
|
||||
// duplicate the op
|
||||
size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
|
||||
int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
|
||||
int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
|
||||
for (int i = 1; i < n_runs; i++) {
|
||||
gf->nodes[gf->n_nodes++] = out;
|
||||
ggml_graph_add_node(gf, out);
|
||||
}
|
||||
|
||||
// calculate memory
|
||||
@@ -696,11 +696,11 @@ struct test_case {
|
||||
}
|
||||
return size;
|
||||
};
|
||||
for (int i = 0; i < gf->n_nodes; i++) {
|
||||
if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
|
||||
for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
|
||||
if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) {
|
||||
continue;
|
||||
}
|
||||
mem += tensor_op_size(gf->nodes[i]);
|
||||
mem += tensor_op_size(ggml_graph_node(gf, i));
|
||||
}
|
||||
|
||||
// run
|
||||
@@ -804,7 +804,7 @@ struct test_case {
|
||||
ggml_graph_cpy(gf, gb);
|
||||
ggml_build_backward_expand(ctx, gf, gb, false);
|
||||
if (expect.size() != 1 || expect[0] != 0.0f) {
|
||||
GGML_ASSERT(gb->n_nodes > gf->n_nodes);
|
||||
GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user