mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-16 10:46:43 +02:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 2256f36b79 | |||
| 7359016c7c | |||
| 813416991a | |||
| 5589921ef8 | |||
| 49f44b5c55 | |||
| 6685cc41c2 | |||
| ceebbb5b21 | |||
| 6daa69ee81 |
@@ -10,7 +10,6 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
### Hot topics
|
||||
|
||||
- ⚠️ Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
|
||||
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
|
||||
- Collecting Apple Silicon performance stats:
|
||||
- M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
|
||||
@@ -291,7 +290,7 @@ In order to build llama.cpp you have three different options.
|
||||
sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
|
||||
opencl clblast openblas
|
||||
|
||||
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
|
||||
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
|
||||
```
|
||||
|
||||
**Notes:** With this packages you can build llama.cpp with OPENBLAS and
|
||||
@@ -614,9 +613,9 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
# obtain the original LLaMA model weights and place them in ./models
|
||||
ls ./models
|
||||
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
|
||||
# [Optional] for models using BPE tokenizers
|
||||
ls ./models
|
||||
65B 30B 13B 7B vocab.json
|
||||
# [Optional] for models using BPE tokenizers
|
||||
ls ./models
|
||||
65B 30B 13B 7B vocab.json
|
||||
|
||||
# install Python dependencies
|
||||
python3 -m pip install -r requirements.txt
|
||||
@@ -624,8 +623,8 @@ python3 -m pip install -r requirements.txt
|
||||
# convert the 7B model to ggml FP16 format
|
||||
python3 convert.py models/7B/
|
||||
|
||||
# [Optional] for models using BPE tokenizers
|
||||
python convert.py models/7B/ --vocabtype bpe
|
||||
# [Optional] for models using BPE tokenizers
|
||||
python convert.py models/7B/ --vocabtype bpe
|
||||
|
||||
# quantize the model to 4-bits (using q4_0 method)
|
||||
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
|
||||
|
||||
+18
-10
@@ -39,6 +39,17 @@ static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
static bool is_interacting = false;
|
||||
|
||||
static bool file_exists(const std::string &path) {
|
||||
std::ifstream f(path.c_str());
|
||||
return f.good();
|
||||
}
|
||||
|
||||
static bool file_is_empty(const std::string &path) {
|
||||
std::ifstream f;
|
||||
f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
|
||||
f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
|
||||
return f.tellg() == 0;
|
||||
}
|
||||
|
||||
static void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
@@ -215,12 +226,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (!path_session.empty()) {
|
||||
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
||||
|
||||
// fopen to check for existing session
|
||||
FILE * fp = std::fopen(path_session.c_str(), "rb");
|
||||
if (fp != NULL) {
|
||||
std::fclose(fp);
|
||||
|
||||
if (!file_exists(path_session)) {
|
||||
LOG_TEE("%s: session file does not exist, will create.\n", __func__);
|
||||
} else if (file_is_empty(path_session)) {
|
||||
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
|
||||
} else {
|
||||
// The file exists and is not empty
|
||||
session_tokens.resize(n_ctx);
|
||||
size_t n_token_count_out = 0;
|
||||
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
||||
@@ -229,10 +240,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
session_tokens.resize(n_token_count_out);
|
||||
llama_set_rng_seed(ctx, params.seed);
|
||||
|
||||
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
|
||||
} else {
|
||||
LOG_TEE("%s: session file does not exist, will create\n", __func__);
|
||||
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -36,7 +36,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
|
||||
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
|
||||
+86
-46
@@ -4,34 +4,35 @@ This example demonstrates a simple HTTP API server and a simple web front end to
|
||||
|
||||
Command line options:
|
||||
|
||||
- `--threads N`, `-t N`: Set the number of threads to use during generation.
|
||||
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
|
||||
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
|
||||
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
||||
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
|
||||
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
||||
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
|
||||
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
|
||||
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
|
||||
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
|
||||
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
|
||||
- `--numa`: Attempt optimizations that help on some NUMA systems.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
|
||||
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
|
||||
- `--port`: Set the port to listen. Default: `8080`.
|
||||
- `--path`: path from which to serve static files (default examples/server/public)
|
||||
- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
|
||||
- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s.
|
||||
- `--embedding`: Enable embedding extraction, Default: disabled.
|
||||
- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
|
||||
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
|
||||
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
|
||||
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
|
||||
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
|
||||
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
|
||||
- `--threads N`, `-t N`: Set the number of threads to use during generation.
|
||||
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
|
||||
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
|
||||
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
||||
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
|
||||
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
||||
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
|
||||
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
|
||||
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
|
||||
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
|
||||
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
|
||||
- `--numa`: Attempt optimizations that help on some NUMA systems.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
|
||||
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
|
||||
- `--port`: Set the port to listen. Default: `8080`.
|
||||
- `--path`: path from which to serve static files (default examples/server/public)
|
||||
- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
|
||||
- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s.
|
||||
- `--embedding`: Enable embedding extraction, Default: disabled.
|
||||
- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
|
||||
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
|
||||
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
|
||||
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
|
||||
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
|
||||
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
|
||||
|
||||
## Build
|
||||
|
||||
server is build alongside everything else from the root of the project
|
||||
@@ -52,21 +53,23 @@ server is build alongside everything else from the root of the project
|
||||
|
||||
To get started right away, run the following command, making sure to use the correct path for the model you have:
|
||||
|
||||
### Unix-based systems (Linux, macOS, etc.):
|
||||
### Unix-based systems (Linux, macOS, etc.)
|
||||
|
||||
```bash
|
||||
./server -m models/7B/ggml-model.gguf -c 2048
|
||||
```
|
||||
|
||||
### Windows:
|
||||
### Windows
|
||||
|
||||
```powershell
|
||||
server.exe -m models\7B\ggml-model.gguf -c 2048
|
||||
```
|
||||
|
||||
The above command will start a server that by default listens on `127.0.0.1:8080`.
|
||||
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
|
||||
|
||||
### Docker:
|
||||
### Docker
|
||||
|
||||
```bash
|
||||
docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
|
||||
|
||||
@@ -120,12 +123,13 @@ node index.js
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
- **GET** `/health`: Returns the current state of the server:
|
||||
- `{"status": "loading model"}` if the model is still being loaded.
|
||||
- `{"status": "error"}` if the model failed to load.
|
||||
- `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
|
||||
|
||||
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
|
||||
- **GET** `/health`: Returns the current state of the server:
|
||||
- `{"status": "loading model"}` if the model is still being loaded.
|
||||
- `{"status": "error"}` if the model failed to load.
|
||||
- `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
|
||||
|
||||
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
|
||||
|
||||
*Options:*
|
||||
|
||||
@@ -189,14 +193,13 @@ node index.js
|
||||
|
||||
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
|
||||
|
||||
### Result JSON:
|
||||
|
||||
* Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
|
||||
### Result JSON
|
||||
|
||||
- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
|
||||
|
||||
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"content": "<the token selected by the model>",
|
||||
"probs": [
|
||||
@@ -212,6 +215,7 @@ node index.js
|
||||
]
|
||||
},
|
||||
```
|
||||
|
||||
Notice that each `probs` is an array of length `n_probs`.
|
||||
|
||||
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
|
||||
@@ -228,7 +232,7 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
|
||||
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
|
||||
|
||||
- **POST** `/tokenize`: Tokenize a given text.
|
||||
- **POST** `/tokenize`: Tokenize a given text.
|
||||
|
||||
*Options:*
|
||||
|
||||
@@ -236,13 +240,13 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
|
||||
Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
|
||||
|
||||
- **POST** `/detokenize`: Convert tokens to text.
|
||||
- **POST** `/detokenize`: Convert tokens to text.
|
||||
|
||||
*Options:*
|
||||
|
||||
`tokens`: Set the tokens to detokenize.
|
||||
|
||||
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
|
||||
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
|
||||
|
||||
*Options:*
|
||||
|
||||
@@ -250,7 +254,7 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
|
||||
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
|
||||
|
||||
- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
|
||||
- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
|
||||
|
||||
*Options:*
|
||||
|
||||
@@ -260,9 +264,9 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
|
||||
It also accepts all the options of `/completion` except `stream` and `prompt`.
|
||||
|
||||
- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
|
||||
- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
|
||||
|
||||
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.
|
||||
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.
|
||||
|
||||
*Options:*
|
||||
|
||||
@@ -290,6 +294,7 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
|
||||
print(completion.choices[0].message)
|
||||
```
|
||||
|
||||
... or raw HTTP requests:
|
||||
|
||||
```shell
|
||||
@@ -311,6 +316,40 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
}'
|
||||
```
|
||||
|
||||
- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API.
|
||||
|
||||
*Options:*
|
||||
|
||||
See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
|
||||
|
||||
*Examples:*
|
||||
|
||||
- input as string
|
||||
|
||||
```shell
|
||||
curl http://localhost:8080/v1/embeddings \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer no-key" \
|
||||
-d '{
|
||||
"input": "hello",
|
||||
"model":"GPT-4",
|
||||
"encoding_format": "float"
|
||||
}'
|
||||
```
|
||||
|
||||
- `input` as string array
|
||||
|
||||
```shell
|
||||
curl http://localhost:8080/v1/embeddings \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer no-key" \
|
||||
-d '{
|
||||
"input": ["hello", "world"],
|
||||
"model":"GPT-4",
|
||||
"encoding_format": "float"
|
||||
}'
|
||||
```
|
||||
|
||||
## More examples
|
||||
|
||||
### Change system prompt on runtime
|
||||
@@ -362,6 +401,7 @@ python api_like_OAI.py
|
||||
```
|
||||
|
||||
After running the API server, you can use it in Python by setting the API base URL.
|
||||
|
||||
```python
|
||||
openai.api_base = "http://<Your api-server IP>:port"
|
||||
```
|
||||
|
||||
+1
-1
@@ -791,7 +791,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
||||
for (size_t i = 0; i < *n_buffers; i++) {
|
||||
ggml_backend_buffer_free(*buffers[i]);
|
||||
}
|
||||
free(buffers);
|
||||
free(*buffers);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
+252
-173
@@ -116,7 +116,7 @@ struct vk_device {
|
||||
vk_queue transfer_queue;
|
||||
uint32_t descriptor_set_mode;
|
||||
uint32_t subgroup_size;
|
||||
bool is_igpu;
|
||||
bool uma;
|
||||
};
|
||||
|
||||
struct vk_op_push_constants {
|
||||
@@ -675,7 +675,7 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_
|
||||
|
||||
vk::PhysicalDeviceMemoryProperties mem_props = vk_device.physical_device.getMemoryProperties();
|
||||
|
||||
uint32_t memory_type_index = uint32_t(~0);
|
||||
uint32_t memory_type_index = UINT32_MAX;
|
||||
|
||||
for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
|
||||
vk::MemoryType memory_type = mem_props.memoryTypes[i];
|
||||
@@ -685,7 +685,18 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_
|
||||
}
|
||||
}
|
||||
|
||||
buf.device_memory = vk_device.device.allocateMemory({ mem_req.size, memory_type_index });
|
||||
if (memory_type_index >= mem_props.memoryTypeCount) {
|
||||
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
||||
}
|
||||
|
||||
try {
|
||||
buf.device_memory = vk_device.device.allocateMemory({ mem_req.size, memory_type_index });
|
||||
} catch (const vk::SystemError& e) {
|
||||
// Out of Host/Device memory, clean up buffer
|
||||
vk_device.device.destroyBuffer(buf.buffer);
|
||||
buf.size = 0;
|
||||
throw e;
|
||||
}
|
||||
buf.memory_property_flags = req_flags;
|
||||
buf.ptr = nullptr;
|
||||
|
||||
@@ -700,6 +711,47 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_
|
||||
return buf;
|
||||
}
|
||||
|
||||
static vk_buffer ggml_vk_create_buffer_check(size_t size, vk::MemoryPropertyFlags req_flags) {
|
||||
try {
|
||||
return ggml_vk_create_buffer(size, req_flags);
|
||||
} catch (const vk::SystemError& e) {
|
||||
std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
static vk_buffer ggml_vk_create_buffer_device(size_t size) {
|
||||
vk_buffer buf;
|
||||
try {
|
||||
buf = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
} catch (const vk::SystemError& e) {
|
||||
if (vk_device.uma) {
|
||||
// Fall back to host memory type
|
||||
buf = ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
} else {
|
||||
std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
static void ggml_vk_destroy_buffer(vk_buffer& buf) {
|
||||
if (buf.size == 0) {
|
||||
return;
|
||||
}
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_destroy_buffer(" << buf.size << ")" << std::endl;
|
||||
#endif
|
||||
|
||||
buf.size = 0;
|
||||
vk_device.device.freeMemory(buf.device_memory);
|
||||
vk_device.device.destroyBuffer(buf.buffer);
|
||||
}
|
||||
|
||||
static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
|
||||
return { buf, 0, VK_WHOLE_SIZE };
|
||||
}
|
||||
@@ -738,19 +790,6 @@ static void ggml_vk_wait_events(vk::CommandBuffer& cmd_buffer, std::vector<vk::E
|
||||
);
|
||||
}
|
||||
|
||||
static void ggml_vk_destroy_buffer(vk_buffer& buf) {
|
||||
if (buf.size == 0) {
|
||||
return;
|
||||
}
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_destroy_buffer(" << buf.size << ")" << std::endl;
|
||||
#endif
|
||||
|
||||
buf.size = 0;
|
||||
vk_device.device.freeMemory(buf.device_memory);
|
||||
vk_device.device.destroyBuffer(buf.buffer);
|
||||
}
|
||||
|
||||
static bool ggml_vk_build_shader(ggml_type type) {
|
||||
switch(type) {
|
||||
case GGML_TYPE_F16:
|
||||
@@ -1015,7 +1054,7 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
|
||||
|
||||
vk_device.vendor_id = vk_device.properties.vendorID;
|
||||
vk_device.subgroup_size = subgroup_props.subgroupSize;
|
||||
vk_device.is_igpu = vk_device.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
vk_device.uma = vk_device.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
|
||||
bool fp16_storage = false;
|
||||
bool fp16_compute = false;
|
||||
@@ -1088,7 +1127,7 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
|
||||
if (vk_device.fp16) {
|
||||
device_extensions.push_back("VK_KHR_shader_float16_int8");
|
||||
}
|
||||
std::cerr << "ggml_vulkan: Using " << vk_device.properties.deviceName << " | fp16: " << vk_device.fp16 << " | warp size: " << vk_device.subgroup_size << std::endl;
|
||||
std::cerr << "ggml_vulkan: Using " << vk_device.properties.deviceName << " | uma: " << vk_device.uma << " | fp16: " << vk_device.fp16 << " | warp size: " << vk_device.subgroup_size << std::endl;
|
||||
device_create_info = {
|
||||
vk::DeviceCreateFlags(),
|
||||
device_queue_create_infos,
|
||||
@@ -1210,7 +1249,7 @@ static vk_buffer ggml_vk_pool_malloc(size_t size) {
|
||||
ggml_vk_destroy_buffer(b);
|
||||
}
|
||||
|
||||
return ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
return ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
}
|
||||
|
||||
static void ggml_vk_pool_free(vk_buffer& buffer) {
|
||||
@@ -1250,10 +1289,6 @@ static void * ggml_vk_host_malloc(size_t size) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
|
||||
#endif
|
||||
if (getenv("GGML_VK_NO_PINNED") != nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
vk_buffer buf = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
|
||||
if(!(buf.memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
|
||||
@@ -1298,6 +1333,20 @@ static void ggml_vk_host_free(void* ptr) {
|
||||
vk_pinned_memory.erase(vk_pinned_memory.begin() + index);
|
||||
}
|
||||
|
||||
static void ggml_vk_host_get(const void * ptr, vk_buffer *& buf, size_t& buf_offset) {
|
||||
buf = nullptr;
|
||||
buf_offset = 0;
|
||||
for (size_t i = 0; i < vk_pinned_memory.size(); i++) {
|
||||
const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]);
|
||||
const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]);
|
||||
if (ptr >= addr && ptr < endr) {
|
||||
buf = &std::get<2>(vk_pinned_memory[i]);
|
||||
buf_offset = ((const uint8_t *)ptr) - addr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static vk_submission ggml_vk_begin_submission(vk_queue& q, bool one_time = true) {
|
||||
vk_submission s;
|
||||
s.buffer = ggml_vk_create_cmd_buffer(q);
|
||||
@@ -1384,6 +1433,13 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
|
||||
}
|
||||
}
|
||||
|
||||
static void ensure_sync_staging_buffer(size_t size) {
|
||||
if (vk_sync_staging.size < size) {
|
||||
ggml_vk_destroy_buffer(vk_sync_staging);
|
||||
vk_sync_staging = ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl;
|
||||
@@ -1391,21 +1447,13 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size
|
||||
GGML_ASSERT(!ggml_is_contiguous(tensor));
|
||||
// Buffer is already mapped
|
||||
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||
std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
|
||||
std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
// Check if src is pinned memory
|
||||
vk_buffer* buf = nullptr;
|
||||
size_t buf_offset = 0;
|
||||
for (size_t i = 0; i < vk_pinned_memory.size(); i++) {
|
||||
const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]);
|
||||
const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]);
|
||||
if (tensor->data >= addr && tensor->data < endr) {
|
||||
buf = &std::get<2>(vk_pinned_memory[i]);
|
||||
buf_offset = ((const uint8_t *)tensor->data) - addr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
vk_buffer * buf = nullptr;
|
||||
size_t buf_offset;
|
||||
ggml_vk_host_get(tensor->data, buf, buf_offset);
|
||||
|
||||
const uint64_t ne0 = tensor->ne[0];
|
||||
const uint64_t ne1 = tensor->ne[1];
|
||||
@@ -1463,10 +1511,7 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size
|
||||
if (vk_staging.size < vk_staging_offset + copy_size) {
|
||||
if (sync_staging) {
|
||||
// Create temporary larger buffer
|
||||
if (vk_sync_staging.size < copy_size) {
|
||||
ggml_vk_destroy_buffer(vk_sync_staging);
|
||||
vk_sync_staging = ggml_vk_create_buffer(copy_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
ensure_sync_staging_buffer(copy_size);
|
||||
|
||||
staging = &vk_sync_staging;
|
||||
staging_offset = 0;
|
||||
@@ -1512,17 +1557,9 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
// Check if src is pinned memory
|
||||
vk_buffer* buf = nullptr;
|
||||
size_t buf_offset = 0;
|
||||
for (size_t i = 0; i < vk_pinned_memory.size(); i++) {
|
||||
const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]);
|
||||
const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]);
|
||||
if (src >= addr && src < endr) {
|
||||
buf = &std::get<2>(vk_pinned_memory[i]);
|
||||
buf_offset = ((const uint8_t *)src) - addr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
vk_buffer * buf = nullptr;
|
||||
size_t buf_offset;
|
||||
ggml_vk_host_get(src, buf, buf_offset);
|
||||
|
||||
if (buf != nullptr) {
|
||||
// Memory is pinned, use as staging buffer
|
||||
@@ -1555,10 +1592,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size
|
||||
const size_t copy_size = width*height;
|
||||
if (vk_staging.size < vk_staging_offset + copy_size) {
|
||||
if (sync_staging) {
|
||||
if (vk_sync_staging.size < copy_size) {
|
||||
ggml_vk_destroy_buffer(vk_sync_staging);
|
||||
vk_sync_staging = ggml_vk_create_buffer(copy_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
ensure_sync_staging_buffer(copy_size);
|
||||
|
||||
staging = &vk_sync_staging;
|
||||
staging_offset = 0;
|
||||
@@ -1633,17 +1667,9 @@ static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_
|
||||
GGML_ASSERT(height > 0);
|
||||
GGML_ASSERT(src->size > 0);
|
||||
// Check if dst is pinned memory
|
||||
vk_buffer* buf = nullptr;
|
||||
size_t buf_offset = 0;
|
||||
for (size_t i = 0; i < vk_pinned_memory.size(); i++) {
|
||||
const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]);
|
||||
const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]);
|
||||
if (dst >= addr && dst < endr) {
|
||||
buf = &std::get<2>(vk_pinned_memory[i]);
|
||||
buf_offset = ((const uint8_t *)dst) - addr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
vk_buffer * buf = nullptr;
|
||||
size_t buf_offset;
|
||||
ggml_vk_host_get(dst, buf, buf_offset);
|
||||
|
||||
std::vector<vk::BufferCopy> slices(1);
|
||||
if (width == spitch && width == dpitch) {
|
||||
@@ -1677,10 +1703,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_
|
||||
if (vk_staging.size < vk_staging_offset + copy_size) {
|
||||
if (sync_staging) {
|
||||
// Create temporary larger buffer
|
||||
if (vk_sync_staging.size < copy_size) {
|
||||
ggml_vk_destroy_buffer(vk_sync_staging);
|
||||
vk_sync_staging = ggml_vk_create_buffer(copy_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
ensure_sync_staging_buffer(copy_size);
|
||||
|
||||
staging = &vk_sync_staging;
|
||||
} else {
|
||||
@@ -1819,7 +1842,7 @@ static void ggml_vk_d2h_tensor_2d(vk_context * ctx, vk_buffer * src, size_t offs
|
||||
|
||||
static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ", " << aligned << ")";
|
||||
std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")";
|
||||
#endif
|
||||
if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
|
||||
#ifdef VK_DEBUG
|
||||
@@ -2003,8 +2026,27 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co
|
||||
const uint64_t r2 = ne12 / ne02;
|
||||
const uint64_t r3 = ne13 / ne03;
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer * d_Qx = nullptr;
|
||||
size_t qx_buf_offset = 0;
|
||||
vk_buffer * d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src0->data, d_Qx, qx_buf_offset);
|
||||
ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset);
|
||||
src0_uma = d_Qx != nullptr;
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
||||
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
||||
@@ -2034,32 +2076,24 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co
|
||||
const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
|
||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer* d_D = &extra->buffer_gpu;
|
||||
const uint64_t d_buf_offset = extra->offset;
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
|
||||
vk_buffer * d_Qx;
|
||||
uint64_t qx_buf_offset = 0;
|
||||
vk_buffer * d_Qy;
|
||||
uint64_t qy_buf_offset = 0;
|
||||
vk_buffer* d_X;
|
||||
uint64_t x_buf_offset = 0;
|
||||
vk_buffer* d_Y;
|
||||
uint64_t y_buf_offset = 0;
|
||||
if (load_x) {
|
||||
d_Qx = &vk_prealloc_qx;
|
||||
} else {
|
||||
} else if (!src0_uma) {
|
||||
d_Qx = &extra_src0->buffer_gpu;
|
||||
qx_buf_offset = extra_src0->offset;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
}
|
||||
if (load_y) {
|
||||
d_Qy = &vk_prealloc_qy;
|
||||
} else {
|
||||
} else if (!src1_uma) {
|
||||
d_Qy = &extra_src1->buffer_gpu;
|
||||
qy_buf_offset = extra_src1->offset;
|
||||
GGML_ASSERT(d_Qy != nullptr);
|
||||
@@ -2178,8 +2212,27 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0
|
||||
const uint64_t r2 = ne12 / ne02;
|
||||
const uint64_t r3 = ne13 / ne03;
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer * d_Qx = nullptr;
|
||||
size_t qx_buf_offset = 0;
|
||||
vk_buffer * d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src0->data, d_Qx, qx_buf_offset);
|
||||
ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset);
|
||||
src0_uma = d_Qx != nullptr;
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
||||
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
||||
@@ -2199,31 +2252,23 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0
|
||||
const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
|
||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer* d_D = &extra->buffer_gpu;
|
||||
const uint64_t d_buf_offset = extra->offset;
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
vk_buffer* d_Qx;
|
||||
uint32_t qx_buf_offset = 0;
|
||||
vk_buffer* d_Qy;
|
||||
uint32_t qy_buf_offset = 0;
|
||||
vk_buffer* d_X;
|
||||
uint64_t x_buf_offset = 0;
|
||||
vk_buffer* d_Y;
|
||||
uint64_t y_buf_offset = 0;
|
||||
if (load_x) {
|
||||
d_Qx = &vk_prealloc_qx;
|
||||
} else {
|
||||
} else if(!src1_uma) {
|
||||
d_Qx = &extra_src0->buffer_gpu;
|
||||
qx_buf_offset = extra_src0->offset;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
}
|
||||
if (load_y) {
|
||||
d_Qy = &vk_prealloc_qy;
|
||||
} else {
|
||||
} else if(!src1_uma) {
|
||||
d_Qy = &extra_src1->buffer_gpu;
|
||||
qy_buf_offset = extra_src1->offset;
|
||||
GGML_ASSERT(d_Qy != nullptr);
|
||||
@@ -2345,7 +2390,21 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor
|
||||
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer * d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset);
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
const uint64_t x_ne = ne00 * ne01 * ne02;
|
||||
const uint64_t y_ne = ne10 * ne11 * ne12;
|
||||
@@ -2355,22 +2414,15 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor
|
||||
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
|
||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer* d_D = &extra->buffer_gpu;
|
||||
const uint64_t d_buf_offset = extra->offset;
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
vk_buffer* d_Qx;
|
||||
vk_buffer* d_Qx = &extra_src0->buffer_gpu;
|
||||
const uint64_t qx_buf_offset = extra_src0->offset;
|
||||
vk_buffer* d_Qy;
|
||||
uint64_t qy_buf_offset = 0;
|
||||
d_Qx = &extra_src0->buffer_gpu;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
if (load_y) {
|
||||
d_Qy = &vk_prealloc_qy;
|
||||
} else {
|
||||
} else if (!src1_uma) {
|
||||
d_Qy = &extra_src1->buffer_gpu;
|
||||
qy_buf_offset = extra_src1->offset;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
@@ -2430,7 +2482,21 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor *
|
||||
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer * d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset);
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
const uint64_t d_ne = ne01 * ne11 * ne12;
|
||||
|
||||
@@ -2441,18 +2507,11 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor *
|
||||
const uint64_t qy_sz = ggml_nbytes(src1);
|
||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer* d_D = &extra->buffer_gpu;
|
||||
const uint64_t d_buf_offset = extra->offset;
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
vk_buffer* d_Qx;
|
||||
vk_buffer* d_Qx = &extra_src0->buffer_gpu;
|
||||
const uint64_t qx_buf_offset = extra_src0->offset;
|
||||
vk_buffer* d_Qy;
|
||||
uint64_t qy_buf_offset = 0;
|
||||
d_Qx = &extra_src0->buffer_gpu;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
if (load_y) {
|
||||
d_Qy = &vk_prealloc_qy;
|
||||
@@ -2709,7 +2768,8 @@ static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
|
||||
}
|
||||
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name);
|
||||
static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name);
|
||||
static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor);
|
||||
#endif
|
||||
|
||||
template<typename PC>
|
||||
@@ -2758,17 +2818,34 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm
|
||||
return;
|
||||
}
|
||||
|
||||
const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU;
|
||||
const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||
|
||||
vk_buffer * d_X = nullptr;
|
||||
size_t x_buf_offset = 0;
|
||||
vk_buffer * d_Y = nullptr;
|
||||
size_t y_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src0->data, d_X, x_buf_offset);
|
||||
src0_uma = d_X != nullptr;
|
||||
if (use_src1) {
|
||||
ggml_vk_host_get(src1->data, d_Y, y_buf_offset);
|
||||
src1_uma = d_Y != nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, vk_device.properties.limits.minStorageBufferOffsetAlignment);
|
||||
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, vk_device.properties.limits.minStorageBufferOffsetAlignment) : 0;
|
||||
uint64_t d_sz = ggml_type_size(dst->type) * ne0;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||
|
||||
// Workaround for tiny tensor inputs on ROPE
|
||||
if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > extra_src1->buffer_gpu.size) {
|
||||
y_sz = VK_WHOLE_SIZE;
|
||||
@@ -2778,20 +2855,16 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
uint64_t d_buf_offset = (extra->offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
||||
GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
|
||||
vk_buffer* d_X = nullptr;
|
||||
uint64_t x_buf_offset = 0;
|
||||
vk_buffer* d_Y = nullptr;
|
||||
uint64_t y_buf_offset = 0;
|
||||
if (transfer_src0) {
|
||||
d_X = &vk_prealloc_qx;
|
||||
} else {
|
||||
} else if(!src0_uma) {
|
||||
d_X = &extra_src0->buffer_gpu;
|
||||
x_buf_offset = extra_src0->offset;
|
||||
GGML_ASSERT(d_X != nullptr);
|
||||
}
|
||||
if (transfer_src1) {
|
||||
d_Y = &vk_prealloc_qy;
|
||||
} else if (use_src1) {
|
||||
} else if (use_src1 && !src1_uma) {
|
||||
d_Y = &extra_src1->buffer_gpu;
|
||||
y_buf_offset = extra_src1->offset;
|
||||
GGML_ASSERT(d_Y != nullptr);
|
||||
@@ -3148,13 +3221,13 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size
|
||||
if (vk_prealloc_split_k.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_split_k);
|
||||
}
|
||||
vk_prealloc_split_k = ggml_vk_create_buffer(sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_split_k = ggml_vk_create_buffer_check(sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
}
|
||||
}
|
||||
|
||||
vk_buffer d_X = ggml_vk_create_buffer(sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_Y = ggml_vk_create_buffer(sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_D = ggml_vk_create_buffer(sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_X = ggml_vk_create_buffer_check(sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_Y = ggml_vk_create_buffer_check(sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_D = ggml_vk_create_buffer_check(sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
|
||||
X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
|
||||
Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
|
||||
@@ -3315,6 +3388,10 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
|
||||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
||||
return;
|
||||
}
|
||||
i0 = std::max(i0, 5);
|
||||
i1 = std::max(i1, 5);
|
||||
i2 = std::max(i2, 0);
|
||||
i3 = std::max(i3, 0);
|
||||
fprintf(stderr, " ");
|
||||
for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
|
||||
fprintf(stderr, "%7d ", idx1);
|
||||
@@ -3376,7 +3453,7 @@ static void ggml_vk_test_h2d_nc(size_t ne0, size_t ne1, size_t ne2, size_t ne3)
|
||||
vk_context * ctx = ggml_vk_create_context(vk_device.compute_queue);
|
||||
ggml_vk_ctx_begin(ctx);
|
||||
|
||||
vk_buffer buffer = ggml_vk_create_buffer(ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer buffer = ggml_vk_create_buffer_check(ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
|
||||
ggml_vk_h2d_tensor_2d(ctx, &buffer, 0, tensor, 0, 0, ggml_nrows(tensor));
|
||||
|
||||
@@ -3439,7 +3516,7 @@ static void ggml_vk_test_transfer(size_t ne, bool pinned) {
|
||||
std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl;
|
||||
#endif
|
||||
// Check transfers are correct
|
||||
vk_buffer buffer = ggml_vk_create_buffer(sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer buffer = ggml_vk_create_buffer_check(sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
|
||||
float * x;
|
||||
float * y;
|
||||
@@ -3666,7 +3743,7 @@ void ggml_vk_preallocate_buffers() {
|
||||
std::cerr << "qx_size: " << vk_prealloc_size_qx << " qy_size: " << vk_prealloc_size_qy << " x_size: " << vk_prealloc_size_x << " y_size: " << vk_prealloc_size_y << " split_k_size: " << vk_prealloc_size_split_k << std::endl;
|
||||
#endif
|
||||
#if defined(VK_RUN_TESTS)
|
||||
vk_staging = ggml_vk_create_buffer(100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
vk_staging = ggml_vk_create_buffer_check(100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
ggml_vk_test_transfer(8192 * 1000, false);
|
||||
ggml_vk_test_transfer(8192 * 1000, true);
|
||||
|
||||
@@ -3712,42 +3789,42 @@ void ggml_vk_preallocate_buffers() {
|
||||
if (vk_prealloc_qx.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_qx);
|
||||
}
|
||||
vk_prealloc_qx = ggml_vk_create_buffer(vk_prealloc_size_qx, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_qx = ggml_vk_create_buffer_device(vk_prealloc_size_qx);
|
||||
}
|
||||
if (vk_prealloc_size_qy > 0 && vk_prealloc_qy.size < vk_prealloc_size_qy) {
|
||||
// Resize buffer
|
||||
if (vk_prealloc_qy.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_qy);
|
||||
}
|
||||
vk_prealloc_qy = ggml_vk_create_buffer(vk_prealloc_size_qy, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_qy = ggml_vk_create_buffer_device(vk_prealloc_size_qy);
|
||||
}
|
||||
if (vk_prealloc_size_x > 0 && vk_prealloc_x.size < vk_prealloc_size_x) {
|
||||
// Resize buffer
|
||||
if (vk_prealloc_x.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_x);
|
||||
}
|
||||
vk_prealloc_x = ggml_vk_create_buffer(vk_prealloc_size_x, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_x = ggml_vk_create_buffer_device(vk_prealloc_size_x);
|
||||
}
|
||||
if (vk_prealloc_size_y > 0 && vk_prealloc_y.size < vk_prealloc_size_y) {
|
||||
// Resize buffer
|
||||
if (vk_prealloc_y.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_y);
|
||||
}
|
||||
vk_prealloc_y = ggml_vk_create_buffer(vk_prealloc_size_y, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_y = ggml_vk_create_buffer_device(vk_prealloc_size_y);
|
||||
}
|
||||
if (vk_prealloc_size_split_k > 0 && vk_prealloc_split_k.size < vk_prealloc_size_split_k) {
|
||||
// Resize buffer
|
||||
if (vk_prealloc_split_k.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_split_k);
|
||||
}
|
||||
vk_prealloc_split_k = ggml_vk_create_buffer(vk_prealloc_size_split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_split_k = ggml_vk_create_buffer_device(vk_prealloc_size_split_k);
|
||||
}
|
||||
if (vk_staging_size > 0 && vk_staging.size < vk_staging_size) {
|
||||
// Resize buffer
|
||||
if (vk_staging.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_staging);
|
||||
}
|
||||
vk_staging = ggml_vk_create_buffer(vk_staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
vk_staging = ggml_vk_create_buffer_check(vk_staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4138,6 +4215,7 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
|
||||
|
||||
GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
ggml_vk_destroy_buffer(ctx->dev_buffer);
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
@@ -4163,14 +4241,6 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
|
||||
extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
|
||||
}
|
||||
|
||||
if (extra->offset + ggml_nbytes(tensor) > extra->buffer_gpu.size) {
|
||||
std::cerr << "ERROR: Trying to assign tensor " << tensor << " outside of buffer size " << ctx->dev_buffer.size << " requested offset: " << extra->offset << " tensor size: " << ggml_nbytes(tensor) << std::endl;
|
||||
if (tensor->view_src != nullptr) {
|
||||
std::cerr << "view_src: " << tensor->view_src << " extra: " << tensor->view_src->extra << std::endl;
|
||||
}
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
tensor->extra = extra;
|
||||
}
|
||||
@@ -4248,7 +4318,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
|
||||
#endif
|
||||
vk_buffer dev_buffer = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer dev_buffer = ggml_vk_create_buffer_device(size);
|
||||
|
||||
ggml_backend_vk_buffer_context * ctx = new ggml_backend_vk_buffer_context(dev_buffer);
|
||||
|
||||
@@ -4326,9 +4396,12 @@ GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffe
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
void * ptr = ggml_vk_host_malloc(size);
|
||||
|
||||
if (ptr == nullptr) {
|
||||
void * ptr = nullptr;
|
||||
try {
|
||||
ptr = ggml_vk_host_malloc(size);
|
||||
} catch (vk::SystemError& e) {
|
||||
std::cerr << "ggml_vulkan: Failed to allocate pinned memory." << std::endl;
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
// fallback to cpu buffer
|
||||
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
||||
}
|
||||
@@ -4389,7 +4462,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_vk_buffer_type() && "unsupported buffer type");
|
||||
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type() || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
@@ -4409,7 +4482,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_vk_buffer_type() && "unsupported buffer type");
|
||||
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type() || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
@@ -4429,7 +4502,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl;
|
||||
#endif
|
||||
if (dst->buffer->buft == ggml_backend_vk_buffer_type() && ggml_backend_buffer_is_vk(src->buffer)) {
|
||||
if ((dst->buffer->buft == ggml_backend_vk_buffer_type() || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
|
||||
ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
|
||||
@@ -4499,7 +4572,6 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
|
||||
|
||||
bool ok = ggml_vk_compute_forward(¶ms, node);
|
||||
if (!ok) {
|
||||
std::cerr << "Vulkan disable: " << vk_disable << std::endl;
|
||||
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
||||
}
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
@@ -4665,7 +4737,7 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
|
||||
// checks
|
||||
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<const ggml_tensor *>& done, int level = 0) {
|
||||
static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<const ggml_tensor *>& done, int level = 0) {
|
||||
if (std::find(done.begin(), done.end(), tensor) != done.end() || level > 10) {
|
||||
return;
|
||||
}
|
||||
@@ -4683,10 +4755,14 @@ void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<const gg
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
|
||||
static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
|
||||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
||||
return;
|
||||
}
|
||||
i0 = std::max(i0, 5);
|
||||
i1 = std::max(i1, 5);
|
||||
i2 = std::max(i2, 0);
|
||||
i3 = std::max(i3, 0);
|
||||
fprintf(stderr, " ");
|
||||
for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
|
||||
fprintf(stderr, "%7d ", idx1);
|
||||
@@ -4698,9 +4774,9 @@ void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, in
|
||||
if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) {
|
||||
float val;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
val = *(float *) ((char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
||||
val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
||||
} else if (tensor->type == GGML_TYPE_F16) {
|
||||
val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
|
||||
val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
|
||||
}
|
||||
fprintf(stderr, "% 7.2f ", val);
|
||||
} else {
|
||||
@@ -4711,14 +4787,16 @@ void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, in
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
|
||||
static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
|
||||
void * tensor_data = tensor->data;
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_GPU) {
|
||||
const size_t tensor_size = ggml_nbytes(tensor);
|
||||
tensor_data = malloc(tensor_size);
|
||||
|
||||
ggml_vk_buffer_read((vk_buffer *)tensor->data, 0, tensor_data, tensor_size, vk_device.transfer_queue);
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size);
|
||||
}
|
||||
|
||||
std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
|
||||
@@ -4730,10 +4808,10 @@ void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
|
||||
std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " backend=" << tensor->src[1]->backend << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
|
||||
}
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
ggml_vk_print_tensor_area(tensor, tensor->data, 5, 5, 0, 0);
|
||||
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
|
||||
std::cerr << std::endl;
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
ggml_vk_print_tensor_area(tensor, tensor->data, 5, 5, 1, 0);
|
||||
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
|
||||
std::cerr << std::endl;
|
||||
std::vector<const ggml_tensor *> done;
|
||||
ggml_vk_print_graph_origin(tensor, done);
|
||||
@@ -4743,7 +4821,7 @@ void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
|
||||
static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
|
||||
return;
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU);
|
||||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
||||
@@ -4779,7 +4857,7 @@ void * comp_result;
|
||||
size_t comp_size;
|
||||
size_t comp_nb[GGML_MAX_DIMS];
|
||||
size_t check_counter = 0;
|
||||
void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) {
|
||||
static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
@@ -4796,8 +4874,9 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
||||
ggml_tensor * src1 = tensor->src[1];
|
||||
|
||||
struct ggml_init_params iparams = {
|
||||
.mem_size = 1024*1024*1024,
|
||||
.mem_buffer = NULL,
|
||||
/*.mem_size =*/ 1024*1024*1024,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
|
||||
struct ggml_context * ctx = ggml_init(iparams);
|
||||
@@ -4829,7 +4908,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
||||
for (int i3 = 0; i3 < src0->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < src0->ne[2]; i2++) {
|
||||
const int idx = i3*src0->ne[2] + i2;
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1], vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4842,7 +4921,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
||||
if (offset + src0_size >= extra->buffer_gpu.size) {
|
||||
src0_size = extra->buffer_gpu.size - offset;
|
||||
}
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset, src0_clone->data, src0_size, vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset, src0_clone->data, src0_size);
|
||||
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
}
|
||||
} else {
|
||||
@@ -4872,7 +4951,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
||||
for (int i3 = 0; i3 < src1->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < src1->ne[2]; i2++) {
|
||||
const int idx = i3*src1->ne[2] + i2;
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1], vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4885,7 +4964,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
||||
if (offset + src1_size >= extra->buffer_gpu.size) {
|
||||
src1_size = extra->buffer_gpu.size - offset;
|
||||
}
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset, src1_clone->data, src1_size, vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset, src1_clone->data, src1_size);
|
||||
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
}
|
||||
} else {
|
||||
@@ -4969,7 +5048,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
||||
} else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
|
||||
if (src1 == nullptr) {
|
||||
tensor_clone = ggml_dup(ctx, src0_clone);
|
||||
tensor_clone->type == tensor->type;
|
||||
tensor_clone->type = tensor->type;
|
||||
} else {
|
||||
tensor_clone = ggml_cpy(ctx, src0_clone, src1_clone);
|
||||
}
|
||||
@@ -5046,7 +5125,7 @@ void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor)
|
||||
tensor_size = extra->buffer_gpu.size - (extra->offset);
|
||||
}
|
||||
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size, vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size);
|
||||
}
|
||||
|
||||
float first_error_result = -1.0f;
|
||||
|
||||
@@ -4136,7 +4136,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
if (ggml_vk_has_device() && params.n_gpu_layers > 0 && (
|
||||
if (params.n_gpu_layers > 0 && (
|
||||
!(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
|
||||
|| !(
|
||||
model.ftype == LLAMA_FTYPE_ALL_F32 ||
|
||||
@@ -4145,8 +4145,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
|
||||
)
|
||||
)) {
|
||||
// disable Vulkan due to unsupported model architecture or quantization type
|
||||
// TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
|
||||
LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
|
||||
params.n_gpu_layers = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user