Compare commits

..

13 Commits

Author SHA1 Message Date
Georgi Gerganov 5ab6c2132a server-parallel : add "--reverse-prompt" + compiler warning fixes 2023-10-06 14:32:19 +03:00
FSSRepo afc09db51c fix json format README 2023-10-05 15:23:58 -04:00
FSSRepo eb75395b5c remove trail whitespace 2023-10-05 15:18:47 -04:00
FSSRepo a7a6ceb7ae server handling multiple clients with cam 2023-10-05 15:12:39 -04:00
cebtenzzre 48edda30ee convert : update Falcon script for new HF config (#3448)
Also adds Falcon-180B support.
Closes #3049

Co-authored-by: jb <jonathan.t.barnard@gmail.com>
2023-10-05 15:00:34 -04:00
Kenvix ⭐ 45eba9369f build : use std::make_tuple() for compatibility with older GCC versions (#3488) 2023-10-05 20:16:39 +03:00
staviq acec9eaaa9 common : process escape sequences in reverse prompts (#3461) 2023-10-05 19:17:29 +03:00
shibe2 e2583cbc29 CLBlast: Fix handling of on-device tensor data
Fix uploading tensor data to device, including 3D, 4D, and non-contiguous tensors.
Use correct offsets into data that is already in VRAM.
Correct handling of OpenCL events when multiple commands are queued.
2023-10-05 18:25:23 +04:00
Jhen-Jie Hong e8b8d32e86 server : fix incorrect num_tokens_predicted (#3480) 2023-10-05 17:02:55 +03:00
Jhen-Jie Hong 8f3a642ec1 swift : disable ACCELERATE_NEW_LAPACK (#3481) 2023-10-05 17:00:07 +03:00
Jhen-Jie Hong 0745384449 ci : add swift build via xcodebuild (#3482) 2023-10-05 16:56:21 +03:00
Kerfuffle 019ba1dcd0 convert : fix Baichuan2 models by using vocab size in config.json (#3299)
Use local GGUF package when possible in Baichuan converter
2023-10-04 17:20:28 +03:00
Georgi Gerganov beabc8cfb0 readme : add project status link 2023-10-04 16:50:44 +03:00
13 changed files with 1399 additions and 96 deletions
+23
View File
@@ -253,6 +253,29 @@ jobs:
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
macOS-latest-swift:
runs-on: macos-latest
strategy:
matrix:
destination: ['platform=macOS,name=Any Mac', 'platform=iOS,name=Any iOS Device', 'platform=tvOS,name=Any tvOS Device']
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: xcodebuild for swift package
id: xcodebuild
run: |
xcodebuild -scheme llama -destination "${{ matrix.destination }}"
windows-latest-cmake:
runs-on: windows-latest
+6 -3
View File
@@ -44,9 +44,12 @@ let package = Package(
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32"]),
.define("GGML_USE_K_QUANTS"),
.define("GGML_USE_ACCELERATE"),
.define("ACCELERATE_NEW_LAPACK"),
.define("ACCELERATE_LAPACK_ILP64")
.define("GGML_USE_ACCELERATE")
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
] + additionalSettings,
linkerSettings: [
.linkedFramework("Accelerate")
+1 -1
View File
@@ -5,7 +5,7 @@
[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
+5 -2
View File
@@ -361,7 +361,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
params.lora_adapter.push_back({argv[i], 1.0f});
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
params.use_mmap = false;
} else if (arg == "--lora-scaled") {
if (++i >= argc) {
@@ -373,7 +373,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
params.use_mmap = false;
} else if (arg == "--lora-base") {
if (++i >= argc) {
@@ -616,6 +616,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
process_escapes(params.prompt);
process_escapes(params.input_prefix);
process_escapes(params.input_suffix);
for (auto & antiprompt : params.antiprompt) {
process_escapes(antiprompt);
}
}
return true;
+8 -2
View File
@@ -11,11 +11,14 @@ import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any
import itertools
import gguf
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
if TYPE_CHECKING:
from typing import TypeAlias
@@ -174,8 +177,11 @@ if not tokenizer_model_file.is_file():
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
vocab_size = hparams.get('vocab_size')
if vocab_size is None:
vocab_size = tokenizer.vocab_size()
for i in range(tokenizer.vocab_size()):
for i in range(vocab_size):
text: bytes
score: float
+66 -51
View File
@@ -4,6 +4,7 @@
from __future__ import annotations
import argparse
import contextlib
import json
import os
import struct
@@ -20,10 +21,10 @@ if 'NO_LOCAL_GGUF' not in os.environ:
import gguf
def count_model_parts(dir_model: Path) -> int:
def count_model_parts(dir_model: Path, prefix: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
if filename.startswith(prefix):
num_parts += 1
if num_parts > 0:
@@ -77,20 +78,26 @@ print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "RWForCausalLM":
if hparams["architectures"][0] != "FalconForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model)
num_parts = count_model_parts(dir_model, "model-00")
if num_parts:
is_safetensors = True
from safetensors import safe_open
else:
is_safetensors = False
num_parts = count_model_parts(dir_model, "pytorch_model-")
ARCH=gguf.MODEL_ARCH.FALCON
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["n_layer"]
block_count = hparams["num_hidden_layers"]
gguf_writer.add_name("Falcon")
gguf_writer.add_context_length(2048) # not in config.json
@@ -98,9 +105,9 @@ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
if "n_head_kv" in hparams:
gguf_writer.add_head_count_kv(hparams["n_head_kv"])
gguf_writer.add_head_count(hparams["num_attention_heads"])
if "num_kv_heads" in hparams:
gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
else:
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
@@ -146,8 +153,8 @@ special_vocab.add_to_gguf(gguf_writer)
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
n_head = hparams["num_attention_heads"]
n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1
head_dim = hparams["hidden_size"] // n_head
@@ -156,6 +163,10 @@ print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
elif is_safetensors:
part_names = (
f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
)
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
@@ -165,60 +176,64 @@ for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(dir_model / part_name, map_location="cpu")
if is_safetensors:
ctx = safe_open(dir_model / part_name, framework="pt", device="cpu")
else:
ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu"))
for name in model_part.keys():
data = model_part[name]
with ctx as model_part:
for name in model_part.keys():
data = model_part.get_tensor(name) if is_safetensors else model_part[name]
old_dtype = data.dtype
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
# QKV tensor transform
# The original query_key_value tensor contains n_head_kv "kv groups",
# each consisting of n_head/n_head_kv query weights followed by one key
# and one value weight (shared by all query heads in the kv group).
# This layout makes it a big pain to work with in GGML.
# So we rearrange them here,, so that we have n_head query weights
# followed by n_head_kv key weights followed by n_head_kv value weights,
# in contiguous fashion.
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
# QKV tensor transform
# The original query_key_value tensor contains n_head_kv "kv groups",
# each consisting of n_head/n_head_kv query weights followed by one key
# and one value weight (shared by all query heads in the kv group).
# This layout makes it a big pain to work with in GGML.
# So we rearrange them here,, so that we have n_head query weights
# followed by n_head_kv key weights followed by n_head_kv value weights,
# in contiguous fashion.
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
if "query_key_value" in name:
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
data = torch.cat((q,k,v)).reshape_as(data)
if "query_key_value" in name:
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
data = torch.cat((q,k,v)).reshape_as(data)
data = data.squeeze().numpy()
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
+1
View File
@@ -35,6 +35,7 @@ else()
endif()
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
add_subdirectory(server-parallel)
endif()
add_subdirectory(export-lora)
endif()
+15
View File
@@ -0,0 +1,15 @@
set(TARGET server-parallel)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp ../server/json.hpp ../server/httplib.h)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
+66
View File
@@ -0,0 +1,66 @@
# llama.cpp/example/server-parallel
This example demonstrates a PoC HTTP API server that handles simulataneus requests. Long prompts are not supported.
## Quick Start
To get started right away, run the following command, making sure to use the correct path for the model you have:
### Unix-based systems (Linux, macOS, etc.):
```bash
./server-parallel -m models/7B/ggml-model.gguf --ctx_size 2048 -t 4 -ngl 33 --batch-size 512 --parallel 3 -n 512 --cont-batching
```
### Windows:
```powershell
server-parallel.exe -m models\7B\ggml-model.gguf --ctx_size 2048 -t 4 -ngl 33 --batch-size 512 --parallel 3 -n 512 --cont-batching
```
The above command will start a server that by default listens on `127.0.0.1:8080`.
## API Endpoints
- **GET** `/props`: Return the user and assistant name for generate the prompt.
*Response:*
```json
{
"user_name": "User:",
"assistant_name": "Assistant:"
}
```
- **POST** `/completion`: Given a prompt, it returns the predicted completion, just streaming mode.
*Options:*
`temperature`: Adjust the randomness of the generated text (default: 0.1).
`prompt`: Provide a prompt as a string, It should be a coherent continuation of the system prompt.
`system_prompt`: Provide a system prompt as a string.
`anti_prompt`: Provide the name of the user coherent with the system prompt.
`assistant_name`: Provide the name of the assistant coherent with the system prompt.
*Example request:*
```json
{
"system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nHuman: Hello\nAssistant: Hi, how may I help you?\nHuman:",
"anti_prompt": "Human:",
"assistant_name": "Assistant:",
"prompt": "When is the day of independency of US?",
"temperature": 0.2
}
```
*Response:*
```json
{
"content": "<token_str>"
}
```
# This example is a Proof of Concept, have some bugs and unexpected behaivors, this not supports long prompts.
+263
View File
@@ -0,0 +1,263 @@
const char* system_prompt_default =
R"(Transcript of a never ending dialog, where the User interacts with an Assistant.
The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
User: Recommend a nice restaurant in the area.
Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
User: Who is Richard Feynman?
Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
User:)";
const char* index_html_ = R"(
<!DOCTYPE html>
<html>
<head>
<title>llama.cpp - server parallel PoC</title>
<script src="index.js"></script>
</head>
<body>
<div style="width: 90%;margin: auto;">
<h2>Server parallel - PoC</h2>
<form id="myForm">
<input type="checkbox" id="system_promt_cb" name="myCheckbox" onchange="toggleSP() ">
<label for="system_promt_cb">Use custom system prompt</label>
<br>
<div id="system_prompt_view" style="display: none;">
<textarea id="sp_text" name="systemPrompt" style="width: 100%;height: 4rem;" placeholder="System Prompt"></textarea>
<label for="user_name">User name</label>
<input type="text" id="user_name" value="" placeholder="Anti prompt" required>
<label for="assistant_name">Assistant name</label>
<input type="text" id="assistant_name" value="" placeholder="Assistant:" required>
<button type="button" id="btn_reset" onclick="clearSP() " >Clear all</button>
</div>
<br>
<label for="slot_id">Slot ID (-1 load in a idle slot)</label>
<input type="number" id="slot_id" value="-1" required>
<br>
<label for="temperature">Temperature</label>
<input type="number" id="temperature" value="0.1" required>
<br>
<label for="message">Message</label>
<input id="message" style="width: 80%;" required>
<br><br>
<button type="button" id="btn_send" onclick="perform() " >Send</button>
<br>
<br>
<button type="button" id="btn_reset" onclick="resetBtn() " >Reset</button>
</form>
<div id="conversation_view">
</div>
</div>
</body>
</html>
)";
const char* index_js_ = R"(
let conversation = [];
let current_message = -1;
const questions = [
"Who is Elon Musk?",
"Who is Jeff Bezos?",
"How to get a job at google?",
"What are you?",
"When was born Abraham Lincoln?",
];
let user_name = "";
let assistant_name = "";
function toggleSP() {
if(document.getElementById("system_promt_cb").checked) {
document.getElementById("system_prompt_view").style.display = "block";
} else {
document.getElementById("system_prompt_view").style.display = "none";
}
}
function clearSP() {
document.getElementById("sp_text").value = "";
document.getElementById("anti_prompt").value = "";
document.getElementById("assistant_name").value = "";
}
docReady(async () => {
document.getElementById("message").value =
questions[Math.floor(Math.random() * questions.length)];
// to keep the same prompt format in all clients
const response = await fetch("/props");
if (!response.ok) {
alert(`HTTP error! Status: ${response.status}`);
}
const data = await response.json();
user_name = data.user_name;
assistant_name = data.assistant_name;
});
function docReady(fn) {
// see if DOM is already available
if (
document.readyState === "complete" ||
document.readyState === "interactive"
) {
// call on next available tick
setTimeout(fn, 1);
} else {
document.addEventListener("DOMContentLoaded", fn);
}
}
function updateView() {
let conv_view = document.getElementById("conversation_view");
// build view
conv_view.innerHTML = "";
for (let index in conversation) {
conversation[index].assistant = conversation[index].assistant.replace(
user_name,
""
);
conv_view.innerHTML += `
<p><span style="font-weight: bold">User:</span> ${conversation[index].user}<p>
<p style="white-space: pre-line;"><span style="font-weight: bold">Assistant:</span> ${conversation[index].assistant}<p>`;
}
}
async function call_llama(options) {
const response = await fetch("/completion", {
method: "POST",
body: JSON.stringify(options),
headers: {
Connection: "keep-alive",
"Content-Type": "application/json",
Accept: "text/event-stream",
},
});
const reader = response.body.getReader();
let cont = true;
const decoder = new TextDecoder();
let leftover = ""; // Buffer for partially read lines
try {
let cont = true;
while (cont) {
const result = await reader.read();
if (result.done) {
document.getElementById("btn_send").disabled = false;
break;
}
// Add any leftover data to the current chunk of data
const text = leftover + decoder.decode(result.value);
// Check if the last character is a line break
const endsWithLineBreak = text.endsWith("\n");
// Split the text into lines
let lines = text.split("\n");
// If the text doesn't end with a line break, then the last line is incomplete
// Store it in leftover to be added to the next chunk of data
if (!endsWithLineBreak) {
leftover = lines.pop();
} else {
leftover = ""; // Reset leftover if we have a line break at the end
}
// Parse all sse events and add them to result
const regex = /^(\S+):\s(.*)$/gm;
for (const line of lines) {
const match = regex.exec(line);
if (match) {
result[match[1]] = match[2];
// since we know this is llama.cpp, let's just decode the json in data
if (result.data) {
result.data = JSON.parse(result.data);
conversation[current_message].assistant += result.data.content;
updateView();
}
}
}
}
} catch (e) {
if (e.name !== "AbortError") {
console.error("llama error: ", e);
}
throw e;
}
}
function generatePrompt() {
// generate a good prompt to have coherence
let prompt = "";
for (let index in conversation) {
if (index == 0) {
prompt += conversation[index].user + "\n";
} else {
prompt += user_name + conversation[index].user + "\n";
}
if (index == current_message) {
prompt += assistant_name;
} else {
prompt += assistant_name + conversation[index].assistant;
}
}
return prompt;
}
function resetBtn() {
document.getElementById("slot_id").value = "-1";
document.getElementById("temperature").value = "0.1";
document.getElementById("message").value =
questions[Math.floor(Math.random() * questions.length)];
document.getElementById("conversation_view").innerHTML = "";
conversation = [];
current_message = -1;
}
async function perform() {
var slot_id = parseInt(document.getElementById("slot_id").value);
var temperature = parseFloat(document.getElementById("temperature").value);
var prompt = " " + document.getElementById("message").value;
if (!isNaN(slot_id) && !isNaN(temperature) && prompt.length > 0) {
let options = {
slot_id,
temperature
};
if(document.getElementById("system_promt_cb").checked) {
let system_prompt = document.getElementById("sp_text").value;
let anti_prompt = document.getElementById("user_name").value;
let assistant_name_ = document.getElementById("assistant_name").value;
if(!system_prompt || !anti_prompt || !assistant_name_) {
document.getElementById("conversation_view").innerText =
"please, insert valid props.";
return;
}
conversation = [];
current_message = -1;
document.getElementById("system_promt_cb").checked = false;
document.getElementById("system_promt_cb").dispatchEvent(new Event("change"));
options.system_prompt = system_prompt;
options.anti_prompt = anti_prompt;
options.assistant_name = assistant_name_;
user_name = anti_prompt;
assistant_name = assistant_name_;
}
current_message++;
conversation.push({
user: prompt,
assistant: "",
});
updateView();
document.getElementById("message").value = "";
document.getElementById("btn_send").disabled = true;
options.prompt = generatePrompt();
await call_llama(options);
} else {
document.getElementById("conversation_view").innerText =
"please, insert valid props.";
}
}
)";
+884
View File
@@ -0,0 +1,884 @@
#include "frontend.h"
#include "common.h"
#include "llama.h"
#include "../server/httplib.h"
#include "../server/json.hpp"
#include <iostream>
#include <sstream>
#include <thread>
#include <vector>
#include <chrono>
using namespace httplib;
using namespace std;
using namespace nlohmann;
struct server_params
{
std::string hostname = "127.0.0.1";
std::string public_path = "examples/server/public";
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
};
// utils functions taken of examples/server
static bool ends_with(const std::string &str, const std::string &suffix)
{
return str.size() >= suffix.size() &&
0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
}
static size_t find_partial_stop_string(const std::string &stop,
const std::string &text)
{
if (!text.empty() && !stop.empty())
{
const char text_last_char = text.back();
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
{
if (stop[char_index] == text_last_char)
{
const std::string current_partial = stop.substr(0, char_index + 1);
if (ends_with(text, current_partial))
{
return text.size() - char_index - 1;
}
}
}
}
return std::string::npos;
}
enum stop_type
{
STOP_FULL,
STOP_PARTIAL,
};
enum slot_state
{
IDLE,
PROCESSING
};
enum slot_command {
NONE,
LOAD_PROMPT,
RELEASE
};
struct llama_client_slot
{
int id;
int32_t n_prompt = 0;
int32_t n_decoded = 0;
int32_t i_batch = -1;
string prompt = "";
string sampled_token_str;
string generated_text = "";
llama_token sampled;
std::vector<llama_token> tokens_prev;
slot_state state = IDLE;
slot_command command = NONE;
bool newToken = false;
float temperature = 0.1f;
void start(string prompt_, float temp_) {
prompt = prompt_;
command = LOAD_PROMPT;
temperature = temp_;
newToken = false;
}
bool hasNewToken() {
if(newToken) {
newToken = false;
return true;
}
return false;
}
bool available() {
return state == IDLE && command == NONE;
}
void nofity() {
newToken = !newToken;
}
void release() {
if(state == PROCESSING) {
command = RELEASE;
}
}
};
struct server_parallel_context {
// example props
vector<llama_client_slot> slots;
std::string system_prompt = "";
bool update_system_prompt = true;
// broadcast to all clients to keep the same prompt format
std::string user_name = ""; // this should be the anti prompt
std::string assistant_name = ""; // this is for generate the prompt
// llama native props
gpt_params params;
llama_model *model = NULL;
llama_context *ctx = NULL;
int n_ctx;
int n_vocab;
std::vector<llama_token_data> candidates;
std::vector<llama_token> tokens_system;
int32_t n_tokens_system = 0;
llama_batch batch;
bool loadModel(gpt_params params_) {
params = params_;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr)
{
LOG_TEE("unable to load model: %s", params.model.c_str());
return false;
}
n_ctx = llama_n_ctx(ctx);
n_vocab = llama_n_vocab(model);
candidates.reserve(n_vocab);
return true;
}
void initialize() {
// create slots
LOG_TEE("Available slots:\n");
for (int i = 0; i < params.n_parallel; i++)
{
llama_client_slot slot;
slot.id = i;
slot.prompt = "default";
slot.state = IDLE;
slot.tokens_prev.resize(std::max(256, params.n_predict));
std::fill(slot.tokens_prev.begin(), slot.tokens_prev.end(), 0);
LOG_TEE(" - slot %i\n", slot.id);
slots.push_back(slot);
}
batch = llama_batch_init(params.n_ctx, 0);
// always assign a default system prompt
system_prompt = system_prompt_default;
user_name = "User:";
assistant_name = "Assistant:";
params.antiprompt.push_back(user_name);
}
void updateSystemPrompt() {
tokens_system = ::llama_tokenize(ctx, system_prompt, true);
n_tokens_system = tokens_system.size();
batch.n_tokens = n_tokens_system;
// clear the entire KV cache
for (int i = 0; i < params.n_parallel; ++i)
{
llama_kv_cache_seq_rm(ctx, i, 0, -1);
}
for (int32_t i = 0; i < batch.n_tokens; ++i)
{
batch.token[i] = tokens_system[i];
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
}
if (llama_decode(ctx, batch) != 0)
{
LOG_TEE("%s: llama_decode() failed\n", __func__);
return;
}
// assign the system KV cache to all parallel sequences
for (int32_t i = 1; i < params.n_parallel; ++i)
{
llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system);
}
LOG_TEE("system prompt updated\n");
update_system_prompt = false;
}
void notifySystemPromptChanged() {
// release all slots
for (llama_client_slot &slot : slots)
{
slot.release();
}
waitAllAreIdle();
// wait until system prompt load
update_system_prompt = true;
while(update_system_prompt) {
this_thread::sleep_for(chrono::milliseconds(5));
}
// system prompt loaded, continue
}
llama_client_slot* requestCompletion(json data) {
if(data.contains("system_prompt") &&
data.contains("anti_prompt") &&
data.contains("assistant_name")) {
system_prompt = data.value("system_prompt", "");
user_name = data.value("anti_prompt", "");
assistant_name = data.value("assistant_name", "");
params.antiprompt.clear();
params.antiprompt.push_back(user_name);
notifySystemPromptChanged();
}
int slot_id = data.value("slot_id", -1);
float temperature = data.value("temperature", 0.1f);
string prompt = data.value("prompt", "");
for (llama_client_slot & slot : slots)
{
if ((slot_id == -1 && slot.available()) || slot.id == slot_id)
{
slot.start(prompt, temperature);
LOG_TEE("slot %i is processing\n", slot.id);
return &slot; // return a pointer to slot (thread safe?)
}
}
return nullptr;
}
size_t findStoppingStrings(const std::string &text, const size_t last_token_size,
const stop_type type)
{
size_t stop_pos = std::string::npos;
for (const std::string &word : params.antiprompt)
{
size_t pos;
if (type == STOP_FULL)
{
const size_t tmp = word.size() + last_token_size;
const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
pos = text.find(word, from_pos);
}
else
{
pos = find_partial_stop_string(word, text);
}
if (pos != std::string::npos &&
(stop_pos == std::string::npos || pos < stop_pos))
{
stop_pos = pos;
}
}
return stop_pos;
}
void waitAllAreIdle() {
bool wait = true;
while(wait) {
wait = false;
for (auto &slot : slots)
{
if (!slot.available())
{
wait = true;
break;
}
}
}
}
bool updateSlots() {
// update the system prompt wait until all slots are idle state
if(update_system_prompt) {
updateSystemPrompt();
}
batch.n_tokens = 0;
// decode any currently ongoing sequences
for (auto & slot : slots) {
if (slot.state == PROCESSING && slot.command == RELEASE)
{
LOG_TEE("slot %i released\n", slot.id);
llama_kv_cache_seq_rm(ctx, slot.id, n_tokens_system, n_ctx);
slot.state = IDLE;
slot.command = NONE;
continue;
}
// no decode wait until the token had been send to client
// improves performance and avoid decoherence?
if (slot.state == IDLE || slot.newToken) {
continue;
}
batch.token [batch.n_tokens] = slot.sampled;
batch.pos [batch.n_tokens] = n_tokens_system + slot.n_prompt + slot.n_decoded;
batch.seq_id[batch.n_tokens] = slot.id;
batch.logits[batch.n_tokens] = true;
slot.n_decoded += 1;
slot.i_batch = batch.n_tokens;
batch.n_tokens += 1;
}
// assign workload to the slots
if (params.cont_batching || batch.n_tokens == 0) {
for (llama_client_slot & slot : slots) {
// need process the prompt
if (slot.state == IDLE && slot.command == LOAD_PROMPT) {
slot.state = PROCESSING;
slot.command = NONE;
//LOG_TEE("slot %i process prompt:\n%s%s'------------------------------\n", slot.id, system_prompt.c_str(), slot.prompt.c_str());
std::fill(slot.tokens_prev.begin(), slot.tokens_prev.end(), 0);
// do not prepend BOS because we have a system prompt!
std::vector<llama_token> tokens_prompt;
tokens_prompt = ::llama_tokenize(ctx, slot.prompt, false);
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
batch.token [batch.n_tokens] = tokens_prompt[i];
batch.pos [batch.n_tokens] = i + n_tokens_system;
batch.seq_id[batch.n_tokens] = slot.id;
batch.logits[batch.n_tokens] = false;
batch.n_tokens += 1;
}
// extract the logits only for the last token
if (batch.n_tokens > 0) {
batch.logits[batch.n_tokens - 1] = true;
}
slot.n_prompt = tokens_prompt.size();
slot.n_decoded = 0;
slot.i_batch = batch.n_tokens - 1;
// insert new requests one-by-one
//if (cont_batching) {
// break;
//}
}
}
}
if (batch.n_tokens == 0) {
return true;
}
// process in chunks of params.n_batch
int32_t n_batch = params.n_batch;
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
// experiment: process in powers of 2
//if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
// n_batch /= 2;
// i -= n_batch;
// continue;
//}
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,
batch.pos + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
const int ret = llama_decode(ctx, batch_view);
if (ret != 0) {
if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
return false;
}
LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
// retry with half the batch size to try to find a free slot in the KV cache
n_batch /= 2;
i -= n_batch;
continue;
}
for (auto & slot : slots) {
if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
continue;
}
params.temp = slot.temperature;
const llama_token id = llama_sample_token(ctx, NULL, NULL, params, slot.tokens_prev, candidates, slot.i_batch - i);
// remember which tokens were sampled - used for repetition penalties during sampling
slot.tokens_prev.erase(slot.tokens_prev.begin());
slot.tokens_prev.push_back(id);
const std::string token_str = llama_token_to_piece(ctx, id);
slot.generated_text += token_str;
slot.sampled = id;
size_t stop_pos =
findStoppingStrings(slot.generated_text, token_str.size(), STOP_FULL);
slot.sampled_token_str = token_str;
// notify new token
slot.nofity();
if (slot.n_decoded > 2 &&
(id == llama_token_eos(ctx) ||
(params.n_predict > 0 &&
slot.n_decoded + slot.n_prompt >=
params.n_predict) ||
stop_pos != std::string::npos)) {
//LOG_TEE("slot %i generated text:\n%s'------------------------------\n", slot.id, slot.generated_text.c_str());
slot.generated_text.clear();
slot.release();
}
slot.i_batch = -1;
}
}
return true;
}
};
static void server_print_usage(const char *argv0, const gpt_params &params,
const server_params &sparams)
{
printf("usage: %s [options]\n", argv0);
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
if (llama_mlock_supported())
{
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
if (llama_mmap_supported())
{
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
printf(" --numa attempt optimizations that help on some NUMA systems\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" -ngl N, --n-gpu-layers N\n");
printf(" number of layers to store in VRAM\n");
printf(" -ts SPLIT --tensor-split SPLIT\n");
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
printf(" -nommq, --no-mul-mat-q\n");
printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -a ALIAS, --alias ALIAS\n");
printf(" set an alias for the model, will be added as `model` field in completion response\n");
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
// new arguments
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" -f FNAME, --file FNAME\n");
printf(" load a system prompt from a file.\n");
printf("\n");
}
static void server_params_parse(int argc, char **argv, server_params &sparams,
gpt_params &params)
{
gpt_params default_params;
server_params default_sparams;
std::string arg;
bool invalid_param = false;
for (int i = 1; i < argc; i++)
{
arg = argv[i];
if (arg == "--port")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
sparams.port = std::stoi(argv[i]);
}
else if (arg == "--host")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
sparams.hostname = argv[i];
}
else if (arg == "--path")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
sparams.public_path = argv[i];
}
else if (arg == "--timeout" || arg == "-to")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
sparams.read_timeout = std::stoi(argv[i]);
sparams.write_timeout = std::stoi(argv[i]);
}
else if (arg == "-m" || arg == "--model")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.model = argv[i];
}
else if (arg == "-a" || arg == "--alias")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.model_alias = argv[i];
}
else if (arg == "-h" || arg == "--help")
{
server_print_usage(argv[0], default_params, default_sparams);
exit(0);
}
else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
}
else if (arg == "--rope-freq-base")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.rope_freq_base = std::stof(argv[i]);
}
else if (arg == "--rope-freq-scale")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.rope_freq_scale = std::stof(argv[i]);
}
else if (arg == "--memory-f32" || arg == "--memory_f32")
{
params.memory_f16 = false;
}
else if (arg == "--threads" || arg == "-t")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_threads = std::stoi(argv[i]);
}
else if (arg == "-b" || arg == "--batch-size")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_batch = std::stoi(argv[i]);
params.n_batch = std::min(512, params.n_batch);
}
else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params.n_gpu_layers = std::stoi(argv[i]);
#else
LOG_TEE("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
"See main README.md for information on enabling GPU BLAS support\n");
#endif
}
else if (arg == "--tensor-split" || arg == "-ts")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
std::string arg_next = argv[i];
// split string by , and /
const std::regex regex{R"([,/]+)"};
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device)
{
if (i_device < split_arg.size())
{
params.tensor_split[i_device] = std::stof(split_arg[i_device]);
}
else
{
params.tensor_split[i_device] = 0.0f;
}
}
#else
LOG_TEE("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
}
else if (arg == "--no-mul-mat-q" || arg == "-nommq")
{
#ifdef GGML_USE_CUBLAS
params.mul_mat_q = false;
#else
LOG_TEE("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
#endif // GGML_USE_CUBLAS
}
else if (arg == "--main-gpu" || arg == "-mg")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
params.main_gpu = std::stoi(argv[i]);
#else
LOG_TEE("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.");
#endif
}
else if (arg == "--lora")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.lora_adapter.push_back({argv[i], 1.0f});
params.use_mmap = false;
}
else if (arg == "--lora-scaled")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
const char * lora_adapter = argv[i];
if (++i >= argc)
{
invalid_param = true;
break;
}
params.lora_adapter.push_back(make_tuple(lora_adapter, std::stof(argv[i])));
params.use_mmap = false;
}
else if (arg == "--lora-base")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.lora_base = argv[i];
}
else if (arg == "--mlock")
{
params.use_mlock = true;
}
else if (arg == "--no-mmap")
{
params.use_mmap = false;
}
else if (arg == "--numa")
{
params.numa = true;
} else if (arg == "-cb" || arg == "--cont-batching")
{
params.cont_batching = true;
}
else if (arg == "-np" || arg == "--parallel")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_parallel = std::stoi(argv[i]);
} else if (arg == "-n" || arg == "--n-predict")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_predict = std::stoi(argv[i]);
} else if (arg == "-r" || arg == "--reverse-prompt")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.antiprompt.push_back(argv[i]);
}
else
{
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
server_print_usage(argv[0], default_params, default_sparams);
exit(1);
}
}
if (invalid_param)
{
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
server_print_usage(argv[0], default_params, default_sparams);
exit(1);
}
}
int main(int argc, char **argv)
{
gpt_params params;
server_params sparams;
server_params_parse(argc, argv, sparams, params);
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("server-parallel", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
llama_backend_init(params.numa);
// load the target model
params.logits_all = true;
server_parallel_context llama;
if(!llama.loadModel(params)) {
return 1;
}
llama.initialize();
Server svr;
svr.Get("/", [&](const Request & /*req*/, Response &res)
{ res.set_content(index_html_, "text/html"); });
svr.Get("/index.js", [&](const Request & /*req*/, Response &res)
{ res.set_content(index_js_, "text/html"); });
svr.Get("/props", [&llama](const Request & /*req*/, Response &res)
{
json data = {
{ "user_name", llama.user_name.c_str() },
{ "assistant_name", llama.assistant_name.c_str() }
};
res.set_content(data.dump(), "application/json"); });
svr.Post("/completion", [&llama](const Request &req, Response &res)
{
llama_client_slot* slot = llama.requestCompletion(json::parse(req.body));
// Verify if the slot exist
if (slot) {
res.set_chunked_content_provider("text/event-stream",
[slot](size_t /*offset*/, DataSink &sink) {
if(slot->available()) { // slot has been released
sink.done();
return false;
}
if(slot->hasNewToken()) { // new token notification
stringstream ss;
json res_d = {{ "content", slot->sampled_token_str }};
ss << "data: " << res_d.dump() << "\n\n";
string result = ss.str();
if(!sink.write(result.c_str(), result.size())) {
slot->release();
return false;
}
}
return true;
});
} else {
LOG_TEE("slot unavailable\n");
res.status = 404;
res.set_content("slot_error", "text/plain");
} });
thread t([&llama]()
{
bool running = true;
while (running)
{
running = llama.updateSlots();
} });
svr.set_read_timeout(sparams.read_timeout);
svr.set_write_timeout(sparams.write_timeout);
if (!svr.bind_to_port(sparams.hostname, sparams.port))
{
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
return 1;
}
// Set the base directory for serving static files
svr.set_base_dir(sparams.public_path);
// to make it ctrl+clickable:
printf("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
if (!svr.listen_after_bind())
{
return 1;
}
}
+7 -5
View File
@@ -504,9 +504,11 @@ struct llama_server_context
});
}
bool tg = true;
while (n_past < embd.size())
{
int n_eval = (int)embd.size() - n_past;
tg = n_eval == 1;
if (n_eval > params.n_batch)
{
n_eval = params.n_batch;
@@ -633,7 +635,9 @@ struct llama_server_context
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(result.tok);
num_tokens_predicted++;
if (tg) {
num_tokens_predicted++;
}
}
// add it to the context
@@ -1011,7 +1015,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
params.lora_adapter.push_back({argv[i], 1.0f});
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
params.use_mmap = false;
}
else if (arg == "--lora-scaled")
@@ -1027,7 +1031,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
params.use_mmap = false;
}
else if (arg == "--lora-base")
@@ -1124,8 +1128,6 @@ static json format_timings(llama_server_context &llama)
{
const auto timings = llama_get_timings(llama.ctx);
assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
return json{
{"prompt_n", timings.n_p_eval},
{"prompt_ms", timings.t_p_eval_ms},
+54 -32
View File
@@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8
__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
{
const int i = get_group_id(0);
const int i = get_group_id(0) + get_global_offset(0);
const int tid = get_local_id(0);
const int n = tid / 32;
const int l = tid - 32 * n;
const int is = 8 * n + l / 16;
const uint8_t q = x[i].qs[32 * n + l];
__global float *y = yy + i * QK_K + 128 * n;
__global float *y = yy + get_group_id(0) * QK_K + 128 * n;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
@@ -223,7 +223,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
{
int r = get_local_id(0) / 4;
int i = get_group_id(0);
int i = get_group_id(0) + get_global_offset(0);
int tid = r / 2;
int is0 = r % 2;
int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
@@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
float d_all = vload_half(0, &x[i].d);
float dl = d_all * (us - 32);
__global float *y = yy + i * QK_K + 128 * n + 32 * j;
__global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
const __global uint8_t *q = x[i].qs + 32 * n;
const __global uint8_t *hm = x[i].hmask;
@@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
{
const int i = get_group_id(0);
const int i = get_group_id(0) + get_global_offset(0);
const int tid = get_local_id(0);
const int il = tid / 8;
const int ir = tid % 8;
const int is = 2 * il;
const int n = 4;
__global float *y = yy + i * QK_K + 64 * il + n * ir;
__global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
@@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
{
const int i = get_group_id(0);
const int i = get_group_id(0) + get_global_offset(0);
const int tid = get_local_id(0);
const int il = tid / 16;
const int ir = tid % 16;
const int is = 2 * il;
__global float *y = yy + i * QK_K + 64 * il + 2 * ir;
__global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
@@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
{
const int i = get_group_id(0);
const int i = get_group_id(0) + get_global_offset(0);
const int tid = get_local_id(0);
const int ip = tid / 32;
const int il = tid - 32 * ip;
const int is = 8 * ip + il / 16;
__global float *y = yy + i * QK_K + 128 * ip + il;
__global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;
const float d = vload_half(0, &x[i].d);
@@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
const uint qk = QUANT_K;
const uint qr = QUANT_R;
const int ib = i/qk; // block index
const int ib = i/qk + get_global_offset(0); // block index
const int iqs = (i%qk)/qr; // quant index
const int iybs = i - i%qk; // y block start index
const int y_offset = qr == 1 ? 1 : qk/2;
@@ -1349,30 +1349,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
const enum ggml_type type = src->type;
const size_t ts = ggml_type_size(type);
const size_t bs = ggml_blck_size(type);
const uint64_t row_size = ts*ne0/bs;
const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
if (nb0 == ts && nb1 == ts*ne0/bs) {
err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
return err;
const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
if (nb0 == ts && nb1 == row_size) {
return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
}
if (nb0 == ts) {
const size_t buffer_origin[3] = { offset, 0, 0 };
const size_t host_origin[3] = { 0, 0, 0 };
const size_t region[3] = { ts*ne0/bs, ne1, 1 };
err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
return err;
const size_t region[3] = { row_size, ne1, 1 };
return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
}
std::vector<cl_event> events;
if (ev && ne1>1) events.reserve(ne1-1);
for (uint64_t i1 = 0; i1 < ne1; i1++) {
// pretend the row is a matrix with cols=1
const size_t buffer_origin[3] = { offset, i1, 0 };
const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
const size_t host_origin[3] = { 0, 0, 0 };
const size_t region[3] = { ts/bs, ne0, 1 };
err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
const size_t region[3] = { ts, ne0/bs, 1 };
// if an event is requested, make the last write wait for all previous writes to complete
if (ev && i1) {
events.push_back(*ev);
}
cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
if (err != CL_SUCCESS) {
break;
for (auto event : events) {
clReleaseEvent(event);
}
return err;
}
}
return err;
for (auto event : events) {
CL_CHECK(clReleaseEvent(event));
}
return CL_SUCCESS;
}
static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -1503,6 +1515,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
size_t x_offset = 0;
int64_t pi02 = -1;
int64_t pi03 = -1;
@@ -1513,7 +1526,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
int64_t i02 = i12 / r2;
// copy data to device
if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
if (src0->backend == GGML_BACKEND_GPU) {
x_offset = (i03 * ne02 + i02) * x_ne;
} else if (i02 != pi02 || i03 != pi03) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
pi02 = i02;
pi03 = i03;
@@ -1528,7 +1543,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
clblast::Transpose::kYes, clblast::Transpose::kNo,
ne01, ne11, ne10,
alpha,
d_X, 0, ne00,
d_X, x_offset, ne00,
d_Y, 0, ne10,
beta,
d_D, 0, ne01,
@@ -1596,6 +1611,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
bool src1_cont_rows = nb10 == sizeof(float);
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
size_t x_offset = 0;
int64_t pi02 = -1;
int64_t pi03 = -1;
@@ -1606,7 +1622,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
int64_t i02 = i12 / r2;
// copy src0 to device
if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
if (src0->backend == GGML_BACKEND_GPU) {
x_offset = (i03 * ne02 + i02) * x_ne;
} else if (i02 != pi02 || i03 != pi03) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
pi02 = i02;
pi03 = i03;
@@ -1646,7 +1664,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
clblast::Transpose::kYes, clblast::Transpose::kNo,
ne01, ne11, ne10,
alpha,
d_X, 0, ne00,
d_X, x_offset, ne00,
d_Y, 0, ne10,
beta,
d_D, 0, ne01,
@@ -1696,7 +1714,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
const int x_ne = ne01 * ne00;
const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01;
const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
const size_t q_sz = ggml_type_size(type) * x_bps;
size_t x_size;
size_t y_size;
@@ -1764,9 +1783,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
// convert src0 to fp32 on device
const size_t global = x_ne / global_denom;
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
// copy src1 to device
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
@@ -1888,17 +1908,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
const int64_t ne3 = tensor->ne[3];
const ggml_type type = tensor->type;
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
const size_t q_sz = s_sz * (size_t) (ne2 * ne3);
size_t q_size;
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
tensor->data = data;
// copy tensor to device
size_t offset = 0;
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
int i = i3*ne2 + i2;
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
offset += s_sz;
}
}