Compare commits

..

6 Commits

Author SHA1 Message Date
Marcus Dunn 232caf3c15 llama : fix struct decl (#2790) 2023-08-25 19:17:15 +03:00
Kawrakow d046dcee08 Faster perplexity computation (#2786)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-08-25 19:05:02 +03:00
Matt Pulver c82742ac9c llama : add llama_beam_search() (#2267)
* Add llama_beam_search().

* Add '// Beam search' heading to llama.{h,cpp} after llama_grammar_accept_token().

* Add space around * pointers and & references.

* Add spaces around comparison and assignment operators.

* Prefer west const.

* Use llama_ prefix for structs in global namespace.

* Delete obsolete comment from an earlier revision.

* Change eos to eob in llama_beam and llama_beam_view structs.
2023-08-25 18:18:48 +03:00
Nigel Bosch 28b2c996ca convert.py : Get rope scale from HuggingFace models (#2772)
* Get rope scale from HF models

* Save rope scale only for linear scaling

* Rewrite for clarity
2023-08-25 16:41:52 +02:00
slaren 154725c543 llama-bench : add model sizes (#2771)
* llama-bench : add model sizes

* more compact markdown output

* back to GiB

* adjust column sizes
2023-08-25 15:16:19 +02:00
slaren 12e2e33a97 convert.py : export rope freq_base when converting CodeLlama from an HF model (#2773) 2023-08-25 14:08:53 +02:00
10 changed files with 721 additions and 50 deletions
+1
View File
@@ -28,6 +28,7 @@ struct gpt_params {
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t n_beams = 0; // if non-zero then use beam search of given width.
float rope_freq_base = 10000.0f; // RoPE base frequency
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
+28 -16
View File
@@ -105,6 +105,7 @@ class Params:
f_norm_eps: float
f_rope_freq_base: Optional[float] = None
f_rope_scale: Optional[float] = None
ftype: Optional[GGMLFileType] = None
@@ -160,13 +161,19 @@ class Params:
def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
config = json.load(open(config_path))
n_vocab = config["vocab_size"]
n_embd = config["hidden_size"]
n_layer = config["num_hidden_layers"]
n_ff = config["intermediate_size"]
n_head = config["num_attention_heads"]
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
f_norm_eps = config["rms_norm_eps"]
n_vocab = config["vocab_size"]
n_embd = config["hidden_size"]
n_layer = config["num_hidden_layers"]
n_ff = config["intermediate_size"]
n_head = config["num_attention_heads"]
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
f_norm_eps = config["rms_norm_eps"]
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
if "rope_scaling" in config and config["rope_scaling"].get("type") == "linear":
f_rope_scale = config["rope_scaling"].get("factor")
else:
f_rope_scale = None
n_mult = Params.find_n_mult(n_ff, n_embd)
@@ -179,15 +186,17 @@ class Params:
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
return Params(
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = n_mult,
n_layer = n_layer,
n_ctx = n_ctx,
n_ff = n_ff,
n_head = n_head,
n_head_kv = n_head_kv,
f_norm_eps = f_norm_eps,
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = n_mult,
n_layer = n_layer,
n_ctx = n_ctx,
n_ff = n_ff,
n_head = n_head,
n_head_kv = n_head_kv,
f_norm_eps = f_norm_eps,
f_rope_freq_base = f_rope_freq_base,
f_rope_scale = f_rope_scale,
)
# LLaMA v2 70B params.json
@@ -771,6 +780,9 @@ class OutputFile:
if params.f_rope_freq_base:
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
if params.f_rope_scale:
self.gguf.add_rope_scale_linear(params.f_rope_scale)
if params.ftype:
self.gguf.add_file_type(params.ftype)
+1
View File
@@ -25,6 +25,7 @@ else()
add_subdirectory(simple)
add_subdirectory(embd-input)
add_subdirectory(llama-bench)
add_subdirectory(beam_search)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()
+8
View File
@@ -0,0 +1,8 @@
set(TARGET beam_search)
add_executable(${TARGET} beam_search.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
+188
View File
@@ -0,0 +1,188 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
// Used for debugging to print out beam tokens.
struct ostream_beam_view {
llama_context * ctx;
llama_beam_view beam_view;
};
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_str(obv.ctx, obv.beam_view.tokens[i]);
}
return os << ')';
}
// Put here anything you want back in beam_search_callback().
struct beam_search_callback_data {
llama_context * ctx;
std::vector<llama_token> response;
};
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
}
// Function matching type llama_beam_search_callback_fn_t.
// Custom callback example is called each time the beams lengths increase:
// * Show progress by printing ',' following by number of convergent beam tokens if any.
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
llama_beam_view& beam_view = beams_state.beam_views[i];
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
beam_view.eob = true;
}
}
printf(","); // Show progress
if (const size_t n = beams_state.common_prefix_length) {
callback_data.response.resize(callback_data.response.size() + n);
assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens;
std::copy(tokens, tokens + n, callback_data.response.end() - n);
printf("%lu", n);
}
fflush(stdout);
#if 1 // DEBUG: print current beams for this iteration
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
}
#endif
}
int main(int argc, char ** argv)
{
gpt_params params;
//params.n_gpu_layers = 200;
//---------------------------------
// Print help :
//---------------------------------
if ( argc < 2 || argv[1][0] == '-' )
{
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
return 1 ;
}
//---------------------------------
// Load parameters :
//---------------------------------
params.model = argv[1];
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
if ( argc > 3 )
{
params.prompt = argv[3];
}
if ( params.prompt.empty() )
{
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
}
//---------------------------------
// Init LLM :
//---------------------------------
llama_backend_init(params.numa);
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params( params );
if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
return 1;
}
//---------------------------------
// Tokenize the prompt :
//---------------------------------
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
if (tokens_list.size() > max_tokens_list_size)
{
fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" ,
__func__ , tokens_list.size() , max_tokens_list_size );
return 1;
}
fprintf( stderr, "\n\n" );
// Print the tokens from the prompt :
for( auto id : tokens_list )
{
std::cout << llama_token_to_str(ctx, id);
}
std::cout << std::flush;
int n_past = llama_get_kv_cache_token_count(ctx);
if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
{
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
return 1;
}
n_past += tokens_list.size();
beam_search_callback_data callback_data{ctx, {}};
size_t const beam_width = static_cast<size_t>(params.n_beams);
int const n_predict = 256;
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
std::cout << "\n\n";
for (llama_token const token_id : callback_data.response) {
std::cout << llama_token_to_str(ctx,token_id);
}
std::cout << std::endl;
llama_free( ctx );
llama_free_model( model );
llama_backend_free();
return 0;
}
+52 -8
View File
@@ -441,6 +441,8 @@ struct test {
static const std::string gpu_info;
std::string model_filename;
std::string model_type;
uint64_t model_size;
uint64_t model_n_params;
int n_batch;
int n_threads;
bool f32_kv;
@@ -457,8 +459,10 @@ struct test {
test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
model_filename = inst.model;
char buf[128];
llama_model_type(lmodel, buf, sizeof(buf));
llama_model_desc(lmodel, buf, sizeof(buf));
model_type = buf;
model_size = llama_model_size(lmodel);
model_n_params = llama_model_n_params(lmodel);
n_batch = inst.n_batch;
n_threads = inst.n_threads;
f32_kv = inst.f32_kv;
@@ -524,7 +528,7 @@ struct test {
"build_commit", "build_number",
"cuda", "opencl", "metal", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_threads", "f16_kv",
"n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
"n_prompt", "n_gen", "test_time",
@@ -538,6 +542,7 @@ struct test {
static field_type get_field_type(const std::string & field) {
if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
field == "model_size" || field == "model_n_params" ||
field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" ||
field == "avg_ns" || field == "stddev_ns") {
@@ -573,7 +578,7 @@ struct test {
build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
std::to_string(n_prompt), std::to_string(n_gen), test_time,
@@ -709,8 +714,15 @@ struct markdown_printer : public printer {
return -30;
}
if (field == "t/s") {
return 15;
return 16;
}
if (field == "size" || field == "params") {
return 10;
}
if (field == "n_gpu_layers") {
return 3;
}
int width = std::max((int)field.length(), 10);
if (test::get_field_type(field) == test::STRING) {
@@ -719,9 +731,28 @@ struct markdown_printer : public printer {
return width;
}
static std::string get_field_display_name(const std::string & field) {
if (field == "n_gpu_layers") {
return "ngl";
}
if (field == "n_threads") {
return "threads";
}
if (field == "mul_mat_q") {
return "mmq";
}
if (field == "tensor_split") {
return "ts";
}
return field;
}
void print_header(const cmd_params & params) override {
// select fields to print
fields = { "model", "backend" };
fields.push_back("model");
fields.push_back("size");
fields.push_back("params");
fields.push_back("backend");
bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
if (!is_cpu_backend) {
fields.push_back("n_gpu_layers");
@@ -752,7 +783,7 @@ struct markdown_printer : public printer {
fprintf(fout, "|");
for (const auto & field : fields) {
fprintf(fout, " %*s |", get_field_width(field), field.c_str());
fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
}
fprintf(fout, "\n");
fprintf(fout, "|");
@@ -769,12 +800,26 @@ struct markdown_printer : public printer {
fprintf(fout, "|");
for (const auto & field : fields) {
std::string value;
char buf[128];
if (field == "model") {
value = t.model_type;
} else if (field == "size") {
if (t.model_size < 1024*1024*1024) {
snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
} else {
snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
}
value = buf;
} else if (field == "params") {
if (t.model_n_params < 1000*1000*1000) {
snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
} else {
snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
}
value = buf;
} else if (field == "backend") {
value = test::get_backend();
} else if (field == "test") {
char buf[128];
if (t.n_prompt > 0 && t.n_gen == 0) {
snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
} else if (t.n_gen > 0 && t.n_prompt == 0) {
@@ -785,7 +830,6 @@ struct markdown_printer : public printer {
}
value = buf;
} else if (field == "t/s") {
char buf[128];
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
value = buf;
} else if (vmap.find(field) != vmap.end()) {
+56 -11
View File
@@ -6,6 +6,8 @@
#include <ctime>
#include <sstream>
#include <cstring>
#include <thread>
#include <mutex>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -27,6 +29,40 @@ std::vector<float> softmax(const std::vector<float>& logits) {
return probs;
}
float log_softmax(int n_vocab, const float * logits, int tok) {
float max_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit);
return logits[tok] - max_logit - log(sum_exp);
}
void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread>& workers,
double& nll, double& nll2) {
std::mutex mutex;
int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, n_vocab, logits, tokens, n_token] () {
double local_nll = 0, local_nll2 = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i = counter++;
if (i >= n_token) {
nll += local_nll; nll2 += local_nll2;
break;
}
lock.unlock();
double v = -log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
local_nll += v;
local_nll2 += v*v;
}
};
for (auto& w : workers) w = std::thread(compute);
compute();
for (auto& w : workers) w.join();
}
void perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
@@ -166,9 +202,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
int count = 0;
double nll = 0.0;
double nll2 = 0.0;
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
for (int i = 0; i < n_chunk; ++i) {
const int start = i * params.n_ctx;
const int end = start + params.n_ctx;
@@ -228,26 +267,32 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
// Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt.
for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones.
const std::vector<float> tok_logits(
logits.begin() + (j + 0) * n_vocab,
logits.begin() + (j + 1) * n_vocab);
const int first = std::min(512, params.n_ctx/2);
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, workers, nll, nll2);
count += params.n_ctx - first - 1;
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
nll += -std::log(prob);
++count;
}
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
} else {
printf("%8d %.4lf\n", i*params.n_ctx, std::exp(nll / count));
double av = nll/count;
double av2 = nll2/count - av*av;
if (av2 > 0) av2 = sqrt(av2/(count-1));
printf("%8d %.4lf %4lf %4lf\n", i*params.n_ctx, std::exp(nll / count), av, av2);
}
fflush(stdout);
}
printf("\n");
nll2 /= count;
nll /= count;
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
double ppl = exp(nll);
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
printf("Unexpected negative standard deviation of log(prob)\n");
}
}
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
+77 -13
View File
@@ -1209,6 +1209,62 @@ static void log_server_request(const Request &req, const Response &res)
});
}
bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
}
// Function matching type llama_beam_search_callback_fn_t.
// Custom callback example is called each time the beams lengths increase:
// * Show progress by printing ',' following by number of convergent beam tokens if any.
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
auto & llama = *static_cast<llama_server_context*>(callback_data);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
llama_beam_view& beam_view = beams_state.beam_views[i];
if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) {
beam_view.eob = true;
}
}
printf(","); // Show progress
if (const size_t n = beams_state.common_prefix_length) {
llama.generated_token_probs.resize(llama.generated_token_probs.size() + n);
assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens;
const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
printf("%lu", n);
}
fflush(stdout);
#if 0 // DEBUG: print current beams for this iteration
std::cout << "\n\nCurrent beams:\n";
for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
}
#endif
}
struct token_translator {
llama_context * ctx;
std::string operator()(llama_token tok) const { return llama_token_to_str(ctx, tok); }
std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
};
void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
auto & gtps = llama.generated_token_probs;
auto translator = token_translator{llama.ctx};
auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen);
if (llama.generated_text.capacity() < llama.generated_text.size() + len) {
llama.generated_text.reserve(llama.generated_text.size() + len);
}
for (const completion_token_output & cto : gtps) {
llama.generated_text += translator(cto);
}
}
int main(int argc, char **argv)
{
// own arguments required by this example
@@ -1291,22 +1347,30 @@ int main(int argc, char **argv)
llama.beginCompletion();
if (!llama.stream) {
size_t stop_pos = std::string::npos;
if (llama.params.n_beams) {
// Fill llama.generated_token_probs vector with final beam.
llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
llama.n_past, llama.n_remain, llama.params.n_threads);
// Translate llama.generated_token_probs to llama.generated_text.
append_to_generated_text_from_generated_token_probs(llama);
} else {
size_t stop_pos = std::string::npos;
while (llama.has_next_token) {
const completion_token_output token_with_probs = llama.doCompletion();
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
while (llama.has_next_token) {
const completion_token_output token_with_probs = llama.doCompletion();
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
stop_pos = llama.findStoppingStrings(llama.generated_text,
token_text.size(), STOP_FULL);
}
stop_pos = llama.findStoppingStrings(llama.generated_text,
token_text.size(), STOP_FULL);
}
if (stop_pos == std::string::npos) {
stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
}
if (stop_pos != std::string::npos) {
llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
llama.generated_text.end());
if (stop_pos == std::string::npos) {
stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
}
if (stop_pos != std::string::npos) {
llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
llama.generated_text.end());
}
}
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
+268 -1
View File
@@ -4326,6 +4326,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
//
// Beam search
//
struct llama_beam {
std::vector<llama_token> tokens;
float p; // Cumulative beam probability (renormalized relative to all beams)
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
// Sort beams by probability. In case of ties, prefer beams at eob.
bool operator<(const llama_beam & rhs) const {
return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
}
// Shift off first n tokens and discard them.
void shift_tokens(const size_t n) {
if (n) {
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
tokens.resize(tokens.size() - n);
}
}
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
};
// A struct for calculating logit-related info.
struct llama_logit_info {
const float * const logits;
const int n_vocab;
const float max_l;
const float normalizer;
struct sum_exp {
float max_l;
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
};
llama_logit_info(llama_context * ctx)
: logits(llama_get_logits(ctx))
, n_vocab(llama_n_vocab(ctx))
, max_l(*std::max_element(logits, logits + n_vocab))
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
{ }
llama_token_data get_token_data(const llama_token token_id) const {
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
return {token_id, logits[token_id], p};
}
// Return top k token_data by logit.
std::vector<llama_token_data> top_k(size_t k) {
std::vector<llama_token_data> min_heap; // min-heap by logit
const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
min_heap.reserve(k_min);
for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
min_heap.push_back(get_token_data(token_id));
}
auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
std::make_heap(min_heap.begin(), min_heap.end(), comp);
for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
if (min_heap.front().logit < logits[token_id]) {
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
min_heap.back().id = token_id;
min_heap.back().logit = logits[token_id];
std::push_heap(min_heap.begin(), min_heap.end(), comp);
}
}
return min_heap;
}
float probability_from_logit(float logit) {
return normalizer * std::exp(logit - max_l);
}
};
struct llama_beam_search_data {
llama_context * ctx;
size_t n_beams;
int n_past;
int n_predict;
int n_threads;
std::vector<llama_beam> beams;
std::vector<llama_beam> next_beams;
// Re-calculated on each loop iteration
size_t common_prefix_length;
// Used to communicate to/from callback on beams state.
std::vector<llama_beam_view> beam_views;
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
: ctx(ctx)
, n_beams(n_beams)
, n_past(n_past)
, n_predict(n_predict)
, n_threads(n_threads)
, beam_views(n_beams) {
beams.reserve(n_beams);
next_beams.reserve(n_beams);
}
// Collapse beams to a single beam given by index.
void collapse_beams(const size_t beam_idx) {
if (0u < beam_idx) {
std::swap(beams[0], beams[beam_idx]);
}
beams.resize(1);
}
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
// The repetative patterns below reflect the 2 stages of heaps:
// * Gather elements until the vector is full, then call std::make_heap() on it.
// * If the heap is full and a new element is found that should be included, pop the
// least element to the back(), replace it with the new, then push it into the heap.
void fill_next_beams_by_top_probabilities(llama_beam & beam) {
// Min-heaps use a greater-than comparator.
const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
if (beam.eob) {
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
if (next_beams.size() < n_beams) {
next_beams.push_back(std::move(beam));
if (next_beams.size() == n_beams) {
std::make_heap(next_beams.begin(), next_beams.end(), comp);
}
} else if (next_beams.front().p < beam.p) {
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
next_beams.back() = std::move(beam);
std::push_heap(next_beams.begin(), next_beams.end(), comp);
}
} else {
// beam is not at end-of-sentence, so branch with next top_k tokens.
if (!beam.tokens.empty()) {
llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
}
llama_logit_info logit_info(ctx);
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
size_t i=0;
if (next_beams.size() < n_beams) {
for (; next_beams.size() < n_beams ; ++i) {
llama_beam next_beam = beam;
next_beam.tokens.push_back(next_tokens[i].id);
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
next_beams.push_back(std::move(next_beam));
}
std::make_heap(next_beams.begin(), next_beams.end(), comp);
} else {
for (; next_beams.front().p == 0.0f ; ++i) {
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
next_beams.back() = beam;
next_beams.back().tokens.push_back(next_tokens[i].id);
next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
std::push_heap(next_beams.begin(), next_beams.end(), comp);
}
}
for (; i < n_beams ; ++i) {
const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
if (next_beams.front().p < next_p) {
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
next_beams.back() = beam;
next_beams.back().tokens.push_back(next_tokens[i].id);
next_beams.back().p = next_p;
std::push_heap(next_beams.begin(), next_beams.end(), comp);
}
}
}
}
// Find common_prefix_length based on beams.
// Requires beams is not empty.
size_t find_common_prefix_length() {
size_t common_prefix_length = beams[0].tokens.size();
for (size_t i = 1 ; i < beams.size() ; ++i) {
common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
for (size_t j = 0 ; j < common_prefix_length ; ++j) {
if (beams[0].tokens[j] != beams[i].tokens[j]) {
common_prefix_length = j;
break;
}
}
}
return common_prefix_length;
}
// Construct beams_state to send back to caller via the callback function.
// Side effect: set common_prefix_length = find_common_prefix_length();
llama_beams_state get_beams_state(const bool last_call) {
for (size_t i = 0 ; i < beams.size() ; ++i) {
beam_views[i] = beams[i].view();
}
common_prefix_length = find_common_prefix_length();
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
}
// Loop:
// * while i < n_predict, AND
// * any of the beams have not yet reached end-of-beam (eob), AND
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
// (since all other beam probabilities can only decrease)
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
!beams[top_beam_index()].eob ; ++i) {
callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
if (common_prefix_length) {
llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
n_past += common_prefix_length;
}
// Zero-out next_beam probabilities to place them last in following min-heap.
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
for (llama_beam & beam : beams) {
beam.shift_tokens(common_prefix_length);
fill_next_beams_by_top_probabilities(beam);
}
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
beams.swap(next_beams);
renormalize_beam_probabilities(beams);
}
collapse_beams(top_beam_index());
callback(callback_data, get_beams_state(true));
}
// As beams grow, the cumulative probabilities decrease.
// Renormalize them to avoid floating point underflow.
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
}
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
size_t top_beam_index() {
return std::max_element(beams.begin(), beams.end()) - beams.begin();
}
// Copy (p,eob) for each beam which may have been changed by the callback.
void update_beams_from_beam_views() {
for (size_t i = 0 ; i < beams.size() ; ++i) {
beams[i].p = beam_views[i].p;
beams[i].eob = beam_views[i].eob;
}
}
};
void llama_beam_search(llama_context * ctx,
llama_beam_search_callback_fn_t callback, void * callback_data,
size_t n_beams, int n_past, int n_predict, int n_threads) {
assert(ctx);
const int64_t t_start_sample_us = ggml_time_us();
llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
beam_search_data.loop(callback, callback_data);
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
ctx->n_sample++;
}
//
// quantization
//
@@ -5297,13 +5548,29 @@ int llama_model_n_embd(const struct llama_model * model) {
return model->hparams.n_embd;
}
int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
return snprintf(buf, buf_size, "%s %s %s",
model->name.c_str(),
llama_model_type_name(model->type),
llama_model_ftype_name(model->ftype).c_str());
}
uint64_t llama_model_size(const struct llama_model * model) {
uint64_t size = 0;
for (const auto & it : model->tensors_by_name) {
size += ggml_nbytes(it.second);
}
return size;
}
uint64_t llama_model_n_params(const struct llama_model * model) {
uint64_t nparams = 0;
for (const auto & it : model->tensors_by_name) {
nparams += ggml_nelements(it.second);
}
return nparams;
}
int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
+42 -1
View File
@@ -254,7 +254,11 @@ extern "C" {
LLAMA_API int llama_model_n_embd (const struct llama_model * model);
// Get a string describing the model type
LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
// Returns the total size of all the tensors in the model in bytes
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
// Returns the total number of parameters in the model
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
// Returns 0 on success
LLAMA_API int llama_model_quantize(
@@ -465,6 +469,43 @@ extern "C" {
/// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
//
// Beam search
//
struct llama_beam_view {
const llama_token * tokens;
size_t n_tokens;
float p; // Cumulative beam probability (renormalized relative to all beams)
bool eob; // Callback should set this to true when a beam is at end-of-beam.
};
// Passed to beam_search_callback function.
// Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
// (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
// These pointers are valid only during the synchronous callback, so should not be saved.
struct llama_beams_state {
struct llama_beam_view * beam_views;
size_t n_beams; // Number of elements in beam_views[].
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
bool last_call; // True iff this is the last callback invocation.
};
// Type of pointer to the beam_search_callback function.
// void* callback_data is any custom data passed to llama_beam_search, that is subsequently
// passed back to beam_search_callback. This avoids having to use global variables in the callback.
typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, llama_beams_state);
/// @details Deterministically returns entire sentence constructed by a beam search.
/// @param ctx Pointer to the llama_context.
/// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
/// @param callback_data A pointer that is simply passed back to callback.
/// @param n_beams Number of beams to use.
/// @param n_past Number of tokens already evaluated.
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
/// @param n_threads Number of threads as passed to llama_eval().
LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
// Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
LLAMA_API void llama_print_timings(struct llama_context * ctx);