mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-14 01:36:47 +02:00
Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| d298382ad9 | |||
| 32a28217f4 | |||
| c429b33beb | |||
| 9146d36fe7 | |||
| b9adcbbf92 | |||
| 9588f196b1 | |||
| 3cbd23ed88 | |||
| 00c6390793 | |||
| faa0e6979a | |||
| 9791f40258 | |||
| 902184dd3a |
+13
-1
@@ -1,5 +1,16 @@
|
||||
# https://github.com/actions/labeler
|
||||
|
||||
Kompute:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-kompute.h
|
||||
- ggml-kompute.cpp
|
||||
- README-kompute.md
|
||||
Apple Metal:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-metal.h
|
||||
- ggml-metal.cpp
|
||||
- README-metal.md
|
||||
SYCL:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
@@ -9,6 +20,7 @@ SYCL:
|
||||
Nvidia GPU:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-cuda.h
|
||||
- ggml-cuda/**
|
||||
Vulkan:
|
||||
- changed-files:
|
||||
|
||||
@@ -72,6 +72,7 @@ else()
|
||||
set(INS_ENB ON)
|
||||
endif()
|
||||
|
||||
option(LLAMA_SVE "llama: enable SVE" OFF)
|
||||
option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
|
||||
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
|
||||
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
|
||||
@@ -1040,6 +1041,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
|
||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
||||
endif()
|
||||
if (LLAMA_SVE)
|
||||
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
|
||||
endif()
|
||||
endif()
|
||||
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
|
||||
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
||||
|
||||
@@ -203,6 +203,10 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
|
||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
||||
|
||||
**Tools:**
|
||||
|
||||
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
|
||||
|
||||
---
|
||||
|
||||
Here is a typical run using LLaMA v2 13B on M2 Ultra:
|
||||
|
||||
+16
-6
@@ -904,6 +904,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
params.interactive_specials = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--special") {
|
||||
params.special = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--embedding") {
|
||||
params.embedding = true;
|
||||
return true;
|
||||
@@ -1362,6 +1366,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" --version show version and build info\n");
|
||||
printf(" -i, --interactive run in interactive mode\n");
|
||||
printf(" --special special tokens output enabled\n");
|
||||
printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
|
||||
printf(" --interactive-first run in interactive mode and wait for input right away\n");
|
||||
printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
|
||||
@@ -1855,11 +1860,15 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
||||
|
||||
std::string fs_get_cache_directory() {
|
||||
std::string cache_directory = "";
|
||||
auto ensure_trailing_slash = [](std::string p) {
|
||||
// Make sure to add trailing slash
|
||||
if (p.back() != DIRECTORY_SEPARATOR) {
|
||||
p += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
return p;
|
||||
};
|
||||
if (getenv("LLAMA_CACHE")) {
|
||||
cache_directory = std::getenv("LLAMA_CACHE");
|
||||
if (cache_directory.back() != DIRECTORY_SEPARATOR) {
|
||||
cache_directory += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
} else {
|
||||
#ifdef __linux__
|
||||
if (std::getenv("XDG_CACHE_HOME")) {
|
||||
@@ -1870,12 +1879,12 @@ std::string fs_get_cache_directory() {
|
||||
#elif defined(__APPLE__)
|
||||
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
||||
#elif defined(_WIN32)
|
||||
cache_directory = std::getenv("APPDATA");
|
||||
cache_directory = std::getenv("LOCALAPPDATA");
|
||||
#endif // __linux__
|
||||
cache_directory = ensure_trailing_slash(cache_directory);
|
||||
cache_directory += "llama.cpp";
|
||||
cache_directory += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
return cache_directory;
|
||||
return ensure_trailing_slash(cache_directory);
|
||||
}
|
||||
|
||||
|
||||
@@ -2840,6 +2849,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
||||
|
||||
@@ -146,6 +146,7 @@ struct gpt_params {
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
bool interactive = false; // interactive mode
|
||||
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
|
||||
bool special = false; // enable special token output
|
||||
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
|
||||
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
|
||||
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
||||
|
||||
+1
-1
@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
|
||||
|
||||
params.custom_n_ctx = false;
|
||||
|
||||
params.use_flash = true;
|
||||
params.use_flash = false;
|
||||
params.use_checkpointing = true;
|
||||
|
||||
params.sample_start = "";
|
||||
|
||||
@@ -81,6 +81,7 @@ models = [
|
||||
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -473,6 +473,9 @@ class Model:
|
||||
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
||||
res = "jina-v2-de"
|
||||
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
||||
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
|
||||
res = "smaug-bpe"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -2392,7 +2395,8 @@ class CommandR2Model(Model):
|
||||
|
||||
# max_position_embeddings = 8192 in config.json but model was actually
|
||||
# trained on 128k context length
|
||||
self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
|
||||
# aya-23 models don't have model_max_length specified
|
||||
self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
@@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {
|
||||
|
||||
params.samples_start_after_nl = false;
|
||||
params.use_adam = true;
|
||||
params.use_flash = true;
|
||||
params.use_flash = false;
|
||||
params.use_scratch = true;
|
||||
|
||||
// only adam
|
||||
|
||||
@@ -7,8 +7,6 @@ android {
|
||||
namespace = "com.example.llama"
|
||||
compileSdk = 34
|
||||
|
||||
ndkVersion = "26.1.10909125"
|
||||
|
||||
defaultConfig {
|
||||
applicationId = "com.example.llama"
|
||||
minSdk = 33
|
||||
@@ -20,17 +18,6 @@ android {
|
||||
vectorDrawables {
|
||||
useSupportLibrary = true
|
||||
}
|
||||
ndk {
|
||||
// Add NDK properties if wanted, e.g.
|
||||
// abiFilters += listOf("arm64-v8a")
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
arguments += "-DCMAKE_BUILD_TYPE=Release"
|
||||
cppFlags += listOf()
|
||||
arguments += listOf()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buildTypes {
|
||||
@@ -55,17 +42,6 @@ android {
|
||||
composeOptions {
|
||||
kotlinCompilerExtensionVersion = "1.5.1"
|
||||
}
|
||||
packaging {
|
||||
resources {
|
||||
excludes += "/META-INF/{AL2.0,LGPL2.1}"
|
||||
}
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
path = file("src/main/cpp/CMakeLists.txt")
|
||||
version = "3.22.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
@@ -78,6 +54,7 @@ dependencies {
|
||||
implementation("androidx.compose.ui:ui-graphics")
|
||||
implementation("androidx.compose.ui:ui-tooling-preview")
|
||||
implementation("androidx.compose.material3:material3")
|
||||
implementation(project(":llama"))
|
||||
testImplementation("junit:junit:4.13.2")
|
||||
androidTestImplementation("androidx.test.ext:junit:1.1.5")
|
||||
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
package com.example.llama
|
||||
|
||||
import android.llama.cpp.LLamaAndroid
|
||||
import android.util.Log
|
||||
import androidx.compose.runtime.getValue
|
||||
import androidx.compose.runtime.mutableStateOf
|
||||
@@ -9,7 +10,7 @@ import androidx.lifecycle.viewModelScope
|
||||
import kotlinx.coroutines.flow.catch
|
||||
import kotlinx.coroutines.launch
|
||||
|
||||
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
|
||||
companion object {
|
||||
@JvmStatic
|
||||
private val NanosPerSecond = 1_000_000_000.0
|
||||
@@ -28,7 +29,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
llm.unload()
|
||||
llamaAndroid.unload()
|
||||
} catch (exc: IllegalStateException) {
|
||||
messages += exc.message!!
|
||||
}
|
||||
@@ -44,7 +45,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
messages += ""
|
||||
|
||||
viewModelScope.launch {
|
||||
llm.send(text)
|
||||
llamaAndroid.send(text)
|
||||
.catch {
|
||||
Log.e(tag, "send() failed", it)
|
||||
messages += it.message!!
|
||||
@@ -57,7 +58,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
val start = System.nanoTime()
|
||||
val warmupResult = llm.bench(pp, tg, pl, nr)
|
||||
val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
|
||||
val end = System.nanoTime()
|
||||
|
||||
messages += warmupResult
|
||||
@@ -70,7 +71,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
return@launch
|
||||
}
|
||||
|
||||
messages += llm.bench(512, 128, 1, 3)
|
||||
messages += llamaAndroid.bench(512, 128, 1, 3)
|
||||
} catch (exc: IllegalStateException) {
|
||||
Log.e(tag, "bench() failed", exc)
|
||||
messages += exc.message!!
|
||||
@@ -81,7 +82,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
fun load(pathToModel: String) {
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
llm.load(pathToModel)
|
||||
llamaAndroid.load(pathToModel)
|
||||
messages += "Loaded $pathToModel"
|
||||
} catch (exc: IllegalStateException) {
|
||||
Log.e(tag, "load() failed", exc)
|
||||
|
||||
@@ -2,4 +2,5 @@
|
||||
plugins {
|
||||
id("com.android.application") version "8.2.0" apply false
|
||||
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
|
||||
id("com.android.library") version "8.2.0" apply false
|
||||
}
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
/build
|
||||
+1
-1
@@ -42,7 +42,7 @@ add_subdirectory(../../../../../../ build-llama)
|
||||
# used in the AndroidManifest.xml file.
|
||||
add_library(${CMAKE_PROJECT_NAME} SHARED
|
||||
# List C/C++ source files with relative paths to this CMakeLists.txt.
|
||||
llama-android.cpp)
|
||||
llama-android.cpp)
|
||||
|
||||
# Specifies libraries CMake should link to your target library. You
|
||||
# can link libraries from various origins, such as libraries defined in this
|
||||
@@ -0,0 +1,68 @@
|
||||
plugins {
|
||||
id("com.android.library")
|
||||
id("org.jetbrains.kotlin.android")
|
||||
}
|
||||
|
||||
android {
|
||||
namespace = "android.llama.cpp"
|
||||
compileSdk = 34
|
||||
|
||||
defaultConfig {
|
||||
minSdk = 33
|
||||
|
||||
testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
|
||||
consumerProguardFiles("consumer-rules.pro")
|
||||
ndk {
|
||||
// Add NDK properties if wanted, e.g.
|
||||
// abiFilters += listOf("arm64-v8a")
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
arguments += "-DCMAKE_BUILD_TYPE=Release"
|
||||
cppFlags += listOf()
|
||||
arguments += listOf()
|
||||
|
||||
cppFlags("")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buildTypes {
|
||||
release {
|
||||
isMinifyEnabled = false
|
||||
proguardFiles(
|
||||
getDefaultProguardFile("proguard-android-optimize.txt"),
|
||||
"proguard-rules.pro"
|
||||
)
|
||||
}
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
path("src/main/cpp/CMakeLists.txt")
|
||||
version = "3.22.1"
|
||||
}
|
||||
}
|
||||
compileOptions {
|
||||
sourceCompatibility = JavaVersion.VERSION_1_8
|
||||
targetCompatibility = JavaVersion.VERSION_1_8
|
||||
}
|
||||
kotlinOptions {
|
||||
jvmTarget = "1.8"
|
||||
}
|
||||
|
||||
packaging {
|
||||
resources {
|
||||
excludes += "/META-INF/{AL2.0,LGPL2.1}"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
|
||||
implementation("androidx.core:core-ktx:1.12.0")
|
||||
implementation("androidx.appcompat:appcompat:1.6.1")
|
||||
implementation("com.google.android.material:material:1.11.0")
|
||||
testImplementation("junit:junit:4.13.2")
|
||||
androidTestImplementation("androidx.test.ext:junit:1.1.5")
|
||||
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
|
||||
}
|
||||
+21
@@ -0,0 +1,21 @@
|
||||
# Add project specific ProGuard rules here.
|
||||
# You can control the set of applied configuration files using the
|
||||
# proguardFiles setting in build.gradle.
|
||||
#
|
||||
# For more details, see
|
||||
# http://developer.android.com/guide/developing/tools/proguard.html
|
||||
|
||||
# If your project uses WebView with JS, uncomment the following
|
||||
# and specify the fully qualified class name to the JavaScript interface
|
||||
# class:
|
||||
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
|
||||
# public *;
|
||||
#}
|
||||
|
||||
# Uncomment this to preserve the line number information for
|
||||
# debugging stack traces.
|
||||
#-keepattributes SourceFile,LineNumberTable
|
||||
|
||||
# If you keep the line number information, uncomment this to
|
||||
# hide the original source file name.
|
||||
#-renamesourcefileattribute SourceFile
|
||||
+24
@@ -0,0 +1,24 @@
|
||||
package android.llama.cpp
|
||||
|
||||
import androidx.test.platform.app.InstrumentationRegistry
|
||||
import androidx.test.ext.junit.runners.AndroidJUnit4
|
||||
|
||||
import org.junit.Test
|
||||
import org.junit.runner.RunWith
|
||||
|
||||
import org.junit.Assert.*
|
||||
|
||||
/**
|
||||
* Instrumented test, which will execute on an Android device.
|
||||
*
|
||||
* See [testing documentation](http://d.android.com/tools/testing).
|
||||
*/
|
||||
@RunWith(AndroidJUnit4::class)
|
||||
class ExampleInstrumentedTest {
|
||||
@Test
|
||||
fun useAppContext() {
|
||||
// Context of the app under test.
|
||||
val appContext = InstrumentationRegistry.getInstrumentation().targetContext
|
||||
assertEquals("android.llama.cpp.test", appContext.packageName)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<manifest xmlns:android="http://schemas.android.com/apk/res/android">
|
||||
|
||||
</manifest>
|
||||
@@ -0,0 +1,49 @@
|
||||
# For more information about using CMake with Android Studio, read the
|
||||
# documentation: https://d.android.com/studio/projects/add-native-code.html.
|
||||
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
|
||||
|
||||
# Sets the minimum CMake version required for this project.
|
||||
cmake_minimum_required(VERSION 3.22.1)
|
||||
|
||||
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
|
||||
# Since this is the top level CMakeLists.txt, the project name is also accessible
|
||||
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
|
||||
# build script scope).
|
||||
project("llama-android")
|
||||
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(
|
||||
llama
|
||||
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
|
||||
GIT_TAG master
|
||||
)
|
||||
|
||||
# Also provides "common"
|
||||
FetchContent_MakeAvailable(llama)
|
||||
|
||||
# Creates and names a library, sets it as either STATIC
|
||||
# or SHARED, and provides the relative paths to its source code.
|
||||
# You can define multiple libraries, and CMake builds them for you.
|
||||
# Gradle automatically packages shared libraries with your APK.
|
||||
#
|
||||
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
|
||||
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
|
||||
# is preferred for the same purpose.
|
||||
#
|
||||
# In order to load a library into your app from Java/Kotlin, you must call
|
||||
# System.loadLibrary() and pass the name of the library defined here;
|
||||
# for GameActivity/NativeActivity derived applications, the same library name must be
|
||||
# used in the AndroidManifest.xml file.
|
||||
add_library(${CMAKE_PROJECT_NAME} SHARED
|
||||
# List C/C++ source files with relative paths to this CMakeLists.txt.
|
||||
llama-android.cpp)
|
||||
|
||||
# Specifies libraries CMake should link to your target library. You
|
||||
# can link libraries from various origins, such as libraries defined in this
|
||||
# build script, prebuilt third-party libraries, or Android system libraries.
|
||||
target_link_libraries(${CMAKE_PROJECT_NAME}
|
||||
# List libraries link to the target library
|
||||
llama
|
||||
common
|
||||
android
|
||||
log)
|
||||
+14
-14
@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||
Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
auto path_to_model = env->GetStringUTFChars(filename, 0);
|
||||
@@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
|
||||
Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
|
||||
llama_free_model(reinterpret_cast<llama_model *>(model));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||
Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||
auto model = reinterpret_cast<llama_model *>(jmodel);
|
||||
|
||||
if (!model) {
|
||||
@@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
|
||||
Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
|
||||
llama_free(reinterpret_cast<llama_context *>(context));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
|
||||
Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
|
||||
Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
|
||||
llama_log_set(log_callback, NULL);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_bench_1model(
|
||||
Java_android_llama_cpp_LLamaAndroid_bench_1model(
|
||||
JNIEnv *env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
@@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
||||
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
||||
|
||||
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
|
||||
|
||||
@@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
|
||||
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
|
||||
llama_backend_init();
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
|
||||
Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
|
||||
return env->NewStringUTF(llama_print_system_info());
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jint JNICALL
|
||||
Java_com_example_llama_Llm_completion_1init(
|
||||
Java_android_llama_cpp_LLamaAndroid_completion_1init(
|
||||
JNIEnv *env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
@@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_completion_1loop(
|
||||
Java_android_llama_cpp_LLamaAndroid_completion_1loop(
|
||||
JNIEnv * env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
@@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
|
||||
}
|
||||
+4
-4
@@ -1,4 +1,4 @@
|
||||
package com.example.llama
|
||||
package android.llama.cpp
|
||||
|
||||
import android.util.Log
|
||||
import kotlinx.coroutines.CoroutineDispatcher
|
||||
@@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
|
||||
import java.util.concurrent.Executors
|
||||
import kotlin.concurrent.thread
|
||||
|
||||
class Llm {
|
||||
class LLamaAndroid {
|
||||
private val tag: String? = this::class.simpleName
|
||||
|
||||
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
|
||||
@@ -165,8 +165,8 @@ class Llm {
|
||||
}
|
||||
|
||||
// Enforce only one instance of Llm.
|
||||
private val _instance: Llm = Llm()
|
||||
private val _instance: LLamaAndroid = LLamaAndroid()
|
||||
|
||||
fun instance(): Llm = _instance
|
||||
fun instance(): LLamaAndroid = _instance
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
package android.llama.cpp
|
||||
|
||||
import org.junit.Test
|
||||
|
||||
import org.junit.Assert.*
|
||||
|
||||
/**
|
||||
* Example local unit test, which will execute on the development machine (host).
|
||||
*
|
||||
* See [testing documentation](http://d.android.com/tools/testing).
|
||||
*/
|
||||
class ExampleUnitTest {
|
||||
@Test
|
||||
fun addition_isCorrect() {
|
||||
assertEquals(4, 2 + 2)
|
||||
}
|
||||
}
|
||||
@@ -15,3 +15,4 @@ dependencyResolutionManagement {
|
||||
|
||||
rootProject.name = "LlamaAndroid"
|
||||
include(":app")
|
||||
include(":llama")
|
||||
|
||||
+11
-3
@@ -740,18 +740,26 @@ int main(int argc, char ** argv) {
|
||||
// display text
|
||||
if (input_echo && display) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
|
||||
printf("%s", token_str.c_str());
|
||||
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
|
||||
|
||||
// Console/Stream Output
|
||||
fprintf(stdout, "%s", token_str.c_str());
|
||||
|
||||
// Record Displayed Tokens To Log
|
||||
// Note: Generated tokens are created one by one hence this check
|
||||
if (embd.size() > 1) {
|
||||
// Incoming Requested Tokens
|
||||
input_tokens.push_back(id);
|
||||
} else {
|
||||
// Outgoing Generated Tokens
|
||||
output_tokens.push_back(id);
|
||||
output_ss << token_str;
|
||||
}
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
// reset color to default if there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>SimpleChat (LlamaCPP, ...) </title>
|
||||
<title>SimpleChat LlamaCppEtal </title>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<meta name="message" content="Save Nature Save Earth" />
|
||||
@@ -30,20 +30,17 @@
|
||||
<hr>
|
||||
<div class="sameline">
|
||||
<label for="system-in">System</label>
|
||||
<input type="text" name="system" id="system-in" class="flex-grow"/>
|
||||
<input type="text" name="system" id="system-in" placeholder="e.g. you are a helpful ai assistant, who provides concise answers" class="flex-grow"/>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
<div id="chat-div">
|
||||
<p> Enter the system prompt above, before entering/submitting any user query.</p>
|
||||
<p> Enter your text to the ai assistant below.</p>
|
||||
<p> Use shift+enter for inserting enter.</p>
|
||||
<p> Refresh the page to start over fresh.</p>
|
||||
<p> You need to have javascript enabled.</p>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
<div class="sameline">
|
||||
<textarea id="user-in" class="flex-grow" rows="3"></textarea>
|
||||
<textarea id="user-in" class="flex-grow" rows="3" placeholder="enter your query to the ai model here" ></textarea>
|
||||
<button id="user-btn">submit</button>
|
||||
</div>
|
||||
|
||||
|
||||
@@ -14,11 +14,15 @@ own system prompts.
|
||||
The UI follows a responsive web design so that the layout can adapt to available display space in a usable
|
||||
enough manner, in general.
|
||||
|
||||
NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and
|
||||
culling of old messages from the chat.
|
||||
Allows developer/end-user to control some of the behaviour by updating gMe members from browser's devel-tool
|
||||
console.
|
||||
|
||||
NOTE: It doesnt set any parameters other than temperature for now. However if someone wants they can update
|
||||
the js file as needed.
|
||||
NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and
|
||||
culling of old messages from the chat by default. However by enabling the sliding window chat logic, a crude
|
||||
form of old messages culling can be achieved.
|
||||
|
||||
NOTE: It doesnt set any parameters other than temperature and max_tokens for now. However if someone wants
|
||||
they can update the js file or equivalent member in gMe as needed.
|
||||
|
||||
|
||||
## usage
|
||||
@@ -43,11 +47,33 @@ next run this web front end in examples/server/public_simplechat
|
||||
### using the front end
|
||||
|
||||
Open this simple web front end from your local browser
|
||||
|
||||
* http://127.0.0.1:PORT/index.html
|
||||
|
||||
Once inside
|
||||
|
||||
* Select between chat and completion mode. By default it is set to chat mode.
|
||||
|
||||
* In completion mode
|
||||
* logic by default doesnt insert any role specific "ROLE: " prefix wrt each role's message.
|
||||
If the model requires any prefix wrt user role messages, then the end user has to
|
||||
explicitly add the needed prefix, when they enter their chat message.
|
||||
Similarly if the model requires any prefix to trigger assistant/ai-model response,
|
||||
then the end user needs to enter the same.
|
||||
This keeps the logic simple, while still giving flexibility to the end user to
|
||||
manage any templating/tagging requirement wrt their messages to the model.
|
||||
* the logic doesnt insert newline at the begining and end wrt the prompt message generated.
|
||||
However if the chat being sent to /completions end point has more than one role's message,
|
||||
then insert newline when moving from one role's message to the next role's message, so
|
||||
that it can be clearly identified/distinguished.
|
||||
* given that /completions endpoint normally doesnt add additional chat-templating of its
|
||||
own, the above ensures that end user can create a custom single/multi message combo with
|
||||
any tags/special-tokens related chat templating to test out model handshake. Or enduser
|
||||
can use it just for normal completion related/based query.
|
||||
|
||||
* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
|
||||
Normally Completion mode doesnt need system prompt, while Chat mode can generate better/interesting
|
||||
responses with a suitable system prompt.
|
||||
* if chat.add_system_begin is used
|
||||
* you cant change the system prompt, after it is has been submitted once along with user query.
|
||||
* you cant set a system prompt, after you have submitted any user query
|
||||
@@ -55,27 +81,121 @@ Once inside
|
||||
* one can change the system prompt any time during chat, by changing the contents of system prompt.
|
||||
* inturn the updated/changed system prompt will be inserted into the chat session.
|
||||
* this allows for the subsequent user chatting to be driven by the new system prompt set above.
|
||||
|
||||
* Enter your query and either press enter or click on the submit button.
|
||||
If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter.
|
||||
|
||||
* Wait for the logic to communicate with the server and get the response.
|
||||
* the user is not allowed to enter any fresh query during this time.
|
||||
* the user input box will be disabled and a working message will be shown in it.
|
||||
|
||||
* just refresh the page, to reset wrt the chat history and or system prompt and start afresh.
|
||||
|
||||
* Using NewChat one can start independent chat sessions.
|
||||
* two independent chat sessions are setup by default.
|
||||
|
||||
|
||||
## Devel note
|
||||
|
||||
### Reason behind this
|
||||
|
||||
The idea is to be easy enough to use for basic purposes, while also being simple and easily discernable
|
||||
by developers who may not be from web frontend background (so inturn may not be familiar with template /
|
||||
end-use-specific-language-extensions driven flows) so that they can use it to explore/experiment things.
|
||||
|
||||
And given that the idea is also to help explore/experiment for developers, some flexibility is provided
|
||||
to change behaviour easily using the devel-tools/console, for now. And skeletal logic has been implemented
|
||||
to explore some of the end points and ideas/implications around them.
|
||||
|
||||
|
||||
### General
|
||||
|
||||
Me/gMe consolidates the settings which control the behaviour into one object.
|
||||
One can see the current settings, as well as change/update them using browsers devel-tool/console.
|
||||
|
||||
bCompletionFreshChatAlways - whether Completion mode collates complete/sliding-window history when
|
||||
communicating with the server or only sends the latest user query/message.
|
||||
|
||||
bCompletionInsertStandardRolePrefix - whether Completion mode inserts role related prefix wrt the
|
||||
messages that get inserted into prompt field wrt /Completion endpoint.
|
||||
|
||||
chatRequestOptions - maintains the list of options/fields to send along with chat request,
|
||||
irrespective of whether /chat/completions or /completions endpoint.
|
||||
|
||||
If you want to add additional options/fields to send to the server/ai-model, and or
|
||||
modify the existing options value or remove them, for now you can update this global var
|
||||
using browser's development-tools/console.
|
||||
|
||||
iRecentUserMsgCnt - a simple minded SlidingWindow to limit context window load at Ai Model end.
|
||||
This is disabled by default. However if enabled, then in addition to latest system message, only
|
||||
the last/latest iRecentUserMsgCnt user messages after the latest system prompt and its responses
|
||||
from the ai model will be sent to the ai-model, when querying for a new response. IE if enabled,
|
||||
only user messages after the latest system message/prompt will be considered.
|
||||
|
||||
This specified sliding window user message count also includes the latest user query.
|
||||
<0 : Send entire chat history to server
|
||||
0 : Send only the system message if any to the server
|
||||
>0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
|
||||
|
||||
|
||||
By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
|
||||
implications of loading of the ai-model's context window by chat history, wrt chat response to
|
||||
some extent in a simple crude way.
|
||||
|
||||
|
||||
Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js
|
||||
may not be visible. Also remember that just refreshing/reloading page in browser or for that
|
||||
matter clearing site data, dont directly override site caching in all cases. Worst case you may
|
||||
have to change port. Or in dev tools of browser, you may be able to disable caching fully.
|
||||
|
||||
|
||||
Concept of multiple chat sessions with different servers, as well as saving and restoring of
|
||||
those across browser usage sessions, can be woven around the SimpleChat/MultiChatUI class and
|
||||
its instances relatively easily, however given the current goal of keeping this simple, it has
|
||||
not been added, for now.
|
||||
|
||||
|
||||
By switching between chat.add_system_begin/anytime, one can control whether one can change
|
||||
the system prompt, anytime during the conversation or only at the beginning.
|
||||
|
||||
|
||||
read_json_early, is to experiment with reading json response data early on, if available,
|
||||
so that user can be shown generated data, as and when it is being generated, rather than
|
||||
at the end when full data is available.
|
||||
|
||||
the server flow doesnt seem to be sending back data early, atleast for request (inc options)
|
||||
that is currently sent.
|
||||
|
||||
if able to read json data early on in future, as and when ai model is generating data, then
|
||||
this helper needs to indirectly update the chat div with the recieved data, without waiting
|
||||
for the overall data to be available.
|
||||
|
||||
|
||||
### Default setup
|
||||
|
||||
By default things are setup to try and make the user experience a bit better, if possible.
|
||||
However a developer when testing the server of ai-model may want to change these value.
|
||||
|
||||
Using iRecentUserMsgCnt reduce chat history context sent to the server/ai-model to be
|
||||
just the system-prompt, prev-user-request-and-ai-response and cur-user-request, instead of
|
||||
full chat history. This way if there is any response with garbage/repeatation, it doesnt
|
||||
mess with things beyond the next question/request/query, in some ways.
|
||||
|
||||
Set max_tokens to 1024, so that a relatively large previous reponse doesnt eat up the space
|
||||
available wrt next query-response. However dont forget that the server when started should
|
||||
also be started with a model context size of 1k or more, to be on safe side.
|
||||
|
||||
The /completions endpoint of examples/server doesnt take max_tokens, instead it takes the
|
||||
internal n_predict, for now add the same here on the client side, maybe later add max_tokens
|
||||
to /completions endpoint handling code on server side.
|
||||
|
||||
Frequency and presence penalty fields are set to 1.2 in the set of fields sent to server
|
||||
along with the user query. So that the model is partly set to try avoid repeating text in
|
||||
its response.
|
||||
|
||||
A end-user can change these behaviour by editing gMe from browser's devel-tool/console.
|
||||
|
||||
|
||||
## At the end
|
||||
|
||||
Also a thank you to all open source and open model developers, who strive for the common good.
|
||||
|
||||
@@ -48,6 +48,13 @@ button {
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.ul1 {
|
||||
padding-inline-start: 2vw;
|
||||
}
|
||||
.ul2 {
|
||||
padding-inline-start: 2vw;
|
||||
}
|
||||
|
||||
* {
|
||||
margin: 0.6vmin;
|
||||
}
|
||||
|
||||
@@ -14,23 +14,86 @@ class ApiEP {
|
||||
}
|
||||
|
||||
let gUsageMsg = `
|
||||
<p> Enter the system prompt above, before entering/submitting any user query.</p>
|
||||
<p> Enter your text to the ai assistant below.</p>
|
||||
<p> Use shift+enter for inserting enter.</p>
|
||||
<p> Refresh the page to start over fresh.</p>
|
||||
<p class="role-system">Usage</p>
|
||||
<ul class="ul1">
|
||||
<li> Set system prompt above, to try control ai response charactersitic, if model supports same.</li>
|
||||
<ul class="ul2">
|
||||
<li> Completion mode normally wont have a system prompt.</li>
|
||||
</ul>
|
||||
<li> Enter your query to ai assistant below.</li>
|
||||
<ul class="ul2">
|
||||
<li> Completion mode doesnt insert user/role: prefix implicitly.</li>
|
||||
<li> Use shift+enter for inserting enter/newline.</li>
|
||||
</ul>
|
||||
<li> Default ContextWindow = [System, Last Query+Resp, Cur Query].</li>
|
||||
<ul class="ul2">
|
||||
<li> experiment iRecentUserMsgCnt, max_tokens, model ctxt window to expand</li>
|
||||
</ul>
|
||||
</ul>
|
||||
`;
|
||||
|
||||
/** @typedef {{role: string, content: string}[]} ChatMessages */
|
||||
|
||||
class SimpleChat {
|
||||
|
||||
constructor() {
|
||||
/**
|
||||
* Maintain in a form suitable for common LLM web service chat/completions' messages entry
|
||||
* @type {{role: string, content: string}[]}
|
||||
* @type {ChatMessages}
|
||||
*/
|
||||
this.xchat = [];
|
||||
this.iLastSys = -1;
|
||||
}
|
||||
|
||||
clear() {
|
||||
this.xchat = [];
|
||||
this.iLastSys = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recent chat messages.
|
||||
* If iRecentUserMsgCnt < 0
|
||||
* Then return the full chat history
|
||||
* Else
|
||||
* Return chat messages from latest going back till the last/latest system prompt.
|
||||
* While keeping track that the number of user queries/messages doesnt exceed iRecentUserMsgCnt.
|
||||
* @param {number} iRecentUserMsgCnt
|
||||
*/
|
||||
recent_chat(iRecentUserMsgCnt) {
|
||||
if (iRecentUserMsgCnt < 0) {
|
||||
return this.xchat;
|
||||
}
|
||||
if (iRecentUserMsgCnt == 0) {
|
||||
console.warn("WARN:SimpleChat:SC:RecentChat:iRecentUsermsgCnt of 0 means no user message/query sent");
|
||||
}
|
||||
/** @type{ChatMessages} */
|
||||
let rchat = [];
|
||||
let sysMsg = this.get_system_latest();
|
||||
if (sysMsg.length != 0) {
|
||||
rchat.push({role: Roles.System, content: sysMsg});
|
||||
}
|
||||
let iUserCnt = 0;
|
||||
let iStart = this.xchat.length;
|
||||
for(let i=this.xchat.length-1; i > this.iLastSys; i--) {
|
||||
if (iUserCnt >= iRecentUserMsgCnt) {
|
||||
break;
|
||||
}
|
||||
let msg = this.xchat[i];
|
||||
if (msg.role == Roles.User) {
|
||||
iStart = i;
|
||||
iUserCnt += 1;
|
||||
}
|
||||
}
|
||||
for(let i = iStart; i < this.xchat.length; i++) {
|
||||
let msg = this.xchat[i];
|
||||
if (msg.role == Roles.System) {
|
||||
continue;
|
||||
}
|
||||
rchat.push({role: msg.role, content: msg.content});
|
||||
}
|
||||
return rchat;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an entry into xchat
|
||||
* @param {string} role
|
||||
@@ -57,7 +120,7 @@ class SimpleChat {
|
||||
div.replaceChildren();
|
||||
}
|
||||
let last = undefined;
|
||||
for(const x of this.xchat) {
|
||||
for(const x of this.recent_chat(gMe.iRecentUserMsgCnt)) {
|
||||
let entry = document.createElement("p");
|
||||
entry.className = `role-${x.role}`;
|
||||
entry.innerText = `${x.role}: ${x.content}`;
|
||||
@@ -69,17 +132,21 @@ class SimpleChat {
|
||||
} else {
|
||||
if (bClear) {
|
||||
div.innerHTML = gUsageMsg;
|
||||
gMe.show_info(div);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add needed fields wrt json object to be sent wrt LLM web services completions endpoint
|
||||
* Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||
* The needed fields/options are picked from a global object.
|
||||
* Convert the json into string.
|
||||
* @param {Object} obj
|
||||
*/
|
||||
request_jsonstr(obj) {
|
||||
obj["temperature"] = 0.7;
|
||||
for(let k in gMe.chatRequestOptions) {
|
||||
obj[k] = gMe.chatRequestOptions[k];
|
||||
}
|
||||
return JSON.stringify(obj);
|
||||
}
|
||||
|
||||
@@ -88,18 +155,27 @@ class SimpleChat {
|
||||
*/
|
||||
request_messages_jsonstr() {
|
||||
let req = {
|
||||
messages: this.xchat,
|
||||
messages: this.recent_chat(gMe.iRecentUserMsgCnt),
|
||||
}
|
||||
return this.request_jsonstr(req);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a string form of json object suitable for /completions
|
||||
* @param {boolean} bInsertStandardRolePrefix Insert "<THE_ROLE>: " as prefix wrt each role's message
|
||||
*/
|
||||
request_prompt_jsonstr() {
|
||||
request_prompt_jsonstr(bInsertStandardRolePrefix) {
|
||||
let prompt = "";
|
||||
for(const chat of this.xchat) {
|
||||
prompt += `${chat.role}: ${chat.content}\n`;
|
||||
let iCnt = 0;
|
||||
for(const chat of this.recent_chat(gMe.iRecentUserMsgCnt)) {
|
||||
iCnt += 1;
|
||||
if (iCnt > 1) {
|
||||
prompt += "\n";
|
||||
}
|
||||
if (bInsertStandardRolePrefix) {
|
||||
prompt += `${chat.role}: `;
|
||||
}
|
||||
prompt += `${chat.content}`;
|
||||
}
|
||||
let req = {
|
||||
prompt: prompt,
|
||||
@@ -171,7 +247,6 @@ let gChatURL = {
|
||||
'chat': `${gBaseURL}/chat/completions`,
|
||||
'completion': `${gBaseURL}/completions`,
|
||||
}
|
||||
const gbCompletionFreshChatAlways = true;
|
||||
|
||||
|
||||
/**
|
||||
@@ -291,6 +366,8 @@ class MultiChatUI {
|
||||
// allow user to insert enter into their message using shift+enter.
|
||||
// while just pressing enter key will lead to submitting.
|
||||
if ((ev.key === "Enter") && (!ev.shiftKey)) {
|
||||
let value = this.elInUser.value;
|
||||
this.elInUser.value = value.substring(0,value.length-1);
|
||||
this.elBtnUser.click();
|
||||
ev.preventDefault();
|
||||
}
|
||||
@@ -321,6 +398,29 @@ class MultiChatUI {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Try read json response early, if available.
|
||||
* @param {Response} resp
|
||||
*/
|
||||
async read_json_early(resp) {
|
||||
if (!resp.body) {
|
||||
throw Error("ERRR:SimpleChat:MCUI:ReadJsonEarly:No body...");
|
||||
}
|
||||
let tdUtf8 = new TextDecoder("utf-8");
|
||||
let rr = resp.body.getReader();
|
||||
let gotBody = "";
|
||||
while(true) {
|
||||
let { value: cur, done: done} = await rr.read();
|
||||
let curBody = tdUtf8.decode(cur);
|
||||
console.debug("DBUG:SC:PART:", curBody);
|
||||
gotBody += curBody;
|
||||
if (done) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return JSON.parse(gotBody);
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle user query submit request, wrt specified chat session.
|
||||
* @param {string} chatId
|
||||
@@ -330,6 +430,14 @@ class MultiChatUI {
|
||||
|
||||
let chat = this.simpleChats[chatId];
|
||||
|
||||
// In completion mode, if configured, clear any previous chat history.
|
||||
// So if user wants to simulate a multi-chat based completion query,
|
||||
// they will have to enter the full thing, as a suitable multiline
|
||||
// user input/query.
|
||||
if ((apiEP == ApiEP.Completion) && (gMe.bCompletionFreshChatAlways)) {
|
||||
chat.clear();
|
||||
}
|
||||
|
||||
chat.add_system_anytime(this.elInSystem.value, chatId);
|
||||
|
||||
let content = this.elInUser.value;
|
||||
@@ -344,7 +452,7 @@ class MultiChatUI {
|
||||
if (apiEP == ApiEP.Chat) {
|
||||
theBody = chat.request_messages_jsonstr();
|
||||
} else {
|
||||
theBody = chat.request_prompt_jsonstr();
|
||||
theBody = chat.request_prompt_jsonstr(gMe.bCompletionInsertStandardRolePrefix);
|
||||
}
|
||||
|
||||
this.elInUser.value = "working...";
|
||||
@@ -359,6 +467,7 @@ class MultiChatUI {
|
||||
});
|
||||
|
||||
let respBody = await resp.json();
|
||||
//let respBody = await this.read_json_early(resp);
|
||||
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
|
||||
let assistantMsg;
|
||||
if (apiEP == ApiEP.Chat) {
|
||||
@@ -376,13 +485,6 @@ class MultiChatUI {
|
||||
} else {
|
||||
console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
|
||||
}
|
||||
// Purposefully clear at end rather than begin of this function
|
||||
// so that one can switch from chat to completion mode and sequece
|
||||
// in a completion mode with multiple user-assistant chat data
|
||||
// from before to be sent/occur once.
|
||||
if ((apiEP == ApiEP.Completion) && (gbCompletionFreshChatAlways)) {
|
||||
chat.xchat.length = 0;
|
||||
}
|
||||
this.ui_reset_userinput();
|
||||
}
|
||||
|
||||
@@ -462,17 +564,66 @@ class MultiChatUI {
|
||||
}
|
||||
|
||||
|
||||
let gMuitChat;
|
||||
const gChatIds = [ "Default", "Other" ];
|
||||
class Me {
|
||||
|
||||
constructor() {
|
||||
this.defaultChatIds = [ "Default", "Other" ];
|
||||
this.multiChat = new MultiChatUI();
|
||||
this.bCompletionFreshChatAlways = true;
|
||||
this.bCompletionInsertStandardRolePrefix = false;
|
||||
this.iRecentUserMsgCnt = 2;
|
||||
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||
this.chatRequestOptions = {
|
||||
"temperature": 0.7,
|
||||
"max_tokens": 1024,
|
||||
"frequency_penalty": 1.2,
|
||||
"presence_penalty": 1.2,
|
||||
"n_predict": 1024
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {HTMLDivElement} elDiv
|
||||
*/
|
||||
show_info(elDiv) {
|
||||
|
||||
var p = document.createElement("p");
|
||||
p.innerText = "Settings (devel-tools-console gMe)";
|
||||
p.className = "role-system";
|
||||
elDiv.appendChild(p);
|
||||
|
||||
var p = document.createElement("p");
|
||||
p.innerText = `bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `chatRequestOptions:${JSON.stringify(this.chatRequestOptions)}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/** @type {Me} */
|
||||
let gMe;
|
||||
|
||||
function startme() {
|
||||
console.log("INFO:SimpleChat:StartMe:Starting...");
|
||||
gMuitChat = new MultiChatUI();
|
||||
for (let cid of gChatIds) {
|
||||
gMuitChat.new_chat_session(cid);
|
||||
gMe = new Me();
|
||||
for (let cid of gMe.defaultChatIds) {
|
||||
gMe.multiChat.new_chat_session(cid);
|
||||
}
|
||||
gMuitChat.setup_ui(gChatIds[0]);
|
||||
gMuitChat.show_sessions();
|
||||
gMe.multiChat.setup_ui(gMe.defaultChatIds[0], true);
|
||||
gMe.multiChat.show_sessions();
|
||||
}
|
||||
|
||||
document.addEventListener("DOMContentLoaded", startme);
|
||||
|
||||
@@ -144,6 +144,10 @@ extern "C" {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
// 16-bit float
|
||||
// on Arm, we use __fp16
|
||||
// on x86, we use uint16_t
|
||||
|
||||
+64
-2
@@ -3813,7 +3813,44 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#if defined(__ARM_NEON)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
|
||||
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
|
||||
|
||||
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||
|
||||
assert(nb % 2 == 0); // TODO: handle odd nb
|
||||
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_q4_0 * restrict x0 = &x[i + 0];
|
||||
const block_q4_0 * restrict x1 = &x[i + 1];
|
||||
const block_q8_0 * restrict y0 = &y[i + 0];
|
||||
const block_q8_0 * restrict y1 = &y[i + 1];
|
||||
|
||||
// load x
|
||||
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
|
||||
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
|
||||
|
||||
// 4-bit -> 8-bit
|
||||
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
|
||||
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
|
||||
|
||||
// sub 8
|
||||
const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
|
||||
const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
|
||||
|
||||
// load y
|
||||
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||
|
||||
// dot product
|
||||
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||
#elif defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
|
||||
@@ -5384,7 +5421,32 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#if defined(__ARM_NEON)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||
|
||||
assert(nb % 2 == 0); // TODO: handle odd nb
|
||||
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_q8_0 * restrict x0 = &x[i + 0];
|
||||
const block_q8_0 * restrict x1 = &x[i + 1];
|
||||
const block_q8_0 * restrict y0 = &y[i + 0];
|
||||
const block_q8_0 * restrict y1 = &y[i + 1];
|
||||
|
||||
// load x
|
||||
const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
|
||||
const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
|
||||
|
||||
// load y
|
||||
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||
|
||||
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||
#elif defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
|
||||
|
||||
@@ -22742,6 +22742,16 @@ int ggml_cpu_has_neon(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sve(void) {
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
// TODO: Currently, SVE 256 bit is only supported.
|
||||
GGML_ASSERT(svcntb() == QK8_0);
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_arm_fma(void) {
|
||||
#if defined(__ARM_FEATURE_FMA)
|
||||
return 1;
|
||||
|
||||
@@ -2404,6 +2404,7 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
||||
GGML_API int ggml_cpu_has_fma (void);
|
||||
GGML_API int ggml_cpu_has_neon (void);
|
||||
GGML_API int ggml_cpu_has_sve (void);
|
||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
||||
GGML_API int ggml_cpu_has_metal (void);
|
||||
GGML_API int ggml_cpu_has_f16c (void);
|
||||
|
||||
@@ -4593,6 +4593,9 @@ static void llm_load_vocab(
|
||||
} else if (
|
||||
tokenizer_pre == "dbrx") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
|
||||
} else if (
|
||||
tokenizer_pre == "smaug-bpe") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
}
|
||||
@@ -12512,6 +12515,7 @@ struct llm_tokenizer_bpe {
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
||||
case LLAMA_VOCAB_PRE_TYPE_SMAUG:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
// same as llama3
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
@@ -17861,6 +17865,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
||||
);
|
||||
}
|
||||
|
||||
bool llama_token_is_control(const struct llama_model * model, llama_token token) {
|
||||
return llama_is_control_token(model->vocab, token);
|
||||
}
|
||||
|
||||
llama_token llama_token_bos(const struct llama_model * model) {
|
||||
return model->vocab.special_bos_id;
|
||||
}
|
||||
@@ -18337,6 +18345,7 @@ const char * llama_print_system_info(void) {
|
||||
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
|
||||
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
||||
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
||||
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
|
||||
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
||||
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
||||
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
||||
|
||||
@@ -85,6 +85,7 @@ extern "C" {
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
||||
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
@@ -823,6 +824,9 @@ extern "C" {
|
||||
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
|
||||
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
|
||||
|
||||
// Identify if Token Id is a control token or a render-able token
|
||||
LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
|
||||
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
||||
|
||||
Reference in New Issue
Block a user