Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 158f8c9e21 | |||
| 862f5e41ab | |||
| 3a48d558a6 | |||
| 7c8d3abd1a | |||
| 122ed4840c | |||
| a0b3ac8c48 | |||
| d75c232e1d | |||
| e0324285a5 |
@@ -515,6 +515,31 @@ jobs:
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||
|
||||
android-build:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v3
|
||||
with:
|
||||
java-version: 17
|
||||
distribution: zulu
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@v3
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cd examples/llama.android
|
||||
|
||||
# Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
|
||||
./gradlew build --no-daemon -Pskip-armeabi-v7a
|
||||
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
# steps:
|
||||
|
||||
@@ -167,6 +167,24 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
if (params.n_threads_batch <= 0) {
|
||||
params.n_threads_batch = std::thread::hardware_concurrency();
|
||||
}
|
||||
} else if (arg == "-td" || arg == "--threads-draft") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads_draft = std::stoi(argv[i]);
|
||||
if (params.n_threads_draft <= 0) {
|
||||
params.n_threads_draft = std::thread::hardware_concurrency();
|
||||
}
|
||||
} else if (arg == "-tbd" || arg == "--threads-batch-draft") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads_batch_draft = std::stoi(argv[i]);
|
||||
if (params.n_threads_batch_draft <= 0) {
|
||||
params.n_threads_batch_draft = std::thread::hardware_concurrency();
|
||||
}
|
||||
} else if (arg == "-p" || arg == "--prompt") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -845,6 +863,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
|
||||
printf(" -tb N, --threads-batch N\n");
|
||||
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||
printf(" -td N, --threads-draft N");
|
||||
printf(" number of threads to use during generation (default: same as --threads)");
|
||||
printf(" -tbd N, --threads-batch-draft N\n");
|
||||
printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
|
||||
printf(" -p PROMPT, --prompt PROMPT\n");
|
||||
printf(" prompt to start generation with (default: empty)\n");
|
||||
printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
|
||||
|
||||
@@ -46,7 +46,9 @@ struct gpt_params {
|
||||
uint32_t seed = -1; // RNG seed
|
||||
|
||||
int32_t n_threads = get_num_physical_cores();
|
||||
int32_t n_threads_draft = -1;
|
||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||
int32_t n_threads_batch_draft = -1;
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
|
||||
@@ -1138,9 +1138,8 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
|
||||
return tn_buf.data();
|
||||
};
|
||||
|
||||
uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
|
||||
// write_magic
|
||||
file.write_u32(LLAMA_FILE_MAGIC_LORA); // magic
|
||||
file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic
|
||||
file.write_u32(1); // version
|
||||
// write_hparams
|
||||
file.write_u32(lora->hparams.lora_r);
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
# Gradle files
|
||||
.gradle/
|
||||
build/
|
||||
|
||||
# Local configuration file (sdk path, etc)
|
||||
local.properties
|
||||
|
||||
# Log/OS Files
|
||||
*.log
|
||||
|
||||
# Android Studio generated files and folders
|
||||
captures/
|
||||
.externalNativeBuild/
|
||||
.cxx/
|
||||
*.apk
|
||||
output.json
|
||||
|
||||
# IntelliJ
|
||||
*.iml
|
||||
.idea/
|
||||
misc.xml
|
||||
deploymentTargetDropDown.xml
|
||||
render.experimental.xml
|
||||
|
||||
# Keystore files
|
||||
*.jks
|
||||
*.keystore
|
||||
|
||||
# Google Services (e.g. APIs or Firebase)
|
||||
google-services.json
|
||||
|
||||
# Android Profiling
|
||||
*.hprof
|
||||
@@ -0,0 +1 @@
|
||||
/build
|
||||
@@ -0,0 +1,91 @@
|
||||
plugins {
|
||||
id("com.android.application")
|
||||
id("org.jetbrains.kotlin.android")
|
||||
}
|
||||
|
||||
android {
|
||||
namespace = "com.example.llama"
|
||||
compileSdk = 34
|
||||
|
||||
ndkVersion = "26.1.10909125"
|
||||
|
||||
defaultConfig {
|
||||
applicationId = "com.example.llama"
|
||||
minSdk = 33
|
||||
targetSdk = 34
|
||||
versionCode = 1
|
||||
versionName = "1.0"
|
||||
|
||||
testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
|
||||
vectorDrawables {
|
||||
useSupportLibrary = true
|
||||
}
|
||||
ndk {
|
||||
// Workaround for https://github.com/llvm/llvm-project/issues/65820
|
||||
// affecting armeabi-v7a. Skip armeabi-v7a when invoked with
|
||||
// -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
|
||||
if (project.hasProperty("skip-armeabi-v7a")) {
|
||||
abiFilters += listOf("arm64-v8a", "x86_64", "x86")
|
||||
}
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
cppFlags += listOf()
|
||||
arguments += listOf()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buildTypes {
|
||||
release {
|
||||
isMinifyEnabled = false
|
||||
proguardFiles(
|
||||
getDefaultProguardFile("proguard-android-optimize.txt"),
|
||||
"proguard-rules.pro"
|
||||
)
|
||||
}
|
||||
}
|
||||
compileOptions {
|
||||
sourceCompatibility = JavaVersion.VERSION_1_8
|
||||
targetCompatibility = JavaVersion.VERSION_1_8
|
||||
}
|
||||
kotlinOptions {
|
||||
jvmTarget = "1.8"
|
||||
}
|
||||
buildFeatures {
|
||||
compose = true
|
||||
}
|
||||
composeOptions {
|
||||
kotlinCompilerExtensionVersion = "1.5.1"
|
||||
}
|
||||
packaging {
|
||||
resources {
|
||||
excludes += "/META-INF/{AL2.0,LGPL2.1}"
|
||||
}
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
path = file("src/main/cpp/CMakeLists.txt")
|
||||
version = "3.22.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
|
||||
implementation("androidx.core:core-ktx:1.12.0")
|
||||
implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
|
||||
implementation("androidx.activity:activity-compose:1.8.2")
|
||||
implementation(platform("androidx.compose:compose-bom:2023.08.00"))
|
||||
implementation("androidx.compose.ui:ui")
|
||||
implementation("androidx.compose.ui:ui-graphics")
|
||||
implementation("androidx.compose.ui:ui-tooling-preview")
|
||||
implementation("androidx.compose.material3:material3")
|
||||
testImplementation("junit:junit:4.13.2")
|
||||
androidTestImplementation("androidx.test.ext:junit:1.1.5")
|
||||
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
|
||||
androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
|
||||
androidTestImplementation("androidx.compose.ui:ui-test-junit4")
|
||||
debugImplementation("androidx.compose.ui:ui-tooling")
|
||||
debugImplementation("androidx.compose.ui:ui-test-manifest")
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
# Add project specific ProGuard rules here.
|
||||
# You can control the set of applied configuration files using the
|
||||
# proguardFiles setting in build.gradle.
|
||||
#
|
||||
# For more details, see
|
||||
# http://developer.android.com/guide/developing/tools/proguard.html
|
||||
|
||||
# If your project uses WebView with JS, uncomment the following
|
||||
# and specify the fully qualified class name to the JavaScript interface
|
||||
# class:
|
||||
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
|
||||
# public *;
|
||||
#}
|
||||
|
||||
# Uncomment this to preserve the line number information for
|
||||
# debugging stack traces.
|
||||
#-keepattributes SourceFile,LineNumberTable
|
||||
|
||||
# If you keep the line number information, uncomment this to
|
||||
# hide the original source file name.
|
||||
#-renamesourcefileattribute SourceFile
|
||||
@@ -0,0 +1,30 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
|
||||
xmlns:tools="http://schemas.android.com/tools">
|
||||
|
||||
<uses-permission android:name="android.permission.INTERNET" />
|
||||
|
||||
<application
|
||||
android:allowBackup="true"
|
||||
android:dataExtractionRules="@xml/data_extraction_rules"
|
||||
android:fullBackupContent="@xml/backup_rules"
|
||||
android:icon="@mipmap/ic_launcher"
|
||||
android:label="@string/app_name"
|
||||
android:roundIcon="@mipmap/ic_launcher_round"
|
||||
android:supportsRtl="true"
|
||||
android:theme="@style/Theme.LlamaAndroid"
|
||||
>
|
||||
|
||||
<activity
|
||||
android:name=".MainActivity"
|
||||
android:exported="true"
|
||||
android:theme="@style/Theme.LlamaAndroid">
|
||||
<intent-filter>
|
||||
<action android:name="android.intent.action.MAIN" />
|
||||
|
||||
<category android:name="android.intent.category.LAUNCHER" />
|
||||
</intent-filter>
|
||||
</activity>
|
||||
</application>
|
||||
|
||||
</manifest>
|
||||
@@ -0,0 +1,50 @@
|
||||
|
||||
# For more information about using CMake with Android Studio, read the
|
||||
# documentation: https://d.android.com/studio/projects/add-native-code.html.
|
||||
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
|
||||
|
||||
# Sets the minimum CMake version required for this project.
|
||||
cmake_minimum_required(VERSION 3.22.1)
|
||||
|
||||
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
|
||||
# Since this is the top level CMakeLists.txt, the project name is also accessible
|
||||
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
|
||||
# build script scope).
|
||||
project("llama-android")
|
||||
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(
|
||||
llama
|
||||
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
|
||||
GIT_TAG master
|
||||
)
|
||||
|
||||
# Also provides "common"
|
||||
FetchContent_MakeAvailable(llama)
|
||||
|
||||
# Creates and names a library, sets it as either STATIC
|
||||
# or SHARED, and provides the relative paths to its source code.
|
||||
# You can define multiple libraries, and CMake builds them for you.
|
||||
# Gradle automatically packages shared libraries with your APK.
|
||||
#
|
||||
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
|
||||
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
|
||||
# is preferred for the same purpose.
|
||||
#
|
||||
# In order to load a library into your app from Java/Kotlin, you must call
|
||||
# System.loadLibrary() and pass the name of the library defined here;
|
||||
# for GameActivity/NativeActivity derived applications, the same library name must be
|
||||
# used in the AndroidManifest.xml file.
|
||||
add_library(${CMAKE_PROJECT_NAME} SHARED
|
||||
# List C/C++ source files with relative paths to this CMakeLists.txt.
|
||||
llama-android.cpp)
|
||||
|
||||
# Specifies libraries CMake should link to your target library. You
|
||||
# can link libraries from various origins, such as libraries defined in this
|
||||
# build script, prebuilt third-party libraries, or Android system libraries.
|
||||
target_link_libraries(${CMAKE_PROJECT_NAME}
|
||||
# List libraries link to the target library
|
||||
llama
|
||||
common
|
||||
android
|
||||
log)
|
||||
@@ -0,0 +1,394 @@
|
||||
#include <android/log.h>
|
||||
#include <jni.h>
|
||||
#include <iomanip>
|
||||
#include <math.h>
|
||||
#include <string>
|
||||
#include <unistd.h>
|
||||
#include "llama.h"
|
||||
#include "common/common.h"
|
||||
|
||||
// Write C++ code here.
|
||||
//
|
||||
// Do not forget to dynamically load the C++ library into your application.
|
||||
//
|
||||
// For instance,
|
||||
//
|
||||
// In MainActivity.java:
|
||||
// static {
|
||||
// System.loadLibrary("llama-android");
|
||||
// }
|
||||
//
|
||||
// Or, in MainActivity.kt:
|
||||
// companion object {
|
||||
// init {
|
||||
// System.loadLibrary("llama-android")
|
||||
// }
|
||||
// }
|
||||
|
||||
#define TAG "llama-android.cpp"
|
||||
#define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
|
||||
#define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__)
|
||||
|
||||
jclass la_int_var;
|
||||
jmethodID la_int_var_value;
|
||||
jmethodID la_int_var_inc;
|
||||
|
||||
static void log_callback(ggml_log_level level, const char * fmt, void * data) {
|
||||
if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
|
||||
else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
|
||||
else if (level == GGML_LOG_LEVEL_WARN) __android_log_print(ANDROID_LOG_WARN, TAG, fmt, data);
|
||||
else __android_log_print(ANDROID_LOG_DEFAULT, TAG, fmt, data);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
auto path_to_model = env->GetStringUTFChars(filename, 0);
|
||||
LOGi("Loading model from %s", path_to_model);
|
||||
|
||||
auto model = llama_load_model_from_file(path_to_model, model_params);
|
||||
env->ReleaseStringUTFChars(filename, path_to_model);
|
||||
|
||||
if (!model) {
|
||||
LOGe("load_model() failed");
|
||||
env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "load_model() failed");
|
||||
return 0;
|
||||
}
|
||||
|
||||
return reinterpret_cast<jlong>(model);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
|
||||
llama_free_model(reinterpret_cast<llama_model *>(model));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||
auto model = reinterpret_cast<llama_model *>(jmodel);
|
||||
|
||||
if (!model) {
|
||||
LOGe("new_context(): model cannot be null");
|
||||
env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "Model cannot be null");
|
||||
return 0;
|
||||
}
|
||||
|
||||
int n_threads = std::max(1, std::min(8, (int) sysconf(_SC_NPROCESSORS_ONLN) - 2));
|
||||
LOGi("Using %d threads", n_threads);
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
ctx_params.seed = 1234;
|
||||
ctx_params.n_ctx = 2048;
|
||||
ctx_params.n_threads = n_threads;
|
||||
ctx_params.n_threads_batch = n_threads;
|
||||
|
||||
llama_context * context = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
if (!context) {
|
||||
LOGe("llama_new_context_with_model() returned null)");
|
||||
env->ThrowNew(env->FindClass("java/lang/IllegalStateException"),
|
||||
"llama_new_context_with_model() returned null)");
|
||||
return 0;
|
||||
}
|
||||
|
||||
return reinterpret_cast<jlong>(context);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
|
||||
llama_free(reinterpret_cast<llama_context *>(context));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
|
||||
llama_log_set(log_callback, NULL);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_bench_1model(
|
||||
JNIEnv *env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
jlong model_pointer,
|
||||
jlong batch_pointer,
|
||||
jint pp,
|
||||
jint tg,
|
||||
jint pl,
|
||||
jint nr
|
||||
) {
|
||||
auto pp_avg = 0.0;
|
||||
auto tg_avg = 0.0;
|
||||
auto pp_std = 0.0;
|
||||
auto tg_std = 0.0;
|
||||
|
||||
const auto context = reinterpret_cast<llama_context *>(context_pointer);
|
||||
const auto model = reinterpret_cast<llama_model *>(model_pointer);
|
||||
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
|
||||
|
||||
const int n_ctx = llama_n_ctx(context);
|
||||
|
||||
LOGi("n_ctx = %d", n_ctx);
|
||||
|
||||
int i, j;
|
||||
int nri;
|
||||
for (nri = 0; nri < nr; nri++) {
|
||||
LOGi("Benchmark prompt processing (pp)");
|
||||
|
||||
llama_batch_clear(*batch);
|
||||
|
||||
const int n_tokens = pp;
|
||||
for (i = 0; i < n_tokens; i++) {
|
||||
llama_batch_add(*batch, 0, i, { 0 }, false);
|
||||
}
|
||||
|
||||
batch->logits[batch->n_tokens - 1] = true;
|
||||
llama_kv_cache_clear(context);
|
||||
|
||||
const auto t_pp_start = ggml_time_us();
|
||||
if (llama_decode(context, *batch) != 0) {
|
||||
LOGi("llama_decode() failed during prompt processing");
|
||||
}
|
||||
const auto t_pp_end = ggml_time_us();
|
||||
|
||||
// bench text generation
|
||||
|
||||
LOGi("Benchmark text generation (tg)");
|
||||
|
||||
llama_kv_cache_clear(context);
|
||||
const auto t_tg_start = ggml_time_us();
|
||||
for (i = 0; i < tg; i++) {
|
||||
|
||||
llama_batch_clear(*batch);
|
||||
for (j = 0; j < pl; j++) {
|
||||
llama_batch_add(*batch, 0, i, { j }, true);
|
||||
}
|
||||
|
||||
LOGi("llama_decode() text generation: %d", i);
|
||||
if (llama_decode(context, *batch) != 0) {
|
||||
LOGi("llama_decode() failed during text generation");
|
||||
}
|
||||
}
|
||||
|
||||
const auto t_tg_end = ggml_time_us();
|
||||
|
||||
llama_kv_cache_clear(context);
|
||||
|
||||
const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
|
||||
const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
|
||||
|
||||
const auto speed_pp = double(pp) / t_pp;
|
||||
const auto speed_tg = double(pl * tg) / t_tg;
|
||||
|
||||
pp_avg += speed_pp;
|
||||
tg_avg += speed_tg;
|
||||
|
||||
pp_std += speed_pp * speed_pp;
|
||||
tg_std += speed_tg * speed_tg;
|
||||
|
||||
LOGi("pp %f t/s, tg %f t/s", speed_pp, speed_tg);
|
||||
}
|
||||
|
||||
pp_avg /= double(nr);
|
||||
tg_avg /= double(nr);
|
||||
|
||||
if (nr > 1) {
|
||||
pp_std = sqrt(pp_std / double(nr - 1) - pp_avg * pp_avg * double(nr) / double(nr - 1));
|
||||
tg_std = sqrt(tg_std / double(nr - 1) - tg_avg * tg_avg * double(nr) / double(nr - 1));
|
||||
} else {
|
||||
pp_std = 0;
|
||||
tg_std = 0;
|
||||
}
|
||||
|
||||
char model_desc[128];
|
||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||
|
||||
const auto model_size = double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0;
|
||||
const auto model_n_params = double(llama_model_n_params(model)) / 1e9;
|
||||
|
||||
const auto backend = "(Android)"; // TODO: What should this be?
|
||||
|
||||
std::stringstream result;
|
||||
result << std::setprecision(2);
|
||||
result << "| model | size | params | backend | test | t/s |\n";
|
||||
result << "| --- | --- | --- | --- | --- | --- |\n";
|
||||
result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | pp " << pp << " | " << pp_avg << " ± " << pp_std << " |\n";
|
||||
result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | tg " << tg << " | " << tg_avg << " ± " << tg_std << " |\n";
|
||||
|
||||
return env->NewStringUTF(result.str().c_str());
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
||||
|
||||
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
|
||||
|
||||
llama_batch *batch = new llama_batch {
|
||||
0,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
};
|
||||
|
||||
if (embd) {
|
||||
batch->embd = (float *) malloc(sizeof(float) * n_tokens * embd);
|
||||
} else {
|
||||
batch->token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
|
||||
}
|
||||
|
||||
batch->pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
|
||||
batch->n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
|
||||
batch->seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
batch->seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
|
||||
}
|
||||
batch->logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
|
||||
|
||||
return reinterpret_cast<jlong>(batch);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) {
|
||||
llama_backend_init(numa);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
|
||||
return env->NewStringUTF(llama_print_system_info());
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jint JNICALL
|
||||
Java_com_example_llama_Llm_completion_1init(
|
||||
JNIEnv *env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
jlong batch_pointer,
|
||||
jstring jtext,
|
||||
jint n_len
|
||||
) {
|
||||
|
||||
const auto text = env->GetStringUTFChars(jtext, 0);
|
||||
const auto context = reinterpret_cast<llama_context *>(context_pointer);
|
||||
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
|
||||
|
||||
const auto tokens_list = llama_tokenize(context, text, 1);
|
||||
|
||||
auto n_ctx = llama_n_ctx(context);
|
||||
auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
|
||||
|
||||
LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);
|
||||
|
||||
if (n_kv_req > n_ctx) {
|
||||
LOGe("error: n_kv_req > n_ctx, the required KV cache size is not big enough");
|
||||
}
|
||||
|
||||
for (auto id : tokens_list) {
|
||||
LOGi("%s", llama_token_to_piece(context, id).c_str());
|
||||
}
|
||||
|
||||
llama_batch_clear(*batch);
|
||||
|
||||
// evaluate the initial prompt
|
||||
for (auto i = 0; i < tokens_list.size(); i++) {
|
||||
llama_batch_add(*batch, tokens_list[i], i, { 0 }, false);
|
||||
}
|
||||
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
batch->logits[batch->n_tokens - 1] = true;
|
||||
|
||||
if (llama_decode(context, *batch) != 0) {
|
||||
LOGe("llama_decode() failed");
|
||||
}
|
||||
|
||||
env->ReleaseStringUTFChars(jtext, text);
|
||||
|
||||
return batch->n_tokens;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_completion_1loop(
|
||||
JNIEnv * env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
jlong batch_pointer,
|
||||
jint n_len,
|
||||
jobject intvar_ncur
|
||||
) {
|
||||
const auto context = reinterpret_cast<llama_context *>(context_pointer);
|
||||
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
|
||||
const auto model = llama_get_model(context);
|
||||
|
||||
if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
|
||||
if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
|
||||
if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
|
||||
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
// sample the most likely token
|
||||
const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
|
||||
|
||||
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
|
||||
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
|
||||
return env->NewStringUTF("");
|
||||
}
|
||||
|
||||
auto new_token_chars = llama_token_to_piece(context, new_token_id);
|
||||
LOGi("new_token_chars: `%s`", new_token_chars.c_str());
|
||||
auto new_token = env->NewStringUTF(new_token_chars.c_str());
|
||||
|
||||
llama_batch_clear(*batch);
|
||||
llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
|
||||
|
||||
env->CallVoidMethod(intvar_ncur, la_int_var_inc);
|
||||
|
||||
if (llama_decode(context, *batch) != 0) {
|
||||
LOGe("llama_decode() returned null");
|
||||
}
|
||||
|
||||
return new_token;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
|
||||
}
|
||||
@@ -0,0 +1,119 @@
|
||||
package com.example.llama
|
||||
|
||||
import android.app.DownloadManager
|
||||
import android.net.Uri
|
||||
import android.util.Log
|
||||
import androidx.compose.material3.Button
|
||||
import androidx.compose.material3.Text
|
||||
import androidx.compose.runtime.Composable
|
||||
import androidx.compose.runtime.getValue
|
||||
import androidx.compose.runtime.mutableDoubleStateOf
|
||||
import androidx.compose.runtime.mutableStateOf
|
||||
import androidx.compose.runtime.remember
|
||||
import androidx.compose.runtime.rememberCoroutineScope
|
||||
import androidx.compose.runtime.setValue
|
||||
import androidx.core.database.getLongOrNull
|
||||
import androidx.core.net.toUri
|
||||
import kotlinx.coroutines.delay
|
||||
import kotlinx.coroutines.launch
|
||||
import java.io.File
|
||||
|
||||
data class Downloadable(val name: String, val source: Uri, val destination: File) {
|
||||
companion object {
|
||||
@JvmStatic
|
||||
private val tag: String? = this::class.qualifiedName
|
||||
|
||||
sealed interface State
|
||||
data object Ready: State
|
||||
data class Downloading(val id: Long): State
|
||||
data class Downloaded(val downloadable: Downloadable): State
|
||||
data class Error(val message: String): State
|
||||
|
||||
@JvmStatic
|
||||
@Composable
|
||||
fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) {
|
||||
var status: State by remember {
|
||||
mutableStateOf(
|
||||
if (item.destination.exists()) Downloaded(item)
|
||||
else Ready
|
||||
)
|
||||
}
|
||||
var progress by remember { mutableDoubleStateOf(0.0) }
|
||||
|
||||
val coroutineScope = rememberCoroutineScope()
|
||||
|
||||
suspend fun waitForDownload(result: Downloading, item: Downloadable): State {
|
||||
while (true) {
|
||||
val cursor = dm.query(DownloadManager.Query().setFilterById(result.id))
|
||||
|
||||
if (cursor == null) {
|
||||
Log.e(tag, "dm.query() returned null")
|
||||
return Error("dm.query() returned null")
|
||||
}
|
||||
|
||||
if (!cursor.moveToFirst() || cursor.count < 1) {
|
||||
cursor.close()
|
||||
Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?")
|
||||
return Ready
|
||||
}
|
||||
|
||||
val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR)
|
||||
val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES)
|
||||
val sofar = cursor.getLongOrNull(pix) ?: 0
|
||||
val total = cursor.getLongOrNull(tix) ?: 1
|
||||
cursor.close()
|
||||
|
||||
if (sofar == total) {
|
||||
return Downloaded(item)
|
||||
}
|
||||
|
||||
progress = (sofar * 1.0) / total
|
||||
|
||||
delay(1000L)
|
||||
}
|
||||
}
|
||||
|
||||
fun onClick() {
|
||||
when (val s = status) {
|
||||
is Downloaded -> {
|
||||
viewModel.load(item.destination.path)
|
||||
}
|
||||
|
||||
is Downloading -> {
|
||||
coroutineScope.launch {
|
||||
status = waitForDownload(s, item)
|
||||
}
|
||||
}
|
||||
|
||||
else -> {
|
||||
item.destination.delete()
|
||||
|
||||
val request = DownloadManager.Request(item.source).apply {
|
||||
setTitle("Downloading model")
|
||||
setDescription("Downloading model: ${item.name}")
|
||||
setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI)
|
||||
setDestinationUri(item.destination.toUri())
|
||||
}
|
||||
|
||||
viewModel.log("Saving ${item.name} to ${item.destination.path}")
|
||||
Log.i(tag, "Saving ${item.name} to ${item.destination.path}")
|
||||
|
||||
val id = dm.enqueue(request)
|
||||
status = Downloading(id)
|
||||
onClick()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Button(onClick = { onClick() }, enabled = status !is Downloading) {
|
||||
when (status) {
|
||||
is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%")
|
||||
is Downloaded -> Text("Load ${item.name}")
|
||||
is Ready -> Text("Download ${item.name}")
|
||||
is Error -> Text("Download ${item.name}")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,172 @@
|
||||
package com.example.llama
|
||||
|
||||
import android.util.Log
|
||||
import kotlinx.coroutines.CoroutineDispatcher
|
||||
import kotlinx.coroutines.asCoroutineDispatcher
|
||||
import kotlinx.coroutines.flow.Flow
|
||||
import kotlinx.coroutines.flow.flow
|
||||
import kotlinx.coroutines.flow.flowOn
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.util.concurrent.Executors
|
||||
import kotlin.concurrent.thread
|
||||
|
||||
class Llm {
|
||||
private val tag: String? = this::class.simpleName
|
||||
|
||||
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
|
||||
|
||||
private val runLoop: CoroutineDispatcher = Executors.newSingleThreadExecutor {
|
||||
thread(start = false, name = "Llm-RunLoop") {
|
||||
Log.d(tag, "Dedicated thread for native code: ${Thread.currentThread().name}")
|
||||
|
||||
// No-op if called more than once.
|
||||
System.loadLibrary("llama-android")
|
||||
|
||||
// Set llama log handler to Android
|
||||
log_to_android()
|
||||
backend_init(false)
|
||||
|
||||
Log.d(tag, system_info())
|
||||
|
||||
it.run()
|
||||
}.apply {
|
||||
uncaughtExceptionHandler = Thread.UncaughtExceptionHandler { _, exception: Throwable ->
|
||||
Log.e(tag, "Unhandled exception", exception)
|
||||
}
|
||||
}
|
||||
}.asCoroutineDispatcher()
|
||||
|
||||
private val nlen: Int = 64
|
||||
|
||||
private external fun log_to_android()
|
||||
private external fun load_model(filename: String): Long
|
||||
private external fun free_model(model: Long)
|
||||
private external fun new_context(model: Long): Long
|
||||
private external fun free_context(context: Long)
|
||||
private external fun backend_init(numa: Boolean)
|
||||
private external fun backend_free()
|
||||
private external fun free_batch(batch: Long)
|
||||
private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
|
||||
private external fun bench_model(
|
||||
context: Long,
|
||||
model: Long,
|
||||
batch: Long,
|
||||
pp: Int,
|
||||
tg: Int,
|
||||
pl: Int,
|
||||
nr: Int
|
||||
): String
|
||||
|
||||
private external fun system_info(): String
|
||||
|
||||
private external fun completion_init(
|
||||
context: Long,
|
||||
batch: Long,
|
||||
text: String,
|
||||
nLen: Int
|
||||
): Int
|
||||
|
||||
private external fun completion_loop(
|
||||
context: Long,
|
||||
batch: Long,
|
||||
nLen: Int,
|
||||
ncur: IntVar
|
||||
): String
|
||||
|
||||
private external fun kv_cache_clear(context: Long)
|
||||
|
||||
suspend fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1): String {
|
||||
return withContext(runLoop) {
|
||||
when (val state = threadLocalState.get()) {
|
||||
is State.Loaded -> {
|
||||
Log.d(tag, "bench(): $state")
|
||||
bench_model(state.context, state.model, state.batch, pp, tg, pl, nr)
|
||||
}
|
||||
|
||||
else -> throw IllegalStateException("No model loaded")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
suspend fun load(pathToModel: String) {
|
||||
withContext(runLoop) {
|
||||
when (threadLocalState.get()) {
|
||||
is State.Idle -> {
|
||||
val model = load_model(pathToModel)
|
||||
if (model == 0L) throw IllegalStateException("load_model() failed")
|
||||
|
||||
val context = new_context(model)
|
||||
if (context == 0L) throw IllegalStateException("new_context() failed")
|
||||
|
||||
val batch = new_batch(512, 0, 1)
|
||||
if (batch == 0L) throw IllegalStateException("new_batch() failed")
|
||||
|
||||
Log.i(tag, "Loaded model $pathToModel")
|
||||
threadLocalState.set(State.Loaded(model, context, batch))
|
||||
}
|
||||
else -> throw IllegalStateException("Model already loaded")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fun send(message: String): Flow<String> = flow {
|
||||
when (val state = threadLocalState.get()) {
|
||||
is State.Loaded -> {
|
||||
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
|
||||
while (ncur.value <= nlen) {
|
||||
val str = completion_loop(state.context, state.batch, nlen, ncur)
|
||||
if (str.isEmpty()) {
|
||||
break
|
||||
}
|
||||
emit(str)
|
||||
}
|
||||
kv_cache_clear(state.context)
|
||||
}
|
||||
else -> {}
|
||||
}
|
||||
}.flowOn(runLoop)
|
||||
|
||||
/**
|
||||
* Unloads the model and frees resources.
|
||||
*
|
||||
* This is a no-op if there's no model loaded.
|
||||
*/
|
||||
suspend fun unload() {
|
||||
withContext(runLoop) {
|
||||
when (val state = threadLocalState.get()) {
|
||||
is State.Loaded -> {
|
||||
free_context(state.context)
|
||||
free_model(state.model)
|
||||
free_batch(state.batch)
|
||||
|
||||
threadLocalState.set(State.Idle)
|
||||
}
|
||||
else -> {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
companion object {
|
||||
private class IntVar(value: Int) {
|
||||
@Volatile
|
||||
var value: Int = value
|
||||
private set
|
||||
|
||||
fun inc() {
|
||||
synchronized(this) {
|
||||
value += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private sealed interface State {
|
||||
data object Idle: State
|
||||
data class Loaded(val model: Long, val context: Long, val batch: Long): State
|
||||
}
|
||||
|
||||
// Enforce only one instance of Llm.
|
||||
private val _instance: Llm = Llm()
|
||||
|
||||
fun instance(): Llm = _instance
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,154 @@
|
||||
package com.example.llama
|
||||
|
||||
import android.app.ActivityManager
|
||||
import android.app.DownloadManager
|
||||
import android.content.ClipData
|
||||
import android.content.ClipboardManager
|
||||
import android.net.Uri
|
||||
import android.os.Bundle
|
||||
import android.os.StrictMode
|
||||
import android.os.StrictMode.VmPolicy
|
||||
import android.text.format.Formatter
|
||||
import androidx.activity.ComponentActivity
|
||||
import androidx.activity.compose.setContent
|
||||
import androidx.activity.viewModels
|
||||
import androidx.compose.foundation.layout.Box
|
||||
import androidx.compose.foundation.layout.Column
|
||||
import androidx.compose.foundation.layout.Row
|
||||
import androidx.compose.foundation.layout.fillMaxSize
|
||||
import androidx.compose.foundation.layout.padding
|
||||
import androidx.compose.foundation.lazy.LazyColumn
|
||||
import androidx.compose.foundation.lazy.items
|
||||
import androidx.compose.foundation.lazy.rememberLazyListState
|
||||
import androidx.compose.material3.Button
|
||||
import androidx.compose.material3.LocalContentColor
|
||||
import androidx.compose.material3.MaterialTheme
|
||||
import androidx.compose.material3.OutlinedTextField
|
||||
import androidx.compose.material3.Surface
|
||||
import androidx.compose.material3.Text
|
||||
import androidx.compose.runtime.Composable
|
||||
import androidx.compose.ui.Modifier
|
||||
import androidx.compose.ui.unit.dp
|
||||
import androidx.core.content.getSystemService
|
||||
import com.example.llama.ui.theme.LlamaAndroidTheme
|
||||
import java.io.File
|
||||
|
||||
class MainActivity(
|
||||
activityManager: ActivityManager? = null,
|
||||
downloadManager: DownloadManager? = null,
|
||||
clipboardManager: ClipboardManager? = null,
|
||||
): ComponentActivity() {
|
||||
private val tag: String? = this::class.simpleName
|
||||
|
||||
private val activityManager by lazy { activityManager ?: getSystemService<ActivityManager>()!! }
|
||||
private val downloadManager by lazy { downloadManager ?: getSystemService<DownloadManager>()!! }
|
||||
private val clipboardManager by lazy { clipboardManager ?: getSystemService<ClipboardManager>()!! }
|
||||
|
||||
private val viewModel: MainViewModel by viewModels()
|
||||
|
||||
// Get a MemoryInfo object for the device's current memory status.
|
||||
private fun availableMemory(): ActivityManager.MemoryInfo {
|
||||
return ActivityManager.MemoryInfo().also { memoryInfo ->
|
||||
activityManager.getMemoryInfo(memoryInfo)
|
||||
}
|
||||
}
|
||||
|
||||
override fun onCreate(savedInstanceState: Bundle?) {
|
||||
super.onCreate(savedInstanceState)
|
||||
|
||||
StrictMode.setVmPolicy(
|
||||
VmPolicy.Builder(StrictMode.getVmPolicy())
|
||||
.detectLeakedClosableObjects()
|
||||
.build()
|
||||
)
|
||||
|
||||
val free = Formatter.formatFileSize(this, availableMemory().availMem)
|
||||
val total = Formatter.formatFileSize(this, availableMemory().totalMem)
|
||||
|
||||
viewModel.log("Current memory: $free / $total")
|
||||
viewModel.log("Downloads directory: ${getExternalFilesDir(null)}")
|
||||
|
||||
val extFilesDir = getExternalFilesDir(null)
|
||||
|
||||
val models = listOf(
|
||||
Downloadable(
|
||||
"Phi-2 7B (Q4_0, 1.6 GiB)",
|
||||
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
|
||||
File(extFilesDir, "phi-2-q4_0.gguf"),
|
||||
),
|
||||
Downloadable(
|
||||
"TinyLlama 1.1B (f16, 2.2 GiB)",
|
||||
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
|
||||
File(extFilesDir, "tinyllama-1.1-f16.gguf"),
|
||||
),
|
||||
Downloadable(
|
||||
"Phi 2 DPO (Q3_K_M, 1.48 GiB)",
|
||||
Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
|
||||
File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
|
||||
),
|
||||
)
|
||||
|
||||
setContent {
|
||||
LlamaAndroidTheme {
|
||||
// A surface container using the 'background' color from the theme
|
||||
Surface(
|
||||
modifier = Modifier.fillMaxSize(),
|
||||
color = MaterialTheme.colorScheme.background
|
||||
) {
|
||||
MainCompose(
|
||||
viewModel,
|
||||
clipboardManager,
|
||||
downloadManager,
|
||||
models,
|
||||
)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Composable
|
||||
fun MainCompose(
|
||||
viewModel: MainViewModel,
|
||||
clipboard: ClipboardManager,
|
||||
dm: DownloadManager,
|
||||
models: List<Downloadable>
|
||||
) {
|
||||
Column {
|
||||
val scrollState = rememberLazyListState()
|
||||
|
||||
Box(modifier = Modifier.weight(1f)) {
|
||||
LazyColumn(state = scrollState) {
|
||||
items(viewModel.messages) {
|
||||
Text(
|
||||
it,
|
||||
style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current),
|
||||
modifier = Modifier.padding(16.dp)
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
OutlinedTextField(
|
||||
value = viewModel.message,
|
||||
onValueChange = { viewModel.updateMessage(it) },
|
||||
label = { Text("Message") },
|
||||
)
|
||||
Row {
|
||||
Button({ viewModel.send() }) { Text("Send") }
|
||||
Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") }
|
||||
Button({ viewModel.clear() }) { Text("Clear") }
|
||||
Button({
|
||||
viewModel.messages.joinToString("\n").let {
|
||||
clipboard.setPrimaryClip(ClipData.newPlainText("", it))
|
||||
}
|
||||
}) { Text("Copy") }
|
||||
}
|
||||
|
||||
Column {
|
||||
for (model in models) {
|
||||
Downloadable.Button(viewModel, dm, model)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,104 @@
|
||||
package com.example.llama
|
||||
|
||||
import android.util.Log
|
||||
import androidx.compose.runtime.getValue
|
||||
import androidx.compose.runtime.mutableStateOf
|
||||
import androidx.compose.runtime.setValue
|
||||
import androidx.lifecycle.ViewModel
|
||||
import androidx.lifecycle.viewModelScope
|
||||
import kotlinx.coroutines.flow.catch
|
||||
import kotlinx.coroutines.launch
|
||||
|
||||
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
companion object {
|
||||
@JvmStatic
|
||||
private val NanosPerSecond = 1_000_000_000.0
|
||||
}
|
||||
|
||||
private val tag: String? = this::class.simpleName
|
||||
|
||||
var messages by mutableStateOf(listOf("Initializing..."))
|
||||
private set
|
||||
|
||||
var message by mutableStateOf("")
|
||||
private set
|
||||
|
||||
override fun onCleared() {
|
||||
super.onCleared()
|
||||
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
llm.unload()
|
||||
} catch (exc: IllegalStateException) {
|
||||
messages += exc.message!!
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fun send() {
|
||||
val text = message
|
||||
message = ""
|
||||
|
||||
// Add to messages console.
|
||||
messages += text
|
||||
messages += ""
|
||||
|
||||
viewModelScope.launch {
|
||||
llm.send(text)
|
||||
.catch {
|
||||
Log.e(tag, "send() failed", it)
|
||||
messages += it.message!!
|
||||
}
|
||||
.collect { messages = messages.dropLast(1) + (messages.last() + it) }
|
||||
}
|
||||
}
|
||||
|
||||
fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) {
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
val start = System.nanoTime()
|
||||
val warmupResult = llm.bench(pp, tg, pl, nr)
|
||||
val end = System.nanoTime()
|
||||
|
||||
messages += warmupResult
|
||||
|
||||
val warmup = (end - start).toDouble() / NanosPerSecond
|
||||
messages += "Warm up time: $warmup seconds, please wait..."
|
||||
|
||||
if (warmup > 5.0) {
|
||||
messages += "Warm up took too long, aborting benchmark"
|
||||
return@launch
|
||||
}
|
||||
|
||||
messages += llm.bench(512, 128, 1, 3)
|
||||
} catch (exc: IllegalStateException) {
|
||||
Log.e(tag, "bench() failed", exc)
|
||||
messages += exc.message!!
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fun load(pathToModel: String) {
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
llm.load(pathToModel)
|
||||
messages += "Loaded $pathToModel"
|
||||
} catch (exc: IllegalStateException) {
|
||||
Log.e(tag, "load() failed", exc)
|
||||
messages += exc.message!!
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fun updateMessage(newMessage: String) {
|
||||
message = newMessage
|
||||
}
|
||||
|
||||
fun clear() {
|
||||
messages = listOf()
|
||||
}
|
||||
|
||||
fun log(message: String) {
|
||||
messages += message
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
package com.example.llama.ui.theme
|
||||
|
||||
import androidx.compose.ui.graphics.Color
|
||||
|
||||
val Purple80 = Color(0xFFD0BCFF)
|
||||
val PurpleGrey80 = Color(0xFFCCC2DC)
|
||||
val Pink80 = Color(0xFFEFB8C8)
|
||||
|
||||
val Purple40 = Color(0xFF6650a4)
|
||||
val PurpleGrey40 = Color(0xFF625b71)
|
||||
val Pink40 = Color(0xFF7D5260)
|
||||
@@ -0,0 +1,70 @@
|
||||
package com.example.llama.ui.theme
|
||||
|
||||
import android.app.Activity
|
||||
import android.os.Build
|
||||
import androidx.compose.foundation.isSystemInDarkTheme
|
||||
import androidx.compose.material3.MaterialTheme
|
||||
import androidx.compose.material3.darkColorScheme
|
||||
import androidx.compose.material3.dynamicDarkColorScheme
|
||||
import androidx.compose.material3.dynamicLightColorScheme
|
||||
import androidx.compose.material3.lightColorScheme
|
||||
import androidx.compose.runtime.Composable
|
||||
import androidx.compose.runtime.SideEffect
|
||||
import androidx.compose.ui.graphics.toArgb
|
||||
import androidx.compose.ui.platform.LocalContext
|
||||
import androidx.compose.ui.platform.LocalView
|
||||
import androidx.core.view.WindowCompat
|
||||
|
||||
private val DarkColorScheme = darkColorScheme(
|
||||
primary = Purple80,
|
||||
secondary = PurpleGrey80,
|
||||
tertiary = Pink80
|
||||
)
|
||||
|
||||
private val LightColorScheme = lightColorScheme(
|
||||
primary = Purple40,
|
||||
secondary = PurpleGrey40,
|
||||
tertiary = Pink40
|
||||
|
||||
/* Other default colors to override
|
||||
background = Color(0xFFFFFBFE),
|
||||
surface = Color(0xFFFFFBFE),
|
||||
onPrimary = Color.White,
|
||||
onSecondary = Color.White,
|
||||
onTertiary = Color.White,
|
||||
onBackground = Color(0xFF1C1B1F),
|
||||
onSurface = Color(0xFF1C1B1F),
|
||||
*/
|
||||
)
|
||||
|
||||
@Composable
|
||||
fun LlamaAndroidTheme(
|
||||
darkTheme: Boolean = isSystemInDarkTheme(),
|
||||
// Dynamic color is available on Android 12+
|
||||
dynamicColor: Boolean = true,
|
||||
content: @Composable () -> Unit
|
||||
) {
|
||||
val colorScheme = when {
|
||||
dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> {
|
||||
val context = LocalContext.current
|
||||
if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context)
|
||||
}
|
||||
|
||||
darkTheme -> DarkColorScheme
|
||||
else -> LightColorScheme
|
||||
}
|
||||
val view = LocalView.current
|
||||
if (!view.isInEditMode) {
|
||||
SideEffect {
|
||||
val window = (view.context as Activity).window
|
||||
window.statusBarColor = colorScheme.primary.toArgb()
|
||||
WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme
|
||||
}
|
||||
}
|
||||
|
||||
MaterialTheme(
|
||||
colorScheme = colorScheme,
|
||||
typography = Typography,
|
||||
content = content
|
||||
)
|
||||
}
|
||||
@@ -0,0 +1,34 @@
|
||||
package com.example.llama.ui.theme
|
||||
|
||||
import androidx.compose.material3.Typography
|
||||
import androidx.compose.ui.text.TextStyle
|
||||
import androidx.compose.ui.text.font.FontFamily
|
||||
import androidx.compose.ui.text.font.FontWeight
|
||||
import androidx.compose.ui.unit.sp
|
||||
|
||||
// Set of Material typography styles to start with
|
||||
val Typography = Typography(
|
||||
bodyLarge = TextStyle(
|
||||
fontFamily = FontFamily.Default,
|
||||
fontWeight = FontWeight.Normal,
|
||||
fontSize = 16.sp,
|
||||
lineHeight = 24.sp,
|
||||
letterSpacing = 0.5.sp
|
||||
)
|
||||
/* Other default text styles to override
|
||||
titleLarge = TextStyle(
|
||||
fontFamily = FontFamily.Default,
|
||||
fontWeight = FontWeight.Normal,
|
||||
fontSize = 22.sp,
|
||||
lineHeight = 28.sp,
|
||||
letterSpacing = 0.sp
|
||||
),
|
||||
labelSmall = TextStyle(
|
||||
fontFamily = FontFamily.Default,
|
||||
fontWeight = FontWeight.Medium,
|
||||
fontSize = 11.sp,
|
||||
lineHeight = 16.sp,
|
||||
letterSpacing = 0.5.sp
|
||||
)
|
||||
*/
|
||||
)
|
||||
@@ -0,0 +1,170 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<vector xmlns:android="http://schemas.android.com/apk/res/android"
|
||||
android:width="108dp"
|
||||
android:height="108dp"
|
||||
android:viewportWidth="108"
|
||||
android:viewportHeight="108">
|
||||
<path
|
||||
android:fillColor="#3DDC84"
|
||||
android:pathData="M0,0h108v108h-108z" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M9,0L9,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,0L19,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M29,0L29,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M39,0L39,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M49,0L49,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M59,0L59,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M69,0L69,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M79,0L79,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M89,0L89,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M99,0L99,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,9L108,9"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,19L108,19"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,29L108,29"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,39L108,39"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,49L108,49"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,59L108,59"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,69L108,69"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,79L108,79"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,89L108,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,99L108,99"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,29L89,29"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,39L89,39"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,49L89,49"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,59L89,59"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,69L89,69"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,79L89,79"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M29,19L29,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M39,19L39,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M49,19L49,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M59,19L59,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M69,19L69,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M79,19L79,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
</vector>
|
||||
@@ -0,0 +1,30 @@
|
||||
<vector xmlns:android="http://schemas.android.com/apk/res/android"
|
||||
xmlns:aapt="http://schemas.android.com/aapt"
|
||||
android:width="108dp"
|
||||
android:height="108dp"
|
||||
android:viewportWidth="108"
|
||||
android:viewportHeight="108">
|
||||
<path android:pathData="M31,63.928c0,0 6.4,-11 12.1,-13.1c7.2,-2.6 26,-1.4 26,-1.4l38.1,38.1L107,108.928l-32,-1L31,63.928z">
|
||||
<aapt:attr name="android:fillColor">
|
||||
<gradient
|
||||
android:endX="85.84757"
|
||||
android:endY="92.4963"
|
||||
android:startX="42.9492"
|
||||
android:startY="49.59793"
|
||||
android:type="linear">
|
||||
<item
|
||||
android:color="#44000000"
|
||||
android:offset="0.0" />
|
||||
<item
|
||||
android:color="#00000000"
|
||||
android:offset="1.0" />
|
||||
</gradient>
|
||||
</aapt:attr>
|
||||
</path>
|
||||
<path
|
||||
android:fillColor="#FFFFFF"
|
||||
android:fillType="nonZero"
|
||||
android:pathData="M65.3,45.828l3.8,-6.6c0.2,-0.4 0.1,-0.9 -0.3,-1.1c-0.4,-0.2 -0.9,-0.1 -1.1,0.3l-3.9,6.7c-6.3,-2.8 -13.4,-2.8 -19.7,0l-3.9,-6.7c-0.2,-0.4 -0.7,-0.5 -1.1,-0.3C38.8,38.328 38.7,38.828 38.9,39.228l3.8,6.6C36.2,49.428 31.7,56.028 31,63.928h46C76.3,56.028 71.8,49.428 65.3,45.828zM43.4,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2c-0.3,-0.7 -0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C45.3,56.528 44.5,57.328 43.4,57.328L43.4,57.328zM64.6,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2s-0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C66.5,56.528 65.6,57.328 64.6,57.328L64.6,57.328z"
|
||||
android:strokeWidth="1"
|
||||
android:strokeColor="#00000000" />
|
||||
</vector>
|
||||
@@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
|
||||
<background android:drawable="@drawable/ic_launcher_background" />
|
||||
<foreground android:drawable="@drawable/ic_launcher_foreground" />
|
||||
<monochrome android:drawable="@drawable/ic_launcher_foreground" />
|
||||
</adaptive-icon>
|
||||
@@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
|
||||
<background android:drawable="@drawable/ic_launcher_background" />
|
||||
<foreground android:drawable="@drawable/ic_launcher_foreground" />
|
||||
<monochrome android:drawable="@drawable/ic_launcher_foreground" />
|
||||
</adaptive-icon>
|
||||
|
After Width: | Height: | Size: 1.4 KiB |
|
After Width: | Height: | Size: 2.8 KiB |
|
After Width: | Height: | Size: 982 B |
|
After Width: | Height: | Size: 1.7 KiB |
|
After Width: | Height: | Size: 1.9 KiB |
|
After Width: | Height: | Size: 3.8 KiB |
|
After Width: | Height: | Size: 2.8 KiB |
|
After Width: | Height: | Size: 5.8 KiB |
|
After Width: | Height: | Size: 3.8 KiB |
|
After Width: | Height: | Size: 7.6 KiB |
@@ -0,0 +1,10 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<resources>
|
||||
<color name="purple_200">#FFBB86FC</color>
|
||||
<color name="purple_500">#FF6200EE</color>
|
||||
<color name="purple_700">#FF3700B3</color>
|
||||
<color name="teal_200">#FF03DAC5</color>
|
||||
<color name="teal_700">#FF018786</color>
|
||||
<color name="black">#FF000000</color>
|
||||
<color name="white">#FFFFFFFF</color>
|
||||
</resources>
|
||||
@@ -0,0 +1,3 @@
|
||||
<resources>
|
||||
<string name="app_name">LlamaAndroid</string>
|
||||
</resources>
|
||||
@@ -0,0 +1,5 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<resources>
|
||||
|
||||
<style name="Theme.LlamaAndroid" parent="android:Theme.Material.Light.NoActionBar" />
|
||||
</resources>
|
||||
@@ -0,0 +1,13 @@
|
||||
<?xml version="1.0" encoding="utf-8"?><!--
|
||||
Sample backup rules file; uncomment and customize as necessary.
|
||||
See https://developer.android.com/guide/topics/data/autobackup
|
||||
for details.
|
||||
Note: This file is ignored for devices older that API 31
|
||||
See https://developer.android.com/about/versions/12/backup-restore
|
||||
-->
|
||||
<full-backup-content>
|
||||
<!--
|
||||
<include domain="sharedpref" path="."/>
|
||||
<exclude domain="sharedpref" path="device.xml"/>
|
||||
-->
|
||||
</full-backup-content>
|
||||
@@ -0,0 +1,19 @@
|
||||
<?xml version="1.0" encoding="utf-8"?><!--
|
||||
Sample data extraction rules file; uncomment and customize as necessary.
|
||||
See https://developer.android.com/about/versions/12/backup-restore#xml-changes
|
||||
for details.
|
||||
-->
|
||||
<data-extraction-rules>
|
||||
<cloud-backup>
|
||||
<!-- TODO: Use <include> and <exclude> to control what is backed up.
|
||||
<include .../>
|
||||
<exclude .../>
|
||||
-->
|
||||
</cloud-backup>
|
||||
<!--
|
||||
<device-transfer>
|
||||
<include .../>
|
||||
<exclude .../>
|
||||
</device-transfer>
|
||||
-->
|
||||
</data-extraction-rules>
|
||||
@@ -0,0 +1,5 @@
|
||||
// Top-level build file where you can add configuration options common to all sub-projects/modules.
|
||||
plugins {
|
||||
id("com.android.application") version "8.2.0" apply false
|
||||
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
# Project-wide Gradle settings.
|
||||
# IDE (e.g. Android Studio) users:
|
||||
# Gradle settings configured through the IDE *will override*
|
||||
# any settings specified in this file.
|
||||
# For more details on how to configure your build environment visit
|
||||
# http://www.gradle.org/docs/current/userguide/build_environment.html
|
||||
# Specifies the JVM arguments used for the daemon process.
|
||||
# The setting is particularly useful for tweaking memory settings.
|
||||
org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
|
||||
# When configured, Gradle will run in incubating parallel mode.
|
||||
# This option should only be used with decoupled projects. More details, visit
|
||||
# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
|
||||
# org.gradle.parallel=true
|
||||
# AndroidX package structure to make it clearer which packages are bundled with the
|
||||
# Android operating system, and which are packaged with your app's APK
|
||||
# https://developer.android.com/topic/libraries/support-library/androidx-rn
|
||||
android.useAndroidX=true
|
||||
# Kotlin code style for this project: "official" or "obsolete":
|
||||
kotlin.code.style=official
|
||||
# Enables namespacing of each library's R class so that its R class includes only the
|
||||
# resources declared in the library itself and none from the library's dependencies,
|
||||
# thereby reducing the size of the R class for that library
|
||||
android.nonTransitiveRClass=true
|
||||
@@ -0,0 +1,6 @@
|
||||
#Thu Dec 21 14:31:09 AEDT 2023
|
||||
distributionBase=GRADLE_USER_HOME
|
||||
distributionPath=wrapper/dists
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
|
||||
zipStoreBase=GRADLE_USER_HOME
|
||||
zipStorePath=wrapper/dists
|
||||
@@ -0,0 +1,185 @@
|
||||
#!/usr/bin/env sh
|
||||
|
||||
#
|
||||
# Copyright 2015 the original author or authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Gradle start up script for UN*X
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
# Attempt to set APP_HOME
|
||||
# Resolve links: $0 may be a link
|
||||
PRG="$0"
|
||||
# Need this for relative symlinks.
|
||||
while [ -h "$PRG" ] ; do
|
||||
ls=`ls -ld "$PRG"`
|
||||
link=`expr "$ls" : '.*-> \(.*\)$'`
|
||||
if expr "$link" : '/.*' > /dev/null; then
|
||||
PRG="$link"
|
||||
else
|
||||
PRG=`dirname "$PRG"`"/$link"
|
||||
fi
|
||||
done
|
||||
SAVED="`pwd`"
|
||||
cd "`dirname \"$PRG\"`/" >/dev/null
|
||||
APP_HOME="`pwd -P`"
|
||||
cd "$SAVED" >/dev/null
|
||||
|
||||
APP_NAME="Gradle"
|
||||
APP_BASE_NAME=`basename "$0"`
|
||||
|
||||
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
||||
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
|
||||
|
||||
# Use the maximum available, or set MAX_FD != -1 to use that value.
|
||||
MAX_FD="maximum"
|
||||
|
||||
warn () {
|
||||
echo "$*"
|
||||
}
|
||||
|
||||
die () {
|
||||
echo
|
||||
echo "$*"
|
||||
echo
|
||||
exit 1
|
||||
}
|
||||
|
||||
# OS specific support (must be 'true' or 'false').
|
||||
cygwin=false
|
||||
msys=false
|
||||
darwin=false
|
||||
nonstop=false
|
||||
case "`uname`" in
|
||||
CYGWIN* )
|
||||
cygwin=true
|
||||
;;
|
||||
Darwin* )
|
||||
darwin=true
|
||||
;;
|
||||
MINGW* )
|
||||
msys=true
|
||||
;;
|
||||
NONSTOP* )
|
||||
nonstop=true
|
||||
;;
|
||||
esac
|
||||
|
||||
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
|
||||
|
||||
|
||||
# Determine the Java command to use to start the JVM.
|
||||
if [ -n "$JAVA_HOME" ] ; then
|
||||
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
|
||||
# IBM's JDK on AIX uses strange locations for the executables
|
||||
JAVACMD="$JAVA_HOME/jre/sh/java"
|
||||
else
|
||||
JAVACMD="$JAVA_HOME/bin/java"
|
||||
fi
|
||||
if [ ! -x "$JAVACMD" ] ; then
|
||||
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
|
||||
|
||||
Please set the JAVA_HOME variable in your environment to match the
|
||||
location of your Java installation."
|
||||
fi
|
||||
else
|
||||
JAVACMD="java"
|
||||
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
|
||||
|
||||
Please set the JAVA_HOME variable in your environment to match the
|
||||
location of your Java installation."
|
||||
fi
|
||||
|
||||
# Increase the maximum file descriptors if we can.
|
||||
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
|
||||
MAX_FD_LIMIT=`ulimit -H -n`
|
||||
if [ $? -eq 0 ] ; then
|
||||
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
|
||||
MAX_FD="$MAX_FD_LIMIT"
|
||||
fi
|
||||
ulimit -n $MAX_FD
|
||||
if [ $? -ne 0 ] ; then
|
||||
warn "Could not set maximum file descriptor limit: $MAX_FD"
|
||||
fi
|
||||
else
|
||||
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
|
||||
fi
|
||||
fi
|
||||
|
||||
# For Darwin, add options to specify how the application appears in the dock
|
||||
if $darwin; then
|
||||
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
|
||||
fi
|
||||
|
||||
# For Cygwin or MSYS, switch paths to Windows format before running java
|
||||
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
|
||||
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
|
||||
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
|
||||
|
||||
JAVACMD=`cygpath --unix "$JAVACMD"`
|
||||
|
||||
# We build the pattern for arguments to be converted via cygpath
|
||||
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
|
||||
SEP=""
|
||||
for dir in $ROOTDIRSRAW ; do
|
||||
ROOTDIRS="$ROOTDIRS$SEP$dir"
|
||||
SEP="|"
|
||||
done
|
||||
OURCYGPATTERN="(^($ROOTDIRS))"
|
||||
# Add a user-defined pattern to the cygpath arguments
|
||||
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
|
||||
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
|
||||
fi
|
||||
# Now convert the arguments - kludge to limit ourselves to /bin/sh
|
||||
i=0
|
||||
for arg in "$@" ; do
|
||||
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
|
||||
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
|
||||
|
||||
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
|
||||
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
|
||||
else
|
||||
eval `echo args$i`="\"$arg\""
|
||||
fi
|
||||
i=`expr $i + 1`
|
||||
done
|
||||
case $i in
|
||||
0) set -- ;;
|
||||
1) set -- "$args0" ;;
|
||||
2) set -- "$args0" "$args1" ;;
|
||||
3) set -- "$args0" "$args1" "$args2" ;;
|
||||
4) set -- "$args0" "$args1" "$args2" "$args3" ;;
|
||||
5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
|
||||
6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
|
||||
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
|
||||
8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
|
||||
9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
# Escape application args
|
||||
save () {
|
||||
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
|
||||
echo " "
|
||||
}
|
||||
APP_ARGS=`save "$@"`
|
||||
|
||||
# Collect all arguments for the java command, following the shell quoting and substitution rules
|
||||
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
|
||||
|
||||
exec "$JAVACMD" "$@"
|
||||
@@ -0,0 +1,17 @@
|
||||
pluginManagement {
|
||||
repositories {
|
||||
google()
|
||||
mavenCentral()
|
||||
gradlePluginPortal()
|
||||
}
|
||||
}
|
||||
dependencyResolutionManagement {
|
||||
repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
|
||||
repositories {
|
||||
google()
|
||||
mavenCentral()
|
||||
}
|
||||
}
|
||||
|
||||
rootProject.name = "LlamaAndroid"
|
||||
include(":app")
|
||||
@@ -65,6 +65,10 @@ int main(int argc, char ** argv) {
|
||||
// load the draft model
|
||||
params.model = params.model_draft;
|
||||
params.n_gpu_layers = params.n_gpu_layers_draft;
|
||||
if (params.n_threads_draft > 0) {
|
||||
params.n_threads = params.n_threads_draft;
|
||||
}
|
||||
params.n_threads_batch = params.n_threads_batch_draft;
|
||||
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
|
||||
|
||||
{
|
||||
|
||||
@@ -16,14 +16,14 @@ extern "C" {
|
||||
typedef void * ggml_backend_buffer_type_context_t;
|
||||
|
||||
struct ggml_backend_buffer_type_i {
|
||||
const char * (*get_name) (ggml_backend_buffer_type_t buft);
|
||||
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
||||
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
||||
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
||||
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
||||
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
|
||||
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
||||
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
||||
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
||||
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
||||
// check if tensor data is in host memory
|
||||
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
|
||||
bool (*is_host) (ggml_backend_buffer_type_t buft);
|
||||
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
|
||||
};
|
||||
|
||||
struct ggml_backend_buffer_type {
|
||||
@@ -35,15 +35,15 @@ extern "C" {
|
||||
typedef void * ggml_backend_buffer_context_t;
|
||||
|
||||
struct ggml_backend_buffer_i {
|
||||
const char * (*get_name) (ggml_backend_buffer_t buffer);
|
||||
void (*free_buffer)(ggml_backend_buffer_t buffer);
|
||||
void * (*get_base) (ggml_backend_buffer_t buffer);
|
||||
void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
|
||||
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
||||
const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
|
||||
void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
|
||||
void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
|
||||
void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
|
||||
void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
||||
};
|
||||
|
||||
struct ggml_backend_buffer {
|
||||
@@ -54,7 +54,7 @@ extern "C" {
|
||||
enum ggml_backend_buffer_usage usage;
|
||||
};
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
ggml_backend_buffer_type_t buft,
|
||||
struct ggml_backend_buffer_i iface,
|
||||
ggml_backend_buffer_context_t context,
|
||||
@@ -70,31 +70,31 @@ extern "C" {
|
||||
typedef void * ggml_backend_context_t;
|
||||
|
||||
struct ggml_backend_i {
|
||||
const char * (*get_name)(ggml_backend_t backend);
|
||||
const char * (*GGML_CALL get_name)(ggml_backend_t backend);
|
||||
|
||||
void (*free)(ggml_backend_t backend);
|
||||
void (*GGML_CALL free)(ggml_backend_t backend);
|
||||
|
||||
// buffer allocation
|
||||
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
|
||||
ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
|
||||
|
||||
// (optional) asynchronous tensor data access
|
||||
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
// (optional) complete all pending operations
|
||||
void (*synchronize)(ggml_backend_t backend);
|
||||
void (*GGML_CALL synchronize)(ggml_backend_t backend);
|
||||
|
||||
// compute graph with a plan
|
||||
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
||||
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
||||
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
|
||||
// compute graph without a plan (async)
|
||||
bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
|
||||
// check if the backend supports an operation
|
||||
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
};
|
||||
|
||||
struct ggml_backend {
|
||||
@@ -107,9 +107,9 @@ extern "C" {
|
||||
// Backend registry
|
||||
//
|
||||
|
||||
typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
|
||||
typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
|
||||
|
||||
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
|
||||
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@@ -19,7 +19,7 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
||||
return buft->iface.get_name(buft);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
return buft->iface.alloc_buffer(buft, size);
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return buft->iface.get_alignment(buft);
|
||||
}
|
||||
|
||||
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
||||
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
||||
// get_alloc_size is optional, defaults to ggml_nbytes
|
||||
if (buft->iface.get_alloc_size) {
|
||||
return buft->iface.get_alloc_size(buft, tensor);
|
||||
@@ -48,7 +48,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
||||
|
||||
// backend buffer
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
ggml_backend_buffer_type_t buft,
|
||||
struct ggml_backend_buffer_i iface,
|
||||
ggml_backend_buffer_context_t context,
|
||||
@@ -95,7 +95,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return base;
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
// init_tensor is optional
|
||||
if (buffer->iface.init_tensor) {
|
||||
buffer->iface.init_tensor(buffer, tensor);
|
||||
@@ -191,7 +191,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
@@ -201,7 +201,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
|
||||
tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
@@ -318,9 +318,9 @@ struct ggml_backend_reg {
|
||||
static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
|
||||
static size_t ggml_backend_registry_count = 0;
|
||||
|
||||
static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
|
||||
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
|
||||
|
||||
static void ggml_backend_registry_init(void) {
|
||||
GGML_CALL static void ggml_backend_registry_init(void) {
|
||||
static bool initialized = false;
|
||||
|
||||
if (initialized) {
|
||||
@@ -333,18 +333,18 @@ static void ggml_backend_registry_init(void) {
|
||||
|
||||
// add forward decls here to avoid including the backend headers
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
extern void ggml_backend_cuda_reg_devices(void);
|
||||
extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
|
||||
ggml_backend_cuda_reg_devices();
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
||||
extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
||||
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
||||
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
||||
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
|
||||
|
||||
size_t id = ggml_backend_registry_count;
|
||||
@@ -439,33 +439,33 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
|
||||
|
||||
// backend CPU
|
||||
|
||||
static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return (void *)buffer->context;
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
free(buffer->context);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
memcpy((char *)tensor->data + offset, data, size);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
memcpy(data, (const char *)tensor->data + offset, size);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
if (ggml_backend_buffer_is_host(src->buffer)) {
|
||||
memcpy(dst->data, src->data, ggml_nbytes(src));
|
||||
return true;
|
||||
@@ -475,7 +475,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
memset(buffer->context, value, buffer->size);
|
||||
}
|
||||
|
||||
@@ -506,13 +506,13 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
||||
|
||||
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
||||
|
||||
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
||||
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
|
||||
|
||||
@@ -521,25 +521,25 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
|
||||
return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return TENSOR_ALIGNMENT;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||
GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||
return ggml_backend_is_cpu(backend);
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||
return true;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
||||
@@ -561,23 +561,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||
|
||||
#include <hbwmalloc.h>
|
||||
|
||||
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CPU_HBM";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
|
||||
GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
|
||||
return "CPU_HBM";
|
||||
|
||||
GGML_UNUSED(buf);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
hbw_free(buffer->context);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
//void * ptr = hbw_malloc(size);
|
||||
void * ptr;
|
||||
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
||||
@@ -617,20 +617,20 @@ struct ggml_backend_cpu_context {
|
||||
size_t work_size;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
|
||||
GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
||||
GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
free(cpu_ctx->work_data);
|
||||
free(cpu_ctx);
|
||||
free(backend);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
|
||||
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
|
||||
return ggml_backend_cpu_buffer_type();
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
@@ -641,7 +641,7 @@ struct ggml_backend_plan_cpu {
|
||||
struct ggml_cgraph cgraph;
|
||||
};
|
||||
|
||||
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
||||
GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
||||
@@ -656,7 +656,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
|
||||
return cpu_plan;
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||
|
||||
free(cpu_plan->cplan.work_data);
|
||||
@@ -665,7 +665,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||
|
||||
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
||||
@@ -673,7 +673,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
||||
@@ -690,7 +690,7 @@ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
||||
@@ -732,7 +732,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
||||
return cpu_backend;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
||||
GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
||||
return backend && backend->iface.get_name == ggml_backend_cpu_name;
|
||||
}
|
||||
|
||||
@@ -743,11 +743,11 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
||||
ctx->n_threads = n_threads;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
||||
GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
||||
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
|
||||
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
|
||||
return ggml_backend_cpu_init();
|
||||
|
||||
GGML_UNUSED(params);
|
||||
|
||||
@@ -17,12 +17,12 @@ extern "C" {
|
||||
//
|
||||
|
||||
// buffer type
|
||||
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
|
||||
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||
|
||||
// buffer
|
||||
enum ggml_backend_buffer_usage {
|
||||
@@ -30,18 +30,18 @@ extern "C" {
|
||||
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
|
||||
};
|
||||
|
||||
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
|
||||
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
|
||||
|
||||
//
|
||||
// Backend
|
||||
@@ -58,8 +58,8 @@ extern "C" {
|
||||
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
|
||||
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
|
||||
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
|
||||
|
||||
@@ -80,13 +80,13 @@ extern "C" {
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
|
||||
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
|
||||
|
||||
// Create a backend buffer from an existing pointer
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
||||
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||
@@ -183,7 +183,7 @@ extern "C" {
|
||||
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
|
||||
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
|
||||
|
||||
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
|
||||
typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
|
||||
|
||||
// Compare the output of two backends
|
||||
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
|
||||
|
||||
@@ -7615,11 +7615,11 @@ struct cuda_pool_alloc {
|
||||
|
||||
static bool g_cublas_loaded = false;
|
||||
|
||||
bool ggml_cublas_loaded(void) {
|
||||
GGML_CALL bool ggml_cublas_loaded(void) {
|
||||
return g_cublas_loaded;
|
||||
}
|
||||
|
||||
void ggml_init_cublas() {
|
||||
GGML_CALL void ggml_init_cublas() {
|
||||
static bool initialized = false;
|
||||
|
||||
if (!initialized) {
|
||||
@@ -7707,7 +7707,7 @@ void ggml_init_cublas() {
|
||||
}
|
||||
}
|
||||
|
||||
void * ggml_cuda_host_malloc(size_t size) {
|
||||
GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
|
||||
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
@@ -7725,7 +7725,7 @@ void * ggml_cuda_host_malloc(size_t size) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void ggml_cuda_host_free(void * ptr) {
|
||||
GGML_CALL void ggml_cuda_host_free(void * ptr) {
|
||||
CUDA_CHECK(cudaFreeHost(ptr));
|
||||
}
|
||||
|
||||
@@ -9242,7 +9242,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
|
||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
|
||||
}
|
||||
|
||||
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
if (!g_cublas_loaded) return false;
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
@@ -10013,7 +10013,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
|
||||
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
|
||||
}
|
||||
|
||||
static void ggml_cuda_set_main_device(const int main_device) {
|
||||
GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
|
||||
if (main_device >= g_device_count) {
|
||||
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
||||
main_device, g_device_count, g_main_device);
|
||||
@@ -10028,7 +10028,7 @@ static void ggml_cuda_set_main_device(const int main_device) {
|
||||
}
|
||||
}
|
||||
|
||||
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
||||
GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
||||
if (!g_cublas_loaded) return false;
|
||||
|
||||
ggml_cuda_func_t func;
|
||||
@@ -10186,7 +10186,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||
return true;
|
||||
}
|
||||
|
||||
int ggml_cuda_get_device_count() {
|
||||
GGML_CALL int ggml_cuda_get_device_count() {
|
||||
int device_count;
|
||||
if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
|
||||
return 0;
|
||||
@@ -10194,7 +10194,7 @@ int ggml_cuda_get_device_count() {
|
||||
return device_count;
|
||||
}
|
||||
|
||||
void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
|
||||
GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
||||
snprintf(description, description_size, "%s", prop.name);
|
||||
@@ -10244,27 +10244,27 @@ struct ggml_backend_cuda_buffer_context {
|
||||
}
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
||||
GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
CUDA_CHECK(cudaFree(ctx->dev_ptr));
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
return ctx->dev_ptr;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
|
||||
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
||||
@@ -10296,7 +10296,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
@@ -10307,7 +10307,7 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
@@ -10318,7 +10318,7 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
if (ggml_backend_buffer_is_cuda(src->buffer)) {
|
||||
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
|
||||
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
@@ -10335,7 +10335,7 @@ static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, co
|
||||
return false;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
@@ -10357,19 +10357,18 @@ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
||||
};
|
||||
|
||||
// cuda buffer type
|
||||
|
||||
struct ggml_backend_cuda_buffer_type_context {
|
||||
int device;
|
||||
std::string name;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||
|
||||
ggml_cuda_set_device(buft_ctx->device);
|
||||
@@ -10388,13 +10387,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 128;
|
||||
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
int64_t row_low = 0;
|
||||
int64_t row_high = ggml_nrows(tensor);
|
||||
int64_t nrows_split = row_high - row_low;
|
||||
@@ -10414,7 +10413,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||
GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||
if (!ggml_backend_is_cuda(backend)) {
|
||||
return false;
|
||||
}
|
||||
@@ -10434,7 +10433,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
||||
/* .is_host = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
||||
// FIXME: this is not thread safe
|
||||
if (device >= ggml_backend_cuda_get_device_count()) {
|
||||
return nullptr;
|
||||
@@ -10479,7 +10478,7 @@ struct ggml_backend_cuda_split_buffer_context {
|
||||
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
return GGML_CUDA_NAME "_Split";
|
||||
|
||||
UNUSED(buffer);
|
||||
@@ -10490,19 +10489,19 @@ static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_
|
||||
// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
||||
//}
|
||||
|
||||
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
|
||||
return (void *)0x1000;
|
||||
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
@@ -10552,7 +10551,7 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
|
||||
tensor->extra = extra;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
@@ -10586,7 +10585,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
@@ -10620,7 +10619,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
UNUSED(buffer);
|
||||
UNUSED(value);
|
||||
}
|
||||
@@ -10639,13 +10638,13 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
||||
|
||||
// cuda split buffer type
|
||||
|
||||
static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
return GGML_CUDA_NAME "_Split";
|
||||
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
||||
// instead, we allocate them for each tensor separately in init_tensor
|
||||
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
|
||||
@@ -10655,13 +10654,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 128;
|
||||
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
||||
|
||||
size_t total_size = 0;
|
||||
@@ -10688,13 +10687,13 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_bu
|
||||
return total_size;
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||
return ggml_backend_is_cuda(backend);
|
||||
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||
return false;
|
||||
|
||||
UNUSED(buft);
|
||||
@@ -10709,7 +10708,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
|
||||
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
||||
};
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
|
||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
|
||||
// FIXME: this is not thread safe
|
||||
static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
|
||||
|
||||
@@ -10745,23 +10744,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten
|
||||
|
||||
// host buffer type
|
||||
|
||||
static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
return GGML_CUDA_NAME "_Host";
|
||||
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return GGML_CUDA_NAME "_Host";
|
||||
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_cuda_host_free(buffer->context);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
void * ptr = ggml_cuda_host_malloc(size);
|
||||
|
||||
if (ptr == nullptr) {
|
||||
@@ -10777,7 +10776,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
|
||||
return buffer;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
|
||||
@@ -10795,26 +10794,26 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
||||
|
||||
// backend
|
||||
|
||||
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
||||
GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
return cuda_ctx->name.c_str();
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
||||
GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
delete cuda_ctx;
|
||||
delete backend;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
||||
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||
@@ -10823,7 +10822,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
|
||||
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||
@@ -10832,7 +10831,7 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
|
||||
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
|
||||
@@ -10843,7 +10842,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggm
|
||||
return false;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||
GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
|
||||
@@ -10851,7 +10850,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
ggml_cuda_set_main_device(cuda_ctx->device);
|
||||
@@ -10890,7 +10889,7 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
@@ -11016,7 +11015,7 @@ static ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
||||
};
|
||||
|
||||
ggml_backend_t ggml_backend_cuda_init(int device) {
|
||||
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
|
||||
ggml_init_cublas(); // TODO: remove from ggml.c
|
||||
|
||||
if (device < 0 || device >= ggml_cuda_get_device_count()) {
|
||||
@@ -11040,35 +11039,35 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
||||
return cuda_backend;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
||||
GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
||||
return backend && backend->iface.get_name == ggml_backend_cuda_name;
|
||||
}
|
||||
|
||||
int ggml_backend_cuda_get_device_count() {
|
||||
GGML_CALL int ggml_backend_cuda_get_device_count() {
|
||||
return ggml_cuda_get_device_count();
|
||||
}
|
||||
|
||||
void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
||||
GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
||||
ggml_cuda_get_device_description(device, description, description_size);
|
||||
}
|
||||
|
||||
void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
||||
GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
||||
ggml_cuda_set_device(device);
|
||||
|
||||
CUDA_CHECK(cudaMemGetInfo(free, total));
|
||||
}
|
||||
|
||||
// backend registry
|
||||
static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
||||
GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
||||
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
|
||||
return cuda_backend;
|
||||
|
||||
UNUSED(params);
|
||||
}
|
||||
|
||||
extern "C" int ggml_backend_cuda_reg_devices();
|
||||
extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
|
||||
|
||||
int ggml_backend_cuda_reg_devices() {
|
||||
GGML_CALL int ggml_backend_cuda_reg_devices() {
|
||||
int device_count = ggml_cuda_get_device_count();
|
||||
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
|
||||
for (int i = 0; i < device_count; i++) {
|
||||
|
||||
@@ -18,34 +18,34 @@ extern "C" {
|
||||
#define GGML_CUDA_MAX_DEVICES 16
|
||||
|
||||
// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
|
||||
GGML_API void ggml_init_cublas(void);
|
||||
GGML_API GGML_CALL void ggml_init_cublas(void);
|
||||
|
||||
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
|
||||
GGML_API bool ggml_cublas_loaded(void);
|
||||
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
|
||||
|
||||
GGML_API void * ggml_cuda_host_malloc(size_t size);
|
||||
GGML_API void ggml_cuda_host_free(void * ptr);
|
||||
GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
|
||||
GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
|
||||
|
||||
GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||
GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API int ggml_cuda_get_device_count(void);
|
||||
GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||
GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
|
||||
GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||
|
||||
// backend API
|
||||
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
|
||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
|
||||
|
||||
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||
// split tensor buffer that splits matrices by rows across multiple devices
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
||||
|
||||
GGML_API int ggml_backend_cuda_get_device_count(void);
|
||||
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
||||
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
|
||||
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@@ -27,7 +27,6 @@
|
||||
|
||||
// max memory buffers that can be mapped to the device
|
||||
#define GGML_METAL_MAX_BUFFERS 64
|
||||
#define GGML_METAL_MAX_COMMAND_BUFFERS 32
|
||||
|
||||
struct ggml_tensor;
|
||||
struct ggml_cgraph;
|
||||
@@ -47,11 +46,11 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
|
||||
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
|
||||
|
||||
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||
|
||||
// helper to check if the device supports a specific family
|
||||
// ideally, the user code should be doing these checks
|
||||
|
||||
@@ -1990,19 +1990,19 @@ void ggml_print_objects(const struct ggml_context * ctx) {
|
||||
GGML_PRINT("%s: --- end ---\n", __func__);
|
||||
}
|
||||
|
||||
int64_t ggml_nelements(const struct ggml_tensor * tensor) {
|
||||
GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
||||
}
|
||||
|
||||
int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
||||
GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
||||
}
|
||||
|
||||
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
||||
GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
||||
size_t nbytes;
|
||||
size_t blck_size = ggml_blck_size(tensor->type);
|
||||
if (blck_size == 1) {
|
||||
@@ -2025,15 +2025,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
|
||||
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
||||
}
|
||||
|
||||
int ggml_blck_size(enum ggml_type type) {
|
||||
GGML_CALL int ggml_blck_size(enum ggml_type type) {
|
||||
return type_traits[type].blck_size;
|
||||
}
|
||||
|
||||
size_t ggml_type_size(enum ggml_type type) {
|
||||
GGML_CALL size_t ggml_type_size(enum ggml_type type) {
|
||||
return type_traits[type].type_size;
|
||||
}
|
||||
|
||||
size_t ggml_row_size(enum ggml_type type, int64_t ne) {
|
||||
GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
|
||||
assert(ne % ggml_blck_size(type) == 0);
|
||||
return ggml_type_size(type)*ne/ggml_blck_size(type);
|
||||
}
|
||||
@@ -2042,15 +2042,15 @@ double ggml_type_sizef(enum ggml_type type) {
|
||||
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
|
||||
}
|
||||
|
||||
const char * ggml_type_name(enum ggml_type type) {
|
||||
GGML_CALL const char * ggml_type_name(enum ggml_type type) {
|
||||
return type_traits[type].type_name;
|
||||
}
|
||||
|
||||
bool ggml_is_quantized(enum ggml_type type) {
|
||||
GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
|
||||
return type_traits[type].is_quantized;
|
||||
}
|
||||
|
||||
const char * ggml_op_name(enum ggml_op op) {
|
||||
GGML_CALL const char * ggml_op_name(enum ggml_op op) {
|
||||
return GGML_OP_NAME[op];
|
||||
}
|
||||
|
||||
@@ -2062,7 +2062,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
|
||||
return GGML_UNARY_OP_NAME[op];
|
||||
}
|
||||
|
||||
const char * ggml_op_desc(const struct ggml_tensor * t) {
|
||||
GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
|
||||
if (t->op == GGML_OP_UNARY) {
|
||||
enum ggml_unary_op uop = ggml_get_unary_op(t);
|
||||
return ggml_unary_op_name(uop);
|
||||
@@ -2072,7 +2072,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) {
|
||||
}
|
||||
}
|
||||
|
||||
size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
||||
GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
||||
return ggml_type_size(tensor->type);
|
||||
}
|
||||
|
||||
@@ -2154,11 +2154,11 @@ size_t ggml_tensor_overhead(void) {
|
||||
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
|
||||
}
|
||||
|
||||
bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
||||
GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
||||
return tensor->nb[0] > tensor->nb[1];
|
||||
}
|
||||
|
||||
bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
||||
GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
return
|
||||
@@ -2177,7 +2177,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
|
||||
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
||||
}
|
||||
|
||||
bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
||||
GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
|
||||
@@ -3079,7 +3079,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
|
||||
return (float *)(tensor->data);
|
||||
}
|
||||
|
||||
enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
|
||||
GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->op == GGML_OP_UNARY);
|
||||
return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
|
||||
}
|
||||
@@ -11653,7 +11653,7 @@ static void ggml_rope_cache_init(
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_rope_yarn_corr_dims(
|
||||
GGML_CALL void ggml_rope_yarn_corr_dims(
|
||||
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
||||
) {
|
||||
// start and end correction dims
|
||||
|
||||
@@ -187,6 +187,16 @@
|
||||
# define GGML_API
|
||||
#endif
|
||||
|
||||
#ifdef GGML_MULTIPLATFORM
|
||||
# if defined(_WIN32)
|
||||
# define GGML_CALL
|
||||
# else
|
||||
# define GGML_CALL __attribute__((__ms_abi__))
|
||||
# endif
|
||||
#else
|
||||
# define GGML_CALL
|
||||
#endif
|
||||
|
||||
// TODO: support for clang
|
||||
#ifdef __GNUC__
|
||||
# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
||||
@@ -649,41 +659,41 @@ extern "C" {
|
||||
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
||||
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
||||
|
||||
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
||||
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
||||
GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
||||
GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
||||
GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
||||
|
||||
GGML_API int ggml_blck_size(enum ggml_type type);
|
||||
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
||||
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
||||
GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
|
||||
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
||||
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
||||
|
||||
GGML_DEPRECATED(
|
||||
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
||||
"use ggml_row_size() instead");
|
||||
|
||||
GGML_API const char * ggml_type_name(enum ggml_type type);
|
||||
GGML_API const char * ggml_op_name (enum ggml_op op);
|
||||
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
||||
GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
|
||||
GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
|
||||
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
||||
|
||||
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
||||
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
||||
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
||||
GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
||||
|
||||
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||
GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
||||
GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
|
||||
|
||||
// TODO: temporary until model loading of ggml examples is refactored
|
||||
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
||||
|
||||
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
||||
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
||||
GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
||||
GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
||||
GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
||||
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
||||
|
||||
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
|
||||
@@ -770,7 +780,7 @@ extern "C" {
|
||||
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
||||
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
||||
GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
||||
@@ -1413,7 +1423,7 @@ extern "C" {
|
||||
float beta_slow);
|
||||
|
||||
// compute correction dims for YaRN RoPE scaling
|
||||
void ggml_rope_yarn_corr_dims(
|
||||
GGML_CALL void ggml_rope_yarn_corr_dims(
|
||||
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
||||
|
||||
// xPos RoPE, in-place, returns view(a)
|
||||
|
||||