mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-10 15:56:44 +02:00
Compare commits
30 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5ab6c2132a | |||
| afc09db51c | |||
| eb75395b5c | |||
| a7a6ceb7ae | |||
| 48edda30ee | |||
| 45eba9369f | |||
| acec9eaaa9 | |||
| e2583cbc29 | |||
| e8b8d32e86 | |||
| 8f3a642ec1 | |||
| 0745384449 | |||
| 019ba1dcd0 | |||
| beabc8cfb0 | |||
| 0d152b37fe | |||
| f8c90cdbaa | |||
| f93af02488 | |||
| f72f8f22c9 | |||
| 79f34abddb | |||
| 8186242b6d | |||
| ac2219fef3 | |||
| 48be797ffb | |||
| f56e1baec3 | |||
| 017efe899d | |||
| ff5a3f0c09 | |||
| 1c84003c08 | |||
| e78f0b0d05 | |||
| 665018c749 | |||
| 29a404a951 | |||
| 0fe321031a | |||
| 9476b01226 |
@@ -188,7 +188,7 @@ jobs:
|
||||
sysctl -a
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF ..
|
||||
cmake ..
|
||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
@@ -253,6 +253,29 @@ jobs:
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
|
||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
macOS-latest-swift:
|
||||
runs-on: macos-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
destination: ['platform=macOS,name=Any Mac', 'platform=iOS,name=Any iOS Device', 'platform=tvOS,name=Any tvOS Device']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
continue-on-error: true
|
||||
run: |
|
||||
brew update
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
xcodebuild -scheme llama -destination "${{ matrix.destination }}"
|
||||
|
||||
windows-latest-cmake:
|
||||
runs-on: windows-latest
|
||||
|
||||
@@ -265,17 +288,17 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'noavx'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx2'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx512'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'clblast'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -414,7 +437,7 @@ jobs:
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
|
||||
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
|
||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Determine tag name
|
||||
|
||||
+2
-1
@@ -91,4 +91,5 @@ tests/test-quantize-perf
|
||||
tests/test-sampling
|
||||
tests/test-tokenizer-0-llama
|
||||
tests/test-tokenizer-0-falcon
|
||||
tests/test-tokenizer-1
|
||||
tests/test-tokenizer-1-llama
|
||||
tests/test-tokenizer-1-bpe
|
||||
|
||||
+42
-23
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
|
||||
cmake_minimum_required(VERSION 3.13) # for add_link_options
|
||||
project("llama.cpp" C CXX)
|
||||
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
@@ -44,7 +44,7 @@ endif()
|
||||
|
||||
# general
|
||||
option(LLAMA_STATIC "llama: static link libraries" OFF)
|
||||
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
|
||||
option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
|
||||
option(LLAMA_LTO "llama: enable link time optimization" OFF)
|
||||
|
||||
# debug
|
||||
@@ -58,15 +58,21 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer"
|
||||
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
|
||||
|
||||
# instruction set specific
|
||||
option(LLAMA_AVX "llama: enable AVX" ON)
|
||||
option(LLAMA_AVX2 "llama: enable AVX2" ON)
|
||||
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
|
||||
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
|
||||
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
|
||||
option(LLAMA_FMA "llama: enable FMA" ON)
|
||||
if (LLAMA_NATIVE)
|
||||
set(INS_ENB OFF)
|
||||
else()
|
||||
set(INS_ENB ON)
|
||||
endif()
|
||||
|
||||
option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
|
||||
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
|
||||
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
|
||||
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
|
||||
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
|
||||
option(LLAMA_FMA "llama: enable FMA" ${INS_ENB})
|
||||
# in MSVC F16C is implied with AVX2/AVX512
|
||||
if (NOT MSVC)
|
||||
option(LLAMA_F16C "llama: enable F16C" ON)
|
||||
option(LLAMA_F16C "llama: enable F16C" ${INS_ENB})
|
||||
endif()
|
||||
|
||||
# 3rd party libs
|
||||
@@ -343,8 +349,9 @@ if (LLAMA_MPI)
|
||||
set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
|
||||
add_compile_definitions(GGML_USE_MPI)
|
||||
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
|
||||
set(cxx_flags ${cxx_flags} -Wno-cast-qual)
|
||||
set(c_flags ${c_flags} -Wno-cast-qual)
|
||||
if (NOT MSVC)
|
||||
add_compile_options(-Wno-cast-qual)
|
||||
endif()
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
|
||||
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
|
||||
# Even if you're only using the C header, C++ programs may bring in MPI
|
||||
@@ -418,10 +425,11 @@ if (LLAMA_ALL_WARNINGS)
|
||||
set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int
|
||||
-Werror=implicit-function-declaration)
|
||||
set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
|
||||
set(host_cxx_flags "")
|
||||
|
||||
if (CMAKE_C_COMPILER_ID MATCHES "Clang")
|
||||
set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
|
||||
set(cxx_flags ${cxx_flags} -Wmissing-prototypes -Wextra-semi)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)
|
||||
|
||||
if (
|
||||
(CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
|
||||
@@ -431,27 +439,38 @@ if (LLAMA_ALL_WARNINGS)
|
||||
endif()
|
||||
elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
|
||||
set(c_flags ${c_flags} -Wdouble-promotion)
|
||||
set(cxx_flags ${cxx_flags} -Wno-array-bounds)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)
|
||||
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
|
||||
set(cxx_flags ${cxx_flags} -Wno-format-truncation)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
|
||||
endif()
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
|
||||
set(cxx_flags ${cxx_flags} -Wextra-semi)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
# todo : msvc
|
||||
endif()
|
||||
|
||||
add_compile_options(
|
||||
${warning_flags}
|
||||
"$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
|
||||
"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
|
||||
)
|
||||
set(c_flags ${c_flags} ${warning_flags})
|
||||
set(cxx_flags ${cxx_flags} ${warning_flags})
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
|
||||
"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags} ${host_cxx_flags}>")
|
||||
|
||||
endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
set(cuda_flags -Wno-pedantic)
|
||||
endif()
|
||||
set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
|
||||
|
||||
list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument
|
||||
if (NOT cuda_host_flags STREQUAL "")
|
||||
set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
|
||||
endif()
|
||||
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
|
||||
|
||||
if (WIN32)
|
||||
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
|
||||
|
||||
@@ -491,9 +510,6 @@ if (NOT MSVC)
|
||||
if (LLAMA_GPROF)
|
||||
add_compile_options(-pg)
|
||||
endif()
|
||||
if (LLAMA_NATIVE)
|
||||
add_compile_options(-march=native)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
|
||||
@@ -548,6 +564,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
|
||||
endif()
|
||||
else()
|
||||
if (LLAMA_NATIVE)
|
||||
add_compile_options(-march=native)
|
||||
endif()
|
||||
if (LLAMA_F16C)
|
||||
add_compile_options(-mf16c)
|
||||
endif()
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
|
||||
|
||||
# Binaries only useful for tests
|
||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
|
||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
|
||||
|
||||
# Code coverage output files
|
||||
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
||||
@@ -62,9 +62,11 @@ test: $(TEST_TARGETS)
|
||||
if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
|
||||
continue; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
|
||||
continue; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
|
||||
continue; \
|
||||
else \
|
||||
echo "Running test $$test_target..."; \
|
||||
./$$test_target; \
|
||||
@@ -670,6 +672,9 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
|
||||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
|
||||
+6
-3
@@ -44,9 +44,12 @@ let package = Package(
|
||||
cSettings: [
|
||||
.unsafeFlags(["-Wno-shorten-64-to-32"]),
|
||||
.define("GGML_USE_K_QUANTS"),
|
||||
.define("GGML_USE_ACCELERATE"),
|
||||
.define("ACCELERATE_NEW_LAPACK"),
|
||||
.define("ACCELERATE_LAPACK_ILP64")
|
||||
.define("GGML_USE_ACCELERATE")
|
||||
// NOTE: NEW_LAPACK will required iOS version 16.4+
|
||||
// We should consider add this in the future when we drop support for iOS 14
|
||||
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
|
||||
// .define("ACCELERATE_NEW_LAPACK"),
|
||||
// .define("ACCELERATE_LAPACK_ILP64")
|
||||
] + additionalSettings,
|
||||
linkerSettings: [
|
||||
.linkedFramework("Accelerate")
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
[](https://github.com/ggerganov/llama.cpp/actions)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
|
||||
|
||||
Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
|
||||
+6
-2
@@ -361,7 +361,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back({argv[i], 1.0f});
|
||||
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-scaled") {
|
||||
if (++i >= argc) {
|
||||
@@ -373,7 +373,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
|
||||
params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-base") {
|
||||
if (++i >= argc) {
|
||||
@@ -616,6 +616,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
process_escapes(params.prompt);
|
||||
process_escapes(params.input_prefix);
|
||||
process_escapes(params.input_suffix);
|
||||
for (auto & antiprompt : params.antiprompt) {
|
||||
process_escapes(antiprompt);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -923,6 +926,7 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
|
||||
result += piece;
|
||||
}
|
||||
|
||||
// NOTE: the original tokenizer decodes bytes after collecting the pieces.
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@@ -11,11 +11,14 @@ import sys
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
import itertools
|
||||
import gguf
|
||||
import numpy as np
|
||||
import torch
|
||||
from sentencepiece import SentencePieceProcessor # type: ignore[import]
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import TypeAlias
|
||||
@@ -174,8 +177,11 @@ if not tokenizer_model_file.is_file():
|
||||
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
|
||||
|
||||
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
|
||||
vocab_size = hparams.get('vocab_size')
|
||||
if vocab_size is None:
|
||||
vocab_size = tokenizer.vocab_size()
|
||||
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
for i in range(vocab_size):
|
||||
text: bytes
|
||||
score: float
|
||||
|
||||
|
||||
+78
-103
@@ -4,6 +4,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import contextlib
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
@@ -20,32 +21,10 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
import gguf
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
def count_model_parts(dir_model: Path, prefix: str) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
if filename.startswith(prefix):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
@@ -99,20 +78,26 @@ print("gguf: loading model "+dir_model.name)
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "RWForCausalLM":
|
||||
if hparams["architectures"][0] != "FalconForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
num_parts = count_model_parts(dir_model, "model-00")
|
||||
if num_parts:
|
||||
is_safetensors = True
|
||||
from safetensors import safe_open
|
||||
else:
|
||||
is_safetensors = False
|
||||
num_parts = count_model_parts(dir_model, "pytorch_model-")
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.FALCON
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["n_layer"]
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
gguf_writer.add_name("Falcon")
|
||||
gguf_writer.add_context_length(2048) # not in config.json
|
||||
@@ -120,9 +105,9 @@ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_head_count(hparams["n_head"])
|
||||
if "n_head_kv" in hparams:
|
||||
gguf_writer.add_head_count_kv(hparams["n_head_kv"])
|
||||
gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||
if "num_kv_heads" in hparams:
|
||||
gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
|
||||
else:
|
||||
gguf_writer.add_head_count_kv(1)
|
||||
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
||||
@@ -133,50 +118,32 @@ gguf_writer.add_file_type(ftype)
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytearray] = []
|
||||
|
||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
tokens.append(reverse_vocab[i])
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
@@ -186,8 +153,8 @@ special_vocab.add_to_gguf(gguf_writer)
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# params for qkv transform
|
||||
n_head = hparams["n_head"]
|
||||
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
|
||||
n_head = hparams["num_attention_heads"]
|
||||
n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1
|
||||
|
||||
head_dim = hparams["hidden_size"] // n_head
|
||||
|
||||
@@ -196,6 +163,10 @@ print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
elif is_safetensors:
|
||||
part_names = (
|
||||
f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
|
||||
)
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
@@ -205,60 +176,64 @@ for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
||||
if is_safetensors:
|
||||
ctx = safe_open(dir_model / part_name, framework="pt", device="cpu")
|
||||
else:
|
||||
ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu"))
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
with ctx as model_part:
|
||||
for name in model_part.keys():
|
||||
data = model_part.get_tensor(name) if is_safetensors else model_part[name]
|
||||
|
||||
old_dtype = data.dtype
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
# QKV tensor transform
|
||||
# The original query_key_value tensor contains n_head_kv "kv groups",
|
||||
# each consisting of n_head/n_head_kv query weights followed by one key
|
||||
# and one value weight (shared by all query heads in the kv group).
|
||||
# This layout makes it a big pain to work with in GGML.
|
||||
# So we rearrange them here,, so that we have n_head query weights
|
||||
# followed by n_head_kv key weights followed by n_head_kv value weights,
|
||||
# in contiguous fashion.
|
||||
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
|
||||
# QKV tensor transform
|
||||
# The original query_key_value tensor contains n_head_kv "kv groups",
|
||||
# each consisting of n_head/n_head_kv query weights followed by one key
|
||||
# and one value weight (shared by all query heads in the kv group).
|
||||
# This layout makes it a big pain to work with in GGML.
|
||||
# So we rearrange them here,, so that we have n_head query weights
|
||||
# followed by n_head_kv key weights followed by n_head_kv value weights,
|
||||
# in contiguous fashion.
|
||||
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
|
||||
|
||||
if "query_key_value" in name:
|
||||
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
|
||||
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
|
||||
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
|
||||
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
|
||||
data = torch.cat((q,k,v)).reshape_as(data)
|
||||
if "query_key_value" in name:
|
||||
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
|
||||
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
|
||||
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
|
||||
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
|
||||
data = torch.cat((q,k,v)).reshape_as(data)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
|
||||
@@ -19,29 +19,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
@@ -130,48 +107,32 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytearray] = []
|
||||
|
||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
vocab_size = len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
Executable
+318
@@ -0,0 +1,318 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF refact--> gguf conversion
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer # type: ignore[import]
|
||||
|
||||
if "NO_LOCAL_GGUF" not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
|
||||
import gguf
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = (
|
||||
list(range(ord("!"), ord("~") + 1))
|
||||
+ list(range(ord("¡"), ord("¬") + 1))
|
||||
+ list(range(ord("®"), ord("ÿ") + 1))
|
||||
)
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8 + n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
return num_parts
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Convert a Refact model to a GGML compatible file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--vocab-only",
|
||||
action="store_true",
|
||||
help="extract only the vocab",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outfile",
|
||||
type=Path,
|
||||
help="path to write to; default: based on input",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model",
|
||||
type=Path,
|
||||
help="directory containing model file, or model file itself (*.bin)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"ftype",
|
||||
type=int,
|
||||
choices=[0, 1],
|
||||
default=1,
|
||||
nargs="?",
|
||||
help="output format - use 0 for float32, 1 for float16",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f"Error: {args.model} is not a directory", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"
|
||||
|
||||
print("gguf: loading model " + dir_model.name)
|
||||
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "GPTRefactForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
ARCH = gguf.MODEL_ARCH.REFACT
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
# Get refact feed forward dimension
|
||||
hidden_dim = hparams["n_embd"]
|
||||
inner_dim = 4 * hidden_dim
|
||||
hidden_dim = int(2 * inner_dim / 3)
|
||||
multiple_of = 256
|
||||
ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
|
||||
|
||||
block_count = hparams["n_layer"]
|
||||
|
||||
gguf_writer.add_name("Refact")
|
||||
# refact uses Alibi. So this is from config.json which might be used by training.
|
||||
gguf_writer.add_context_length(hparams["n_positions"])
|
||||
gguf_writer.add_embedding_length(hparams["n_embd"])
|
||||
|
||||
gguf_writer.add_feed_forward_length(ff_dim)
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_head_count(hparams["n_head"])
|
||||
gguf_writer.add_head_count_kv(1)
|
||||
gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"])
|
||||
gguf_writer.add_file_type(ftype)
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_json_file = dir_model / "tokenizer.json"
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f"Error: Missing {tokenizer_json_file}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = (
|
||||
hparams["vocab_size"]
|
||||
if "vocab_size" in hparams
|
||||
else len(tokenizer_json["model"]["vocab"])
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
text = reverse_vocab[i]
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode("utf-8"))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dymmy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
|
||||
|
||||
# params for qkv transform
|
||||
n_head = hparams["n_head"]
|
||||
n_head_kv = 1
|
||||
|
||||
head_dim = hparams["n_embd"] // n_head
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
||||
|
||||
for i in range(block_count):
|
||||
if f"transformer.h.{i}.attn.kv.weight" in model_part:
|
||||
data = model_part[f"transformer.h.{i}.attn.kv.weight"]
|
||||
model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[
|
||||
: n_head_kv * head_dim
|
||||
]
|
||||
model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[
|
||||
n_head_kv * head_dim :
|
||||
]
|
||||
del model_part[f"transformer.h.{i}.attn.kv.weight"]
|
||||
if f"transformer.h.{i}.attn.q.weight" in model_part:
|
||||
model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[
|
||||
f"transformer.h.{i}.attn.q.weight"
|
||||
]
|
||||
del model_part[f"transformer.h.{i}.attn.q.weight"]
|
||||
if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part:
|
||||
data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
|
||||
model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim]
|
||||
model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:]
|
||||
del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if (
|
||||
ftype == 1
|
||||
and data_dtype == np.float32
|
||||
and name.endswith(".weight")
|
||||
and n_dims == 2
|
||||
):
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(
|
||||
new_name
|
||||
+ ", n_dims = "
|
||||
+ str(n_dims)
|
||||
+ ", "
|
||||
+ str(old_dtype)
|
||||
+ " --> "
|
||||
+ str(data.dtype)
|
||||
)
|
||||
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("")
|
||||
@@ -20,28 +20,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
import gguf
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
@@ -117,50 +95,32 @@ gguf_writer.add_file_type(ftype)
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytearray] = []
|
||||
|
||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
+7
-22
@@ -41,8 +41,7 @@ if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
|
||||
|
||||
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.LLAMA
|
||||
NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
|
||||
ARCH = gguf.MODEL_ARCH.LLAMA
|
||||
|
||||
DEFAULT_CONCURRENCY = 8
|
||||
#
|
||||
@@ -339,29 +338,15 @@ class BpeVocab:
|
||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
tokenizer = self.bpe_tokenizer
|
||||
from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import]
|
||||
byte_encoder = tokenization_gpt2.bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
score = 0.0
|
||||
for i, item in enumerate(tokenizer):
|
||||
text: bytes = item.encode("utf-8")
|
||||
# FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
|
||||
if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
|
||||
if i == 0 and text == b'<unk>':
|
||||
toktype = gguf.TokenType.UNKNOWN
|
||||
elif i == 1 or i == 2:
|
||||
toktype = gguf.TokenType.CONTROL
|
||||
elif i >= 3 and text.startswith(b'<0x'):
|
||||
toktype = gguf.TokenType.BYTE
|
||||
else:
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
else:
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
yield text, score, toktype
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
|
||||
|
||||
for i, _ in enumerate(tokenizer):
|
||||
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
|
||||
|
||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
for text in self.added_tokens_list:
|
||||
score = -1000.0
|
||||
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
|
||||
yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
|
||||
|
||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
yield from self.bpe_tokens()
|
||||
@@ -953,7 +938,7 @@ class OutputFile:
|
||||
of.close()
|
||||
|
||||
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
|
||||
wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
|
||||
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
|
||||
|
||||
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
||||
return GGMLFileType.AllF32
|
||||
|
||||
@@ -35,6 +35,7 @@ else()
|
||||
endif()
|
||||
if (LLAMA_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
add_subdirectory(server-parallel)
|
||||
endif()
|
||||
add_subdirectory(export-lora)
|
||||
endif()
|
||||
|
||||
@@ -9,7 +9,7 @@ if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
|
||||
MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}"
|
||||
PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
|
||||
USER_NAME="${USER_NAME:-User}"
|
||||
AI_NAME="${AI_NAME:-ChatLLaMa}"
|
||||
@@ -61,9 +61,9 @@ fi
|
||||
|
||||
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
|
||||
echo 'Prompt cache does not exist, building...'
|
||||
# Default batch_size to 8 here for better user feedback during initial prompt processing
|
||||
# Default batch_size to 64 here for better user feedback during initial prompt processing
|
||||
./main 2>>"$LOG" \
|
||||
--batch_size 8 \
|
||||
--batch_size 64 \
|
||||
"${OPTS[@]}" \
|
||||
--prompt-cache "$PROMPT_CACHE_FILE" \
|
||||
--file "$CUR_PROMPT_FILE" \
|
||||
@@ -132,7 +132,7 @@ while read -e line; do
|
||||
# HACK get num tokens from debug message
|
||||
# TODO get both messages in one go
|
||||
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
|
||||
! sample_time_msg="$( tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
|
||||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
|
||||
echo >&2 "Couldn't get number of tokens from ./main output!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L
|
||||
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
|
||||
```
|
||||
|
||||
The scale numbers don't need to add up to one, and you can also use numbers creater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.
|
||||
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.
|
||||
|
||||
Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
|
||||
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
|
||||
|
||||
@@ -313,7 +313,7 @@ class ModelParams:
|
||||
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
||||
|
||||
def tensor_name(key, bid=None, suffix=".weight"):
|
||||
return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + suffix
|
||||
return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
|
||||
|
||||
class Layer:
|
||||
def __init__(self, params, lora_params, bid):
|
||||
|
||||
@@ -543,6 +543,9 @@ int main(int argc, char ** argv) {
|
||||
if (i > 0) {
|
||||
embd.erase(embd.begin(), embd.begin() + i);
|
||||
}
|
||||
|
||||
// remove any "future" tokens that we might have inherited from the session from the KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, n_past, -1);
|
||||
}
|
||||
|
||||
// evaluate tokens in batches
|
||||
@@ -667,7 +670,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
// reset color to default if we there is no pending user input
|
||||
// reset color to default if there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
}
|
||||
@@ -694,10 +697,8 @@ int main(int argc, char ** argv) {
|
||||
if (last_output.find(antiprompt, search_start_pos) != std::string::npos) {
|
||||
if (params.interactive) {
|
||||
is_interacting = true;
|
||||
console::set_display(console::user_input);
|
||||
}
|
||||
is_antiprompt = true;
|
||||
fflush(stdout);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -721,8 +722,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
is_interacting = true;
|
||||
printf("\n");
|
||||
console::set_display(console::user_input);
|
||||
fflush(stdout);
|
||||
} else if (params.instruct) {
|
||||
is_interacting = true;
|
||||
}
|
||||
@@ -747,6 +746,9 @@ int main(int argc, char ** argv) {
|
||||
printf("%s", buffer.c_str());
|
||||
}
|
||||
|
||||
// color user input only
|
||||
console::set_display(console::user_input);
|
||||
|
||||
std::string line;
|
||||
bool another_line = true;
|
||||
do {
|
||||
|
||||
@@ -332,7 +332,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
|
||||
llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, n_ctx);
|
||||
llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1);
|
||||
|
||||
const auto t_main_end = ggml_time_us();
|
||||
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
set(TARGET server-parallel)
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
add_executable(${TARGET} server.cpp ../server/json.hpp ../server/httplib.h)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_compile_definitions(${TARGET} PRIVATE
|
||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||
)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
if (WIN32)
|
||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||
endif()
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
@@ -0,0 +1,66 @@
|
||||
# llama.cpp/example/server-parallel
|
||||
|
||||
This example demonstrates a PoC HTTP API server that handles simulataneus requests. Long prompts are not supported.
|
||||
|
||||
## Quick Start
|
||||
|
||||
To get started right away, run the following command, making sure to use the correct path for the model you have:
|
||||
|
||||
### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./server-parallel -m models/7B/ggml-model.gguf --ctx_size 2048 -t 4 -ngl 33 --batch-size 512 --parallel 3 -n 512 --cont-batching
|
||||
```
|
||||
|
||||
### Windows:
|
||||
|
||||
```powershell
|
||||
server-parallel.exe -m models\7B\ggml-model.gguf --ctx_size 2048 -t 4 -ngl 33 --batch-size 512 --parallel 3 -n 512 --cont-batching
|
||||
```
|
||||
The above command will start a server that by default listens on `127.0.0.1:8080`.
|
||||
|
||||
## API Endpoints
|
||||
|
||||
- **GET** `/props`: Return the user and assistant name for generate the prompt.
|
||||
|
||||
*Response:*
|
||||
```json
|
||||
{
|
||||
"user_name": "User:",
|
||||
"assistant_name": "Assistant:"
|
||||
}
|
||||
```
|
||||
|
||||
- **POST** `/completion`: Given a prompt, it returns the predicted completion, just streaming mode.
|
||||
|
||||
*Options:*
|
||||
|
||||
`temperature`: Adjust the randomness of the generated text (default: 0.1).
|
||||
|
||||
`prompt`: Provide a prompt as a string, It should be a coherent continuation of the system prompt.
|
||||
|
||||
`system_prompt`: Provide a system prompt as a string.
|
||||
|
||||
`anti_prompt`: Provide the name of the user coherent with the system prompt.
|
||||
|
||||
`assistant_name`: Provide the name of the assistant coherent with the system prompt.
|
||||
|
||||
*Example request:*
|
||||
```json
|
||||
{
|
||||
"system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nHuman: Hello\nAssistant: Hi, how may I help you?\nHuman:",
|
||||
"anti_prompt": "Human:",
|
||||
"assistant_name": "Assistant:",
|
||||
"prompt": "When is the day of independency of US?",
|
||||
"temperature": 0.2
|
||||
}
|
||||
```
|
||||
|
||||
*Response:*
|
||||
```json
|
||||
{
|
||||
"content": "<token_str>"
|
||||
}
|
||||
```
|
||||
|
||||
# This example is a Proof of Concept, have some bugs and unexpected behaivors, this not supports long prompts.
|
||||
@@ -0,0 +1,263 @@
|
||||
const char* system_prompt_default =
|
||||
R"(Transcript of a never ending dialog, where the User interacts with an Assistant.
|
||||
The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
|
||||
User: Recommend a nice restaurant in the area.
|
||||
Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
|
||||
User: Who is Richard Feynman?
|
||||
Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
|
||||
User:)";
|
||||
|
||||
const char* index_html_ = R"(
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>llama.cpp - server parallel PoC</title>
|
||||
<script src="index.js"></script>
|
||||
</head>
|
||||
<body>
|
||||
<div style="width: 90%;margin: auto;">
|
||||
<h2>Server parallel - PoC</h2>
|
||||
<form id="myForm">
|
||||
<input type="checkbox" id="system_promt_cb" name="myCheckbox" onchange="toggleSP() ">
|
||||
<label for="system_promt_cb">Use custom system prompt</label>
|
||||
<br>
|
||||
<div id="system_prompt_view" style="display: none;">
|
||||
<textarea id="sp_text" name="systemPrompt" style="width: 100%;height: 4rem;" placeholder="System Prompt"></textarea>
|
||||
<label for="user_name">User name</label>
|
||||
<input type="text" id="user_name" value="" placeholder="Anti prompt" required>
|
||||
<label for="assistant_name">Assistant name</label>
|
||||
<input type="text" id="assistant_name" value="" placeholder="Assistant:" required>
|
||||
<button type="button" id="btn_reset" onclick="clearSP() " >Clear all</button>
|
||||
</div>
|
||||
<br>
|
||||
<label for="slot_id">Slot ID (-1 load in a idle slot)</label>
|
||||
<input type="number" id="slot_id" value="-1" required>
|
||||
<br>
|
||||
<label for="temperature">Temperature</label>
|
||||
<input type="number" id="temperature" value="0.1" required>
|
||||
<br>
|
||||
<label for="message">Message</label>
|
||||
<input id="message" style="width: 80%;" required>
|
||||
<br><br>
|
||||
<button type="button" id="btn_send" onclick="perform() " >Send</button>
|
||||
<br>
|
||||
<br>
|
||||
<button type="button" id="btn_reset" onclick="resetBtn() " >Reset</button>
|
||||
</form>
|
||||
<div id="conversation_view">
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
)";
|
||||
|
||||
const char* index_js_ = R"(
|
||||
let conversation = [];
|
||||
let current_message = -1;
|
||||
|
||||
const questions = [
|
||||
"Who is Elon Musk?",
|
||||
"Who is Jeff Bezos?",
|
||||
"How to get a job at google?",
|
||||
"What are you?",
|
||||
"When was born Abraham Lincoln?",
|
||||
];
|
||||
|
||||
let user_name = "";
|
||||
let assistant_name = "";
|
||||
|
||||
function toggleSP() {
|
||||
if(document.getElementById("system_promt_cb").checked) {
|
||||
document.getElementById("system_prompt_view").style.display = "block";
|
||||
} else {
|
||||
document.getElementById("system_prompt_view").style.display = "none";
|
||||
}
|
||||
}
|
||||
|
||||
function clearSP() {
|
||||
document.getElementById("sp_text").value = "";
|
||||
document.getElementById("anti_prompt").value = "";
|
||||
document.getElementById("assistant_name").value = "";
|
||||
}
|
||||
|
||||
docReady(async () => {
|
||||
document.getElementById("message").value =
|
||||
questions[Math.floor(Math.random() * questions.length)];
|
||||
|
||||
// to keep the same prompt format in all clients
|
||||
const response = await fetch("/props");
|
||||
if (!response.ok) {
|
||||
alert(`HTTP error! Status: ${response.status}`);
|
||||
}
|
||||
const data = await response.json();
|
||||
user_name = data.user_name;
|
||||
assistant_name = data.assistant_name;
|
||||
});
|
||||
|
||||
function docReady(fn) {
|
||||
// see if DOM is already available
|
||||
if (
|
||||
document.readyState === "complete" ||
|
||||
document.readyState === "interactive"
|
||||
) {
|
||||
// call on next available tick
|
||||
setTimeout(fn, 1);
|
||||
} else {
|
||||
document.addEventListener("DOMContentLoaded", fn);
|
||||
}
|
||||
}
|
||||
|
||||
function updateView() {
|
||||
let conv_view = document.getElementById("conversation_view");
|
||||
// build view
|
||||
conv_view.innerHTML = "";
|
||||
for (let index in conversation) {
|
||||
conversation[index].assistant = conversation[index].assistant.replace(
|
||||
user_name,
|
||||
""
|
||||
);
|
||||
conv_view.innerHTML += `
|
||||
<p><span style="font-weight: bold">User:</span> ${conversation[index].user}<p>
|
||||
<p style="white-space: pre-line;"><span style="font-weight: bold">Assistant:</span> ${conversation[index].assistant}<p>`;
|
||||
}
|
||||
}
|
||||
|
||||
async function call_llama(options) {
|
||||
const response = await fetch("/completion", {
|
||||
method: "POST",
|
||||
body: JSON.stringify(options),
|
||||
headers: {
|
||||
Connection: "keep-alive",
|
||||
"Content-Type": "application/json",
|
||||
Accept: "text/event-stream",
|
||||
},
|
||||
});
|
||||
|
||||
const reader = response.body.getReader();
|
||||
let cont = true;
|
||||
const decoder = new TextDecoder();
|
||||
let leftover = ""; // Buffer for partially read lines
|
||||
|
||||
try {
|
||||
let cont = true;
|
||||
|
||||
while (cont) {
|
||||
const result = await reader.read();
|
||||
if (result.done) {
|
||||
document.getElementById("btn_send").disabled = false;
|
||||
break;
|
||||
}
|
||||
|
||||
// Add any leftover data to the current chunk of data
|
||||
const text = leftover + decoder.decode(result.value);
|
||||
|
||||
// Check if the last character is a line break
|
||||
const endsWithLineBreak = text.endsWith("\n");
|
||||
|
||||
// Split the text into lines
|
||||
let lines = text.split("\n");
|
||||
|
||||
// If the text doesn't end with a line break, then the last line is incomplete
|
||||
// Store it in leftover to be added to the next chunk of data
|
||||
if (!endsWithLineBreak) {
|
||||
leftover = lines.pop();
|
||||
} else {
|
||||
leftover = ""; // Reset leftover if we have a line break at the end
|
||||
}
|
||||
|
||||
// Parse all sse events and add them to result
|
||||
const regex = /^(\S+):\s(.*)$/gm;
|
||||
for (const line of lines) {
|
||||
const match = regex.exec(line);
|
||||
if (match) {
|
||||
result[match[1]] = match[2];
|
||||
// since we know this is llama.cpp, let's just decode the json in data
|
||||
if (result.data) {
|
||||
result.data = JSON.parse(result.data);
|
||||
conversation[current_message].assistant += result.data.content;
|
||||
updateView();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
if (e.name !== "AbortError") {
|
||||
console.error("llama error: ", e);
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
function generatePrompt() {
|
||||
// generate a good prompt to have coherence
|
||||
let prompt = "";
|
||||
for (let index in conversation) {
|
||||
if (index == 0) {
|
||||
prompt += conversation[index].user + "\n";
|
||||
} else {
|
||||
prompt += user_name + conversation[index].user + "\n";
|
||||
}
|
||||
if (index == current_message) {
|
||||
prompt += assistant_name;
|
||||
} else {
|
||||
prompt += assistant_name + conversation[index].assistant;
|
||||
}
|
||||
}
|
||||
return prompt;
|
||||
}
|
||||
|
||||
function resetBtn() {
|
||||
document.getElementById("slot_id").value = "-1";
|
||||
document.getElementById("temperature").value = "0.1";
|
||||
document.getElementById("message").value =
|
||||
questions[Math.floor(Math.random() * questions.length)];
|
||||
document.getElementById("conversation_view").innerHTML = "";
|
||||
conversation = [];
|
||||
current_message = -1;
|
||||
}
|
||||
|
||||
async function perform() {
|
||||
var slot_id = parseInt(document.getElementById("slot_id").value);
|
||||
var temperature = parseFloat(document.getElementById("temperature").value);
|
||||
var prompt = " " + document.getElementById("message").value;
|
||||
if (!isNaN(slot_id) && !isNaN(temperature) && prompt.length > 0) {
|
||||
let options = {
|
||||
slot_id,
|
||||
temperature
|
||||
};
|
||||
if(document.getElementById("system_promt_cb").checked) {
|
||||
let system_prompt = document.getElementById("sp_text").value;
|
||||
let anti_prompt = document.getElementById("user_name").value;
|
||||
let assistant_name_ = document.getElementById("assistant_name").value;
|
||||
if(!system_prompt || !anti_prompt || !assistant_name_) {
|
||||
document.getElementById("conversation_view").innerText =
|
||||
"please, insert valid props.";
|
||||
return;
|
||||
}
|
||||
conversation = [];
|
||||
current_message = -1;
|
||||
document.getElementById("system_promt_cb").checked = false;
|
||||
document.getElementById("system_promt_cb").dispatchEvent(new Event("change"));
|
||||
options.system_prompt = system_prompt;
|
||||
options.anti_prompt = anti_prompt;
|
||||
options.assistant_name = assistant_name_;
|
||||
user_name = anti_prompt;
|
||||
assistant_name = assistant_name_;
|
||||
}
|
||||
current_message++;
|
||||
conversation.push({
|
||||
user: prompt,
|
||||
assistant: "",
|
||||
});
|
||||
updateView();
|
||||
document.getElementById("message").value = "";
|
||||
document.getElementById("btn_send").disabled = true;
|
||||
options.prompt = generatePrompt();
|
||||
await call_llama(options);
|
||||
} else {
|
||||
document.getElementById("conversation_view").innerText =
|
||||
"please, insert valid props.";
|
||||
}
|
||||
}
|
||||
|
||||
)";
|
||||
@@ -0,0 +1,884 @@
|
||||
#include "frontend.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include "../server/httplib.h"
|
||||
#include "../server/json.hpp"
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <chrono>
|
||||
|
||||
using namespace httplib;
|
||||
using namespace std;
|
||||
using namespace nlohmann;
|
||||
|
||||
struct server_params
|
||||
{
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = "examples/server/public";
|
||||
int32_t port = 8080;
|
||||
int32_t read_timeout = 600;
|
||||
int32_t write_timeout = 600;
|
||||
};
|
||||
|
||||
// utils functions taken of examples/server
|
||||
|
||||
static bool ends_with(const std::string &str, const std::string &suffix)
|
||||
{
|
||||
return str.size() >= suffix.size() &&
|
||||
0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
|
||||
}
|
||||
|
||||
static size_t find_partial_stop_string(const std::string &stop,
|
||||
const std::string &text)
|
||||
{
|
||||
if (!text.empty() && !stop.empty())
|
||||
{
|
||||
const char text_last_char = text.back();
|
||||
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
|
||||
{
|
||||
if (stop[char_index] == text_last_char)
|
||||
{
|
||||
const std::string current_partial = stop.substr(0, char_index + 1);
|
||||
if (ends_with(text, current_partial))
|
||||
{
|
||||
return text.size() - char_index - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return std::string::npos;
|
||||
}
|
||||
|
||||
enum stop_type
|
||||
{
|
||||
STOP_FULL,
|
||||
STOP_PARTIAL,
|
||||
};
|
||||
|
||||
enum slot_state
|
||||
{
|
||||
IDLE,
|
||||
PROCESSING
|
||||
};
|
||||
|
||||
enum slot_command {
|
||||
NONE,
|
||||
LOAD_PROMPT,
|
||||
RELEASE
|
||||
};
|
||||
|
||||
|
||||
struct llama_client_slot
|
||||
{
|
||||
int id;
|
||||
int32_t n_prompt = 0;
|
||||
int32_t n_decoded = 0;
|
||||
int32_t i_batch = -1;
|
||||
string prompt = "";
|
||||
string sampled_token_str;
|
||||
string generated_text = "";
|
||||
llama_token sampled;
|
||||
std::vector<llama_token> tokens_prev;
|
||||
slot_state state = IDLE;
|
||||
slot_command command = NONE;
|
||||
bool newToken = false;
|
||||
float temperature = 0.1f;
|
||||
|
||||
void start(string prompt_, float temp_) {
|
||||
prompt = prompt_;
|
||||
command = LOAD_PROMPT;
|
||||
temperature = temp_;
|
||||
newToken = false;
|
||||
}
|
||||
|
||||
bool hasNewToken() {
|
||||
if(newToken) {
|
||||
newToken = false;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool available() {
|
||||
return state == IDLE && command == NONE;
|
||||
}
|
||||
|
||||
void nofity() {
|
||||
newToken = !newToken;
|
||||
}
|
||||
|
||||
void release() {
|
||||
if(state == PROCESSING) {
|
||||
command = RELEASE;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct server_parallel_context {
|
||||
// example props
|
||||
vector<llama_client_slot> slots;
|
||||
std::string system_prompt = "";
|
||||
bool update_system_prompt = true;
|
||||
|
||||
// broadcast to all clients to keep the same prompt format
|
||||
std::string user_name = ""; // this should be the anti prompt
|
||||
std::string assistant_name = ""; // this is for generate the prompt
|
||||
|
||||
// llama native props
|
||||
gpt_params params;
|
||||
llama_model *model = NULL;
|
||||
llama_context *ctx = NULL;
|
||||
int n_ctx;
|
||||
int n_vocab;
|
||||
std::vector<llama_token_data> candidates;
|
||||
std::vector<llama_token> tokens_system;
|
||||
int32_t n_tokens_system = 0;
|
||||
llama_batch batch;
|
||||
|
||||
bool loadModel(gpt_params params_) {
|
||||
params = params_;
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (model == nullptr)
|
||||
{
|
||||
LOG_TEE("unable to load model: %s", params.model.c_str());
|
||||
return false;
|
||||
}
|
||||
n_ctx = llama_n_ctx(ctx);
|
||||
n_vocab = llama_n_vocab(model);
|
||||
candidates.reserve(n_vocab);
|
||||
return true;
|
||||
}
|
||||
|
||||
void initialize() {
|
||||
// create slots
|
||||
LOG_TEE("Available slots:\n");
|
||||
for (int i = 0; i < params.n_parallel; i++)
|
||||
{
|
||||
llama_client_slot slot;
|
||||
slot.id = i;
|
||||
slot.prompt = "default";
|
||||
slot.state = IDLE;
|
||||
slot.tokens_prev.resize(std::max(256, params.n_predict));
|
||||
std::fill(slot.tokens_prev.begin(), slot.tokens_prev.end(), 0);
|
||||
LOG_TEE(" - slot %i\n", slot.id);
|
||||
slots.push_back(slot);
|
||||
}
|
||||
batch = llama_batch_init(params.n_ctx, 0);
|
||||
|
||||
// always assign a default system prompt
|
||||
system_prompt = system_prompt_default;
|
||||
user_name = "User:";
|
||||
assistant_name = "Assistant:";
|
||||
params.antiprompt.push_back(user_name);
|
||||
}
|
||||
|
||||
void updateSystemPrompt() {
|
||||
tokens_system = ::llama_tokenize(ctx, system_prompt, true);
|
||||
n_tokens_system = tokens_system.size();
|
||||
|
||||
batch.n_tokens = n_tokens_system;
|
||||
|
||||
// clear the entire KV cache
|
||||
for (int i = 0; i < params.n_parallel; ++i)
|
||||
{
|
||||
llama_kv_cache_seq_rm(ctx, i, 0, -1);
|
||||
}
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i)
|
||||
{
|
||||
batch.token[i] = tokens_system[i];
|
||||
batch.pos[i] = i;
|
||||
batch.seq_id[i] = 0;
|
||||
batch.logits[i] = false;
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0)
|
||||
{
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
// assign the system KV cache to all parallel sequences
|
||||
for (int32_t i = 1; i < params.n_parallel; ++i)
|
||||
{
|
||||
llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system);
|
||||
}
|
||||
|
||||
LOG_TEE("system prompt updated\n");
|
||||
update_system_prompt = false;
|
||||
}
|
||||
|
||||
void notifySystemPromptChanged() {
|
||||
// release all slots
|
||||
for (llama_client_slot &slot : slots)
|
||||
{
|
||||
slot.release();
|
||||
}
|
||||
waitAllAreIdle();
|
||||
// wait until system prompt load
|
||||
update_system_prompt = true;
|
||||
while(update_system_prompt) {
|
||||
this_thread::sleep_for(chrono::milliseconds(5));
|
||||
}
|
||||
// system prompt loaded, continue
|
||||
}
|
||||
|
||||
llama_client_slot* requestCompletion(json data) {
|
||||
if(data.contains("system_prompt") &&
|
||||
data.contains("anti_prompt") &&
|
||||
data.contains("assistant_name")) {
|
||||
system_prompt = data.value("system_prompt", "");
|
||||
user_name = data.value("anti_prompt", "");
|
||||
assistant_name = data.value("assistant_name", "");
|
||||
params.antiprompt.clear();
|
||||
params.antiprompt.push_back(user_name);
|
||||
notifySystemPromptChanged();
|
||||
}
|
||||
int slot_id = data.value("slot_id", -1);
|
||||
float temperature = data.value("temperature", 0.1f);
|
||||
string prompt = data.value("prompt", "");
|
||||
for (llama_client_slot & slot : slots)
|
||||
{
|
||||
if ((slot_id == -1 && slot.available()) || slot.id == slot_id)
|
||||
{
|
||||
slot.start(prompt, temperature);
|
||||
LOG_TEE("slot %i is processing\n", slot.id);
|
||||
return &slot; // return a pointer to slot (thread safe?)
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
size_t findStoppingStrings(const std::string &text, const size_t last_token_size,
|
||||
const stop_type type)
|
||||
{
|
||||
size_t stop_pos = std::string::npos;
|
||||
for (const std::string &word : params.antiprompt)
|
||||
{
|
||||
size_t pos;
|
||||
if (type == STOP_FULL)
|
||||
{
|
||||
const size_t tmp = word.size() + last_token_size;
|
||||
const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
|
||||
pos = text.find(word, from_pos);
|
||||
}
|
||||
else
|
||||
{
|
||||
pos = find_partial_stop_string(word, text);
|
||||
}
|
||||
if (pos != std::string::npos &&
|
||||
(stop_pos == std::string::npos || pos < stop_pos))
|
||||
{
|
||||
stop_pos = pos;
|
||||
}
|
||||
}
|
||||
return stop_pos;
|
||||
}
|
||||
|
||||
void waitAllAreIdle() {
|
||||
bool wait = true;
|
||||
while(wait) {
|
||||
wait = false;
|
||||
for (auto &slot : slots)
|
||||
{
|
||||
if (!slot.available())
|
||||
{
|
||||
wait = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool updateSlots() {
|
||||
// update the system prompt wait until all slots are idle state
|
||||
if(update_system_prompt) {
|
||||
updateSystemPrompt();
|
||||
}
|
||||
|
||||
batch.n_tokens = 0;
|
||||
|
||||
// decode any currently ongoing sequences
|
||||
for (auto & slot : slots) {
|
||||
if (slot.state == PROCESSING && slot.command == RELEASE)
|
||||
{
|
||||
LOG_TEE("slot %i released\n", slot.id);
|
||||
llama_kv_cache_seq_rm(ctx, slot.id, n_tokens_system, n_ctx);
|
||||
slot.state = IDLE;
|
||||
slot.command = NONE;
|
||||
continue;
|
||||
}
|
||||
|
||||
// no decode wait until the token had been send to client
|
||||
// improves performance and avoid decoherence?
|
||||
|
||||
if (slot.state == IDLE || slot.newToken) {
|
||||
continue;
|
||||
}
|
||||
|
||||
batch.token [batch.n_tokens] = slot.sampled;
|
||||
batch.pos [batch.n_tokens] = n_tokens_system + slot.n_prompt + slot.n_decoded;
|
||||
batch.seq_id[batch.n_tokens] = slot.id;
|
||||
batch.logits[batch.n_tokens] = true;
|
||||
|
||||
slot.n_decoded += 1;
|
||||
slot.i_batch = batch.n_tokens;
|
||||
|
||||
batch.n_tokens += 1;
|
||||
}
|
||||
|
||||
// assign workload to the slots
|
||||
if (params.cont_batching || batch.n_tokens == 0) {
|
||||
for (llama_client_slot & slot : slots) {
|
||||
// need process the prompt
|
||||
if (slot.state == IDLE && slot.command == LOAD_PROMPT) {
|
||||
slot.state = PROCESSING;
|
||||
slot.command = NONE;
|
||||
//LOG_TEE("slot %i process prompt:\n%s%s'------------------------------\n", slot.id, system_prompt.c_str(), slot.prompt.c_str());
|
||||
std::fill(slot.tokens_prev.begin(), slot.tokens_prev.end(), 0);
|
||||
|
||||
// do not prepend BOS because we have a system prompt!
|
||||
std::vector<llama_token> tokens_prompt;
|
||||
tokens_prompt = ::llama_tokenize(ctx, slot.prompt, false);
|
||||
|
||||
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
|
||||
batch.token [batch.n_tokens] = tokens_prompt[i];
|
||||
batch.pos [batch.n_tokens] = i + n_tokens_system;
|
||||
batch.seq_id[batch.n_tokens] = slot.id;
|
||||
batch.logits[batch.n_tokens] = false;
|
||||
batch.n_tokens += 1;
|
||||
}
|
||||
|
||||
// extract the logits only for the last token
|
||||
if (batch.n_tokens > 0) {
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
}
|
||||
|
||||
slot.n_prompt = tokens_prompt.size();
|
||||
slot.n_decoded = 0;
|
||||
slot.i_batch = batch.n_tokens - 1;
|
||||
|
||||
// insert new requests one-by-one
|
||||
//if (cont_batching) {
|
||||
// break;
|
||||
//}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// process in chunks of params.n_batch
|
||||
int32_t n_batch = params.n_batch;
|
||||
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
||||
// experiment: process in powers of 2
|
||||
//if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
|
||||
// n_batch /= 2;
|
||||
// i -= n_batch;
|
||||
// continue;
|
||||
//}
|
||||
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
0, 0, 0, // unused
|
||||
};
|
||||
|
||||
const int ret = llama_decode(ctx, batch_view);
|
||||
if (ret != 0) {
|
||||
if (n_batch == 1 || ret < 0) {
|
||||
// if you get here, it means the KV cache is full - try increasing it via the context size
|
||||
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
|
||||
|
||||
// retry with half the batch size to try to find a free slot in the KV cache
|
||||
n_batch /= 2;
|
||||
i -= n_batch;
|
||||
continue;
|
||||
}
|
||||
|
||||
for (auto & slot : slots) {
|
||||
if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
params.temp = slot.temperature;
|
||||
const llama_token id = llama_sample_token(ctx, NULL, NULL, params, slot.tokens_prev, candidates, slot.i_batch - i);
|
||||
|
||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||
slot.tokens_prev.erase(slot.tokens_prev.begin());
|
||||
slot.tokens_prev.push_back(id);
|
||||
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
slot.generated_text += token_str;
|
||||
slot.sampled = id;
|
||||
|
||||
size_t stop_pos =
|
||||
findStoppingStrings(slot.generated_text, token_str.size(), STOP_FULL);
|
||||
|
||||
slot.sampled_token_str = token_str;
|
||||
// notify new token
|
||||
slot.nofity();
|
||||
|
||||
if (slot.n_decoded > 2 &&
|
||||
(id == llama_token_eos(ctx) ||
|
||||
(params.n_predict > 0 &&
|
||||
slot.n_decoded + slot.n_prompt >=
|
||||
params.n_predict) ||
|
||||
stop_pos != std::string::npos)) {
|
||||
//LOG_TEE("slot %i generated text:\n%s'------------------------------\n", slot.id, slot.generated_text.c_str());
|
||||
slot.generated_text.clear();
|
||||
slot.release();
|
||||
}
|
||||
slot.i_batch = -1;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
const server_params &sparams)
|
||||
{
|
||||
printf("usage: %s [options]\n", argv0);
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
if (llama_mlock_supported())
|
||||
{
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
if (llama_mmap_supported())
|
||||
{
|
||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
printf(" -ngl N, --n-gpu-layers N\n");
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
||||
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||
printf(" -nommq, --no-mul-mat-q\n");
|
||||
printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
|
||||
printf(" Not recommended since this is both slower and uses more VRAM.\n");
|
||||
#endif
|
||||
printf(" -m FNAME, --model FNAME\n");
|
||||
printf(" model path (default: %s)\n", params.model.c_str());
|
||||
printf(" -a ALIAS, --alias ALIAS\n");
|
||||
printf(" set an alias for the model, will be added as `model` field in completion response\n");
|
||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
||||
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
|
||||
printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
|
||||
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
|
||||
|
||||
// new arguments
|
||||
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
|
||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||
printf(" -f FNAME, --file FNAME\n");
|
||||
printf(" load a system prompt from a file.\n");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
gpt_params ¶ms)
|
||||
{
|
||||
gpt_params default_params;
|
||||
server_params default_sparams;
|
||||
std::string arg;
|
||||
bool invalid_param = false;
|
||||
|
||||
for (int i = 1; i < argc; i++)
|
||||
{
|
||||
arg = argv[i];
|
||||
if (arg == "--port")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.port = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--host")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.hostname = argv[i];
|
||||
}
|
||||
else if (arg == "--path")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.public_path = argv[i];
|
||||
}
|
||||
else if (arg == "--timeout" || arg == "-to")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.read_timeout = std::stoi(argv[i]);
|
||||
sparams.write_timeout = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "-m" || arg == "--model")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.model = argv[i];
|
||||
}
|
||||
else if (arg == "-a" || arg == "--alias")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.model_alias = argv[i];
|
||||
}
|
||||
else if (arg == "-h" || arg == "--help")
|
||||
{
|
||||
server_print_usage(argv[0], default_params, default_sparams);
|
||||
exit(0);
|
||||
}
|
||||
else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--rope-freq-base")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_base = std::stof(argv[i]);
|
||||
}
|
||||
else if (arg == "--rope-freq-scale")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_scale = std::stof(argv[i]);
|
||||
}
|
||||
else if (arg == "--memory-f32" || arg == "--memory_f32")
|
||||
{
|
||||
params.memory_f16 = false;
|
||||
}
|
||||
else if (arg == "--threads" || arg == "-t")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "-b" || arg == "--batch-size")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_batch = std::stoi(argv[i]);
|
||||
params.n_batch = std::min(512, params.n_batch);
|
||||
}
|
||||
else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
params.n_gpu_layers = std::stoi(argv[i]);
|
||||
#else
|
||||
LOG_TEE("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
|
||||
"See main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
}
|
||||
else if (arg == "--tensor-split" || arg == "-ts")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
std::string arg_next = argv[i];
|
||||
|
||||
// split string by , and /
|
||||
const std::regex regex{R"([,/]+)"};
|
||||
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
|
||||
|
||||
for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device)
|
||||
{
|
||||
if (i_device < split_arg.size())
|
||||
{
|
||||
params.tensor_split[i_device] = std::stof(split_arg[i_device]);
|
||||
}
|
||||
else
|
||||
{
|
||||
params.tensor_split[i_device] = 0.0f;
|
||||
}
|
||||
}
|
||||
#else
|
||||
LOG_TEE("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
}
|
||||
else if (arg == "--no-mul-mat-q" || arg == "-nommq")
|
||||
{
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
params.mul_mat_q = false;
|
||||
#else
|
||||
LOG_TEE("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
}
|
||||
else if (arg == "--main-gpu" || arg == "-mg")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
params.main_gpu = std::stoi(argv[i]);
|
||||
#else
|
||||
LOG_TEE("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.");
|
||||
#endif
|
||||
}
|
||||
else if (arg == "--lora")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back({argv[i], 1.0f});
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--lora-scaled")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
const char * lora_adapter = argv[i];
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back(make_tuple(lora_adapter, std::stof(argv[i])));
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--lora-base")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_base = argv[i];
|
||||
}
|
||||
else if (arg == "--mlock")
|
||||
{
|
||||
params.use_mlock = true;
|
||||
}
|
||||
else if (arg == "--no-mmap")
|
||||
{
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--numa")
|
||||
{
|
||||
params.numa = true;
|
||||
} else if (arg == "-cb" || arg == "--cont-batching")
|
||||
{
|
||||
params.cont_batching = true;
|
||||
}
|
||||
else if (arg == "-np" || arg == "--parallel")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_parallel = std::stoi(argv[i]);
|
||||
} else if (arg == "-n" || arg == "--n-predict")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_predict = std::stoi(argv[i]);
|
||||
} else if (arg == "-r" || arg == "--reverse-prompt")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.antiprompt.push_back(argv[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
server_print_usage(argv[0], default_params, default_sparams);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (invalid_param)
|
||||
{
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
server_print_usage(argv[0], default_params, default_sparams);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
gpt_params params;
|
||||
|
||||
server_params sparams;
|
||||
|
||||
server_params_parse(argc, argv, sparams, params);
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("server-parallel", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
// load the target model
|
||||
params.logits_all = true;
|
||||
server_parallel_context llama;
|
||||
|
||||
if(!llama.loadModel(params)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama.initialize();
|
||||
|
||||
Server svr;
|
||||
|
||||
svr.Get("/", [&](const Request & /*req*/, Response &res)
|
||||
{ res.set_content(index_html_, "text/html"); });
|
||||
|
||||
svr.Get("/index.js", [&](const Request & /*req*/, Response &res)
|
||||
{ res.set_content(index_js_, "text/html"); });
|
||||
|
||||
svr.Get("/props", [&llama](const Request & /*req*/, Response &res)
|
||||
{
|
||||
json data = {
|
||||
{ "user_name", llama.user_name.c_str() },
|
||||
{ "assistant_name", llama.assistant_name.c_str() }
|
||||
};
|
||||
res.set_content(data.dump(), "application/json"); });
|
||||
|
||||
svr.Post("/completion", [&llama](const Request &req, Response &res)
|
||||
{
|
||||
llama_client_slot* slot = llama.requestCompletion(json::parse(req.body));
|
||||
// Verify if the slot exist
|
||||
if (slot) {
|
||||
res.set_chunked_content_provider("text/event-stream",
|
||||
[slot](size_t /*offset*/, DataSink &sink) {
|
||||
if(slot->available()) { // slot has been released
|
||||
sink.done();
|
||||
return false;
|
||||
}
|
||||
if(slot->hasNewToken()) { // new token notification
|
||||
stringstream ss;
|
||||
json res_d = {{ "content", slot->sampled_token_str }};
|
||||
ss << "data: " << res_d.dump() << "\n\n";
|
||||
string result = ss.str();
|
||||
if(!sink.write(result.c_str(), result.size())) {
|
||||
slot->release();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
});
|
||||
} else {
|
||||
LOG_TEE("slot unavailable\n");
|
||||
res.status = 404;
|
||||
res.set_content("slot_error", "text/plain");
|
||||
} });
|
||||
|
||||
thread t([&llama]()
|
||||
{
|
||||
bool running = true;
|
||||
while (running)
|
||||
{
|
||||
running = llama.updateSlots();
|
||||
} });
|
||||
|
||||
svr.set_read_timeout(sparams.read_timeout);
|
||||
svr.set_write_timeout(sparams.write_timeout);
|
||||
|
||||
if (!svr.bind_to_port(sparams.hostname, sparams.port))
|
||||
{
|
||||
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Set the base directory for serving static files
|
||||
svr.set_base_dir(sparams.public_path);
|
||||
|
||||
// to make it ctrl+clickable:
|
||||
printf("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
|
||||
|
||||
if (!svr.listen_after_bind())
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
@@ -448,7 +448,7 @@ struct llama_server_context
|
||||
n_past = common_part(embd, prompt_tokens);
|
||||
|
||||
// since #3228 we now have to manually manage the KV cache
|
||||
llama_kv_cache_seq_rm(ctx, 0, n_past, params.n_ctx);
|
||||
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||
|
||||
embd = prompt_tokens;
|
||||
if (n_past == num_prompt_tokens)
|
||||
@@ -504,9 +504,11 @@ struct llama_server_context
|
||||
});
|
||||
}
|
||||
|
||||
bool tg = true;
|
||||
while (n_past < embd.size())
|
||||
{
|
||||
int n_eval = (int)embd.size() - n_past;
|
||||
tg = n_eval == 1;
|
||||
if (n_eval > params.n_batch)
|
||||
{
|
||||
n_eval = params.n_batch;
|
||||
@@ -633,7 +635,9 @@ struct llama_server_context
|
||||
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(result.tok);
|
||||
num_tokens_predicted++;
|
||||
if (tg) {
|
||||
num_tokens_predicted++;
|
||||
}
|
||||
}
|
||||
|
||||
// add it to the context
|
||||
@@ -1011,7 +1015,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back({argv[i], 1.0f});
|
||||
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--lora-scaled")
|
||||
@@ -1027,7 +1031,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
|
||||
params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--lora-base")
|
||||
@@ -1124,8 +1128,6 @@ static json format_timings(llama_server_context &llama)
|
||||
{
|
||||
const auto timings = llama_get_timings(llama.ctx);
|
||||
|
||||
assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
|
||||
|
||||
return json{
|
||||
{"prompt_n", timings.n_p_eval},
|
||||
{"prompt_ms", timings.t_p_eval_ms},
|
||||
|
||||
@@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
|
||||
LOG("out of drafted tokens\n");
|
||||
}
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, n_ctx);
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
||||
llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0));
|
||||
++n_past_dft;
|
||||
|
||||
@@ -257,7 +257,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// evaluate the drafted token on the draft model
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, n_ctx);
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, -1);
|
||||
llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0));
|
||||
++n_past_cur;
|
||||
|
||||
@@ -267,7 +267,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// evaluate the target model on the drafted tokens
|
||||
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_ctx);
|
||||
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1);
|
||||
llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0));
|
||||
++n_past_tgt;
|
||||
|
||||
|
||||
@@ -364,7 +364,7 @@ class ModelParams:
|
||||
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
||||
|
||||
def tensor_name(key, bid=None):
|
||||
return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + ".weight"
|
||||
return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"
|
||||
|
||||
class Layer:
|
||||
def __init__(self, params, bid):
|
||||
|
||||
@@ -62,7 +62,7 @@
|
||||
mkdir -p $out/include
|
||||
cp ${src}/llama.h $out/include/
|
||||
'';
|
||||
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
|
||||
cmakeFlags = [ "-DLLAMA_NATIVE=OFF" "-DLLAMA_BUILD_SERVER=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
|
||||
in
|
||||
{
|
||||
packages.default = pkgs.stdenv.mkDerivation {
|
||||
|
||||
+4
-5
@@ -1213,12 +1213,9 @@ void ggml_metal_graph_compute(
|
||||
float max_bias;
|
||||
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
|
||||
|
||||
if (__builtin_popcount(n_head) != 1) {
|
||||
GGML_ASSERT(false && "only power-of-two n_head implemented");
|
||||
}
|
||||
|
||||
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
||||
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_alibi_f32];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
@@ -1239,7 +1236,9 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
|
||||
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
|
||||
[encoder setBytes:&m1 length:sizeof( float) atIndex:19];
|
||||
[encoder setBytes:&n_heads_log2_floor length:sizeof(int) atIndex:20];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
|
||||
+9
-2
@@ -830,7 +830,9 @@ kernel void kernel_alibi_f32(
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
constant float & m0,
|
||||
constant float & m0,
|
||||
constant float & m1,
|
||||
constant int & n_heads_log2_floor,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
@@ -846,7 +848,12 @@ kernel void kernel_alibi_f32(
|
||||
const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
||||
|
||||
device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
float m_k = pow(m0, i2 + 1);
|
||||
float m_k;
|
||||
if (i2 < n_heads_log2_floor) {
|
||||
m_k = pow(m0, i2 + 1);
|
||||
} else {
|
||||
m_k = pow(m1, 2 * (i2 - n_heads_log2_floor) + 1);
|
||||
}
|
||||
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
|
||||
device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1);
|
||||
|
||||
+119
-53
@@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8
|
||||
|
||||
__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
|
||||
{
|
||||
const int i = get_group_id(0);
|
||||
const int i = get_group_id(0) + get_global_offset(0);
|
||||
const int tid = get_local_id(0);
|
||||
const int n = tid / 32;
|
||||
const int l = tid - 32 * n;
|
||||
const int is = 8 * n + l / 16;
|
||||
|
||||
const uint8_t q = x[i].qs[32 * n + l];
|
||||
__global float *y = yy + i * QK_K + 128 * n;
|
||||
__global float *y = yy + get_group_id(0) * QK_K + 128 * n;
|
||||
|
||||
const float dall = vload_half(0, &x[i].d);
|
||||
const float dmin = vload_half(0, &x[i].dmin);
|
||||
@@ -223,7 +223,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
|
||||
__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
|
||||
{
|
||||
int r = get_local_id(0) / 4;
|
||||
int i = get_group_id(0);
|
||||
int i = get_group_id(0) + get_global_offset(0);
|
||||
int tid = r / 2;
|
||||
int is0 = r % 2;
|
||||
int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
|
||||
@@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
|
||||
float d_all = vload_half(0, &x[i].d);
|
||||
float dl = d_all * (us - 32);
|
||||
|
||||
__global float *y = yy + i * QK_K + 128 * n + 32 * j;
|
||||
__global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
|
||||
const __global uint8_t *q = x[i].qs + 32 * n;
|
||||
const __global uint8_t *hm = x[i].hmask;
|
||||
|
||||
@@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
|
||||
|
||||
__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
|
||||
{
|
||||
const int i = get_group_id(0);
|
||||
const int i = get_group_id(0) + get_global_offset(0);
|
||||
const int tid = get_local_id(0);
|
||||
const int il = tid / 8;
|
||||
const int ir = tid % 8;
|
||||
const int is = 2 * il;
|
||||
const int n = 4;
|
||||
|
||||
__global float *y = yy + i * QK_K + 64 * il + n * ir;
|
||||
__global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;
|
||||
|
||||
const float dall = vload_half(0, &x[i].d);
|
||||
const float dmin = vload_half(0, &x[i].dmin);
|
||||
@@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
|
||||
|
||||
__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
|
||||
{
|
||||
const int i = get_group_id(0);
|
||||
const int i = get_group_id(0) + get_global_offset(0);
|
||||
const int tid = get_local_id(0);
|
||||
const int il = tid / 16;
|
||||
const int ir = tid % 16;
|
||||
const int is = 2 * il;
|
||||
|
||||
__global float *y = yy + i * QK_K + 64 * il + 2 * ir;
|
||||
__global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;
|
||||
|
||||
const float dall = vload_half(0, &x[i].d);
|
||||
const float dmin = vload_half(0, &x[i].dmin);
|
||||
@@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
|
||||
|
||||
__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
|
||||
{
|
||||
const int i = get_group_id(0);
|
||||
const int i = get_group_id(0) + get_global_offset(0);
|
||||
const int tid = get_local_id(0);
|
||||
const int ip = tid / 32;
|
||||
const int il = tid - 32 * ip;
|
||||
const int is = 8 * ip + il / 16;
|
||||
|
||||
__global float *y = yy + i * QK_K + 128 * ip + il;
|
||||
__global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;
|
||||
|
||||
const float d = vload_half(0, &x[i].d);
|
||||
|
||||
@@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
|
||||
const uint qk = QUANT_K;
|
||||
const uint qr = QUANT_R;
|
||||
|
||||
const int ib = i/qk; // block index
|
||||
const int ib = i/qk + get_global_offset(0); // block index
|
||||
const int iqs = (i%qk)/qr; // quant index
|
||||
const int iybs = i - i%qk; // y block start index
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
@@ -1349,30 +1349,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
|
||||
const enum ggml_type type = src->type;
|
||||
const size_t ts = ggml_type_size(type);
|
||||
const size_t bs = ggml_blck_size(type);
|
||||
const uint64_t row_size = ts*ne0/bs;
|
||||
|
||||
const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
|
||||
if (nb0 == ts && nb1 == ts*ne0/bs) {
|
||||
err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
|
||||
return err;
|
||||
const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
|
||||
if (nb0 == ts && nb1 == row_size) {
|
||||
return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
|
||||
}
|
||||
if (nb0 == ts) {
|
||||
const size_t buffer_origin[3] = { offset, 0, 0 };
|
||||
const size_t host_origin[3] = { 0, 0, 0 };
|
||||
const size_t region[3] = { ts*ne0/bs, ne1, 1 };
|
||||
err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
|
||||
return err;
|
||||
const size_t region[3] = { row_size, ne1, 1 };
|
||||
return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
|
||||
}
|
||||
std::vector<cl_event> events;
|
||||
if (ev && ne1>1) events.reserve(ne1-1);
|
||||
for (uint64_t i1 = 0; i1 < ne1; i1++) {
|
||||
// pretend the row is a matrix with cols=1
|
||||
const size_t buffer_origin[3] = { offset, i1, 0 };
|
||||
const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
|
||||
const size_t host_origin[3] = { 0, 0, 0 };
|
||||
const size_t region[3] = { ts/bs, ne0, 1 };
|
||||
err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
|
||||
const size_t region[3] = { ts, ne0/bs, 1 };
|
||||
// if an event is requested, make the last write wait for all previous writes to complete
|
||||
if (ev && i1) {
|
||||
events.push_back(*ev);
|
||||
}
|
||||
cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
|
||||
err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
|
||||
if (err != CL_SUCCESS) {
|
||||
break;
|
||||
for (auto event : events) {
|
||||
clReleaseEvent(event);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
}
|
||||
return err;
|
||||
for (auto event : events) {
|
||||
CL_CHECK(clReleaseEvent(event));
|
||||
}
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
@@ -1476,10 +1488,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
const int64_t ne13 = src1->ne[3];
|
||||
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
|
||||
const int64_t r2 = ne12 / ne02;
|
||||
const int64_t r3 = ne13 / ne03;
|
||||
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
const int x_ne = ne01 * ne00;
|
||||
@@ -1498,13 +1515,25 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
size_t x_offset = 0;
|
||||
int64_t pi02 = -1;
|
||||
int64_t pi03 = -1;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
int64_t i03 = i13 / r3;
|
||||
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
int64_t i02 = i12 / r2;
|
||||
|
||||
// copy data to device
|
||||
if (src0->backend != GGML_BACKEND_GPU) {
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else if (i02 != pi02 || i03 != pi03) {
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
||||
pi02 = i02;
|
||||
pi03 = i03;
|
||||
}
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
@@ -1514,7 +1543,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_X, x_offset, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
@@ -1525,7 +1554,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
||||
}
|
||||
}
|
||||
@@ -1547,6 +1576,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
const int64_t ne13 = src1->ne[3];
|
||||
|
||||
const int nb10 = src1->nb[0];
|
||||
const int nb11 = src1->nb[1];
|
||||
@@ -1556,6 +1587,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
|
||||
const int64_t r2 = ne12 / ne02;
|
||||
const int64_t r3 = ne13 / ne03;
|
||||
|
||||
const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
|
||||
const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
|
||||
const int x_ne = ne01 * ne00;
|
||||
@@ -1577,32 +1611,44 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
bool src1_cont_rows = nb10 == sizeof(float);
|
||||
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
size_t x_offset = 0;
|
||||
int64_t pi02 = -1;
|
||||
int64_t pi03 = -1;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
int64_t i03 = i13 / r3;
|
||||
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
int64_t i02 = i12 / r2;
|
||||
|
||||
// copy src0 to device
|
||||
if (src0->backend != GGML_BACKEND_GPU) {
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else if (i02 != pi02 || i03 != pi03) {
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
||||
pi02 = i02;
|
||||
pi03 = i03;
|
||||
}
|
||||
|
||||
// convert src1 to fp16
|
||||
// TODO: use multiple threads
|
||||
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
|
||||
char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
|
||||
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
|
||||
char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
|
||||
if (src1_cont_rows) {
|
||||
if (src1_cont_cols) {
|
||||
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
|
||||
}
|
||||
else {
|
||||
for (int64_t i01 = 0; i01 < ne11; i01++) {
|
||||
ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
|
||||
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
||||
ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int64_t i01 = 0; i01 < ne11; i01++) {
|
||||
for (int64_t i00 = 0; i00 < ne10; i00++) {
|
||||
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
||||
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
||||
// very slow due to no inlining
|
||||
tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
|
||||
tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1618,7 +1664,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_X, x_offset, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
@@ -1631,7 +1677,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
// copy dst to host, then convert to float
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
|
||||
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
||||
}
|
||||
@@ -1652,18 +1698,24 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
const int64_t ne13 = src1->ne[3];
|
||||
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
const ggml_type type = src0->type;
|
||||
const bool mul_mat_vec = ne11 == 1;
|
||||
|
||||
const int64_t r2 = ne12 / ne02;
|
||||
const int64_t r3 = ne13 / ne03;
|
||||
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = ne11 * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
|
||||
const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
|
||||
const size_t q_sz = ggml_type_size(type) * x_bps;
|
||||
|
||||
size_t x_size;
|
||||
size_t y_size;
|
||||
@@ -1690,12 +1742,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
size_t ev_idx = 0;
|
||||
std::vector<cl_event> events;
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
int64_t pi02 = -1;
|
||||
int64_t pi03 = -1;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
int64_t i03 = i13 / r3;
|
||||
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
int64_t i02 = i12 / r2;
|
||||
|
||||
// copy src0 to device if necessary
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
||||
if (i02 != pi02 || i03 != pi03) {
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
||||
pi02 = i02;
|
||||
pi03 = i03;
|
||||
}
|
||||
} else if (src0->backend == GGML_BACKEND_GPU) {
|
||||
d_Q = (cl_mem) src0->extra;
|
||||
} else {
|
||||
@@ -1704,7 +1767,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
||||
// copy src1 to device
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
|
||||
|
||||
// compute
|
||||
const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
|
||||
@@ -1720,12 +1783,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
|
||||
// convert src0 to fp32 on device
|
||||
const size_t global = x_ne / global_denom;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
||||
|
||||
// copy src1 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
|
||||
events.emplace_back();
|
||||
|
||||
@@ -1749,7 +1813,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
|
||||
for (auto *event : events) {
|
||||
clReleaseEvent(event);
|
||||
@@ -1844,17 +1908,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
|
||||
const int64_t ne3 = tensor->ne[3];
|
||||
|
||||
const ggml_type type = tensor->type;
|
||||
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
|
||||
const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
|
||||
const size_t q_sz = s_sz * (size_t) (ne2 * ne3);
|
||||
|
||||
size_t q_size;
|
||||
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
|
||||
|
||||
tensor->data = data;
|
||||
// copy tensor to device
|
||||
size_t offset = 0;
|
||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
||||
int i = i3*ne2 + i2;
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
|
||||
offset += s_sz;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -401,10 +401,14 @@ extern "C" {
|
||||
GGML_OP_CLAMP,
|
||||
GGML_OP_CONV_1D,
|
||||
GGML_OP_CONV_2D,
|
||||
GGML_OP_CONV_TRANSPOSE_1D,
|
||||
GGML_OP_CONV_TRANSPOSE_2D,
|
||||
GGML_OP_POOL_1D,
|
||||
GGML_OP_POOL_2D,
|
||||
|
||||
GGML_OP_CONV_1D_STAGE_0, // internal
|
||||
GGML_OP_CONV_1D_STAGE_1, // internal
|
||||
|
||||
GGML_OP_UPSCALE, // nearest interpolate
|
||||
|
||||
GGML_OP_FLASH_ATTN,
|
||||
@@ -1386,6 +1390,14 @@ extern "C" {
|
||||
int s,
|
||||
int d);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int p0,
|
||||
int d0);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -1759,6 +1771,7 @@ extern "C" {
|
||||
GGML_OPT_NO_CONTEXT,
|
||||
GGML_OPT_INVALID_WOLFE,
|
||||
GGML_OPT_FAIL,
|
||||
GGML_OPT_CANCEL,
|
||||
|
||||
GGML_LINESEARCH_FAIL = -128,
|
||||
GGML_LINESEARCH_MINIMUM_STEP,
|
||||
|
||||
+250
-149
@@ -85,10 +85,13 @@ class MODEL_ARCH(IntEnum):
|
||||
GPTNEOX : int = auto()
|
||||
MPT : int = auto()
|
||||
STARCODER : int = auto()
|
||||
REFACT : int = auto()
|
||||
BERT : int = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
TOKEN_EMBD : int = auto()
|
||||
TOKEN_TYPES : int = auto()
|
||||
POS_EMBD : int = auto()
|
||||
OUTPUT : int = auto()
|
||||
OUTPUT_NORM : int = auto()
|
||||
@@ -116,78 +119,153 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.GPTNEOX: "gptneox",
|
||||
MODEL_ARCH.MPT: "mpt",
|
||||
MODEL_ARCH.STARCODER: "starcoder",
|
||||
MODEL_ARCH.REFACT: "refact",
|
||||
MODEL_ARCH.BERT: "bert",
|
||||
}
|
||||
|
||||
MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
|
||||
MODEL_ARCH.LLAMA: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.GPTNEOX: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.FALCON: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.BAICHUAN: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.STARCODER: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.GPT2: {
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.TOKEN_TYPES: "token_types",
|
||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
}
|
||||
|
||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_ARCH.LLAMA: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.GPTNEOX: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.FALCON: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM_2,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.BAICHUAN: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.STARCODER: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.POS_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.BERT: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.TOKEN_TYPES,
|
||||
MODEL_TENSOR.POS_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.MPT: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.GPTJ: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.REFACT: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.GPT2: [
|
||||
# TODO
|
||||
},
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
@@ -208,31 +286,41 @@ class TensorNameMap:
|
||||
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
# Token embeddings
|
||||
MODEL_TENSOR.TOKEN_EMBD: (
|
||||
"gpt_neox.embed_in", # gptneox
|
||||
"transformer.wte", # gpt2 mpt
|
||||
"transformer.word_embeddings", # falcon
|
||||
"model.embed_tokens", # llama-hf
|
||||
"tok_embeddings", # llama-pth
|
||||
"gpt_neox.embed_in", # gptneox
|
||||
"transformer.wte", # gpt2 gpt-j mpt refact
|
||||
"transformer.word_embeddings", # falcon
|
||||
"model.embed_tokens", # llama-hf
|
||||
"tok_embeddings", # llama-pth
|
||||
"embeddings.word_embeddings", # bert
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
MODEL_TENSOR.TOKEN_TYPES: (
|
||||
"embeddings.token_type_embeddings", # bert
|
||||
),
|
||||
|
||||
# Position embeddings
|
||||
MODEL_TENSOR.POS_EMBD: (
|
||||
"transformer.wpe", # gpt2
|
||||
"transformer.wpe", # gpt2
|
||||
"embeddings.position_embeddings", # bert
|
||||
),
|
||||
|
||||
# Output
|
||||
MODEL_TENSOR.OUTPUT: (
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan
|
||||
"output", # llama-pth
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 gpt-j mpt falcon llama-hf baichuan
|
||||
"output", # llama-pth
|
||||
),
|
||||
|
||||
# Output norm
|
||||
MODEL_TENSOR.OUTPUT_NORM: (
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 falcon
|
||||
"model.norm", # llama-hf baichuan
|
||||
"norm", # llama-pth
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 gpt-j falcon
|
||||
"model.norm", # llama-hf baichuan
|
||||
"norm", # llama-pth
|
||||
"embeddings.LayerNorm", # bert
|
||||
"transformer.norm_f", # mpt
|
||||
"ln_f", # refact
|
||||
),
|
||||
|
||||
# Rope frequencies
|
||||
@@ -244,13 +332,14 @@ class TensorNameMap:
|
||||
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
# Attention norm
|
||||
MODEL_TENSOR.ATTN_NORM: (
|
||||
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_1", # gpt2
|
||||
"transformer.blocks.{bid}.norm_1", # mpt
|
||||
"transformer.h.{bid}.input_layernorm", # falcon7b
|
||||
"transformer.h.{bid}.ln_mlp", # falcon40b
|
||||
"model.layers.{bid}.input_layernorm", # llama-hf
|
||||
"layers.{bid}.attention_norm", # llama-pth
|
||||
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
|
||||
"transformer.blocks.{bid}.norm_1", # mpt
|
||||
"transformer.h.{bid}.input_layernorm", # falcon7b
|
||||
"transformer.h.{bid}.ln_mlp", # falcon40b
|
||||
"model.layers.{bid}.input_layernorm", # llama-hf
|
||||
"layers.{bid}.attention_norm", # llama-pth
|
||||
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
@@ -260,38 +349,46 @@ class TensorNameMap:
|
||||
|
||||
# Attention query-key-value
|
||||
MODEL_TENSOR.ATTN_QKV: (
|
||||
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
|
||||
"transformer.h.{bid}.attn.c_attn", # gpt2
|
||||
"transformer.blocks.{bid}.attn.Wqkv", # mpt
|
||||
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
||||
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
|
||||
"transformer.h.{bid}.attn.c_attn", # gpt2
|
||||
"transformer.blocks.{bid}.attn.Wqkv", # mpt
|
||||
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
||||
),
|
||||
|
||||
# Attention query
|
||||
MODEL_TENSOR.ATTN_Q: (
|
||||
"model.layers.{bid}.self_attn.q_proj", # llama-hf
|
||||
"layers.{bid}.attention.wq", # llama-pth
|
||||
"model.layers.{bid}.self_attn.q_proj", # llama-hf
|
||||
"layers.{bid}.attention.wq", # llama-pth
|
||||
"encoder.layer.{bid}.attention.self.query", # bert
|
||||
"transformer.h.{bid}.attn.q_proj", # gpt-j
|
||||
),
|
||||
|
||||
# Attention key
|
||||
MODEL_TENSOR.ATTN_K: (
|
||||
"model.layers.{bid}.self_attn.k_proj", # llama-hf
|
||||
"layers.{bid}.attention.wk", # llama-pth
|
||||
"model.layers.{bid}.self_attn.k_proj", # llama-hf
|
||||
"layers.{bid}.attention.wk", # llama-pth
|
||||
"encoder.layer.{bid}.attention.self.key", # bert
|
||||
"transformer.h.{bid}.attn.k_proj", # gpt-j
|
||||
),
|
||||
|
||||
# Attention value
|
||||
MODEL_TENSOR.ATTN_V: (
|
||||
"model.layers.{bid}.self_attn.v_proj", # llama-hf
|
||||
"layers.{bid}.attention.wv", # llama-pth
|
||||
"model.layers.{bid}.self_attn.v_proj", # llama-hf
|
||||
"layers.{bid}.attention.wv", # llama-pth
|
||||
"encoder.layer.{bid}.attention.self.value", # bert
|
||||
"transformer.h.{bid}.attn.v_proj", # gpt-j
|
||||
),
|
||||
|
||||
# Attention output
|
||||
MODEL_TENSOR.ATTN_OUT: (
|
||||
"gpt_neox.layers.{bid}.attention.dense", # gptneox
|
||||
"transformer.h.{bid}.attn.c_proj", # gpt2
|
||||
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
||||
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||
"model.layers.{bid}.self_attn.o_proj", # llama-hf
|
||||
"layers.{bid}.attention.wo", # llama-pth
|
||||
"gpt_neox.layers.{bid}.attention.dense", # gptneox
|
||||
"transformer.h.{bid}.attn.c_proj", # gpt2 refact
|
||||
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
||||
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||
"model.layers.{bid}.self_attn.o_proj", # llama-hf
|
||||
"layers.{bid}.attention.wo", # llama-pth
|
||||
"encoder.layer.{bid}.attention.output.dense", # bert
|
||||
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
||||
),
|
||||
|
||||
# Rotary embeddings
|
||||
@@ -302,64 +399,65 @@ class TensorNameMap:
|
||||
|
||||
# Feed-forward norm
|
||||
MODEL_TENSOR.FFN_NORM: (
|
||||
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_2", # gpt2
|
||||
"transformer.blocks.{bid}.norm_2", # mpt
|
||||
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
||||
"layers.{bid}.ffn_norm", # llama-pth
|
||||
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_2", # gpt2 refact
|
||||
"transformer.blocks.{bid}.norm_2", # mpt
|
||||
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
||||
"layers.{bid}.ffn_norm", # llama-pth
|
||||
"encoder.layer.{bid}.output.LayerNorm", # bert
|
||||
),
|
||||
|
||||
# Feed-forward up
|
||||
MODEL_TENSOR.FFN_UP: (
|
||||
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
|
||||
"transformer.h.{bid}.mlp.c_fc", # gpt2
|
||||
"transformer.blocks.{bid}.ffn.up_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
|
||||
"model.layers.{bid}.mlp.up_proj", # llama-hf
|
||||
"layers.{bid}.feed_forward.w3", # llama-pth
|
||||
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
|
||||
"transformer.h.{bid}.mlp.c_fc", # gpt2
|
||||
"transformer.blocks.{bid}.ffn.up_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
|
||||
"model.layers.{bid}.mlp.up_proj", # llama-hf refact
|
||||
"layers.{bid}.feed_forward.w3", # llama-pth
|
||||
"encoder.layer.{bid}.intermediate.dense", # bert
|
||||
"transformer.h.{bid}.mlp.fc_in", # gpt-j
|
||||
),
|
||||
|
||||
# Feed-forward gate
|
||||
MODEL_TENSOR.FFN_GATE: (
|
||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf
|
||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
|
||||
"layers.{bid}.feed_forward.w1", # llama-pth
|
||||
),
|
||||
|
||||
# Feed-forward down
|
||||
MODEL_TENSOR.FFN_DOWN: (
|
||||
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
|
||||
"transformer.h.{bid}.mlp.c_proj", # gpt2
|
||||
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
||||
"model.layers.{bid}.mlp.down_proj", # llama-hf
|
||||
"layers.{bid}.feed_forward.w2", # llama-pth
|
||||
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
|
||||
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact
|
||||
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
||||
"model.layers.{bid}.mlp.down_proj", # llama-hf
|
||||
"layers.{bid}.feed_forward.w2", # llama-pth
|
||||
"encoder.layer.{bid}.output.dense", # bert
|
||||
"transformer.h.{bid}.mlp.fc_out", # gpt-j
|
||||
),
|
||||
}
|
||||
|
||||
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
||||
|
||||
tensor_names: dict[MODEL_TENSOR, str]
|
||||
|
||||
def __init__(self, arch: MODEL_ARCH, n_blocks: int):
|
||||
mapping = self.mapping = {}
|
||||
tensor_names = self.tensor_names = MODEL_TENSOR_NAMES[arch]
|
||||
self.mapping = {}
|
||||
for tensor, keys in self.mappings_cfg.items():
|
||||
tensor_name = tensor_names.get(tensor)
|
||||
if tensor_name is None:
|
||||
if tensor not in MODEL_TENSORS[arch]:
|
||||
continue
|
||||
mapping[tensor_name] = (tensor, tensor_name)
|
||||
tensor_name = TENSOR_NAMES[tensor]
|
||||
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||
for key in keys:
|
||||
mapping[key] = (tensor, tensor_name)
|
||||
self.mapping[key] = (tensor, tensor_name)
|
||||
for bid in range(n_blocks):
|
||||
for tensor, keys in self.block_mappings_cfg.items():
|
||||
tensor_name = tensor_names.get(tensor)
|
||||
if tensor_name is None:
|
||||
if tensor not in MODEL_TENSORS[arch]:
|
||||
continue
|
||||
tensor_name = tensor_name.format(bid = bid)
|
||||
mapping[tensor_name] = (tensor, tensor_name)
|
||||
tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
|
||||
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||
for key in keys:
|
||||
key = key.format(bid = bid)
|
||||
mapping[key] = (tensor, tensor_name)
|
||||
self.mapping[key] = (tensor, tensor_name)
|
||||
|
||||
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
|
||||
result = self.mapping.get(key)
|
||||
@@ -800,22 +898,25 @@ class SpecialVocab:
|
||||
special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
|
||||
special_token_ids: dict[str, int] = {}
|
||||
|
||||
def __init__(self, path: Path, load_merges: bool = False, special_token_types: tuple[str, ...] | None = None):
|
||||
def __init__(
|
||||
self, path: str | os.PathLike[str], load_merges: bool = False,
|
||||
special_token_types: tuple[str, ...] | None = None,
|
||||
):
|
||||
self.special_token_ids = {}
|
||||
self.load_merges = load_merges
|
||||
if special_token_types is not None:
|
||||
self.special_token_types = special_token_types
|
||||
self.load(path)
|
||||
self._load(Path(path))
|
||||
|
||||
def load(self, path: Path):
|
||||
if not self.try_load_from_tokenizer_json(path):
|
||||
self.try_load_from_config_json(path)
|
||||
def _load(self, path: Path) -> None:
|
||||
if not self._try_load_from_tokenizer_json(path):
|
||||
self._try_load_from_config_json(path)
|
||||
|
||||
def try_load_from_tokenizer_json(self, path: Path) -> bool:
|
||||
def _try_load_from_tokenizer_json(self, path: Path) -> bool:
|
||||
tokenizer_file = path / 'tokenizer.json'
|
||||
if not tokenizer_file.is_file():
|
||||
return False
|
||||
with open(tokenizer_file, 'r', encoding = 'utf-8') as f:
|
||||
with open(tokenizer_file, encoding = 'utf-8') as f:
|
||||
tokenizer = json.load(f)
|
||||
if self.load_merges:
|
||||
merges = tokenizer.get('model', {}).get('merges')
|
||||
@@ -825,7 +926,7 @@ class SpecialVocab:
|
||||
added_tokens = tokenizer.get('added_tokens')
|
||||
if added_tokens is None or not tokenizer_config_file.is_file():
|
||||
return True
|
||||
with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f:
|
||||
with open(tokenizer_config_file, encoding = 'utf-8') as f:
|
||||
tokenizer_config = json.load(f)
|
||||
for typ in self.special_token_types:
|
||||
entry = tokenizer_config.get(f'{typ}_token')
|
||||
@@ -844,11 +945,11 @@ class SpecialVocab:
|
||||
break
|
||||
return True
|
||||
|
||||
def try_load_from_config_json(self, path: Path) -> bool:
|
||||
def _try_load_from_config_json(self, path: Path) -> bool:
|
||||
config_file = path / 'config.json'
|
||||
if not config_file.is_file():
|
||||
return False
|
||||
with open(config_file, 'r', encoding = 'utf-8') as f:
|
||||
with open(config_file, encoding = 'utf-8') as f:
|
||||
config = json.load(f)
|
||||
for typ in self.special_token_types:
|
||||
maybe_token_id = config.get(f'{typ}_token_id')
|
||||
@@ -856,7 +957,7 @@ class SpecialVocab:
|
||||
self.special_token_ids[typ] = maybe_token_id
|
||||
return True
|
||||
|
||||
def add_to_gguf(self, gw: GGUFWriter):
|
||||
def add_to_gguf(self, gw: GGUFWriter) -> None:
|
||||
if len(self.merges) > 0:
|
||||
print(f'gguf: Adding {len(self.merges)} merge(s).')
|
||||
gw.add_token_merges(self.merges)
|
||||
@@ -868,8 +969,8 @@ class SpecialVocab:
|
||||
print(f'gguf: Setting special token type {typ} to {tokid}')
|
||||
handler(tokid)
|
||||
|
||||
def __repr__(self):
|
||||
return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids if self.special_token_ids else "unset"}>'
|
||||
def __repr__(self) -> str:
|
||||
return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids or "unset"}>'
|
||||
|
||||
|
||||
# Example usage:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.3.3"
|
||||
version = "0.4.0"
|
||||
description = "Write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
|
||||
+744
-2
@@ -54,6 +54,10 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __riscv_v_intrinsic
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#undef MIN
|
||||
#undef MAX
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
@@ -65,7 +69,6 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
// 2-6 bit quantization in super-blocks
|
||||
//
|
||||
|
||||
|
||||
//
|
||||
// ===================== Helper functions
|
||||
//
|
||||
@@ -344,7 +347,6 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
|
||||
const float q4scale = 15.f;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
float max_scale = 0; // as we are deducting the min, scales are always positive
|
||||
float max_min = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
@@ -1582,6 +1584,90 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
|
||||
float sumf = 0;
|
||||
uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const uint8_t * q2 = x[i].qs;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * sc = x[i].scales;
|
||||
|
||||
const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d);
|
||||
const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
|
||||
|
||||
size_t vl = 16;
|
||||
|
||||
vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl);
|
||||
vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl);
|
||||
|
||||
vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl);
|
||||
|
||||
vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl);
|
||||
vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl);
|
||||
vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
|
||||
vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl);
|
||||
vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
|
||||
|
||||
sumf += dmin * __riscv_vmv_x_s_i32m1_i32(vsums);
|
||||
|
||||
vl = 32;
|
||||
|
||||
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||
vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl);
|
||||
|
||||
uint8_t is=0;
|
||||
int isum=0;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
// load Q2
|
||||
vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl);
|
||||
|
||||
vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl);
|
||||
vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl);
|
||||
vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl);
|
||||
vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl);
|
||||
|
||||
// duplicate scale elements for product
|
||||
vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl);
|
||||
vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl);
|
||||
vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl);
|
||||
vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl);
|
||||
|
||||
vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl));
|
||||
vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl));
|
||||
vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl));
|
||||
vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl));
|
||||
|
||||
// load Q8
|
||||
vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
|
||||
vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
|
||||
vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8+64, vl);
|
||||
vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl);
|
||||
|
||||
vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
|
||||
vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
|
||||
vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
|
||||
vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
|
||||
|
||||
vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
|
||||
vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
|
||||
|
||||
isum += __riscv_vmv_x_s_i32m1_i32(isum1);
|
||||
|
||||
q2+=32; q8+=128; is=8;
|
||||
|
||||
}
|
||||
|
||||
sumf += dall * isum;
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
@@ -1807,6 +1893,64 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc) + summs;
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
|
||||
uint32_t aux32[2];
|
||||
const uint8_t * scales = (const uint8_t *)aux32;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * (float)x[i].d;
|
||||
const float dmin = -y[i].d * (float)x[i].dmin;
|
||||
|
||||
const uint8_t * restrict q2 = x[i].qs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
|
||||
|
||||
aux32[0] = sc[0] & 0x0f0f0f0f;
|
||||
aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f;
|
||||
|
||||
sumf += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]);
|
||||
|
||||
int isum1 = 0;
|
||||
int isum2 = 0;
|
||||
|
||||
size_t vl = 16;
|
||||
|
||||
vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
|
||||
|
||||
// load Q2
|
||||
vuint8mf2_t q2_x = __riscv_vle8_v_u8mf2(q2, vl);
|
||||
|
||||
vint8mf2_t q2_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q2_x, 0x03, vl));
|
||||
vint8mf2_t q2_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x2, vl), 0x03 , vl));
|
||||
vint8mf2_t q2_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x4, vl), 0x03 , vl));
|
||||
vint8mf2_t q2_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x6, vl), 0x03 , vl));
|
||||
|
||||
// load Q8, and take product with Q2
|
||||
vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q2_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
|
||||
vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q2_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
|
||||
vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q2_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
|
||||
vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q2_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
|
||||
|
||||
vint16m1_t vs_0 = __riscv_vredsum_vs_i16m1_i16m1(p0, vzero, vl);
|
||||
vint16m1_t vs_1 = __riscv_vredsum_vs_i16m1_i16m1(p1, vzero, vl);
|
||||
vint16m1_t vs_2 = __riscv_vredsum_vs_i16m1_i16m1(p2, vzero, vl);
|
||||
vint16m1_t vs_3 = __riscv_vredsum_vs_i16m1_i16m1(p3, vzero, vl);
|
||||
|
||||
isum1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[0];
|
||||
isum2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[1];
|
||||
isum1 += __riscv_vmv_x_s_i16m1_i16(vs_2) * scales[2];
|
||||
isum2 += __riscv_vmv_x_s_i16m1_i16(vs_3) * scales[3];
|
||||
|
||||
sumf += d * (isum1 + isum2);
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
@@ -2220,6 +2364,106 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
|
||||
uint32_t aux[3];
|
||||
uint32_t utmp[4];
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const uint8_t * restrict q3 = x[i].qs;
|
||||
const uint8_t * restrict qh = x[i].hmask;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
memcpy(aux, x[i].scales, 12);
|
||||
utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
|
||||
utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
|
||||
utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
|
||||
utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
|
||||
|
||||
int8_t * scale = (int8_t *)utmp;
|
||||
for (int j = 0; j < 16; ++j) scale[j] -= 32;
|
||||
|
||||
|
||||
size_t vl = 32;
|
||||
uint8_t m = 1;
|
||||
|
||||
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||
vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
|
||||
|
||||
int sum_t = 0;
|
||||
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
|
||||
vl = 32;
|
||||
|
||||
// load Q3
|
||||
vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
|
||||
|
||||
vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
|
||||
vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
|
||||
vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
|
||||
vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
|
||||
|
||||
// compute mask for subtraction
|
||||
vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||
vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
|
||||
vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
|
||||
m <<= 1;
|
||||
|
||||
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||
vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
|
||||
vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
|
||||
m <<= 1;
|
||||
|
||||
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||
vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
|
||||
vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
|
||||
m <<= 1;
|
||||
|
||||
vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||
vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
|
||||
vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
|
||||
m <<= 1;
|
||||
|
||||
// load Q8 and take product with Q3
|
||||
vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
|
||||
vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
|
||||
vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
|
||||
vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
|
||||
|
||||
vl = 16;
|
||||
|
||||
// retreive lane to multiply with scale
|
||||
vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
|
||||
vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
|
||||
vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
|
||||
vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl);
|
||||
vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl);
|
||||
vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
|
||||
vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
|
||||
vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
|
||||
|
||||
vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
|
||||
vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
|
||||
vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
|
||||
vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
|
||||
|
||||
sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
|
||||
|
||||
q3 += 32; q8 += 128; scale += 8;
|
||||
|
||||
}
|
||||
|
||||
const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
|
||||
|
||||
sumf += d*sum_t;
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#else
|
||||
// scalar version
|
||||
// This function is written like this so the compiler can manage to vectorize most of it
|
||||
@@ -2523,6 +2767,79 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
|
||||
uint16_t aux16[2];
|
||||
int8_t * scales = (int8_t *)aux16;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const uint8_t * restrict q3 = x[i].qs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
const uint16_t a = *(const uint16_t *)x[i].scales;
|
||||
aux16[0] = a & 0x0f0f;
|
||||
aux16[1] = (a >> 4) & 0x0f0f;
|
||||
|
||||
for (int j = 0; j < 4; ++j) scales[j] -= 8;
|
||||
|
||||
int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
|
||||
|
||||
const float d = y[i].d * (float)x[i].d;
|
||||
|
||||
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||
|
||||
// load qh
|
||||
vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(x[i].hmask, 8);
|
||||
vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
|
||||
|
||||
size_t vl = 16;
|
||||
|
||||
// extend and combine both qh_x1 and qh_x2
|
||||
vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
|
||||
|
||||
vuint8mf2_t qh_0 = __riscv_vand_vx_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
|
||||
vuint8mf2_t qh_1 = __riscv_vand_vx_u8mf2(qh_x, 0x4, vl);
|
||||
vuint8mf2_t qh_2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
|
||||
vuint8mf2_t qh_3 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), 0x4, vl);
|
||||
|
||||
// load Q3
|
||||
vuint8mf2_t q3_x = __riscv_vle8_v_u8mf2(q3, vl);
|
||||
|
||||
vuint8mf2_t q3h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q3_x, 0x3, vl), qh_0, vl);
|
||||
vuint8mf2_t q3h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 2, vl), 0x3, vl), qh_1, vl);
|
||||
vuint8mf2_t q3h_2 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 4, vl), 0x3, vl), qh_2, vl);
|
||||
vuint8mf2_t q3h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 0x6, vl), qh_3, vl);
|
||||
|
||||
vint8mf2_t q3_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_0);
|
||||
vint8mf2_t q3_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_1);
|
||||
vint8mf2_t q3_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_2);
|
||||
vint8mf2_t q3_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_3);
|
||||
|
||||
// load Q8 and take product with Q3
|
||||
vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q3_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
|
||||
vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q3_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
|
||||
vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q3_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
|
||||
vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q3_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
|
||||
|
||||
vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
|
||||
vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
|
||||
vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
|
||||
vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
|
||||
|
||||
isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scales[0];
|
||||
isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scales[2];
|
||||
isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scales[1];
|
||||
isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scales[3];
|
||||
|
||||
sumf += d * isum;
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
@@ -2823,6 +3140,78 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
size_t vl = 8;
|
||||
|
||||
const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
|
||||
const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin);
|
||||
|
||||
vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
|
||||
vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
|
||||
vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
|
||||
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
|
||||
vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
|
||||
vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
|
||||
|
||||
vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
|
||||
sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
|
||||
|
||||
const uint8_t * restrict q4 = x[i].qs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
vl = 32;
|
||||
|
||||
int32_t sum_1 = 0;
|
||||
int32_t sum_2 = 0;
|
||||
|
||||
vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
|
||||
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
// load Q4
|
||||
vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
|
||||
|
||||
// load Q8 and multiply it with lower Q4 nibble
|
||||
vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
|
||||
vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
|
||||
vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
|
||||
vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
|
||||
|
||||
sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
|
||||
|
||||
// load Q8 and multiply it with upper Q4 nibble
|
||||
vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
|
||||
vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
|
||||
vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
|
||||
vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
|
||||
|
||||
sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
|
||||
|
||||
q4 += 32; q8 += 64;
|
||||
|
||||
}
|
||||
|
||||
sumf += d*(sum_1 + sum_2);
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
|
||||
@@ -3064,6 +3453,50 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc) - summs;
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
|
||||
uint16_t s16[2];
|
||||
const uint8_t * restrict scales = (const uint8_t *)s16;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const uint8_t * restrict q4 = x[i].qs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
const uint16_t * restrict b = (const uint16_t *)x[i].scales;
|
||||
s16[0] = b[0] & 0x0f0f;
|
||||
s16[1] = (b[0] >> 4) & 0x0f0f;
|
||||
|
||||
sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
|
||||
const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]);
|
||||
|
||||
size_t vl = 32;
|
||||
|
||||
vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
|
||||
|
||||
// load Q4
|
||||
vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
|
||||
|
||||
// load Q8 and multiply it with lower Q4 nibble
|
||||
vint8m1_t q4_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
|
||||
vint16m2_t va_0 = __riscv_vwmul_vv_i16m2(q4_a, __riscv_vle8_v_i8m1(q8, vl), vl);
|
||||
vint16m1_t aux1 = __riscv_vredsum_vs_i16m2_i16m1(va_0, vzero, vl);
|
||||
|
||||
sumf += d*scales[0]*__riscv_vmv_x_s_i16m1_i16(aux1);
|
||||
|
||||
// load Q8 and multiply it with upper Q4 nibble
|
||||
vint8m1_t q4_s = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
|
||||
vint16m2_t va_1 = __riscv_vwmul_vv_i16m2(q4_s, __riscv_vle8_v_i8m1(q8+32, vl), vl);
|
||||
vint16m1_t aux2 = __riscv_vredsum_vs_i16m2_i16m1(va_1, vzero, vl);
|
||||
|
||||
sumf += d*scales[1]*__riscv_vmv_x_s_i16m1_i16(aux2);
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
uint8_t aux8[QK_K];
|
||||
@@ -3394,6 +3827,93 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc) + summs;
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
float sumf = 0;
|
||||
float sums = 0.0;
|
||||
|
||||
size_t vl;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
vl = 8;
|
||||
|
||||
const uint8_t * restrict q5 = x[i].qs;
|
||||
const uint8_t * restrict hm = x[i].qh;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
|
||||
const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d;
|
||||
|
||||
vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
|
||||
vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
|
||||
vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
|
||||
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
|
||||
vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
|
||||
vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
|
||||
|
||||
vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
|
||||
sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
|
||||
|
||||
vl = 32;
|
||||
int32_t aux32 = 0;
|
||||
int is = 0;
|
||||
|
||||
uint8_t m = 1;
|
||||
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||
vuint8m1_t vqh = __riscv_vle8_v_u8m1(hm, vl);
|
||||
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
// load Q5 and Q8
|
||||
vuint8m1_t q5_x = __riscv_vle8_v_u8m1(q5, vl);
|
||||
vint8m1_t q8_y1 = __riscv_vle8_v_i8m1(q8, vl);
|
||||
vint8m1_t q8_y2 = __riscv_vle8_v_i8m1(q8+32, vl);
|
||||
|
||||
// compute mask for addition
|
||||
vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
|
||||
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||
vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
|
||||
vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
|
||||
m <<= 1;
|
||||
|
||||
vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
|
||||
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||
vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
|
||||
vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
|
||||
m <<= 1;
|
||||
|
||||
vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
|
||||
vint16m2_t v1 = __riscv_vwmul_vv_i16m2(q5_m2, q8_y2, vl);
|
||||
|
||||
vint32m4_t vs1 = __riscv_vwmul_vx_i32m4(v0, scales[is++], vl);
|
||||
vint32m4_t vs2 = __riscv_vwmul_vx_i32m4(v1, scales[is++], vl);
|
||||
|
||||
vint32m1_t vacc1 = __riscv_vredsum_vs_i32m4_i32m1(vs1, vzero, vl);
|
||||
vint32m1_t vacc2 = __riscv_vredsum_vs_i32m4_i32m1(vs2, vzero, vl);
|
||||
|
||||
aux32 += __riscv_vmv_x_s_i32m1_i32(vacc1) + __riscv_vmv_x_s_i32m1_i32(vacc2);
|
||||
q5 += 32; q8 += 64;
|
||||
|
||||
}
|
||||
|
||||
vfloat32m1_t vaux = __riscv_vfmul_vf_f32m1(__riscv_vfmv_v_f_f32m1(aux32, 1), d, 1);
|
||||
sums += __riscv_vfmv_f_s_f32m1_f32(vaux);
|
||||
|
||||
}
|
||||
|
||||
*s = sumf+sums;
|
||||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
@@ -3639,6 +4159,76 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * (float)x[i].d;
|
||||
const int8_t * sc = x[i].scales;
|
||||
|
||||
const uint8_t * restrict q5 = x[i].qs;
|
||||
const uint8_t * restrict qh = x[i].qh;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||
|
||||
// load qh
|
||||
vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(qh, 8);
|
||||
vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
|
||||
|
||||
size_t vl = 16;
|
||||
|
||||
// combine both qh_1 and qh_2
|
||||
vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
|
||||
|
||||
vuint8mf2_t qh_h0 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
|
||||
vuint8mf2_t qh_h1 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), vl), 16, vl);
|
||||
vuint8mf2_t qh_h2 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(qh_x, vl), 16, vl);
|
||||
vuint8mf2_t qh_h3 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
|
||||
|
||||
vint8mf2_t qh_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h0);
|
||||
vint8mf2_t qh_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h1);
|
||||
vint8mf2_t qh_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h2);
|
||||
vint8mf2_t qh_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h3);
|
||||
|
||||
// load q5
|
||||
vuint8mf2_t q5_x1 = __riscv_vle8_v_u8mf2(q5, vl);
|
||||
vuint8mf2_t q5_x2 = __riscv_vle8_v_u8mf2(q5+16, vl);
|
||||
|
||||
vint8mf2_t q5s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x1, 0xF, vl));
|
||||
vint8mf2_t q5s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x2, 0xF, vl));
|
||||
vint8mf2_t q5s_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x1, 0x4, vl));
|
||||
vint8mf2_t q5s_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x2, 0x4, vl));
|
||||
|
||||
vint8mf2_t q5_0 = __riscv_vsub_vv_i8mf2(q5s_0, qh_0, vl);
|
||||
vint8mf2_t q5_1 = __riscv_vsub_vv_i8mf2(q5s_1, qh_1, vl);
|
||||
vint8mf2_t q5_2 = __riscv_vsub_vv_i8mf2(q5s_2, qh_2, vl);
|
||||
vint8mf2_t q5_3 = __riscv_vsub_vv_i8mf2(q5s_3, qh_3, vl);
|
||||
|
||||
// load Q8 and multiply it with Q5
|
||||
vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q5_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
|
||||
vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q5_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
|
||||
vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q5_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
|
||||
vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q5_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
|
||||
|
||||
vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
|
||||
vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
|
||||
vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
|
||||
vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
|
||||
|
||||
int32_t sumi1 = sc[0] * __riscv_vmv_x_s_i32m1_i32(vs_0);
|
||||
int32_t sumi2 = sc[1] * __riscv_vmv_x_s_i32m1_i32(vs_1);
|
||||
int32_t sumi3 = sc[2] * __riscv_vmv_x_s_i32m1_i32(vs_2);
|
||||
int32_t sumi4 = sc[3] * __riscv_vmv_x_s_i32m1_i32(vs_3);
|
||||
|
||||
sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
@@ -4023,6 +4613,91 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
|
||||
|
||||
const uint8_t * restrict q6 = x[i].ql;
|
||||
const uint8_t * restrict qh = x[i].qh;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
const int8_t * restrict scale = x[i].scales;
|
||||
|
||||
size_t vl;
|
||||
|
||||
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||
|
||||
int sum_t = 0;
|
||||
int is = 0;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
|
||||
vl = 32;
|
||||
|
||||
// load qh
|
||||
vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
|
||||
|
||||
// load Q6
|
||||
vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
|
||||
vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
|
||||
|
||||
vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
|
||||
vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
|
||||
vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
|
||||
vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
|
||||
|
||||
vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
|
||||
vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
|
||||
vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
|
||||
vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
|
||||
|
||||
vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
|
||||
vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
|
||||
vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
|
||||
vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
|
||||
|
||||
vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
|
||||
vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
|
||||
vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
|
||||
vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
|
||||
|
||||
// load Q8 and take product
|
||||
vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
|
||||
vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
|
||||
vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
|
||||
vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
|
||||
|
||||
vl = 16;
|
||||
|
||||
vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
|
||||
vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
|
||||
vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
|
||||
vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl);
|
||||
vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl);
|
||||
vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
|
||||
vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
|
||||
vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
|
||||
|
||||
vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
|
||||
vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
|
||||
vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
|
||||
vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
|
||||
|
||||
sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
|
||||
|
||||
q6 += 64; qh += 32; q8 += 128; is=8;
|
||||
|
||||
}
|
||||
|
||||
sumf += d * sum_t;
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
@@ -4276,6 +4951,73 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d_all = (float)x[i].d;
|
||||
|
||||
const uint8_t * restrict q6 = x[i].ql;
|
||||
const uint8_t * restrict qh = x[i].qh;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
const int8_t * restrict scale = x[i].scales;
|
||||
|
||||
int32_t isum = 0;
|
||||
|
||||
size_t vl = 16;
|
||||
|
||||
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||
|
||||
// load Q6
|
||||
vuint8mf2_t q6_0 = __riscv_vle8_v_u8mf2(q6, vl);
|
||||
vuint8mf2_t q6_1 = __riscv_vle8_v_u8mf2(q6+16, vl);
|
||||
|
||||
// load qh
|
||||
vuint8mf2_t qh_x = __riscv_vle8_v_u8mf2(qh, vl);
|
||||
|
||||
vuint8mf2_t qh0 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
|
||||
qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
|
||||
vuint8mf2_t qh1 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
|
||||
qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
|
||||
vuint8mf2_t qh2 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
|
||||
qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
|
||||
vuint8mf2_t qh3 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
|
||||
|
||||
vuint8mf2_t q6h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_0, 0xF, vl), qh0, vl);
|
||||
vuint8mf2_t q6h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_1, 0xF, vl), qh1, vl);
|
||||
vuint8mf2_t q6h_2 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_0, 0x4, vl), qh2, vl);
|
||||
vuint8mf2_t q6h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_1, 0x4, vl), qh3, vl);
|
||||
|
||||
vint8mf2_t q6v_0 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_0), 32, vl);
|
||||
vint8mf2_t q6v_1 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_1), 32, vl);
|
||||
vint8mf2_t q6v_2 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_2), 32, vl);
|
||||
vint8mf2_t q6v_3 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_3), 32, vl);
|
||||
|
||||
// load Q8 and take product
|
||||
vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q6v_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
|
||||
vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q6v_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
|
||||
vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q6v_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
|
||||
vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q6v_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
|
||||
|
||||
vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
|
||||
vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
|
||||
vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
|
||||
vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
|
||||
|
||||
isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scale[0];
|
||||
isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scale[1];
|
||||
isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scale[2];
|
||||
isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scale[3];
|
||||
|
||||
sumf += isum * d_all * y[i].d;
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
|
||||
@@ -42,7 +42,7 @@
|
||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 1
|
||||
#define LLAMA_SESSION_VERSION 2
|
||||
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
@@ -282,6 +282,9 @@ extern "C" {
|
||||
LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
|
||||
LLAMA_API int llama_n_embd (const struct llama_model * model);
|
||||
|
||||
// Get the model's RoPE frequency scaling factor
|
||||
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
|
||||
|
||||
// Get a string describing the model type
|
||||
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
|
||||
|
||||
@@ -330,12 +333,16 @@ extern "C" {
|
||||
"avoid using this, it will be removed in the future, instead - count the tokens in user code");
|
||||
|
||||
// Remove all tokens data of cells in [c0, c1)
|
||||
// c0 < 0 : [0, c1]
|
||||
// c1 < 0 : [c0, inf)
|
||||
LLAMA_API void llama_kv_cache_tokens_rm(
|
||||
struct llama_context * ctx,
|
||||
int32_t c0,
|
||||
int32_t c1);
|
||||
|
||||
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
||||
// p0 < 0 : [0, p1]
|
||||
// p1 < 0 : [p0, inf)
|
||||
LLAMA_API void llama_kv_cache_seq_rm(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
@@ -344,6 +351,8 @@ extern "C" {
|
||||
|
||||
// Copy all tokens that belong to the specified sequence to another sequence
|
||||
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
|
||||
// p0 < 0 : [0, p1]
|
||||
// p1 < 0 : [p0, inf)
|
||||
LLAMA_API void llama_kv_cache_seq_cp(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id_src,
|
||||
@@ -358,6 +367,8 @@ extern "C" {
|
||||
|
||||
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly
|
||||
// p0 < 0 : [0, p1]
|
||||
// p1 < 0 : [p0, inf)
|
||||
LLAMA_API void llama_kv_cache_seq_shift(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -7,9 +7,6 @@ endfunction()
|
||||
|
||||
function(llama_test_executable name source)
|
||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||
# add_executable(${TEST_TARGET} ${source})
|
||||
# install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||
# target_link_libraries(${TEST_TARGET} PRIVATE llama)
|
||||
add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
||||
endfunction()
|
||||
|
||||
@@ -28,10 +25,12 @@ llama_build_and_test_executable(test-sampling.cpp)
|
||||
llama_build_executable(test-tokenizer-0-llama.cpp)
|
||||
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
llama_build_executable(test-tokenizer-0-falcon.cpp)
|
||||
#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_build_executable(test-tokenizer-1-llama.cpp)
|
||||
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
llama_build_executable(test-tokenizer-1-bpe.cpp)
|
||||
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||
llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
||||
|
||||
@@ -208,26 +208,6 @@ static struct ggml_tensor * get_random_tensor_i32(
|
||||
return result;
|
||||
}
|
||||
|
||||
static void print_elements(const char* label, const struct ggml_tensor * t) {
|
||||
if (!t) {
|
||||
printf("%s: %s = null\n", __func__, label);
|
||||
return;
|
||||
}
|
||||
const int nelements = ggml_nelements(t);
|
||||
printf("%s: %s = [", __func__, label);
|
||||
for (int k = 0; k < nelements; ++k) {
|
||||
if (k > 0) { printf(", "); }
|
||||
printf("%.5f", ggml_get_f32_1d(t, k));
|
||||
}
|
||||
printf("] shape: [");
|
||||
for (int k = 0; k < t->n_dims; ++k) {
|
||||
if (k > 0) { printf(", "); }
|
||||
printf("%d", (int)t->ne[k]);
|
||||
}
|
||||
printf("]\n");
|
||||
|
||||
}
|
||||
|
||||
static bool check_gradient(
|
||||
const char * op_name,
|
||||
struct ggml_context * ctx0,
|
||||
|
||||
@@ -40,27 +40,6 @@ static float frand(void) {
|
||||
return (float)rand()/(float)RAND_MAX;
|
||||
}
|
||||
|
||||
static int irand(int n) {
|
||||
return rand()%n;
|
||||
}
|
||||
|
||||
static void get_random_dims(int64_t * dims, int ndims) {
|
||||
dims[0] = dims[1] = dims[2] = dims[3] = 1;
|
||||
|
||||
for (int i = 0; i < ndims; i++) {
|
||||
dims[i] = 1 + irand(4);
|
||||
}
|
||||
}
|
||||
|
||||
static void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
|
||||
dims[0] = dims[1] = dims[2] = dims[3] = 1;
|
||||
|
||||
for (int i = 0; i < ndims; i++) {
|
||||
dims[i] = min + irand(max-min);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static struct ggml_tensor * get_random_tensor(
|
||||
struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
|
||||
) {
|
||||
@@ -106,14 +85,6 @@ static struct ggml_tensor * get_random_tensor(
|
||||
return result;
|
||||
}
|
||||
|
||||
static float get_element(const struct ggml_tensor * t, int idx) {
|
||||
return ((float *)t->data)[idx];
|
||||
}
|
||||
|
||||
static void set_element(struct ggml_tensor * t, int idx, float value) {
|
||||
((float *)t->data)[idx] = value;
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
struct ggml_init_params params = {
|
||||
/* .mem_size = */ 1024*1024*1024,
|
||||
|
||||
@@ -76,22 +76,21 @@ static void * align_with_offset(void * ptr, int offset) {
|
||||
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
|
||||
}
|
||||
|
||||
static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
|
||||
static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<float(void)> & func) {
|
||||
int64_t min_time_us = INT64_MAX;
|
||||
int64_t total_time_us = 0;
|
||||
int64_t min_time_cycles = INT64_MAX;
|
||||
int64_t total_time_cycles = 0;
|
||||
|
||||
for (int i = 0; i < WARMUP; i++) {
|
||||
function();
|
||||
func();
|
||||
}
|
||||
|
||||
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
const int64_t start_time = ggml_time_us();
|
||||
const int64_t start_cycles = cpu_cycles();
|
||||
|
||||
function();
|
||||
func();
|
||||
|
||||
const int64_t end_cycles = cpu_cycles();
|
||||
const int64_t end_time = ggml_time_us();
|
||||
@@ -245,15 +244,15 @@ int main(int argc, char * argv[]) {
|
||||
|
||||
std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2);
|
||||
std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
|
||||
std::vector<uint8_t> test_q1_v(largest*4 + MAX_ALIGNMENT*2);
|
||||
std::vector<uint8_t> test_q2_v(largest*4 + MAX_ALIGNMENT*2);
|
||||
std::vector<uint8_t> test_out_v(largest*4 + MAX_ALIGNMENT*2);
|
||||
std::vector<uint8_t> test_q1_v (largest*4 + MAX_ALIGNMENT*2);
|
||||
std::vector<uint8_t> test_q2_v (largest*4 + MAX_ALIGNMENT*2);
|
||||
std::vector<uint8_t> test_out_v (largest*4 + MAX_ALIGNMENT*2);
|
||||
|
||||
float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset);
|
||||
float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset);
|
||||
float * test_q1 = (float *) align_with_offset(test_q1_v.data(), params.alignment_offset);
|
||||
float * test_q2 = (float *) align_with_offset(test_q2_v.data(), params.alignment_offset);
|
||||
float * test_out = (float *) align_with_offset(test_out_v.data(), params.alignment_offset);
|
||||
float * test_q1 = (float *) align_with_offset(test_q1_v.data(), params.alignment_offset);
|
||||
float * test_q2 = (float *) align_with_offset(test_q2_v.data(), params.alignment_offset);
|
||||
float * test_out = (float *) align_with_offset(test_out_v.data(), params.alignment_offset);
|
||||
|
||||
generate_data(0, largest, test_data1);
|
||||
generate_data(1, largest, test_data2);
|
||||
@@ -283,7 +282,7 @@ int main(int argc, char * argv[]) {
|
||||
printf(" quantize_row_q_reference\n");
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void ) {
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
qfns.from_float_reference(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
@@ -297,7 +296,7 @@ int main(int argc, char * argv[]) {
|
||||
printf(" quantize_row_q\n");
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void ) {
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
qfns.from_float(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
@@ -312,7 +311,7 @@ int main(int argc, char * argv[]) {
|
||||
qfns.from_float(test_data1, test_q1, largest);
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void ) {
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
qfns.to_float(test_q1, test_out, size);
|
||||
return test_out[0];
|
||||
};
|
||||
@@ -326,7 +325,7 @@ int main(int argc, char * argv[]) {
|
||||
printf(" quantize_row_q_dot\n");
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void ) {
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
|
||||
vdot.from_float(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
@@ -343,7 +342,7 @@ int main(int argc, char * argv[]) {
|
||||
qfns.from_float(test_data2, test_q2, largest);
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void ) {
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
float result;
|
||||
qfns.vec_dot(size, &result, test_q1, test_q2);
|
||||
return result;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "console.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
@@ -85,12 +86,18 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
|
||||
if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
|
||||
fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
|
||||
fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__);
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
return 2;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
// We need this for unicode console support
|
||||
console::init(false, false);
|
||||
atexit([]() { console::cleanup(); });
|
||||
#endif
|
||||
|
||||
bool success = true;
|
||||
|
||||
for (const auto & test_kv : k_tests()) {
|
||||
|
||||
@@ -0,0 +1,113 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "unicode.h"
|
||||
#include "console.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <codecvt>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <locale>
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const std::string fname = argv[1];
|
||||
|
||||
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
llama_backend_init(false);
|
||||
|
||||
// load the vocab
|
||||
{
|
||||
auto mparams = llama_model_default_params();
|
||||
|
||||
mparams.vocab_only = true;
|
||||
|
||||
model = llama_load_model_from_file(fname.c_str(), mparams);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto cparams = llama_context_default_params();
|
||||
|
||||
ctx = llama_new_context_with_model(model, cparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_BPE);
|
||||
|
||||
#ifdef _WIN32
|
||||
// We need this for unicode console support
|
||||
console::init(false, false);
|
||||
atexit([]() { console::cleanup(); });
|
||||
#endif
|
||||
|
||||
const int n_vocab = llama_n_vocab(model);
|
||||
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
|
||||
try {
|
||||
auto cps = codepoints_from_utf8(str);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||
if (check != str) {
|
||||
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
|
||||
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
catch (const std::invalid_argument &) {
|
||||
fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
|
||||
// NOTE: these exceptions seem to be necessary, because the GPT2 tokenizer doesn't want to interfere with some ASCII control characters
|
||||
if ((cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && (cp < 0x13 || cp > 0x17) && cp != 0x19 && (cp < 0x1c || cp > 0x1e) && (cp < 0xd800 || cp > 0xdfff)) {
|
||||
std::string str = " " + codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||
if (str != check) {
|
||||
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO: why doesn't this work for the full range of Unicodes?
|
||||
// for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
||||
for (uint32_t cp = 0x10000; cp < 0x00080000; ++cp) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||
if (str != check) {
|
||||
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "unicode.h"
|
||||
#include "console.h"
|
||||
|
||||
#include <cassert>
|
||||
@@ -11,30 +12,6 @@
|
||||
#include <vector>
|
||||
#include <locale>
|
||||
|
||||
typedef int codepoint;
|
||||
|
||||
static std::string codepoint_to_utf8(codepoint cp) {
|
||||
std::string result;
|
||||
if (0x00 <= cp && cp <= 0x7f) {
|
||||
result.push_back(cp);
|
||||
} else if (0x80 <= cp && cp <= 0x7ff) {
|
||||
result.push_back(0xc0 | ((cp >> 6) & 0x1f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
} else if (0x800 <= cp && cp <= 0xffff) {
|
||||
result.push_back(0xe0 | ((cp >> 12) & 0x0f));
|
||||
result.push_back(0x80 | ((cp >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
} else if (0x10000 <= cp && cp <= 0x10ffff) {
|
||||
result.push_back(0xf0 | ((cp >> 18) & 0x07));
|
||||
result.push_back(0x80 | ((cp >> 12) & 0x3f));
|
||||
result.push_back(0x80 | ((cp >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
} else {
|
||||
throw std::invalid_argument("invalid codepoint");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
||||
@@ -95,7 +72,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
}
|
||||
|
||||
for (codepoint cp = 0x0000; cp < 0xffff; ++cp) {
|
||||
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
|
||||
if (cp < 0xd800 || cp > 0xdfff) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
@@ -107,7 +84,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
}
|
||||
}
|
||||
for (codepoint cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
||||
for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
||||
|
||||
@@ -0,0 +1,462 @@
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
|
||||
{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
|
||||
{0xCE6, 0xCEF}, {0xD66, 0xD6F}, {0xDE6, 0xDEF}, {0xE50, 0xE59}, {0xED0, 0xED9}, {0xF20, 0xF29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x1369, 0x1371}, {0x17E0, 0x17E9}, {0x1810, 0x1819}, {0x1946, 0x194F},
|
||||
{0x19D0, 0x19DA}, {0x1A80, 0x1A89}, {0x1A90, 0x1A99}, {0x1B50, 0x1B59}, {0x1BB0, 0x1BB9}, {0x1C40, 0x1C49}, {0x1C50, 0x1C59}, {0x2070, 0x2070}, {0x2074, 0x2079}, {0x2080, 0x2089}, {0x2460, 0x2468},
|
||||
{0x2474, 0x247C}, {0x2488, 0x2490}, {0x24EA, 0x24EA}, {0x24F5, 0x24FD}, {0x24FF, 0x24FF}, {0x2776, 0x277E}, {0x2780, 0x2788}, {0x278A, 0x2792}, {0xA620, 0xA629}, {0xA8D0, 0xA8D9}, {0xA900, 0xA909},
|
||||
{0xA9D0, 0xA9D9}, {0xA9F0, 0xA9F9}, {0xAA50, 0xAA59}, {0xABF0, 0xABF9}, {0xFF10, 0xFF19}, {0x104A0, 0x104A9}, {0x10A40, 0x10A43}, {0x10D30, 0x10D39}, {0x10E60, 0x10E68}, {0x11052, 0x1105A},
|
||||
{0x11066, 0x1106F}, {0x110F0, 0x110F9}, {0x11136, 0x1113F}, {0x111D0, 0x111D9}, {0x112F0, 0x112F9}, {0x11450, 0x11459}, {0x114D0, 0x114D9}, {0x11650, 0x11659}, {0x116C0, 0x116C9}, {0x11730, 0x11739},
|
||||
{0x118E0, 0x118E9}, {0x11950, 0x11959}, {0x11C50, 0x11C59}, {0x11D50, 0x11D59}, {0x11DA0, 0x11DA9}, {0x16A60, 0x16A69}, {0x16B50, 0x16B59}, {0x1D7CE, 0x1D7FF}, {0x1E140, 0x1E149}, {0x1E2F0, 0x1E2F9},
|
||||
{0x1E950, 0x1E959}, {0x1F100, 0x1F10A}, {0x1FBF0, 0x1FBF9},
|
||||
};
|
||||
|
||||
static const std::vector<std::pair<uint32_t, uint32_t>> letter_ranges = {
|
||||
{0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374},
|
||||
{0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F}, {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1}, {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556}, {0x559, 0x559},
|
||||
{0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710},
|
||||
{0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x860, 0x86A},
|
||||
{0x8A0, 0x8B4}, {0x8B6, 0x8C7}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8}, {0x9AA, 0x9B0}, {0x9B2, 0x9B2},
|
||||
{0x9B6, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1}, {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28}, {0xA2A, 0xA30}, {0xA32, 0xA33},
|
||||
{0xA35, 0xA36}, {0xA38, 0xA39}, {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D}, {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3}, {0xAB5, 0xAB9}, {0xABD, 0xABD},
|
||||
{0xAD0, 0xAD0}, {0xAE0, 0xAE1}, {0xAF9, 0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28}, {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB5D}, {0xB5F, 0xB61},
|
||||
{0xB71, 0xB71}, {0xB83, 0xB83}, {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A}, {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA}, {0xBAE, 0xBB9}, {0xBD0, 0xBD0},
|
||||
{0xC05, 0xC0C}, {0xC0E, 0xC10}, {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C}, {0xC8E, 0xC90}, {0xC92, 0xCA8}, {0xCAA, 0xCB3},
|
||||
{0xCB5, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2}, {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D}, {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61},
|
||||
{0xD7A, 0xD7F}, {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD}, {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A},
|
||||
{0xE8C, 0xEA3}, {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD}, {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00}, {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C},
|
||||
{0x1000, 0x102A}, {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10C5}, {0x10C7, 0x10C7},
|
||||
{0x10CD, 0x10CD}, {0x10D0, 0x10FA}, {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0}, {0x12B2, 0x12B5},
|
||||
{0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C},
|
||||
{0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8}, {0x1700, 0x170C}, {0x170E, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1780, 0x17B3},
|
||||
{0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB},
|
||||
{0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F},
|
||||
{0x1C5A, 0x1C7D}, {0x1C80, 0x1C88}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA}, {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D},
|
||||
{0x1F20, 0x1F45}, {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4},
|
||||
{0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107},
|
||||
{0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E},
|
||||
{0x2183, 0x2184}, {0x2C00, 0x2C2E}, {0x2C30, 0x2C5E}, {0x2C60, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3}, {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67}, {0x2D6F, 0x2D6F},
|
||||
{0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006},
|
||||
{0x3031, 0x3035}, {0x303B, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF},
|
||||
{0x4E00, 0x9FFC}, {0xA000, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B}, {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F}, {0xA722, 0xA788},
|
||||
{0xA78B, 0xA7BF}, {0xA7C2, 0xA7CA}, {0xA7F5, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE},
|
||||
{0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B},
|
||||
{0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA}, {0xAAF2, 0xAAF4},
|
||||
{0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69}, {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB},
|
||||
{0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C}, {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44},
|
||||
{0xFB46, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB}, {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7},
|
||||
{0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA},
|
||||
{0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3}, {0x103C8, 0x103CF}, {0x10400, 0x1049D},
|
||||
{0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835},
|
||||
{0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915}, {0x10920, 0x10939}, {0x10980, 0x109B7},
|
||||
{0x109BE, 0x109BF}, {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4}, {0x10B00, 0x10B35},
|
||||
{0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91}, {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F1C},
|
||||
{0x10F27, 0x10F27}, {0x10F30, 0x10F45}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037}, {0x11083, 0x110AF}, {0x110D0, 0x110E8}, {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147},
|
||||
{0x11150, 0x11172}, {0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA}, {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x11280, 0x11286}, {0x11288, 0x11288},
|
||||
{0x1128A, 0x1128D}, {0x1128F, 0x1129D}, {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339},
|
||||
{0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144A}, {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7}, {0x11580, 0x115AE},
|
||||
{0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644}, {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909},
|
||||
{0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F}, {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1}, {0x119E3, 0x119E3}, {0x11A00, 0x11A00},
|
||||
{0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A}, {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AC0, 0x11AF8}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40}, {0x11C72, 0x11C8F},
|
||||
{0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30}, {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89}, {0x11D98, 0x11D98}, {0x11EE0, 0x11EF2}, {0x11FB0, 0x11FB0},
|
||||
{0x12000, 0x12399}, {0x12480, 0x12543}, {0x13000, 0x1342E}, {0x14400, 0x14646}, {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F}, {0x16B40, 0x16B43}, {0x16B63, 0x16B77},
|
||||
{0x16B7D, 0x16B8F}, {0x16E40, 0x16E7F}, {0x16F00, 0x16F4A}, {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3}, {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08},
|
||||
{0x1B000, 0x1B11E}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C},
|
||||
{0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514},
|
||||
{0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5}, {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA},
|
||||
{0x1D6FC, 0x1D714}, {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788}, {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D},
|
||||
{0x1E14E, 0x1E14E}, {0x1E2C0, 0x1E2EB}, {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27},
|
||||
{0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52},
|
||||
{0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72},
|
||||
{0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB}, {0x20000, 0x2A6DD}, {0x2A700, 0x2B734},
|
||||
{0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A},
|
||||
};
|
||||
|
||||
static const std::vector<std::pair<uint32_t, uint32_t>> whitespace_ranges = {
|
||||
{0x9, 0xD}, {0x1C, 0x20}, {0x85, 0x85}, {0xA0, 0xA0}, {0x1680, 0x1680}, {0x2000, 0x200A}, {0x2028, 0x2029}, {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000},
|
||||
};
|
||||
|
||||
static const std::vector<std::pair<uint32_t, uint32_t>> accent_mark_ranges = {
|
||||
{0x300, 0x36F}, {0x483, 0x489}, {0x591, 0x5BD}, {0x5BF, 0x5BF}, {0x5C1, 0x5C2}, {0x5C4, 0x5C5}, {0x5C7, 0x5C7}, {0x610, 0x61A}, {0x64B, 0x65F}, {0x670, 0x670}, {0x6D6, 0x6DC}, {0x6DF, 0x6E4},
|
||||
{0x6E7, 0x6E8}, {0x6EA, 0x6ED}, {0x711, 0x711}, {0x730, 0x74A}, {0x7A6, 0x7B0}, {0x7EB, 0x7F3}, {0x7FD, 0x7FD}, {0x816, 0x819}, {0x81B, 0x823}, {0x825, 0x827}, {0x829, 0x82D}, {0x859, 0x85B},
|
||||
{0x8D3, 0x8E1}, {0x8E3, 0x903}, {0x93A, 0x93C}, {0x93E, 0x94F}, {0x951, 0x957}, {0x962, 0x963}, {0x981, 0x983}, {0x9BC, 0x9BC}, {0x9BE, 0x9C4}, {0x9C7, 0x9C8}, {0x9CB, 0x9CD}, {0x9D7, 0x9D7},
|
||||
{0x9E2, 0x9E3}, {0x9FE, 0x9FE}, {0xA01, 0xA03}, {0xA3C, 0xA3C}, {0xA3E, 0xA42}, {0xA47, 0xA48}, {0xA4B, 0xA4D}, {0xA51, 0xA51}, {0xA70, 0xA71}, {0xA75, 0xA75}, {0xA81, 0xA83}, {0xABC, 0xABC},
|
||||
{0xABE, 0xAC5}, {0xAC7, 0xAC9}, {0xACB, 0xACD}, {0xAE2, 0xAE3}, {0xAFA, 0xAFF}, {0xB01, 0xB03}, {0xB3C, 0xB3C}, {0xB3E, 0xB44}, {0xB47, 0xB48}, {0xB4B, 0xB4D}, {0xB55, 0xB57}, {0xB62, 0xB63},
|
||||
{0xB82, 0xB82}, {0xBBE, 0xBC2}, {0xBC6, 0xBC8}, {0xBCA, 0xBCD}, {0xBD7, 0xBD7}, {0xC00, 0xC04}, {0xC3E, 0xC44}, {0xC46, 0xC48}, {0xC4A, 0xC4D}, {0xC55, 0xC56}, {0xC62, 0xC63}, {0xC81, 0xC83},
|
||||
{0xCBC, 0xCBC}, {0xCBE, 0xCC4}, {0xCC6, 0xCC8}, {0xCCA, 0xCCD}, {0xCD5, 0xCD6}, {0xCE2, 0xCE3}, {0xD00, 0xD03}, {0xD3B, 0xD3C}, {0xD3E, 0xD44}, {0xD46, 0xD48}, {0xD4A, 0xD4D}, {0xD57, 0xD57},
|
||||
{0xD62, 0xD63}, {0xD81, 0xD83}, {0xDCA, 0xDCA}, {0xDCF, 0xDD4}, {0xDD6, 0xDD6}, {0xDD8, 0xDDF}, {0xDF2, 0xDF3}, {0xE31, 0xE31}, {0xE34, 0xE3A}, {0xE47, 0xE4E}, {0xEB1, 0xEB1}, {0xEB4, 0xEBC},
|
||||
{0xEC8, 0xECD}, {0xF18, 0xF19}, {0xF35, 0xF35}, {0xF37, 0xF37}, {0xF39, 0xF39}, {0xF3E, 0xF3F}, {0xF71, 0xF84}, {0xF86, 0xF87}, {0xF8D, 0xF97}, {0xF99, 0xFBC}, {0xFC6, 0xFC6}, {0x102B, 0x103E},
|
||||
{0x1056, 0x1059}, {0x105E, 0x1060}, {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, {0x1082, 0x108D}, {0x108F, 0x108F}, {0x109A, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714}, {0x1732, 0x1734},
|
||||
{0x1752, 0x1753}, {0x1772, 0x1773}, {0x17B4, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D}, {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E},
|
||||
{0x1A60, 0x1A7C}, {0x1A7F, 0x1A7F}, {0x1AB0, 0x1AC0}, {0x1B00, 0x1B04}, {0x1B34, 0x1B44}, {0x1B6B, 0x1B73}, {0x1B80, 0x1B82}, {0x1BA1, 0x1BAD}, {0x1BE6, 0x1BF3}, {0x1C24, 0x1C37}, {0x1CD0, 0x1CD2},
|
||||
{0x1CD4, 0x1CE8}, {0x1CED, 0x1CED}, {0x1CF4, 0x1CF4}, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DF9}, {0x1DFB, 0x1DFF}, {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F}, {0x2DE0, 0x2DFF}, {0x302A, 0x302F},
|
||||
{0x3099, 0x309A}, {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F}, {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806}, {0xA80B, 0xA80B}, {0xA823, 0xA827}, {0xA82C, 0xA82C}, {0xA880, 0xA881},
|
||||
{0xA8B4, 0xA8C5}, {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D}, {0xA947, 0xA953}, {0xA980, 0xA983}, {0xA9B3, 0xA9C0}, {0xA9E5, 0xA9E5}, {0xAA29, 0xAA36}, {0xAA43, 0xAA43}, {0xAA4C, 0xAA4D},
|
||||
{0xAA7B, 0xAA7D}, {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6}, {0xABE3, 0xABEA}, {0xABEC, 0xABED}, {0xFB1E, 0xFB1E},
|
||||
{0xFE00, 0xFE0F}, {0xFE20, 0xFE2F}, {0x101FD, 0x101FD}, {0x102E0, 0x102E0}, {0x10376, 0x1037A}, {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F},
|
||||
{0x10AE5, 0x10AE6}, {0x10D24, 0x10D27}, {0x10EAB, 0x10EAC}, {0x10F46, 0x10F50}, {0x11000, 0x11002}, {0x11038, 0x11046}, {0x1107F, 0x11082}, {0x110B0, 0x110BA}, {0x11100, 0x11102}, {0x11127, 0x11134},
|
||||
{0x11145, 0x11146}, {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111B3, 0x111C0}, {0x111C9, 0x111CC}, {0x111CE, 0x111CF}, {0x1122C, 0x11237}, {0x1123E, 0x1123E}, {0x112DF, 0x112EA}, {0x11300, 0x11303},
|
||||
{0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348}, {0x1134B, 0x1134D}, {0x11357, 0x11357}, {0x11362, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, {0x11435, 0x11446}, {0x1145E, 0x1145E},
|
||||
{0x114B0, 0x114C3}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, {0x115DC, 0x115DD}, {0x11630, 0x11640}, {0x116AB, 0x116B7}, {0x1171D, 0x1172B}, {0x1182C, 0x1183A}, {0x11930, 0x11935}, {0x11937, 0x11938},
|
||||
{0x1193B, 0x1193E}, {0x11940, 0x11940}, {0x11942, 0x11943}, {0x119D1, 0x119D7}, {0x119DA, 0x119E0}, {0x119E4, 0x119E4}, {0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, {0x11A47, 0x11A47},
|
||||
{0x11A51, 0x11A5B}, {0x11A8A, 0x11A99}, {0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45},
|
||||
{0x11D47, 0x11D47}, {0x11D8A, 0x11D8E}, {0x11D90, 0x11D91}, {0x11D93, 0x11D97}, {0x11EF3, 0x11EF6}, {0x16AF0, 0x16AF4}, {0x16B30, 0x16B36}, {0x16F4F, 0x16F4F}, {0x16F51, 0x16F87}, {0x16F8F, 0x16F92},
|
||||
{0x16FE4, 0x16FE4}, {0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1D165, 0x1D169}, {0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244}, {0x1DA00, 0x1DA36},
|
||||
{0x1DA3B, 0x1DA6C}, {0x1DA75, 0x1DA75}, {0x1DA84, 0x1DA84}, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF}, {0x1E000, 0x1E006}, {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, {0x1E026, 0x1E02A},
|
||||
{0x1E130, 0x1E136}, {0x1E2EC, 0x1E2EF}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, {0xE0100, 0xE01EF},
|
||||
};
|
||||
|
||||
static const std::vector<std::pair<uint32_t, uint32_t>> punctuation_ranges = {
|
||||
{0x21, 0x23}, {0x25, 0x2A}, {0x2C, 0x2F}, {0x3A, 0x3B}, {0x3F, 0x40}, {0x5B, 0x5D}, {0x5F, 0x5F}, {0x7B, 0x7B}, {0x7D, 0x7D}, {0xA1, 0xA1}, {0xA7, 0xA7}, {0xAB, 0xAB}, {0xB6, 0xB7}, {0xBB, 0xBB},
|
||||
{0xBF, 0xBF}, {0x37E, 0x37E}, {0x387, 0x387}, {0x55A, 0x55F}, {0x589, 0x58A}, {0x5BE, 0x5BE}, {0x5C0, 0x5C0}, {0x5C3, 0x5C3}, {0x5C6, 0x5C6}, {0x5F3, 0x5F4}, {0x609, 0x60A}, {0x60C, 0x60D},
|
||||
{0x61B, 0x61B}, {0x61E, 0x61F}, {0x66A, 0x66D}, {0x6D4, 0x6D4}, {0x700, 0x70D}, {0x7F7, 0x7F9}, {0x830, 0x83E}, {0x85E, 0x85E}, {0x964, 0x965}, {0x970, 0x970}, {0x9FD, 0x9FD}, {0xA76, 0xA76},
|
||||
{0xAF0, 0xAF0}, {0xC77, 0xC77}, {0xC84, 0xC84}, {0xDF4, 0xDF4}, {0xE4F, 0xE4F}, {0xE5A, 0xE5B}, {0xF04, 0xF12}, {0xF14, 0xF14}, {0xF3A, 0xF3D}, {0xF85, 0xF85}, {0xFD0, 0xFD4}, {0xFD9, 0xFDA},
|
||||
{0x104A, 0x104F}, {0x10FB, 0x10FB}, {0x1360, 0x1368}, {0x1400, 0x1400}, {0x166E, 0x166E}, {0x169B, 0x169C}, {0x16EB, 0x16ED}, {0x1735, 0x1736}, {0x17D4, 0x17D6}, {0x17D8, 0x17DA}, {0x1800, 0x180A},
|
||||
{0x1944, 0x1945}, {0x1A1E, 0x1A1F}, {0x1AA0, 0x1AA6}, {0x1AA8, 0x1AAD}, {0x1B5A, 0x1B60}, {0x1BFC, 0x1BFF}, {0x1C3B, 0x1C3F}, {0x1C7E, 0x1C7F}, {0x1CC0, 0x1CC7}, {0x1CD3, 0x1CD3}, {0x2010, 0x2027},
|
||||
{0x2030, 0x2043}, {0x2045, 0x2051}, {0x2053, 0x205E}, {0x207D, 0x207E}, {0x208D, 0x208E}, {0x2308, 0x230B}, {0x2329, 0x232A}, {0x2768, 0x2775}, {0x27C5, 0x27C6}, {0x27E6, 0x27EF}, {0x2983, 0x2998},
|
||||
{0x29D8, 0x29DB}, {0x29FC, 0x29FD}, {0x2CF9, 0x2CFC}, {0x2CFE, 0x2CFF}, {0x2D70, 0x2D70}, {0x2E00, 0x2E2E}, {0x2E30, 0x2E4F}, {0x2E52, 0x2E52}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301F},
|
||||
{0x3030, 0x3030}, {0x303D, 0x303D}, {0x30A0, 0x30A0}, {0x30FB, 0x30FB}, {0xA4FE, 0xA4FF}, {0xA60D, 0xA60F}, {0xA673, 0xA673}, {0xA67E, 0xA67E}, {0xA6F2, 0xA6F7}, {0xA874, 0xA877}, {0xA8CE, 0xA8CF},
|
||||
{0xA8F8, 0xA8FA}, {0xA8FC, 0xA8FC}, {0xA92E, 0xA92F}, {0xA95F, 0xA95F}, {0xA9C1, 0xA9CD}, {0xA9DE, 0xA9DF}, {0xAA5C, 0xAA5F}, {0xAADE, 0xAADF}, {0xAAF0, 0xAAF1}, {0xABEB, 0xABEB}, {0xFD3E, 0xFD3F},
|
||||
{0xFE10, 0xFE19}, {0xFE30, 0xFE52}, {0xFE54, 0xFE61}, {0xFE63, 0xFE63}, {0xFE68, 0xFE68}, {0xFE6A, 0xFE6B}, {0xFF01, 0xFF03}, {0xFF05, 0xFF0A}, {0xFF0C, 0xFF0F}, {0xFF1A, 0xFF1B}, {0xFF1F, 0xFF20},
|
||||
{0xFF3B, 0xFF3D}, {0xFF3F, 0xFF3F}, {0xFF5B, 0xFF5B}, {0xFF5D, 0xFF5D}, {0xFF5F, 0xFF65}, {0x10100, 0x10102}, {0x1039F, 0x1039F}, {0x103D0, 0x103D0}, {0x1056F, 0x1056F}, {0x10857, 0x10857},
|
||||
{0x1091F, 0x1091F}, {0x1093F, 0x1093F}, {0x10A50, 0x10A58}, {0x10A7F, 0x10A7F}, {0x10AF0, 0x10AF6}, {0x10B39, 0x10B3F}, {0x10B99, 0x10B9C}, {0x10EAD, 0x10EAD}, {0x10F55, 0x10F59}, {0x11047, 0x1104D},
|
||||
{0x110BB, 0x110BC}, {0x110BE, 0x110C1}, {0x11140, 0x11143}, {0x11174, 0x11175}, {0x111C5, 0x111C8}, {0x111CD, 0x111CD}, {0x111DB, 0x111DB}, {0x111DD, 0x111DF}, {0x11238, 0x1123D}, {0x112A9, 0x112A9},
|
||||
{0x1144B, 0x1144F}, {0x1145A, 0x1145B}, {0x1145D, 0x1145D}, {0x114C6, 0x114C6}, {0x115C1, 0x115D7}, {0x11641, 0x11643}, {0x11660, 0x1166C}, {0x1173C, 0x1173E}, {0x1183B, 0x1183B}, {0x11944, 0x11946},
|
||||
{0x119E2, 0x119E2}, {0x11A3F, 0x11A46}, {0x11A9A, 0x11A9C}, {0x11A9E, 0x11AA2}, {0x11C41, 0x11C45}, {0x11C70, 0x11C71}, {0x11EF7, 0x11EF8}, {0x11FFF, 0x11FFF}, {0x12470, 0x12474}, {0x16A6E, 0x16A6F},
|
||||
{0x16AF5, 0x16AF5}, {0x16B37, 0x16B3B}, {0x16B44, 0x16B44}, {0x16E97, 0x16E9A}, {0x16FE2, 0x16FE2}, {0x1BC9F, 0x1BC9F}, {0x1DA87, 0x1DA8B}, {0x1E95E, 0x1E95F},
|
||||
};
|
||||
|
||||
static const std::vector<std::pair<uint32_t, uint32_t>> symbol_ranges = {
|
||||
{0x24, 0x24}, {0x2B, 0x2B}, {0x3C, 0x3E}, {0x5E, 0x5E}, {0x60, 0x60}, {0x7C, 0x7C}, {0x7E, 0x7E}, {0xA2, 0xA6}, {0xA8, 0xA9}, {0xAC, 0xAC}, {0xAE, 0xB1}, {0xB4, 0xB4}, {0xB8, 0xB8}, {0xD7, 0xD7},
|
||||
{0xF7, 0xF7}, {0x2C2, 0x2C5}, {0x2D2, 0x2DF}, {0x2E5, 0x2EB}, {0x2ED, 0x2ED}, {0x2EF, 0x2FF}, {0x375, 0x375}, {0x384, 0x385}, {0x3F6, 0x3F6}, {0x482, 0x482}, {0x58D, 0x58F}, {0x606, 0x608},
|
||||
{0x60B, 0x60B}, {0x60E, 0x60F}, {0x6DE, 0x6DE}, {0x6E9, 0x6E9}, {0x6FD, 0x6FE}, {0x7F6, 0x7F6}, {0x7FE, 0x7FF}, {0x9F2, 0x9F3}, {0x9FA, 0x9FB}, {0xAF1, 0xAF1}, {0xB70, 0xB70}, {0xBF3, 0xBFA},
|
||||
{0xC7F, 0xC7F}, {0xD4F, 0xD4F}, {0xD79, 0xD79}, {0xE3F, 0xE3F}, {0xF01, 0xF03}, {0xF13, 0xF13}, {0xF15, 0xF17}, {0xF1A, 0xF1F}, {0xF34, 0xF34}, {0xF36, 0xF36}, {0xF38, 0xF38}, {0xFBE, 0xFC5},
|
||||
{0xFC7, 0xFCC}, {0xFCE, 0xFCF}, {0xFD5, 0xFD8}, {0x109E, 0x109F}, {0x1390, 0x1399}, {0x166D, 0x166D}, {0x17DB, 0x17DB}, {0x1940, 0x1940}, {0x19DE, 0x19FF}, {0x1B61, 0x1B6A}, {0x1B74, 0x1B7C},
|
||||
{0x1FBD, 0x1FBD}, {0x1FBF, 0x1FC1}, {0x1FCD, 0x1FCF}, {0x1FDD, 0x1FDF}, {0x1FED, 0x1FEF}, {0x1FFD, 0x1FFE}, {0x2044, 0x2044}, {0x2052, 0x2052}, {0x207A, 0x207C}, {0x208A, 0x208C}, {0x20A0, 0x20BF},
|
||||
{0x2100, 0x2101}, {0x2103, 0x2106}, {0x2108, 0x2109}, {0x2114, 0x2114}, {0x2116, 0x2118}, {0x211E, 0x2123}, {0x2125, 0x2125}, {0x2127, 0x2127}, {0x2129, 0x2129}, {0x212E, 0x212E}, {0x213A, 0x213B},
|
||||
{0x2140, 0x2144}, {0x214A, 0x214D}, {0x214F, 0x214F}, {0x218A, 0x218B}, {0x2190, 0x2307}, {0x230C, 0x2328}, {0x232B, 0x2426}, {0x2440, 0x244A}, {0x249C, 0x24E9}, {0x2500, 0x2767}, {0x2794, 0x27C4},
|
||||
{0x27C7, 0x27E5}, {0x27F0, 0x2982}, {0x2999, 0x29D7}, {0x29DC, 0x29FB}, {0x29FE, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2BFF}, {0x2CE5, 0x2CEA}, {0x2E50, 0x2E51}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3},
|
||||
{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3004, 0x3004}, {0x3012, 0x3013}, {0x3020, 0x3020}, {0x3036, 0x3037}, {0x303E, 0x303F}, {0x309B, 0x309C}, {0x3190, 0x3191}, {0x3196, 0x319F}, {0x31C0, 0x31E3},
|
||||
{0x3200, 0x321E}, {0x322A, 0x3247}, {0x3250, 0x3250}, {0x3260, 0x327F}, {0x328A, 0x32B0}, {0x32C0, 0x33FF}, {0x4DC0, 0x4DFF}, {0xA490, 0xA4C6}, {0xA700, 0xA716}, {0xA720, 0xA721}, {0xA789, 0xA78A},
|
||||
{0xA828, 0xA82B}, {0xA836, 0xA839}, {0xAA77, 0xAA79}, {0xAB5B, 0xAB5B}, {0xAB6A, 0xAB6B}, {0xFB29, 0xFB29}, {0xFBB2, 0xFBC1}, {0xFDFC, 0xFDFD}, {0xFE62, 0xFE62}, {0xFE64, 0xFE66}, {0xFE69, 0xFE69},
|
||||
{0xFF04, 0xFF04}, {0xFF0B, 0xFF0B}, {0xFF1C, 0xFF1E}, {0xFF3E, 0xFF3E}, {0xFF40, 0xFF40}, {0xFF5C, 0xFF5C}, {0xFF5E, 0xFF5E}, {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, {0x10137, 0x1013F},
|
||||
{0x10179, 0x10189}, {0x1018C, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, {0x101D0, 0x101FC}, {0x10877, 0x10878}, {0x10AC8, 0x10AC8}, {0x1173F, 0x1173F}, {0x11FD5, 0x11FF1}, {0x16B3C, 0x16B3F},
|
||||
{0x16B45, 0x16B45}, {0x1BC9C, 0x1BC9C}, {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D164}, {0x1D16A, 0x1D16C}, {0x1D183, 0x1D184}, {0x1D18C, 0x1D1A9}, {0x1D1AE, 0x1D1E8}, {0x1D200, 0x1D241},
|
||||
{0x1D245, 0x1D245}, {0x1D300, 0x1D356}, {0x1D6C1, 0x1D6C1}, {0x1D6DB, 0x1D6DB}, {0x1D6FB, 0x1D6FB}, {0x1D715, 0x1D715}, {0x1D735, 0x1D735}, {0x1D74F, 0x1D74F}, {0x1D76F, 0x1D76F}, {0x1D789, 0x1D789},
|
||||
{0x1D7A9, 0x1D7A9}, {0x1D7C3, 0x1D7C3}, {0x1D800, 0x1D9FF}, {0x1DA37, 0x1DA3A}, {0x1DA6D, 0x1DA74}, {0x1DA76, 0x1DA83}, {0x1DA85, 0x1DA86}, {0x1E14F, 0x1E14F}, {0x1E2FF, 0x1E2FF}, {0x1ECAC, 0x1ECAC},
|
||||
{0x1ECB0, 0x1ECB0}, {0x1ED2E, 0x1ED2E}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, {0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F10D, 0x1F1AD},
|
||||
{0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, {0x1F6E0, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, {0x1F780, 0x1F7D8},
|
||||
{0x1F7E0, 0x1F7EB}, {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, {0x1F900, 0x1F978}, {0x1F97A, 0x1F9CB}, {0x1F9CD, 0x1FA53},
|
||||
{0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, {0x1FA78, 0x1FA7A}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAA8}, {0x1FAB0, 0x1FAB6}, {0x1FAC0, 0x1FAC2}, {0x1FAD0, 0x1FAD6}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA},
|
||||
};
|
||||
|
||||
static const std::vector<std::pair<uint32_t, uint32_t>> control_ranges = {
|
||||
{0x0, 0x8}, {0xE, 0x1B}, {0x7F, 0x84}, {0x86, 0x9F}, {0xAD, 0xAD}, {0x378, 0x379}, {0x380, 0x383}, {0x38B, 0x38B}, {0x38D, 0x38D}, {0x3A2, 0x3A2}, {0x530, 0x530}, {0x557, 0x558}, {0x58B, 0x58C},
|
||||
{0x590, 0x590}, {0x5C8, 0x5CF}, {0x5EB, 0x5EE}, {0x5F5, 0x605}, {0x61C, 0x61D}, {0x6DD, 0x6DD}, {0x70E, 0x70F}, {0x74B, 0x74C}, {0x7B2, 0x7BF}, {0x7FB, 0x7FC}, {0x82E, 0x82F}, {0x83F, 0x83F},
|
||||
{0x85C, 0x85D}, {0x85F, 0x85F}, {0x86B, 0x89F}, {0x8B5, 0x8B5}, {0x8C8, 0x8D2}, {0x8E2, 0x8E2}, {0x984, 0x984}, {0x98D, 0x98E}, {0x991, 0x992}, {0x9A9, 0x9A9}, {0x9B1, 0x9B1}, {0x9B3, 0x9B5},
|
||||
{0x9BA, 0x9BB}, {0x9C5, 0x9C6}, {0x9C9, 0x9CA}, {0x9CF, 0x9D6}, {0x9D8, 0x9DB}, {0x9DE, 0x9DE}, {0x9E4, 0x9E5}, {0x9FF, 0xA00}, {0xA04, 0xA04}, {0xA0B, 0xA0E}, {0xA11, 0xA12}, {0xA29, 0xA29},
|
||||
{0xA31, 0xA31}, {0xA34, 0xA34}, {0xA37, 0xA37}, {0xA3A, 0xA3B}, {0xA3D, 0xA3D}, {0xA43, 0xA46}, {0xA49, 0xA4A}, {0xA4E, 0xA50}, {0xA52, 0xA58}, {0xA5D, 0xA5D}, {0xA5F, 0xA65}, {0xA77, 0xA80},
|
||||
{0xA84, 0xA84}, {0xA8E, 0xA8E}, {0xA92, 0xA92}, {0xAA9, 0xAA9}, {0xAB1, 0xAB1}, {0xAB4, 0xAB4}, {0xABA, 0xABB}, {0xAC6, 0xAC6}, {0xACA, 0xACA}, {0xACE, 0xACF}, {0xAD1, 0xADF}, {0xAE4, 0xAE5},
|
||||
{0xAF2, 0xAF8}, {0xB00, 0xB00}, {0xB04, 0xB04}, {0xB0D, 0xB0E}, {0xB11, 0xB12}, {0xB29, 0xB29}, {0xB31, 0xB31}, {0xB34, 0xB34}, {0xB3A, 0xB3B}, {0xB45, 0xB46}, {0xB49, 0xB4A}, {0xB4E, 0xB54},
|
||||
{0xB58, 0xB5B}, {0xB5E, 0xB5E}, {0xB64, 0xB65}, {0xB78, 0xB81}, {0xB84, 0xB84}, {0xB8B, 0xB8D}, {0xB91, 0xB91}, {0xB96, 0xB98}, {0xB9B, 0xB9B}, {0xB9D, 0xB9D}, {0xBA0, 0xBA2}, {0xBA5, 0xBA7},
|
||||
{0xBAB, 0xBAD}, {0xBBA, 0xBBD}, {0xBC3, 0xBC5}, {0xBC9, 0xBC9}, {0xBCE, 0xBCF}, {0xBD1, 0xBD6}, {0xBD8, 0xBE5}, {0xBFB, 0xBFF}, {0xC0D, 0xC0D}, {0xC11, 0xC11}, {0xC29, 0xC29}, {0xC3A, 0xC3C},
|
||||
{0xC45, 0xC45}, {0xC49, 0xC49}, {0xC4E, 0xC54}, {0xC57, 0xC57}, {0xC5B, 0xC5F}, {0xC64, 0xC65}, {0xC70, 0xC76}, {0xC8D, 0xC8D}, {0xC91, 0xC91}, {0xCA9, 0xCA9}, {0xCB4, 0xCB4}, {0xCBA, 0xCBB},
|
||||
{0xCC5, 0xCC5}, {0xCC9, 0xCC9}, {0xCCE, 0xCD4}, {0xCD7, 0xCDD}, {0xCDF, 0xCDF}, {0xCE4, 0xCE5}, {0xCF0, 0xCF0}, {0xCF3, 0xCFF}, {0xD0D, 0xD0D}, {0xD11, 0xD11}, {0xD45, 0xD45}, {0xD49, 0xD49},
|
||||
{0xD50, 0xD53}, {0xD64, 0xD65}, {0xD80, 0xD80}, {0xD84, 0xD84}, {0xD97, 0xD99}, {0xDB2, 0xDB2}, {0xDBC, 0xDBC}, {0xDBE, 0xDBF}, {0xDC7, 0xDC9}, {0xDCB, 0xDCE}, {0xDD5, 0xDD5}, {0xDD7, 0xDD7},
|
||||
{0xDE0, 0xDE5}, {0xDF0, 0xDF1}, {0xDF5, 0xE00}, {0xE3B, 0xE3E}, {0xE5C, 0xE80}, {0xE83, 0xE83}, {0xE85, 0xE85}, {0xE8B, 0xE8B}, {0xEA4, 0xEA4}, {0xEA6, 0xEA6}, {0xEBE, 0xEBF}, {0xEC5, 0xEC5},
|
||||
{0xEC7, 0xEC7}, {0xECE, 0xECF}, {0xEDA, 0xEDB}, {0xEE0, 0xEFF}, {0xF48, 0xF48}, {0xF6D, 0xF70}, {0xF98, 0xF98}, {0xFBD, 0xFBD}, {0xFCD, 0xFCD}, {0xFDB, 0xFFF}, {0x10C6, 0x10C6}, {0x10C8, 0x10CC},
|
||||
{0x10CE, 0x10CF}, {0x1249, 0x1249}, {0x124E, 0x124F}, {0x1257, 0x1257}, {0x1259, 0x1259}, {0x125E, 0x125F}, {0x1289, 0x1289}, {0x128E, 0x128F}, {0x12B1, 0x12B1}, {0x12B6, 0x12B7}, {0x12BF, 0x12BF},
|
||||
{0x12C1, 0x12C1}, {0x12C6, 0x12C7}, {0x12D7, 0x12D7}, {0x1311, 0x1311}, {0x1316, 0x1317}, {0x135B, 0x135C}, {0x137D, 0x137F}, {0x139A, 0x139F}, {0x13F6, 0x13F7}, {0x13FE, 0x13FF}, {0x169D, 0x169F},
|
||||
{0x16F9, 0x16FF}, {0x170D, 0x170D}, {0x1715, 0x171F}, {0x1737, 0x173F}, {0x1754, 0x175F}, {0x176D, 0x176D}, {0x1771, 0x1771}, {0x1774, 0x177F}, {0x17DE, 0x17DF}, {0x17EA, 0x17EF}, {0x17FA, 0x17FF},
|
||||
{0x180E, 0x180F}, {0x181A, 0x181F}, {0x1879, 0x187F}, {0x18AB, 0x18AF}, {0x18F6, 0x18FF}, {0x191F, 0x191F}, {0x192C, 0x192F}, {0x193C, 0x193F}, {0x1941, 0x1943}, {0x196E, 0x196F}, {0x1975, 0x197F},
|
||||
{0x19AC, 0x19AF}, {0x19CA, 0x19CF}, {0x19DB, 0x19DD}, {0x1A1C, 0x1A1D}, {0x1A5F, 0x1A5F}, {0x1A7D, 0x1A7E}, {0x1A8A, 0x1A8F}, {0x1A9A, 0x1A9F}, {0x1AAE, 0x1AAF}, {0x1AC1, 0x1AFF}, {0x1B4C, 0x1B4F},
|
||||
{0x1B7D, 0x1B7F}, {0x1BF4, 0x1BFB}, {0x1C38, 0x1C3A}, {0x1C4A, 0x1C4C}, {0x1C89, 0x1C8F}, {0x1CBB, 0x1CBC}, {0x1CC8, 0x1CCF}, {0x1CFB, 0x1CFF}, {0x1DFA, 0x1DFA}, {0x1F16, 0x1F17}, {0x1F1E, 0x1F1F},
|
||||
{0x1F46, 0x1F47}, {0x1F4E, 0x1F4F}, {0x1F58, 0x1F58}, {0x1F5A, 0x1F5A}, {0x1F5C, 0x1F5C}, {0x1F5E, 0x1F5E}, {0x1F7E, 0x1F7F}, {0x1FB5, 0x1FB5}, {0x1FC5, 0x1FC5}, {0x1FD4, 0x1FD5}, {0x1FDC, 0x1FDC},
|
||||
{0x1FF0, 0x1FF1}, {0x1FF5, 0x1FF5}, {0x1FFF, 0x1FFF}, {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x206F}, {0x2072, 0x2073}, {0x208F, 0x208F}, {0x209D, 0x209F}, {0x20C0, 0x20CF}, {0x20F1, 0x20FF},
|
||||
{0x218C, 0x218F}, {0x2427, 0x243F}, {0x244B, 0x245F}, {0x2B74, 0x2B75}, {0x2B96, 0x2B96}, {0x2C2F, 0x2C2F}, {0x2C5F, 0x2C5F}, {0x2CF4, 0x2CF8}, {0x2D26, 0x2D26}, {0x2D28, 0x2D2C}, {0x2D2E, 0x2D2F},
|
||||
{0x2D68, 0x2D6E}, {0x2D71, 0x2D7E}, {0x2D97, 0x2D9F}, {0x2DA7, 0x2DA7}, {0x2DAF, 0x2DAF}, {0x2DB7, 0x2DB7}, {0x2DBF, 0x2DBF}, {0x2DC7, 0x2DC7}, {0x2DCF, 0x2DCF}, {0x2DD7, 0x2DD7}, {0x2DDF, 0x2DDF},
|
||||
{0x2E53, 0x2E7F}, {0x2E9A, 0x2E9A}, {0x2EF4, 0x2EFF}, {0x2FD6, 0x2FEF}, {0x2FFC, 0x2FFF}, {0x3040, 0x3040}, {0x3097, 0x3098}, {0x3100, 0x3104}, {0x3130, 0x3130}, {0x318F, 0x318F}, {0x31E4, 0x31EF},
|
||||
{0x321F, 0x321F}, {0x9FFD, 0x9FFF}, {0xA48D, 0xA48F}, {0xA4C7, 0xA4CF}, {0xA62C, 0xA63F}, {0xA6F8, 0xA6FF}, {0xA7C0, 0xA7C1}, {0xA7CB, 0xA7F4}, {0xA82D, 0xA82F}, {0xA83A, 0xA83F}, {0xA878, 0xA87F},
|
||||
{0xA8C6, 0xA8CD}, {0xA8DA, 0xA8DF}, {0xA954, 0xA95E}, {0xA97D, 0xA97F}, {0xA9CE, 0xA9CE}, {0xA9DA, 0xA9DD}, {0xA9FF, 0xA9FF}, {0xAA37, 0xAA3F}, {0xAA4E, 0xAA4F}, {0xAA5A, 0xAA5B}, {0xAAC3, 0xAADA},
|
||||
{0xAAF7, 0xAB00}, {0xAB07, 0xAB08}, {0xAB0F, 0xAB10}, {0xAB17, 0xAB1F}, {0xAB27, 0xAB27}, {0xAB2F, 0xAB2F}, {0xAB6C, 0xAB6F}, {0xABEE, 0xABEF}, {0xABFA, 0xABFF}, {0xD7A4, 0xD7AF}, {0xD7C7, 0xD7CA},
|
||||
{0xD7FC, 0xF8FF}, {0xFA6E, 0xFA6F}, {0xFADA, 0xFAFF}, {0xFB07, 0xFB12}, {0xFB18, 0xFB1C}, {0xFB37, 0xFB37}, {0xFB3D, 0xFB3D}, {0xFB3F, 0xFB3F}, {0xFB42, 0xFB42}, {0xFB45, 0xFB45}, {0xFBC2, 0xFBD2},
|
||||
{0xFD40, 0xFD4F}, {0xFD90, 0xFD91}, {0xFDC8, 0xFDEF}, {0xFDFE, 0xFDFF}, {0xFE1A, 0xFE1F}, {0xFE53, 0xFE53}, {0xFE67, 0xFE67}, {0xFE6C, 0xFE6F}, {0xFE75, 0xFE75}, {0xFEFD, 0xFF00}, {0xFFBF, 0xFFC1},
|
||||
{0xFFC8, 0xFFC9}, {0xFFD0, 0xFFD1}, {0xFFD8, 0xFFD9}, {0xFFDD, 0xFFDF}, {0xFFE7, 0xFFE7}, {0xFFEF, 0xFFFB}, {0xFFFE, 0xFFFF}, {0x1000C, 0x1000C}, {0x10027, 0x10027}, {0x1003B, 0x1003B},
|
||||
{0x1003E, 0x1003E}, {0x1004E, 0x1004F}, {0x1005E, 0x1007F}, {0x100FB, 0x100FF}, {0x10103, 0x10106}, {0x10134, 0x10136}, {0x1018F, 0x1018F}, {0x1019D, 0x1019F}, {0x101A1, 0x101CF}, {0x101FE, 0x1027F},
|
||||
{0x1029D, 0x1029F}, {0x102D1, 0x102DF}, {0x102FC, 0x102FF}, {0x10324, 0x1032C}, {0x1034B, 0x1034F}, {0x1037B, 0x1037F}, {0x1039E, 0x1039E}, {0x103C4, 0x103C7}, {0x103D6, 0x103FF}, {0x1049E, 0x1049F},
|
||||
{0x104AA, 0x104AF}, {0x104D4, 0x104D7}, {0x104FC, 0x104FF}, {0x10528, 0x1052F}, {0x10564, 0x1056E}, {0x10570, 0x105FF}, {0x10737, 0x1073F}, {0x10756, 0x1075F}, {0x10768, 0x107FF}, {0x10806, 0x10807},
|
||||
{0x10809, 0x10809}, {0x10836, 0x10836}, {0x10839, 0x1083B}, {0x1083D, 0x1083E}, {0x10856, 0x10856}, {0x1089F, 0x108A6}, {0x108B0, 0x108DF}, {0x108F3, 0x108F3}, {0x108F6, 0x108FA}, {0x1091C, 0x1091E},
|
||||
{0x1093A, 0x1093E}, {0x10940, 0x1097F}, {0x109B8, 0x109BB}, {0x109D0, 0x109D1}, {0x10A04, 0x10A04}, {0x10A07, 0x10A0B}, {0x10A14, 0x10A14}, {0x10A18, 0x10A18}, {0x10A36, 0x10A37}, {0x10A3B, 0x10A3E},
|
||||
{0x10A49, 0x10A4F}, {0x10A59, 0x10A5F}, {0x10AA0, 0x10ABF}, {0x10AE7, 0x10AEA}, {0x10AF7, 0x10AFF}, {0x10B36, 0x10B38}, {0x10B56, 0x10B57}, {0x10B73, 0x10B77}, {0x10B92, 0x10B98}, {0x10B9D, 0x10BA8},
|
||||
{0x10BB0, 0x10BFF}, {0x10C49, 0x10C7F}, {0x10CB3, 0x10CBF}, {0x10CF3, 0x10CF9}, {0x10D28, 0x10D2F}, {0x10D3A, 0x10E5F}, {0x10E7F, 0x10E7F}, {0x10EAA, 0x10EAA}, {0x10EAE, 0x10EAF}, {0x10EB2, 0x10EFF},
|
||||
{0x10F28, 0x10F2F}, {0x10F5A, 0x10FAF}, {0x10FCC, 0x10FDF}, {0x10FF7, 0x10FFF}, {0x1104E, 0x11051}, {0x11070, 0x1107E}, {0x110BD, 0x110BD}, {0x110C2, 0x110CF}, {0x110E9, 0x110EF}, {0x110FA, 0x110FF},
|
||||
{0x11135, 0x11135}, {0x11148, 0x1114F}, {0x11177, 0x1117F}, {0x111E0, 0x111E0}, {0x111F5, 0x111FF}, {0x11212, 0x11212}, {0x1123F, 0x1127F}, {0x11287, 0x11287}, {0x11289, 0x11289}, {0x1128E, 0x1128E},
|
||||
{0x1129E, 0x1129E}, {0x112AA, 0x112AF}, {0x112EB, 0x112EF}, {0x112FA, 0x112FF}, {0x11304, 0x11304}, {0x1130D, 0x1130E}, {0x11311, 0x11312}, {0x11329, 0x11329}, {0x11331, 0x11331}, {0x11334, 0x11334},
|
||||
{0x1133A, 0x1133A}, {0x11345, 0x11346}, {0x11349, 0x1134A}, {0x1134E, 0x1134F}, {0x11351, 0x11356}, {0x11358, 0x1135C}, {0x11364, 0x11365}, {0x1136D, 0x1136F}, {0x11375, 0x113FF}, {0x1145C, 0x1145C},
|
||||
{0x11462, 0x1147F}, {0x114C8, 0x114CF}, {0x114DA, 0x1157F}, {0x115B6, 0x115B7}, {0x115DE, 0x115FF}, {0x11645, 0x1164F}, {0x1165A, 0x1165F}, {0x1166D, 0x1167F}, {0x116B9, 0x116BF}, {0x116CA, 0x116FF},
|
||||
{0x1171B, 0x1171C}, {0x1172C, 0x1172F}, {0x11740, 0x117FF}, {0x1183C, 0x1189F}, {0x118F3, 0x118FE}, {0x11907, 0x11908}, {0x1190A, 0x1190B}, {0x11914, 0x11914}, {0x11917, 0x11917}, {0x11936, 0x11936},
|
||||
{0x11939, 0x1193A}, {0x11947, 0x1194F}, {0x1195A, 0x1199F}, {0x119A8, 0x119A9}, {0x119D8, 0x119D9}, {0x119E5, 0x119FF}, {0x11A48, 0x11A4F}, {0x11AA3, 0x11ABF}, {0x11AF9, 0x11BFF}, {0x11C09, 0x11C09},
|
||||
{0x11C37, 0x11C37}, {0x11C46, 0x11C4F}, {0x11C6D, 0x11C6F}, {0x11C90, 0x11C91}, {0x11CA8, 0x11CA8}, {0x11CB7, 0x11CFF}, {0x11D07, 0x11D07}, {0x11D0A, 0x11D0A}, {0x11D37, 0x11D39}, {0x11D3B, 0x11D3B},
|
||||
{0x11D3E, 0x11D3E}, {0x11D48, 0x11D4F}, {0x11D5A, 0x11D5F}, {0x11D66, 0x11D66}, {0x11D69, 0x11D69}, {0x11D8F, 0x11D8F}, {0x11D92, 0x11D92}, {0x11D99, 0x11D9F}, {0x11DAA, 0x11EDF}, {0x11EF9, 0x11FAF},
|
||||
{0x11FB1, 0x11FBF}, {0x11FF2, 0x11FFE}, {0x1239A, 0x123FF}, {0x1246F, 0x1246F}, {0x12475, 0x1247F}, {0x12544, 0x12FFF}, {0x1342F, 0x143FF}, {0x14647, 0x167FF}, {0x16A39, 0x16A3F}, {0x16A5F, 0x16A5F},
|
||||
{0x16A6A, 0x16A6D}, {0x16A70, 0x16ACF}, {0x16AEE, 0x16AEF}, {0x16AF6, 0x16AFF}, {0x16B46, 0x16B4F}, {0x16B5A, 0x16B5A}, {0x16B62, 0x16B62}, {0x16B78, 0x16B7C}, {0x16B90, 0x16E3F}, {0x16E9B, 0x16EFF},
|
||||
{0x16F4B, 0x16F4E}, {0x16F88, 0x16F8E}, {0x16FA0, 0x16FDF}, {0x16FE5, 0x16FEF}, {0x16FF2, 0x16FFF}, {0x187F8, 0x187FF}, {0x18CD6, 0x18CFF}, {0x18D09, 0x1AFFF}, {0x1B11F, 0x1B14F}, {0x1B153, 0x1B163},
|
||||
{0x1B168, 0x1B16F}, {0x1B2FC, 0x1BBFF}, {0x1BC6B, 0x1BC6F}, {0x1BC7D, 0x1BC7F}, {0x1BC89, 0x1BC8F}, {0x1BC9A, 0x1BC9B}, {0x1BCA0, 0x1CFFF}, {0x1D0F6, 0x1D0FF}, {0x1D127, 0x1D128}, {0x1D173, 0x1D17A},
|
||||
{0x1D1E9, 0x1D1FF}, {0x1D246, 0x1D2DF}, {0x1D2F4, 0x1D2FF}, {0x1D357, 0x1D35F}, {0x1D379, 0x1D3FF}, {0x1D455, 0x1D455}, {0x1D49D, 0x1D49D}, {0x1D4A0, 0x1D4A1}, {0x1D4A3, 0x1D4A4}, {0x1D4A7, 0x1D4A8},
|
||||
{0x1D4AD, 0x1D4AD}, {0x1D4BA, 0x1D4BA}, {0x1D4BC, 0x1D4BC}, {0x1D4C4, 0x1D4C4}, {0x1D506, 0x1D506}, {0x1D50B, 0x1D50C}, {0x1D515, 0x1D515}, {0x1D51D, 0x1D51D}, {0x1D53A, 0x1D53A}, {0x1D53F, 0x1D53F},
|
||||
{0x1D545, 0x1D545}, {0x1D547, 0x1D549}, {0x1D551, 0x1D551}, {0x1D6A6, 0x1D6A7}, {0x1D7CC, 0x1D7CD}, {0x1DA8C, 0x1DA9A}, {0x1DAA0, 0x1DAA0}, {0x1DAB0, 0x1DFFF}, {0x1E007, 0x1E007}, {0x1E019, 0x1E01A},
|
||||
{0x1E022, 0x1E022}, {0x1E025, 0x1E025}, {0x1E02B, 0x1E0FF}, {0x1E12D, 0x1E12F}, {0x1E13E, 0x1E13F}, {0x1E14A, 0x1E14D}, {0x1E150, 0x1E2BF}, {0x1E2FA, 0x1E2FE}, {0x1E300, 0x1E7FF}, {0x1E8C5, 0x1E8C6},
|
||||
{0x1E8D7, 0x1E8FF}, {0x1E94C, 0x1E94F}, {0x1E95A, 0x1E95D}, {0x1E960, 0x1EC70}, {0x1ECB5, 0x1ED00}, {0x1ED3E, 0x1EDFF}, {0x1EE04, 0x1EE04}, {0x1EE20, 0x1EE20}, {0x1EE23, 0x1EE23}, {0x1EE25, 0x1EE26},
|
||||
{0x1EE28, 0x1EE28}, {0x1EE33, 0x1EE33}, {0x1EE38, 0x1EE38}, {0x1EE3A, 0x1EE3A}, {0x1EE3C, 0x1EE41}, {0x1EE43, 0x1EE46}, {0x1EE48, 0x1EE48}, {0x1EE4A, 0x1EE4A}, {0x1EE4C, 0x1EE4C}, {0x1EE50, 0x1EE50},
|
||||
{0x1EE53, 0x1EE53}, {0x1EE55, 0x1EE56}, {0x1EE58, 0x1EE58}, {0x1EE5A, 0x1EE5A}, {0x1EE5C, 0x1EE5C}, {0x1EE5E, 0x1EE5E}, {0x1EE60, 0x1EE60}, {0x1EE63, 0x1EE63}, {0x1EE65, 0x1EE66}, {0x1EE6B, 0x1EE6B},
|
||||
{0x1EE73, 0x1EE73}, {0x1EE78, 0x1EE78}, {0x1EE7D, 0x1EE7D}, {0x1EE7F, 0x1EE7F}, {0x1EE8A, 0x1EE8A}, {0x1EE9C, 0x1EEA0}, {0x1EEA4, 0x1EEA4}, {0x1EEAA, 0x1EEAA}, {0x1EEBC, 0x1EEEF}, {0x1EEF2, 0x1EFFF},
|
||||
{0x1F02C, 0x1F02F}, {0x1F094, 0x1F09F}, {0x1F0AF, 0x1F0B0}, {0x1F0C0, 0x1F0C0}, {0x1F0D0, 0x1F0D0}, {0x1F0F6, 0x1F0FF}, {0x1F1AE, 0x1F1E5}, {0x1F203, 0x1F20F}, {0x1F23C, 0x1F23F}, {0x1F249, 0x1F24F},
|
||||
{0x1F252, 0x1F25F}, {0x1F266, 0x1F2FF}, {0x1F6D8, 0x1F6DF}, {0x1F6ED, 0x1F6EF}, {0x1F6FD, 0x1F6FF}, {0x1F774, 0x1F77F}, {0x1F7D9, 0x1F7DF}, {0x1F7EC, 0x1F7FF}, {0x1F80C, 0x1F80F}, {0x1F848, 0x1F84F},
|
||||
{0x1F85A, 0x1F85F}, {0x1F888, 0x1F88F}, {0x1F8AE, 0x1F8AF}, {0x1F8B2, 0x1F8FF}, {0x1F979, 0x1F979}, {0x1F9CC, 0x1F9CC}, {0x1FA54, 0x1FA5F}, {0x1FA6E, 0x1FA6F}, {0x1FA75, 0x1FA77}, {0x1FA7B, 0x1FA7F},
|
||||
{0x1FA87, 0x1FA8F}, {0x1FAA9, 0x1FAAF}, {0x1FAB7, 0x1FABF}, {0x1FAC3, 0x1FACF}, {0x1FAD7, 0x1FAFF}, {0x1FB93, 0x1FB93}, {0x1FBCB, 0x1FBEF}, {0x1FBFA, 0x1FFFF}, {0x2A6DE, 0x2A6FF}, {0x2B735, 0x2B73F},
|
||||
{0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF},
|
||||
};
|
||||
|
||||
static std::string codepoint_to_utf8(uint32_t cp) {
|
||||
std::string result;
|
||||
if (/* 0x00 <= cp && */ cp <= 0x7f) {
|
||||
result.push_back(cp);
|
||||
}
|
||||
else if (0x80 <= cp && cp <= 0x7ff) {
|
||||
result.push_back(0xc0 | ((cp >> 6) & 0x1f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
}
|
||||
else if (0x800 <= cp && cp <= 0xffff) {
|
||||
result.push_back(0xe0 | ((cp >> 12) & 0x0f));
|
||||
result.push_back(0x80 | ((cp >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
}
|
||||
else if (0x10000 <= cp && cp <= 0x10ffff) {
|
||||
result.push_back(0xf0 | ((cp >> 18) & 0x07));
|
||||
result.push_back(0x80 | ((cp >> 12) & 0x3f));
|
||||
result.push_back(0x80 | ((cp >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
}
|
||||
else {
|
||||
throw std::invalid_argument("invalid codepoint");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::string codepoints_to_utf8(const std::vector<uint32_t> & cps) {
|
||||
std::string result;
|
||||
for (size_t i = 0; i < cps.size(); ++i) {
|
||||
result.append(codepoint_to_utf8(cps[i]));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
assert(offset < utf8.size());
|
||||
if (!(utf8[offset + 0] & 0x80)) {
|
||||
auto result = utf8[offset + 0];
|
||||
offset += 1;
|
||||
return result;
|
||||
}
|
||||
else if (!(utf8[offset + 0] & 0x40)) {
|
||||
throw std::invalid_argument("invalid character");
|
||||
}
|
||||
else if (!(utf8[offset + 0] & 0x20)) {
|
||||
if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80))
|
||||
throw std::invalid_argument("invalid character");
|
||||
auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
|
||||
offset += 2;
|
||||
return result;
|
||||
}
|
||||
else if (!(utf8[offset + 0] & 0x10)) {
|
||||
if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80))
|
||||
throw std::invalid_argument("invalid character");
|
||||
auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
|
||||
offset += 3;
|
||||
return result;
|
||||
}
|
||||
else if (!(utf8[offset + 0] & 0x08)) {
|
||||
if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80))
|
||||
throw std::invalid_argument("invalid character");
|
||||
auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
|
||||
offset += 4;
|
||||
return result;
|
||||
}
|
||||
throw std::invalid_argument("invalid string");
|
||||
}
|
||||
|
||||
static std::vector<uint32_t> codepoints_from_utf8(const std::string & utf8) {
|
||||
std::vector<uint32_t> result;
|
||||
size_t offset = 0;
|
||||
while (offset < utf8.size()) {
|
||||
result.push_back(codepoint_from_utf8(utf8, offset));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::vector<uint16_t> codepoint_to_utf16(uint32_t cp) {
|
||||
std::vector<uint16_t> result;
|
||||
if (/* 0x0000 <= cp && */ cp <= 0xffff) {
|
||||
result.emplace_back(cp);
|
||||
}
|
||||
else if (0x10000 <= cp && cp <= 0x10ffff) {
|
||||
result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
|
||||
result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
|
||||
}
|
||||
else {
|
||||
throw std::invalid_argument("invalid codepoint");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::vector<uint16_t> codepoints_to_utf16(const std::vector<uint32_t> & cps) {
|
||||
std::vector<uint16_t> result;
|
||||
for (size_t i = 0; i < cps.size(); ++i) {
|
||||
auto temp = codepoint_to_utf16(cps[i]);
|
||||
result.insert(result.end(), temp.begin(), temp.end());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
|
||||
assert(offset < utf16.size());
|
||||
if (((utf16[0] >> 10) << 10) != 0xd800) {
|
||||
auto result = utf16[offset + 0];
|
||||
offset += 1;
|
||||
return result;
|
||||
}
|
||||
else {
|
||||
if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00))
|
||||
throw std::invalid_argument("invalid character");
|
||||
auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
|
||||
offset += 2;
|
||||
return result;
|
||||
}
|
||||
throw std::invalid_argument("invalid string");
|
||||
}
|
||||
|
||||
static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
|
||||
std::vector<uint32_t> result;
|
||||
size_t offset = 0;
|
||||
while (offset < utf16.size())
|
||||
result.push_back(codepoint_from_utf16(utf16, offset));
|
||||
return result;
|
||||
}
|
||||
|
||||
#define CODEPOINT_TYPE_UNIDENTIFIED 0
|
||||
#define CODEPOINT_TYPE_DIGIT 1
|
||||
#define CODEPOINT_TYPE_LETTER 2
|
||||
#define CODEPOINT_TYPE_WHITESPACE 3
|
||||
#define CODEPOINT_TYPE_ACCENT_MARK 4
|
||||
#define CODEPOINT_TYPE_PUNCTUATION 5
|
||||
#define CODEPOINT_TYPE_SYMBOL 6
|
||||
#define CODEPOINT_TYPE_CONTROL 7
|
||||
|
||||
static std::unordered_map<uint32_t, int> codepoint_type_map() {
|
||||
std::unordered_map<uint32_t, int> codepoint_types;
|
||||
for (auto p : digit_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
|
||||
}
|
||||
for(auto p : letter_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
codepoint_types[i] = CODEPOINT_TYPE_LETTER;
|
||||
}
|
||||
for(auto p : whitespace_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
|
||||
}
|
||||
for(auto p : accent_mark_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
|
||||
}
|
||||
for(auto p : punctuation_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
|
||||
}
|
||||
for (auto p : symbol_ranges) {
|
||||
for (auto i = p.first; i <= p.second; ++i)
|
||||
codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
|
||||
}
|
||||
for(auto p : control_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
|
||||
}
|
||||
return codepoint_types;
|
||||
}
|
||||
|
||||
static int codepoint_type(uint32_t cp) {
|
||||
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
|
||||
return codepoint_types[cp];
|
||||
}
|
||||
|
||||
static int codepoint_type(const std::string & utf8) {
|
||||
if (utf8.length() == 0)
|
||||
return CODEPOINT_TYPE_UNIDENTIFIED;
|
||||
size_t offset = 0;
|
||||
return codepoint_type(codepoint_from_utf8(utf8, offset));
|
||||
}
|
||||
|
||||
static std::unordered_map<uint8_t, std::string> bytes_to_unicode_map_bpe() {
|
||||
std::unordered_map<uint8_t, std::string> map;
|
||||
for (int ch = u'!'; ch <= u'~'; ++ch) {
|
||||
assert(0 <= ch && ch < 256);
|
||||
map[ch] = codepoint_to_utf8(ch);
|
||||
}
|
||||
for (int ch = u'¡'; ch <= u'¬'; ++ch) {
|
||||
assert(0 <= ch && ch < 256);
|
||||
map[ch] = codepoint_to_utf8(ch);
|
||||
}
|
||||
for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
|
||||
assert(0 <= ch && ch < 256);
|
||||
map[ch] = codepoint_to_utf8(ch);
|
||||
}
|
||||
auto n = 0;
|
||||
for (int ch = 0; ch < 256; ++ch) {
|
||||
if (map.find(ch) == map.end()) {
|
||||
map[ch] = codepoint_to_utf8(256 + n);
|
||||
++n;
|
||||
}
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
static std::string bytes_to_unicode_bpe(uint8_t byte) {
|
||||
static std::unordered_map<uint8_t, std::string> map = bytes_to_unicode_map_bpe();
|
||||
return map.at(byte);
|
||||
}
|
||||
|
||||
static std::unordered_map<std::string, uint8_t> unicode_to_bytes_map_bpe() {
|
||||
std::unordered_map<std::string, uint8_t> map;
|
||||
for (int ch = u'!'; ch <= u'~'; ++ch) {
|
||||
assert(0 <= ch && ch < 256);
|
||||
map[codepoint_to_utf8(ch)] = ch;
|
||||
}
|
||||
for (int ch = u'¡'; ch <= u'¬'; ++ch) {
|
||||
assert(0 <= ch && ch < 256);
|
||||
map[codepoint_to_utf8(ch)] = ch;
|
||||
}
|
||||
for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
|
||||
assert(0 <= ch && ch < 256);
|
||||
map[codepoint_to_utf8(ch)] = ch;
|
||||
}
|
||||
auto n = 0;
|
||||
for (int ch = 0; ch < 256; ++ch) {
|
||||
if (map.find(codepoint_to_utf8(ch)) == map.end()) {
|
||||
map[codepoint_to_utf8(256 + n)] = ch;
|
||||
++n;
|
||||
}
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
static uint8_t unicode_to_bytes_bpe(const std::string & utf8) {
|
||||
static std::unordered_map<std::string, uint8_t> map = unicode_to_bytes_map_bpe();
|
||||
return map.at(utf8);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user