author    Mitja Felicijan <mitja.felicijan@gmail.com>  2026-02-12 20:57:17 +0100
committer Mitja Felicijan <mitja.felicijan@gmail.com>  2026-02-12 20:57:17 +0100
commit    b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree      211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/src
download  llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/src')
-rw-r--r--  llama.cpp/src/CMakeLists.txt | 165
-rw-r--r--  llama.cpp/src/llama-adapter.cpp | 488
-rw-r--r--  llama.cpp/src/llama-adapter.h | 86
-rw-r--r--  llama.cpp/src/llama-arch.cpp | 2757
-rw-r--r--  llama.cpp/src/llama-arch.h | 606
-rw-r--r--  llama.cpp/src/llama-batch.cpp | 917
-rw-r--r--  llama.cpp/src/llama-batch.h | 173
-rw-r--r--  llama.cpp/src/llama-chat.cpp | 896
-rw-r--r--  llama.cpp/src/llama-chat.h | 71
-rw-r--r--  llama.cpp/src/llama-context.cpp | 3691
-rw-r--r--  llama.cpp/src/llama-context.h | 361
-rw-r--r--  llama.cpp/src/llama-cparams.cpp | 5
-rw-r--r--  llama.cpp/src/llama-cparams.h | 44
-rw-r--r--  llama.cpp/src/llama-grammar.cpp | 1464
-rw-r--r--  llama.cpp/src/llama-grammar.h | 194
-rw-r--r--  llama.cpp/src/llama-graph.cpp | 2626
-rw-r--r--  llama.cpp/src/llama-graph.h | 1021
-rw-r--r--  llama.cpp/src/llama-hparams.cpp | 234
-rw-r--r--  llama.cpp/src/llama-hparams.h | 334
-rw-r--r--  llama.cpp/src/llama-impl.cpp | 171
-rw-r--r--  llama.cpp/src/llama-impl.h | 73
-rw-r--r--  llama.cpp/src/llama-io.cpp | 15
-rw-r--r--  llama.cpp/src/llama-io.h | 35
-rw-r--r--  llama.cpp/src/llama-kv-cache-iswa.cpp | 330
-rw-r--r--  llama.cpp/src/llama-kv-cache-iswa.h | 137
-rw-r--r--  llama.cpp/src/llama-kv-cache.cpp | 2268
-rw-r--r--  llama.cpp/src/llama-kv-cache.h | 388
-rw-r--r--  llama.cpp/src/llama-kv-cells.h | 533
-rw-r--r--  llama.cpp/src/llama-memory-hybrid-iswa.cpp | 275
-rw-r--r--  llama.cpp/src/llama-memory-hybrid-iswa.h | 140
-rw-r--r--  llama.cpp/src/llama-memory-hybrid.cpp | 268
-rw-r--r--  llama.cpp/src/llama-memory-hybrid.h | 139
-rw-r--r--  llama.cpp/src/llama-memory-recurrent.cpp | 1165
-rw-r--r--  llama.cpp/src/llama-memory-recurrent.h | 182
-rw-r--r--  llama.cpp/src/llama-memory.cpp | 59
-rw-r--r--  llama.cpp/src/llama-memory.h | 122
-rw-r--r--  llama.cpp/src/llama-mmap.cpp | 742
-rw-r--r--  llama.cpp/src/llama-mmap.h | 73
-rw-r--r--  llama.cpp/src/llama-model-loader.cpp | 1261
-rw-r--r--  llama.cpp/src/llama-model-loader.h | 176
-rw-r--r--  llama.cpp/src/llama-model-saver.cpp | 285
-rw-r--r--  llama.cpp/src/llama-model-saver.h | 37
-rw-r--r--  llama.cpp/src/llama-model.cpp | 8953
-rw-r--r--  llama.cpp/src/llama-model.h | 563
-rw-r--r--  llama.cpp/src/llama-quant.cpp | 1069
-rw-r--r--  llama.cpp/src/llama-quant.h | 1
-rw-r--r--  llama.cpp/src/llama-sampler.cpp | 3885
-rw-r--r--  llama.cpp/src/llama-sampler.h | 42
-rw-r--r--  llama.cpp/src/llama-vocab.cpp | 3938
-rw-r--r--  llama.cpp/src/llama-vocab.h | 184
-rw-r--r--  llama.cpp/src/llama.cpp | 1174
-rw-r--r--  llama.cpp/src/models/afmoe.cpp | 191
-rw-r--r--  llama.cpp/src/models/apertus.cpp | 125
-rw-r--r--  llama.cpp/src/models/arcee.cpp | 135
-rw-r--r--  llama.cpp/src/models/arctic.cpp | 138
-rw-r--r--  llama.cpp/src/models/arwkv7.cpp | 86
-rw-r--r--  llama.cpp/src/models/baichuan.cpp | 122
-rw-r--r--  llama.cpp/src/models/bailingmoe.cpp | 144
-rw-r--r--  llama.cpp/src/models/bailingmoe2.cpp | 135
-rw-r--r--  llama.cpp/src/models/bert.cpp | 178
-rw-r--r--  llama.cpp/src/models/bitnet.cpp | 160
-rw-r--r--  llama.cpp/src/models/bloom.cpp | 101
-rw-r--r--  llama.cpp/src/models/chameleon.cpp | 178
-rw-r--r--  llama.cpp/src/models/chatglm.cpp | 132
-rw-r--r--  llama.cpp/src/models/codeshell.cpp | 111
-rw-r--r--  llama.cpp/src/models/cogvlm.cpp | 102
-rw-r--r--  llama.cpp/src/models/cohere2-iswa.cpp | 134
-rw-r--r--  llama.cpp/src/models/command-r.cpp | 122
-rw-r--r--  llama.cpp/src/models/dbrx.cpp | 123
-rw-r--r--  llama.cpp/src/models/deci.cpp | 135
-rw-r--r--  llama.cpp/src/models/deepseek.cpp | 144
-rw-r--r--  llama.cpp/src/models/deepseek2.cpp | 259
-rw-r--r--  llama.cpp/src/models/dots1.cpp | 134
-rw-r--r--  llama.cpp/src/models/dream.cpp | 105
-rw-r--r--  llama.cpp/src/models/ernie4-5-moe.cpp | 150
-rw-r--r--  llama.cpp/src/models/ernie4-5.cpp | 110
-rw-r--r--  llama.cpp/src/models/exaone-moe.cpp | 146
-rw-r--r--  llama.cpp/src/models/exaone.cpp | 114
-rw-r--r--  llama.cpp/src/models/exaone4.cpp | 123
-rw-r--r--  llama.cpp/src/models/falcon-h1.cpp | 113
-rw-r--r--  llama.cpp/src/models/falcon.cpp | 120
-rw-r--r--  llama.cpp/src/models/gemma-embedding.cpp | 116
-rw-r--r--  llama.cpp/src/models/gemma.cpp | 112
-rw-r--r--  llama.cpp/src/models/gemma2-iswa.cpp | 128
-rw-r--r--  llama.cpp/src/models/gemma3.cpp | 155
-rw-r--r--  llama.cpp/src/models/gemma3n-iswa.cpp | 384
-rw-r--r--  llama.cpp/src/models/glm4-moe.cpp | 170
-rw-r--r--  llama.cpp/src/models/glm4.cpp | 150
-rw-r--r--  llama.cpp/src/models/gpt2.cpp | 105
-rw-r--r--  llama.cpp/src/models/gptneox.cpp | 144
-rw-r--r--  llama.cpp/src/models/granite-hybrid.cpp | 196
-rw-r--r--  llama.cpp/src/models/granite.cpp | 211
-rw-r--r--  llama.cpp/src/models/graph-context-mamba.cpp | 283
-rw-r--r--  llama.cpp/src/models/grok.cpp | 159
-rw-r--r--  llama.cpp/src/models/grovemoe.cpp | 141
-rw-r--r--  llama.cpp/src/models/hunyuan-dense.cpp | 132
-rw-r--r--  llama.cpp/src/models/hunyuan-moe.cpp | 154
-rw-r--r--  llama.cpp/src/models/internlm2.cpp | 120
-rw-r--r--  llama.cpp/src/models/jais.cpp | 86
-rw-r--r--  llama.cpp/src/models/jamba.cpp | 106
-rw-r--r--  llama.cpp/src/models/kimi-linear.cpp | 772
-rw-r--r--  llama.cpp/src/models/lfm2.cpp | 175
-rw-r--r--  llama.cpp/src/models/llada-moe.cpp | 122
-rw-r--r--  llama.cpp/src/models/llada.cpp | 99
-rw-r--r--  llama.cpp/src/models/llama-iswa.cpp | 178
-rw-r--r--  llama.cpp/src/models/llama.cpp | 168
-rw-r--r--  llama.cpp/src/models/maincoder.cpp | 117
-rw-r--r--  llama.cpp/src/models/mamba.cpp | 55
-rw-r--r--  llama.cpp/src/models/mimo2-iswa.cpp | 123
-rw-r--r--  llama.cpp/src/models/minicpm3.cpp | 200
-rw-r--r--  llama.cpp/src/models/minimax-m2.cpp | 124
-rw-r--r--  llama.cpp/src/models/mistral3.cpp | 160
-rw-r--r--  llama.cpp/src/models/models.h | 723
-rw-r--r--  llama.cpp/src/models/modern-bert.cpp | 116
-rw-r--r--  llama.cpp/src/models/mpt.cpp | 126
-rw-r--r--  llama.cpp/src/models/nemotron-h.cpp | 150
-rw-r--r--  llama.cpp/src/models/nemotron.cpp | 122
-rw-r--r--  llama.cpp/src/models/neo-bert.cpp | 104
-rw-r--r--  llama.cpp/src/models/olmo.cpp | 121
-rw-r--r--  llama.cpp/src/models/olmo2.cpp | 150
-rw-r--r--  llama.cpp/src/models/olmoe.cpp | 124
-rw-r--r--  llama.cpp/src/models/openai-moe-iswa.cpp | 127
-rw-r--r--  llama.cpp/src/models/openelm.cpp | 124
-rw-r--r--  llama.cpp/src/models/orion.cpp | 123
-rw-r--r--  llama.cpp/src/models/pangu-embedded.cpp | 121
-rw-r--r--  llama.cpp/src/models/phi2.cpp | 121
-rw-r--r--  llama.cpp/src/models/phi3.cpp | 152
-rw-r--r--  llama.cpp/src/models/plamo.cpp | 110
-rw-r--r--  llama.cpp/src/models/plamo2.cpp | 316
-rw-r--r--  llama.cpp/src/models/plamo3.cpp | 128
-rw-r--r--  llama.cpp/src/models/plm.cpp | 169
-rw-r--r--  llama.cpp/src/models/qwen.cpp | 108
-rw-r--r--  llama.cpp/src/models/qwen2.cpp | 126
-rw-r--r--  llama.cpp/src/models/qwen2moe.cpp | 151
-rw-r--r--  llama.cpp/src/models/qwen2vl.cpp | 117
-rw-r--r--  llama.cpp/src/models/qwen3.cpp | 117
-rw-r--r--  llama.cpp/src/models/qwen35.cpp | 740
-rw-r--r--  llama.cpp/src/models/qwen35moe.cpp | 774
-rw-r--r--  llama.cpp/src/models/qwen3moe.cpp | 124
-rw-r--r--  llama.cpp/src/models/qwen3next.cpp | 879
-rw-r--r--  llama.cpp/src/models/qwen3vl-moe.cpp | 140
-rw-r--r--  llama.cpp/src/models/qwen3vl.cpp | 132
-rw-r--r--  llama.cpp/src/models/refact.cpp | 94
-rw-r--r--  llama.cpp/src/models/rnd1.cpp | 126
-rw-r--r--  llama.cpp/src/models/rwkv6-base.cpp | 162
-rw-r--r--  llama.cpp/src/models/rwkv6.cpp | 94
-rw-r--r--  llama.cpp/src/models/rwkv6qwen2.cpp | 86
-rw-r--r--  llama.cpp/src/models/rwkv7-base.cpp | 135
-rw-r--r--  llama.cpp/src/models/rwkv7.cpp | 90
-rw-r--r--  llama.cpp/src/models/seed-oss.cpp | 124
-rw-r--r--  llama.cpp/src/models/smallthinker.cpp | 126
-rw-r--r--  llama.cpp/src/models/smollm3.cpp | 128
-rw-r--r--  llama.cpp/src/models/stablelm.cpp | 146
-rw-r--r--  llama.cpp/src/models/starcoder.cpp | 100
-rw-r--r--  llama.cpp/src/models/starcoder2.cpp | 121
-rw-r--r--  llama.cpp/src/models/step35-iswa.cpp | 168
-rw-r--r--  llama.cpp/src/models/t5-dec.cpp | 166
-rw-r--r--  llama.cpp/src/models/t5-enc.cpp | 96
-rw-r--r--  llama.cpp/src/models/wavtokenizer-dec.cpp | 149
-rw-r--r--  llama.cpp/src/models/xverse.cpp | 108
-rw-r--r--  llama.cpp/src/unicode-data.cpp | 7034
-rw-r--r--  llama.cpp/src/unicode-data.h | 20
-rw-r--r--  llama.cpp/src/unicode.cpp | 1124
-rw-r--r--  llama.cpp/src/unicode.h | 111
164 files changed, 71374 insertions, 0 deletions
diff --git a/llama.cpp/src/CMakeLists.txt b/llama.cpp/src/CMakeLists.txt
new file mode 100644
index 0000000..fdda05d
--- /dev/null
+++ b/llama.cpp/src/CMakeLists.txt
@@ -0,0 +1,165 @@
+llama_add_compile_flags()
+
+#
+# libraries
+#
+
+# llama
+
+add_library(llama
+ ../include/llama.h
+ llama.cpp
+ llama-adapter.cpp
+ llama-arch.cpp
+ llama-batch.cpp
+ llama-chat.cpp
+ llama-context.cpp
+ llama-cparams.cpp
+ llama-grammar.cpp
+ llama-graph.cpp
+ llama-hparams.cpp
+ llama-impl.cpp
+ llama-io.cpp
+ llama-kv-cache.cpp
+ llama-kv-cache-iswa.cpp
+ llama-memory.cpp
+ llama-memory-hybrid.cpp
+ llama-memory-hybrid-iswa.cpp
+ llama-memory-recurrent.cpp
+ llama-mmap.cpp
+ llama-model-loader.cpp
+ llama-model-saver.cpp
+ llama-model.cpp
+ llama-quant.cpp
+ llama-sampler.cpp
+ llama-vocab.cpp
+ unicode-data.cpp
+ unicode.cpp
+ unicode.h
+ models/afmoe.cpp
+ models/apertus.cpp
+ models/arcee.cpp
+ models/arctic.cpp
+ models/arwkv7.cpp
+ models/baichuan.cpp
+ models/bailingmoe.cpp
+ models/bailingmoe2.cpp
+ models/bert.cpp
+ models/bitnet.cpp
+ models/bloom.cpp
+ models/chameleon.cpp
+ models/chatglm.cpp
+ models/codeshell.cpp
+ models/cogvlm.cpp
+ models/cohere2-iswa.cpp
+ models/command-r.cpp
+ models/dbrx.cpp
+ models/deci.cpp
+ models/deepseek.cpp
+ models/deepseek2.cpp
+ models/dots1.cpp
+ models/dream.cpp
+ models/ernie4-5-moe.cpp
+ models/ernie4-5.cpp
+ models/exaone.cpp
+ models/exaone4.cpp
+ models/exaone-moe.cpp
+ models/falcon-h1.cpp
+ models/falcon.cpp
+ models/gemma-embedding.cpp
+ models/gemma.cpp
+ models/gemma2-iswa.cpp
+ models/gemma3.cpp
+ models/gemma3n-iswa.cpp
+ models/glm4-moe.cpp
+ models/glm4.cpp
+ models/gpt2.cpp
+ models/gptneox.cpp
+ models/granite-hybrid.cpp
+ models/granite.cpp
+ models/grok.cpp
+ models/grovemoe.cpp
+ models/hunyuan-dense.cpp
+ models/hunyuan-moe.cpp
+ models/internlm2.cpp
+ models/jais.cpp
+ models/jamba.cpp
+ models/kimi-linear.cpp
+ models/lfm2.cpp
+ models/llada-moe.cpp
+ models/llada.cpp
+ models/llama-iswa.cpp
+ models/llama.cpp
+ models/maincoder.cpp
+ models/mamba.cpp
+ models/mimo2-iswa.cpp
+ models/minicpm3.cpp
+ models/minimax-m2.cpp
+ models/modern-bert.cpp
+ models/mpt.cpp
+ models/nemotron-h.cpp
+ models/nemotron.cpp
+ models/neo-bert.cpp
+ models/olmo.cpp
+ models/olmo2.cpp
+ models/olmoe.cpp
+ models/openai-moe-iswa.cpp
+ models/openelm.cpp
+ models/orion.cpp
+ models/pangu-embedded.cpp
+ models/phi2.cpp
+ models/phi3.cpp
+ models/plamo.cpp
+ models/plamo2.cpp
+ models/plamo3.cpp
+ models/plm.cpp
+ models/qwen.cpp
+ models/qwen2.cpp
+ models/qwen2moe.cpp
+ models/qwen2vl.cpp
+ models/qwen3.cpp
+ models/qwen3vl.cpp
+ models/qwen3vl-moe.cpp
+ models/qwen3moe.cpp
+ models/qwen3next.cpp
+ models/qwen35.cpp
+ models/qwen35moe.cpp
+ models/refact.cpp
+ models/rnd1.cpp
+ models/rwkv6-base.cpp
+ models/rwkv6.cpp
+ models/rwkv6qwen2.cpp
+ models/rwkv7-base.cpp
+ models/rwkv7.cpp
+ models/seed-oss.cpp
+ models/smallthinker.cpp
+ models/smollm3.cpp
+ models/stablelm.cpp
+ models/starcoder.cpp
+ models/starcoder2.cpp
+ models/step35-iswa.cpp
+ models/t5-dec.cpp
+ models/t5-enc.cpp
+ models/wavtokenizer-dec.cpp
+ models/xverse.cpp
+ models/mistral3.cpp
+ models/graph-context-mamba.cpp
+ )
+
+set_target_properties(llama PROPERTIES
+ VERSION ${LLAMA_INSTALL_VERSION}
+ SOVERSION 0
+ MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
+)
+
+target_include_directories(llama PRIVATE .)
+target_include_directories(llama PUBLIC ../include)
+target_compile_features (llama PRIVATE cxx_std_17) # don't bump
+
+target_link_libraries(llama PUBLIC ggml)
+
+if (BUILD_SHARED_LIBS)
+ set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
+ target_compile_definitions(llama PRIVATE LLAMA_BUILD)
+ target_compile_definitions(llama PUBLIC LLAMA_SHARED)
+endif()
diff --git a/llama.cpp/src/llama-adapter.cpp b/llama.cpp/src/llama-adapter.cpp
new file mode 100644
index 0000000..d6a5800
--- /dev/null
+++ b/llama.cpp/src/llama-adapter.cpp
@@ -0,0 +1,488 @@
+#include "llama-adapter.h"
+
+#include "llama-impl.h"
+#include "llama-mmap.h"
+#include "llama-model.h"
+
+#include <map>
+#include <cassert>
+#include <sstream>
+#include <stdexcept>
+
+// vec
+
+ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
+ if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+ return nullptr;
+ }
+
+ return tensors[il];
+}
+
+ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
+ ggml_tensor * layer_dir = tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = ggml_add(ctx, cur, layer_dir);
+ }
+
+ return cur;
+}
+
+bool llama_adapter_cvec::init(const llama_model & model) {
+ const auto & hparams = model.hparams;
+
+ GGML_ASSERT(tensors.empty());
+ GGML_ASSERT(ctxs.empty());
+ GGML_ASSERT(bufs.empty());
+
+ // create a context for each buffer type
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+ auto it = ctx_map.find(buft);
+ if (it == ctx_map.end()) {
+ ggml_init_params params = {
+ /*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context * ctx = ggml_init(params);
+ if (!ctx) {
+ return nullptr;
+ }
+
+ ctx_map[buft] = ctx;
+ ctxs.emplace_back(ctx);
+
+ return ctx;
+ }
+
+ return it->second;
+ };
+
+ // make tensors
+ tensors.reserve(hparams.n_layer);
+ tensors.push_back(nullptr); // there's never a tensor for layer 0
+ for (size_t il = 1; il < hparams.n_layer; il++) {
+ ggml_backend_buffer_type_t buft = model.select_buft(il);
+ ggml_context * ctx = ctx_for_buft(buft);
+ if (!ctx) {
+ LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+ return false;
+ }
+ ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+ tensors.push_back(tensor);
+ }
+
+ // allocate tensors / buffers and zero
+ bufs.reserve(ctx_map.size());
+ for (auto it : ctx_map) {
+ ggml_backend_buffer_type_t buft = it.first;
+ ggml_context * ctx = it.second;
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+ if (!buf) {
+ LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+ return false;
+ }
+ ggml_backend_buffer_clear(buf, 0);
+ bufs.emplace_back(buf);
+ }
+
+ return true;
+}
+
+bool llama_adapter_cvec::apply(
+ const llama_model & model,
+ const float * data,
+ size_t len,
+ int32_t n_embd,
+ int32_t il_start,
+ int32_t il_end) {
+ const auto & hparams = model.hparams;
+
+ if (data == nullptr) {
+ // disable the current control vector (but leave allocated for later)
+ layer_start = -1;
+ layer_end = -1;
+ return true;
+ }
+
+ if (n_embd != (int) hparams.n_embd) {
+ LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+ return false;
+ }
+
+ if (tensors.empty()) {
+ if (!init(model)) {
+ return false;
+ }
+ }
+
+ layer_start = il_start;
+ layer_end = il_end;
+
+ for (size_t il = 1; il < hparams.n_layer; il++) {
+ assert(tensors[il] != nullptr);
+
+ const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+ if (off + n_embd <= len) {
+ ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
+ }
+ }
+
+ return true;
+}
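+
+// Illustrative sketch (hypothetical caller code, not part of the adapter
+// itself): how the flat buffer passed to llama_adapter_cvec::apply() could be
+// laid out. Layer 0 never has a tensor, so layer il's direction lives at
+// offset (il - 1) * n_embd:
+//
+//   std::vector<float> cv((size_t) (n_layer - 1) * n_embd, 0.0f);
+//   for (int32_t il = 1; il < n_layer; il++) {
+//       float * dst = cv.data() + (size_t) (il - 1) * n_embd;
+//       // ... fill dst[0..n_embd-1] with the control direction for layer il ...
+//   }
+//   cvec.apply(model, cv.data(), cv.size(), n_embd, /*il_start=*/1, /*il_end=*/n_layer - 1);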
+
+// lora
+
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
+ const std::string name(w->name);
+
+ const auto pos = ab_map.find(name);
+ if (pos != ab_map.end()) {
+ return &pos->second;
+ }
+
+ return nullptr;
+}
+
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
+ LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
+
+ ggml_context * ctx_init;
+ gguf_init_params meta_gguf_params = {
+ /* .no_alloc = */ true,
+ /* .ctx = */ &ctx_init,
+ };
+
+ gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
+ if (!ctx_gguf) {
+ throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
+ }
+
+ ggml_context_ptr ctx { ctx_init };
+
+ // check metadata
+ {
+ const gguf_context * gguf_ctx = ctx_gguf.get();
+
+ LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);
+
+ // get metadata as string
+ for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
+ gguf_type type = gguf_get_kv_type(gguf_ctx, i);
+ const std::string type_name =
+ type == GGUF_TYPE_ARRAY
+ ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
+ : gguf_type_name(type);
+ const char * name = gguf_get_key(gguf_ctx, i);
+ const std::string value = gguf_kv_to_str(gguf_ctx, i);
+
+ if (type != GGUF_TYPE_ARRAY) {
+ adapter.gguf_kv.emplace(name, value);
+ }
+
+ const size_t MAX_VALUE_LEN = 40;
+ std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
+ replace_all(print_value, "\n", "\\n");
+
+ LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
+ }
+
+ auto get_kv_str = [&](const std::string & key) -> std::string {
+ int id = gguf_find_key(gguf_ctx, key.c_str());
+ return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
+ };
+ auto get_kv_f32 = [&](const std::string & key) -> float {
+ int id = gguf_find_key(gguf_ctx, key.c_str());
+ return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
+ };
+ LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+ auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
+ if (general_type != "adapter") {
+ throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
+ }
+
+ auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
+ auto general_arch = llm_arch_from_string(general_arch_str);
+ if (general_arch != model.arch) {
+ throw std::runtime_error("model arch and LoRA arch mismatch");
+ }
+
+ auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
+ if (adapter_type != "lora") {
+ throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
+ }
+
+ adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
+
+ // parse alora invocation sequence vector
+ const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
+ const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
+ if (kid >= 0) {
+ if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
+ throw std::runtime_error("invalid gguf type for " + key);
+ }
+ const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
+ if (arr_type != GGUF_TYPE_UINT32) {
+ throw std::runtime_error("invalid gguf element type for " + key);
+ }
+ const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
+ const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
+ adapter.alora_invocation_tokens.resize(seq_len);
+ std::copy(
+ (const llama_token *)data,
+ (const llama_token *)data + seq_len,
+ adapter.alora_invocation_tokens.begin());
+ }
+ }
+
+ int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
+
+ // contexts for each buffer type
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+ auto it = ctx_map.find(buft);
+ if (it == ctx_map.end()) {
+ // add a new context
+ ggml_init_params params = {
+ /*.mem_size =*/ n_tensors*ggml_tensor_overhead(),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+ ggml_context * buft_ctx = ggml_init(params);
+ if (!buft_ctx) {
+ return nullptr;
+ }
+ ctx_map[buft] = buft_ctx;
+ adapter.ctxs.emplace_back(buft_ctx);
+ return buft_ctx;
+ }
+ return it->second;
+ };
+
+ // bundle lora_a and lora_b into pairs
+ std::map<std::string, llama_adapter_lora_weight> ab_map;
+ auto str_endswith = [](const std::string & str, const std::string & suffix) {
+ return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+ };
+
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
+ std::string name(cur->name);
+ if (str_endswith(name, ".lora_a")) {
+ replace_all(name, ".lora_a", "");
+ if (ab_map.find(name) == ab_map.end()) {
+ ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
+ } else {
+ ab_map[name].a = cur;
+ }
+ } else if (str_endswith(name, ".lora_b")) {
+ replace_all(name, ".lora_b", "");
+ if (ab_map.find(name) == ab_map.end()) {
+ ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
+ } else {
+ ab_map[name].b = cur;
+ }
+ } else if (str_endswith(name, "_norm.weight")) {
+ // TODO: add support for norm vector
+ // for now, we don't really care because most adapters still work fine without it
+ continue;
+ } else {
+ throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
+ }
+ }
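+
+ // For example (hypothetical tensor names): the entries
+ // "blk.0.attn_q.weight.lora_a" and "blk.0.attn_q.weight.lora_b" are both
+ // bundled under the key "blk.0.attn_q.weight", so each ab_map entry ends up
+ // holding both halves of one low-rank decomposition.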
+
+ // get extra buffer types of the CPU
+ // TODO: a more general solution for non-CPU extra buft should be implemented in the future
+ // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
+ std::vector<ggml_backend_buffer_type_t> buft_extra;
+ {
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!cpu_dev) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+ ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+
+ if (ggml_backend_dev_get_extra_bufts_fn) {
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+ while (extra_bufts && *extra_bufts) {
+ buft_extra.emplace_back(*extra_bufts);
+ ++extra_bufts;
+ }
+ }
+ }
+
+ // add tensors
+ for (auto & it : ab_map) {
+ const std::string & name = it.first;
+ llama_adapter_lora_weight & w = it.second;
+ bool is_token_embd = str_endswith(name, "token_embd.weight");
+
+ if (!w.a || !w.b) {
+ throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
+ }
+
+ // device buft and device ctx
+ const auto * model_tensor = model.get_tensor(name.c_str());
+ if (!model_tensor) {
+ throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
+ }
+
+ auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);
+
+ // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
+ for (auto & ex : buft_extra) {
+ if (ex == buft) {
+ LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!cpu_dev) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
+ buft = ggml_backend_dev_buffer_type(cpu_dev);
+
+ break;
+ }
+ }
+
+ LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+ ggml_context * dev_ctx = ctx_for_buft(buft);
+ // validate tensor shape
+ if (is_token_embd) {
+ // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
+ if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+ throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+ }
+ } else {
+ if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+ throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+ }
+ if (w.a->ne[1] != w.b->ne[0]) {
+ throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+ }
+ }
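+
+ // Worked example of the checks above (illustrative sizes): for a regular
+ // weight with model_tensor->ne = [4096, 4096] and rank 16, the adapter must
+ // provide a->ne = [4096, 16] and b->ne = [16, 4096]; for token_embd the A/B
+ // comparison is flipped to match llm_build_inp_embd().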
+
+ // save tensor to adapter
+ ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+ ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
+ ggml_set_name(tensor_a, w.a->name);
+ ggml_set_name(tensor_b, w.b->name);
+ adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
+ }
+
+ // allocate tensors / buffers and zero
+ {
+ adapter.ctxs.reserve(ctx_map.size());
+ adapter.bufs.reserve(ctx_map.size());
+ for (auto & it : ctx_map) {
+ ggml_backend_buffer_type_t buft = it.first;
+ ggml_context * ctx_dev = it.second;
+ ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
+ if (!buf) {
+ throw std::runtime_error("failed to allocate buffer for lora adapter\n");
+ }
+ LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
+ adapter.bufs.emplace_back(std::move(buf));
+ }
+ }
+
+ // set tensor data
+ {
+ llama_file gguf_file(path_lora, "rb");
+ std::vector<uint8_t> read_buf;
+ auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
+ size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
+ size_t size = ggml_nbytes(orig);
+ read_buf.resize(size);
+ gguf_file.seek(offs, SEEK_SET);
+ gguf_file.read_raw(read_buf.data(), size);
+ ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
+ };
+ for (auto & it : adapter.ab_map) {
+ auto orig = ab_map[it.first];
+ auto dev = it.second;
+ set_tensor(orig.a, dev.a);
+ set_tensor(orig.b, dev.b);
+ }
+ }
+
+ // register adapter with model
+ model.loras.insert(&adapter);
+
+ LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
+}
+
+llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
+ llama_adapter_lora * adapter = new llama_adapter_lora();
+
+ try {
+ llama_adapter_lora_init_impl(*model, path_lora, *adapter);
+ return adapter;
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
+
+ delete adapter;
+ }
+
+ return nullptr;
+}
+
+int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
+ const auto & it = adapter->gguf_kv.find(key);
+ if (it == adapter->gguf_kv.end()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
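+
+// Usage sketch for the metadata accessors (key taken from LLM_KV_NAMES; the
+// return value is the full value length as reported by snprintf, so callers
+// can detect truncation):
+//
+//   char buf[128];
+//   const int32_t n = llama_adapter_meta_val_str(adapter, "adapter.lora.alpha", buf, sizeof(buf));
+//   if (n >= 0 && (size_t) n < sizeof(buf)) { /* buf holds the full value */ }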
+
+int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
+ return (int)adapter->gguf_kv.size();
+}
+
+int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = adapter->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = adapter->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+void llama_adapter_lora_free(llama_adapter_lora *) {
+ // deprecated: adapters are freed by llama_model's destructor
+}
+
+uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
+ if (!adapter) {
+ return 0;
+ }
+ return adapter->alora_invocation_tokens.size();
+}
+
+const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
+ GGML_ASSERT(adapter);
+ return adapter->alora_invocation_tokens.data();
+}
diff --git a/llama.cpp/src/llama-adapter.h b/llama.cpp/src/llama-adapter.h
new file mode 100644
index 0000000..d275d25
--- /dev/null
+++ b/llama.cpp/src/llama-adapter.h
@@ -0,0 +1,86 @@
+#pragma once
+
+#include "llama.h"
+
+#include "ggml-cpp.h"
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+// TODO: pimpl
+
+//
+// llama_adapter_cvec
+//
+
+struct llama_adapter_cvec {
+ ggml_tensor * tensor_for(int il) const;
+
+ ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const;
+
+ bool apply(
+ const llama_model & model,
+ const float * data,
+ size_t len,
+ int32_t n_embd,
+ int32_t il_start,
+ int32_t il_end);
+
+private:
+ bool init(const llama_model & model);
+
+ int32_t layer_start = -1;
+ int32_t layer_end = -1;
+
+ std::vector<ggml_context_ptr> ctxs;
+ std::vector<ggml_backend_buffer_ptr> bufs;
+
+ std::vector<ggml_tensor *> tensors; // per layer
+};
+
+//
+// llama_adapter_lora
+//
+
+struct llama_adapter_lora_weight {
+ ggml_tensor * a = nullptr;
+ ggml_tensor * b = nullptr;
+
+ // get actual scale based on rank and alpha
+ float get_scale(float alpha, float adapter_scale) const {
+ const float rank = (float) b->ne[0];
+ const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
+ return scale;
+ }
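+
+ // e.g. alpha = 16, rank (b->ne[0]) = 8, adapter_scale = 1.0f => scale = 2.0f;
+ // when alpha == 0, adapter_scale is used unchanged.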
+
+ llama_adapter_lora_weight() = default;
+ llama_adapter_lora_weight(ggml_tensor * a, ggml_tensor * b) : a(a), b(b) {}
+};
+
+struct llama_adapter_lora {
+ // map tensor name to lora_a_b
+ std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
+
+ std::vector<ggml_context_ptr> ctxs;
+ std::vector<ggml_backend_buffer_ptr> bufs;
+
+ float alpha;
+
+ // gguf metadata
+ std::unordered_map<std::string, std::string> gguf_kv;
+
+ // activated lora (aLoRA)
+ std::vector<llama_token> alora_invocation_tokens;
+
+ llama_adapter_lora() = default;
+ ~llama_adapter_lora() = default;
+
+ llama_adapter_lora_weight * get_weight(ggml_tensor * w);
+
+ uint32_t get_n_nodes() const {
+ return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
+ }
+};
+
+using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
diff --git a/llama.cpp/src/llama-arch.cpp b/llama.cpp/src/llama-arch.cpp
new file mode 100644
index 0000000..a943d40
--- /dev/null
+++ b/llama.cpp/src/llama-arch.cpp
@@ -0,0 +1,2757 @@
+#include "llama-arch.h"
+
+#include "llama-impl.h"
+
+#include <map>
+#include <set>
+
+static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+ { LLM_ARCH_CLIP, "clip" }, // dummy, only used by llama-quantize
+ { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_LLAMA4, "llama4" },
+ { LLM_ARCH_DECI, "deci" },
+ { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GROK, "grok" },
+ { LLM_ARCH_GPT2, "gpt2" },
+ { LLM_ARCH_GPTJ, "gptj" },
+ { LLM_ARCH_GPTNEOX, "gptneox" },
+ { LLM_ARCH_MPT, "mpt" },
+ { LLM_ARCH_BAICHUAN, "baichuan" },
+ { LLM_ARCH_STARCODER, "starcoder" },
+ { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_BERT, "bert" },
+ { LLM_ARCH_MODERN_BERT, "modern-bert" },
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+ { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
+ { LLM_ARCH_NEO_BERT, "neo-bert" },
+ { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+ { LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" },
+ { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
+ { LLM_ARCH_QWEN, "qwen" },
+ { LLM_ARCH_QWEN2, "qwen2" },
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
+ { LLM_ARCH_QWEN2VL, "qwen2vl" },
+ { LLM_ARCH_QWEN3, "qwen3" },
+ { LLM_ARCH_QWEN3MOE, "qwen3moe" },
+ { LLM_ARCH_QWEN3NEXT, "qwen3next" },
+ { LLM_ARCH_QWEN3VL, "qwen3vl" },
+ { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
+ { LLM_ARCH_QWEN35, "qwen35" },
+ { LLM_ARCH_QWEN35MOE, "qwen35moe" },
+ { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PHI3, "phi3" },
+ { LLM_ARCH_PHIMOE, "phimoe" },
+ { LLM_ARCH_PLAMO, "plamo" },
+ { LLM_ARCH_PLAMO2, "plamo2" },
+ { LLM_ARCH_PLAMO3, "plamo3" },
+ { LLM_ARCH_CODESHELL, "codeshell" },
+ { LLM_ARCH_ORION, "orion" },
+ { LLM_ARCH_INTERNLM2, "internlm2" },
+ { LLM_ARCH_MINICPM, "minicpm" },
+ { LLM_ARCH_MINICPM3, "minicpm3" },
+ { LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_GEMMA2, "gemma2" },
+ { LLM_ARCH_GEMMA3, "gemma3" },
+ { LLM_ARCH_GEMMA3N, "gemma3n" },
+ { LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" },
+ { LLM_ARCH_STARCODER2, "starcoder2" },
+ { LLM_ARCH_MAMBA, "mamba" },
+ { LLM_ARCH_MAMBA2, "mamba2" },
+ { LLM_ARCH_JAMBA, "jamba" },
+ { LLM_ARCH_FALCON_H1, "falcon-h1" },
+ { LLM_ARCH_XVERSE, "xverse" },
+ { LLM_ARCH_COMMAND_R, "command-r" },
+ { LLM_ARCH_COHERE2, "cohere2" },
+ { LLM_ARCH_DBRX, "dbrx" },
+ { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_OLMO2, "olmo2" },
+ { LLM_ARCH_OLMOE, "olmoe" },
+ { LLM_ARCH_OPENELM, "openelm" },
+ { LLM_ARCH_ARCTIC, "arctic" },
+ { LLM_ARCH_DEEPSEEK, "deepseek" },
+ { LLM_ARCH_DEEPSEEK2, "deepseek2" },
+ { LLM_ARCH_CHATGLM, "chatglm" },
+ { LLM_ARCH_GLM4, "glm4" },
+ { LLM_ARCH_GLM4_MOE, "glm4moe" },
+ { LLM_ARCH_BITNET, "bitnet" },
+ { LLM_ARCH_T5, "t5" },
+ { LLM_ARCH_T5ENCODER, "t5encoder" },
+ { LLM_ARCH_JAIS, "jais" },
+ { LLM_ARCH_NEMOTRON, "nemotron" },
+ { LLM_ARCH_NEMOTRON_H, "nemotron_h" },
+ { LLM_ARCH_NEMOTRON_H_MOE, "nemotron_h_moe" },
+ { LLM_ARCH_EXAONE, "exaone" },
+ { LLM_ARCH_EXAONE4, "exaone4" },
+ { LLM_ARCH_EXAONE_MOE, "exaone-moe" },
+ { LLM_ARCH_RWKV6, "rwkv6" },
+ { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
+ { LLM_ARCH_RWKV7, "rwkv7" },
+ { LLM_ARCH_ARWKV7, "arwkv7" },
+ { LLM_ARCH_GRANITE, "granite" },
+ { LLM_ARCH_GRANITE_MOE, "granitemoe" },
+ { LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
+ { LLM_ARCH_CHAMELEON, "chameleon" },
+ { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+ { LLM_ARCH_PLM, "plm" },
+ { LLM_ARCH_BAILINGMOE, "bailingmoe" },
+ { LLM_ARCH_BAILINGMOE2, "bailingmoe2" },
+ { LLM_ARCH_DOTS1, "dots1" },
+ { LLM_ARCH_ARCEE, "arcee" },
+ { LLM_ARCH_AFMOE, "afmoe" },
+ { LLM_ARCH_ERNIE4_5, "ernie4_5" },
+ { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
+ { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
+ { LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
+ { LLM_ARCH_SMOLLM3, "smollm3" },
+ { LLM_ARCH_OPENAI_MOE, "gpt-oss" },
+ { LLM_ARCH_LFM2, "lfm2" },
+ { LLM_ARCH_LFM2MOE, "lfm2moe" },
+ { LLM_ARCH_DREAM, "dream" },
+ { LLM_ARCH_SMALLTHINKER, "smallthinker" },
+ { LLM_ARCH_LLADA, "llada" },
+ { LLM_ARCH_LLADA_MOE, "llada-moe" },
+ { LLM_ARCH_SEED_OSS, "seed_oss" },
+ { LLM_ARCH_GROVEMOE, "grovemoe" },
+ { LLM_ARCH_APERTUS, "apertus" },
+ { LLM_ARCH_MINIMAX_M2, "minimax-m2" },
+ { LLM_ARCH_COGVLM, "cogvlm" },
+ { LLM_ARCH_RND1, "rnd1" },
+ { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
+ { LLM_ARCH_MISTRAL3, "mistral3" },
+ { LLM_ARCH_MIMO2, "mimo2" },
+ { LLM_ARCH_STEP35, "step35" },
+ { LLM_ARCH_LLAMA_EMBED, "llama-embed" },
+ { LLM_ARCH_MAINCODER, "maincoder" },
+ { LLM_ARCH_KIMI_LINEAR, "kimi-linear" },
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
+};
+
+static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+ { LLM_KV_GENERAL_TYPE, "general.type" },
+ { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
+ { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
+ { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
+ { LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
+ { LLM_KV_GENERAL_SAMPLING_SEQUENCE, "general.sampling.sequence" },
+ { LLM_KV_GENERAL_SAMPLING_TOP_K, "general.sampling.top_k" },
+ { LLM_KV_GENERAL_SAMPLING_TOP_P, "general.sampling.top_p" },
+ { LLM_KV_GENERAL_SAMPLING_MIN_P, "general.sampling.min_p" },
+ { LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, "general.sampling.xtc_probability" },
+ { LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD, "general.sampling.xtc_threshold" },
+ { LLM_KV_GENERAL_SAMPLING_TEMP, "general.sampling.temp" },
+ { LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N, "general.sampling.penalty_last_n" },
+ { LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT, "general.sampling.penalty_repeat" },
+ { LLM_KV_GENERAL_SAMPLING_MIROSTAT, "general.sampling.mirostat" },
+ { LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU, "general.sampling.mirostat_tau" },
+ { LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA, "general.sampling.mirostat_eta" },
+ { LLM_KV_GENERAL_NAME, "general.name" },
+ { LLM_KV_GENERAL_AUTHOR, "general.author" },
+ { LLM_KV_GENERAL_VERSION, "general.version" },
+ { LLM_KV_GENERAL_URL, "general.url" },
+ { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
+ { LLM_KV_GENERAL_LICENSE, "general.license" },
+ { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
+ { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
+
+ { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
+ { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
+ { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+ { LLM_KV_EMBEDDING_LENGTH_OUT, "%s.embedding_length_out" },
+ { LLM_KV_FEATURES_LENGTH, "%s.features_length" },
+ { LLM_KV_BLOCK_COUNT, "%s.block_count" },
+ { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
+ { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
+ { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+ { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+ { LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, "%s.expert_chunk_feed_forward_length" },
+ { LLM_KV_SWIGLU_CLAMP_EXP, "%s.swiglu_clamp_exp" },
+ { LLM_KV_SWIGLU_CLAMP_SHEXP, "%s.swiglu_clamp_shexp" },
+ { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
+ { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+ { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
+ { LLM_KV_EXPERT_GROUP_COUNT, "%s.expert_group_count" },
+ { LLM_KV_EXPERT_GROUP_USED_COUNT, "%s.expert_group_used_count" },
+ { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+ { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
+ { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
+ { LLM_KV_EXPERT_GROUP_SCALE, "%s.expert_group_scale" },
+ { LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
+ { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
+ { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
+ { LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
+ { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
+ { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
+ { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
+ { LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" },
+ { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
+ { LLM_KV_ROUTER_LOGIT_SOFTCAPPING, "%s.router_logit_softcapping" },
+ { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
+ { LLM_KV_SWIN_NORM, "%s.swin_norm" },
+ { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
+ { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
+ { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
+ { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
+ { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
+ { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
+ { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
+ { LLM_KV_FULL_ATTENTION_INTERVAL, "%s.full_attention_interval" },
+
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_DECAY_LORA_RANK, "%s.attention.decay_lora_rank" },
+ { LLM_KV_ATTENTION_ICLR_LORA_RANK, "%s.attention.iclr_lora_rank" },
+ { LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, "%s.attention.value_residual_mix_lora_rank" },
+ { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
+ { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
+ { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
+ { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
+
+ { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+ { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
+ { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
+ { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
+ { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
+ { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
+ { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+ { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
+ { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
+ { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+ { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
+ { LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" },
+ { LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" },
+ { LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" },
+ { LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" },
+
+ { LLM_KV_SPLIT_NO, "split.no" },
+ { LLM_KV_SPLIT_COUNT, "split.count" },
+ { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
+
+ { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
+ { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
+ { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
+ { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+ { LLM_KV_SSM_GROUP_COUNT, "%s.ssm.group_count" },
+ { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
+
+ { LLM_KV_KDA_HEAD_DIM, "%s.kda.head_dim" },
+
+ { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
+
+ { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" },
+ { LLM_KV_POSNET_BLOCK_COUNT, "%s.posnet.block_count" },
+
+ { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
+ { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" },
+
+ { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
+
+ { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
+ // sentence-transformers dense modules feature dims
+ { LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" },
+ { LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
+ { LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" },
+ { LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },
+
+ { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+ { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
+ { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
+ { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
+ { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
+ { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
+ { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
+ { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
+ { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
+ { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
+ { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
+ { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
+ { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
+ { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
+ { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
+ { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+ { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
+ { LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" },
+ { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
+ { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
+ { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
+ { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
+ { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+ { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
+ { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
+ { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
+ { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
+ { LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" },
+ { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
+ { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
+
+ { LLM_KV_ADAPTER_TYPE, "adapter.type" },
+ { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+ { LLM_KV_ADAPTER_LORA_TASK_NAME, "adapter.lora.task_name" },
+ { LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },
+ { LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, "adapter.alora.invocation_tokens" },
+
+ { LLM_KV_XIELU_ALPHA_N, "xielu.alpha_n" },
+ { LLM_KV_XIELU_ALPHA_P, "xielu.alpha_p" },
+ { LLM_KV_XIELU_BETA, "xielu.beta" },
+ { LLM_KV_XIELU_EPS, "xielu.eps" },
+
+ // deprecated
+ { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
+};
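+
+// The "%s" in the keys above is replaced with the architecture name by
+// LLM_KV::operator(); e.g. for LLM_ARCH_LLAMA, LLM_KV_CONTEXT_LENGTH resolves
+// to "llama.context_length".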
+
+static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT_NORM_LFM2, "token_embd_norm" }, // fix for wrong tensor name
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+ { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
+ { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_CLS, "cls" },
+ { LLM_TENSOR_CLS_OUT, "cls.output" },
+ { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+ { LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
+ { LLM_TENSOR_SSM_ALPHA, "blk.%d.ssm_alpha" },
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+ { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+ { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+ { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" },
+ { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" },
+ { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" },
+ { LLM_TENSOR_SSM_CONV1D_Q, "blk.%d.ssm_conv1d_q" },
+ { LLM_TENSOR_SSM_CONV1D_K, "blk.%d.ssm_conv1d_k" },
+ { LLM_TENSOR_SSM_CONV1D_V, "blk.%d.ssm_conv1d_v" },
+ { LLM_TENSOR_SSM_F_A, "blk.%d.ssm_f_a" },
+ { LLM_TENSOR_SSM_F_B, "blk.%d.ssm_f_b" },
+ { LLM_TENSOR_SSM_BETA, "blk.%d.ssm_beta" },
+ { LLM_TENSOR_SSM_G_A, "blk.%d.ssm_g_a" },
+ { LLM_TENSOR_SSM_G_B, "blk.%d.ssm_g_b" },
+ { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
+ { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
+ { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
+ { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
+ { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+ { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+ { LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "per_layer_token_embd" },
+ { LLM_TENSOR_PER_LAYER_MODEL_PROJ, "per_layer_model_proj" },
+ { LLM_TENSOR_PER_LAYER_PROJ_NORM, "per_layer_proj_norm" },
+ { LLM_TENSOR_ALTUP_UNEMBD_PROJ, "altup_unembd_proj" },
+ { LLM_TENSOR_ALTUP_PROJ, "altup_proj" },
+ { LLM_TENSOR_PER_LAYER_INP_GATE, "blk.%d.inp_gate" },
+ { LLM_TENSOR_PER_LAYER_PROJ, "blk.%d.proj" },
+ { LLM_TENSOR_PER_LAYER_POST_NORM, "blk.%d.post_norm" },
+ { LLM_TENSOR_ALTUP_CORRECT_COEF, "blk.%d.altup_correct_coef" },
+ { LLM_TENSOR_ALTUP_CORRECT_SCALE, "blk.%d.altup_correct_scale" },
+ { LLM_TENSOR_ALTUP_PREDICT_COEF, "blk.%d.altup_predict_coef" },
+ { LLM_TENSOR_ALTUP_ROUTER, "blk.%d.altup_router" },
+ { LLM_TENSOR_ALTUP_ROUTER_NORM, "blk.%d.altup_router_norm" },
+ { LLM_TENSOR_LAUREL_L, "blk.%d.laurel_l" },
+ { LLM_TENSOR_LAUREL_R, "blk.%d.laurel_r" },
+ { LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" },
+ { LLM_TENSOR_DENSE_2_OUT, "dense_2" },
+ { LLM_TENSOR_DENSE_3_OUT, "dense_3" },
+ { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
+ { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" },
+ { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" },
+ { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" },
+ { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" },
+ { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" },
+ { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" },
+ { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+ { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+ { LLM_TENSOR_ATTN_SUB_NORM, "blk.%d.attn_sub_norm" },
+ { LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" },
+ { LLM_TENSOR_DEC_OUTPUT_NORM, "dec.output_norm" },
+ { LLM_TENSOR_DEC_ATTN_NORM, "dec.blk.%d.attn_norm" },
+ { LLM_TENSOR_DEC_ATTN_Q, "dec.blk.%d.attn_q" },
+ { LLM_TENSOR_DEC_ATTN_K, "dec.blk.%d.attn_k" },
+ { LLM_TENSOR_DEC_ATTN_V, "dec.blk.%d.attn_v" },
+ { LLM_TENSOR_DEC_ATTN_OUT, "dec.blk.%d.attn_o" },
+ { LLM_TENSOR_DEC_ATTN_REL_B, "dec.blk.%d.attn_rel_b" },
+ { LLM_TENSOR_DEC_CROSS_ATTN_NORM, "dec.blk.%d.cross_attn_norm" },
+ { LLM_TENSOR_DEC_CROSS_ATTN_Q, "dec.blk.%d.cross_attn_q" },
+ { LLM_TENSOR_DEC_CROSS_ATTN_K, "dec.blk.%d.cross_attn_k" },
+ { LLM_TENSOR_DEC_CROSS_ATTN_V, "dec.blk.%d.cross_attn_v" },
+ { LLM_TENSOR_DEC_CROSS_ATTN_OUT, "dec.blk.%d.cross_attn_o" },
+ { LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "dec.blk.%d.cross_attn_rel_b" },
+ { LLM_TENSOR_DEC_FFN_NORM, "dec.blk.%d.ffn_norm" },
+ { LLM_TENSOR_DEC_FFN_GATE, "dec.blk.%d.ffn_gate" },
+ { LLM_TENSOR_DEC_FFN_DOWN, "dec.blk.%d.ffn_down" },
+ { LLM_TENSOR_DEC_FFN_UP, "dec.blk.%d.ffn_up" },
+ { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" },
+ { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" },
+ { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" },
+ { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" },
+ { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" },
+ { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" },
+ { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" },
+ { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" },
+ { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" },
+ { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
+ { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
+ { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
+ { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
+ { LLM_TENSOR_TIME_MIX_LERP_W, "blk.%d.time_mix_lerp_w" },
+ { LLM_TENSOR_TIME_MIX_LERP_K, "blk.%d.time_mix_lerp_k" },
+ { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
+ { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
+ { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
+ { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
+ { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
+ { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
+ { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
+ { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
+ { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
+ { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
+ { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+ { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
+ { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
+ { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
+ { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" },
+ { LLM_TENSOR_CHANNEL_MIX_LERP_R, "blk.%d.channel_mix_lerp_r" },
+ { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" },
+ { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" },
+ { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
+ { LLM_TENSOR_TIME_MIX_W0, "blk.%d.time_mix_w0" },
+ { LLM_TENSOR_TIME_MIX_A0, "blk.%d.time_mix_a0" },
+ { LLM_TENSOR_TIME_MIX_A1, "blk.%d.time_mix_a1" },
+ { LLM_TENSOR_TIME_MIX_A2, "blk.%d.time_mix_a2" },
+ { LLM_TENSOR_TIME_MIX_V0, "blk.%d.time_mix_v0" },
+ { LLM_TENSOR_TIME_MIX_V1, "blk.%d.time_mix_v1" },
+ { LLM_TENSOR_TIME_MIX_V2, "blk.%d.time_mix_v2" },
+ { LLM_TENSOR_TIME_MIX_G1, "blk.%d.time_mix_g1" },
+ { LLM_TENSOR_TIME_MIX_G2, "blk.%d.time_mix_g2" },
+ { LLM_TENSOR_TIME_MIX_K_K, "blk.%d.time_mix_k_k" },
+ { LLM_TENSOR_TIME_MIX_K_A, "blk.%d.time_mix_k_a" },
+ { LLM_TENSOR_TIME_MIX_R_K, "blk.%d.time_mix_r_k" },
+ { LLM_TENSOR_CONV1D, "conv1d" },
+ { LLM_TENSOR_CONVNEXT_DW, "convnext.%d.dw" },
+ { LLM_TENSOR_CONVNEXT_NORM, "convnext.%d.norm" },
+ { LLM_TENSOR_CONVNEXT_PW1, "convnext.%d.pw1" },
+ { LLM_TENSOR_CONVNEXT_PW2, "convnext.%d.pw2" },
+ { LLM_TENSOR_CONVNEXT_GAMMA, "convnext.%d.gamma" },
+ { LLM_TENSOR_POS_NET_CONV1, "posnet.%d.conv1" },
+ { LLM_TENSOR_POS_NET_CONV2, "posnet.%d.conv2" },
+ { LLM_TENSOR_POS_NET_NORM, "posnet.%d.norm" },
+ { LLM_TENSOR_POS_NET_NORM1, "posnet.%d.norm1" },
+ { LLM_TENSOR_POS_NET_NORM2, "posnet.%d.norm2" },
+ { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" },
+ { LLM_TENSOR_POS_NET_ATTN_Q, "posnet.%d.attn_q" },
+ { LLM_TENSOR_POS_NET_ATTN_K, "posnet.%d.attn_k" },
+ { LLM_TENSOR_POS_NET_ATTN_V, "posnet.%d.attn_v" },
+ { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
+ { LLM_TENSOR_ATTN_SINKS, "blk.%d.attn_sinks" },
+ { LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" },
+ { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
+ { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
+ { LLM_TENSOR_FFN_GATE_CHEXPS, "blk.%d.ffn_gate_chexps" },
+ { LLM_TENSOR_FFN_DOWN_CHEXPS, "blk.%d.ffn_down_chexps" },
+ { LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" },
+ { LLM_TENSOR_VISEXP_ATTN_QKV, "blk.%d.vis_attn_qkv" },
+ { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.vis_attn_output" },
+ { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" },
+ { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" },
+ { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
+};
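+
+// The "%d" in the names above is replaced with the layer index (and, for the
+// per-expert variants, the expert index); e.g. LLM_TENSOR_ATTN_Q for layer 0
+// becomes "blk.0.attn_q", to which the loader appends a suffix such as
+// ".weight" or ".bias".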
+
+static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
+ switch (arch) {
+ case LLM_ARCH_CLIP:
+ return {};
+ case LLM_ARCH_LLAMA:
+ case LLM_ARCH_DECI:
+ case LLM_ARCH_MISTRAL3:
+ case LLM_ARCH_LLAMA_EMBED:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_EXP,
+ LLM_TENSOR_FFN_DOWN_EXP,
+ LLM_TENSOR_FFN_UP_EXP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ };
+ case LLM_ARCH_ARCEE:
+ case LLM_ARCH_STARCODER2:
+ case LLM_ARCH_NEMOTRON:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_AFMOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_GATE,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ };
+ case LLM_ARCH_LLAMA4:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_EXP,
+ LLM_TENSOR_FFN_DOWN_EXP,
+ LLM_TENSOR_FFN_UP_EXP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ };
+ case LLM_ARCH_BAICHUAN:
+ case LLM_ARCH_ORION:
+ case LLM_ARCH_XVERSE:
+ case LLM_ARCH_EXAONE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_FALCON:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_NORM_2,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_GROK:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_EXP,
+ LLM_TENSOR_FFN_DOWN_EXP,
+ LLM_TENSOR_FFN_UP_EXP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_POST_NORM,
+ LLM_TENSOR_LAYER_OUT_NORM,
+ LLM_TENSOR_ATTN_OUT_NORM,
+ };
+ case LLM_ARCH_GPT2:
+ case LLM_ARCH_STARCODER:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_POS_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_DOWN,
+ };
+ case LLM_ARCH_GPTNEOX:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_MPT:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_ACT,
+ LLM_TENSOR_POS_EMBD,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ };
+ case LLM_ARCH_REFACT:
+ case LLM_ARCH_QWEN2:
+ case LLM_ARCH_QWEN2VL:
+ case LLM_ARCH_INTERNLM2:
+ case LLM_ARCH_GRANITE:
+ case LLM_ARCH_ERNIE4_5:
+ case LLM_ARCH_SMOLLM3:
+ case LLM_ARCH_DREAM:
+ case LLM_ARCH_LLADA:
+ case LLM_ARCH_PANGU_EMBED:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_BERT:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_TOKEN_TYPES,
+ LLM_TENSOR_POS_EMBD,
+ LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_LAYER_OUT_NORM,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_CLS,
+ LLM_TENSOR_CLS_OUT,
+ };
+ case LLM_ARCH_NOMIC_BERT:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_TOKEN_TYPES,
+ LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_LAYER_OUT_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_NOMIC_BERT_MOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_TOKEN_TYPES,
+ LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_LAYER_OUT_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ };
+ case LLM_ARCH_NEO_BERT:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_ENC_OUTPUT_NORM,
+ LLM_TENSOR_CLS,
+ LLM_TENSOR_CLS_OUT,
+ };
+ case LLM_ARCH_MODERN_BERT:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_CLS,
+ LLM_TENSOR_CLS_OUT,
+ };
+ case LLM_ARCH_JINA_BERT_V2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_TOKEN_TYPES,
+ LLM_TENSOR_ATTN_NORM_2,
+ LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_LAYER_OUT_NORM,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_CLS,
+ };
+ case LLM_ARCH_JINA_BERT_V3:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_TOKEN_TYPES,
+ LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_LAYER_OUT_NORM,
+ };
+ case LLM_ARCH_BLOOM:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_DOWN,
+ };
+ case LLM_ARCH_STABLELM:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ };
+ case LLM_ARCH_QWEN:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_QWEN2MOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ };
+ case LLM_ARCH_QWEN3:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_CLS_OUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_QWEN3MOE:
+ case LLM_ARCH_QWEN3VLMOE:
+ case LLM_ARCH_OLMOE:
+ case LLM_ARCH_LLADA_MOE:
+ case LLM_ARCH_RND1:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ };
+ case LLM_ARCH_QWEN3NEXT:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_GATE,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_SSM_A_NOSCAN,
+ LLM_TENSOR_SSM_CONV1D,
+ LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_BETA_ALPHA,
+ LLM_TENSOR_SSM_IN,
+ LLM_TENSOR_SSM_NORM,
+ LLM_TENSOR_SSM_OUT,
+ };
+ case LLM_ARCH_QWEN35:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_GATE,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_SSM_A_NOSCAN,
+ LLM_TENSOR_SSM_CONV1D,
+ LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_BETA,
+ LLM_TENSOR_SSM_ALPHA,
+ LLM_TENSOR_SSM_NORM,
+ LLM_TENSOR_SSM_OUT,
+ };
+ case LLM_ARCH_QWEN35MOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_GATE,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_SSM_A_NOSCAN,
+ LLM_TENSOR_SSM_CONV1D,
+ LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_BETA,
+ LLM_TENSOR_SSM_ALPHA,
+ LLM_TENSOR_SSM_NORM,
+ LLM_TENSOR_SSM_OUT,
+ };
+ case LLM_ARCH_QWEN3VL:
+ case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_HUNYUAN_DENSE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_PHI2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_PHI3:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FACTORS_LONG,
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_PHIMOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FACTORS_LONG,
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ };
+ case LLM_ARCH_PLAMO:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_PLAMO2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_SSM_IN,
+ LLM_TENSOR_SSM_CONV1D,
+ LLM_TENSOR_SSM_X,
+ LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_A,
+ LLM_TENSOR_SSM_D,
+ LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_SSM_DT_NORM,
+ LLM_TENSOR_SSM_B_NORM,
+ LLM_TENSOR_SSM_C_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
+ };
+ case LLM_ARCH_PLAMO3:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_CODESHELL:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_MINICPM:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ROPE_FACTORS_LONG,
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_EXP,
+ LLM_TENSOR_FFN_DOWN_EXP,
+ LLM_TENSOR_FFN_UP_EXP,
+ };
+ case LLM_ARCH_MINICPM3:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FACTORS_LONG,
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q_A_NORM,
+ LLM_TENSOR_ATTN_KV_A_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_A,
+ LLM_TENSOR_ATTN_Q_B,
+ LLM_TENSOR_ATTN_KV_A_MQA,
+ LLM_TENSOR_ATTN_KV_B,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_DOWN,
+ };
+ case LLM_ARCH_GEMMA:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_GEMMA2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_POST_NORM,
+ };
+ case LLM_ARCH_GEMMA3:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_POST_NORM,
+ };
+ case LLM_ARCH_GEMMA3N:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_POST_NORM,
+ LLM_TENSOR_PER_LAYER_TOKEN_EMBD,
+ LLM_TENSOR_PER_LAYER_MODEL_PROJ,
+ LLM_TENSOR_PER_LAYER_PROJ_NORM,
+ LLM_TENSOR_ALTUP_UNEMBD_PROJ,
+ LLM_TENSOR_ALTUP_PROJ,
+ LLM_TENSOR_PER_LAYER_INP_GATE,
+ LLM_TENSOR_PER_LAYER_PROJ,
+ LLM_TENSOR_PER_LAYER_POST_NORM,
+ LLM_TENSOR_ALTUP_CORRECT_COEF,
+ LLM_TENSOR_ALTUP_CORRECT_SCALE,
+ LLM_TENSOR_ALTUP_PREDICT_COEF,
+ LLM_TENSOR_ALTUP_ROUTER,
+ LLM_TENSOR_ALTUP_ROUTER_NORM,
+ LLM_TENSOR_LAUREL_L,
+ LLM_TENSOR_LAUREL_R,
+ LLM_TENSOR_LAUREL_POST_NORM,
+ };
+ case LLM_ARCH_GEMMA_EMBEDDING:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_DENSE_2_OUT,
+ LLM_TENSOR_DENSE_3_OUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_POST_NORM,
+ };
+ case LLM_ARCH_MAMBA:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_SSM_IN,
+ LLM_TENSOR_SSM_CONV1D,
+ LLM_TENSOR_SSM_X,
+ LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_A,
+ LLM_TENSOR_SSM_D,
+ LLM_TENSOR_SSM_OUT,
+ };
+ case LLM_ARCH_MAMBA2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_SSM_IN,
+ LLM_TENSOR_SSM_CONV1D,
+ LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_A,
+ LLM_TENSOR_SSM_D,
+ LLM_TENSOR_SSM_NORM,
+ LLM_TENSOR_SSM_OUT,
+ };
+ case LLM_ARCH_JAMBA:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_SSM_IN,
+ LLM_TENSOR_SSM_CONV1D,
+ LLM_TENSOR_SSM_X,
+ LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_DT_NORM,
+ LLM_TENSOR_SSM_A,
+ LLM_TENSOR_SSM_B_NORM,
+ LLM_TENSOR_SSM_C_NORM,
+ LLM_TENSOR_SSM_D,
+ LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ };
+ case LLM_ARCH_FALCON_H1:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_SSM_IN,
+ LLM_TENSOR_SSM_CONV1D,
+ LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_A,
+ LLM_TENSOR_SSM_D,
+ LLM_TENSOR_SSM_NORM,
+ LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_COMMAND_R:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ };
+ case LLM_ARCH_COHERE2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_DBRX:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ };
+ case LLM_ARCH_OLMO:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_OLMO2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_OPENELM:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_ARCTIC:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_NORM_EXPS,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ };
+ case LLM_ARCH_DEEPSEEK:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ };
+ case LLM_ARCH_DEEPSEEK2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q_A_NORM,
+ LLM_TENSOR_ATTN_KV_A_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_A,
+ LLM_TENSOR_ATTN_Q_B,
+ LLM_TENSOR_ATTN_KV_A_MQA,
+ LLM_TENSOR_ATTN_KV_B,
+ LLM_TENSOR_ATTN_K_B,
+ LLM_TENSOR_ATTN_V_B,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ };
+ case LLM_ARCH_PLM:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_KV_A_MQA,
+ LLM_TENSOR_ATTN_KV_A_NORM,
+ LLM_TENSOR_ATTN_KV_B,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_CHATGLM:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_DOWN,
+ };
+ case LLM_ARCH_GLM4:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
+ };
+ case LLM_ARCH_GLM4_MOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ LLM_TENSOR_NEXTN_EH_PROJ,
+ LLM_TENSOR_NEXTN_EMBED_TOKENS,
+ LLM_TENSOR_NEXTN_ENORM,
+ LLM_TENSOR_NEXTN_HNORM,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
+ };
+ case LLM_ARCH_BITNET:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_SUB_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_SUB_NORM,
+ };
+ case LLM_ARCH_T5:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_DEC_OUTPUT_NORM,
+ LLM_TENSOR_DEC_ATTN_NORM,
+ LLM_TENSOR_DEC_ATTN_Q,
+ LLM_TENSOR_DEC_ATTN_K,
+ LLM_TENSOR_DEC_ATTN_V,
+ LLM_TENSOR_DEC_ATTN_OUT,
+ LLM_TENSOR_DEC_ATTN_REL_B,
+ LLM_TENSOR_DEC_CROSS_ATTN_NORM,
+ LLM_TENSOR_DEC_CROSS_ATTN_Q,
+ LLM_TENSOR_DEC_CROSS_ATTN_K,
+ LLM_TENSOR_DEC_CROSS_ATTN_V,
+ LLM_TENSOR_DEC_CROSS_ATTN_OUT,
+ LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
+ LLM_TENSOR_DEC_FFN_NORM,
+ LLM_TENSOR_DEC_FFN_GATE,
+ LLM_TENSOR_DEC_FFN_DOWN,
+ LLM_TENSOR_DEC_FFN_UP,
+ LLM_TENSOR_ENC_OUTPUT_NORM,
+ LLM_TENSOR_ENC_ATTN_NORM,
+ LLM_TENSOR_ENC_ATTN_Q,
+ LLM_TENSOR_ENC_ATTN_K,
+ LLM_TENSOR_ENC_ATTN_V,
+ LLM_TENSOR_ENC_ATTN_OUT,
+ LLM_TENSOR_ENC_ATTN_REL_B,
+ LLM_TENSOR_ENC_FFN_NORM,
+ LLM_TENSOR_ENC_FFN_GATE,
+ LLM_TENSOR_ENC_FFN_DOWN,
+ LLM_TENSOR_ENC_FFN_UP,
+ };
+ case LLM_ARCH_T5ENCODER:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ENC_OUTPUT_NORM,
+ LLM_TENSOR_ENC_ATTN_NORM,
+ LLM_TENSOR_ENC_ATTN_Q,
+ LLM_TENSOR_ENC_ATTN_K,
+ LLM_TENSOR_ENC_ATTN_V,
+ LLM_TENSOR_ENC_ATTN_OUT,
+ LLM_TENSOR_ENC_ATTN_REL_B,
+ LLM_TENSOR_ENC_FFN_NORM,
+ LLM_TENSOR_ENC_FFN_GATE,
+ LLM_TENSOR_ENC_FFN_DOWN,
+ LLM_TENSOR_ENC_FFN_UP,
+ };
+ case LLM_ARCH_JAIS:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ };
+ case LLM_ARCH_NEMOTRON_H:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_SSM_IN,
+ LLM_TENSOR_SSM_CONV1D,
+ LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_A,
+ LLM_TENSOR_SSM_D,
+ LLM_TENSOR_SSM_NORM,
+ LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_NEMOTRON_H_MOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ // mamba(2) ssm layers
+ LLM_TENSOR_SSM_IN,
+ LLM_TENSOR_SSM_CONV1D,
+ LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_A,
+ LLM_TENSOR_SSM_D,
+ LLM_TENSOR_SSM_NORM,
+ LLM_TENSOR_SSM_OUT,
+ // attention layers
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ // dense FFN
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ // MoE FFN (for MoE layers)
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ // MoE shared expert layer
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ };
+ case LLM_ARCH_EXAONE4:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_POST_NORM,
+ };
+ case LLM_ARCH_EXAONE_MOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ LLM_TENSOR_NEXTN_EH_PROJ,
+ LLM_TENSOR_NEXTN_EMBED_TOKENS,
+ LLM_TENSOR_NEXTN_ENORM,
+ LLM_TENSOR_NEXTN_HNORM,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
+ };
+ case LLM_ARCH_RWKV6:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_NORM_2,
+ LLM_TENSOR_TIME_MIX_W1,
+ LLM_TENSOR_TIME_MIX_W2,
+ LLM_TENSOR_TIME_MIX_LERP_X,
+ LLM_TENSOR_TIME_MIX_LERP_W,
+ LLM_TENSOR_TIME_MIX_LERP_K,
+ LLM_TENSOR_TIME_MIX_LERP_V,
+ LLM_TENSOR_TIME_MIX_LERP_R,
+ LLM_TENSOR_TIME_MIX_LERP_G,
+ LLM_TENSOR_TIME_MIX_LERP_FUSED,
+ LLM_TENSOR_TIME_MIX_FIRST,
+ LLM_TENSOR_TIME_MIX_DECAY,
+ LLM_TENSOR_TIME_MIX_DECAY_W1,
+ LLM_TENSOR_TIME_MIX_DECAY_W2,
+ LLM_TENSOR_TIME_MIX_KEY,
+ LLM_TENSOR_TIME_MIX_VALUE,
+ LLM_TENSOR_TIME_MIX_RECEPTANCE,
+ LLM_TENSOR_TIME_MIX_GATE,
+ LLM_TENSOR_TIME_MIX_LN,
+ LLM_TENSOR_TIME_MIX_OUTPUT,
+ LLM_TENSOR_CHANNEL_MIX_LERP_K,
+ LLM_TENSOR_CHANNEL_MIX_LERP_R,
+ LLM_TENSOR_CHANNEL_MIX_KEY,
+ LLM_TENSOR_CHANNEL_MIX_VALUE,
+ LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
+ };
+ case LLM_ARCH_RWKV6QWEN2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_TIME_MIX_W1,
+ LLM_TENSOR_TIME_MIX_W2,
+ LLM_TENSOR_TIME_MIX_LERP_X,
+ LLM_TENSOR_TIME_MIX_LERP_FUSED,
+ LLM_TENSOR_TIME_MIX_FIRST,
+ LLM_TENSOR_TIME_MIX_DECAY,
+ LLM_TENSOR_TIME_MIX_DECAY_W1,
+ LLM_TENSOR_TIME_MIX_DECAY_W2,
+ LLM_TENSOR_TIME_MIX_KEY,
+ LLM_TENSOR_TIME_MIX_VALUE,
+ LLM_TENSOR_TIME_MIX_RECEPTANCE,
+ LLM_TENSOR_TIME_MIX_GATE,
+ LLM_TENSOR_TIME_MIX_OUTPUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_RWKV7:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_NORM_2,
+ LLM_TENSOR_TIME_MIX_W0,
+ LLM_TENSOR_TIME_MIX_W1,
+ LLM_TENSOR_TIME_MIX_W2,
+ LLM_TENSOR_TIME_MIX_A0,
+ LLM_TENSOR_TIME_MIX_A1,
+ LLM_TENSOR_TIME_MIX_A2,
+ LLM_TENSOR_TIME_MIX_V0,
+ LLM_TENSOR_TIME_MIX_V1,
+ LLM_TENSOR_TIME_MIX_V2,
+ LLM_TENSOR_TIME_MIX_G1,
+ LLM_TENSOR_TIME_MIX_G2,
+ LLM_TENSOR_TIME_MIX_K_K,
+ LLM_TENSOR_TIME_MIX_K_A,
+ LLM_TENSOR_TIME_MIX_R_K,
+ LLM_TENSOR_TIME_MIX_LERP_FUSED,
+ LLM_TENSOR_TIME_MIX_KEY,
+ LLM_TENSOR_TIME_MIX_VALUE,
+ LLM_TENSOR_TIME_MIX_RECEPTANCE,
+ LLM_TENSOR_TIME_MIX_LN,
+ LLM_TENSOR_TIME_MIX_OUTPUT,
+ LLM_TENSOR_CHANNEL_MIX_LERP_K,
+ LLM_TENSOR_CHANNEL_MIX_KEY,
+ LLM_TENSOR_CHANNEL_MIX_VALUE,
+ };
+ case LLM_ARCH_ARWKV7:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_TIME_MIX_W0,
+ LLM_TENSOR_TIME_MIX_W1,
+ LLM_TENSOR_TIME_MIX_W2,
+ LLM_TENSOR_TIME_MIX_A0,
+ LLM_TENSOR_TIME_MIX_A1,
+ LLM_TENSOR_TIME_MIX_A2,
+ LLM_TENSOR_TIME_MIX_V0,
+ LLM_TENSOR_TIME_MIX_V1,
+ LLM_TENSOR_TIME_MIX_V2,
+ LLM_TENSOR_TIME_MIX_G1,
+ LLM_TENSOR_TIME_MIX_G2,
+ LLM_TENSOR_TIME_MIX_K_K,
+ LLM_TENSOR_TIME_MIX_K_A,
+ LLM_TENSOR_TIME_MIX_R_K,
+ LLM_TENSOR_TIME_MIX_LERP_FUSED,
+ LLM_TENSOR_TIME_MIX_KEY,
+ LLM_TENSOR_TIME_MIX_VALUE,
+ LLM_TENSOR_TIME_MIX_RECEPTANCE,
+ LLM_TENSOR_TIME_MIX_LN,
+ LLM_TENSOR_TIME_MIX_OUTPUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_GRANITE_MOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ };
+ case LLM_ARCH_GRANITE_HYBRID:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_SSM_IN,
+ LLM_TENSOR_SSM_CONV1D,
+ LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_A,
+ LLM_TENSOR_SSM_D,
+ LLM_TENSOR_SSM_NORM,
+ LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ };
+ case LLM_ARCH_WAVTOKENIZER_DEC:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_CONV1D,
+ LLM_TENSOR_CONVNEXT_DW,
+ LLM_TENSOR_CONVNEXT_NORM,
+ LLM_TENSOR_CONVNEXT_PW1,
+ LLM_TENSOR_CONVNEXT_PW2,
+ LLM_TENSOR_CONVNEXT_GAMMA,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_POS_NET_CONV1,
+ LLM_TENSOR_POS_NET_CONV2,
+ LLM_TENSOR_POS_NET_NORM,
+ LLM_TENSOR_POS_NET_NORM1,
+ LLM_TENSOR_POS_NET_NORM2,
+ LLM_TENSOR_POS_NET_ATTN_NORM,
+ LLM_TENSOR_POS_NET_ATTN_Q,
+ LLM_TENSOR_POS_NET_ATTN_K,
+ LLM_TENSOR_POS_NET_ATTN_V,
+ LLM_TENSOR_POS_NET_ATTN_OUT,
+ };
+ case LLM_ARCH_BAILINGMOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ };
+ case LLM_ARCH_BAILINGMOE2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_NEXTN_EH_PROJ,
+ LLM_TENSOR_NEXTN_EMBED_TOKENS,
+ LLM_TENSOR_NEXTN_ENORM,
+ LLM_TENSOR_NEXTN_HNORM,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
+ LLM_TENSOR_LAYER_OUT_NORM,
+ };
+ case LLM_ARCH_DOTS1:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ };
+ case LLM_ARCH_ERNIE4_5_MOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ };
+ case LLM_ARCH_HUNYUAN_MOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ };
+ case LLM_ARCH_OPENAI_MOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_SINKS,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ };
+ case LLM_ARCH_LFM2:
+ return {
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_SHORTCONV_CONV,
+ LLM_TENSOR_SHORTCONV_INPROJ,
+ LLM_TENSOR_SHORTCONV_OUTPROJ,
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM_LFM2,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_DENSE_2_OUT,
+ };
+ case LLM_ARCH_LFM2MOE:
+ return {
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_SHORTCONV_CONV,
+ LLM_TENSOR_SHORTCONV_INPROJ,
+ LLM_TENSOR_SHORTCONV_OUTPROJ,
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM_LFM2,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ };
+ case LLM_ARCH_SMALLTHINKER:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ };
+ case LLM_ARCH_APERTUS:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_SEED_OSS:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_GROVEMOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_CHEXPS,
+ LLM_TENSOR_FFN_DOWN_CHEXPS,
+ LLM_TENSOR_FFN_UP_CHEXPS,
+ };
+ case LLM_ARCH_MINIMAX_M2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ };
+ case LLM_ARCH_COGVLM:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_VISEXP_ATTN_QKV,
+ LLM_TENSOR_VISEXP_ATTN_OUT,
+ LLM_TENSOR_VISEXP_FFN_GATE,
+ LLM_TENSOR_VISEXP_FFN_DOWN,
+ LLM_TENSOR_VISEXP_FFN_UP,
+ };
+ case LLM_ARCH_MIMO2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_SINKS,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ };
+ case LLM_ARCH_STEP35:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ROPE_FACTORS_LONG,
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_GATE,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ };
+ case LLM_ARCH_GPTJ:
+ case LLM_ARCH_UNKNOWN:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ };
+ case LLM_ARCH_MAINCODER:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
+ case LLM_ARCH_KIMI_LINEAR:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ // Dense FFN (layer 0 only)
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ // MoE FFN (layers 1+)
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ // Shared experts
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ // KDA (using SSM_ enum prefix, keeping GGUF names for backward compat)
+ LLM_TENSOR_SSM_CONV1D_Q,
+ LLM_TENSOR_SSM_CONV1D_K,
+ LLM_TENSOR_SSM_CONV1D_V,
+ LLM_TENSOR_SSM_F_A,
+ LLM_TENSOR_SSM_F_B,
+ LLM_TENSOR_SSM_BETA,
+ LLM_TENSOR_SSM_A,
+ LLM_TENSOR_SSM_G_A,
+ LLM_TENSOR_SSM_G_B,
+ LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_NORM,
+ // MLA
+ LLM_TENSOR_ATTN_Q_A,
+ LLM_TENSOR_ATTN_Q_B,
+ LLM_TENSOR_ATTN_Q_A_NORM,
+ LLM_TENSOR_ATTN_KV_A_MQA,
+ LLM_TENSOR_ATTN_KV_B,
+ LLM_TENSOR_ATTN_K_B,
+ LLM_TENSOR_ATTN_V_B,
+ LLM_TENSOR_ATTN_KV_A_NORM,
+ };
+ default:
+ GGML_ABORT("unknown architecture for tensor mapping");
+ }
+}
+
+// declares information about the model weight tensors:
+// - the layer in which the tensor is used. this is needed to assign the correct buffer type for the weight
+// - the operator that consumes the weight. this is needed to determine whether the respective backend supports the operator
+//
+// for example, input layers are usually assigned to CPU/host buffer types
+//
+// a mismatch between the declared information and the actual layer/op in which the tensor is used can lead to
+// sub-optimal buffer type assignment and extra overhead during computation
+// example: https://github.com/ggml-org/llama.cpp/pull/17548
+//
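+// for example, LLM_TENSOR_TOKEN_EMBD below is declared as an input-layer tensor
+// consumed by GGML_OP_GET_ROWS (the embedding lookup), while LLM_TENSOR_OUTPUT is
+// an output-layer tensor consumed by GGML_OP_MUL_MAT (the final projection)
+//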
+static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+ {LLM_TENSOR_TOKEN_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+ {LLM_TENSOR_POS_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+ {LLM_TENSOR_TOKEN_TYPES, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+ {LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL}},
+ {LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_DENSE_2_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
+ {LLM_TENSOR_DENSE_3_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
+ {LLM_TENSOR_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+ {LLM_TENSOR_OUTPUT_NORM_LFM2, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+ {LLM_TENSOR_DEC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+ {LLM_TENSOR_ENC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+ {LLM_TENSOR_ROPE_FREQS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}},
+ {LLM_TENSOR_ROPE_FACTORS_LONG, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}},
+ {LLM_TENSOR_ROPE_FACTORS_SHORT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}},
+ {LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
+ {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_DEC_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_DEC_CROSS_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_DEC_CROSS_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_DEC_CROSS_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_DEC_CROSS_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_DEC_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_DEC_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_DEC_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ENC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ENC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ENC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ENC_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ENC_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ENC_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ENC_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_FFN_GATE_INP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_FFN_GATE_INP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SSM_IN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SSM_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SSM_DT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SSM_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SSM_BETA_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_A1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_A2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_V1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_V2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_G1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_G2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_DECAY_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_DECAY_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_KEY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_VALUE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_RECEPTANCE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_OUTPUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CHANNEL_MIX_KEY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CHANNEL_MIX_VALUE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
+ {LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
+ {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
+ {LLM_TENSOR_SSM_A_NOSCAN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // a version of SSM_A used for MUL instead of SSM_SCAN
+ {LLM_TENSOR_SSM_DT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SSM_B_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ // Kimi KDA - Conv tensors are 4D [d_conv, 1, d_inner, 1], reshaped to 2D at runtime
+ {LLM_TENSOR_SSM_CONV1D_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SSM_CONV1D_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SSM_CONV1D_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SSM_F_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SSM_F_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SSM_BETA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SSM_G_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SSM_G_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CHANNEL_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_TIME_MIX_K_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_TIME_MIX_K_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_TIME_MIX_R_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_TIME_MIX_LERP_W, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_TIME_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_TIME_MIX_LERP_FUSED, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_TIME_MIX_W0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_TIME_MIX_A0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_TIME_MIX_V0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
+ {LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_ATTN_NORM_2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_ATTN_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_ATTN_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_FFN_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_FFN_NORM_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_LAYER_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_ATTN_Q_A_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_ATTN_KV_A_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_ATTN_SUB_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_FFN_SUB_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_DEC_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_DEC_CROSS_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_DEC_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_ENC_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_ENC_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_DEC_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+ {LLM_TENSOR_ENC_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+ {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+ {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+ {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+ {LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+ {LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+ {LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+ {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ // altup / laurel (gemma 3n)
+ {LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+ {LLM_TENSOR_PER_LAYER_MODEL_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_PER_LAYER_PROJ_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+ {LLM_TENSOR_ALTUP_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ALTUP_UNEMBD_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_PER_LAYER_INP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_PER_LAYER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_PER_LAYER_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_ALTUP_CORRECT_COEF, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ALTUP_CORRECT_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_ALTUP_PREDICT_COEF, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ALTUP_ROUTER, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ALTUP_ROUTER_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_LAUREL_L, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_LAUREL_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ // this tensor is loaded for T5, but never used
+ {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
+ {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
+ {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_POS_NET_NORM2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_POS_NET_CONV1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
+ {LLM_TENSOR_POS_NET_CONV2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
+ {LLM_TENSOR_POS_NET_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_POS_NET_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CONVNEXT_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
+ {LLM_TENSOR_CONVNEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
+ {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_VISEXP_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_VISEXP_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_VISEXP_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ // NextN/MTP tensors are currently ignored (reserved for future MTP support)
+ // These tensors only exist in the last layer(s) and are treated as output tensors
+ {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+ {LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+ {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+ {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+};
+
+LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
+
+std::string LLM_KV::operator()(llm_kv kv) const {
+ std::string name = ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
+
+ if (suffix != nullptr) {
+ name += ".";
+ name += suffix;
+ }
+
+ return name;
+}
+
+LLM_TN_IMPL::LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid)
+ : arch(arch), tensor(tensor), suffix(suffix), bid(bid), xid(xid),
+ model_tensors(llm_get_tensor_names(arch)) {}
+
+std::string LLM_TN_IMPL::str() const {
+ if (LLM_TENSOR_NAMES.find(tensor) == LLM_TENSOR_NAMES.end()) {
+ GGML_ABORT("unknown tensor name for tensor id %d", static_cast<int>(tensor));
+ }
+
+ if (model_tensors.find(tensor) == model_tensors.end()) {
+ return LLM_TENSOR_NAMES.at(tensor);
+ }
+
+ std::string name = ::format(LLM_TENSOR_NAMES.at(tensor), bid, xid);
+ if (suffix != nullptr) {
+ name += ".";
+ name += suffix;
+ }
+
+ return name;
+}
+
+const char * llm_arch_name(llm_arch arch) {
+ auto it = LLM_ARCH_NAMES.find(arch);
+ if (it == LLM_ARCH_NAMES.end()) {
+ return "unknown";
+ }
+ return it->second;
+}
+
+llm_arch llm_arch_from_string(const std::string & name) {
+ for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
+ if (kv.second == name) {
+ return kv.first;
+ }
+ }
+
+ return LLM_ARCH_UNKNOWN;
+}
+
+const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) {
+ return LLM_TENSOR_INFOS.at(tensor);
+}
+
+bool llm_arch_is_recurrent(const llm_arch & arch) {
+ switch (arch) {
+ case LLM_ARCH_MAMBA:
+ case LLM_ARCH_MAMBA2:
+ case LLM_ARCH_RWKV6:
+ case LLM_ARCH_RWKV6QWEN2:
+ case LLM_ARCH_RWKV7:
+ case LLM_ARCH_ARWKV7:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool llm_arch_is_hybrid(const llm_arch & arch) {
+ switch (arch) {
+ case LLM_ARCH_JAMBA:
+ case LLM_ARCH_FALCON_H1:
+ case LLM_ARCH_PLAMO2:
+ case LLM_ARCH_GRANITE_HYBRID:
+ case LLM_ARCH_LFM2:
+ case LLM_ARCH_LFM2MOE:
+ case LLM_ARCH_NEMOTRON_H:
+ case LLM_ARCH_NEMOTRON_H_MOE:
+ case LLM_ARCH_QWEN3NEXT:
+ case LLM_ARCH_KIMI_LINEAR:
+ case LLM_ARCH_QWEN35:
+ case LLM_ARCH_QWEN35MOE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool llm_arch_is_diffusion(const llm_arch & arch) {
+ switch (arch) {
+ case LLM_ARCH_DREAM:
+ case LLM_ARCH_LLADA:
+ case LLM_ARCH_LLADA_MOE:
+ case LLM_ARCH_RND1:
+ return true;
+ default:
+ return false;
+ }
+}
diff --git a/llama.cpp/src/llama-arch.h b/llama.cpp/src/llama-arch.h
new file mode 100644
index 0000000..4f7b51e
--- /dev/null
+++ b/llama.cpp/src/llama-arch.h
@@ -0,0 +1,606 @@
+#pragma once
+
+#include "ggml.h" // ggml_op
+
+#include <string>
+#include <set>
+
+//
+// gguf constants (sync with gguf.py)
+//
+
+enum llm_arch {
+ LLM_ARCH_CLIP,
+ LLM_ARCH_LLAMA,
+ LLM_ARCH_LLAMA4,
+ LLM_ARCH_DECI,
+ LLM_ARCH_FALCON,
+ LLM_ARCH_BAICHUAN,
+ LLM_ARCH_GROK,
+ LLM_ARCH_GPT2,
+ LLM_ARCH_GPTJ,
+ LLM_ARCH_GPTNEOX,
+ LLM_ARCH_MPT,
+ LLM_ARCH_STARCODER,
+ LLM_ARCH_REFACT,
+ LLM_ARCH_BERT,
+ LLM_ARCH_MODERN_BERT,
+ LLM_ARCH_NOMIC_BERT,
+ LLM_ARCH_NOMIC_BERT_MOE,
+ LLM_ARCH_NEO_BERT,
+ LLM_ARCH_JINA_BERT_V2,
+ LLM_ARCH_JINA_BERT_V3,
+ LLM_ARCH_BLOOM,
+ LLM_ARCH_STABLELM,
+ LLM_ARCH_QWEN,
+ LLM_ARCH_QWEN2,
+ LLM_ARCH_QWEN2MOE,
+ LLM_ARCH_QWEN2VL,
+ LLM_ARCH_QWEN3,
+ LLM_ARCH_QWEN3MOE,
+ LLM_ARCH_QWEN3NEXT,
+ LLM_ARCH_QWEN3VL,
+ LLM_ARCH_QWEN3VLMOE,
+ LLM_ARCH_QWEN35,
+ LLM_ARCH_QWEN35MOE,
+ LLM_ARCH_PHI2,
+ LLM_ARCH_PHI3,
+ LLM_ARCH_PHIMOE,
+ LLM_ARCH_PLAMO,
+ LLM_ARCH_PLAMO2,
+ LLM_ARCH_PLAMO3,
+ LLM_ARCH_CODESHELL,
+ LLM_ARCH_ORION,
+ LLM_ARCH_INTERNLM2,
+ LLM_ARCH_MINICPM,
+ LLM_ARCH_MINICPM3,
+ LLM_ARCH_GEMMA,
+ LLM_ARCH_GEMMA2,
+ LLM_ARCH_GEMMA3,
+ LLM_ARCH_GEMMA3N,
+ LLM_ARCH_GEMMA_EMBEDDING,
+ LLM_ARCH_STARCODER2,
+ LLM_ARCH_MAMBA,
+ LLM_ARCH_MAMBA2,
+ LLM_ARCH_JAMBA,
+ LLM_ARCH_FALCON_H1,
+ LLM_ARCH_XVERSE,
+ LLM_ARCH_COMMAND_R,
+ LLM_ARCH_COHERE2,
+ LLM_ARCH_DBRX,
+ LLM_ARCH_OLMO,
+ LLM_ARCH_OLMO2,
+ LLM_ARCH_OLMOE,
+ LLM_ARCH_OPENELM,
+ LLM_ARCH_ARCTIC,
+ LLM_ARCH_DEEPSEEK,
+ LLM_ARCH_DEEPSEEK2,
+ LLM_ARCH_CHATGLM,
+ LLM_ARCH_GLM4,
+ LLM_ARCH_GLM4_MOE,
+ LLM_ARCH_BITNET,
+ LLM_ARCH_T5,
+ LLM_ARCH_T5ENCODER,
+ LLM_ARCH_JAIS,
+ LLM_ARCH_NEMOTRON,
+ LLM_ARCH_NEMOTRON_H,
+ LLM_ARCH_NEMOTRON_H_MOE,
+ LLM_ARCH_EXAONE,
+ LLM_ARCH_EXAONE4,
+ LLM_ARCH_EXAONE_MOE,
+ LLM_ARCH_RWKV6,
+ LLM_ARCH_RWKV6QWEN2,
+ LLM_ARCH_RWKV7,
+ LLM_ARCH_ARWKV7,
+ LLM_ARCH_GRANITE,
+ LLM_ARCH_GRANITE_MOE,
+ LLM_ARCH_GRANITE_HYBRID,
+ LLM_ARCH_CHAMELEON,
+ LLM_ARCH_WAVTOKENIZER_DEC,
+ LLM_ARCH_PLM,
+ LLM_ARCH_BAILINGMOE,
+ LLM_ARCH_BAILINGMOE2,
+ LLM_ARCH_DOTS1,
+ LLM_ARCH_ARCEE,
+ LLM_ARCH_AFMOE,
+ LLM_ARCH_ERNIE4_5,
+ LLM_ARCH_ERNIE4_5_MOE,
+ LLM_ARCH_HUNYUAN_MOE,
+ LLM_ARCH_HUNYUAN_DENSE,
+ LLM_ARCH_SMOLLM3,
+ LLM_ARCH_OPENAI_MOE,
+ LLM_ARCH_LFM2,
+ LLM_ARCH_LFM2MOE,
+ LLM_ARCH_DREAM,
+ LLM_ARCH_SMALLTHINKER,
+ LLM_ARCH_LLADA,
+ LLM_ARCH_LLADA_MOE,
+ LLM_ARCH_SEED_OSS,
+ LLM_ARCH_GROVEMOE,
+ LLM_ARCH_APERTUS,
+ LLM_ARCH_MINIMAX_M2,
+ LLM_ARCH_COGVLM,
+ LLM_ARCH_RND1,
+ LLM_ARCH_PANGU_EMBED,
+ LLM_ARCH_MISTRAL3,
+ LLM_ARCH_MIMO2,
+ LLM_ARCH_STEP35,
+ LLM_ARCH_LLAMA_EMBED,
+ LLM_ARCH_MAINCODER,
+ LLM_ARCH_KIMI_LINEAR,
+ LLM_ARCH_UNKNOWN,
+};
+
+enum llm_kv {
+ LLM_KV_GENERAL_TYPE,
+ LLM_KV_GENERAL_ARCHITECTURE,
+ LLM_KV_GENERAL_QUANTIZATION_VERSION,
+ LLM_KV_GENERAL_ALIGNMENT,
+ LLM_KV_GENERAL_FILE_TYPE,
+ LLM_KV_GENERAL_SAMPLING_SEQUENCE,
+ LLM_KV_GENERAL_SAMPLING_TOP_K,
+ LLM_KV_GENERAL_SAMPLING_TOP_P,
+ LLM_KV_GENERAL_SAMPLING_MIN_P,
+ LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY,
+ LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,
+ LLM_KV_GENERAL_SAMPLING_TEMP,
+ LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,
+ LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,
+ LLM_KV_GENERAL_SAMPLING_MIROSTAT,
+ LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,
+ LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,
+ LLM_KV_GENERAL_NAME,
+ LLM_KV_GENERAL_AUTHOR,
+ LLM_KV_GENERAL_VERSION,
+ LLM_KV_GENERAL_URL,
+ LLM_KV_GENERAL_DESCRIPTION,
+ LLM_KV_GENERAL_LICENSE,
+ LLM_KV_GENERAL_SOURCE_URL,
+ LLM_KV_GENERAL_SOURCE_HF_REPO,
+
+ LLM_KV_VOCAB_SIZE,
+ LLM_KV_CONTEXT_LENGTH,
+ LLM_KV_EMBEDDING_LENGTH,
+ LLM_KV_EMBEDDING_LENGTH_OUT,
+ LLM_KV_FEATURES_LENGTH,
+ LLM_KV_BLOCK_COUNT,
+ LLM_KV_LEADING_DENSE_BLOCK_COUNT,
+ LLM_KV_FEED_FORWARD_LENGTH,
+ LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+ LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
+ LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,
+ LLM_KV_SWIGLU_CLAMP_EXP,
+ LLM_KV_SWIGLU_CLAMP_SHEXP,
+ LLM_KV_USE_PARALLEL_RESIDUAL,
+ LLM_KV_TENSOR_DATA_LAYOUT,
+ LLM_KV_EXPERT_COUNT,
+ LLM_KV_EXPERT_USED_COUNT,
+ LLM_KV_EXPERT_SHARED_COUNT,
+ LLM_KV_EXPERT_GROUP_COUNT,
+ LLM_KV_EXPERT_GROUP_USED_COUNT,
+ LLM_KV_EXPERT_WEIGHTS_SCALE,
+ LLM_KV_EXPERT_WEIGHTS_NORM,
+ LLM_KV_EXPERT_GATING_FUNC,
+ LLM_KV_EXPERT_GROUP_SCALE,
+ LLM_KV_EXPERTS_PER_GROUP,
+ LLM_KV_MOE_EVERY_N_LAYERS,
+ LLM_KV_NEXTN_PREDICT_LAYERS,
+ LLM_KV_NUM_DEEPSTACK_LAYERS,
+ LLM_KV_POOLING_TYPE,
+ LLM_KV_LOGIT_SCALE,
+ LLM_KV_DECODER_START_TOKEN_ID,
+ LLM_KV_DECODER_BLOCK_COUNT,
+ LLM_KV_ATTN_LOGIT_SOFTCAPPING,
+ LLM_KV_ROUTER_LOGIT_SOFTCAPPING,
+ LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+ LLM_KV_SWIN_NORM,
+ LLM_KV_RESCALE_EVERY_N_LAYERS,
+ LLM_KV_TIME_MIX_EXTRA_DIM,
+ LLM_KV_TIME_DECAY_EXTRA_DIM,
+ LLM_KV_RESIDUAL_SCALE,
+ LLM_KV_EMBEDDING_SCALE,
+ LLM_KV_TOKEN_SHIFT_COUNT,
+ LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
+ LLM_KV_FULL_ATTENTION_INTERVAL,
+
+ LLM_KV_ATTENTION_HEAD_COUNT,
+ LLM_KV_ATTENTION_HEAD_COUNT_KV,
+ LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
+ LLM_KV_ATTENTION_CLAMP_KQV,
+ LLM_KV_ATTENTION_KEY_LENGTH,
+ LLM_KV_ATTENTION_VALUE_LENGTH,
+ LLM_KV_ATTENTION_LAYERNORM_EPS,
+ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+ LLM_KV_ATTENTION_GROUPNORM_EPS,
+ LLM_KV_ATTENTION_GROUPNORM_GROUPS,
+ LLM_KV_ATTENTION_CAUSAL,
+ LLM_KV_ATTENTION_Q_LORA_RANK,
+ LLM_KV_ATTENTION_KV_LORA_RANK,
+ LLM_KV_ATTENTION_DECAY_LORA_RANK,
+ LLM_KV_ATTENTION_ICLR_LORA_RANK,
+ LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK,
+ LLM_KV_ATTENTION_GATE_LORA_RANK,
+ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
+ LLM_KV_ATTENTION_SLIDING_WINDOW,
+ LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
+ LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_OUTPUT_SCALE,
+ LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
+ LLM_KV_ATTENTION_TEMPERATURE_SCALE,
+ LLM_KV_ATTENTION_KEY_LENGTH_MLA,
+ LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+
+ LLM_KV_ROPE_DIMENSION_COUNT,
+ LLM_KV_ROPE_DIMENSION_SECTIONS,
+ LLM_KV_ROPE_FREQ_BASE,
+ LLM_KV_ROPE_FREQ_BASE_SWA,
+ LLM_KV_ROPE_SCALE_LINEAR,
+ LLM_KV_ROPE_SCALING_TYPE,
+ LLM_KV_ROPE_SCALING_FACTOR,
+ LLM_KV_ROPE_SCALING_ATTN_FACTOR,
+ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
+ LLM_KV_ROPE_SCALING_FINETUNED,
+ LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
+ LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,
+ LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR,
+ LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
+ LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,
+
+ LLM_KV_SPLIT_NO,
+ LLM_KV_SPLIT_COUNT,
+ LLM_KV_SPLIT_TENSORS_COUNT,
+
+ LLM_KV_SSM_INNER_SIZE,
+ LLM_KV_SSM_CONV_KERNEL,
+ LLM_KV_SSM_STATE_SIZE,
+ LLM_KV_SSM_TIME_STEP_RANK,
+ LLM_KV_SSM_GROUP_COUNT,
+ LLM_KV_SSM_DT_B_C_RMS,
+
+ LLM_KV_KDA_HEAD_DIM,
+
+ LLM_KV_WKV_HEAD_SIZE,
+
+ LLM_KV_TOKENIZER_MODEL,
+ LLM_KV_TOKENIZER_PRE,
+ LLM_KV_TOKENIZER_LIST,
+ LLM_KV_TOKENIZER_TOKEN_TYPE,
+ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
+ LLM_KV_TOKENIZER_SCORES,
+ LLM_KV_TOKENIZER_MERGES,
+ LLM_KV_TOKENIZER_BOS_ID,
+ LLM_KV_TOKENIZER_EOS_ID,
+ LLM_KV_TOKENIZER_EOT_ID,
+ LLM_KV_TOKENIZER_EOM_ID,
+ LLM_KV_TOKENIZER_UNK_ID,
+ LLM_KV_TOKENIZER_SEP_ID,
+ LLM_KV_TOKENIZER_PAD_ID,
+ LLM_KV_TOKENIZER_CLS_ID,
+ LLM_KV_TOKENIZER_MASK_ID,
+ LLM_KV_TOKENIZER_ADD_BOS,
+ LLM_KV_TOKENIZER_ADD_EOS,
+ LLM_KV_TOKENIZER_ADD_SEP,
+ LLM_KV_TOKENIZER_ADD_PREFIX,
+ LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
+ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
+ LLM_KV_TOKENIZER_HF_JSON,
+ LLM_KV_TOKENIZER_RWKV,
+ LLM_KV_TOKENIZER_CHAT_TEMPLATE,
+ LLM_KV_TOKENIZER_FIM_PRE_ID,
+ LLM_KV_TOKENIZER_FIM_SUF_ID,
+ LLM_KV_TOKENIZER_FIM_MID_ID,
+ LLM_KV_TOKENIZER_FIM_PAD_ID,
+ LLM_KV_TOKENIZER_FIM_REP_ID,
+ LLM_KV_TOKENIZER_FIM_SEP_ID,
+
+ LLM_KV_ADAPTER_TYPE,
+ LLM_KV_ADAPTER_LORA_ALPHA,
+ LLM_KV_ADAPTER_LORA_TASK_NAME,
+ LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,
+ LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS,
+
+ LLM_KV_POSNET_EMBEDDING_LENGTH,
+ LLM_KV_POSNET_BLOCK_COUNT,
+
+ LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
+ LLM_KV_CONVNEXT_BLOCK_COUNT,
+
+ LLM_KV_CLASSIFIER_OUTPUT_LABELS,
+
+ LLM_KV_SHORTCONV_L_CACHE,
+
+ LLM_KV_XIELU_ALPHA_N,
+ LLM_KV_XIELU_ALPHA_P,
+ LLM_KV_XIELU_BETA,
+ LLM_KV_XIELU_EPS,
+
+ // deprecated:
+ LLM_KV_TOKENIZER_PREFIX_ID,
+ LLM_KV_TOKENIZER_SUFFIX_ID,
+ LLM_KV_TOKENIZER_MIDDLE_ID,
+
+ // sentence-transformers dense layers in and out features
+ LLM_KV_DENSE_2_FEAT_IN,
+ LLM_KV_DENSE_2_FEAT_OUT,
+ LLM_KV_DENSE_3_FEAT_IN,
+ LLM_KV_DENSE_3_FEAT_OUT,
+};
+
+enum llm_tensor {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_TOKEN_TYPES,
+ LLM_TENSOR_POS_EMBD,
+ LLM_TENSOR_DENSE_2_OUT,
+ LLM_TENSOR_DENSE_3_OUT,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_OUTPUT_NORM,
+    LLM_TENSOR_OUTPUT_NORM_LFM2, // workaround for an incorrect tensor name in LFM2 models
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ROPE_FACTORS_LONG,
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_NORM_2,
+ LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_ATTN_SINKS,
+ LLM_TENSOR_ATTN_GATE,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_ACT,
+ LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
+ LLM_TENSOR_FFN_GATE_EXP,
+ LLM_TENSOR_FFN_UP_EXP,
+ LLM_TENSOR_FFN_NORM_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_FFN_DOWN_CHEXPS,
+ LLM_TENSOR_FFN_GATE_CHEXPS,
+ LLM_TENSOR_FFN_UP_CHEXPS,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_LAYER_OUT_NORM,
+ LLM_TENSOR_POST_ATTN_NORM,
+ LLM_TENSOR_POST_MLP_NORM,
+ LLM_TENSOR_PER_LAYER_TOKEN_EMBD, // gemma3n
+ LLM_TENSOR_PER_LAYER_MODEL_PROJ, // gemma3n
+ LLM_TENSOR_PER_LAYER_INP_GATE, // gemma3n
+ LLM_TENSOR_PER_LAYER_PROJ, // gemma3n
+ LLM_TENSOR_PER_LAYER_PROJ_NORM, // gemma3n
+ LLM_TENSOR_PER_LAYER_POST_NORM, // gemma3n
+ LLM_TENSOR_ALTUP_PROJ, // gemma3n
+ LLM_TENSOR_ALTUP_UNEMBD_PROJ, // gemma3n
+ LLM_TENSOR_ALTUP_CORRECT_COEF, // gemma3n
+ LLM_TENSOR_ALTUP_CORRECT_SCALE, // gemma3n
+ LLM_TENSOR_ALTUP_PREDICT_COEF, // gemma3n
+ LLM_TENSOR_ALTUP_ROUTER, // gemma3n
+ LLM_TENSOR_ALTUP_ROUTER_NORM, // gemma3n
+ LLM_TENSOR_LAUREL_L, // gemma3n
+ LLM_TENSOR_LAUREL_R, // gemma3n
+ LLM_TENSOR_LAUREL_POST_NORM, // gemma3n
+ LLM_TENSOR_SSM_IN,
+ LLM_TENSOR_SSM_CONV1D,
+ LLM_TENSOR_SSM_X,
+ LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_DT_NORM,
+ LLM_TENSOR_SSM_A,
+ LLM_TENSOR_SSM_A_NOSCAN, // qwen3next special case with MUL instead of SSM_SCAN
+ LLM_TENSOR_SSM_B_NORM,
+ LLM_TENSOR_SSM_C_NORM,
+ LLM_TENSOR_SSM_D,
+ LLM_TENSOR_SSM_NORM,
+ LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next
+ LLM_TENSOR_SSM_ALPHA, // qwen3.5
+ // Kimi Linear KDA (using SSM_ prefix for consistency)
+ LLM_TENSOR_SSM_CONV1D_Q, // kimi: Q conv1d weight
+ LLM_TENSOR_SSM_CONV1D_K, // kimi: K conv1d weight
+ LLM_TENSOR_SSM_CONV1D_V, // kimi: V conv1d weight
+ LLM_TENSOR_SSM_F_A, // kimi: forget gate projection A
+ LLM_TENSOR_SSM_F_B, // kimi: forget gate projection B
+ LLM_TENSOR_SSM_BETA, // kimi: beta mixing coefficient and qwen3.5
+ LLM_TENSOR_SSM_G_A, // kimi: output gate projection A
+ LLM_TENSOR_SSM_G_B, // kimi: output gate projection B
+ LLM_TENSOR_TIME_MIX_W0,
+ LLM_TENSOR_TIME_MIX_W1,
+ LLM_TENSOR_TIME_MIX_W2,
+ LLM_TENSOR_TIME_MIX_A0,
+ LLM_TENSOR_TIME_MIX_A1,
+ LLM_TENSOR_TIME_MIX_A2,
+ LLM_TENSOR_TIME_MIX_V0,
+ LLM_TENSOR_TIME_MIX_V1,
+ LLM_TENSOR_TIME_MIX_V2,
+ LLM_TENSOR_TIME_MIX_G1,
+ LLM_TENSOR_TIME_MIX_G2,
+ LLM_TENSOR_TIME_MIX_K_K,
+ LLM_TENSOR_TIME_MIX_K_A,
+ LLM_TENSOR_TIME_MIX_R_K,
+ LLM_TENSOR_TIME_MIX_LERP_X,
+ LLM_TENSOR_TIME_MIX_LERP_W,
+ LLM_TENSOR_TIME_MIX_LERP_K,
+ LLM_TENSOR_TIME_MIX_LERP_V,
+ LLM_TENSOR_TIME_MIX_LERP_R,
+ LLM_TENSOR_TIME_MIX_LERP_G,
+ LLM_TENSOR_TIME_MIX_LERP_FUSED,
+ LLM_TENSOR_TIME_MIX_FIRST,
+ LLM_TENSOR_TIME_MIX_DECAY,
+ LLM_TENSOR_TIME_MIX_DECAY_W1,
+ LLM_TENSOR_TIME_MIX_DECAY_W2,
+ LLM_TENSOR_TIME_MIX_KEY,
+ LLM_TENSOR_TIME_MIX_VALUE,
+ LLM_TENSOR_TIME_MIX_RECEPTANCE,
+ LLM_TENSOR_TIME_MIX_GATE,
+ LLM_TENSOR_TIME_MIX_LN,
+ LLM_TENSOR_TIME_MIX_OUTPUT,
+ LLM_TENSOR_CHANNEL_MIX_LERP_K,
+ LLM_TENSOR_CHANNEL_MIX_LERP_R,
+ LLM_TENSOR_CHANNEL_MIX_KEY,
+ LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
+ LLM_TENSOR_CHANNEL_MIX_VALUE,
+ LLM_TENSOR_ATTN_Q_A,
+ LLM_TENSOR_ATTN_Q_B,
+ LLM_TENSOR_ATTN_KV_A_MQA,
+ LLM_TENSOR_ATTN_KV_B,
+ LLM_TENSOR_ATTN_K_B,
+ LLM_TENSOR_ATTN_V_B,
+ LLM_TENSOR_ATTN_Q_A_NORM,
+ LLM_TENSOR_ATTN_KV_A_NORM,
+ LLM_TENSOR_ATTN_SUB_NORM,
+ LLM_TENSOR_FFN_SUB_NORM,
+ LLM_TENSOR_DEC_ATTN_NORM,
+ LLM_TENSOR_DEC_ATTN_Q,
+ LLM_TENSOR_DEC_ATTN_K,
+ LLM_TENSOR_DEC_ATTN_V,
+ LLM_TENSOR_DEC_ATTN_OUT,
+ LLM_TENSOR_DEC_ATTN_REL_B,
+ LLM_TENSOR_DEC_CROSS_ATTN_NORM,
+ LLM_TENSOR_DEC_CROSS_ATTN_Q,
+ LLM_TENSOR_DEC_CROSS_ATTN_K,
+ LLM_TENSOR_DEC_CROSS_ATTN_V,
+ LLM_TENSOR_DEC_CROSS_ATTN_OUT,
+ LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
+ LLM_TENSOR_DEC_FFN_NORM,
+ LLM_TENSOR_DEC_FFN_GATE,
+ LLM_TENSOR_DEC_FFN_DOWN,
+ LLM_TENSOR_DEC_FFN_UP,
+ LLM_TENSOR_DEC_OUTPUT_NORM,
+ LLM_TENSOR_ENC_ATTN_NORM,
+ LLM_TENSOR_ENC_ATTN_Q,
+ LLM_TENSOR_ENC_ATTN_K,
+ LLM_TENSOR_ENC_ATTN_V,
+ LLM_TENSOR_ENC_ATTN_OUT,
+ LLM_TENSOR_ENC_ATTN_REL_B,
+ LLM_TENSOR_ENC_FFN_NORM,
+ LLM_TENSOR_ENC_FFN_GATE,
+ LLM_TENSOR_ENC_FFN_DOWN,
+ LLM_TENSOR_ENC_FFN_UP,
+ LLM_TENSOR_ENC_OUTPUT_NORM,
+ LLM_TENSOR_CLS,
+ LLM_TENSOR_CLS_OUT,
+ LLM_TENSOR_CONV1D,
+ LLM_TENSOR_CONVNEXT_DW,
+ LLM_TENSOR_CONVNEXT_NORM,
+ LLM_TENSOR_CONVNEXT_PW1,
+ LLM_TENSOR_CONVNEXT_PW2,
+ LLM_TENSOR_CONVNEXT_GAMMA,
+ LLM_TENSOR_POS_NET_CONV1,
+ LLM_TENSOR_POS_NET_CONV2,
+ LLM_TENSOR_POS_NET_NORM,
+ LLM_TENSOR_POS_NET_NORM1,
+ LLM_TENSOR_POS_NET_NORM2,
+ LLM_TENSOR_POS_NET_ATTN_NORM,
+ LLM_TENSOR_POS_NET_ATTN_Q,
+ LLM_TENSOR_POS_NET_ATTN_K,
+ LLM_TENSOR_POS_NET_ATTN_V,
+ LLM_TENSOR_POS_NET_ATTN_OUT,
+ LLM_TENSOR_SHORTCONV_CONV,
+ LLM_TENSOR_SHORTCONV_INPROJ,
+ LLM_TENSOR_SHORTCONV_OUTPROJ,
+ LLM_TENSOR_VISEXP_ATTN_QKV,
+ LLM_TENSOR_VISEXP_ATTN_OUT,
+ LLM_TENSOR_VISEXP_FFN_GATE,
+ LLM_TENSOR_VISEXP_FFN_DOWN,
+ LLM_TENSOR_VISEXP_FFN_UP,
+ LLM_TENSOR_NEXTN_EH_PROJ,
+ LLM_TENSOR_NEXTN_EMBED_TOKENS,
+ LLM_TENSOR_NEXTN_ENORM,
+ LLM_TENSOR_NEXTN_HNORM,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
+};
+
+enum llm_tensor_layer {
+ LLM_TENSOR_LAYER_INPUT,
+ LLM_TENSOR_LAYER_REPEATING,
+ LLM_TENSOR_LAYER_OUTPUT,
+};
+
+struct LLM_KV {
+ LLM_KV(llm_arch arch, const char * suffix = nullptr);
+
+ llm_arch arch;
+ const char * suffix;
+
+ std::string operator()(llm_kv kv) const;
+};
+
+// helper to handle gguf constants
+// usage:
+//
+// const auto tn = LLM_TN(LLM_ARCH_LLAMA);
+//
+// std::string name = tn(LLM_TENSOR_OUTPUT); -> "output"
+// std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias"
+// std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight"
+//
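+// for split-expert tensors, an xid additionally selects the expert
+// (illustrative only - the exact format strings are defined per architecture):
+//
+// std::string name = tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", 3, 1); -> "blk.3.ffn_down.1.weight"
+//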
+struct LLM_TN_IMPL {
+ const llm_arch arch;
+ const llm_tensor tensor;
+ const char * const suffix;
+ const int bid;
+ const int xid;
+
+ const std::set<llm_tensor> model_tensors;
+
+ LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
+
+ std::string str() const;
+
+ operator std::string() const {
+ return str();
+ }
+
+ friend bool operator==(const std::string & str, const LLM_TN_IMPL & tn) {
+ return str == tn.str();
+ }
+
+ friend bool operator!=(const std::string & str, const LLM_TN_IMPL & tn) {
+ return str != tn.str();
+ }
+};
+
+struct LLM_TN {
+ LLM_TN(llm_arch arch) : arch(arch) {}
+
+ llm_arch arch;
+
+ LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
+ return LLM_TN_IMPL(arch, tensor, suffix, bid, xid);
+ }
+
+ LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
+ return LLM_TN_IMPL(arch, tensor, nullptr, bid, xid);
+ }
+};
+
+
+struct llm_tensor_info {
+ llm_tensor_layer layer;
+ ggml_op op;
+};
+
+const char * llm_arch_name(llm_arch arch);
+
+llm_arch llm_arch_from_string(const std::string & name);
+
+const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
+
+bool llm_arch_is_recurrent(const llm_arch & arch);
+bool llm_arch_is_hybrid (const llm_arch & arch);
+bool llm_arch_is_diffusion(const llm_arch & arch);
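+
+// usage sketch (illustrative):
+//
+//   const llm_arch arch = llm_arch_from_string("llama"); // -> LLM_ARCH_LLAMA
+//
+//   printf("%s: recurrent = %d\n", llm_arch_name(arch), llm_arch_is_recurrent(arch));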
diff --git a/llama.cpp/src/llama-batch.cpp b/llama.cpp/src/llama-batch.cpp
new file mode 100644
index 0000000..386fab0
--- /dev/null
+++ b/llama.cpp/src/llama-batch.cpp
@@ -0,0 +1,917 @@
+#include "llama-batch.h"
+
+#include "llama-impl.h"
+#include "llama-vocab.h"
+#include "llama-memory.h"
+
+#include <cassert>
+#include <cstring>
+#include <cstdlib> // getenv, atoi
+#include <algorithm>
+#include <sstream>
+
+llama_batch_allocr::llama_batch_allocr(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {
+ const char * LLAMA_BATCH_DEBUG = getenv("LLAMA_BATCH_DEBUG");
+ debug = LLAMA_BATCH_DEBUG ? atoi(LLAMA_BATCH_DEBUG) : 0;
+
+ seq_pos.resize(LLAMA_MAX_SEQ);
+ seq_cpl.resize(LLAMA_MAX_SEQ);
+ for (auto & cur : seq_cpl) {
+ cur.resize(LLAMA_MAX_SEQ);
+ }
+
+ seq_idx.resize(LLAMA_MAX_SEQ, -1);
+}
+
+bool llama_batch_allocr::init(
+ const llama_batch & batch_inp,
+ const llama_vocab & vocab,
+ const llama_memory_i * memory,
+ uint32_t n_embd,
+ uint32_t n_seq_max,
+ bool output_all) {
+ clear();
+
+ batch = batch_inp;
+
+ this->vocab = &vocab;
+
+ GGML_ASSERT(batch.n_tokens > 0);
+
+ //
+ // validate input batch
+ //
+
+ if (n_seq_max > LLAMA_MAX_SEQ) {
+ LLAMA_LOG_ERROR("%s: n_seq_max = %d > %d\n", __func__, n_seq_max, LLAMA_MAX_SEQ);
+ return false;
+ }
+
+ if (batch.token) {
+ for (int32_t i = 0; i < batch.n_tokens; ++i) {
+ if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) {
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
+ return false;
+ }
+ }
+ }
+
+ if (batch.seq_id) {
+ for (int32_t i = 0; i < batch.n_tokens; ++i) {
+ for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
+                if (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max) {
+ LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d >= %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
+ return false;
+ }
+ }
+ }
+ }
+
+ //
+ // auto-generate missing fields
+ //
+
+ if (!batch.n_seq_id) {
+ n_seq_id.resize(batch.n_tokens);
+ for (int32_t i = 0; i < batch.n_tokens; i++) {
+ n_seq_id[i] = seq_id_0.size();
+ }
+ batch.n_seq_id = n_seq_id.data();
+ }
+
+ if (!batch.seq_id) {
+ seq_id.resize(batch.n_tokens + 1);
+        seq_id[batch.n_tokens] = nullptr;
+ for (int32_t i = 0; i < batch.n_tokens; i++) {
+ seq_id[i] = seq_id_0.data();
+ }
+ batch.seq_id = seq_id.data();
+ }
+
+ if (!batch.pos) {
+ pos.resize(batch.n_tokens);
+
+ // initialize the starting position for each sequence based on the positions in the memory
+ llama_pos p0[LLAMA_MAX_SEQ];
+ for (uint32_t s = 0; s < n_seq_max; ++s) {
+ if (!memory) {
+ // if no memory -> start from 0
+ p0[s] = 0;
+ } else {
+ p0[s] = memory->seq_pos_max(s) + 1;
+ }
+ }
+
+ for (int32_t i = 0; i < batch.n_tokens; i++) {
+ const llama_seq_id seq_id = batch.seq_id[i][0];
+
+ pos[i] = p0[seq_id];
+
+            // update the starting position for all sequences that are assigned to this token
+ for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
+ const llama_seq_id seq_id = batch.seq_id[i][s];
+
+ p0[seq_id] = pos[i] + 1;
+ }
+ }
+
+ batch.pos = pos.data();
+ }
+
+ if (!batch.logits) {
+ if (output_all) {
+ // return the output for all tokens
+ output.resize(batch.n_tokens, true);
+ } else {
+ // return the output only for the last token
+ output.resize(batch.n_tokens, false);
+ output[output.size() - 1] = true;
+ }
+
+ batch.logits = output.data();
+ } else if (output_all) {
+ bool warn = false;
+
+ for (int32_t i = 0; i < batch.n_tokens; ++i) {
+ if (batch.logits[i] == 0) {
+ warn = true;
+ }
+ }
+
+ if (warn) {
+ LLAMA_LOG_WARN("%s: embeddings required but some input tokens were not marked as outputs -> overriding\n", __func__);
+
+ output.resize(batch.n_tokens, true);
+ batch.logits = output.data();
+ }
+ }
+
+ //
+ // compute stats
+ //
+
+ this->n_embd = n_embd;
+ this->n_seq_max = n_seq_max;
+
+ // count the outputs in this batch
+ for (int32_t i = 0; i < batch.n_tokens; ++i) {
+ n_outputs += batch.logits[i] != 0;
+ }
+
+ has_cpl = false;
+
+ // determine coupled sequences
+ // these are pairs of sequences that have at least one token in the input batch that is assigned to both of them
+ for (int32_t i = 0; i < batch.n_tokens; ++i) {
+ const llama_seq_id s0 = batch.seq_id[i][0];
+
+ for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
+ const llama_seq_id s1 = batch.seq_id[i][s];
+
+ seq_pos[s1].insert(batch.pos[i]);
+
+ if (s > 0) {
+ // mark that sequence s1 is coupled to s0
+ seq_cpl[s1][s0] = true;
+
+ // note: tracking the other way around is not necessary for now
+ //seq_cpl[s0][s1] = true;
+
+ has_cpl = true;
+ }
+ }
+ }
+
+ // precompute the sequence sets for each token and determine the unique sequence ids that participate in the batch
+ {
+ seq_set_t seq_set_unq;
+
+ for (int32_t i = 0; i < batch.n_tokens; ++i) {
+ seq_set_t cur;
+ for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
+ const llama_seq_id seq_id = batch.seq_id[i][s];
+
+ cur .set(seq_id);
+ seq_set_unq.set(seq_id);
+ }
+
+ seq_set.push_back(cur);
+ seq_set_map[cur].push_back(i);
+ }
+
+ for (uint32_t s = 0; s < n_seq_max; ++s) {
+ if (seq_set_unq.test(s)) {
+ seq_idx[s] = seq_id_unq.size();
+ seq_id_unq.push_back(s);
+ }
+ }
+ }
+
+ if (debug > 0) {
+ LLAMA_LOG_DEBUG("%s: input batch info:\n", __func__);
+
+ llama_ubatch ubatch {
+ /*.b_equal_seqs =*/ false,
+ /*.n_tokens =*/ (uint32_t) batch.n_tokens,
+ /*.n_seq_tokens =*/ (uint32_t) 1,
+ /*.n_seqs =*/ (uint32_t) batch.n_tokens,
+ /*.n_seqs_unq =*/ (uint32_t) this->seq_id_unq.size(),
+ /*.n_pos =*/ n_pos_per_embd,
+ /*.token =*/ batch.token,
+ /*.embd =*/ batch.embd,
+ /*.pos =*/ batch.pos,
+ /*.n_seq_id =*/ batch.n_seq_id,
+ /*.seq_id =*/ batch.seq_id,
+ /*.seq_id_unq =*/ this->seq_id_unq.data(),
+ /*.seq_idx =*/ this->seq_idx.data(),
+ /*.output =*/ batch.logits,
+ /*.data =*/ {},
+ };
+
+ ubatch_print(ubatch, debug);
+
+ LLAMA_LOG_DEBUG("%s: seq = [\n", __func__);
+ for (int s0 = 0; s0 < (int) seq_pos.size(); ++s0) {
+ if (seq_pos[s0].empty()) {
+ continue;
+ }
+
+ std::stringstream ss;
+ for (int s1 = 0; s1 < (int) seq_cpl[s0].size(); ++s1) {
+ if (seq_cpl[s0][s1]) {
+ ss << s1 << " ";
+ }
+ }
+
+ LLAMA_LOG_DEBUG("%s: %4d: pos = [%4d, %4d], cpl = %s\n",
+ __func__, s0, seq_pos_min(s0), seq_pos_max(s0), ss.str().empty() ? "-" : ss.str().c_str());
+ }
+ LLAMA_LOG_DEBUG("%s: ]\n", __func__);
+ }
+
+ //
+ // consistency checks
+ //
+
+ if (n_pos_per_embd > 1) {
+ // M-RoPE case: allow position to "jump" forward only (non-continuous positions are allowed)
+ for (uint32_t s = 0; s < n_seq_max; ++s) {
+ if (seq_pos[s].empty()) {
+ continue;
+ }
+
+ const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+
+ if (batch.token) {
+ if (p0 >= 0 && p0 >= seq_pos_min(s)) {
+ LLAMA_LOG_ERROR(
+ "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+ " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+ " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+ " for M-RoPE, it is required that the position satisfies: X < Y\n",
+ __func__, s, s, p0, s, seq_pos_min(s));
+
+ return false;
+ }
+ } else {
+ // embedding inputs can have overlapping positions
+ if (p0 >= 0 && p0 > seq_pos_min(s)) {
+ LLAMA_LOG_ERROR(
+ "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+ " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+ " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+ " for M-RoPE, it is required that the position satisfies: X <= Y\n",
+ __func__, s, s, p0, s, seq_pos_min(s));
+
+ return false;
+ }
+ }
+ }
+ } else {
+ for (uint32_t s = 0; s < n_seq_max; ++s) {
+ if (seq_pos[s].empty()) {
+ continue;
+ }
+
+ const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+
+ if (p0 >= 0) {
+                if (seq_pos_min(s) != p0 + 1) {
+ LLAMA_LOG_ERROR(
+ "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+ " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+ " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+ " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+ __func__, s, s, p0, s, seq_pos_min(s));
+
+ return false;
+ }
+ }
+
+ if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
+ LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
+ return false;
+ }
+ }
+ }
+
+ if (memory) {
+ for (uint32_t s0 = 0; s0 < n_seq_max; ++s0) {
+ for (uint32_t s1 = 0; s1 < n_seq_max; ++s1) {
+ if (seq_cpl[s0][s1]) {
+ if (memory->seq_pos_min(s0) != memory->seq_pos_min(s1) ||
+ memory->seq_pos_max(s0) != memory->seq_pos_max(s1)) {
+ LLAMA_LOG_ERROR("%s: sequence %d is coupled to %d in the input batch, but have divereged\n", __func__, s0, s1);
+ return false;
+ }
+ }
+ }
+ }
+ }
+
+ // disallow partial sequence sub-sets:
+ //
+ // invalid: x
+ // i: 0 1 2 ...
+ // ---------------------------------------
+ // seq_id[i][0]: 0 0 1
+ // seq_id[i][1]: 1 1 2
+ // seq_id[i][2]: 2
+ //
+ // disallow decreasing sequence positions:
+ //
+ // invalid: x
+ // i: 0 1 2 3 4 5 6 ...
+ // ---------------------------------------
+ // pos[i]: 4 5 0 1 6 2 3
+ // seq_id[i][0]: 0 0 1 1 0 1 0
+ //
+ {
+ seq_set_t cur_seq_set[LLAMA_MAX_SEQ];
+ for (uint32_t s = 0; s < n_seq_max; ++s) {
+ cur_seq_set[s].set();
+ }
+
+ llama_pos cur_seq_pos[LLAMA_MAX_SEQ];
+ for (uint32_t s = 0; s < n_seq_max; ++s) {
+ cur_seq_pos[s] = -1;
+ }
+
+ for (int32_t i = 0; i < batch.n_tokens; ++i) {
+ const llama_pos pos = batch.pos[i];
+
+ for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
+ const llama_seq_id seq_id = batch.seq_id[i][s];
+
+ cur_seq_set[seq_id] &= seq_set[i];
+
+ if (cur_seq_set[seq_id].none()) {
+ LLAMA_LOG_ERROR("%s: sequence %d belongs to incompatible sequence sets (not allowed)\n", __func__, seq_id);
+ return false;
+ }
+
+ if (pos < cur_seq_pos[seq_id]) {
+ LLAMA_LOG_ERROR("%s: sequence %d positions are decreasing (not allowed)\n", __func__, seq_id);
+ return false;
+ }
+ }
+ }
+ }
+
+ split_reset();
+
+ return true;
+}
+
+llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t n_seqs) {
+ const uint32_t n_tokens = n_seq_tokens*n_seqs;
+
+ clear();
+ split_reset();
+
+ auto udata = std::make_shared<llama_ubatch::data_t>();
+
+ udata->token .resize(n_tokens);
+ udata->embd .clear();
+ udata->pos .resize(n_tokens);
+ udata->n_seq_id .resize(n_tokens);
+ udata->seq_id .resize(n_tokens);
+ udata->seq_id_unq.resize(0);
+ udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
+ udata->output .resize(n_tokens);
+
+ for (uint32_t s = 0; s < n_seqs; ++s) {
+ udata->seq_idx[s] = s;
+ udata->seq_id_unq.push_back(s);
+ }
+
+ llama_ubatch res {
+ /*.b_equal_seqs =*/ true,
+ /*.n_tokens =*/ n_tokens,
+ /*.n_seq_tokens =*/ n_seq_tokens,
+ /*.n_seqs =*/ n_seqs,
+ /*.n_seqs_unq =*/ n_seqs,
+ /*.n_pos =*/ n_pos_per_embd,
+
+ /*.token =*/ udata->token.data(),
+ /*.embd =*/ nullptr,
+ /*.pos =*/ udata->pos.data(),
+ /*.n_seq_id =*/ udata->n_seq_id.data(),
+ /*.seq_id =*/ udata->seq_id.data(),
+ /*.seq_id_unq =*/ udata->seq_id_unq.data(),
+ /*.seq_idx =*/ udata->seq_idx.data(),
+ /*.output =*/ udata->output.data(),
+ /*.data =*/ std::move(udata),
+ };
+
+ return res;
+}
+
+const llama_batch & llama_batch_allocr::get_batch() const {
+ return batch;
+}
+
+uint32_t llama_batch_allocr::get_n_tokens() const {
+ return batch.n_tokens;
+}
+
+uint32_t llama_batch_allocr::get_n_outputs() const {
+ return n_outputs;
+}
+
+uint32_t llama_batch_allocr::get_n_used() const {
+ return n_used;
+}
+
+std::vector<int32_t> & llama_batch_allocr::get_out_ids() {
+ return out_ids;
+}
+
+llama_pos llama_batch_allocr::seq_pos_min(llama_seq_id seq_id) const {
+ return seq_pos[seq_id].empty() ? -1 : *seq_pos[seq_id].begin();
+}
+
+llama_pos llama_batch_allocr::seq_pos_max(llama_seq_id seq_id) const {
+ return seq_pos[seq_id].empty() ? -1 : *seq_pos[seq_id].rbegin();
+}
+
+void llama_batch_allocr::split_reset() {
+ out_ids.clear();
+
+ n_used = 0;
+
+ used.clear();
+ used.resize(get_n_tokens(), false);
+}
+
+llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
+ // find the first unused token
+ uint32_t cur_idx = 0;
+ while (cur_idx < used.size() && used[cur_idx]) {
+ ++cur_idx;
+ }
+
+ // we are done
+ if (cur_idx >= used.size()) {
+ return {};
+ }
+
+ std::vector<int32_t> idxs;
+
+ while (true) {
+ idxs.push_back(cur_idx);
+
+ used[cur_idx] = true;
+ ++n_used;
+
+ ++cur_idx;
+
+ if (cur_idx >= used.size()) {
+ break;
+ }
+
+ if (idxs.size() >= n_ubatch) {
+ break;
+ }
+ }
+
+ return ubatch_add(idxs, idxs.size(), false);
+}
+
+llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
+ if (sequential && has_cpl) {
+ LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)\n", __func__);
+
+ return {};
+ }
+
+ std::vector<seq_set_t> cur_seq_set;
+
+ llama_seq_id last_seq_id = -1;
+
+ // determine the non-overlapping sequence sets participating in this ubatch
+ for (int32_t i = 0; i < batch.n_tokens; ++i) {
+ if (used[i]) {
+ continue;
+ }
+
+ bool add = true;
+
+ for (uint32_t s = 0; s < cur_seq_set.size(); ++s) {
+ // no overlap with existing sequence sets:
+ if (!(cur_seq_set[s] & seq_set[i]).none()) {
+ add = false;
+ break;
+ }
+ }
+
+ // accept only increasing sequence ids
+ if (sequential) {
+ add = add && (cur_seq_set.empty() || batch.seq_id[i][0] == last_seq_id + 1);
+ }
+
+ if (add) {
+ cur_seq_set.push_back(seq_set[i]);
+
+ last_seq_id = batch.seq_id[i][0];
+
+ if (cur_seq_set.size() > n_ubatch) {
+ break;
+ }
+ }
+ }
+
+ const uint32_t n_seqs = cur_seq_set.size();
+
+ // we are done
+ if (n_seqs == 0) {
+ return {};
+ }
+
+ // the current batch index of each sequence set
+ std::vector<int32_t> cur_idx(n_seqs, 0);
+
+ for (uint32_t s = 0; s < n_seqs; ++s) {
+ while (used[seq_set_map[cur_seq_set[s]][cur_idx[s]]]) {
+ ++cur_idx[s];
+ }
+ }
+
+ // the list of batch indices for each sequence set
+ // at the end we will concat these to get the final ubatch
+ std::vector<idx_vec_t> idxs_per_seq(n_seqs);
+
+ while (true) {
+        // we can expand the ubatch by one token per sequence set only if every sequence set
+        // still has at least one unused token and we haven't reached n_ubatch
+ bool can_expand = true;
+
+ for (uint32_t s = 0; s < n_seqs; ++s) {
+ if (cur_idx[s] >= (int32_t) seq_set_map[cur_seq_set[s]].size()) {
+ can_expand = false;
+ break;
+ }
+ }
+
+ if (!can_expand) {
+ break;
+ }
+
+ for (uint32_t s = 0; s < n_seqs; ++s) {
+ const int32_t idx = seq_set_map[cur_seq_set[s]][cur_idx[s]];
+
+ idxs_per_seq[s].push_back(idx);
+
+ used[idx] = true;
+ ++n_used;
+
+ ++cur_idx[s];
+ }
+
+ if ((idxs_per_seq[0].size() + 1)*n_seqs > n_ubatch) {
+ break;
+ }
+ }
+
+ // concat the per-sequence-set lists
+ std::vector<int32_t> idxs;
+
+ for (uint32_t s = 0; s < n_seqs; ++s) {
+ idxs.insert(idxs.end(), idxs_per_seq[s].begin(), idxs_per_seq[s].end());
+ }
+
+ return ubatch_add(idxs, n_seqs, true);
+}
+
+llama_ubatch llama_batch_allocr::split_seq(uint32_t n_ubatch) {
+ // find the first unused token
+ uint32_t cur_idx = 0;
+ while (cur_idx < used.size() && used[cur_idx]) {
+ ++cur_idx;
+ }
+
+ // we are done
+ if (cur_idx >= used.size()) {
+ return {};
+ }
+
+ // this is the starting sequence set
+ // we allow adding tokens only if their sequence set is a subset of the current sequence set
+ auto cur_seq_set = seq_set[cur_idx];
+
+ std::vector<int32_t> idxs;
+
+ while (true) {
+ idxs.push_back(cur_idx);
+
+ used[cur_idx] = true;
+ ++n_used;
+
+ if (idxs.size() >= n_ubatch) {
+ break;
+ }
+
+ do {
+ ++cur_idx;
+ } while (cur_idx < get_n_tokens() && (used[cur_idx] || ((cur_seq_set & seq_set[cur_idx]) != seq_set[cur_idx])));
+
+ if (cur_idx == get_n_tokens()) {
+ break;
+ }
+
+ cur_seq_set = seq_set[cur_idx];
+ }
+
+ return ubatch_add(idxs, 1, true);
+}
+
+void llama_batch_allocr::clear() {
+ n_outputs = 0;
+
+ batch = {};
+
+ pos .clear();
+ n_seq_id .clear();
+ seq_id .clear();
+ seq_id_unq.clear();
+ output .clear();
+
+ for (auto & cur : seq_pos) {
+ cur.clear();
+ }
+
+ for (auto & cur : seq_cpl) {
+ std::fill(cur.begin(), cur.end(), false);
+ }
+
+ seq_set.clear();
+
+ seq_set_map.clear();
+
+ std::fill(seq_idx.begin(), seq_idx.end(), -1);
+}
+
+llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, uint32_t n_seqs, bool equal_seqs) {
+ const uint32_t n_tokens = idxs.size();
+
+ assert(n_tokens%n_seqs == 0);
+
+ auto udata = std::make_shared<llama_ubatch::data_t>();
+
+ const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
+ const int64_t n_pos_all = (int64_t) n_tokens*n_pos_per_embd;
+
+ udata->token .resize(n_tokens);
+ udata->embd .resize(n_embd_all);
+ udata->pos .resize(n_pos_all);
+ udata->n_seq_id .resize(n_tokens);
+ udata->seq_id .resize(n_tokens);
+ udata->seq_id_unq.resize(0);
+ udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
+ udata->output .resize(n_tokens);
+
+ udata->seq_id_data.reserve(n_tokens);
+
+ seq_set_t seq_set_unq;
+
+ for (size_t i = 0; i < idxs.size(); ++i) {
+ if (batch.token) {
+ udata->token[i] = batch.token[idxs[i]];
+ }
+
+ if (batch.embd) {
+ memcpy(udata->embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float));
+ }
+
+ for (size_t j = 0; j < (size_t)n_pos_per_embd; ++j) {
+ // if we are using M-RoPE
+ // if the current batch is text, we need to broadcast the same position across all RoPE sections
+ // otherwise, the input batch is image embeddings, we copy the positions as-is
+ // if we are not using M-RoPE, there is only one position per token (this loop runs only once)
+ size_t src_off = batch.token ? 0 : j*batch.n_tokens;
+ udata->pos[j*n_tokens + i] = batch.pos[src_off + idxs[i]];
+ }
+
+ udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
+ udata->output[i] = batch.logits[idxs[i]];
+
+ for (int s = 0; s < udata->n_seq_id[i]; ++s) {
+ const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
+
+ udata->seq_id_data.push_back(seq_id);
+ seq_set_unq.set(seq_id);
+ }
+
+ if (udata->output[i]) {
+ out_ids.push_back(idxs[i]);
+ }
+ }
+
+ llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
+ for (size_t i = 0; i < idxs.size(); ++i) {
+ udata->seq_id[i] = seq_id_ptr;
+ seq_id_ptr += udata->n_seq_id[i];
+ }
+
+ for (uint32_t s = 0; s < n_seq_max; ++s) {
+ if (seq_set_unq.test(s)) {
+ udata->seq_idx[s] = udata->seq_id_unq.size();
+ udata->seq_id_unq.push_back(s);
+ }
+ }
+
+ llama_ubatch res {
+ /*.b_equal_seqs =*/ equal_seqs,
+ /*.n_tokens =*/ n_tokens,
+ /*.n_seq_tokens =*/ n_tokens/n_seqs,
+ /*.n_seqs =*/ n_seqs,
+ /*.n_seqs_unq =*/ (uint32_t) udata->seq_id_unq.size(),
+ /*.n_pos =*/ n_pos_per_embd,
+
+ /*.token =*/ batch.token ? udata->token.data() : nullptr,
+ /*.embd =*/ batch.embd ? udata->embd.data() : nullptr,
+ /*.pos =*/ udata->pos.data(),
+ /*.n_seq_id =*/ udata->n_seq_id.data(),
+ /*.seq_id =*/ udata->seq_id.data(),
+ /*.seq_id_unq =*/ udata->seq_id_unq.data(),
+ /*.seq_idx =*/ udata->seq_idx.data(),
+ /*.output =*/ udata->output.data(),
+ /*.data =*/ std::move(udata),
+ };
+
+ if (debug > 0) {
+ LLAMA_LOG_DEBUG("%s: added ubatch to split:\n", __func__);
+
+ ubatch_print(res, debug);
+ }
+
+ return res;
+}
+
+void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
+ if (debug > 0) {
+ LLAMA_LOG_DEBUG("%s: equal_seqs = %d\n", __func__, ubatch.equal_seqs());
+ LLAMA_LOG_DEBUG("%s: n_tokens = %d\n", __func__, ubatch.n_tokens);
+ LLAMA_LOG_DEBUG("%s: n_seq_tokens = %d\n", __func__, ubatch.n_seq_tokens);
+ LLAMA_LOG_DEBUG("%s: n_seqs = %d\n", __func__, ubatch.n_seqs);
+ LLAMA_LOG_DEBUG("%s: n_seqs_unq = %d\n", __func__, ubatch.n_seqs_unq);
+
+ std::stringstream ss_seq_id_unq;
+ std::stringstream ss_seq_idx;
+
+ ss_seq_id_unq << "[ ";
+ ss_seq_idx << "[";
+
+ for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+ ss_seq_id_unq << ubatch.seq_id_unq[s] << " ";
+ }
+
+ for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ if (ubatch.seq_idx[s] >= 0) {
+ ss_seq_idx << ubatch.seq_idx[s]%10;
+ } else {
+ ss_seq_idx << ".";
+ }
+ }
+
+ ss_seq_id_unq << "]";
+ ss_seq_idx << "]";
+
+ LLAMA_LOG_DEBUG("%s: token = %p\n", __func__, (void *) ubatch.token);
+ LLAMA_LOG_DEBUG("%s: embd = %p\n", __func__, (void *) ubatch.embd);
+ LLAMA_LOG_DEBUG("%s: pos = %p\n", __func__, (void *) ubatch.pos);
+ LLAMA_LOG_DEBUG("%s: n_seq_id = %p\n", __func__, (void *) ubatch.n_seq_id);
+ LLAMA_LOG_DEBUG("%s: seq_id = %p\n", __func__, (void *) ubatch.seq_id);
+ LLAMA_LOG_DEBUG("%s: seq_id_unq = %s\n", __func__, ss_seq_id_unq.str().c_str());
+ LLAMA_LOG_DEBUG("%s: seq_idx = %s\n", __func__, ss_seq_idx.str().c_str());
+ LLAMA_LOG_DEBUG("%s: output = %p\n", __func__, (void *) ubatch.output);
+ LLAMA_LOG_DEBUG("%s: n_outputs = %d\n", __func__, n_outputs);
+
+ if (debug > 1) {
+ int seq_id_max = 0;
+ for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+                for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
+                    seq_id_max = std::max(seq_id_max, ubatch.seq_id[i][s]);
+                }
+ }
+ ++seq_id_max;
+
+ LLAMA_LOG_DEBUG("%s: token = [\n", __func__);
+ for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+ std::vector<int8_t> seq_id(seq_id_max);
+
+ for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
+ seq_id[ubatch.seq_id[i][s]] = 1;
+ }
+
+ std::stringstream ss;
+ for (int s = 0; s < seq_id_max; ++s) {
+ if (seq_id[s]) {
+ ss << s%10;
+ } else {
+ ss << ".";
+ }
+ }
+
+ if (ubatch.token) {
+ LLAMA_LOG_DEBUG("%s: %4d: id = %6d (%16s), pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
+ __func__, i, ubatch.token[i], vocab->token_to_piece(ubatch.token[i]).c_str(),
+ ubatch.pos[i], ubatch.n_seq_id[i], ss.str().c_str(), ubatch.output[i]);
+ } else {
+ LLAMA_LOG_DEBUG("%s: %4d: [embd], pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
+ __func__, i, ubatch.pos[i], ubatch.n_seq_id[i], ss.str().c_str(), ubatch.output[i]);
+ }
+ }
+ LLAMA_LOG_DEBUG("%s: ]\n", __func__);
+ }
+ }
+}
+
+//
+// interface implementation
+//
+
+struct llama_batch llama_batch_get_one(
+ llama_token * tokens,
+ int32_t n_tokens) {
+ return {
+ /*n_tokens =*/ n_tokens,
+ /*tokens =*/ tokens,
+ /*embd =*/ nullptr,
+ /*pos =*/ nullptr,
+ /*n_seq_id =*/ nullptr,
+ /*seq_id =*/ nullptr,
+ /*logits =*/ nullptr,
+ };
+}
+
+struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
+ llama_batch batch = {
+ /*n_tokens =*/ 0,
+ /*tokens =*/ nullptr,
+ /*embd =*/ nullptr,
+ /*pos =*/ nullptr,
+ /*n_seq_id =*/ nullptr,
+ /*seq_id =*/ nullptr,
+ /*logits =*/ nullptr,
+ };
+
+ if (embd) {
+ batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
+ } else {
+ batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
+ }
+
+ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc);
+ batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc);
+ batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
+ for (int i = 0; i < n_tokens_alloc; ++i) {
+ batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
+ }
+ batch.seq_id[n_tokens_alloc] = nullptr;
+
+ batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc);
+
+ return batch;
+}
+
+void llama_batch_free(struct llama_batch batch) {
+ if (batch.token) free(batch.token);
+ if (batch.embd) free(batch.embd);
+ if (batch.pos) free(batch.pos);
+ if (batch.n_seq_id) free(batch.n_seq_id);
+ if (batch.seq_id) {
+ for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
+ free(batch.seq_id[i]);
+ }
+ free(batch.seq_id);
+ }
+ if (batch.logits) free(batch.logits);
+}
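+
+// usage sketch (illustrative) - allocate a token batch, fill a single token and release it:
+//
+//   llama_batch batch = llama_batch_init(/*n_tokens_alloc=*/ 512, /*embd=*/ 0, /*n_seq_max=*/ 1);
+//
+//   batch.n_tokens    = 1;
+//   batch.token   [0] = token_id; // hypothetical token id obtained from the vocab
+//   batch.pos     [0] = 0;
+//   batch.n_seq_id[0] = 1;
+//   batch.seq_id  [0][0] = 0;
+//   batch.logits  [0] = true;
+//
+//   // ... decode the batch ...
+//
+//   llama_batch_free(batch);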
diff --git a/llama.cpp/src/llama-batch.h b/llama.cpp/src/llama-batch.h
new file mode 100644
index 0000000..8e6fac0
--- /dev/null
+++ b/llama.cpp/src/llama-batch.h
@@ -0,0 +1,173 @@
+#pragma once
+
+#include "llama.h"
+
+#include "llama-cparams.h"
+
+#include <array>
+#include <vector>
+#include <set>
+#include <bitset>
+#include <memory>
+#include <unordered_map>
+
+// keep this struct lightweight
+struct llama_ubatch {
+ bool equal_seqs() const {
+ return b_equal_seqs != 0;
+ }
+
+ // typical for M-RoPE cases:
+    // 0 - sequential position of the tokens/embeddings in the sequence
+ // 1 - y position in the image
+ // 2 - x position in the image
+ // 3 - other
+ bool is_pos_2d() const {
+ // TODO @ngxson : we may need to check for model arch when more models use >1 positions
+ return n_pos >= 3;
+ }
+
+    uint32_t b_equal_seqs; // note: this is a boolean, but we use a uint32_t for alignment
+ // otherwise address sanitizer complains
+ // TODO: whole_seqs for embeddings?
+
+ uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
+ uint32_t n_seq_tokens; // tokens per sequence set
+ uint32_t n_seqs; // sequence sets in the ubatch
+ uint32_t n_seqs_unq; // unique sequence ids in the ubatch
+ uint32_t n_pos; // number of position inputs for each token/embedding
+
+ // seq_id_unq: unique sequence ids in the ubatch
+ // seq_idx: indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
+ // used for extracting sequence pooled embeddings
+
+ // // size | idx | val
+ llama_token * token; // [n_tokens] | i | id, token
+ float * embd; // [n_embd, n_tokens] | i | embd
+ llama_pos * pos; // [n_tokens*n_pos] | i | pos
+ int32_t * n_seq_id; // [n_tokens] | i | -
+ llama_seq_id ** seq_id; // [n_tokens] | s | s0, s1, seq_id
+ llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id
+ int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx
+ int8_t * output; // [n_tokens] | i | -
+
+ struct data_t {
+ std::vector<llama_token> token;
+ std::vector<float> embd;
+ std::vector<llama_pos> pos;
+ std::vector<int32_t> n_seq_id;
+ std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
+ std::vector<llama_seq_id> seq_id_unq;
+ std::vector<int32_t> seq_idx;
+ std::vector<int8_t> output;
+
+ std::vector<llama_seq_id> seq_id_data;
+ };
+
+    // if set, the llama_ubatch pointers above point into this data; otherwise they point to external, non-owning data
+ std::shared_ptr<data_t> data;
+};
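+
+// indexing sketch (illustrative): when equal_seqs() is true, the tokens are grouped by
+// sequence set, so the j-th token of sequence set s is at index s*n_seq_tokens + j:
+//
+//   for (uint32_t s = 0; s < ub.n_seqs; ++s) {
+//       for (uint32_t j = 0; j < ub.n_seq_tokens; ++j) {
+//           const llama_token id = ub.token[s*ub.n_seq_tokens + j];
+//       }
+//   }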
+
+// a helper for sanitizing, fulfilling and splitting a batch
+class llama_batch_allocr {
+public:
+ llama_batch_allocr(uint32_t n_pos_per_embd);
+
+ // sanitize and auto-gen missing data in the input batch
+    // memory is optional. if provided, it will be used to check for sequence continuity and to determine the positions
+ bool init(
+ const llama_batch & batch_inp,
+ const llama_vocab & vocab,
+ const llama_memory_i * memory,
+ uint32_t n_embd,
+ uint32_t n_seq_max,
+ bool output_all);
+
+ const llama_batch & get_batch() const;
+
+ uint32_t get_n_tokens() const;
+ uint32_t get_n_outputs() const;
+ uint32_t get_n_used() const;
+
+ // the array of output indices in the order they were encountered during the ubatch splitting
+ std::vector<int32_t> & get_out_ids();
+
+    // min/max positions of each sequence in the input batch
+ llama_pos seq_pos_min(llama_seq_id seq_id) const;
+ llama_pos seq_pos_max(llama_seq_id seq_id) const;
+
+ // call once before splitting the batch to reset the internal state
+ void split_reset();
+
+ // simple split, unknown number of sequence sets of unequal lengths
+ llama_ubatch split_simple(uint32_t n_ubatch);
+
+    // make ubatches of equal-length sequence sets
+    // if sequential == true, the sequence sets in the ubatch will have consecutive, increasing sequence ids
+ llama_ubatch split_equal(uint32_t n_ubatch, bool sequential);
+
+ // sequence-set-wise split - each ubatch contains a single sequence-set
+ llama_ubatch split_seq(uint32_t n_ubatch);
+
+ // a helper method for creating a well-defined ubatch of tokens
+ // TODO: support embeddings if needed in the future
+ llama_ubatch ubatch_reserve(uint32_t n_seq_tokens, uint32_t n_seqs);
+
+private:
+ void clear();
+
+ // create the next ubatch based on the provided batch indices (idxs) and the number of sequence sets (n_seqs)
+ // return llama_ubatch.n_tokens == 0 if the entire batch was consumed
+ llama_ubatch ubatch_add(const std::vector<int32_t> & idxs, uint32_t n_seqs, bool equal_seqs);
+
+ // for debugging, start with LLAMA_BATCH_DEBUG=2
+ void ubatch_print(const llama_ubatch & ubatch, int debug);
+
+ llama_batch batch;
+
+ // only for debugging purposes
+ const llama_vocab * vocab;
+
+ // TODO: this is more of a temporary solution until we have a better way to handle multiple positions per token/embd
+ // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
+ const uint32_t n_pos_per_embd;
+
+ uint32_t n_embd;
+ uint32_t n_seq_max;
+ uint32_t n_outputs;
+
+ std::array<llama_seq_id, 1> seq_id_0 = {{ 0 }}; // default sequence id
+
+ std::vector<llama_pos> pos;
+ std::vector<int32_t> n_seq_id;
+ std::vector<llama_seq_id *> seq_id;
+ std::vector<llama_seq_id> seq_id_unq;
+ std::vector<int32_t> seq_idx;
+ std::vector<int8_t> output;
+
+ using pos_set_t = std::set<llama_pos>;
+ using seq_cpl_t = std::vector<bool>;
+
+ // helper flag to quickly determine if there are any coupled sequences in the batch
+ bool has_cpl = false;
+
+ std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
+ std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
+
+ using idx_vec_t = std::vector<int32_t>;
+ using seq_set_t = std::bitset<LLAMA_MAX_SEQ>;
+
+ std::vector<seq_set_t> seq_set; // seq_set[i]: the sequence set of token i
+
+ std::unordered_map<seq_set_t, idx_vec_t> seq_set_map; // the indices at which the sequence set appears
+
+ // batch indices of the output
+ std::vector<int32_t> out_ids;
+
+ uint32_t n_used;
+
+ // used[i] indicates if token i has already been used in a previous ubatch
+ std::vector<bool> used;
+
+ int debug;
+};
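+
+// usage sketch (illustrative) - sanitize a batch and consume it in ubatch-sized chunks:
+//
+//   llama_batch_allocr balloc(/*n_pos_per_embd=*/ 1);
+//
+//   if (!balloc.init(batch, vocab, memory, n_embd, n_seq_max, /*output_all=*/ false)) {
+//       // invalid batch - abort
+//   }
+//
+//   balloc.split_reset();
+//
+//   while (true) {
+//       llama_ubatch ubatch = balloc.split_simple(n_ubatch);
+//       if (ubatch.n_tokens == 0) {
+//           break; // the entire batch has been consumed
+//       }
+//       // ... process ubatch ...
+//   }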
diff --git a/llama.cpp/src/llama-chat.cpp b/llama.cpp/src/llama-chat.cpp
new file mode 100644
index 0000000..c415a99
--- /dev/null
+++ b/llama.cpp/src/llama-chat.cpp
@@ -0,0 +1,896 @@
+#include "llama-chat.h"
+
+#include "llama.h"
+
+#include <map>
+#include <sstream>
+#include <algorithm>
+#include <cctype> // isspace
+
+#if __cplusplus >= 202000L
+ #define LU8(x) (const char*)(u8##x)
+#else
+ #define LU8(x) u8##x
+#endif
+
+// trim whitespace from the beginning and end of a string
+static std::string trim(const std::string & str) {
+ size_t start = 0;
+ size_t end = str.size();
+ while (start < end && isspace(static_cast<unsigned char>(str[start]))) {
+ start += 1;
+ }
+ while (end > start && isspace(static_cast<unsigned char>(str[end - 1]))) {
+ end -= 1;
+ }
+ return str.substr(start, end - start);
+}
+
+static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
+ { "chatml", LLM_CHAT_TEMPLATE_CHATML },
+ { "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
+ { "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
+ { "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
+ { "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
+ { "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
+ { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
+ { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
+ { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+ { "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
+ { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
+ { "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
+ { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
+ { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
+ { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
+ { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
+ { "orion", LLM_CHAT_TEMPLATE_ORION },
+ { "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
+ { "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
+ { "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
+ { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
+ { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
+ { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
+ { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
+ { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
+ { "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
+ { "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 },
+ { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
+ { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
+ { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
+ { "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 },
+ { "exaone-moe", LLM_CHAT_TEMPLATE_EXAONE_MOE },
+ { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
+ { "granite", LLM_CHAT_TEMPLATE_GRANITE },
+ { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
+ { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
+ { "yandex", LLM_CHAT_TEMPLATE_YANDEX },
+ { "bailing", LLM_CHAT_TEMPLATE_BAILING },
+ { "bailing-think", LLM_CHAT_TEMPLATE_BAILING_THINK },
+ { "bailing2", LLM_CHAT_TEMPLATE_BAILING2 },
+ { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
+ { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
+ { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
+ { "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
+ { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
+ { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
+ { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
+ { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
+ { "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
+ { "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
+};
+
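+// note: throws std::out_of_range for names that are not in LLM_CHAT_TEMPLATES;
+// llm_chat_detect_template() below relies on this and falls back to heuristic matching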
+llm_chat_template llm_chat_template_from_str(const std::string & name) {
+ return LLM_CHAT_TEMPLATES.at(name);
+}
+
+llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
+ try {
+ return llm_chat_template_from_str(tmpl);
+ } catch (const std::out_of_range &) {
+ // ignore
+ }
+
+ auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
+ return tmpl.find(haystack) != std::string::npos;
+ };
+ if (tmpl_contains("<|im_start|>")) {
+ return tmpl_contains("<|im_sep|>")
+ ? LLM_CHAT_TEMPLATE_PHI_4
+ : tmpl_contains("<end_of_utterance>")
+ ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
+ : LLM_CHAT_TEMPLATE_CHATML;
+ } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
+ if (tmpl_contains("[SYSTEM_PROMPT]")) {
+ return LLM_CHAT_TEMPLATE_MISTRAL_V7;
+ } else if (
+ // catches official 'v1' template
+ tmpl_contains("' [INST] ' + system_message")
+ // catches official 'v3' and 'v3-tekken' templates
+ || tmpl_contains("[AVAILABLE_TOOLS]")
+ ) {
+ // Official mistral 'v1', 'v3' and 'v3-tekken' templates
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
+ if (tmpl_contains(" [INST]")) {
+ return LLM_CHAT_TEMPLATE_MISTRAL_V1;
+ } else if (tmpl_contains("\"[INST]\"")) {
+ return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
+ }
+ return LLM_CHAT_TEMPLATE_MISTRAL_V3;
+ } else {
+ // llama2 template and its variants
+ // [variant] support system message
+ // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
+ bool support_system_message = tmpl_contains("<<SYS>>");
+ bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
+ bool strip_message = tmpl_contains("content.strip()");
+ if (strip_message) {
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
+ } else if (add_bos_inside_history) {
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
+ } else if (support_system_message) {
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
+ } else {
+ return LLM_CHAT_TEMPLATE_LLAMA_2;
+ }
+ }
+ } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
+ return LLM_CHAT_TEMPLATE_PHI_3;
+ } else if (tmpl_contains("[gMASK]<sop>")) {
+ return LLM_CHAT_TEMPLATE_CHATGLM_4;
+ } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
+ if (tmpl_contains("<|tool_declare|>")) {
+ return LLM_CHAT_TEMPLATE_EXAONE_MOE;
+ }
+ return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
+ } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
+ return LLM_CHAT_TEMPLATE_GLMEDGE;
+ } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
+ return LLM_CHAT_TEMPLATE_ZEPHYR;
+ } else if (tmpl_contains("bos_token + message['role']")) {
+ return LLM_CHAT_TEMPLATE_MONARCH;
+ } else if (tmpl_contains("<start_of_turn>")) {
+ return LLM_CHAT_TEMPLATE_GEMMA;
+ } else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
+ // OrionStarAI/Orion-14B-Chat
+ return LLM_CHAT_TEMPLATE_ORION;
+ } else if (tmpl_contains("GPT4 Correct ")) {
+ // openchat/openchat-3.5-0106
+ return LLM_CHAT_TEMPLATE_OPENCHAT;
+ } else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
+ // eachadea/vicuna-13b-1.1 (and Orca variant)
+ if (tmpl_contains("SYSTEM: ")) {
+ return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
+ }
+ return LLM_CHAT_TEMPLATE_VICUNA;
+ } else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
+ // deepseek-ai/deepseek-coder-33b-instruct
+ return LLM_CHAT_TEMPLATE_DEEPSEEK;
+ } else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
+ // CohereForAI/c4ai-command-r-plus
+ return LLM_CHAT_TEMPLATE_COMMAND_R;
+ } else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
+ return LLM_CHAT_TEMPLATE_LLAMA_3;
+ } else if (tmpl_contains("[gMASK]sop")) {
+ // chatglm3-6b
+ return LLM_CHAT_TEMPLATE_CHATGLM_3;
+ } else if (tmpl_contains(LU8("<用户>"))) {
+ // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+ return LLM_CHAT_TEMPLATE_MINICPM;
+ } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
+ return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
+ } else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) {
+ return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
+ } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
+ if (tmpl_contains("[|tool|]")) {
+ return LLM_CHAT_TEMPLATE_EXAONE_4;
+ }
+ // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
+ // EXAONE-3.0-7.8B-Instruct
+ return LLM_CHAT_TEMPLATE_EXAONE_3;
+ } else if (tmpl_contains("rwkv-world") || tmpl_contains("{{- 'User: ' + message['content']|trim + '\\n\\n' -}}")) {
+ return LLM_CHAT_TEMPLATE_RWKV_WORLD;
+ } else if (tmpl_contains("<|start_of_role|>")) {
+ return LLM_CHAT_TEMPLATE_GRANITE;
+ } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
+ return LLM_CHAT_TEMPLATE_GIGACHAT;
+ } else if (tmpl_contains("<|role_start|>")) {
+ return LLM_CHAT_TEMPLATE_MEGREZ;
+ } else if (tmpl_contains(" Ассистент:")) {
+ return LLM_CHAT_TEMPLATE_YANDEX;
+ } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
+ return LLM_CHAT_TEMPLATE_BAILING;
+ } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("\"HUMAN\"") && tmpl_contains("<think>")) {
+ return LLM_CHAT_TEMPLATE_BAILING_THINK;
+ } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("<role>HUMAN</role>") && tmpl_contains("<|role_end|>")) {
+ return LLM_CHAT_TEMPLATE_BAILING2;
+ } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
+ return LLM_CHAT_TEMPLATE_LLAMA4;
+ } else if (tmpl_contains("<|endofuserprompt|>")) {
+ return LLM_CHAT_TEMPLATE_DOTS1;
+ } else if (tmpl_contains("<|extra_0|>") && tmpl_contains("<|extra_4|>")) {
+ return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
+ } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
+ return LLM_CHAT_TEMPLATE_OPENAI_MOE;
+ } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
+ return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
+ } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
+ return LLM_CHAT_TEMPLATE_KIMI_K2;
+ } else if (tmpl_contains("<seed:bos>")) {
+ return LLM_CHAT_TEMPLATE_SEED_OSS;
+ } else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
+ return LLM_CHAT_TEMPLATE_GROK_2;
+ } else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
+ return LLM_CHAT_TEMPLATE_PANGU_EMBED;
+ } else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
+ return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
+ }
+ return LLM_CHAT_TEMPLATE_UNKNOWN;
+}
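+
+// Illustrative examples (not upstream documentation) of how the detection above
+// resolves, using only entries and markers visible in this file:
+//
+//     llm_chat_detect_template("gpt-oss");                         // exact name hit -> LLM_CHAT_TEMPLATE_OPENAI_MOE
+//     llm_chat_detect_template("...<|im_start|>...<|im_sep|>..."); // -> LLM_CHAT_TEMPLATE_PHI_4
+//     llm_chat_detect_template("...<|im_start|>...");              // -> LLM_CHAT_TEMPLATE_CHATML
+//     llm_chat_detect_template("no known markers");                // -> LLM_CHAT_TEMPLATE_UNKNOWN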
+
+// Simple version of "llama_apply_chat_template" that only works with strings
+// Supported templates are matched heuristically (see llm_chat_detect_template above); this is not a Jinja parser.
+int32_t llm_chat_apply_template(
+ llm_chat_template tmpl,
+ const std::vector<const llama_chat_message *> & chat,
+ std::string & dest, bool add_ass) {
+ // Taken from the research: https://github.com/ggml-org/llama.cpp/issues/5527
+ std::stringstream ss;
+ if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
+ // chatml template
+ for (auto message : chat) {
+ ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
+ }
+ if (add_ass) {
+ ss << "<|im_start|>assistant\n";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
+ // Official mistral 'v7' template
+ // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+ // https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
+ const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
+ for (auto message : chat) {
+ std::string role(message->role);
+ std::string content(message->content);
+ if (role == "system") {
+ ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
+ } else if (role == "user") {
+ ss << "[INST]" << trailing_space << content << "[/INST]";
+ } else {
+ ss << trailing_space << content << "</s>";
+ }
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
+ || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
+ || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
+ std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
+ std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
+ bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
+ bool is_inside_turn = false;
+ for (auto message : chat) {
+ if (!is_inside_turn) {
+ ss << leading_space << "[INST]" << trailing_space;
+ is_inside_turn = true;
+ }
+ std::string role(message->role);
+ std::string content(message->content);
+ if (role == "system") {
+ ss << content << "\n\n";
+ } else if (role == "user") {
+ ss << content << leading_space << "[/INST]";
+ } else {
+ ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
+ is_inside_turn = false;
+ }
+ }
+ } else if (
+ tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
+ // llama2 template and its variants
+ // [variant] support system message
+ // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
+ bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
+ // [variant] add BOS inside history
+ bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
+ // [variant] trim spaces from the input message
+ bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
+ // construct the prompt
+ bool is_inside_turn = true; // skip BOS at the beginning
+ ss << "[INST] ";
+ for (auto message : chat) {
+ std::string content = strip_message ? trim(message->content) : message->content;
+ std::string role(message->role);
+ if (!is_inside_turn) {
+ is_inside_turn = true;
+ ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
+ }
+ if (role == "system") {
+ if (support_system_message) {
+ ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
+ } else {
+ // if the model does not support a system message, we still include it in the first message, but without <<SYS>>
+ ss << content << "\n";
+ }
+ } else if (role == "user") {
+ ss << content << " [/INST]";
+ } else {
+ ss << content << "</s>";
+ is_inside_turn = false;
+ }
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
+ // Phi 3
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
+ }
+ if (add_ass) {
+ ss << "<|assistant|>\n";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_4) {
+ // Phi-4 (ChatML-style template that uses <|im_sep|> instead of a newline after the role)
+ for (auto message : chat) {
+ ss << "<|im_start|>" << message->role << "<|im_sep|>" << message->content << "<|im_end|>";
+ }
+ if (add_ass) {
+ ss << "<|im_start|>assistant<|im_sep|>";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
+ // Falcon 3
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|" << role << "|>\n" << message->content << "\n";
+ }
+ if (add_ass) {
+ ss << "<|assistant|>\n";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
+ // zephyr template
+ for (auto message : chat) {
+ ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
+ }
+ if (add_ass) {
+ ss << "<|assistant|>\n";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
+ // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
+ for (auto message : chat) {
+ std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
+ ss << bos << message->role << "\n" << message->content << "</s>\n";
+ }
+ if (add_ass) {
+ ss << "<s>assistant\n";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
+ // google/gemma-7b-it
+ std::string system_prompt = "";
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ // gemma has no system role, so we merge the system message into the next user prompt and nothing is lost
+ system_prompt += trim(message->content);
+ continue;
+ }
+ // in gemma, "assistant" is "model"
+ role = role == "assistant" ? "model" : message->role;
+ ss << "<start_of_turn>" << role << "\n";
+ if (!system_prompt.empty() && role != "model") {
+ ss << system_prompt << "\n\n";
+ system_prompt = "";
+ }
+ ss << trim(message->content) << "<end_of_turn>\n";
+ }
+ if (add_ass) {
+ ss << "<start_of_turn>model\n";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
+ // OrionStarAI/Orion-14B-Chat
+ std::string system_prompt = "";
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ // there is no system message support, so we merge it into the next user prompt
+ system_prompt += message->content;
+ continue;
+ } else if (role == "user") {
+ ss << "Human: ";
+ if (!system_prompt.empty()) {
+ ss << system_prompt << "\n\n";
+ system_prompt = "";
+ }
+ ss << message->content << "\n\nAssistant: </s>";
+ } else {
+ ss << message->content << "</s>";
+ }
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
+ // openchat/openchat-3.5-0106,
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << message->content << "<|end_of_turn|>";
+ } else {
+ role[0] = toupper(role[0]);
+ ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
+ }
+ }
+ if (add_ass) {
+ ss << "GPT4 Correct Assistant:";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
+ // eachadea/vicuna-13b-1.1 (and Orca variant)
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ // Orca-Vicuna variant uses a system prefix
+ if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
+ ss << "SYSTEM: " << message->content << "\n";
+ } else {
+ ss << message->content << "\n\n";
+ }
+ } else if (role == "user") {
+ ss << "USER: " << message->content << "\n";
+ } else if (role == "assistant") {
+ ss << "ASSISTANT: " << message->content << "</s>\n";
+ }
+ }
+ if (add_ass) {
+ ss << "ASSISTANT:";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
+ // deepseek-ai/deepseek-coder-33b-instruct
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << message->content;
+ } else if (role == "user") {
+ ss << "### Instruction:\n" << message->content << "\n";
+ } else if (role == "assistant") {
+ ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
+ }
+ }
+ if (add_ass) {
+ ss << "### Response:\n";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
+ // CohereForAI/c4ai-command-r-plus
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+ } else if (role == "user") {
+ ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+ } else if (role == "assistant") {
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+ }
+ }
+ if (add_ass) {
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
+ // Llama 3
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
+ }
+ if (add_ass) {
+ ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
+ // chatglm3-6b
+ ss << "[gMASK]" << "sop";
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|" << role << "|>" << "\n " << message->content;
+ }
+ if (add_ass) {
+ ss << "<|assistant|>";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
+ ss << "[gMASK]" << "<sop>";
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|" << role << "|>" << "\n" << message->content;
+ }
+ if (add_ass) {
+ ss << "<|assistant|>\n";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|" << role << "|>" << "\n" << message->content;
+ }
+ if (add_ass) {
+ ss << "<|assistant|>";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
+ // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "user") {
+ ss << LU8("<用户>");
+ ss << trim(message->content);
+ ss << "<AI>";
+ } else {
+ ss << trim(message->content);
+ }
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
+ // DeepSeek-V2
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << message->content << "\n\n";
+ } else if (role == "user") {
+ ss << "User: " << message->content << "\n\n";
+ } else if (role == "assistant") {
+ ss << "Assistant: " << message->content << LU8("<|end▁of▁sentence|>");
+ }
+ }
+ if (add_ass) {
+ ss << "Assistant:";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_3) {
+ // DeepSeek-V3
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << message->content << "\n\n";
+ } else if (role == "user") {
+ ss << LU8("<|User|>") << message->content;
+ } else if (role == "assistant") {
+ ss << LU8("<|Assistant|>") << message->content << LU8("<|end▁of▁sentence|>");
+ }
+ }
+ if (add_ass) {
+ ss << LU8("<|Assistant|>");
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
+ // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
+ // EXAONE-3.0-7.8B-Instruct
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
+ } else if (role == "user") {
+ ss << "[|user|]" << trim(message->content) << "\n";
+ } else if (role == "assistant") {
+ ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
+ }
+ }
+ if (add_ass) {
+ ss << "[|assistant|]";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_4) {
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
+ } else if (role == "user") {
+ ss << "[|user|]" << trim(message->content) << "\n";
+ } else if (role == "assistant") {
+ ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
+ } else if (role == "tool") {
+ ss << "[|tool|]" << trim(message->content) << "[|endofturn|]\n";
+ }
+ }
+ if (add_ass) {
+ ss << "[|assistant|]";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_MOE) {
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << "<|system|>\n" << trim(message->content) << "<|endofturn|>\n";
+ } else if (role == "user") {
+ ss << "<|user|>\n" << trim(message->content) << "<|endofturn|>\n";
+ } else if (role == "assistant") {
+ ss << "<|assistant|>\n" << trim(message->content) << "<|endofturn|>\n";
+ } else if (role == "tool") {
+ ss << "<|tool|>\n" << trim(message->content) << "<|endofturn|>\n";
+ }
+ }
+ if (add_ass) {
+ ss << "<|assistant|>\n";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
+ // this template requires the model to use "\n\n" as its EOT token
+ for (size_t i = 0; i < chat.size(); i++) {
+ std::string role(chat[i]->role);
+ if (role == "system") {
+ ss << "System: " << trim(chat[i]->content) << "\n\n";
+ } else if (role == "user") {
+ ss << "User: " << trim(chat[i]->content) << "\n\n";
+ if (i == chat.size() - 1) {
+ ss << "Assistant:";
+ }
+ } else if (role == "assistant") {
+ ss << "Assistant: " << trim(chat[i]->content) << "\n\n";
+ }
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
+ // IBM Granite template
+ for (const auto & message : chat) {
+ std::string role(message->role);
+ ss << "<|start_of_role|>" << role << "<|end_of_role|>";
+ if (role == "assistant_tool_call") {
+ ss << "<|tool_call|>";
+ }
+ ss << message->content << "<|end_of_text|>\n";
+ }
+ if (add_ass) {
+ ss << "<|start_of_role|>assistant<|end_of_role|>";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
+ // GigaChat template
+ bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+ // Handle system message if present
+ if (has_system) {
+ ss << "<s>" << chat[0]->content << "<|message_sep|>";
+ } else {
+ ss << "<s>";
+ }
+
+ // Process remaining messages
+ for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
+ std::string role(chat[i]->role);
+ if (role == "user") {
+ ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
+ << "available functions<|role_sep|>[]<|message_sep|>";
+ } else if (role == "assistant") {
+ ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
+ }
+ }
+
+ // Add generation prompt if needed
+ if (add_ass) {
+ ss << "assistant<|role_sep|>";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
+ // Megrez template
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
+ }
+
+ if (add_ass) {
+ ss << "<|role_start|>assistant<|role_end|>";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_YANDEX) {
+ // Yandex template ("\n\n" is defined as EOT token)
+
+ for (size_t i = 0; i < chat.size(); i++) {
+ std::string role(chat[i]->role);
+ if (role == "user") {
+ ss << " Пользователь: " << chat[i]->content << "\n\n";
+ } else if (role == "assistant") {
+ ss << " Ассистент: " << chat[i]->content << "\n\n";
+ }
+ }
+
+ // Add generation prompt if needed
+ if (add_ass) {
+ ss << " Ассистент:[SEP]";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING || tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
+ // Bailing (Ling/Ring) template
+ for (auto message : chat) {
+ std::string role(message->role);
+
+ if (role == "user") {
+ role = "HUMAN";
+ } else {
+ std::transform(role.begin(), role.end(), role.begin(), ::toupper);
+ }
+
+ ss << "<role>" << role << "</role>" << message->content;
+ }
+
+ if (add_ass) {
+ ss << "<role>ASSISTANT</role>";
+
+ if (tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
+ ss << "<think>";
+ }
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING2) {
+ // Bailing2 (Ling 2.0) template
+ bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+ if (!has_system) {
+ ss << "<role>SYSTEM</role>detailed thinking off<|role_end|>";
+ }
+
+ for (auto message : chat) {
+ std::string role(message->role);
+
+ if (role == "user") {
+ role = "HUMAN";
+ } else {
+ std::transform(role.begin(), role.end(), role.begin(), ::toupper);
+ }
+
+ ss << "<role>" << role << "</role>" << message->content << "<|role_end|>";
+ }
+
+ if (add_ass) {
+ ss << "<role>ASSISTANT</role>";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA4) {
+ // Llama 4
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|header_start|>" << role << "<|header_end|>\n\n" << trim(message->content) << "<|eot|>";
+ }
+ if (add_ass) {
+ ss << "<|header_start|>assistant<|header_end|>\n\n";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
+ // SmolVLM
+ ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << message->content << "\n\n";
+ } else if (role == "user") {
+ ss << "User: " << message->content << "<end_of_utterance>\n";
+ } else {
+ ss << "Assistant: " << message->content << "<end_of_utterance>\n";
+ }
+ }
+ if (add_ass) {
+ ss << "Assistant:";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DOTS1) {
+ // dots.llm1.inst (DOTS1)
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << "<|system|>" << message->content << "<|endofsystem|>";
+ } else if (role == "user") {
+ ss << "<|userprompt|>" << message->content << "<|endofuserprompt|>";
+ } else {
+ ss << "<|response|>" << message->content << "<|endofresponse|>";
+ }
+ }
+ if (add_ass) {
+ ss << "<|response|>";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_MOE) {
+ // tencent/Hunyuan-A13B-Instruct
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << "<|startoftext|>" << message->content << "<|extra_4|>";
+ } else if (role == "assistant") {
+ ss << message->content << "<|eos|>";
+ } else {
+ ss << "<|startoftext|>" << message->content << "<|extra_0|>";
+ }
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_OPENAI_MOE) {
+ // OpenAI MoE (based on Harmony chat template)
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|start|>" << role << "<|message|>" << message->content;
+ ss << (role == "assistant" ? "<|return|>" : "<|end|>");
+ }
+ if (add_ass) {
+ ss << "<|start|>assistant";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
+ // tencent/Hunyuan-4B-Instruct
+ for (size_t i = 0; i < chat.size(); i++) {
+ std::string role(chat[i]->role);
+ if (i == 0) {
+ if (role == "system") {
+ ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
+ }
+ }
+
+ if (role == "assistant") {
+ ss << "<|hy_Assistant|>" << chat[i]->content << "<|hy_place▁holder▁no▁2|>";
+ } else if (role == "user") {
+ ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
+ }
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
+ // moonshotai/Kimi-K2-Instruct
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << "<|im_system|>system<|im_middle|>";
+ } else if (role == "user") {
+ ss << "<|im_user|>user<|im_middle|>";
+ } else if (role == "assistant") {
+ ss << "<|im_assistant|>assistant<|im_middle|>";
+ } else if (role == "tool") {
+ ss << "<|im_system|>tool<|im_middle|>";
+ }
+
+ ss << message->content << "<|im_end|>";
+ }
+ if (add_ass) {
+ ss << "<|im_assistant|>assistant<|im_middle|>";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_SEED_OSS) {
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<seed:bos>" << role << "\n" << (role == "assistant" ? trim(message->content) : message->content) << "<seed:eos>";
+ }
+ if (add_ass) {
+ ss << "<seed:bos>assistant\n";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GROK_2) {
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << "System: " << trim(message->content) << "<|separator|>\n\n";
+ } else if (role == "user") {
+ ss << "Human: " << trim(message->content) << "<|separator|>\n\n";
+ } else if (role == "assistant") {
+ ss << "Assistant: " << message->content << "<|separator|>\n\n";
+ }
+ }
+ if (add_ass) {
+ ss << "Assistant:";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_PANGU_EMBED) {
+ // [unused9]系统:xxx[unused10]
+ // [unused9]用户:xxx[unused10]
+ // [unused9]助手:xxx[unused10]
+ // ...
+ for (size_t i = 0; i < chat.size(); ++i) {
+ const auto & msg = chat[i];
+ const std::string & role = msg->role;
+ const std::string & content = msg->content;
+
+ if (i == 0 && role != "system") {
+ ss << "[unused9]系统:[unused10]";
+ }
+
+ if (role == "system") {
+ ss << "[unused9]系统:" << content << "[unused10]";
+ } else if (role == "user") {
+ ss << "[unused9]用户:" << content << "[unused10]";
+ } else if (role == "assistant") {
+ ss << "[unused9]助手:" << content << "[unused10]";
+ } else if (role == "tool") {
+ ss << "[unused9]工具:" << content << "[unused10]";
+ } else if (role == "function") {
+ ss << "[unused9]方法:" << content << "[unused10]";
+ }
+ }
+ if (add_ass) {
+ ss << "[unused9]助手:";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
+ }
+ if (add_ass) {
+ ss << "<|begin|>assistant";
+ }
+ } else {
+ // template not supported
+ return -1;
+ }
+ dest = ss.str();
+ return dest.size();
+}
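+
+// Hedged usage sketch of the function above; the messages are made up for
+// illustration and real callers normally go through the public
+// llama_chat_apply_template() wrapper:
+//
+//     llama_chat_message msgs[] = {
+//         { "system", "You are a helpful assistant." },
+//         { "user",   "Hello!"                       },
+//     };
+//     std::vector<const llama_chat_message *> chat = { &msgs[0], &msgs[1] };
+//     std::string prompt;
+//     const int32_t res = llm_chat_apply_template(LLM_CHAT_TEMPLATE_CHATML, chat, prompt, /*add_ass=*/true);
+//     // res < 0 -> unsupported template; otherwise prompt ends with "<|im_start|>assistant\n"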
+
+// public interface
+
+int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
+ auto it = LLM_CHAT_TEMPLATES.begin();
+ for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
+ output[i] = it->first.c_str();
+ std::advance(it, 1);
+ }
+ return (int32_t) LLM_CHAT_TEMPLATES.size();
+}
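+
+// Usage sketch: the return value is always the total number of built-in
+// templates, so a caller can size the output array with a first call and fill
+// it with a second:
+//
+//     const int32_t n = llama_chat_builtin_templates(nullptr, 0);
+//     std::vector<const char *> names(n);
+//     llama_chat_builtin_templates(names.data(), names.size());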
diff --git a/llama.cpp/src/llama-chat.h b/llama.cpp/src/llama-chat.h
new file mode 100644
index 0000000..9ed1db1
--- /dev/null
+++ b/llama.cpp/src/llama-chat.h
@@ -0,0 +1,71 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <cstdint>
+
+enum llm_chat_template {
+ LLM_CHAT_TEMPLATE_CHATML,
+ LLM_CHAT_TEMPLATE_LLAMA_2,
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
+ LLM_CHAT_TEMPLATE_MISTRAL_V1,
+ LLM_CHAT_TEMPLATE_MISTRAL_V3,
+ LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
+ LLM_CHAT_TEMPLATE_MISTRAL_V7,
+ LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
+ LLM_CHAT_TEMPLATE_PHI_3,
+ LLM_CHAT_TEMPLATE_PHI_4,
+ LLM_CHAT_TEMPLATE_FALCON_3,
+ LLM_CHAT_TEMPLATE_ZEPHYR,
+ LLM_CHAT_TEMPLATE_MONARCH,
+ LLM_CHAT_TEMPLATE_GEMMA,
+ LLM_CHAT_TEMPLATE_ORION,
+ LLM_CHAT_TEMPLATE_OPENCHAT,
+ LLM_CHAT_TEMPLATE_VICUNA,
+ LLM_CHAT_TEMPLATE_VICUNA_ORCA,
+ LLM_CHAT_TEMPLATE_DEEPSEEK,
+ LLM_CHAT_TEMPLATE_DEEPSEEK_2,
+ LLM_CHAT_TEMPLATE_DEEPSEEK_3,
+ LLM_CHAT_TEMPLATE_COMMAND_R,
+ LLM_CHAT_TEMPLATE_LLAMA_3,
+ LLM_CHAT_TEMPLATE_CHATGLM_3,
+ LLM_CHAT_TEMPLATE_CHATGLM_4,
+ LLM_CHAT_TEMPLATE_GLMEDGE,
+ LLM_CHAT_TEMPLATE_MINICPM,
+ LLM_CHAT_TEMPLATE_EXAONE_3,
+ LLM_CHAT_TEMPLATE_EXAONE_4,
+ LLM_CHAT_TEMPLATE_EXAONE_MOE,
+ LLM_CHAT_TEMPLATE_RWKV_WORLD,
+ LLM_CHAT_TEMPLATE_GRANITE,
+ LLM_CHAT_TEMPLATE_GIGACHAT,
+ LLM_CHAT_TEMPLATE_MEGREZ,
+ LLM_CHAT_TEMPLATE_YANDEX,
+ LLM_CHAT_TEMPLATE_BAILING,
+ LLM_CHAT_TEMPLATE_BAILING_THINK,
+ LLM_CHAT_TEMPLATE_BAILING2,
+ LLM_CHAT_TEMPLATE_LLAMA4,
+ LLM_CHAT_TEMPLATE_SMOLVLM,
+ LLM_CHAT_TEMPLATE_DOTS1,
+ LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
+ LLM_CHAT_TEMPLATE_OPENAI_MOE,
+ LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
+ LLM_CHAT_TEMPLATE_KIMI_K2,
+ LLM_CHAT_TEMPLATE_SEED_OSS,
+ LLM_CHAT_TEMPLATE_GROK_2,
+ LLM_CHAT_TEMPLATE_PANGU_EMBED,
+ LLM_CHAT_TEMPLATE_SOLAR_OPEN,
+ LLM_CHAT_TEMPLATE_UNKNOWN,
+};
+
+struct llama_chat_message;
+
+llm_chat_template llm_chat_template_from_str(const std::string & name);
+
+llm_chat_template llm_chat_detect_template(const std::string & tmpl);
+
+int32_t llm_chat_apply_template(
+ llm_chat_template tmpl,
+ const std::vector<const llama_chat_message *> & chat,
+ std::string & dest, bool add_ass);
diff --git a/llama.cpp/src/llama-context.cpp b/llama.cpp/src/llama-context.cpp
new file mode 100644
index 0000000..6b43ca1
--- /dev/null
+++ b/llama.cpp/src/llama-context.cpp
@@ -0,0 +1,3691 @@
+#include "llama-context.h"
+
+#include "llama-arch.h"
+#include "llama-impl.h"
+#include "llama-batch.h"
+#include "llama-io.h"
+#include "llama-memory.h"
+#include "llama-mmap.h"
+#include "llama-model.h"
+
+#include <cinttypes>
+#include <cmath>
+#include <cstring>
+#include <limits>
+#include <stdexcept>
+
+//
+// llama_context
+//
+
+llama_context::llama_context(
+ const llama_model & model,
+ llama_context_params params) :
+ model(model),
+ balloc(std::make_unique<llama_batch_allocr>(model.hparams.n_pos_per_embd())) {
+ // TODO: warn when creating a llama_context with an awkward ctx size that is not a power of 2;
+ // the check may need to be backend-dependent
+ LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);
+
+ t_start_us = model.t_start_us;
+ t_load_us = model.t_load_us;
+
+ const auto & hparams = model.hparams;
+
+ cparams.n_seq_max = std::max(1u, params.n_seq_max);
+ if (cparams.n_seq_max > LLAMA_MAX_SEQ) {
+ throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_SEQ));
+ }
+
+ cparams.n_threads = params.n_threads;
+ cparams.n_threads_batch = params.n_threads_batch;
+ cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor;
+ cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
+ cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast;
+ cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow;
+ cparams.embeddings = params.embeddings;
+ cparams.offload_kqv = params.offload_kqv;
+ cparams.no_perf = params.no_perf;
+ cparams.pooling_type = params.pooling_type;
+ cparams.warmup = false;
+
+ cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
+ cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
+ cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
+
+ cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
+ hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
+ hparams.n_ctx_train;
+
+ cparams.cb_eval = params.cb_eval;
+ cparams.cb_eval_user_data = params.cb_eval_user_data;
+
+ // Initialize backend samplers here so they are part of the sampling graph
+ // before the reserve passes run later in this function. This avoids a later
+ // re-reserve when graph nodes change.
+ if (params.samplers != nullptr && params.n_samplers > 0) {
+ for (size_t i = 0; i < params.n_samplers; ++i) {
+ const auto & config = params.samplers[i];
+
+ if (llama_sampler_chain_get(config.sampler, -1) == nullptr) {
+ throw std::runtime_error("the backend samplers must be of type llama_sampler_chain");
+ }
+
+ if (set_sampler(config.seq_id, config.sampler)) {
+ const int n_samplers = llama_sampler_chain_n(config.sampler);
+
+ LLAMA_LOG_INFO("%s: setting backend sampler for seq_id %d (n = %d)\n", __func__, config.seq_id, n_samplers);
+ }
+ }
+ }
+
+ auto rope_scaling_type = params.rope_scaling_type;
+ if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
+ rope_scaling_type = hparams.rope_scaling_type_train;
+ }
+
+ if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
+ cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
+ }
+
+ if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
+ cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
+ }
+
+ if (cparams.yarn_ext_factor != 0) {
+ static auto get_mscale = [](float scale, float mscale) {
+ return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+ };
+
+ const float factor = 1.0f / cparams.rope_freq_scale;
+
+ // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
+ if (hparams.rope_yarn_log_mul != 0.0f) {
+ // note: here we assume `mscale == 1.0f`
+ // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
+ float mscale = 1.0f;
+ const float mscale_all_dims = hparams.rope_yarn_log_mul;
+
+ // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+ // special-case DEEPSEEK v2:
+ // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
+ if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
+ mscale = mscale_all_dims;
+ }
+
+ cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+
+ LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
+ __func__, cparams.yarn_attn_factor, mscale, mscale_all_dims);
+ } else {
+ cparams.yarn_attn_factor = get_mscale(factor, 1.0f);
+ }
+
+ // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
+ // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
+ //
+ // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
+ // https://github.com/ggml-org/llama.cpp/pull/17945
+ cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor));
+ }
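+
+ // worked example with illustrative numbers: for rope_freq_scale = 0.25 and
+ // rope_yarn_log_mul == 0.0f, factor = 4.0 and
+ //
+ //     get_mscale(4.0, 1.0) = 0.1*1.0*logf(4.0) + 1.0 ~= 1.1386
+ //
+ // the correction above then multiplies by 1/(1 + 0.1*logf(4.0)) ~= 0.8783,
+ // so the two factors cancel and yarn_attn_factor is ~= 1.0 before the
+ // rope_attn_factor scaling below is applied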
+
+ cparams.yarn_attn_factor *= hparams.rope_attn_factor;
+
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+ if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+ cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
+ } else {
+ cparams.pooling_type = hparams.pooling_type;
+ }
+ }
+
+ if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
+ cparams.causal_attn = hparams.causal_attn;
+ } else {
+ cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
+ }
+
+ cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
+ cparams.auto_fa = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO;
+
+ // with causal attention, the batch size is limited by the context size
+ cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
+
+ cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+
+ cparams.op_offload = params.op_offload;
+ cparams.kv_unified = params.kv_unified;
+
+ // initialized later
+ cparams.pipeline_parallel = false;
+
+ {
+ const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
+ graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
+
+ if (graph_reuse_disable) {
+ LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__);
+ }
+ }
+
+ // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
+
+ if (cparams.kv_unified) {
+ cparams.n_ctx_seq = cparams.n_ctx;
+ } else {
+ cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
+ cparams.n_ctx_seq = GGML_PAD(cparams.n_ctx_seq, 256);
+
+ if (cparams.n_ctx_seq == 0) {
+ throw std::runtime_error("n_ctx_seq == 0");
+ }
+
+ if (cparams.n_ctx != cparams.n_ctx_seq * cparams.n_seq_max) {
+ cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max;
+ LLAMA_LOG_WARN("%s: n_ctx is not divisible by n_seq_max - rounding down to %u\n", __func__, cparams.n_ctx);
+ }
+ }
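+
+ // illustrative arithmetic (made-up numbers): with params.n_ctx = 4000 and
+ // n_seq_max = 2, the GGML_PAD above first bumps n_ctx to 4096; then
+ // n_ctx_seq = 4096/2 = 2048, already a multiple of 256, so
+ // n_ctx = n_ctx_seq * n_seq_max = 4096 and no warning is emitted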
+
+ LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
+ LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
+ LLAMA_LOG_INFO("%s: n_ctx_seq = %u\n", __func__, cparams.n_ctx_seq);
+ LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
+ LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
+ LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
+ LLAMA_LOG_INFO("%s: flash_attn = %s\n", __func__, llama_flash_attn_type_name(params.flash_attn_type));
+ LLAMA_LOG_INFO("%s: kv_unified = %s\n", __func__, cparams.kv_unified ? "true" : "false");
+ LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
+ LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
+
+ if (cparams.n_ctx_seq < hparams.n_ctx_train) {
+ LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
+ __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
+ }
+
+ if (cparams.n_ctx_seq > hparams.n_ctx_train) {
+ LLAMA_LOG_WARN("%s: n_ctx_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+ __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
+ }
+
+ if (!hparams.vocab_only) {
+ // GPU backends
+ for (auto * dev : model.devices) {
+ ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+ if (backend == nullptr) {
+ throw std::runtime_error(format("failed to initialize %s backend", ggml_backend_dev_name(dev)));
+ }
+ backends.emplace_back(backend);
+ }
+
+ // add ACCEL backends (such as BLAS)
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+ if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+ ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+ if (backend == nullptr) {
+ throw std::runtime_error(format("failed to initialize %s backend", ggml_backend_dev_name(dev)));
+ }
+ backends.emplace_back(backend);
+ }
+ }
+
+ // add CPU backend
+ backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+ if (backend_cpu == nullptr) {
+ throw std::runtime_error("failed to initialize CPU backend");
+ }
+ backends.emplace_back(backend_cpu);
+
+ // create a list of the set_n_threads functions in the backends
+ for (auto & backend : backends) {
+ ggml_backend_dev_t dev = ggml_backend_get_device(backend.get());
+ ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+ if (reg) {
+ auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+ if (ggml_backend_set_n_threads_fn) {
+ set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn);
+ }
+ }
+ }
+
+ llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data);
+
+ // graph outputs buffer
+ {
+ if (output_reserve(params.n_seq_max) < params.n_seq_max) {
+ throw std::runtime_error("failed to reserve initial output buffer");
+ }
+
+ LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
+ ggml_backend_buffer_name (buf_output.get()),
+ ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0);
+ }
+ }
+
+ // init the memory module
+ if (!hparams.vocab_only) {
+ llama_memory_params params_mem = {
+ /*.type_k =*/ params.type_k,
+ /*.type_v =*/ params.type_v,
+ /*.swa_full =*/ params.swa_full,
+ };
+
+ memory.reset(model.create_memory(params_mem, cparams));
+ }
+
+ // init backends
+ if (!hparams.vocab_only) {
+ LLAMA_LOG_DEBUG("%s: enumerating backends\n", __func__);
+
+ backend_buft.clear();
+ backend_ptrs.clear();
+ backend_buf_exp_size.clear();
+
+ for (auto & backend : backends) {
+ auto * buft = ggml_backend_get_default_buffer_type(backend.get());
+ auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+
+ if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) {
+ // for the CPU backend, use the host buffer type of the first device for faster transfer of intermediate states
+ auto * dev = model.devices[0];
+ auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
+ if (host_buft) {
+ buft = host_buft;
+ }
+ }
+
+ backend_buft.push_back(buft);
+ backend_ptrs.push_back(backend.get());
+ backend_buf_exp_size.push_back(0);
+ }
+
+ LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
+
+ // TODO: move these checks to ggml_backend_sched
+ // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
+ bool pipeline_parallel =
+ model.n_devices() > 1 &&
+ model.n_gpu_layers() > model.hparams.n_layer &&
+ model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
+ cparams.offload_kqv &&
+ !model.has_tensor_overrides();
+
+ // pipeline parallelism requires support for async compute and events in all devices
+ if (pipeline_parallel) {
+ for (auto & backend : backends) {
+ auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+ if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
+ // ignore CPU backend
+ // TODO: should we ignore ACCEL types too?
+ continue;
+ }
+ auto * dev = ggml_backend_get_device(backend.get());
+ ggml_backend_dev_props props;
+ ggml_backend_dev_get_props(dev, &props);
+ if (!props.caps.async || !props.caps.events) {
+ // device does not support async compute or events
+ pipeline_parallel = false;
+ break;
+ }
+ }
+ }
+
+ cparams.pipeline_parallel = pipeline_parallel;
+
+ if (cparams.pipeline_parallel) {
+ LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
+ }
+
+ sched_reserve();
+
+ if (!cparams.flash_attn) {
+ if (ggml_is_quantized(params.type_v)) {
+ throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
+ }
+ }
+ }
+
+ // Initialize the full vocabulary token ids for backend samplers.
+ {
+ const int n_vocab = model.vocab.n_tokens();
+
+ sampling.token_ids_full_vocab.resize(n_vocab);
+ for (int i = 0; i < n_vocab; ++i) {
+ sampling.token_ids_full_vocab[i] = i;
+ }
+ }
+}
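+
+// Hedged usage sketch (assuming this vendored copy keeps the upstream llama.cpp
+// public API, where llama_init_from_model() constructs a llama_context):
+//
+//     llama_context_params cp = llama_context_default_params();
+//     cp.n_ctx     = 4096;
+//     cp.n_seq_max = 1;
+//     llama_context * ctx = llama_init_from_model(model, cp); // nullptr on failure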
+
+llama_context::~llama_context() {
+ if (!model.hparams.no_alloc) {
+ for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+ ggml_backend_t backend = backend_ptrs[i];
+ ggml_backend_buffer_type_t buft = backend_buft[i];
+
+ const size_t size_exp = backend_buf_exp_size[i];
+ const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ if (size_exp == size_act) {
+ LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+ __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+ } else {
+ LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+ __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+ }
+ }
+ }
+ ggml_opt_free(opt_ctx);
+}
+
+void llama_context::sched_reserve() {
+ if (!sched_need_reserve) {
+ return;
+ }
+
+ sched_need_reserve = false;
+
+ LLAMA_LOG_INFO("%s: reserving ...\n", __func__);
+
+ synchronize();
+
+ const int64_t t_start_us = ggml_time_us();
+
+ const uint32_t n_seqs = cparams.n_seq_max;
+ const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+ const size_t max_nodes = this->graph_max_nodes(n_tokens);
+
+ LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
+
+ gf_res_prev.reset(new llm_graph_result(max_nodes));
+ gf_res_reserve.reset(new llm_graph_result(max_nodes));
+
+ sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, cparams.pipeline_parallel, cparams.op_offload));
+
+ llama_memory_context_ptr mctx;
+ if (memory) {
+ LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
+ mctx = memory->init_full();
+ if (!mctx) {
+ throw std::runtime_error("failed to initialize memory module");
+ }
+ }
+
+ // avoid reserving graphs with zero outputs - assume one output per sequence
+ const int n_outputs = n_seqs;
+
+ LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
+
+ // resolve automatic Flash Attention use
+ if (cparams.auto_fa) {
+ auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
+ if (!gf) {
+ throw std::runtime_error("failed to split graph for Flash Attention check");
+ }
+
+ const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
+ bool fa_device_mismatch = false;
+ for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+ ggml_tensor * n = ggml_graph_node(gf, i);
+ if (n->op != GGML_OP_FLASH_ATTN_EXT) {
+ continue;
+ }
+ ggml_backend_dev_t device_fa = ggml_backend_get_device(
+ ggml_backend_sched_get_tensor_backend(sched.get(), n));
+
+ // TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
+ GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
+ const int il = std::stoi(n->name + prefix_len);
+ ggml_backend_dev_t device_kv = model.dev_layer(il);
+ if (device_fa != device_kv) {
+ LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
+ "is assigned to device %s (usually due to missing support)\n",
+ __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
+ // FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
+ fa_device_mismatch = true;
+ break;
+ }
+ }
+ if (fa_device_mismatch) {
+ cparams.flash_attn = false;
+ LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
+ } else {
+ cparams.flash_attn = true;
+ LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
+ }
+
+ cparams.auto_fa = false;
+ }
+
+ // reserve worst-case graph
+ int n_splits_pp = -1;
+ int n_nodes_pp = -1;
+
+ int n_splits_tg = -1;
+ int n_nodes_tg = -1;
+
+ // reserve pp (prompt processing) graph first so that buffers are only allocated once
+ {
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
+ model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
+ if (!gf) {
+ if (cparams.pipeline_parallel) {
+ LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
+ cparams.pipeline_parallel = false;
+ sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
+ gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+ }
+ if (!gf) {
+ throw std::runtime_error("failed to allocate compute pp buffers");
+ }
+ }
+
+ n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
+ n_nodes_pp = ggml_graph_n_nodes(gf);
+ }
+
+ // reserve with tg (token generation) graph to get the number of splits and nodes
+ {
+ auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
+ if (!gf) {
+ throw std::runtime_error("failed to allocate compute tg buffers");
+ }
+
+ n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
+ n_nodes_tg = ggml_graph_n_nodes(gf);
+ }
+
+ // reserve again with pp graph to avoid ggml-alloc reallocations during inference
+ {
+ // TODO: not sure if the following graph would be the worst case for multi-stream KV caches:
+ //
+ // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
+ //
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
+ if (!gf) {
+ throw std::runtime_error("failed to allocate compute pp buffers");
+ }
+ }
+
+ for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+ ggml_backend_t backend = backend_ptrs[i];
+ ggml_backend_buffer_type_t buft = backend_buft[i];
+ if (!model.hparams.no_alloc) {
+ backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ }
+ if (backend_buf_exp_size[i] > 1) {
+ LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+ ggml_backend_buft_name(buft),
+ backend_buf_exp_size[i] / 1024.0 / 1024.0);
+ }
+ }
+
+ if (n_nodes_pp == n_nodes_tg) {
+ LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp);
+ } else {
+ LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
+ }
+
+ if (n_splits_pp == n_splits_tg) {
+ LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
+ } else {
+ LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
+ }
+
+ const int64_t t_end_us = ggml_time_us();
+
+ LLAMA_LOG_INFO("%s: reserve took %.2f ms, sched copies = %d\n",
+ __func__, (t_end_us - t_start_us)/1000.0, ggml_backend_sched_get_n_copies(sched.get()));
+}
+
+void llama_context::synchronize() {
+ if (!sched) {
+ return;
+ }
+
+ ggml_backend_sched_synchronize(sched.get());
+
+ // FIXME: if multiple single tokens are evaluated without a synchronization,
+ // the stats will be added to the prompt evaluation stats
+ // this should only happen when using batch size 1 to evaluate a batch
+
+ // add the evaluation to the stats
+ if (n_queued_tokens == 1) {
+ if (!cparams.no_perf) {
+ t_eval_us += ggml_time_us() - t_compute_start_us;
+ }
+ n_eval++;
+ } else if (n_queued_tokens > 1) {
+ if (!cparams.no_perf) {
+ t_p_eval_us += ggml_time_us() - t_compute_start_us;
+ }
+ n_p_eval += n_queued_tokens;
+ }
+
+ // get a more accurate load time, upon first eval
+ if (n_queued_tokens > 0 && !has_evaluated_once) {
+ t_load_us = ggml_time_us() - t_start_us;
+ has_evaluated_once = true;
+ }
+
+ n_queued_tokens = 0;
+ t_compute_start_us = 0;
+}
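+
+// note (illustration): decoding a single token leaves n_queued_tokens == 1, so
+// the elapsed time above is booked to t_eval_us; a 512-token prompt leaves
+// n_queued_tokens == 512 and is booked to t_p_eval_us instead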
+
+const llama_model & llama_context::get_model() const {
+ return model;
+}
+
+const llama_cparams & llama_context::get_cparams() const {
+ return cparams;
+}
+
+ggml_backend_sched_t llama_context::get_sched() const {
+ return sched.get();
+}
+
+uint32_t llama_context::n_ctx() const {
+ return cparams.n_ctx;
+}
+
+uint32_t llama_context::n_ctx_seq() const {
+ return cparams.n_ctx_seq;
+}
+
+uint32_t llama_context::n_batch() const {
+ return cparams.n_batch;
+}
+
+uint32_t llama_context::n_ubatch() const {
+ return cparams.n_ubatch;
+}
+
+uint32_t llama_context::n_seq_max() const {
+ return cparams.n_seq_max;
+}
+
+uint32_t llama_context::n_threads() const {
+ return cparams.n_threads;
+}
+
+uint32_t llama_context::n_threads_batch() const {
+ return cparams.n_threads_batch;
+}
+
+llama_memory_t llama_context::get_memory() const {
+ return memory.get();
+}
+
+bool llama_context::memory_update(bool optimize) {
+ if (!memory) {
+ return false;
+ }
+
+ {
+ const auto mctx = memory->init_update(this, optimize);
+ switch (mctx->get_status()) {
+ case LLAMA_MEMORY_STATUS_SUCCESS:
+ {
+ // noop
+ } break;
+ case LLAMA_MEMORY_STATUS_NO_UPDATE:
+ {
+ // no updates need to be performed
+ return false;
+ }
+ case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+ case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+ {
+ LLAMA_LOG_ERROR("%s: failed to prepare memory update\n", __func__);
+ return false;
+ }
+ }
+
+ // reset the previous graph result to make sure that it won't be reused
+ // TODO: change the mctx->apply() to return information if a graph reserve is needed
+ // reset the graph result only if the memory module did reset the scheduler
+ gf_res_prev->reset();
+
+ if (!mctx->apply()) {
+ LLAMA_LOG_ERROR("%s: failed to apply memory update\n", __func__);
+ }
+ }
+
+ // if the memory module did any computation, we have to reserve a new worst-case graph
+ {
+ const auto mctx = memory->init_full();
+ if (!mctx) {
+ throw std::runtime_error("failed to initialize memory context");
+ }
+
+ const uint32_t n_seqs = cparams.n_seq_max;
+ const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+ if (!gf) {
+ LLAMA_LOG_ERROR("%s: failed to reserve graph after the memory update\n", __func__);
+ }
+ }
+
+ return true;
+}
+
+enum llama_pooling_type llama_context::pooling_type() const {
+ return cparams.pooling_type;
+}
+
+float * llama_context::get_logits() {
+ output_reorder();
+
+ return logits.data;
+}
+
+int64_t llama_context::output_resolve_row(int32_t i) const {
+ int64_t j = -1;
+
+ // support negative indices (last output row)
+ if (i < 0) {
+ j = n_outputs + i;
+ if (j < 0) {
+ throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
+ }
+ } else if ((size_t) i >= output_ids.size()) {
+ throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
+ } else {
+ // use output_ids to translate the batch token index into a row number
+ // that holds this token's data.
+ j = output_ids[i];
+ }
+
+ if (j < 0) {
+ // the batch token was not configured to output anything
+ throw std::runtime_error(format("batch.logits[%d] != true", i));
+ }
+
+ if (j >= n_outputs) {
+ throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
+ }
+
+ return j;
+}
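+
+// illustrative mapping (made-up batch): for a batch of 3 tokens where only the
+// last one requested output, output_ids = { -1, -1, 0 } and n_outputs == 1, so
+// output_resolve_row(2) == 0 and output_resolve_row(-1) == 0, while
+// output_resolve_row(0) throws because batch.logits[0] was not set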
+
+float * llama_context::get_logits_ith(int32_t i) {
+ int64_t j = -1;
+
+ output_reorder();
+
+ try {
+ if (logits.data == nullptr) {
+ throw std::runtime_error("no logits");
+ }
+
+ // TODO: use output_resolve_row()
+ if (i < 0) {
+ j = n_outputs + i;
+ if (j < 0) {
+ throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
+ }
+ } else if ((size_t) i >= output_ids.size()) {
+ throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
+ } else {
+ j = output_ids[i];
+ }
+
+ if (j < 0) {
+ throw std::runtime_error(format("batch.logits[%d] != true", i));
+ }
+ if (j >= n_outputs) {
+ // This should not happen
+ throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
+ }
+
+ return logits.data + j*model.vocab.n_tokens();
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
+#ifndef NDEBUG
+ GGML_ABORT("fatal error");
+#else
+ return nullptr;
+#endif
+ }
+}
+
+float * llama_context::get_embeddings() {
+ output_reorder();
+
+ return embd.data;
+}
+
+llama_token * llama_context::get_sampled_tokens() const {
+ return sampling.sampled.data;
+}
+
+float * llama_context::get_embeddings_ith(int32_t i) {
+ int64_t j = -1;
+
+ output_reorder();
+
+ try {
+ if (embd.data == nullptr) {
+ throw std::runtime_error("no embeddings");
+ }
+
+ // TODO: use output_resolve_row()
+ if (i < 0) {
+ j = n_outputs + i;
+ if (j < 0) {
+ throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
+ }
+ } else if ((size_t) i >= output_ids.size()) {
+ throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
+ } else {
+ j = output_ids[i];
+ }
+
+ if (j < 0) {
+ throw std::runtime_error(format("batch.logits[%d] != true", i));
+ }
+ if (j >= n_outputs) {
+ // This should not happen
+ throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
+ }
+
+ const uint32_t n_embd_out = model.hparams.n_embd_out();
+ return embd.data + j*n_embd_out;
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
+#ifndef NDEBUG
+ GGML_ABORT("fatal error");
+#else
+ return nullptr;
+#endif
+ }
+}
+
+float * llama_context::get_embeddings_seq(llama_seq_id seq_id) {
+ auto it = embd_seq.find(seq_id);
+ if (it == embd_seq.end()) {
+ return nullptr;
+ }
+
+ return it->second.data();
+}
+
+llama_token llama_context::get_sampled_token_ith(int32_t idx) {
+ output_reorder();
+
+ if (!sampling.sampled.has_data()) {
+ return LLAMA_TOKEN_NULL;
+ }
+
+ try {
+ const int64_t row = output_resolve_row(idx);
+ GGML_ASSERT(row < (int64_t) sampling.sampled.size);
+ return sampling.sampled.data[row];
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: invalid backend sampled token id %d, reason: %s\n", __func__, idx, err.what());
+ return LLAMA_TOKEN_NULL;
+ }
+}
+
+float * llama_context::get_sampled_probs_ith(int32_t idx) {
+ output_reorder();
+
+ if (!sampling.probs.has_data()) {
+ return nullptr;
+ }
+
+ try {
+ const int64_t row = output_resolve_row(idx);
+ if ((size_t) row >= sampling.probs_count.size() || sampling.probs_count[row] == 0) {
+ return nullptr;
+ }
+ return sampling.probs.data + row*model.vocab.n_tokens();
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: invalid backend sampled probs id %d, reason: %s\n", __func__, idx, err.what());
+ return nullptr;
+ }
+}
+
+float * llama_context::get_sampled_logits_ith(int32_t idx) {
+ output_reorder();
+
+ if (!sampling.logits.has_data()) {
+ return nullptr;
+ }
+
+ try {
+ const int64_t row = output_resolve_row(idx);
+ if ((size_t) row >= sampling.logits_count.size() || sampling.logits_count[row] == 0) {
+ return nullptr;
+ }
+ return sampling.logits.data + row*model.vocab.n_tokens();
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: invalid backend sampled logits id %d, reason: %s\n", __func__, idx, err.what());
+ return nullptr;
+ }
+}
+
+const llama_token * llama_context::get_sampled_candidates_ith(int32_t idx) {
+ output_reorder();
+
+ try {
+ const int64_t row = output_resolve_row(idx);
+ if (sampling.candidates.has_data() &&
+ (size_t) row < sampling.candidates_count.size() &&
+ sampling.candidates_count[row] > 0) {
+ return sampling.candidates.data + row*model.vocab.n_tokens();
+ }
+ } catch (const std::exception & err) {
+ // fallback to full vocab list
+ }
+
+ return sampling.token_ids_full_vocab.data();
+}
+
+size_t llama_context::get_sampled_candidates_count(int32_t idx) {
+ output_reorder();
+
+ if (!sampling.candidates.has_data()) {
+ return 0;
+ }
+
+ try {
+ const int64_t row = output_resolve_row(idx);
+ if ((size_t) row >= sampling.candidates_count.size()) {
+ return 0;
+ }
+ return sampling.candidates_count[row];
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: invalid backend sampled candidates count id %d, reason: %s\n", __func__, idx, err.what());
+ return 0;
+ }
+}
+
+size_t llama_context::get_sampled_logits_count(int32_t idx) {
+ output_reorder();
+
+ if (!sampling.logits.has_data()) {
+ return model.vocab.n_tokens();
+ }
+
+ try {
+ const int64_t row = output_resolve_row(idx);
+ if ((size_t) row >= sampling.logits_count.size()) {
+ return 0;
+ }
+ return sampling.logits_count[row];
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: invalid backend sampled logits count id %d, reason: %s\n", __func__, idx, err.what());
+ return 0;
+ }
+}
+
+size_t llama_context::get_sampled_probs_count(int32_t idx) {
+ output_reorder();
+
+ if (!sampling.probs.has_data()) {
+ return 0;
+ }
+
+ try {
+ const int64_t row = output_resolve_row(idx);
+ if ((size_t) row >= sampling.probs_count.size()) {
+ return 0;
+ }
+ return sampling.probs_count[row];
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: invalid backend sampled probs count id %d, reason: %s\n", __func__, idx, err.what());
+ return 0;
+ }
+}
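+
+// contract of the accessors above: when no backend sampler produced data for
+// the resolved row, the *_ith getters return nullptr / LLAMA_TOKEN_NULL,
+// get_sampled_candidates_ith falls back to the full-vocab id list, and
+// get_sampled_logits_count falls back to n_tokens() so callers can still
+// iterate the raw logits buffer.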
+
+void llama_context::attach_threadpool(
+ ggml_threadpool_t threadpool,
+ ggml_threadpool_t threadpool_batch) {
+ LLAMA_LOG_DEBUG("%s: call\n", __func__);
+
+ this->threadpool = threadpool;
+ this->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
+}
+
+void llama_context::detach_threadpool() {
+ LLAMA_LOG_DEBUG("%s: call\n", __func__);
+
+ this->threadpool = nullptr;
+ this->threadpool_batch = nullptr;
+}
+
+void llama_context::set_n_threads(int32_t n_threads, int32_t n_threads_batch) {
+ LLAMA_LOG_DEBUG("%s: n_threads = %d, n_threads_batch = %d\n", __func__, n_threads, n_threads_batch);
+
+ cparams.n_threads = n_threads;
+ cparams.n_threads_batch = n_threads_batch;
+}
+
+void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) {
+ LLAMA_LOG_DEBUG("%s: call\n", __func__);
+
+ this->abort_callback = abort_callback;
+ this->abort_callback_data = abort_callback_data;
+
+ for (auto & backend : backends) {
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
+ auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
+ if (set_abort_callback_fn) {
+ set_abort_callback_fn(backend.get(), this->abort_callback, this->abort_callback_data);
+ }
+ }
+}
+
+void llama_context::set_embeddings(bool value) {
+ LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
+
+ cparams.embeddings = value;
+
+ // TODO: not sure yet if we want to reserve here
+ //sched_need_reserve = true;
+}
+
+void llama_context::set_causal_attn(bool value) {
+ LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
+
+ if (cparams.causal_attn == value) {
+ return;
+ }
+
+ cparams.causal_attn = value;
+
+ sched_need_reserve = true;
+}
+
+void llama_context::set_warmup(bool value) {
+ LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
+
+ if (cparams.warmup == value) {
+ return;
+ }
+
+ cparams.warmup = value;
+
+ // warmups are usually with small batches, so no need to reserve
+ //sched_need_reserve = true;
+}
+
+bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
+ if (!sampler && sampling.samplers.count(seq_id) == 0) {
+ return true;
+ }
+
+ LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
+
+ const bool can_offload =
+ sampler &&
+ sampler->iface->backend_init &&
+ sampler->iface->backend_apply &&
+ llama_sampler_chain_n(sampler) > 0;
+
+ if (sampler && can_offload) {
+ auto * buft = ggml_backend_dev_buffer_type(model.dev_output());
+
+ sampler->iface->backend_init(sampler, buft);
+
+ sampling.samplers[seq_id] = sampler;
+
+ sched_need_reserve = true;
+
+ return true;
+ }
+
+ if (sampler && !can_offload) {
+ LLAMA_LOG_WARN("%s: sampler '%s' for seq_id = %d, cannot be offloaded to the backend\n", __func__, llama_sampler_name(sampler), seq_id);
+
+ if (sampling.samplers.count(seq_id) > 0) {
+ sched_need_reserve = true;
+ }
+
+ sampling.samplers.erase(seq_id);
+
+ return false;
+ }
+
+ sampling.samplers.erase(seq_id);
+
+ sched_need_reserve = true;
+
+ return true;
+}
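+
+// note: as implemented above, a sampler can be offloaded only when it provides
+// both backend_init and backend_apply and its chain is non-empty; passing a
+// non-offloadable sampler clears the per-sequence slot and returns false so
+// the caller can fall back to CPU-side sampling, while passing nullptr simply
+// clears the slot.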
+
+void llama_context::set_adapter_lora(
+ llama_adapter_lora * adapter,
+ float scale) {
+ LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale);
+
+ if (auto it = loras.find(adapter); it != loras.end()) {
+ if (it->second == scale) {
+ return;
+ }
+ }
+
+ loras[adapter] = scale;
+
+ sched_need_reserve = true;
+}
+
+bool llama_context::rm_adapter_lora(
+ llama_adapter_lora * adapter) {
+ LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter);
+
+ auto it = loras.find(adapter);
+ if (it != loras.end()) {
+ loras.erase(it);
+
+ sched_need_reserve = true;
+
+ return true;
+ }
+
+ return false;
+}
+
+void llama_context::clear_adapter_lora() {
+ LLAMA_LOG_DEBUG("%s: call\n", __func__);
+
+ if (loras.empty()) {
+ return;
+ }
+
+ loras.clear();
+
+ sched_need_reserve = true;
+}
+
+bool llama_context::apply_adapter_cvec(
+ const float * data,
+ size_t len,
+ int32_t n_embd,
+ int32_t il_start,
+ int32_t il_end) {
+ LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
+
+ // TODO: should we reserve?
+
+ return cvec.apply(model, data, len, n_embd, il_start, il_end);
+}
+
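+// process_ubatch: apply the memory context, then either reuse the previous
+// graph (when the new graph parameters describe an identical topology) or
+// rebuild and re-allocate it, set the inputs, and run the computation; on
+// failure, ret carries the ggml status and nullptr is returned.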
+llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
+ if (mctx && !mctx->apply()) {
+ LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
+ ret = GGML_STATUS_FAILED;
+ return nullptr;
+ }
+
+ auto * res = gf_res_prev.get();
+ auto * gf = res->get_gf();
+
+ // the new graph parameters
+ // in order to correctly reuse a graph, its full topology has to be uniquely determined by these parameters
+ const auto gparams = graph_params(res, ubatch, mctx, gtype);
+
+ if (!graph_reuse_disable && res->can_reuse(gparams)) {
+ //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
+
+ n_reused++;
+ } else {
+ res->reset();
+
+ ggml_backend_sched_reset(sched.get());
+ ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
+
+ //const auto t_start_us = ggml_time_us();
+
+ gf = model.build_graph(gparams);
+
+ //LLAMA_LOG_INFO("graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
+
+ if (!gf) {
+ LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
+ ret = GGML_STATUS_FAILED;
+ return nullptr;
+ }
+
+ if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
+ LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
+ ret = GGML_STATUS_ALLOC_FAILED;
+ return nullptr;
+ }
+ }
+
+ // set the input data for the input tensors
+ {
+ //const auto t_start_us = ggml_time_us();
+
+ res->set_inputs(&ubatch);
+
+ //LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
+ }
+
+ const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1);
+ if (status != GGML_STATUS_SUCCESS) {
+ LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status);
+ ret = status;
+ return nullptr;
+ }
+
+ ret = GGML_STATUS_SUCCESS;
+
+ return res;
+}
+
+int llama_context::encode(const llama_batch & batch_inp) {
+ GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
+
+ if (batch_inp.n_tokens == 0) {
+ LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
+ return -1;
+ }
+
+ const auto & hparams = model.hparams;
+
+ const int64_t n_embd = hparams.n_embd_inp();
+ const int64_t n_vocab = model.vocab.n_tokens();
+
+ // note: during encode, we always pass the full sequence starting from pos = 0
+ if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
+ LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
+ return -1;
+ }
+
+ const uint32_t n_tokens = balloc->get_n_tokens();
+
+ // [TAG_NO_CACHE_PAD]
+ // TODO: add new split mode where we pad the input sequences so that ubatch.equal_seqs == true
+ const llama_ubatch ubatch = balloc->split_simple(n_tokens);
+
+ // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
+ GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
+
+ if (t_compute_start_us == 0) {
+ t_compute_start_us = ggml_time_us();
+ }
+
+ // TODO: this clear of the buffer can easily be forgotten - need something better
+ embd_seq.clear();
+
+ sched_reserve();
+
+ n_queued_tokens += n_tokens;
+
+ // reserve output buffer
+ if (output_reserve(n_tokens) < n_tokens) {
+ LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
+ return -2;
+ }
+
+ for (uint32_t i = 0; i < n_tokens; ++i) {
+ output_ids[i] = i;
+ }
+
+ n_outputs = n_tokens;
+
+ const auto causal_attn_org = cparams.causal_attn;
+
+ // always use non-causal attention for encoder graphs
+ // TODO: this is a tmp solution until we have a proper way to support enc-dec models
+ // ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223
+ cparams.causal_attn = false;
+
+ ggml_status status;
+ const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
+
+ cparams.causal_attn = causal_attn_org;
+
+ if (!res) {
+ switch (status) {
+ case GGML_STATUS_ABORTED: return 2;
+ case GGML_STATUS_ALLOC_FAILED: return -2;
+ case GGML_STATUS_FAILED: return -3;
+ case GGML_STATUS_SUCCESS: GGML_ABORT("should not happen");
+ }
+ }
+
+ auto * t_logits = res->get_logits();
+ auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
+
+ // extract logits
+ if (logits.data && t_logits) {
+ ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
+ GGML_ASSERT(backend_res != nullptr);
+ GGML_ASSERT(logits.data != nullptr);
+
+ ggml_backend_tensor_get_async(backend_res, t_logits, logits.data, 0, n_tokens*n_vocab*sizeof(float));
+ }
+
+ // extract embeddings
+ if (embd.data && t_embd) {
+ ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
+ GGML_ASSERT(backend_embd != nullptr);
+
+ switch (cparams.pooling_type) {
+ case LLAMA_POOLING_TYPE_NONE:
+ {
+ // extract token embeddings
+ GGML_ASSERT(embd.data != nullptr);
+ const uint32_t n_embd_out = hparams.n_embd_out();
+
+ GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd.size);
+ ggml_backend_tensor_get_async(backend_embd, t_embd, embd.data, 0, n_tokens*n_embd_out*sizeof(float));
+ } break;
+ case LLAMA_POOLING_TYPE_MEAN:
+ case LLAMA_POOLING_TYPE_CLS:
+ case LLAMA_POOLING_TYPE_LAST:
+ {
+ // extract sequence embeddings
+ auto & embd_seq_out = embd_seq;
+
+ for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+ const llama_seq_id seq_id = ubatch.seq_id_unq[s];
+ const int32_t seq_idx = ubatch.seq_idx[seq_id];
+
+ embd_seq_out[seq_id].resize(n_embd);
+ ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
+ }
+ } break;
+ case LLAMA_POOLING_TYPE_RANK:
+ {
+ // extract the rerank score - n_cls_out floats per sequence
+ auto & embd_seq_out = embd_seq;
+
+ const uint32_t n_cls_out = hparams.n_cls_out;
+
+ for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+ const llama_seq_id seq_id = ubatch.seq_id_unq[s];
+ const int32_t seq_idx = ubatch.seq_idx[seq_id];
+
+ embd_seq_out[seq_id].resize(n_cls_out);
+ ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_cls_out*seq_idx)*sizeof(float), n_cls_out*sizeof(float));
+ }
+ } break;
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
+ {
+ GGML_ABORT("unknown pooling type");
+ }
+ }
+ }
+
+ // TODO: hacky solution
+ if (model.arch == LLM_ARCH_T5 && t_embd) {
+ //cross.t_embd = t_embd;
+
+ synchronize();
+
+ cross.n_embd = t_embd->ne[0];
+ cross.n_enc = t_embd->ne[1];
+ cross.v_embd.resize(cross.n_embd*cross.n_enc);
+ memcpy(cross.v_embd.data(), embd.data, ggml_nbytes(t_embd));
+
+ const auto & batch = balloc->get_batch();
+
+ // remember the sequence ids used during the encoding - needed for cross attention later
+ cross.seq_ids_enc.resize(n_tokens);
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ cross.seq_ids_enc[i].clear();
+
+ for (int s = 0; s < batch.n_seq_id[i]; s++) {
+ const llama_seq_id seq_id = batch.seq_id[i][s];
+
+ cross.seq_ids_enc[i].insert(seq_id);
+ }
+ }
+ }
+
+ return 0;
+}
+
+static std::map<llama_seq_id, uint32_t> build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) {
+ std::map<llama_seq_id, uint32_t> seq_to_row;
+ // how many output tokens we have seen so far for this ubatch.
+ uint32_t local = 0;
+ for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+ // skip tokens that are not output.
+ if (!ubatch.output[i]) {
+ continue;
+ }
+
+ const llama_seq_id seq_id = ubatch.seq_id[i][0];
+ // row_offset is the number of output tokens before this ubatch.
+ seq_to_row[seq_id] = row_offset + local;
+ ++local;
+ }
+ return seq_to_row;
+}
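+
+// worked example: for a ubatch with tokens t0 (seq 0, output), t1 (seq 1, no
+// output), t2 (seq 2, output) and row_offset = 4, the result is
+// { 0 -> 4, 2 -> 5 }; seq 1 is absent because it produced no output row.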
+
+static void copy_tensor_async_ints(
+ const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
+ const buffer_view<llama_token> & sampled,
+ const std::map<llama_seq_id, uint32_t> & seq_to_row,
+ ggml_backend_sched_t sched) {
+ if (!sampled.has_data()) {
+ return;
+ }
+
+ for (const auto & [seq_id, tensor] : tensor_map) {
+ auto it = seq_to_row.find(seq_id);
+ if (it == seq_to_row.end()) {
+ continue;
+ }
+
+ const uint32_t row = it->second;
+ GGML_ASSERT(row < sampled.size);
+
+ GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy");
+
+ ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
+ ggml_backend_tensor_get_async(backend, tensor, sampled.data + row, 0, sizeof(sampled.data[row]));
+ }
+}
+
+static void copy_tensor_async_floats(
+ const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
+ const buffer_view<float> & dst,
+ size_t stride,
+ std::vector<uint32_t> & counts,
+ const std::map<llama_seq_id, uint32_t> & seq_to_row,
+ ggml_backend_sched_t sched) {
+ if (!dst.has_data()) {
+ return;
+ }
+
+ for (const auto & [seq_id, tensor] : tensor_map) {
+ auto it = seq_to_row.find(seq_id);
+ if (it == seq_to_row.end()) {
+ continue;
+ }
+
+ const uint32_t row = it->second;
+ GGML_ASSERT(row < counts.size());
+
+ GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy");
+
+ ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
+ float * row_ptr = dst.data + (size_t) row * stride;
+ ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
+
+ // Update the actual number of logits/probabilities that were written for this row.
+ counts[row] = ggml_nelements(tensor);
+ }
+}
+
+static void copy_tensor_async_candidates(
+ const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
+ const buffer_view<llama_token> & dst,
+ size_t stride,
+ std::vector<uint32_t> & counts,
+ const std::map<llama_seq_id, uint32_t> & seq_to_row,
+ ggml_backend_sched_t sched) {
+ if (!dst.has_data()) {
+ return;
+ }
+
+ for (const auto & [seq_id, tensor] : tensor_map) {
+ auto it = seq_to_row.find(seq_id);
+ if (it == seq_to_row.end()) {
+ continue;
+ }
+
+ const uint32_t row = it->second;
+ GGML_ASSERT(row < counts.size());
+
+ GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy");
+
+ ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
+ llama_token * row_ptr = dst.data + (size_t) row * stride;
+ ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
+
+ // Update the actual number of candidates that were written.
+ counts[row] = ggml_nelements(tensor);
+ }
+}
+
+static bool needs_raw_logits(const llama_ubatch & ubatch, const std::map<llama_seq_id, llama_sampler *> & samplers) {
+ for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
+ if (!ubatch.output[i]) {
+ continue;
+ }
+
+ // Check if the output token has at least one sequence without a backend sampler.
+ for (int32_t j = 0; j < ubatch.n_seq_id[i]; ++j) {
+ llama_seq_id seq_id = ubatch.seq_id[i][j];
+ if (samplers.find(seq_id) == samplers.end()) {
+ return true;
+ }
+ }
+ }
+ return false; // all sequences use backend sampling
+}
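+
+// example: with backend samplers configured for seqs {0, 1}, a ubatch whose
+// output tokens all belong to those seqs returns false (no host logits are
+// needed), while a single output token on seq 2 returns true.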
+
+int llama_context::decode(const llama_batch & batch_inp) {
+ GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
+
+ if (!memory) {
+ LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
+ return encode(batch_inp);
+ }
+
+ if (batch_inp.n_tokens == 0) {
+ LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
+ return -1;
+ }
+
+ const auto & vocab = model.vocab;
+ const auto & hparams = model.hparams;
+
+ const int64_t n_vocab = vocab.n_tokens();
+ const int64_t n_embd = hparams.n_embd_inp();
+
+ // when computing embeddings, all tokens are output
+ const bool output_all = cparams.embeddings;
+ const bool has_samplers = !sampling.samplers.empty();
+
+ const uint32_t n_seq_max = cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max;
+
+ // TODO: avoid this workaround in the future
+ if (has_samplers && batch_inp.logits) {
+ std::vector<int32_t> seq_output_count(n_seq_max, 0);
+
+ for (int32_t i = 0; i < batch_inp.n_tokens; ++i) {
+ if (batch_inp.logits[i] == 0) {
+ continue;
+ }
+
+ const int ns = batch_inp.n_seq_id ? batch_inp.n_seq_id[i] : 1;
+
+ for (int32_t s = 0; s < ns; ++s) {
+ const llama_seq_id seq_id = batch_inp.seq_id ? batch_inp.seq_id[i][s] : 0;
+
+ seq_output_count[seq_id]++;
+ if (seq_output_count[seq_id] > 1) {
+ LLAMA_LOG_ERROR("%s: backend sampling requires at most one output token per sequence (seq_id %d had %d)\n",
+ __func__, seq_id, seq_output_count[seq_id]);
+ return -1;
+ }
+ }
+ }
+ }
+
+ if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, n_seq_max, output_all)) {
+ LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
+ return -1;
+ }
+
+ const uint32_t n_tokens_all = balloc->get_n_tokens();
+ const uint32_t n_outputs_all = balloc->get_n_outputs();
+
+ if (output_all) {
+ // require that all tokens are output
+ if (n_outputs_all != n_tokens_all) {
+ LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %d, n_tokens_all = %d)\n",
+ __func__, n_outputs_all, n_tokens_all);
+ return -1;
+ }
+ }
+
+ GGML_ASSERT(n_tokens_all <= cparams.n_batch);
+
+ GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
+
+ if (t_compute_start_us == 0) {
+ t_compute_start_us = ggml_time_us();
+ }
+ n_queued_tokens += n_tokens_all;
+
+ // TODO: this clear of the buffer can easily be forgotten - need something better
+ embd_seq.clear();
+ output_swaps.clear();
+
+ sched_reserve();
+
+ bool did_optimize = false;
+
+ // handle any pending shifts/copies
+ memory_update(false);
+
+ llama_memory_context_ptr mctx;
+
+ while (true) {
+ mctx = memory->init_batch(*balloc, cparams.n_ubatch, output_all);
+ if (!mctx) {
+ return -2;
+ }
+
+ switch (mctx->get_status()) {
+ case LLAMA_MEMORY_STATUS_SUCCESS:
+ {
+ } break;
+ case LLAMA_MEMORY_STATUS_NO_UPDATE:
+ {
+ LLAMA_LOG_ERROR("%s: unexpected memory context status: %d\n", __func__, mctx->get_status());
+
+ return -2;
+ }
+ case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+ {
+ if (!did_optimize) {
+ did_optimize = true;
+
+ if (memory_update(true)) {
+ LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, balloc->get_n_tokens());
+
+ continue;
+ }
+ }
+
+ LLAMA_LOG_WARN("%s: failed to find a memory slot for batch of size %d\n", __func__, balloc->get_n_tokens());
+
+ return 1;
+ }
+ case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+ {
+ LLAMA_LOG_ERROR("%s: compute failed while preparing batch of size %d\n", __func__, balloc->get_n_tokens());
+
+ return -2;
+ }
+ }
+
+ break;
+ }
+
+ // reserve output buffer
+ if (output_reserve(n_outputs_all) < n_outputs_all) {
+ LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
+ return -2;
+ }
+
+ int64_t n_outputs_prev = 0;
+
+ do {
+ const auto & ubatch = mctx->get_ubatch();
+
+ // count the outputs in this ubatch
+ {
+ int32_t n_outputs_new = 0;
+
+ if (n_outputs_all == n_tokens_all) {
+ n_outputs_new = ubatch.n_tokens;
+ } else {
+ for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
+ n_outputs_new += (int32_t) (ubatch.output[i] != 0);
+ }
+ }
+
+ // needs to happen before the graph is built
+ n_outputs = n_outputs_new;
+ }
+
+ ggml_status status;
+ const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
+
+ if (!res) {
+ // the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module
+ llama_pos pos_min[LLAMA_MAX_SEQ];
+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ pos_min[s] = std::numeric_limits<llama_pos>::max();
+ }
+
+ for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+ const auto & seq_id = ubatch.seq_id[i][0];
+
+ pos_min[seq_id] = std::min(pos_min[seq_id], ubatch.pos[i]);
+ }
+
+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ if (pos_min[s] == std::numeric_limits<llama_pos>::max()) {
+ continue;
+ }
+
+ LLAMA_LOG_WARN("%s: removing memory module entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]);
+
+ memory->seq_rm(s, pos_min[s], -1);
+ }
+
+ switch (status) {
+ case GGML_STATUS_ABORTED: return 2;
+ case GGML_STATUS_ALLOC_FAILED: return -2;
+ case GGML_STATUS_FAILED: return -3;
+ case GGML_STATUS_SUCCESS: GGML_ABORT("should not happen");
+ }
+ }
+
+ // plot the computation graph in dot format (for debugging purposes)
+ //if (n_past%100 == 0) {
+ // ggml_graph_dump_dot(gf, NULL, "llama.dot");
+ //}
+
+ auto * t_logits = res->get_logits();
+ auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
+
+ if (t_embd && res->get_embd_pooled()) {
+ t_embd = res->get_embd_pooled();
+ }
+
+ // extract logits
+ if (logits.data && t_logits && n_outputs > 0 && needs_raw_logits(ubatch, sampling.samplers)) {
+ ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
+ GGML_ASSERT(backend_res != nullptr);
+ GGML_ASSERT(logits.data != nullptr);
+
+ float * logits_out = logits.data + n_outputs_prev*n_vocab;
+
+ if (n_outputs) {
+ GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
+ GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits.size);
+ ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
+ }
+ }
+
+ // extract embeddings
+ if (embd.data && t_embd && n_outputs > 0) {
+ ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
+ GGML_ASSERT(backend_embd != nullptr);
+
+ switch (cparams.pooling_type) {
+ case LLAMA_POOLING_TYPE_NONE:
+ {
+ // extract token embeddings
+ GGML_ASSERT(embd.data != nullptr);
+ const uint32_t n_embd_out = hparams.n_embd_out();
+ float * embd_out = embd.data + n_outputs_prev*n_embd_out;
+
+ if (n_outputs) {
+ GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
+ GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd_out <= (int64_t) embd.size);
+ ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd_out*sizeof(float));
+ }
+ } break;
+ case LLAMA_POOLING_TYPE_MEAN:
+ case LLAMA_POOLING_TYPE_CLS:
+ case LLAMA_POOLING_TYPE_LAST:
+ {
+ // extract sequence embeddings (cleared before processing each batch)
+ auto & embd_seq_out = embd_seq;
+
+ for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+ const llama_seq_id seq_id = ubatch.seq_id_unq[s];
+ const int32_t seq_idx = ubatch.seq_idx[seq_id];
+
+ embd_seq_out[seq_id].resize(n_embd);
+ ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
+ }
+ } break;
+ case LLAMA_POOLING_TYPE_RANK:
+ {
+ // extract the rerank score - n_cls_out floats per sequence
+ auto & embd_seq_out = embd_seq;
+
+ const uint32_t n_cls_out = hparams.n_cls_out;
+
+ for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+ const llama_seq_id seq_id = ubatch.seq_id_unq[s];
+ const int32_t seq_idx = ubatch.seq_idx[seq_id];
+
+ embd_seq_out[seq_id].resize(n_cls_out);
+ ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_cls_out*seq_idx)*sizeof(float), n_cls_out*sizeof(float));
+ }
+ } break;
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
+ {
+ GGML_ABORT("unknown pooling type");
+ }
+ }
+ }
+
+ // Copy backend sampling output if this ubatch produced any sampling tensors.
+ if (has_samplers && (!res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty())) {
+ const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev);
+ const auto stride = n_vocab;
+
+ // async copy the sampling data from the backend to the host
+ copy_tensor_async_ints(res->t_sampled, sampling.sampled, seq_to_output_row, sched.get());
+
+ copy_tensor_async_floats (res->t_sampled_logits, sampling.logits, stride, sampling.logits_count, seq_to_output_row, sched.get());
+ copy_tensor_async_floats (res->t_sampled_probs, sampling.probs, stride, sampling.probs_count, seq_to_output_row, sched.get());
+ copy_tensor_async_candidates(res->t_candidates, sampling.candidates, stride, sampling.candidates_count, seq_to_output_row, sched.get());
+ }
+
+ n_outputs_prev += n_outputs;
+ } while (mctx->next());
+
+ // set to total number of outputs in the batch, for use in llama_get_logits_ith
+ n_outputs = n_outputs_all;
+
+ // set output mappings
+ if (n_outputs > 0) {
+ bool sorted_output = true;
+
+ auto & out_ids = balloc->get_out_ids();
+
+ GGML_ASSERT(out_ids.size() == (size_t) n_outputs);
+
+ for (int64_t i = 0; i < n_outputs; ++i) {
+ int64_t out_id = out_ids[i];
+ output_ids[out_id] = i;
+ if (out_id != i) {
+ sorted_output = false;
+ }
+ }
+
+ // make the outputs have the same order they had in the user-provided batch
+ // note: this is mostly relevant for recurrent models atm
+ if (!sorted_output && n_outputs > 1) {
+ GGML_ASSERT((size_t) n_outputs == out_ids.size());
+
+ // TODO: is there something more efficient which also minimizes swaps?
+ // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
+ for (uint32_t i = 0; i < n_outputs - 1; ++i) {
+ uint32_t j_min = i;
+ for (uint32_t j = i + 1; j < n_outputs; ++j) {
+ if (out_ids[j] < out_ids[j_min]) {
+ j_min = j;
+ }
+ }
+ if (j_min == i) {
+ continue;
+ }
+ std::swap(out_ids[i], out_ids[j_min]);
+
+ // remember the swaps and apply them lazily upon logits/embeddings access
+ output_swaps.push_back({ i, j_min });
+ }
+
+ std::fill(output_ids.begin(), output_ids.end(), -1);
+
+ for (uint32_t i = 0; i < n_outputs; ++i) {
+ output_ids[out_ids[i]] = i;
+ }
+ }
+ }
+
+ // wait for the computation to finish (automatically done when obtaining the model output)
+ //synchronize();
+
+ return 0;
+}
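+
+// decode() return values, as implemented above:
+//    0 - success
+//    1 - no memory slot could be found for the batch (can be retried with a smaller batch)
+//    2 - computation aborted via the abort callback
+//   -1 - invalid input batch
+//   -2 - allocation or memory-module failure
+//   -3 - graph computation failed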
+
+//
+// output
+//
+
+uint32_t llama_context::output_reserve(int32_t n_outputs) {
+ const auto & hparams = model.hparams;
+ const auto & vocab = model.vocab;
+
+ const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());
+
+ const auto n_batch = cparams.n_batch;
+ const auto n_vocab = vocab.n_tokens();
+ const auto n_embd_out = hparams.n_embd_out();
+
+ bool has_logits = true;
+ bool has_embd = cparams.embeddings;
+
+ // TODO: hacky enc-dec support
+ if (model.arch == LLM_ARCH_T5) {
+ has_logits = true;
+ has_embd = true;
+ }
+
+ size_t backend_float_count = 0;
+ size_t backend_token_count = 0;
+
+ logits.size = has_logits ? n_vocab*n_outputs_max : 0;
+ embd.size = has_embd ? n_embd_out*n_outputs_max : 0;
+
+ // Allocate backend sampling output buffers if there are backend samplers configured.
+ const bool has_sampling = !sampling.samplers.empty();
+ if (has_sampling) {
+ backend_float_count = 2 * n_vocab * n_outputs_max; // logits + probs
+ backend_token_count = (1 + n_vocab) * n_outputs_max; // sampled + candidates
+ }
+
+ if (output_ids.empty()) {
+ // init, never resized afterwards
+ output_ids.resize(n_batch);
+ }
+
+ const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
+ const size_t new_size =
+ (logits.size + embd.size + backend_float_count) * sizeof(float) +
+ ( backend_token_count) * sizeof(llama_token);
+
+ // alloc only when more than the current capacity is required
+ // TODO: also consider shrinking the buffer
+ if (!buf_output || prev_size < new_size) {
+ if (buf_output) {
+#ifndef NDEBUG
+ // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
+ LLAMA_LOG_DEBUG("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+#endif
+ synchronize();
+
+ // TODO: not needed?
+ buf_output = nullptr;
+ logits.data = nullptr;
+ embd.data = nullptr;
+ }
+
+ auto * buft = ggml_backend_cpu_buffer_type();
+ // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
+ auto * output_dev = model.dev_output();
+ auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
+ if (output_dev_host_buft) {
+ buft = output_dev_host_buft;
+ }
+ buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size));
+ if (buf_output == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
+ return 0;
+ }
+ }
+
+ float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get());
+
+ size_t offset = 0;
+ uint8_t * base = (uint8_t *) output_base;
+
+ logits = has_logits ? buffer_view<float>{output_base, logits.size} : buffer_view<float>{nullptr, 0};
+ offset += logits.size * sizeof(float);
+
+ embd = has_embd ? buffer_view<float>{(float *) (base + offset), embd.size} : buffer_view<float>{nullptr, 0};
+ offset += embd.size * sizeof(float);
+
+ sampling.logits = {nullptr, 0};
+ sampling.probs = {nullptr, 0};
+ sampling.sampled = {nullptr, 0};
+ sampling.candidates = {nullptr, 0};
+
+ if (has_sampling) {
+ sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
+ offset += sampling.logits.size * sizeof(float);
+
+ sampling.probs = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
+ offset += sampling.probs.size * sizeof(float);
+
+ sampling.sampled = {(llama_token *) (base + offset), (size_t)n_outputs_max};
+ offset += sampling.sampled.size * sizeof(llama_token);
+
+ sampling.candidates = {(llama_token *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
+ offset += sampling.candidates.size * sizeof(llama_token);
+
+ // The count vectors keep track of the actual number of logits/probs/candidates
+ // copied from the backend for each output row.
+
+ sampling.logits_count.resize(n_outputs_max);
+ sampling.probs_count.resize(n_outputs_max);
+ sampling.candidates_count.resize(n_outputs_max);
+
+ std::fill(sampling.logits_count.begin(), sampling.logits_count.end(), 0);
+ std::fill(sampling.probs_count.begin(), sampling.probs_count.end(), 0);
+ std::fill(sampling.candidates_count.begin(), sampling.candidates_count.end(), 0);
+
+ std::fill_n(sampling.sampled.data, sampling.sampled.size, LLAMA_TOKEN_NULL);
+ }
+
+ // set all ids as invalid (negative)
+ std::fill(output_ids.begin(), output_ids.end(), -1);
+
+ this->n_outputs = 0;
+
+ return n_outputs_max;
+}
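+
+// resulting layout of buf_output (offsets computed above), with the sampling
+// sections present only when backend samplers are configured:
+//   [ logits | embd | sampling.logits | sampling.probs | sampling.sampled | sampling.candidates ]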
+
+void llama_context::output_reorder() {
+ const uint64_t n_vocab = model.vocab.n_tokens();
+ // use the same row stride as output_reserve() so embedding rows are swapped whole
+ const uint64_t n_embd = model.hparams.n_embd_out();
+
+ for (size_t s = 0; s < output_swaps.size(); ++s) {
+ const uint64_t i0 = output_swaps[s].i0;
+ const uint64_t i1 = output_swaps[s].i1;
+
+ if (logits.size > 0) {
+ for (uint64_t k = 0; k < n_vocab; k++) {
+ std::swap(logits.data[i0*n_vocab + k], logits.data[i1*n_vocab + k]);
+ }
+ }
+
+ if (embd.size > 0) {
+ for (uint64_t k = 0; k < n_embd; k++) {
+ std::swap(embd.data[i0*n_embd + k], embd.data[i1*n_embd + k]);
+ }
+ }
+
+ if (sampling.logits.has_data()) {
+ for (uint64_t k = 0; k < n_vocab; ++k) {
+ std::swap(sampling.logits.data[i0*n_vocab + k], sampling.logits.data[i1*n_vocab + k]);
+ }
+ }
+
+ if (sampling.probs.has_data()) {
+ for (uint64_t k = 0; k < n_vocab; ++k) {
+ std::swap(sampling.probs.data[i0*n_vocab + k], sampling.probs.data[i1*n_vocab + k]);
+ }
+ }
+
+ if (sampling.candidates.has_data()) {
+ for (uint64_t k = 0; k < n_vocab; ++k) {
+ std::swap(sampling.candidates.data[i0*n_vocab + k], sampling.candidates.data[i1*n_vocab + k]);
+ }
+ }
+
+ if (sampling.sampled.has_data()) {
+ std::swap(sampling.sampled.data[i0], sampling.sampled.data[i1]);
+ }
+
+ if (!sampling.logits_count.empty()) {
+ std::swap(sampling.logits_count[i0], sampling.logits_count[i1]);
+ }
+
+ if (!sampling.probs_count.empty()) {
+ std::swap(sampling.probs_count[i0], sampling.probs_count[i1]);
+ }
+
+ if (!sampling.candidates_count.empty()) {
+ std::swap(sampling.candidates_count[i0], sampling.candidates_count[i1]);
+ }
+ }
+
+ output_swaps.clear();
+}
+
+//
+// graph
+//
+
+uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
+ if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
+ return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
+ }
+ uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
+ for (const auto & lora : model.loras) {
+ res += lora->get_n_nodes();
+ }
+ return res;
+}
+
+llm_graph_result * llama_context::get_gf_res_reserve() const {
+ return static_cast<llm_graph_result *>(gf_res_reserve.get());
+}
+
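+// graph_reserve: build a worst-case graph for the given ubatch shape and use
+// it to size the scheduler's compute buffers; with split_only the graph is
+// only split across backends (optionally just measuring buffer sizes into
+// `sizes`) without allocating them.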
+ggml_cgraph * llama_context::graph_reserve(
+ uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
+ LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
+ GGML_ASSERT(n_outputs >= 1);
+
+ if (n_tokens % n_seqs != 0) {
+ n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
+ n_outputs = std::max(n_outputs, n_tokens);
+
+ LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
+ }
+
+ ggml_backend_sched_reset(sched.get());
+
+ // when the scheduler is reset, we cannot reuse the old graph, so we reset the previous graph result to prevent that
+ gf_res_prev->reset();
+
+ // store the n_outputs as it is, and restore it afterwards
+ // TODO: not sure if needed, might simplify in the future by removing this
+ const auto save_n_outputs = this->n_outputs;
+
+ this->n_outputs = n_outputs;
+
+ llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
+ llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
+
+ // set one output token per sequence in order to activate all backend samplers
+ std::vector<llama_seq_id> seq_ids(n_seqs);
+ for (uint32_t i = 0; i < n_seqs; ++i) {
+ seq_ids[i] = i;
+ ubatch.n_seq_id[i] = 1;
+ ubatch.seq_id[i] = &seq_ids[i];
+ ubatch.output[i] = true;
+ }
+
+ auto * res = gf_res_reserve.get();
+
+ const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
+
+ res->reset();
+
+ auto * gf = model.build_graph(gparams);
+
+ this->n_outputs = save_n_outputs;
+
+ // initialize scheduler with the specified graph
+ if (split_only) {
+ if (sizes) {
+ ggml_backend_sched_reserve_size(sched.get(), gf, sizes);
+ } else {
+ ggml_backend_sched_split_graph(sched.get(), gf);
+ }
+ } else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+ GGML_ASSERT(!sizes);
+ LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+ return nullptr;
+ }
+
+ return gf;
+}
+
+llm_graph_params llama_context::graph_params(
+ llm_graph_result * res,
+ const llama_ubatch & ubatch,
+ const llama_memory_context_i * mctx,
+ llm_graph_type gtype) const {
+ return {
+ /*.arch =*/ model.arch,
+ /*.hparams =*/ model.hparams,
+ /*.cparams =*/ cparams,
+ /*.ubatch =*/ ubatch,
+ /*.gtype =*/ gtype,
+ /*.sched =*/ sched.get(),
+ /*.backend_cpu =*/ backend_cpu,
+ /*.cvec =*/ &cvec,
+ /*.loras =*/ &loras,
+ /*.mctx =*/ mctx,
+ /*.cross =*/ &cross,
+ /*.samplers =*/ sampling.samplers,
+ /*.n_outputs =*/ n_outputs,
+ /*.cb =*/ graph_get_cb(),
+ /*.res =*/ res,
+ };
+}
+
+ggml_status llama_context::graph_compute(
+ ggml_cgraph * gf,
+ bool batched) {
+ int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads;
+ ggml_threadpool_t tp = batched ? threadpool_batch : threadpool;
+
+ if (backend_cpu != nullptr) {
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
+ auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
+ if (set_threadpool_fn) {
+ set_threadpool_fn(backend_cpu, tp);
+ }
+ }
+
+ // set the number of threads for all the backends
+ for (const auto & set_n_threads_fn : set_n_threads_fns) {
+ set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
+ }
+
+ auto status = ggml_backend_sched_graph_compute_async(sched.get(), gf);
+ if (status != GGML_STATUS_SUCCESS) {
+ LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status);
+ }
+
+ // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched));
+
+ return status;
+}
+
+llm_graph_cb llama_context::graph_get_cb() const {
+ return [&](const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il) {
+ if (il >= 0) {
+ ggml_format_name(cur, "%s-%d", name, il);
+ } else {
+ ggml_set_name(cur, name);
+ }
+
+ // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
+ // FIXME: fix in ggml_backend_sched
+ const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
+ if (ubatch.n_tokens < 32 || full_offload) {
+ if (il != -1 && strcmp(name, "norm") == 0) {
+ const auto & dev_layer = model.dev_layer(il);
+ for (const auto & backend : backends) {
+ if (ggml_backend_get_device(backend.get()) == dev_layer) {
+ if (ggml_backend_supports_op(backend.get(), cur)) {
+ ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend.get());
+ }
+ }
+ }
+ }
+ }
+ };
+}
+
+//
+// state save/load
+//
+
+class llama_io_write_dummy : public llama_io_write_i {
+public:
+ llama_io_write_dummy() = default;
+
+ void write(const void * /* src */, size_t size) override {
+ size_written += size;
+ }
+
+ void write_tensor(const ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
+ size_written += size;
+ }
+
+ size_t n_bytes() override {
+ return size_written;
+ }
+
+private:
+ size_t size_written = 0;
+};
+
+class llama_io_write_buffer : public llama_io_write_i {
+public:
+ llama_io_write_buffer(
+ uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
+
+ void write(const void * src, size_t size) override {
+ if (size > buf_size) {
+ throw std::runtime_error("unexpectedly reached end of buffer");
+ }
+ memcpy(ptr, src, size);
+ ptr += size;
+ size_written += size;
+ buf_size -= size;
+ }
+
+ void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override {
+ if (size > buf_size) {
+ throw std::runtime_error("unexpectedly reached end of buffer");
+ }
+ ggml_backend_tensor_get(tensor, ptr, offset, size);
+ ptr += size;
+ size_written += size;
+ buf_size -= size;
+ }
+
+ size_t n_bytes() override {
+ return size_written;
+ }
+
+private:
+ uint8_t * ptr;
+ size_t buf_size = 0;
+ size_t size_written = 0;
+};
+
+class llama_io_read_buffer : public llama_io_read_i {
+public:
+ llama_io_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
+
+ const uint8_t * read(size_t size) override {
+ const uint8_t * base_ptr = ptr;
+ if (size > buf_size) {
+ throw std::runtime_error("unexpectedly reached end of buffer");
+ }
+ ptr += size;
+ size_read += size;
+ buf_size -= size;
+ return base_ptr;
+ }
+
+ void read_to(void * dst, size_t size) override {
+ memcpy(dst, read(size), size);
+ }
+
+ size_t n_bytes() override {
+ return size_read;
+ }
+
+private:
+ const uint8_t * ptr;
+ size_t buf_size = 0;
+ size_t size_read = 0;
+};
+
+class llama_io_write_file : public llama_io_write_i {
+public:
+ llama_io_write_file(llama_file * f) : file(f) {}
+
+ void write(const void * src, size_t size) override {
+ file->write_raw(src, size);
+ size_written += size;
+ }
+
+ void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override {
+ temp_buffer.resize(size);
+ ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
+ write(temp_buffer.data(), temp_buffer.size());
+ }
+
+ size_t n_bytes() override {
+ return size_written;
+ }
+
+private:
+ llama_file * file;
+ size_t size_written = 0;
+ std::vector<uint8_t> temp_buffer;
+};
+
+class llama_io_read_file : public llama_io_read_i {
+public:
+ llama_io_read_file(llama_file * f) : file(f) {}
+
+ void read_to(void * dst, size_t size) override {
+ file->read_raw(dst, size);
+ size_read += size;
+ }
+
+ const uint8_t * read(size_t size) override {
+ temp_buffer.resize(size);
+ read_to(temp_buffer.data(), size);
+ return temp_buffer.data();
+ }
+
+ size_t n_bytes() override {
+ return size_read;
+ }
+
+private:
+ llama_file * file;
+ size_t size_read = 0;
+ std::vector<uint8_t> temp_buffer;
+};
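+
+// usage sketch for the helpers above: a state round-trip through memory is
+//   std::vector<uint8_t> buf(ctx.state_get_size());
+//   ctx.state_get_data(buf.data(), buf.size()); // writes via llama_io_write_buffer
+//   ctx.state_set_data(buf.data(), buf.size()); // reads via llama_io_read_buffer
+// llama_io_write_dummy implements the same interface but only counts bytes,
+// which is how state_get_size() below measures the state without copying it.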
+
+size_t llama_context::state_get_size() {
+ llama_io_write_dummy io;
+ try {
+ return state_write_data(io);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
+ return 0;
+ }
+}
+
+size_t llama_context::state_get_data(uint8_t * dst, size_t size) {
+ llama_io_write_buffer io(dst, size);
+ try {
+ return state_write_data(io);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
+ return 0;
+ }
+}
+
+size_t llama_context::state_set_data(const uint8_t * src, size_t size) {
+ llama_io_read_buffer io(src, size);
+ try {
+ return state_read_data(io);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
+ return 0;
+ }
+}
+
+size_t llama_context::state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags) {
+ llama_io_write_dummy io;
+ try {
+ return state_seq_write_data(io, seq_id, flags);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
+ return 0;
+ }
+}
+
+size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags) {
+ llama_io_write_buffer io(dst, size);
+ try {
+ return state_seq_write_data(io, seq_id, flags);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
+ return 0;
+ }
+}
+
+size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags) {
+ llama_io_read_buffer io(src, size);
+ try {
+ return state_seq_read_data(io, seq_id, flags);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
+ return 0;
+ }
+}
+
+bool llama_context::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ llama_file file(filepath, "rb");
+
+ // sanity checks
+ {
+ const uint32_t magic = file.read_u32();
+ const uint32_t version = file.read_u32();
+
+ if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
+ LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+ return false;
+ }
+ }
+
+ // load the prompt
+ {
+ const uint32_t n_token_count = file.read_u32();
+
+ if (n_token_count > n_token_capacity) {
+ LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+ return false;
+ }
+
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+ *n_token_count_out = n_token_count;
+ }
+
+ // restore the context state
+ {
+ const size_t n_state_size_cur = file.size() - file.tell();
+
+ llama_io_read_file io(&file);
+ const size_t n_read = state_read_data(io);
+
+ if (n_read != n_state_size_cur) {
+ LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool llama_context::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) {
+ llama_file file(filepath, "wb");
+
+ file.write_u32(LLAMA_SESSION_MAGIC);
+ file.write_u32(LLAMA_SESSION_VERSION);
+
+ // save the prompt
+ file.write_u32((uint32_t) n_token_count);
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+ // save the context state using stream saving
+ llama_io_write_file io(&file);
+ state_write_data(io);
+
+ return true;
+}
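+
+// session file layout written above (and expected by state_load_file):
+//   u32 magic | u32 version | u32 n_token_count | llama_token tokens[n_token_count] | state blob
+// state_seq_save_file below uses the same layout with the LLAMA_STATE_SEQ
+// magic/version pair.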
+
+size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ llama_file file(filepath, "rb");
+
+ // version checks
+ {
+ const uint32_t magic = file.read_u32();
+ const uint32_t version = file.read_u32();
+
+ if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
+ LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
+ return 0;
+ }
+ }
+
+ // load the prompt
+ {
+ const uint32_t n_token_count = file.read_u32();
+
+ if (n_token_count > n_token_capacity) {
+ LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+ return 0;
+ }
+
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+ *n_token_count_out = n_token_count;
+ }
+
+ // restore the context state
+ {
+ const size_t state_size = file.size() - file.tell();
+ llama_io_read_file io(&file);
+ const size_t nread = state_seq_read_data(io, seq_id, 0);
+ if (!nread) {
+ LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
+ return 0;
+ }
+ GGML_ASSERT(nread <= state_size);
+ GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
+ }
+
+ return file.tell();
+}
+
+size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) {
+ llama_file file(filepath, "wb");
+
+ file.write_u32(LLAMA_STATE_SEQ_MAGIC);
+ file.write_u32(LLAMA_STATE_SEQ_VERSION);
+
+ // save the prompt
+ file.write_u32((uint32_t) n_token_count);
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+ // save the context state using stream saving
+ llama_io_write_file io(&file);
+ state_seq_write_data(io, seq_id, 0);
+
+ const size_t res = file.tell();
+ GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes());
+
+ return res;
+}
+
+size_t llama_context::state_write_data(llama_io_write_i & io) {
+ LLAMA_LOG_DEBUG("%s: writing state\n", __func__);
+
+ // write model info
+ {
+ LLAMA_LOG_DEBUG("%s: - writing model info\n", __func__);
+
+ const std::string arch_str = llm_arch_name(model.arch);
+ io.write_string(arch_str);
+ // TODO: add more model-specific info which should prevent loading the session file if not identical
+ }
+
+ // write output ids
+ {
+ LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__);
+
+ const auto n_outputs = this->n_outputs;
+ const auto & output_ids = this->output_ids;
+
+ std::vector<int32_t> w_output_pos;
+
+ w_output_pos.resize(n_outputs);
+
+ // build a more compact representation of the output ids
+ for (size_t i = 0; i < n_batch(); ++i) {
+ // map an output id to a position in the batch
+ int64_t pos = output_ids[i];
+ if (pos >= 0) {
+ GGML_ASSERT(pos < n_outputs);
+ w_output_pos[pos] = i;
+ }
+ }
+
+ io.write(&n_outputs, sizeof(n_outputs));
+
+ if (n_outputs) {
+ io.write(w_output_pos.data(), n_outputs * sizeof(int32_t));
+ }
+ }
+
+ // [TAG_CONTEXT_STATE_LOGITS]
+ // write logits
+ {
+ LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
+
+ const uint64_t logits_size = std::min((uint64_t) this->logits.size, (uint64_t) n_outputs * model.vocab.n_tokens());
+
+ io.write(&logits_size, sizeof(logits_size));
+
+ if (logits_size) {
+ io.write(logits.data, logits_size * sizeof(float));
+ }
+ }
+
+ // write embeddings
+ {
+ LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__);
+
+ const uint64_t embd_size = std::min((uint64_t) this->embd.size, (uint64_t) n_outputs * model.hparams.n_embd);
+
+ io.write(&embd_size, sizeof(embd_size));
+
+ if (embd_size) {
+ io.write(embd.data, embd_size * sizeof(float));
+ }
+ }
+
+ // TODO: handle sampling buffers and samplers state ?
+ // https://github.com/ggml-org/llama.cpp/pull/17004
+
+ if (memory != nullptr) {
+ LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
+ memory->state_write(io);
+ }
+
+ return io.n_bytes();
+}
+
+size_t llama_context::state_read_data(llama_io_read_i & io) {
+ LLAMA_LOG_DEBUG("%s: reading state\n", __func__);
+
+ // read model info
+ {
+ LLAMA_LOG_DEBUG("%s: - reading model info\n", __func__);
+
+ const std::string cur_arch_str = llm_arch_name(model.arch);
+
+ std::string arch_str;
+ io.read_string(arch_str);
+ if (cur_arch_str != arch_str) {
+ throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str()));
+ }
+ // TODO: add more info which needs to be identical but which is not verified otherwise
+ }
+
+ // read output ids
+ {
+ LLAMA_LOG_DEBUG("%s: - reading output ids\n", __func__);
+
+ auto n_outputs = this->n_outputs;
+ io.read_to(&n_outputs, sizeof(n_outputs));
+
+ if (n_outputs > output_reserve(n_outputs)) {
+ throw std::runtime_error("could not reserve outputs");
+ }
+
+ std::vector<int32_t> output_pos;
+
+ if (n_outputs) {
+ output_pos.resize(n_outputs);
+ io.read_to(output_pos.data(), n_outputs * sizeof(int32_t));
+
+ for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
+ int32_t id = output_pos[i];
+ if ((uint32_t) id >= n_batch()) {
+ throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch()));
+ }
+ this->output_ids[id] = i;
+ }
+
+ this->n_outputs = n_outputs;
+ }
+ }
+
+ // read logits
+ {
+ LLAMA_LOG_DEBUG("%s: - reading logits\n", __func__);
+
+ uint64_t logits_size;
+ io.read_to(&logits_size, sizeof(logits_size));
+
+ if (this->logits.size < logits_size) {
+ throw std::runtime_error("logits buffer too small");
+ }
+
+ if (logits_size) {
+ io.read_to(this->logits.data, logits_size * sizeof(float));
+ }
+ }
+
+ // read embeddings
+ {
+ LLAMA_LOG_DEBUG("%s: - reading embeddings\n", __func__);
+
+ uint64_t embd_size;
+ io.read_to(&embd_size, sizeof(embd_size));
+
+ if (this->embd.size < embd_size) {
+ throw std::runtime_error("embeddings buffer too small");
+ }
+
+ if (embd_size) {
+ io.read_to(this->embd.data, embd_size * sizeof(float));
+ }
+ }
+
+ // TODO: handle sampling buffers and samplers state ?
+ // https://github.com/ggml-org/llama.cpp/pull/17004
+
+ if (memory) {
+ LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);
+
+ memory->state_read(io);
+ }
+
+ return io.n_bytes();
+}
+
+size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+ GGML_UNUSED(seq_id);
+
+ if (memory) {
+ memory->state_write(io, seq_id, flags);
+ }
+
+ return io.n_bytes();
+}
+
+size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+ GGML_UNUSED(seq_id);
+
+ if (memory) {
+ memory->state_read(io, seq_id, flags);
+ }
+
+ return io.n_bytes();
+}
+
+//
+// perf
+//
+
+llama_perf_context_data llama_context::perf_get_data() const {
+ llama_perf_context_data data = {};
+
+ data.t_start_ms = 1e-3 * t_start_us;
+ data.t_load_ms = 1e-3 * t_load_us;
+ data.t_p_eval_ms = 1e-3 * t_p_eval_us;
+ data.t_eval_ms = 1e-3 * t_eval_us;
+ data.n_p_eval = std::max(1, n_p_eval);
+ data.n_eval = std::max(1, n_eval);
+ data.n_reused = std::max(0, n_reused);
+
+ return data;
+}
+
+void llama_context::perf_reset() {
+ t_start_us = ggml_time_us();
+ t_eval_us = n_eval = 0;
+ t_p_eval_us = n_p_eval = 0;
+ n_reused = 0;
+}
+
+std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
+ for (const auto & [buft, size] : model.memory_breakdown()) {
+ ret[buft].model += size;
+ }
+ if (memory) {
+ for (const auto & [buft, size] : memory->memory_breakdown()) {
+ ret[buft].context += size;
+ }
+ }
+ if (model.hparams.no_alloc) {
+ for (size_t i = 0; i < backends.size(); ++i) {
+ ggml_backend_t backend = backends[i].get();
+ ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+ ret[buft].compute += backend_buf_exp_size[i];
+ }
+ } else {
+ for (const auto & backend_ptr : backends) {
+ ggml_backend_t backend = backend_ptr.get();
+ ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+ ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ }
+ }
+ return ret;
+}
+
+//
+// training
+//
+
+static void llama_set_param(struct ggml_tensor * tensor, llama_opt_param_filter param_filter, void * userdata) {
+ if (!tensor || tensor->type != GGML_TYPE_F32) {
+ return;
+ }
+ if (!param_filter(tensor, userdata)) {
+ return;
+ }
+ if (strcmp(tensor->name, "token_embd.weight") == 0) {
+ return; // FIXME
+ }
+ if (strcmp(tensor->name, "rope_freqs.weight") == 0) {
+ return; // FIXME
+ }
+ ggml_set_param(tensor);
+}
+
+void llama_context::opt_init(struct llama_model * model, struct llama_opt_params lopt_params) {
+ GGML_ASSERT(!opt_ctx);
+ model->hparams.n_ctx_train = lopt_params.n_ctx_train > 0 ? lopt_params.n_ctx_train : n_ctx();
+ const uint32_t n_batch = std::min(this->n_batch(), model->hparams.n_ctx_train);
+ const uint32_t n_ubatch = std::min(this->n_ubatch(), n_batch);
+ GGML_ASSERT(model->hparams.n_ctx_train % n_batch == 0);
+ GGML_ASSERT(n_batch % n_ubatch == 0);
+
+ ggml_opt_params opt_params = ggml_opt_default_params(sched.get(), GGML_OPT_LOSS_TYPE_CROSS_ENTROPY);
+ opt_params.opt_period = n_batch / n_ubatch;
+ opt_params.get_opt_pars = lopt_params.get_opt_pars;
+ opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
+ opt_params.optimizer = lopt_params.optimizer_type;
+ opt_ctx = ggml_opt_init(opt_params);
+
+ llama_opt_param_filter param_filter = lopt_params.param_filter;
+ void * param_filter_ud = lopt_params.param_filter_ud;
+
+ //llama_set_param(model->tok_embd, param_filter, param_filter_ud); // FIXME
+ llama_set_param(model->type_embd, param_filter, param_filter_ud);
+ llama_set_param(model->pos_embd, param_filter, param_filter_ud);
+ llama_set_param(model->tok_norm, param_filter, param_filter_ud);
+ llama_set_param(model->tok_norm_b, param_filter, param_filter_ud);
+ llama_set_param(model->output_norm, param_filter, param_filter_ud);
+ llama_set_param(model->output_norm_b, param_filter, param_filter_ud);
+ llama_set_param(model->output, param_filter, param_filter_ud);
+ llama_set_param(model->output_b, param_filter, param_filter_ud);
+ llama_set_param(model->output_norm_enc, param_filter, param_filter_ud);
+ llama_set_param(model->cls, param_filter, param_filter_ud);
+ llama_set_param(model->cls_b, param_filter, param_filter_ud);
+ llama_set_param(model->cls_out, param_filter, param_filter_ud);
+ llama_set_param(model->cls_out_b, param_filter, param_filter_ud);
+
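+ // note: iterate llama_layer as a flat array of ggml_tensor pointers; this
+ // relies on the struct containing only tensor-pointer members.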
+ for (struct llama_layer & layer : model->layers) {
+ for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
+ llama_set_param(reinterpret_cast<struct ggml_tensor **>(&layer)[i], param_filter, param_filter_ud);
+ }
+ }
+}
+
+void llama_context::opt_epoch_iter(
+ ggml_opt_dataset_t dataset,
+ ggml_opt_result_t result,
+ const std::vector<llama_token> & tokens,
+ const std::vector<llama_token> & labels_sparse,
+ llama_batch & batch,
+ ggml_opt_epoch_callback callback,
+ bool train,
+ int64_t idata_in_loop,
+ int64_t ndata_in_loop,
+ int64_t t_loop_start) {
+ GGML_ASSERT(opt_ctx);
+ const uint32_t n_ctx = llama_model_n_ctx_train(&model);
+ const uint32_t n_batch = std::min(this->n_batch(), n_ctx);
+ const uint32_t n_ubatch = std::min(this->n_ubatch(), n_batch);
+
+ memory->clear(true);
+
+ for (uint32_t pos_ctx = 0; pos_ctx < n_ctx; pos_ctx += n_batch) {
+ batch.n_tokens = n_batch;
+ for (uint32_t pos_batch = 0; pos_batch < n_batch; ++pos_batch) {
+ batch.token [pos_batch] = tokens[pos_ctx + pos_batch];
+ batch.pos [pos_batch] = pos_ctx + pos_batch;
+ batch.n_seq_id[pos_batch] = 1;
+ batch.seq_id [pos_batch][0] = 0;
+ batch.logits [pos_batch] = true;
+ }
+
+ if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd_inp(), cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
+ LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
+ return;
+ }
+
+ const uint32_t n_tokens_all = balloc->get_n_tokens();
+
+ n_queued_tokens += n_tokens_all;
+
+ embd_seq.clear();
+
+ uint32_t n_outputs_all = n_tokens_all;
+
+ auto mctx = memory->init_batch(*balloc, cparams.n_ubatch, true);
+ if (!mctx || mctx->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
+ LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__);
+ break;
+ }
+
+ // reserve output buffer
+ if (output_reserve(n_outputs_all) < n_outputs_all) {
+ LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
+ GGML_ABORT("TODO: handle this error");
+ }
+
+ uint32_t pos_batch = 0;
+ do {
+ const auto & ubatch = mctx->get_ubatch();
+
+ n_outputs = ubatch.n_tokens;
+
+ if (!mctx->apply()) {
+ LLAMA_LOG_ERROR("%s: failed to update the memory context\n", __func__);
+ break;
+ }
+
+ auto * res = gf_res_prev.get();
+
+ const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT);
+
+ res->reset();
+
+ auto * gf = model.build_graph(gparams);
+
+ struct ggml_context * ctx_compute_opt;
+ {
+ const size_t size_gf = ggml_graph_size(gf);
+ const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 2*ggml_graph_overhead_custom(size_gf, /*grads = */ true);
+ struct ggml_init_params params = {
+ /*.mem_size =*/ size_meta,
+ /*.mem_buffer =*/ nullptr,
+ /*.no_alloc =*/ true,
+ };
+ ctx_compute_opt = ggml_init(params);
+ }
+ ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_inp_tokens(), res->get_logits());
+ ggml_opt_alloc(opt_ctx, train);
+
+ res->set_inputs(&ubatch);
+ {
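+ // build one-hot labels for the cross-entropy loss: zero the whole
+ // buffer, then set 1.0f at the target token id of each ubatch position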
+ struct ggml_tensor * labels = ggml_opt_labels(opt_ctx);
+ GGML_ASSERT(labels->ne[1] == n_ubatch);
+ ggml_set_zero(labels);
+ const float onef = 1.0f;
+ for (uint32_t pos_ubatch = 0; pos_ubatch < n_ubatch; ++pos_ubatch) {
+ const uint32_t ilabel = pos_ctx + pos_batch + pos_ubatch;
+ GGML_ASSERT(labels_sparse[ilabel] < labels->ne[0]);
+ ggml_backend_tensor_set(labels, &onef, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float));
+ }
+ }
+ ggml_opt_eval(opt_ctx, result);
+ if (callback) {
+ callback(train, opt_ctx, dataset, result, idata_in_loop + (pos_ctx + pos_batch)/n_ubatch + 1, ndata_in_loop, t_loop_start);
+ }
+ ggml_free(ctx_compute_opt);
+
+ pos_batch += ubatch.n_tokens;
+ } while (mctx->next());
+ }
+}
+
+void llama_context::opt_epoch(
+ ggml_opt_dataset_t dataset,
+ ggml_opt_result_t result_train,
+ ggml_opt_result_t result_eval,
+ int64_t idata_split,
+ ggml_opt_epoch_callback callback_train,
+ ggml_opt_epoch_callback callback_eval) {
+ const uint32_t n_ctx = this->n_ctx();
+ const uint32_t n_batch = std::min(cparams.n_batch, n_ctx);
+ const uint32_t n_ubatch = std::min(cparams.n_ubatch, n_batch);
+ const int64_t ndata = ggml_opt_dataset_ndata(dataset);
+
+ GGML_ASSERT(idata_split >= 0);
+ GGML_ASSERT(idata_split <= ndata);
+
+ const uint32_t ubatch_per_ctx = n_ctx / n_ubatch;
+
+ struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
+ std::vector<llama_token> tokens(n_ctx);
+ std::vector<llama_token> labels_sparse(n_ctx);
+
+ int64_t idata = 0;
+
+ int64_t t_loop_start = ggml_time_us();
+ int64_t ndata_in_loop = idata_split*ubatch_per_ctx;
+ for (; idata < idata_split; ++idata) {
+ constexpr bool train = true;
+ const int64_t idata_in_loop = idata*ubatch_per_ctx;
+
+ ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata);
+ opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch,
+ callback_train, train, idata_in_loop, ndata_in_loop, t_loop_start);
+ }
+
+ t_loop_start = ggml_time_us();
+ ndata_in_loop = (ndata - idata_split)*ubatch_per_ctx;
+ for (; idata < ndata; ++idata) {
+ constexpr bool train = false;
+ const int64_t idata_in_loop = (idata - idata_split)*ubatch_per_ctx;
+
+ ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata);
+ opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch,
+ callback_eval, train, idata_in_loop, ndata_in_loop, t_loop_start);
+ }
+
+ llama_batch_free(batch);
+}
+
+//
+// interface implementation
+//
+
+llama_context_params llama_context_default_params() {
+ llama_context_params result = {
+ /*.n_ctx =*/ 512,
+ /*.n_batch =*/ 2048,
+ /*.n_ubatch =*/ 512,
+ /*.n_seq_max =*/ 1,
+ /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
+ /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
+ /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
+ /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
+ /*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
+ /*.flash_attn_type =*/ LLAMA_FLASH_ATTN_TYPE_AUTO,
+ /*.rope_freq_base =*/ 0.0f,
+ /*.rope_freq_scale =*/ 0.0f,
+ /*.yarn_ext_factor =*/ -1.0f,
+ /*.yarn_attn_factor =*/ -1.0f,
+ /*.yarn_beta_fast =*/ -1.0f,
+ /*.yarn_beta_slow =*/ -1.0f,
+ /*.yarn_orig_ctx =*/ 0,
+ /*.defrag_thold =*/ -1.0f,
+ /*.cb_eval =*/ nullptr,
+ /*.cb_eval_user_data =*/ nullptr,
+ /*.type_k =*/ GGML_TYPE_F16,
+ /*.type_v =*/ GGML_TYPE_F16,
+ /*.abort_callback =*/ nullptr,
+ /*.abort_callback_data =*/ nullptr,
+ /*.embeddings =*/ false,
+ /*.offload_kqv =*/ true,
+ /*.no_perf =*/ true,
+ /*.op_offload =*/ true,
+ /*.swa_full =*/ true,
+ /*.kv_unified =*/ false,
+ /*.sampler =*/ nullptr,
+ /*.n_sampler =*/ 0,
+ };
+
+ return result;
+}
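+
+// example usage (a minimal sketch; obtaining `model` via the loader API is
+// assumed and omitted here):
+//
+//   llama_context_params cparams = llama_context_default_params();
+//   cparams.n_ctx   = 4096;
+//   cparams.n_batch = 512;
+//   llama_context * ctx = llama_init_from_model(model, cparams);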
+
+llama_context * llama_init_from_model(
+ llama_model * model,
+ llama_context_params params) {
+ if (!model) {
+ LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
+ return nullptr;
+ }
+
+ if (params.n_batch == 0 && params.n_ubatch == 0) {
+ LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__);
+ return nullptr;
+ }
+
+ if (params.n_ctx == 0 && model->hparams.n_ctx_train == 0) {
+ LLAMA_LOG_ERROR("%s: n_ctx and model->hparams.n_ctx_train cannot both be zero\n", __func__);
+ return nullptr;
+ }
+
+ if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && model->arch == LLM_ARCH_GROK) {
+ LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+ }
+
+ if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
+ const uint32_t blck_size = ggml_blck_size(params.type_k);
+ if (model->hparams.n_embd_head_k % blck_size != 0) {
+ LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
+ __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k);
+ return nullptr;
+ }
+ }
+
+ if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
+ const uint32_t blck_size = ggml_blck_size(params.type_v);
+ if (model->hparams.n_embd_head_v % blck_size != 0) {
+ LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_k=%u\n",
+ __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v);
+ return nullptr;
+ }
+ }
+
+ if (ggml_is_quantized(params.type_v) && params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_DISABLED) {
+ LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
+ return nullptr;
+ }
+
+ if (params.pooling_type != LLAMA_POOLING_TYPE_UNSPECIFIED &&
+ params.pooling_type != model->hparams.pooling_type) {
+ // user-specified pooling type differs from the model default
+ LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
+ model->hparams.pooling_type, params.pooling_type);
+ }
+
+ try {
+ auto * ctx = new llama_context(*model, params);
+ return ctx;
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: failed to initialize the context: %s\n", __func__, err.what());
+ }
+
+ return nullptr;
+}
+
+// deprecated
+llama_context * llama_new_context_with_model(
+ llama_model * model,
+ llama_context_params params) {
+ return llama_init_from_model(model, params);
+}
+
+void llama_free(llama_context * ctx) {
+ delete ctx;
+}
+
+uint32_t llama_n_ctx(const llama_context * ctx) {
+ return ctx->n_ctx();
+}
+
+uint32_t llama_n_ctx_seq(const llama_context * ctx) {
+ return ctx->n_ctx_seq();
+}
+
+uint32_t llama_n_batch(const llama_context * ctx) {
+ return ctx->n_batch();
+}
+
+uint32_t llama_n_ubatch(const llama_context * ctx) {
+ return ctx->n_ubatch();
+}
+
+uint32_t llama_n_seq_max(const llama_context * ctx) {
+ return ctx->n_seq_max();
+}
+
+const llama_model * llama_get_model(const llama_context * ctx) {
+ return &ctx->get_model();
+}
+
+enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
+ return ctx->pooling_type();
+}
+
+void llama_attach_threadpool(
+ llama_context * ctx,
+ ggml_threadpool_t threadpool,
+ ggml_threadpool_t threadpool_batch) {
+ ctx->attach_threadpool(threadpool, threadpool_batch);
+}
+
+void llama_detach_threadpool(llama_context * ctx) {
+ ctx->detach_threadpool();
+}
+
+void llama_set_n_threads(llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
+ ctx->set_n_threads(n_threads, n_threads_batch);
+}
+
+int32_t llama_n_threads(llama_context * ctx) {
+ return ctx->n_threads();
+}
+
+int32_t llama_n_threads_batch(llama_context * ctx) {
+ return ctx->n_threads_batch();
+}
+
+void llama_set_abort_callback(llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
+ ctx->set_abort_callback(abort_callback, abort_callback_data);
+}
+
+void llama_set_embeddings(llama_context * ctx, bool embeddings) {
+ ctx->set_embeddings(embeddings);
+}
+
+void llama_set_causal_attn(llama_context * ctx, bool causal_attn) {
+ ctx->set_causal_attn(causal_attn);
+}
+
+void llama_set_warmup(llama_context * ctx, bool warmup) {
+ ctx->set_warmup(warmup);
+}
+
+void llama_synchronize(llama_context * ctx) {
+ ctx->synchronize();
+}
+
+float * llama_get_logits(llama_context * ctx) {
+ ctx->synchronize();
+
+ return ctx->get_logits();
+}
+
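+// prefer the per-sequence sampled logits for output i when a sampler is
+// attached; otherwise fall back to the corresponding row of the full logits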
+float * llama_get_logits_ith(llama_context * ctx, int32_t i) {
+ ctx->synchronize();
+
+ float * res = nullptr;
+
+ res = ctx->get_sampled_logits_ith(i);
+
+ if (!res) {
+ res = ctx->get_logits_ith(i);
+ }
+
+ return res;
+}
+
+float * llama_get_embeddings(llama_context * ctx) {
+ ctx->synchronize();
+
+ return ctx->get_embeddings();
+}
+
+float * llama_get_embeddings_ith(llama_context * ctx, int32_t i) {
+ ctx->synchronize();
+
+ return ctx->get_embeddings_ith(i);
+}
+
+float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
+ ctx->synchronize();
+
+ return ctx->get_embeddings_seq(seq_id);
+}
+
+bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
+ return ctx->set_sampler(seq_id, smpl);
+}
+
+llama_token llama_get_sampled_token_ith(llama_context * ctx, int32_t i) {
+ ctx->synchronize();
+
+ return ctx->get_sampled_token_ith(i);
+}
+
+float * llama_get_sampled_probs_ith(llama_context * ctx, int32_t i) {
+ ctx->synchronize();
+
+ return ctx->get_sampled_probs_ith(i);
+}
+
+float * llama_get_sampled_logits_ith(llama_context * ctx, int32_t i) {
+ ctx->synchronize();
+
+ return ctx->get_sampled_logits_ith(i);
+}
+
+llama_token * llama_get_sampled_candidates_ith(llama_context * ctx, int32_t i) {
+ ctx->synchronize();
+
+ return const_cast<llama_token *>(ctx->get_sampled_candidates_ith(i));
+}
+
+uint32_t llama_get_sampled_candidates_count_ith(llama_context * ctx, int32_t i) {
+ ctx->synchronize();
+
+ return static_cast<uint32_t>(ctx->get_sampled_candidates_count(i));
+}
+
+uint32_t llama_get_sampled_logits_count_ith(llama_context * ctx, int32_t i) {
+ ctx->synchronize();
+
+ return static_cast<uint32_t>(ctx->get_sampled_logits_count(i));
+}
+
+uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
+ ctx->synchronize();
+
+ return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
+}
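+
+// sketch of consuming the sampled outputs above (assumes a sampler was
+// attached with llama_set_sampler and that output i exists):
+//
+//   const llama_token tok = llama_get_sampled_token_ith(ctx, i);
+//   const uint32_t n_cand = llama_get_sampled_candidates_count_ith(ctx, i);
+//   const llama_token * cand = llama_get_sampled_candidates_ith(ctx, i);
+//   const float * probs = llama_get_sampled_probs_ith(ctx, i);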
+
+// llama adapter API
+
+int32_t llama_set_adapter_lora(
+ llama_context * ctx,
+ llama_adapter_lora * adapter,
+ float scale) {
+ ctx->set_adapter_lora(adapter, scale);
+
+ return 0;
+}
+
+int32_t llama_rm_adapter_lora(
+ llama_context * ctx,
+ llama_adapter_lora * adapter) {
+ bool res = ctx->rm_adapter_lora(adapter);
+
+ return res ? 0 : -1;
+}
+
+void llama_clear_adapter_lora(llama_context * ctx) {
+ ctx->clear_adapter_lora();
+}
+
+int32_t llama_apply_adapter_cvec(
+ llama_context * ctx,
+ const float * data,
+ size_t len,
+ int32_t n_embd,
+ int32_t il_start,
+ int32_t il_end) {
+ bool res = ctx->apply_adapter_cvec(data, len, n_embd, il_start, il_end);
+
+ return res ? 0 : -1;
+}
+
+//
+// memory
+//
+
+llama_memory_t llama_get_memory(const struct llama_context * ctx) {
+ return ctx->get_memory();
+}
+
+void llama_memory_clear(llama_memory_t mem, bool data) {
+ if (!mem) {
+ return;
+ }
+
+ mem->clear(data);
+}
+
+bool llama_memory_seq_rm(
+ llama_memory_t mem,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1) {
+ if (!mem) {
+ return true;
+ }
+
+ return mem->seq_rm(seq_id, p0, p1);
+}
+
+void llama_memory_seq_cp(
+ llama_memory_t mem,
+ llama_seq_id seq_id_src,
+ llama_seq_id seq_id_dst,
+ llama_pos p0,
+ llama_pos p1) {
+ if (!mem) {
+ return;
+ }
+
+ mem->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_memory_seq_keep(
+ llama_memory_t mem,
+ llama_seq_id seq_id) {
+ if (!mem) {
+ return;
+ }
+
+ mem->seq_keep(seq_id);
+}
+
+void llama_memory_seq_add(
+ llama_memory_t mem,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1,
+ llama_pos delta) {
+ if (!mem) {
+ return;
+ }
+
+ mem->seq_add(seq_id, p0, p1, delta);
+}
+
+void llama_memory_seq_div(
+ llama_memory_t mem,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1,
+ int d) {
+ if (!mem) {
+ return;
+ }
+
+ mem->seq_div(seq_id, p0, p1, d);
+}
+
+llama_pos llama_memory_seq_pos_min(
+ llama_memory_t mem,
+ llama_seq_id seq_id) {
+ if (!mem) {
+ return -1;
+ }
+
+ return mem->seq_pos_min(seq_id);
+}
+
+llama_pos llama_memory_seq_pos_max(
+ llama_memory_t mem,
+ llama_seq_id seq_id) {
+ if (!mem) {
+ return -1;
+ }
+
+ return mem->seq_pos_max(seq_id);
+}
+
+bool llama_memory_can_shift(llama_memory_t mem) {
+ if (!mem) {
+ return false;
+ }
+
+ return mem->get_can_shift();
+}
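+
+// example: the common "context shift" pattern built from the primitives above
+// (a sketch; n_keep and n_discard are illustrative caller-chosen values):
+//
+//   if (llama_memory_can_shift(mem)) {
+//       llama_memory_seq_rm (mem, 0, n_keep, n_keep + n_discard);
+//       llama_memory_seq_add(mem, 0, n_keep + n_discard, -1, -n_discard);
+//   }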
+
+// llama state API
+
+// deprecated
+size_t llama_get_state_size(llama_context * ctx) {
+ return llama_state_get_size(ctx);
+}
+
+// deprecated
+size_t llama_copy_state_data(llama_context * ctx, uint8_t * dst) {
+ return llama_state_get_data(ctx, dst, -1);
+}
+
+// deprecated
+size_t llama_set_state_data(llama_context * ctx, const uint8_t * src) {
+ return llama_state_set_data(ctx, src, -1);
+}
+
+// deprecated
+bool llama_load_session_file(llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+}
+
+// deprecated
+bool llama_save_session_file(llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+ return llama_state_save_file(ctx, path_session, tokens, n_token_count);
+}
+
+// Returns the *actual* size of the state.
+// Intended to be used when saving the state to a buffer.
+size_t llama_state_get_size(llama_context * ctx) {
+ return ctx->state_get_size();
+}
+
+size_t llama_state_get_data(llama_context * ctx, uint8_t * dst, size_t size) {
+ ctx->synchronize();
+
+ return ctx->state_get_data(dst, size);
+}
+
+// Sets the state reading from the specified source address
+size_t llama_state_set_data(llama_context * ctx, const uint8_t * src, size_t size) {
+ ctx->synchronize();
+
+ return ctx->state_set_data(src, size);
+}
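+
+// sketch of a full-state round trip through a caller-owned buffer:
+//
+//   std::vector<uint8_t> buf(llama_state_get_size(ctx));
+//   llama_state_get_data(ctx, buf.data(), buf.size());
+//   // ... later, on the same model/context configuration ...
+//   llama_state_set_data(ctx, buf.data(), buf.size());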
+
+bool llama_state_load_file(llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ ctx->synchronize();
+
+ try {
+ return ctx->state_load_file(path_session, tokens_out, n_token_capacity, n_token_count_out);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what());
+ return false;
+ }
+}
+
+bool llama_state_save_file(llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+ ctx->synchronize();
+
+ try {
+ return ctx->state_save_file(path_session, tokens, n_token_count);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what());
+ return false;
+ }
+}
+
+size_t llama_state_seq_get_size(llama_context * ctx, llama_seq_id seq_id) {
+ return llama_state_seq_get_size_ext(ctx, seq_id, 0);
+}
+
+size_t llama_state_seq_get_data(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
+ return llama_state_seq_get_data_ext(ctx, dst, size, seq_id, 0);
+}
+
+size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
+ return llama_state_seq_set_data_ext(ctx, src, size, seq_id, 0);
+}
+
+size_t llama_state_seq_get_size_ext(llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
+ return ctx->state_seq_get_size(seq_id, flags);
+}
+
+size_t llama_state_seq_get_data_ext(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
+ ctx->synchronize();
+
+ return ctx->state_seq_get_data(seq_id, dst, size, flags);
+}
+
+size_t llama_state_seq_set_data_ext(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
+ ctx->synchronize();
+
+ return ctx->state_seq_set_data(seq_id, src, size, flags);
+}
+
+size_t llama_state_seq_save_file(llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
+ ctx->synchronize();
+
+ try {
+ return ctx->state_seq_save_file(seq_id, filepath, tokens, n_token_count);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what());
+ return 0;
+ }
+}
+
+size_t llama_state_seq_load_file(llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ ctx->synchronize();
+
+ try {
+ return ctx->state_seq_load_file(dest_seq_id, filepath, tokens_out, n_token_capacity, n_token_count_out);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what());
+ return 0;
+ }
+}
+
+///
+
+int32_t llama_encode(
+ llama_context * ctx,
+ llama_batch batch) {
+ const int ret = ctx->encode(batch);
+ if (ret != 0) {
+ LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
+ }
+
+ return ret;
+}
+
+int32_t llama_decode(
+ llama_context * ctx,
+ llama_batch batch) {
+ const int ret = ctx->decode(batch);
+ if (ret != 0 && ret != 1) {
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
+ }
+
+ return ret;
+}
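+
+// typical decode loop (a sketch; batch construction and sampling are assumed
+// to happen elsewhere):
+//
+//   while (has_more_tokens) {
+//       const int32_t ret = llama_decode(ctx, batch);
+//       if (ret < 0) { /* fatal error */ }
+//       if (ret == 1) { /* soft failure, not logged as an error above */ }
+//       // ... read logits, sample, build the next batch ...
+//   }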
+
+//
+// perf
+//
+
+llama_perf_context_data llama_perf_context(const llama_context * ctx) {
+ llama_perf_context_data data = {};
+
+ if (ctx == nullptr) {
+ return data;
+ }
+
+ data = ctx->perf_get_data();
+
+ return data;
+}
+
+void llama_perf_context_print(const llama_context * ctx) {
+ const auto data = llama_perf_context(ctx);
+
+ const double t_end_ms = 1e-3 * ggml_time_us();
+
+ LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
+ LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+ LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+ LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+ LLAMA_LOG_INFO("%s: graphs reused = %10d\n", __func__, data.n_reused);
+}
+
+void llama_perf_context_reset(llama_context * ctx) {
+ ctx->perf_reset();
+}
+
+void llama_memory_breakdown_print(const struct llama_context * ctx) {
+ const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
+
+ std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+ std::vector<std::array<std::string, 9>> table_data;
+ table_data.reserve(devices.size());
+ const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
+ const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
+ const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
+
+ table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
+
+ constexpr size_t MiB = 1024 * 1024;
+ const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
+
+ // track seen buffer types to avoid double counting:
+ std::set<ggml_backend_buffer_type_t> seen_buffer_types;
+
+ // accumulative memory breakdown for each device and for host:
+ std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
+ llama_memory_breakdown_data mb_host;
+
+ for (const auto & buft_mb : memory_breakdown) {
+ ggml_backend_buffer_type_t buft = buft_mb.first;
+ const llama_memory_breakdown_data & mb = buft_mb.second;
+ if (ggml_backend_buft_is_host(buft)) {
+ mb_host.model += mb.model;
+ mb_host.context += mb.context;
+ mb_host.compute += mb.compute;
+ seen_buffer_types.insert(buft);
+ continue;
+ }
+ ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+ if (dev) {
+ int i_dev = -1;
+ for (size_t i = 0; i < devices.size(); i++) {
+ if (devices[i] == dev) {
+ i_dev = i;
+ break;
+ }
+ }
+ if (i_dev != -1) {
+ mb_dev[i_dev].model += mb.model;
+ mb_dev[i_dev].context += mb.context;
+ mb_dev[i_dev].compute += mb.compute;
+ seen_buffer_types.insert(buft);
+ continue;
+ }
+ }
+ }
+
+ // print memory breakdown for each device:
+ for (size_t i = 0; i < devices.size(); i++) {
+ ggml_backend_dev_t dev = devices[i];
+ llama_memory_breakdown_data mb = mb_dev[i];
+
+ const std::string name = ggml_backend_dev_name(dev);
+ std::string desc = ggml_backend_dev_description(dev);
+ for (const std::string & prefix : desc_prefixes_strip) {
+ if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+ desc = desc.substr(prefix.length());
+ }
+ }
+
+ size_t free, total;
+ ggml_backend_dev_memory(dev, &free, &total);
+
+ const size_t self = mb.model + mb.context + mb.compute;
+ const size_t unaccounted = total - self - free;
+
+ table_data.push_back({
+ template_gpu,
+ " - " + name + " (" + desc + ")",
+ std::to_string(total / MiB),
+ std::to_string(free / MiB),
+ std::to_string(self / MiB),
+ std::to_string(mb.model / MiB),
+ std::to_string(mb.context / MiB),
+ std::to_string(mb.compute / MiB),
+ std::to_string(unaccounted / MiB)});
+ }
+
+ // print memory breakdown for host:
+ {
+ const size_t self = mb_host.model + mb_host.context + mb_host.compute;
+ table_data.push_back({
+ template_other,
+ " - Host",
+ "", // total
+ "", // free
+ std::to_string(self / MiB),
+ std::to_string(mb_host.model / MiB),
+ std::to_string(mb_host.context / MiB),
+ std::to_string(mb_host.compute / MiB),
+ ""}); // unaccounted
+ }
+
+ // print memory breakdown for all remaining buffer types:
+ for (const auto & buft_mb : memory_breakdown) {
+ ggml_backend_buffer_type_t buft = buft_mb.first;
+ const llama_memory_breakdown_data & mb = buft_mb.second;
+ if (seen_buffer_types.count(buft) == 1) {
+ continue;
+ }
+ const std::string name = ggml_backend_buft_name(buft);
+ const size_t self = mb.model + mb.context + mb.compute;
+ table_data.push_back({
+ template_other,
+ " - " + name,
+ "", // total
+ "", // free
+ std::to_string(self / MiB),
+ std::to_string(mb.model / MiB),
+ std::to_string(mb.context / MiB),
+ std::to_string(mb.compute / MiB),
+ ""}); // unaccounted
+ seen_buffer_types.insert(buft);
+ }
+
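+ // pad columns to a common width: the name column (j == 1) is left-aligned, all others right-aligned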
+ for (size_t j = 1; j < table_data[0].size(); j++) {
+ size_t max_len = 0;
+ for (const auto & td : table_data) {
+ max_len = std::max(max_len, td[j].length());
+ }
+ for (auto & td : table_data) {
+ td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
+ }
+ }
+ for (const auto & td : table_data) {
+ LLAMA_LOG_INFO(td[0].c_str(),
+ __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
+ td[6].c_str(), td[7].c_str(), td[8].c_str());
+ }
+}
+
+//
+// training
+//
+
+bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata) {
+ GGML_UNUSED(tensor);
+ GGML_UNUSED(userdata);
+ return true;
+}
+
+void llama_opt_init(struct llama_context * ctx, struct llama_model * model, struct llama_opt_params lopt_params) {
+ ctx->opt_init(model, lopt_params);
+}
+
+void llama_opt_epoch(
+ struct llama_context * ctx,
+ ggml_opt_dataset_t dataset,
+ ggml_opt_result_t result_train,
+ ggml_opt_result_t result_eval,
+ int64_t idata_split,
+ ggml_opt_epoch_callback callback_train,
+ ggml_opt_epoch_callback callback_eval) {
+ ctx->opt_epoch(
+ dataset,
+ result_train,
+ result_eval,
+ idata_split,
+ callback_train,
+ callback_eval);
+}
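+
+// a minimal sketch of driving training with the wrappers above (the dataset is
+// assumed to come from ggml_opt_dataset_init; hyperparameter callbacks elided):
+//
+//   llama_opt_params oparams = {};
+//   oparams.param_filter = llama_opt_param_filter_all;
+//   oparams.get_opt_pars = ...; // optimizer hyperparameter callback
+//   llama_opt_init(ctx, model, oparams);
+//   const int64_t ndata = ggml_opt_dataset_ndata(dataset);
+//   llama_opt_epoch(ctx, dataset, result_train, result_eval,
+//                   /*idata_split =*/ ndata*9/10, nullptr, nullptr);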
diff --git a/llama.cpp/src/llama-context.h b/llama.cpp/src/llama-context.h
new file mode 100644
index 0000000..d995117
--- /dev/null
+++ b/llama.cpp/src/llama-context.h
@@ -0,0 +1,361 @@
+#pragma once
+
+#include "llama.h"
+#include "llama-cparams.h"
+#include "llama-graph.h"
+#include "llama-adapter.h"
+#include "llama-impl.h"
+
+#include "ggml-cpp.h"
+#include "ggml-opt.h"
+
+#include <map>
+#include <vector>
+
+struct llama_model;
+class llama_batch_allocr;
+
+class llama_io_read_i;
+class llama_io_write_i;
+
+// "memory" as in abstract memory for the context
+struct llama_memory_i;
+struct llama_memory_context_i;
+
+// "memory" as in physical memory for a buffer type, in bytes
+struct llama_memory_breakdown_data {
+ size_t model = 0; // memory allocated for the model
+ size_t context = 0; // memory allocated for the context
+ size_t compute = 0; // memory allocated for temporary compute buffers
+
+ size_t total() const {
+ return model + context + compute;
+ }
+};
+
+struct llama_context {
+ // init scheduler and compute buffers, reserve worst-case graphs
+ llama_context(
+ const llama_model & model,
+ llama_context_params params);
+
+ ~llama_context();
+
+ // reserve a new backend scheduler (if needed)
+ // for example, when:
+ // - changing loras
+ // - changing samplers
+ // - changing attention type
+ // - etc.
+ void sched_reserve();
+
+ void synchronize();
+
+ const llama_model & get_model() const;
+ const llama_cparams & get_cparams() const;
+
+ ggml_backend_sched_t get_sched() const;
+
+ uint32_t n_ctx() const;
+ uint32_t n_ctx_seq() const;
+ uint32_t n_batch() const;
+ uint32_t n_ubatch() const;
+ uint32_t n_seq_max() const;
+
+ uint32_t n_threads() const;
+ uint32_t n_threads_batch() const;
+
+ llama_memory_t get_memory() const;
+
+ // return true if the memory was updated
+ bool memory_update(bool optimize);
+
+ enum llama_pooling_type pooling_type() const;
+
+ float * get_logits();
+ float * get_logits_ith(int32_t i);
+
+ float * get_embeddings();
+ float * get_embeddings_ith(int32_t i);
+ float * get_embeddings_seq(llama_seq_id seq_id);
+
+ llama_token * get_sampled_tokens() const;
+ llama_token get_sampled_token_ith(int32_t idx);
+
+ float * get_sampled_logits_ith(int32_t idx);
+ size_t get_sampled_logits_count(int32_t idx);
+
+ float * get_sampled_probs_ith(int32_t idx);
+ size_t get_sampled_probs_count(int32_t idx);
+
+ const llama_token * get_sampled_candidates_ith(int32_t idx);
+ size_t get_sampled_candidates_count(int32_t idx);
+
+ void attach_threadpool(
+ ggml_threadpool_t threadpool,
+ ggml_threadpool_t threadpool_batch);
+
+ void detach_threadpool();
+
+ void set_n_threads(int32_t n_threads, int32_t n_threads_batch);
+
+ void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data);
+
+ void set_embeddings (bool value);
+ void set_causal_attn(bool value);
+ void set_warmup(bool value);
+
+ void set_adapter_lora(
+ llama_adapter_lora * adapter,
+ float scale);
+
+ bool rm_adapter_lora(
+ llama_adapter_lora * adapter);
+
+ void clear_adapter_lora();
+
+ bool apply_adapter_cvec(
+ const float * data,
+ size_t len,
+ int32_t n_embd,
+ int32_t il_start,
+ int32_t il_end);
+
+ // process a single ubatch with a specific graph type
+ // if memory_context is provided, it will be applied first to the context's memory
+ // ret contains the status of the graph computation
+ // returns nullptr only if ret != GGML_STATUS_SUCCESS
+ llm_graph_result * process_ubatch(
+ const llama_ubatch & ubatch,
+ llm_graph_type gtype,
+ llama_memory_context_i * mctx,
+ ggml_status & ret);
+
+ int encode(const llama_batch & batch_inp);
+ int decode(const llama_batch & batch_inp);
+
+ //
+ // state save/load
+ //
+
+ size_t state_get_size();
+ size_t state_get_data( uint8_t * dst, size_t size);
+ size_t state_set_data(const uint8_t * src, size_t size);
+
+ size_t state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags);
+ size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags);
+ size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags);
+
+ bool state_load_file(
+ const char * filepath,
+ llama_token * tokens_out,
+ size_t n_token_capacity,
+ size_t * n_token_count_out);
+
+ bool state_save_file(
+ const char * filepath,
+ const llama_token * tokens,
+ size_t n_token_count);
+
+ size_t state_seq_load_file(
+ llama_seq_id seq_id,
+ const char * filepath,
+ llama_token * tokens_out,
+ size_t n_token_capacity,
+ size_t * n_token_count_out);
+
+ size_t state_seq_save_file(
+ llama_seq_id seq_id,
+ const char * filepath,
+ const llama_token * tokens,
+ size_t n_token_count);
+
+ //
+ // perf
+ //
+
+ llama_perf_context_data perf_get_data() const;
+ void perf_reset();
+
+ std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
+
+ //
+ // training
+ //
+
+ void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
+
+ // TODO: more flexible combinations of logical/physical batch size and context size
+ void opt_epoch(
+ ggml_opt_dataset_t dataset,
+ ggml_opt_result_t result_train,
+ ggml_opt_result_t result_eval,
+ int64_t idata_split,
+ ggml_opt_epoch_callback callback_train,
+ ggml_opt_epoch_callback callback_eval);
+
+ void opt_epoch_iter(
+ ggml_opt_dataset_t dataset,
+ ggml_opt_result_t result,
+ const std::vector<llama_token> & tokens,
+ const std::vector<llama_token> & labels_sparse,
+ llama_batch & batch,
+ ggml_opt_epoch_callback callback,
+ bool train,
+ int64_t idata_in_loop,
+ int64_t ndata_in_loop,
+ int64_t t_loop_start);
+
+private:
+ //
+ // output
+ //
+
+ // Make sure enough space is available for outputs.
+ // Returns max number of outputs for which space was reserved.
+ uint32_t output_reserve(int32_t n_outputs);
+
+ void output_reorder();
+
+ // map the output row index `i` to batch index
+ int64_t output_resolve_row(int32_t i) const;
+
+ //
+ // graph
+ //
+
+public:
+ uint32_t graph_max_nodes(uint32_t n_tokens) const;
+
+ // can reuse the llm_graph_result instance of the context (for example to update a memory module)
+ llm_graph_result * get_gf_res_reserve() const;
+
+ // returns the result of ggml_backend_sched_graph_compute_async execution
+ ggml_status graph_compute(ggml_cgraph * gf, bool batched);
+
+ // reserve a graph with a dummy ubatch of the specified size
+ ggml_cgraph * graph_reserve(
+ uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
+
+ bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
+
+private:
+ llm_graph_params graph_params(
+ llm_graph_result * res,
+ const llama_ubatch & ubatch,
+ const llama_memory_context_i * mctx,
+ llm_graph_type gtype) const;
+
+ llm_graph_cb graph_get_cb() const;
+
+ // TODO: read/write lora adapters and cvec
+ size_t state_write_data(llama_io_write_i & io);
+ size_t state_read_data (llama_io_read_i & io);
+
+ size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
+ size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
+
+ //
+ // members
+ //
+
+ const llama_model & model;
+
+ llama_cparams cparams;
+ llama_adapter_cvec cvec;
+ llama_adapter_loras loras;
+
+ llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably
+
+ std::unique_ptr<llama_memory_i> memory;
+
+ // decode output (2-dimensional array: [n_outputs][n_vocab])
+ struct buffer_view<float> logits = {nullptr, 0};
+
+ // embeddings output (2-dimensional array: [n_outputs][n_embd])
+ // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
+ struct buffer_view<float> embd = {nullptr, 0};
+
+ struct sampling_info {
+ std::map<llama_seq_id, llama_sampler *> samplers;
+
+ struct buffer_view<float> logits = {nullptr, 0};
+ struct buffer_view<llama_token> sampled = {nullptr, 0};
+ struct buffer_view<float> probs = {nullptr, 0};
+ struct buffer_view<llama_token> candidates = {nullptr, 0};
+
+ std::vector<uint32_t> logits_count;
+ std::vector<uint32_t> probs_count;
+ std::vector<uint32_t> candidates_count;
+
+ std::vector<llama_token> token_ids_full_vocab;
+ };
+
+ sampling_info sampling;
+
+ // sequence embeddings output (map of [n_embd] vectors)
+ // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
+ std::map<llama_seq_id, std::vector<float>> embd_seq;
+
+ // reuse the batch_allocr to avoid unnecessary memory allocations
+ std::unique_ptr<llama_batch_allocr> balloc;
+
+ uint32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
+
+ std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
+
+ struct swap_info {
+ uint32_t i0;
+ uint32_t i1;
+ };
+
+ std::vector<swap_info> output_swaps;
+
+ ggml_backend_sched_ptr sched;
+
+ bool sched_need_reserve = true;
+
+ ggml_backend_t backend_cpu = nullptr;
+ std::vector<ggml_backend_ptr> backends;
+
+ // training
+ ggml_opt_context_t opt_ctx = nullptr;
+
+ ggml_threadpool_t threadpool = nullptr;
+ ggml_threadpool_t threadpool_batch = nullptr;
+
+ ggml_abort_callback abort_callback = nullptr;
+ void * abort_callback_data = nullptr;
+
+ std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
+
+ // pointers and buffer types used for the compute buffer of each backend
+ std::vector<ggml_backend_t> backend_ptrs;
+ std::vector<ggml_backend_buffer_type_t> backend_buft;
+ std::vector<size_t> backend_buf_exp_size; // expected buffer sizes
+
+ llm_graph_result_ptr gf_res_prev;
+ llm_graph_result_ptr gf_res_reserve;
+
+ // host buffer for the model output (logits and embeddings)
+ ggml_backend_buffer_ptr buf_output;
+
+ bool has_evaluated_once = false;
+
+ // env: LLAMA_GRAPH_REUSE_DISABLE
+ bool graph_reuse_disable = false;
+
+ // perf
+ mutable int64_t t_start_us = 0;
+ mutable int64_t t_load_us = 0;
+ mutable int64_t t_p_eval_us = 0;
+ mutable int64_t t_eval_us = 0;
+
+ mutable int64_t t_compute_start_us = 0;
+ mutable int64_t n_queued_tokens = 0;
+
+ mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
+ mutable int32_t n_eval = 0; // number of eval calls
+
+ mutable int32_t n_reused = 0; // number of times the previous graph was reused
+};
diff --git a/llama.cpp/src/llama-cparams.cpp b/llama.cpp/src/llama-cparams.cpp
new file mode 100644
index 0000000..a3e7a37
--- /dev/null
+++ b/llama.cpp/src/llama-cparams.cpp
@@ -0,0 +1,5 @@
+#include "llama-cparams.h"
+
+size_t llama_max_parallel_sequences(void) {
+ return LLAMA_MAX_SEQ;
+}
diff --git a/llama.cpp/src/llama-cparams.h b/llama.cpp/src/llama-cparams.h
new file mode 100644
index 0000000..2da3bbd
--- /dev/null
+++ b/llama.cpp/src/llama-cparams.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "llama.h"
+
+#include <cstdint>
+
+#define LLAMA_MAX_SEQ 256
+
+struct llama_cparams {
+ uint32_t n_ctx; // context size used during inference
+ uint32_t n_ctx_seq; // context for a single sequence
+ uint32_t n_batch;
+ uint32_t n_ubatch;
+ uint32_t n_seq_max;
+ int32_t n_threads; // number of threads to use for generation
+ int32_t n_threads_batch; // number of threads to use for batch processing
+
+ float rope_freq_base;
+ float rope_freq_scale;
+
+ uint32_t n_ctx_orig_yarn;
+ // These hyperparameters are not exposed in GGUF, because all
+ // existing YaRN models use the same values for them.
+ float yarn_ext_factor;
+ float yarn_attn_factor;
+ float yarn_beta_fast;
+ float yarn_beta_slow;
+
+ bool embeddings;
+ bool causal_attn;
+ bool offload_kqv;
+ bool flash_attn;
+ bool auto_fa;
+ bool no_perf;
+ bool warmup;
+ bool op_offload;
+ bool kv_unified;
+ bool pipeline_parallel;
+
+ enum llama_pooling_type pooling_type;
+
+ ggml_backend_sched_eval_callback cb_eval;
+ void * cb_eval_user_data;
+};
diff --git a/llama.cpp/src/llama-grammar.cpp b/llama.cpp/src/llama-grammar.cpp
new file mode 100644
index 0000000..2d55070
--- /dev/null
+++ b/llama.cpp/src/llama-grammar.cpp
@@ -0,0 +1,1464 @@
+#include "llama-grammar.h"
+
+#include "llama-impl.h"
+#include "llama-vocab.h"
+#include "llama-sampler.h"
+
+#include <cmath>
+#include <algorithm>
+#include <cstdint>
+#include <stdexcept>
+
+#define MAX_REPETITION_THRESHOLD 2000
+
+//
+// helpers
+//
+
+// NOTE: assumes valid utf8 (but checks for overrun)
+static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
+ static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+ uint8_t first_byte = static_cast<uint8_t>(*src);
+ uint8_t highbits = first_byte >> 4;
+ int len = lookup[highbits];
+ uint8_t mask = (1 << (8 - len)) - 1;
+ uint32_t value = first_byte & mask;
+ const char * end = src + len; // may overrun!
+ const char * pos = src + 1;
+ for ( ; pos < end && *pos; pos++) {
+ value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+ }
+ return std::make_pair(value, pos);
+}
+
+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+ const std::string & src,
+ llama_partial_utf8 partial_start) {
+ static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
+ const char * pos = src.c_str();
+ std::vector<uint32_t> code_points;
+
+ // common English strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+ code_points.reserve(src.size() + 1);
+ uint32_t value = partial_start.value;
+ int n_remain = partial_start.n_remain;
+
+ // continue previous decode, if applicable
+ while (*pos != 0 && n_remain > 0) {
+ uint8_t next_byte = static_cast<uint8_t>(*pos);
+ if ((next_byte >> 6) != 2) {
+ // invalid sequence, abort
+ code_points.push_back(0);
+ return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
+ }
+ value = (value << 6) + (next_byte & 0x3F);
+ ++pos;
+ --n_remain;
+ }
+
+ if (partial_start.n_remain > 0 && n_remain == 0) {
+ code_points.push_back(value);
+ }
+
+ // decode any subsequent utf-8 sequences, which may end in an incomplete one
+ while (*pos != 0) {
+ uint8_t first_byte = static_cast<uint8_t>(*pos);
+ uint8_t highbits = first_byte >> 4;
+ n_remain = lookup[highbits] - 1;
+
+ if (n_remain < 0) {
+ // invalid sequence, abort
+ code_points.clear();
+ code_points.push_back(0);
+ return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
+ }
+
+ uint8_t mask = (1 << (7 - n_remain)) - 1;
+ value = first_byte & mask;
+
+ ++pos;
+ while (*pos != 0 && n_remain > 0) {
+ value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+ ++pos;
+ --n_remain;
+ }
+ if (n_remain == 0) {
+ code_points.push_back(value);
+ }
+ }
+ code_points.push_back(0);
+
+ return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
+}
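+
+// worked example: decoding "é" (bytes 0xC3 0xA9) with no pending partial state:
+// highbits 0xC -> 2-byte sequence, value = 0xC3 & 0x3F = 0x03; the continuation
+// byte adds 0xA9 & 0x3F = 0x29, giving (0x03 << 6) | 0x29 = 0xE9, i.e. U+00E9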
+
+static bool is_digit_char(char c) {
+ return '0' <= c && c <= '9';
+}
+
+static bool is_word_char(char c) {
+ return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
+}
+
+static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
+ const char * pos = src;
+ const char * end = src + size;
+ uint32_t value = 0;
+ for ( ; pos < end && *pos; pos++) {
+ value <<= 4;
+ char c = *pos;
+ if ('a' <= c && c <= 'f') {
+ value += c - 'a' + 10;
+ } else if ('A' <= c && c <= 'F') {
+ value += c - 'A' + 10;
+ } else if ('0' <= c && c <= '9') {
+ value += c - '0';
+ } else {
+ break;
+ }
+ }
+ if (pos != end) {
+ throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
+ }
+ return std::make_pair(value, pos);
+}
+
+static const char * parse_space(const char * src, bool newline_ok) {
+ const char * pos = src;
+ while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
+ (newline_ok && (*pos == '\r' || *pos == '\n'))) {
+ if (*pos == '#') {
+ while (*pos && *pos != '\r' && *pos != '\n') {
+ pos++;
+ }
+ } else {
+ pos++;
+ }
+ }
+ return pos;
+}
+
+static const char * parse_name(const char * src) {
+ const char * pos = src;
+ while (is_word_char(*pos)) {
+ pos++;
+ }
+ if (pos == src) {
+ throw std::runtime_error(std::string("expecting name at ") + src);
+ }
+ return pos;
+}
+
+static const char * parse_int(const char * src) {
+ const char * pos = src;
+ while (is_digit_char(*pos)) {
+ pos++;
+ }
+ if (pos == src) {
+ throw std::runtime_error(std::string("expecting integer at ") + src);
+ }
+ return pos;
+}
+
+static std::pair<uint32_t, const char *> parse_char(const char * src) {
+ if (*src == '\\') {
+ switch (src[1]) {
+ case 'x': return parse_hex(src + 2, 2);
+ case 'u': return parse_hex(src + 2, 4);
+ case 'U': return parse_hex(src + 2, 8);
+ case 't': return std::make_pair('\t', src + 2);
+ case 'r': return std::make_pair('\r', src + 2);
+ case 'n': return std::make_pair('\n', src + 2);
+ case '\\':
+ case '"':
+ case '[':
+ case ']':
+ return std::make_pair(src[1], src + 2);
+ default:
+ throw std::runtime_error(std::string("unknown escape at ") + src);
+ }
+ } else if (*src) {
+ return decode_utf8(src);
+ }
+ throw std::runtime_error("unexpected end of input");
+}
+
+static std::pair<uint32_t, const char *> parse_token(const llama_vocab * vocab, const char * src) {
+ const char * pos = src;
+ if (*pos != '<') {
+ throw std::runtime_error(std::string("expecting '<' at ") + pos);
+ }
+ pos++;
+
+ // Parse <[id]>
+ if (*pos == '[') {
+ pos++;
+ const char * int_end = parse_int(pos);
+ uint32_t token_id = std::stoul(std::string(pos, int_end - pos));
+ pos = int_end;
+ if (*pos != ']') {
+ throw std::runtime_error(std::string("expecting ']' at ") + pos);
+ }
+ pos++;
+ if (*pos != '>') {
+ throw std::runtime_error(std::string("expecting '>' at ") + pos);
+ }
+ pos++;
+ return std::make_pair(token_id, pos);
+ }
+
+ if (vocab == nullptr) {
+ throw std::runtime_error(std::string("no vocab to parse token at ") + src);
+ }
+
+ // Parse <token> and tokenize to obtain the token id
+ while (*pos != 0 && *pos != '>') {
+ pos++;
+ }
+ if (*pos != '>') {
+ throw std::runtime_error(std::string("expecting '>' at ") + pos);
+ }
+ pos++;
+
+ llama_token tokens[2];
+ int32_t n_tokens = vocab->tokenize(src, static_cast<int32_t>(pos - src), tokens, 2, false, true);
+ if (n_tokens != 1) {
+ // must tokenize to exactly 1 token
+ throw std::runtime_error("invalid token '" + std::string(src, pos - src) + "'");
+ }
+ return std::make_pair(tokens[0], pos);
+}
+
+static void print_grammar_char(FILE * file, uint32_t c) {
+ if (0x20 <= c && c <= 0x7f) {
+ fprintf(file, "%c", static_cast<char>(c));
+ } else {
+ // cop out of encoding UTF-8
+ fprintf(file, "<U+%04X>", c);
+ }
+}
+
+static bool is_char_element(llama_grammar_element elem) {
+ switch (elem.type) {
+ case LLAMA_GRETYPE_CHAR: return true;
+ case LLAMA_GRETYPE_CHAR_NOT: return true;
+ case LLAMA_GRETYPE_CHAR_ALT: return true;
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+ case LLAMA_GRETYPE_CHAR_ANY: return true;
+ default: return false;
+ }
+}
+
+static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
+ for (auto elem : rule) {
+ switch (elem.type) {
+ case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
+ case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
+ case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
+ case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
+ case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
+ case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
+ case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
+ case LLAMA_GRETYPE_TOKEN: fprintf(file, "TOKEN"); break;
+ case LLAMA_GRETYPE_TOKEN_NOT: fprintf(file, "TOKEN_NOT"); break;
+ }
+ switch (elem.type) {
+ case LLAMA_GRETYPE_END:
+ case LLAMA_GRETYPE_ALT:
+ case LLAMA_GRETYPE_RULE_REF:
+ fprintf(file, "(%u) ", elem.value);
+ break;
+ case LLAMA_GRETYPE_CHAR:
+ case LLAMA_GRETYPE_CHAR_NOT:
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+ case LLAMA_GRETYPE_CHAR_ALT:
+ case LLAMA_GRETYPE_CHAR_ANY:
+ fprintf(file, "(\"");
+ print_grammar_char(file, elem.value);
+ fprintf(file, "\") ");
+ break;
+ case LLAMA_GRETYPE_TOKEN:
+ fprintf(file, "<[");
+ fprintf(file, "%u", elem.value);
+ fprintf(file, "]> ");
+ break;
+ case LLAMA_GRETYPE_TOKEN_NOT:
+ fprintf(file, "!");
+ fprintf(file, "<[");
+ fprintf(file, "%u", elem.value);
+ fprintf(file, "]> ");
+ break;
+ }
+ }
+ fprintf(file, "\n");
+}
+
+static void print_rule(
+ FILE * file,
+ uint32_t rule_id,
+ const llama_grammar_rule & rule,
+ const std::map<uint32_t, std::string> & symbol_id_names) {
+ if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
+ throw std::runtime_error(
+ "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
+ }
+ fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
+ for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
+ llama_grammar_element elem = rule[i];
+ switch (elem.type) {
+ case LLAMA_GRETYPE_END:
+ throw std::runtime_error(
+ "unexpected end of rule: " + std::to_string(rule_id) + "," +
+ std::to_string(i));
+ case LLAMA_GRETYPE_ALT:
+ fprintf(file, "| ");
+ break;
+ case LLAMA_GRETYPE_RULE_REF:
+ fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
+ break;
+ case LLAMA_GRETYPE_CHAR:
+ fprintf(file, "[");
+ print_grammar_char(file, elem.value);
+ break;
+ case LLAMA_GRETYPE_CHAR_NOT:
+ fprintf(file, "[^");
+ print_grammar_char(file, elem.value);
+ break;
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+ if (i == 0 || !is_char_element(rule[i - 1])) {
+ throw std::runtime_error(
+ "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
+ std::to_string(rule_id) + "," + std::to_string(i));
+ }
+ fprintf(file, "-");
+ print_grammar_char(file, elem.value);
+ break;
+ case LLAMA_GRETYPE_CHAR_ALT:
+ if (i == 0 || !is_char_element(rule[i - 1])) {
+ throw std::runtime_error(
+ "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
+ std::to_string(rule_id) + "," + std::to_string(i));
+ }
+ print_grammar_char(file, elem.value);
+ break;
+ case LLAMA_GRETYPE_CHAR_ANY:
+ fprintf(file, ".");
+ break;
+ case LLAMA_GRETYPE_TOKEN:
+ fprintf(file, "<[");
+ fprintf(file, "%u", elem.value);
+ fprintf(file, "]> ");
+ break;
+ case LLAMA_GRETYPE_TOKEN_NOT:
+ fprintf(file, "!");
+ fprintf(file, "<[");
+ fprintf(file, "%u", elem.value);
+ fprintf(file, "]> ");
+ break;
+ }
+ if (is_char_element(elem)) {
+ switch (rule[i + 1].type) {
+ case LLAMA_GRETYPE_CHAR_ALT:
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+ case LLAMA_GRETYPE_CHAR_ANY:
+ break;
+ default:
+ fprintf(file, "] ");
+ }
+ }
+ }
+ fprintf(file, "\n");
+}
+
+//
+// Regex utilities
+//
+
+size_t llama_grammar_trigger_pattern::find(const std::string & input) const {
+ auto find_start_pos = [](const std::smatch & match) {
+ // get from the first matched capturing group to the end of the string
+ size_t start = std::string::npos;
+ for (auto i = 1u; i < match.size(); i++) {
+ if (match.length(i) > 0) {
+ start = match.position(i);
+ break;
+ }
+ }
+ if (start == std::string::npos) {
+ start = match.position(0);
+ }
+ return start;
+ };
+
+ if (!pattern.empty() && pattern.front() == '^' && pattern.back() == '$') {
+ // match against the entire input
+ std::smatch match;
+ if (std::regex_match(input, match, regex)) {
+ return find_start_pos(match);
+ }
+ }
+
+ // search anywhere
+ std::smatch match;
+ if (std::regex_search(input, match, regex)) {
+ return find_start_pos(match);
+ }
+
+ return std::string::npos;
+}
+
+
+//
+// implementation
+//
+
+uint32_t llama_grammar_parser::get_symbol_id(const char * src, size_t len) {
+ uint32_t next_id = static_cast<uint32_t>(symbol_ids.size());
+ auto result = symbol_ids.emplace(std::string(src, len), next_id);
+ return result.first->second;
+}
+
+uint32_t llama_grammar_parser::generate_symbol_id(const std::string & base_name) {
+ uint32_t next_id = static_cast<uint32_t>(symbol_ids.size());
+ symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
+ return next_id;
+}
+
+void llama_grammar_parser::add_rule(uint32_t rule_id, const llama_grammar_rule & rule) {
+ if (rules.size() <= rule_id) {
+ rules.resize(rule_id + 1);
+ }
+ rules[rule_id] = rule;
+}
+
+const char * llama_grammar_parser::parse_alternates(
+ const char * src,
+ const std::string & rule_name,
+ uint32_t rule_id,
+ bool is_nested) {
+ llama_grammar_rule rule;
+ const char * pos = parse_sequence(src, rule_name, rule, is_nested);
+ while (*pos == '|') {
+ rule.push_back({LLAMA_GRETYPE_ALT, 0});
+ pos = parse_space(pos + 1, true);
+ pos = parse_sequence(pos, rule_name, rule, is_nested);
+ }
+ rule.push_back({LLAMA_GRETYPE_END, 0});
+ add_rule(rule_id, rule);
+ return pos;
+}
+
+const char * llama_grammar_parser::parse_sequence(
+ const char * src,
+ const std::string & rule_name,
+ llama_grammar_rule & rule,
+ bool is_nested) {
+ size_t last_sym_start = rule.size();
+ const char * pos = src;
+
+ // use UINT64_MAX as the sentinel for "no maximum": the parameters are uint64_t,
+ // so -1 cannot be expressed directly (though it converts to the same bit pattern)
+ auto handle_repetitions = [&](uint64_t min_times, uint64_t max_times) {
+ bool no_max = max_times == UINT64_MAX;
+ if (last_sym_start == rule.size()) {
+ throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
+ }
+
+ // apply transformation to previous symbol (last_sym_start to end) according to
+ // the following rewrite rules:
+ // S{m,n} --> S S S (m times) S'(n-m)
+ // S'(x) ::= S S'(x-1) |
+ // (... n-m definitions of these S' rules ...)
+ // S'(1) ::= S |
+ // S{m,} --> S S S (m times) S'
+ // S' ::= S S' |
+ // S* --> S{0,}
+ // --> S' ::= S S' |
+ // S+ --> S{1,}
+ // --> S S'
+ // S' ::= S S' |
+ // S? --> S{0,1}
+ // --> S'
+ // S' ::= S |
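+ // e.g. S{2,4} --> S S S'(2)
+ // S'(2) ::= S S'(1) |
+ // S'(1) ::= S |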
+
+ llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
+ if (min_times == 0) {
+ rule.resize(last_sym_start);
+ } else {
+ // Repeat the previous elements (min_times - 1) times
+ for (uint64_t i = 1; i < min_times; i++) {
+ rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
+ }
+ }
+
+ uint32_t last_rec_rule_id = 0;
+ auto n_opt = no_max ? 1 : max_times - min_times;
+
+ llama_grammar_rule rec_rule(prev_rule);
+ for (uint64_t i = 0; i < n_opt; i++) {
+ rec_rule.resize(prev_rule.size());
+ uint32_t rec_rule_id = generate_symbol_id(rule_name);
+ if (i > 0 || no_max) {
+ rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, no_max ? rec_rule_id : last_rec_rule_id});
+ }
+ rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
+ rec_rule.push_back({LLAMA_GRETYPE_END, 0});
+ add_rule(rec_rule_id, rec_rule);
+ last_rec_rule_id = rec_rule_id;
+ }
+ if (n_opt > 0) {
+ rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
+ }
+ };
+
+ while (*pos) {
+ if (*pos == '"') { // literal string
+ pos++;
+ last_sym_start = rule.size();
+ while (*pos != '"') {
+ if (!*pos) {
+ throw std::runtime_error("unexpected end of input");
+ }
+ auto char_pair = parse_char(pos);
+ pos = char_pair.second;
+ rule.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
+ }
+ pos = parse_space(pos + 1, is_nested);
+ } else if (*pos == '[') { // char range(s)
+ pos++;
+ enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
+ if (*pos == '^') {
+ pos++;
+ start_type = LLAMA_GRETYPE_CHAR_NOT;
+ }
+ last_sym_start = rule.size();
+ while (*pos != ']') {
+ if (!*pos) {
+ throw std::runtime_error("unexpected end of input");
+ }
+ auto char_pair = parse_char(pos);
+ pos = char_pair.second;
+ enum llama_gretype type = last_sym_start < rule.size()
+ ? LLAMA_GRETYPE_CHAR_ALT
+ : start_type;
+
+ rule.push_back({type, char_pair.first});
+ if (pos[0] == '-' && pos[1] != ']') {
+ if (!pos[1]) {
+ throw std::runtime_error("unexpected end of input");
+ }
+ auto endchar_pair = parse_char(pos + 1);
+ pos = endchar_pair.second;
+ rule.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
+ }
+ }
+ pos = parse_space(pos + 1, is_nested);
+ } else if (*pos == '<' || *pos == '!') { // token
+ auto type = LLAMA_GRETYPE_TOKEN;
+ if (*pos == '!') { // token inverse
+ type = LLAMA_GRETYPE_TOKEN_NOT;
+ pos++;
+ }
+ auto token_pair = parse_token(vocab, pos);
+ const char * token_end = token_pair.second;
+ last_sym_start = rule.size();
+ rule.push_back({type, token_pair.first});
+ pos = parse_space(token_end, is_nested);
+ } else if (is_word_char(*pos)) { // rule reference
+ const char * name_end = parse_name(pos);
+ uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
+ pos = parse_space(name_end, is_nested);
+ last_sym_start = rule.size();
+ rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
+ } else if (*pos == '(') { // grouping
+ // parse nested alternates into synthesized rule
+ pos = parse_space(pos + 1, true);
+ uint32_t sub_rule_id = generate_symbol_id(rule_name);
+ pos = parse_alternates(pos, rule_name, sub_rule_id, true);
+ last_sym_start = rule.size();
+ // output reference to synthesized rule
+ rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+ if (*pos != ')') {
+ throw std::runtime_error(std::string("expecting ')' at ") + pos);
+ }
+ pos = parse_space(pos + 1, is_nested);
+ } else if (*pos == '.') { // any char
+ last_sym_start = rule.size();
+ rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
+ pos = parse_space(pos + 1, is_nested);
+ } else if (*pos == '*') {
+ pos = parse_space(pos + 1, is_nested);
+ handle_repetitions(0, UINT64_MAX);
+ } else if (*pos == '+') {
+ pos = parse_space(pos + 1, is_nested);
+ handle_repetitions(1, UINT64_MAX);
+ } else if (*pos == '?') {
+ pos = parse_space(pos + 1, is_nested);
+ handle_repetitions(0, 1);
+ } else if (*pos == '{') {
+ pos = parse_space(pos + 1, is_nested);
+
+ if (!is_digit_char(*pos)) {
+ throw std::runtime_error(std::string("expecting an int at ") + pos);
+ }
+ const char * int_end = parse_int(pos);
+ uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
+ pos = parse_space(int_end, is_nested);
+
+ uint64_t max_times = UINT64_MAX; // default: no max limit
+
+ if (*pos == '}') {
+ max_times = min_times;
+ pos = parse_space(pos + 1, is_nested);
+ } else if (*pos == ',') {
+ pos = parse_space(pos + 1, is_nested);
+
+ if (is_digit_char(*pos)) {
+ const char * int_end = parse_int(pos);
+ max_times = std::stoul(std::string(pos, int_end - pos));
+ pos = parse_space(int_end, is_nested);
+ }
+
+ if (*pos != '}') {
+ throw std::runtime_error(std::string("expecting '}' at ") + pos);
+ }
+ pos = parse_space(pos + 1, is_nested);
+ } else {
+ throw std::runtime_error(std::string("expecting ',' at ") + pos);
+ }
+ bool has_max = max_times != UINT64_MAX;
+ if (min_times > MAX_REPETITION_THRESHOLD || (has_max && max_times > MAX_REPETITION_THRESHOLD)) {
+ throw std::runtime_error(std::string("number of repetitions exceeds sane defaults, please reduce the number of repetitions"));
+ }
+ handle_repetitions(min_times, max_times);
+ } else {
+ break;
+ }
+ }
+ return pos;
+}
+
+const char * llama_grammar_parser::parse_rule(const char * src) {
+ const char * name_end = parse_name(src);
+ const char * pos = parse_space(name_end, false);
+ size_t name_len = name_end - src;
+ uint32_t rule_id = get_symbol_id(src, name_len);
+ const std::string name(src, name_len);
+
+ if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
+ throw std::runtime_error(std::string("expecting ::= at ") + pos);
+ }
+ pos = parse_space(pos + 3, true);
+
+ pos = parse_alternates(pos, name, rule_id, false);
+
+ if (*pos == '\r') {
+ pos += pos[1] == '\n' ? 2 : 1;
+ } else if (*pos == '\n') {
+ pos++;
+ } else if (*pos) {
+ throw std::runtime_error(std::string("expecting newline or end at ") + pos);
+ }
+ return parse_space(pos, true);
+}
+
+bool llama_grammar_parser::parse(const char * src) {
+ try {
+ const char * pos = parse_space(src, true);
+ while (*pos) {
+ pos = parse_rule(pos);
+ }
+ // Validate the state to ensure that all rules are defined
+ for (const auto & rule : rules) {
+ if (rule.empty()) {
+ throw std::runtime_error("Undefined rule");
+ }
+ for (const auto & elem : rule) {
+ if (elem.type == LLAMA_GRETYPE_RULE_REF) {
+ // Ensure that the rule at that location exists
+ if (elem.value >= rules.size() || rules[elem.value].empty()) {
+ // Get the name of the rule that is missing
+ for (const auto & kv : symbol_ids) {
+ if (kv.second == elem.value) {
+ throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
+ }
+ }
+ }
+ }
+ }
+ }
+ } catch (const std::exception & err) {
+ fprintf(stderr, "%s: error parsing grammar: %s\n\n%s\n", __func__, err.what(), src);
+ rules.clear();
+ return false;
+ }
+
+ return true;
+}
+
+void llama_grammar_parser::print(FILE * file) {
+ try {
+ std::map<uint32_t, std::string> symbol_id_names;
+ for (const auto & kv : symbol_ids) {
+ symbol_id_names[kv.second] = kv.first;
+ }
+ for (size_t i = 0, end = rules.size(); i < end; i++) {
+ // fprintf(file, "%zu: ", i);
+ // print_rule_binary(file, rules[i]);
+ print_rule(file, uint32_t(i), rules[i], symbol_id_names);
+ // fprintf(file, "\n");
+ }
+ } catch (const std::exception & err) {
+ fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
+ }
+}
+
+llama_grammar_stack llama_grammar_parser::c_rules() const {
+ llama_grammar_stack ret;
+ ret.reserve(rules.size());
+ for (const auto & rule : rules) {
+ ret.push_back(rule.data());
+ }
+ return ret;
+}
+
+// returns true iff pos points to the end of one of the definitions of a rule
+static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
+ switch (pos->type) {
+ case LLAMA_GRETYPE_END: return true; // NOLINT
+ case LLAMA_GRETYPE_ALT: return true; // NOLINT
+ default: return false;
+ }
+}
+
+// returns true iff chr satisfies the char range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
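+// e.g. (illustrative) the class [a-zA] is stored as {CHAR 'a'},
+// {CHAR_RNG_UPPER 'z'}, {CHAR_ALT 'A'}, so matching 'q' first tests the
+// range 'a'..'z' and then the alternate 'A'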
+static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
+ const llama_grammar_element * pos,
+ const uint32_t chr) {
+ bool found = false;
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
+
+ GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
+
+ do {
+ if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+ // inclusive range, e.g. [a-z]
+ found = found || (pos->value <= chr && chr <= pos[1].value);
+ pos += 2;
+ } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+ // Any character matches "."
+ found = true;
+ pos += 1;
+ } else {
+ // exact char match, e.g. [a] or "a"
+ found = found || pos->value == chr;
+ pos += 1;
+ }
+ } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+ return std::make_pair(found == is_positive_char, pos);
+}
+
+// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
+// range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static bool llama_grammar_match_partial_char(
+ const llama_grammar_element * pos,
+ const llama_partial_utf8 partial_utf8) {
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
+ GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+ uint32_t partial_value = partial_utf8.value;
+ int n_remain = partial_utf8.n_remain;
+
+ // invalid sequence or 7-bit char split across 2 bytes (overlong)
+ if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
+ return false;
+ }
+
+ // range of possible code points this partial UTF-8 sequence could complete to
+ uint32_t low = partial_value << (n_remain * 6);
+ uint32_t high = low | ((1 << (n_remain * 6)) - 1);
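+    // e.g. (illustrative) after the single byte 0xE2 (start of a 3-byte
+    // sequence): value = 0x2, n_remain = 2, so [low, high] = [0x2000, 0x2FFF]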
+
+ if (low == 0) {
+ if (n_remain == 2) {
+ low = 1 << 11;
+ } else if (n_remain == 3) {
+ low = 1 << 16;
+ }
+ }
+
+ do {
+ if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+ // inclusive range, e.g. [a-z]
+ if (pos->value <= high && low <= pos[1].value) {
+ return is_positive_char;
+ }
+ pos += 2;
+ } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+ // Any character matches "."
+ return true;
+ } else {
+ // exact char match, e.g. [a] or "a"
+ if (low <= pos->value && pos->value <= high) {
+ return is_positive_char;
+ }
+ pos += 1;
+ }
+ } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+ return !is_positive_char;
+}
+
+// returns true iff token matches the token element at pos (regular or inverse)
+// asserts that pos is pointing to a token element
+static bool llama_grammar_match_token(
+ const llama_grammar_element * pos,
+ const llama_token token) {
+ GGML_ASSERT(pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT);
+ if (pos->type == LLAMA_GRETYPE_TOKEN) {
+ return pos->value == static_cast<uint32_t>(token);
+ }
+ if (pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+ return pos->value != static_cast<uint32_t>(token);
+ }
+ return false;
+}
+
+// transforms a grammar pushdown stack into N possible stacks, all ending
+// at a terminal element (a character range or a token)
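+// e.g. (illustrative) if the top of the stack is a RULE_REF to
+// `sub ::= "b" | "c"`, the stack is replaced by two stacks, one positioned
+// at the terminal 'b' and one at the terminal 'c'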
+static void llama_grammar_advance_stack(
+ const llama_grammar_rules & rules,
+ const llama_grammar_stack & stack,
+ llama_grammar_stacks & new_stacks) {
+ if (stack.empty()) {
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+ new_stacks.emplace_back(stack);
+ }
+ return;
+ }
+
+ const llama_grammar_element * pos = stack.back();
+
+ switch (pos->type) {
+ case LLAMA_GRETYPE_RULE_REF: {
+ const size_t rule_id = static_cast<size_t>(pos->value);
+ const llama_grammar_element * subpos = rules[rule_id].data();
+ do {
+ // init new stack without the top (pos)
+ llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+ if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+ // if this rule ref is followed by another element, add that to stack
+ new_stack.push_back(pos + 1);
+ }
+ if (!llama_grammar_is_end_of_sequence(subpos)) {
+ // if alternate is nonempty, add to stack
+ new_stack.push_back(subpos);
+ }
+ llama_grammar_advance_stack(rules, new_stack, new_stacks);
+ while (!llama_grammar_is_end_of_sequence(subpos)) {
+ // scan to end of alternate def
+ subpos++;
+ }
+ if (subpos->type == LLAMA_GRETYPE_ALT) {
+ // there's another alternate def of this rule to process
+ subpos++;
+ } else {
+ break;
+ }
+ } while (true);
+ break;
+ }
+ case LLAMA_GRETYPE_CHAR:
+ case LLAMA_GRETYPE_CHAR_NOT:
+ case LLAMA_GRETYPE_CHAR_ANY:
+ case LLAMA_GRETYPE_TOKEN:
+ case LLAMA_GRETYPE_TOKEN_NOT:
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+ // only add the stack if it's not a duplicate of one we already have
+ new_stacks.emplace_back(stack);
+ }
+ break;
+ default:
+ // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
+ // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
+ // those
+ GGML_ABORT("fatal error");
+ }
+}
+
+static llama_grammar_candidates llama_grammar_reject_candidates(
+ const llama_grammar_rules & rules,
+ const llama_grammar_stacks & stacks,
+ const llama_grammar_candidates & candidates) {
+ GGML_ASSERT(!stacks.empty()); // REVIEW
+
+ if (candidates.empty()) {
+ return {};
+ }
+
+ auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
+
+ for (size_t i = 1, size = stacks.size(); i < size; ++i) {
+ rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
+ }
+
+ return rejects;
+}
+
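+// detects (possibly indirect) left recursion, which the grammar engine cannot
+// handle, e.g. (illustrative) expr ::= expr "+" term | term, or
+// a ::= b a "x" where b ::= "" | "y" (the leading nonterminal may be empty)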
+static bool llama_grammar_detect_left_recursion(
+ const llama_grammar_rules & rules,
+ size_t rule_index,
+ std::vector<bool> * rules_visited,
+ std::vector<bool> * rules_in_progress,
+ std::vector<bool> * rules_may_be_empty) {
+ if ((*rules_in_progress)[rule_index]) {
+ return true;
+ }
+
+ (*rules_in_progress)[rule_index] = true;
+
+ const llama_grammar_rule & rule = rules[rule_index];
+
+ // First check if the rule might produce the empty string. This could be done combined with the second
+ // step but it's more readable as two steps.
+ bool at_rule_start = true;
+ for (size_t i = 0; i < rule.size(); i++) {
+ if (llama_grammar_is_end_of_sequence(&rule[i])) {
+ if (at_rule_start) {
+ (*rules_may_be_empty)[rule_index] = true;
+ break;
+ }
+ at_rule_start = true;
+ } else {
+ at_rule_start = false;
+ }
+ }
+
+ // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
+ // be empty)
+ bool recurse_into_nonterminal = true;
+ for (size_t i = 0; i < rule.size(); i++) {
+ if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
+ if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
+ return true;
+ }
+ if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
+ recurse_into_nonterminal = false;
+ }
+ } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
+ recurse_into_nonterminal = true;
+ } else {
+ recurse_into_nonterminal = false;
+ }
+ }
+
+ (*rules_in_progress)[rule_index] = false;
+ (*rules_visited)[rule_index] = true;
+
+ return false;
+}
+
+const llama_grammar_rules & llama_grammar_get_rules(const struct llama_grammar * grammar) {
+ return grammar->rules;
+}
+
+llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar) {
+ return grammar->stacks;
+}
+
+static void llama_grammar_accept_chr(
+ struct llama_grammar & grammar,
+ const llama_grammar_stack & stack,
+ uint32_t chr,
+ llama_grammar_stacks & new_stacks) {
+ if (stack.empty()) {
+ return;
+ }
+
+ const llama_grammar_element * pos = stack.back();
+
+    // a character cannot satisfy a token element; such stacks are handled in llama_grammar_accept_token
+ if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+ return;
+ }
+
+ auto match = llama_grammar_match_char(pos, chr);
+ if (match.first) {
+ llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+ if (!llama_grammar_is_end_of_sequence(match.second)) {
+ new_stack.push_back(match.second);
+ }
+ llama_grammar_advance_stack(grammar.rules, new_stack, new_stacks);
+ }
+}
+
+void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
+ llama_grammar_stacks stacks_new;
+ stacks_new.reserve(grammar->stacks.size());
+
+ for (const auto & stack : grammar->stacks) {
+ llama_grammar_accept_chr(*grammar, stack, chr, stacks_new);
+ }
+
+ grammar->stacks = std::move(stacks_new);
+}
+
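+// note (illustrative): candidates are advanced one code point at a time --
+// those whose next code point matches the top of the stack are re-checked
+// against the advanced stacks, and rejections found deeper are propagated
+// back with code_points rewound by one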
+llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
+ const llama_grammar_rules & rules,
+ const llama_grammar_stack & stack,
+ const llama_grammar_candidates & candidates) {
+
+ llama_grammar_candidates rejects;
+ rejects.reserve(candidates.size());
+
+ if (stack.empty()) {
+ for (const auto & tok : candidates) {
+ if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
+ rejects.push_back(tok);
+ }
+ }
+ return rejects;
+ }
+
+ const llama_grammar_element * stack_pos = stack.back();
+
+ // if the top of the stack is a token rule, then we only need to check the token id
+ if (stack_pos->type == LLAMA_GRETYPE_TOKEN || stack_pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+ for (const auto & tok : candidates) {
+ if (*tok.code_points == 0) {
+                // reached the end of a token consumed by char rules, reject iff it ended
+                // in a partial UTF-8 sequence
+ if (tok.partial_utf8.n_remain != 0) {
+ rejects.push_back(tok);
+ }
+ } else if (!llama_grammar_match_token(stack_pos, tok.id)) {
+ rejects.push_back(tok);
+ }
+ }
+ return rejects;
+ }
+
+ llama_grammar_candidates next_candidates;
+ next_candidates.reserve(candidates.size());
+
+ for (const auto & tok : candidates) {
+ if (*tok.code_points == 0) {
+ // reached end of full codepoints in token, reject iff it ended in a partial sequence
+ // that cannot satisfy this position in grammar
+ if (tok.partial_utf8.n_remain != 0 &&
+ !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
+ rejects.push_back(tok);
+ }
+ } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
+ next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8, tok.id });
+ } else {
+ rejects.push_back(tok);
+ }
+ }
+
+ const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
+
+ // update top of stack to next element, if any
+ llama_grammar_stack stack_after(stack.begin(), stack.end() - 1);
+ if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
+ stack_after.push_back(stack_pos_after);
+ }
+ llama_grammar_stacks next_stacks;
+ llama_grammar_advance_stack(rules, stack_after, next_stacks);
+
+ auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
+ for (const auto & tok : next_rejects) {
+ rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8, tok.id });
+ }
+
+ return rejects;
+}
+
+////////////////////
+
+struct llama_grammar * llama_grammar_init_impl(
+ const struct llama_vocab * vocab,
+ const llama_grammar_element ** rules,
+ size_t n_rules,
+ size_t start_rule_index) {
+ const llama_grammar_element * pos;
+
+ // copy rule definitions into vectors
+ llama_grammar_rules vec_rules(n_rules);
+ for (size_t i = 0; i < n_rules; i++) {
+ for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
+ vec_rules[i].push_back(*pos);
+ }
+ vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
+ }
+
+ // Check for left recursion
+ std::vector<bool> rules_visited(n_rules);
+ std::vector<bool> rules_in_progress(n_rules);
+ std::vector<bool> rules_may_be_empty(n_rules);
+ for (size_t i = 0; i < n_rules; i++) {
+ if (rules_visited[i]) {
+ continue;
+ }
+ if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
+        LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu\n", i);
+ return nullptr;
+ }
+ }
+
+ // loop over alternates of start rule to build initial stacks
+ llama_grammar_stacks stacks;
+ pos = vec_rules[start_rule_index].data();
+ do {
+ llama_grammar_stack stack;
+ if (!llama_grammar_is_end_of_sequence(pos)) {
+ // if alternate is nonempty, add to stack
+ stack.push_back(pos);
+ }
+ llama_grammar_advance_stack(vec_rules, stack, stacks);
+ while (!llama_grammar_is_end_of_sequence(pos)) {
+ // scan to end of alternate def
+ pos++;
+ }
+ if (pos->type == LLAMA_GRETYPE_ALT) {
+ // there's another alternate def of this rule to process
+ pos++;
+ } else {
+ break;
+ }
+ } while (true);
+
+ // Important: vec_rules has to be moved here, not copied, because stacks contains
+ // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+ // then the pointers would be invalidated when the local vec_rules goes out of scope.
+ return new llama_grammar {
+ vocab,
+ std::move(vec_rules),
+ std::move(stacks),
+ /* .partial_utf8 = */ {},
+ /* .lazy = */ false,
+ /* .awaiting_trigger = */ false,
+ /* .trigger_buffer = */ "",
+ /* .trigger_buffer_positions = */ {},
+ /* .trigger_tokens = */ {},
+ /* .trigger_patterns = */ {},
+ };
+}
+
+struct llama_grammar * llama_grammar_init_impl(
+ const struct llama_vocab * vocab,
+ const char * grammar_str,
+ const char * grammar_root,
+ bool lazy,
+ const char ** trigger_patterns,
+ size_t num_trigger_patterns,
+ const llama_token * trigger_tokens,
+ size_t num_trigger_tokens) {
+ llama_grammar_parser parser(vocab);
+
+ // if there is a grammar, parse it
+ // rules will be empty (default) if there are parse errors
+ if (!parser.parse(grammar_str) || parser.rules.empty()) {
+ fprintf(stderr, "%s: failed to parse grammar\n", __func__);
+ return nullptr;
+ }
+
+ // Ensure that there is a "root" node.
+ if (parser.symbol_ids.find("root") == parser.symbol_ids.end()) {
+ fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
+ return nullptr;
+ }
+
+ std::vector<const llama_grammar_element *> grammar_rules(parser.c_rules());
+
+ const size_t n_rules = grammar_rules.size();
+ const size_t start_rule_index = parser.symbol_ids.at(grammar_root);
+
+ const llama_grammar_element * pos;
+
+ // copy rule definitions into vectors
+ llama_grammar_rules vec_rules(n_rules);
+ for (size_t i = 0; i < n_rules; i++) {
+ for (pos = grammar_rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
+ vec_rules[i].push_back(*pos);
+ }
+ vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
+ }
+
+ // Check for left recursion
+ std::vector<bool> rules_visited(n_rules);
+ std::vector<bool> rules_in_progress(n_rules);
+ std::vector<bool> rules_may_be_empty(n_rules);
+ for (size_t i = 0; i < n_rules; i++) {
+ if (rules_visited[i]) {
+ continue;
+ }
+ if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
+        LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu\n", i);
+ return nullptr;
+ }
+ }
+
+ // loop over alternates of start rule to build initial stacks
+ llama_grammar_stacks stacks;
+ pos = vec_rules[start_rule_index].data();
+ do {
+ llama_grammar_stack stack;
+ if (!llama_grammar_is_end_of_sequence(pos)) {
+ // if alternate is nonempty, add to stack
+ stack.push_back(pos);
+ }
+ llama_grammar_advance_stack(vec_rules, stack, stacks);
+ while (!llama_grammar_is_end_of_sequence(pos)) {
+ // scan to end of alternate def
+ pos++;
+ }
+ if (pos->type == LLAMA_GRETYPE_ALT) {
+ // there's another alternate def of this rule to process
+ pos++;
+ } else {
+ break;
+ }
+ } while (true);
+
+ std::vector<llama_token> vec_trigger_tokens;
+ std::vector<llama_grammar_trigger_pattern> vec_trigger_patterns;
+ for (size_t i = 0; i < num_trigger_tokens; i++) {
+ GGML_ASSERT(trigger_tokens != nullptr);
+ vec_trigger_tokens.push_back(trigger_tokens[i]);
+ }
+ for (size_t i = 0; i < num_trigger_patterns; i++) {
+ GGML_ASSERT(trigger_patterns != nullptr);
+ auto & trigger = vec_trigger_patterns.emplace_back();
+ trigger.pattern = trigger_patterns[i];
+ trigger.regex = std::regex(trigger.pattern);
+ }
+
+ // Important: vec_rules has to be moved here, not copied, because stacks contains
+ // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+ // then the pointers would be invalidated when the local vec_rules goes out of scope.
+ return new llama_grammar {
+ vocab,
+ std::move(vec_rules),
+ std::move(stacks),
+ /* .partial_utf8 = */ {},
+ /* .lazy = */ lazy,
+ /* .awaiting_trigger = */ lazy,
+ /* .trigger_buffer = */ "",
+ /* .trigger_buffer_positions = */ {},
+ std::move(vec_trigger_tokens),
+ std::move(vec_trigger_patterns),
+ };
+}
+
+void llama_grammar_free_impl(struct llama_grammar * grammar) {
+ if (grammar == nullptr) {
+ return;
+ }
+
+ delete grammar;
+}
+
+struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
+ auto * result = new llama_grammar {
+ grammar.vocab,
+ grammar.rules,
+ grammar.stacks,
+ grammar.partial_utf8,
+ grammar.lazy,
+ grammar.awaiting_trigger,
+ grammar.trigger_buffer,
+ grammar.trigger_buffer_positions,
+ grammar.trigger_tokens,
+ grammar.trigger_patterns,
+ };
+
+ // redirect elements in stacks to point to new rules
+ for (size_t is = 0; is < result->stacks.size(); is++) {
+ for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+ for (size_t ir0 = 0; ir0 < grammar.rules.size(); ir0++) {
+ for (size_t ir1 = 0; ir1 < grammar.rules[ir0].size(); ir1++) {
+ if (grammar.stacks[is][ie] == &grammar.rules[ir0][ir1]) {
+ result->stacks[is][ie] = &result->rules[ir0][ir1];
+ }
+ }
+ }
+ }
+ }
+
+ return result;
+}
+
+void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
+ GGML_ASSERT(grammar.vocab != nullptr);
+
+ if (grammar.awaiting_trigger) {
+ return;
+ }
+
+ bool allow_eog = false;
+ for (const auto & stack : grammar.stacks) {
+ if (stack.empty()) {
+ allow_eog = true;
+ break;
+ }
+ }
+
+ std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+ candidates_decoded.reserve(cur_p->size);
+
+ llama_grammar_candidates candidates_grammar;
+ candidates_grammar.reserve(cur_p->size);
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ const llama_token id = cur_p->data[i].id;
+ const std::string & piece = grammar.vocab->token_to_piece(id);
+
+ if (grammar.vocab->is_eog(id)) {
+ if (!allow_eog) {
+ cur_p->data[i].logit = -INFINITY;
+ }
+ } else if (piece.empty() || piece[0] == 0) {
+ cur_p->data[i].logit = -INFINITY;
+ } else {
+ candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
+ candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second, id });
+ }
+ }
+
+ const auto rejects = llama_grammar_reject_candidates(grammar.rules, grammar.stacks, candidates_grammar);
+ for (const auto & reject : rejects) {
+ cur_p->data[reject.index].logit = -INFINITY;
+ }
+}
+
+void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
+ GGML_ASSERT(grammar.vocab != nullptr);
+
+ const auto & piece = grammar.vocab->token_to_piece(token);
+
+ if (grammar.awaiting_trigger) {
+ if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
+ grammar.awaiting_trigger = false;
+ grammar.trigger_buffer.clear();
+ llama_grammar_accept_token(grammar, token, piece);
+            LLAMA_LOG_DEBUG("Grammar triggered on token %d (`%s`)\n", token, piece.c_str());
+ return;
+ } else {
+ auto position = std::make_pair(grammar.trigger_buffer.size(), grammar.trigger_buffer.size() + piece.size());
+ grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
+ grammar.trigger_buffer += piece;
+
+ for (const auto & trigger_pattern : grammar.trigger_patterns) {
+ auto start = trigger_pattern.find(grammar.trigger_buffer);
+ if (start != std::string::npos) {
+ grammar.awaiting_trigger = false;
+
+ // replay tokens that overlap with [start, end)
+ for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
+ auto [tok_start, tok_end] = tok_pos;
+ if (tok_end <= start) {
+ continue;
+ }
+
+ size_t piece_start = (tok_start < start) ? start : tok_start; // allow for partial token pieces
+ size_t piece_len = tok_end - piece_start;
+ auto tok_piece = grammar.trigger_buffer.substr(piece_start, piece_len);
+ llama_grammar_accept_token(grammar, tok, tok_piece);
+ }
+
+ auto constrained_str = grammar.trigger_buffer.substr(start);
+ grammar.trigger_buffer.clear();
+ grammar.trigger_buffer_positions.clear();
+ LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
+ return;
+ }
+ }
+ LLAMA_LOG_DEBUG("Grammar still awaiting trigger after token %d (`%s`)\n", token, piece.c_str());
+ return;
+ }
+ }
+
+ if (grammar.vocab->is_eog(token)) {
+ for (const auto & stack : grammar.stacks) {
+ if (stack.empty()) {
+ return;
+ }
+ }
+ GGML_ABORT("fatal error");
+ }
+
+ llama_grammar_accept_token(grammar, token, piece);
+}
+
+void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
+ // Note terminating 0 in decoded string
+ const auto decoded = decode_utf8(piece, grammar.partial_utf8);
+ const auto & code_points = decoded.first;
+
+ for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+ llama_grammar_accept(&grammar, *it);
+ }
+
+ grammar.partial_utf8 = decoded.second;
+ if (grammar.stacks.empty()) {
+ throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
+ }
+}
+
+void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token token, const std::string & piece) {
+ // Note terminating 0 in decoded string
+ const auto decoded = decode_utf8(piece, grammar.partial_utf8);
+ const auto & code_points = decoded.first;
+
+ llama_grammar_stacks stacks_new;
+ stacks_new.reserve(grammar.stacks.size());
+
+ for (const auto & stack : grammar.stacks) {
+ if (stack.empty()) {
+ continue;
+ }
+
+ const llama_grammar_element * pos = stack.back();
+
+ if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+ if (llama_grammar_match_token(pos, token)) {
+ llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+ if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+ new_stack.push_back(pos + 1);
+ }
+ llama_grammar_advance_stack(grammar.rules, new_stack, stacks_new);
+ }
+ } else {
+ llama_grammar_stacks current_stacks = {stack};
+
+ for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+ llama_grammar_stacks next_stacks;
+
+ for (const auto & cur_stack : current_stacks) {
+ llama_grammar_accept_chr(grammar, cur_stack, *it, next_stacks);
+ }
+
+ current_stacks = std::move(next_stacks);
+ if (current_stacks.empty()) {
+ break;
+ }
+ }
+
+ for (auto & surviving_stack : current_stacks) {
+ if (std::find(stacks_new.begin(), stacks_new.end(), surviving_stack) == stacks_new.end()) {
+ stacks_new.emplace_back(surviving_stack);
+ }
+ }
+ }
+ }
+
+ grammar.stacks = std::move(stacks_new);
+ grammar.partial_utf8 = decoded.second;
+
+ if (grammar.stacks.empty()) {
+ throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece + " (" + std::to_string(token) + ")");
+ }
+}
+
diff --git a/llama.cpp/src/llama-grammar.h b/llama.cpp/src/llama-grammar.h
new file mode 100644
index 0000000..b5a0e58
--- /dev/null
+++ b/llama.cpp/src/llama-grammar.h
@@ -0,0 +1,194 @@
+#pragma once
+
+#include "llama.h"
+
+#include <map>
+#include <regex>
+#include <string>
+#include <vector>
+
+struct llama_vocab;
+
+// grammar element type
+enum llama_gretype {
+ // end of rule definition
+ LLAMA_GRETYPE_END = 0,
+
+ // start of alternate definition for rule
+ LLAMA_GRETYPE_ALT = 1,
+
+ // non-terminal element: reference to rule
+ LLAMA_GRETYPE_RULE_REF = 2,
+
+ // terminal element: character (code point)
+ LLAMA_GRETYPE_CHAR = 3,
+
+    // inverse char(s) ([^a], [^a-b], [^abc])
+ LLAMA_GRETYPE_CHAR_NOT = 4,
+
+ // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+ // be an inclusive range ([a-z])
+ LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+ // modifies a preceding LLAMA_GRETYPE_CHAR or
+ // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+ LLAMA_GRETYPE_CHAR_ALT = 6,
+
+ // any character (.)
+ LLAMA_GRETYPE_CHAR_ANY = 7,
+
+ // terminal element: token (<[token-id]>)
+ LLAMA_GRETYPE_TOKEN = 8,
+
+ // inverse token (!<[token-id]>)
+ LLAMA_GRETYPE_TOKEN_NOT = 9,
+};
+
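+// Example (illustrative): the rule
+//   root ::= "a" | sub
+// is encoded as {LLAMA_GRETYPE_CHAR, 'a'}, {LLAMA_GRETYPE_ALT, 0},
+// {LLAMA_GRETYPE_RULE_REF, <id of sub>}, {LLAMA_GRETYPE_END, 0}
+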
+typedef struct llama_grammar_element {
+ enum llama_gretype type;
+ uint32_t value; // Unicode code point, rule ID, or token ID
+} llama_grammar_element;
+
+struct llama_partial_utf8 {
+ uint32_t value; // bit value so far (unshifted)
+ int n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar_candidate {
+ size_t index;
+ const uint32_t * code_points;
+ llama_partial_utf8 partial_utf8;
+ llama_token id;
+};
+
+using llama_grammar_rule = std::vector<llama_grammar_element>;
+using llama_grammar_stack = std::vector<const llama_grammar_element *>;
+
+using llama_grammar_rules = std::vector<llama_grammar_rule>;
+using llama_grammar_stacks = std::vector<llama_grammar_stack>;
+using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
+
+// TODO: remove, needed for tests atm
+const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar);
+ llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar);
+
+// takes a set of possible pushdown stacks on a grammar, which are required to
+// be positioned at a character range (see `llama_grammar_advance_stack`), and
+// produces the N possible stacks if the given char is accepted at those
+// positions
+void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr);
+
+std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
+ const llama_grammar_rules & rules,
+ const llama_grammar_stack & stack,
+ const llama_grammar_candidates & candidates);
+
+struct llama_grammar_parser {
+ const llama_vocab * vocab;
+ std::map<std::string, uint32_t> symbol_ids;
+
+ llama_grammar_rules rules;
+
+ llama_grammar_parser(const struct llama_vocab * vocab = nullptr) : vocab(vocab) {}
+
+ llama_grammar_stack c_rules() const;
+
+ uint32_t get_symbol_id(const char * src, size_t len);
+ uint32_t generate_symbol_id(const std::string & base_name);
+
+ void add_rule(uint32_t rule_id, const llama_grammar_rule & rule);
+
+ const char * parse_alternates(
+ const char * src,
+ const std::string & rule_name,
+ uint32_t rule_id,
+ bool is_nested);
+
+ const char * parse_sequence(
+ const char * src,
+ const std::string & rule_name,
+ llama_grammar_rule & rule,
+ bool is_nested);
+
+ const char * parse_rule(const char * src);
+
+ bool parse(const char * src);
+ void print(FILE * file);
+};
+
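+// Typical llama_grammar_parser usage (illustrative sketch; a null vocab is
+// allowed for testing):
+//
+//   llama_grammar_parser parser;
+//   if (parser.parse("root ::= \"yes\" | \"no\"")) {
+//       llama_grammar_stack rules = parser.c_rules();
+//       llama_grammar * grammar = llama_grammar_init_impl(
+//           /* vocab */ nullptr, rules.data(), rules.size(),
+//           parser.symbol_ids.at("root"));
+//   }
+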
+struct llama_grammar_trigger_pattern {
+ std::string pattern;
+ std::regex regex;
+
+ size_t find(const std::string & input) const;
+};
+
+struct llama_grammar {
+ // maintain a list of llama_tokens and their positions in the trigger_buffer
+ using token_pos = std::pair<llama_token, std::pair<size_t, size_t>>;
+
+ // note: allow null vocab for testing (not great)
+ const llama_vocab * vocab;
+
+ const llama_grammar_rules rules; // TODO: shared ptr
+ llama_grammar_stacks stacks;
+
+ // buffer for partially generated UTF-8 sequence from accepted tokens
+ llama_partial_utf8 partial_utf8;
+
+ // lazy grammars wait for trigger words or tokens before constraining the sampling.
+ // we still have trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
+ // (useful e.g. for tool_choice=required)
+ bool lazy = false;
+ bool awaiting_trigger = false; // Initialized to true for lazy grammars only
+ std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
+ std::vector<token_pos> trigger_buffer_positions; // Tokens buffered by lazy grammar. Used to replay when a trigger is found.
+ std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
+ std::vector<llama_grammar_trigger_pattern>
+ trigger_patterns; // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
+ // string, and the grammar will be given the string from the first match group onwards.
+
+};
+
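+// Lazy-grammar flow (illustrative): while awaiting_trigger is set, accepted
+// tokens are buffered in trigger_buffer instead of being matched; once a
+// trigger token or pattern fires, the buffered tokens from the match position
+// onwards are replayed through the grammar and the buffer is cleared.
+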
+//
+// internal API
+//
+
+// note: needed for tests (not great)
+struct llama_grammar * llama_grammar_init_impl(
+ const struct llama_vocab * vocab,
+ const llama_grammar_element ** rules,
+ size_t n_rules,
+ size_t start_rule_index);
+
+struct llama_grammar * llama_grammar_init_impl(
+ const struct llama_vocab * vocab,
+ const char * grammar_str,
+ const char * grammar_root,
+ bool lazy,
+ const char ** trigger_patterns,
+ size_t num_trigger_patterns,
+ const llama_token * trigger_tokens,
+ size_t num_trigger_tokens);
+
+void llama_grammar_free_impl(struct llama_grammar * grammar);
+
+struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar);
+
+// TODO: move the API below as member functions of llama_grammar
+void llama_grammar_apply_impl(
+ const struct llama_grammar & grammar,
+ llama_token_data_array * cur_p);
+
+void llama_grammar_accept_impl(
+ struct llama_grammar & grammar,
+ llama_token token);
+
+void llama_grammar_accept_str(
+ struct llama_grammar & grammar,
+ const std::string & piece);
+
+void llama_grammar_accept_token(
+ struct llama_grammar & grammar,
+ llama_token token,
+ const std::string & piece);
diff --git a/llama.cpp/src/llama-graph.cpp b/llama.cpp/src/llama-graph.cpp
new file mode 100644
index 0000000..bba747d
--- /dev/null
+++ b/llama.cpp/src/llama-graph.cpp
@@ -0,0 +1,2626 @@
+#include "llama-graph.h"
+
+#include "llama-impl.h"
+#include "llama-batch.h"
+#include "llama-cparams.h"
+
+#include "llama-kv-cache.h"
+#include "llama-kv-cache-iswa.h"
+#include "llama-memory-hybrid.h"
+#include "llama-memory-hybrid-iswa.h"
+#include "llama-memory-recurrent.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstring>
+#include <numeric>
+#include <sstream>
+#include <unordered_set>
+
+void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
+ if (ubatch->token) {
+ const int64_t n_tokens = ubatch->n_tokens;
+
+ ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens));
+ }
+
+ if (ubatch->embd) {
+ GGML_ASSERT(n_embd == embd->ne[0]);
+
+ const int64_t n_tokens = ubatch->n_tokens;
+
+ ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd));
+ }
+}
+
+bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
+ bool res = true;
+
+ res &= (!params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
+ res &= (!params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens);
+
+ return res;
+}
+
+void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
+ if (ubatch->pos && pos) {
+ const int64_t n_tokens = ubatch->n_tokens;
+
+ if (ubatch->token && n_pos_per_embd == 4) {
+ // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the first 3 dims are the same, and the 4th dim is all 0
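+            // e.g. pos = [5, 6] becomes pos_data = [5, 6,  5, 6,  5, 6,  0, 0]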
+ std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
+ // copy the first dimension
+ for (int i = 0; i < n_tokens; ++i) {
+ pos_data[ i] = ubatch->pos[i];
+ pos_data[ n_tokens + i] = ubatch->pos[i];
+ pos_data[2 * n_tokens + i] = ubatch->pos[i];
+ pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
+ }
+ ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+ } else {
+ ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+ }
+ }
+}
+
+bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
+ bool res = true;
+
+ res &= pos->ne[0] == params.ubatch.n_tokens*n_pos_per_embd;
+
+ return res;
+}
+
+void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
+ if (ubatch->pos && attn_scale) {
+ const int64_t n_tokens = ubatch->n_tokens;
+
+ GGML_ASSERT(f_attn_temp_scale != 0.0f);
+ GGML_ASSERT(n_attn_temp_floor_scale != 0);
+
+ std::vector<float> attn_scale_data(n_tokens, 0.0f);
+ for (int i = 0; i < n_tokens; ++i) {
+ const float pos = ubatch->pos[i];
+ attn_scale_data[i] = std::log(
+ std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
+ ) * f_attn_temp_scale + 1.0;
+ }
+
+ ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
+ }
+}
+
+void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
+ if (pos_bucket) {
+ const int64_t n_tokens = ubatch->n_tokens;
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer));
+ GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing
+
+ int32_t * data = (int32_t *) pos_bucket->data;
+
+ for (int j = 0; j < n_tokens; ++j) {
+ for (int i = 0; i < n_tokens; ++i) {
+ data[j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
+ }
+ }
+ }
+}
+
+void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
+ if (pos_bucket) {
+ mctx->set_input_pos_bucket(pos_bucket, ubatch);
+ }
+}
+
+void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
+ GGML_ASSERT(out_ids);
+
+ const int64_t n_tokens = ubatch->n_tokens;
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
+ int32_t * data = (int32_t *) out_ids->data;
+
+ if (n_outputs == n_tokens) {
+ for (int i = 0; i < n_tokens; ++i) {
+ data[i] = i;
+ }
+
+ return;
+ }
+
+ GGML_ASSERT(ubatch->output);
+
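+    // gather the row indices of the tokens flagged for output,
+    // e.g. (illustrative) output = [0, 1, 0, 1] -> data = [1, 3]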
+ int n_outputs = 0;
+
+ for (int i = 0; i < n_tokens; ++i) {
+ if (ubatch->output[i]) {
+ data[n_outputs++] = i;
+ }
+ }
+}
+
+bool llm_graph_input_out_ids::can_reuse(const llm_graph_params & params) {
+ bool res = true;
+
+ res &= n_outputs == params.n_outputs;
+
+ return res;
+}
+
+void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
+ if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
+ const int64_t n_tokens = ubatch->n_tokens;
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
+ const int64_t n_seqs_unq = ubatch->n_seqs_unq;
+
+ GGML_ASSERT(mean);
+ GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer));
+
+ float * data = (float *) mean->data;
+ memset(mean->data, 0, n_tokens*n_seqs_unq*ggml_element_size(mean));
+
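+        // build a pooling matrix of shape [n_tokens, n_seqs_unq]: row s holds
+        // 1/len(s) at the token positions of sequence s, so multiplying it
+        // with the token embeddings yields per-sequence means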
+ std::vector<uint64_t> sums(n_seqs_unq, 0);
+ for (int i = 0; i < n_tokens; i += n_seq_tokens) {
+ for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+ const llama_seq_id seq_id = ubatch->seq_id[i][s];
+ const int32_t seq_idx = ubatch->seq_idx[seq_id];
+
+ sums[seq_idx] += ubatch->n_seq_tokens;
+ }
+ }
+
+ std::vector<float> div(n_seqs_unq, 0.0f);
+ for (int s = 0; s < n_seqs_unq; ++s) {
+ const uint64_t sum = sums[s];
+ if (sum > 0) {
+ div[s] = 1.0f/float(sum);
+ }
+ }
+
+ for (int i = 0; i < n_tokens; i += n_seq_tokens) {
+ for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+ const llama_seq_id seq_id = ubatch->seq_id[i][s];
+ const int32_t seq_idx = ubatch->seq_idx[seq_id];
+
+ for (int j = 0; j < n_seq_tokens; ++j) {
+ data[seq_idx*n_tokens + i + j] = div[seq_idx];
+ }
+ }
+ }
+ }
+}
+
+void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
+ const int64_t n_tokens = ubatch->n_tokens;
+ const int64_t n_seqs_unq = ubatch->n_seqs_unq;
+
+ if (cparams.embeddings && (
+ cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
+ cparams.pooling_type == LLAMA_POOLING_TYPE_RANK ||
+ cparams.pooling_type == LLAMA_POOLING_TYPE_LAST
+ )) {
+ GGML_ASSERT(cls);
+ GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
+
+ uint32_t * data = (uint32_t *) cls->data;
+ memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
+
+ std::vector<int> target_pos(n_seqs_unq, -1);
+ std::vector<int> target_row(n_seqs_unq, -1);
+
+ const bool last = (
+ cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
+ (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
+ );
+
+ for (int i = 0; i < n_tokens; ++i) {
+ const llama_pos pos = ubatch->pos[i];
+
+ for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+ const llama_seq_id seq_id = ubatch->seq_id[i][s];
+ const int32_t seq_idx = ubatch->seq_idx[seq_id];
+
+ if (
+ (target_pos[seq_idx] == -1) ||
+ ( last && pos >= target_pos[seq_idx]) ||
+ (!last && pos < target_pos[seq_idx])
+ ) {
+ target_pos[seq_idx] = pos;
+ target_row[seq_idx] = i;
+ }
+ }
+ }
+
+ for (int s = 0; s < n_seqs_unq; ++s) {
+ if (target_row[s] >= 0) {
+ data[s] = target_row[s];
+ }
+ }
+ }
+}
+
+void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
+ GGML_UNUSED(ubatch);
+
+ const int64_t n_rs = mctx->get_n_rs();
+
+ if (s_copy) {
+ GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer));
+ int32_t * data = (int32_t *) s_copy->data;
+
+ // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
+ for (uint32_t i = 0; i < n_rs; ++i) {
+ data[i] = mctx->s_copy(i);
+ }
+ }
+}
+
+bool llm_graph_input_rs::can_reuse(const llm_graph_params & params) {
+ const auto * mctx = static_cast<const llama_memory_recurrent_context *>(params.mctx);
+
+ this->mctx = mctx;
+
+ bool res = true;
+
+ res &= s_copy->ne[0] == mctx->get_n_rs();
+
+ res &= s_copy_main->ne[0] == params.ubatch.n_seqs;
+ res &= s_copy_extra->ne[0] == mctx->get_n_rs() - params.ubatch.n_seqs;
+
+ res &= head == mctx->get_head();
+ res &= rs_z == mctx->get_rs_z();
+
+ return res;
+}
+
+void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
+ GGML_UNUSED(ubatch);
+
+ if (cross_embd && !cross->v_embd.empty()) {
+ assert(cross_embd->type == GGML_TYPE_F32);
+
+ ggml_backend_tensor_set(cross_embd, cross->v_embd.data(), 0, ggml_nbytes(cross_embd));
+ }
+}
+
+static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+ LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
+ const char * swa_type_str = "unknown";
+
+ switch (swa_type) {
+ case LLAMA_SWA_TYPE_NONE: swa_type_str = "LLAMA_SWA_TYPE_NONE"; break;
+ case LLAMA_SWA_TYPE_STANDARD: swa_type_str = "LLAMA_SWA_TYPE_STANDARD"; break;
+ case LLAMA_SWA_TYPE_CHUNKED: swa_type_str = "LLAMA_SWA_TYPE_CHUNKED"; break;
+ case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break;
+    }
+
+    LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swa_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
+ LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
+ LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
+
+ LLAMA_LOG_DEBUG(" ");
+ for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
+ LLAMA_LOG_DEBUG("%2d", j);
+ }
+ LLAMA_LOG_DEBUG("\n");
+
+ for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
+ LLAMA_LOG_DEBUG(" %2d ", i);
+ for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
+ float val = data[i * n_kv + j];
+ if (val == -INFINITY) {
+ LLAMA_LOG_DEBUG(" ∞");
+ } else {
+ LLAMA_LOG_DEBUG(" 0");
+ }
+ }
+ LLAMA_LOG_DEBUG("\n");
+ }
+}
+
+void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
+ const int64_t n_kv = ubatch->n_tokens;
+ const int64_t n_tokens = ubatch->n_tokens;
+
+ const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
+ for (int i1 = 0; i1 < n_tokens; ++i1) {
+ const llama_seq_id s1 = ubatch->seq_id[i1][0];
+ const llama_pos p1 = ubatch->pos[i1];
+
+ const uint64_t idst = i1*n_kv;
+
+ for (int i0 = 0; i0 < n_tokens; ++i0) {
+ const llama_seq_id s0 = ubatch->seq_id[i0][0];
+ const llama_pos p0 = ubatch->pos[i0];
+
+ // mask different sequences
+ if (s0 != s1) {
+ continue;
+ }
+
+ // mask future tokens
+ if (cparams.causal_attn && p0 > p1) {
+ continue;
+ }
+
+ // apply SWA if any
+ if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
+ continue;
+ }
+
+ data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
+ }
+ }
+ };
+
+ {
+ GGML_ASSERT(self_kq_mask);
+ GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
+
+ float * data = (float *) self_kq_mask->data;
+
+ std::fill(data, data + ggml_nelements(self_kq_mask), -INFINITY);
+
+ fill_mask(data, 0, LLAMA_SWA_TYPE_NONE);
+
+ if (debug) {
+ print_mask(data, n_tokens, n_kv, 0, LLAMA_SWA_TYPE_NONE);
+ }
+ }
+
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ GGML_ASSERT(self_kq_mask_swa);
+ GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
+
+ float * data = (float *) self_kq_mask_swa->data;
+
+ std::fill(data, data + ggml_nelements(self_kq_mask_swa), -INFINITY);
+
+ fill_mask(data, hparams.n_swa, hparams.swa_type);
+
+ if (debug) {
+ print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+ }
+ }
+}
+
+void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
+ mctx->set_input_k_idxs(self_k_idxs, ubatch);
+ mctx->set_input_v_idxs(self_v_idxs, ubatch);
+
+ mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+}
+
+bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
+ const auto * mctx = static_cast<const llama_kv_cache_context *>(params.mctx);
+
+ this->mctx = mctx;
+
+ bool res = true;
+
+ res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
+ //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+ res &= self_kq_mask->ne[0] == mctx->get_n_kv();
+ res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
+
+ return res;
+}
+
+void llm_graph_input_attn_k::set_input(const llama_ubatch * ubatch) {
+ mctx->set_input_k_idxs(self_k_idxs, ubatch);
+
+ mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+}
+
+bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
+ const auto * mctx = static_cast<const llama_kv_cache_context *>(params.mctx);
+
+ this->mctx = mctx;
+
+ bool res = true;
+
+ res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
+
+ res &= self_kq_mask->ne[0] == mctx->get_n_kv();
+ res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
+
+ return res;
+}
+
+void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
+ mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
+ mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
+
+ mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+
+ mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
+ mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
+
+ mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+}
+
+bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
+ const auto * mctx = static_cast<const llama_kv_cache_iswa_context *>(params.mctx);
+
+ this->mctx = mctx;
+
+ bool res = true;
+
+ res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
+ //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+ res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
+ //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+ res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
+ res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
+
+ res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
+ res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
+
+ return res;
+}
+
+void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
+ GGML_ASSERT(cross_kq_mask);
+
+ const int64_t n_enc = cross_kq_mask->ne[0];
+ const int64_t n_tokens = ubatch->n_tokens;
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(cross_kq_mask->buffer));
+ GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing
+
+ float * data = (float *) cross_kq_mask->data;
+
+ for (int i = 0; i < n_tokens; ++i) {
+ for (int j = 0; j < n_enc; ++j) {
+ float f = -INFINITY;
+
+ for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+ const llama_seq_id seq_id = ubatch->seq_id[i][s];
+
+ if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
+ f = 0.0f;
+ }
+ }
+
+ data[i*n_enc + j] = f;
+ }
+ }
+}
+
+void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
+ mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
+ mctx->get_attn()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+
+ mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+
+ const int64_t n_rs = mctx->get_recr()->get_n_rs();
+
+ if (inp_rs->s_copy) {
+ GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
+ int32_t * data = (int32_t *) inp_rs->s_copy->data;
+
+ // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
+ for (uint32_t i = 0; i < n_rs; ++i) {
+ data[i] = mctx->get_recr()->s_copy(i);
+ }
+ }
+}
+
+bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
+ const auto * mctx = static_cast<const llama_memory_hybrid_context *>(params.mctx);
+
+ this->mctx = mctx;
+
+ bool res = true;
+
+ res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
+ //res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+ res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
+ res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
+
+ res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+
+ res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+
+ res &= inp_rs->head == mctx->get_recr()->get_head();
+ res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
+
+ return res;
+}
+
+// TODO: Hybrid input classes are a bit redundant.
+// Instead of creating a hybrid input, the graph can simply create 2 separate inputs.
+// Refactoring is required in the future.
+void llm_graph_input_mem_hybrid_k::set_input(const llama_ubatch * ubatch) {
+ mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
+
+ mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+
+ const int64_t n_rs = mctx->get_recr()->get_n_rs();
+
+ if (inp_rs->s_copy) {
+ GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
+ int32_t * data = (int32_t *) inp_rs->s_copy->data;
+
+ // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
+ for (uint32_t i = 0; i < n_rs; ++i) {
+ data[i] = mctx->get_recr()->s_copy(i);
+ }
+ }
+}
+
+bool llm_graph_input_mem_hybrid_k::can_reuse(const llm_graph_params & params) {
+ const auto * mctx = static_cast<const llama_memory_hybrid_context *>(params.mctx);
+
+ this->mctx = mctx;
+
+ bool res = true;
+
+ res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
+
+ res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
+ res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
+
+ res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+
+ res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+
+ res &= inp_rs->head == mctx->get_recr()->get_head();
+ res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
+
+ return res;
+}
+
+void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
+ const auto * attn_ctx = mctx->get_attn();
+
+ // base tensors may not be allocated if there are no non-SWA attention layers
+ if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
+ attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
+ attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+
+ attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+ }
+
+ // swa tensors may not be allocated if there are no SWA attention layers
+ if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
+ attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch);
+ attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch);
+
+ attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
+ }
+
+ const int64_t n_rs = mctx->get_recr()->get_n_rs();
+
+ if (inp_rs->s_copy) {
+ GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
+ int32_t * data = (int32_t *) inp_rs->s_copy->data;
+
+ // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
+ for (uint32_t i = 0; i < n_rs; ++i) {
+ data[i] = mctx->get_recr()->s_copy(i);
+ }
+ }
+}
+
+bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params) {
+ const auto * mctx = static_cast<const llama_memory_hybrid_iswa_context *>(params.mctx);
+
+ this->mctx = mctx;
+
+ bool res = true;
+
+ const auto * attn_ctx = mctx->get_attn();
+
+ // base tensors may not be allocated if there are no non-SWA attention layers
+ if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
+ res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
+ //res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+ res &= inp_attn->self_kq_mask->ne[0] == attn_ctx->get_base()->get_n_kv();
+ res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
+ }
+
+ // swa tensors may not be allocated if there are no SWA attention layers
+ if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
+ res &= inp_attn->self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
+ //res &= inp_attn->self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+ res &= inp_attn->self_kq_mask_swa->ne[0] == attn_ctx->get_swa()->get_n_kv();
+ res &= inp_attn->self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
+ }
+
+ res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+
+ res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+
+ res &= inp_rs->head == mctx->get_recr()->get_head();
+ res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
+
+ return res;
+}
+
+void llm_graph_input_sampling::set_input(const llama_ubatch * ubatch) {
+ // set the inputs only for the active samplers in the current ubatch
+ std::unordered_set<llama_seq_id> active_samplers;
+ for (uint32_t i = 0; i < ubatch->n_tokens; i++) {
+ if (ubatch->output[i]) {
+ llama_seq_id seq_id = ubatch->seq_id[i][0];
+ active_samplers.insert(seq_id);
+ }
+ }
+
+ for (auto seq_id : active_samplers) {
+ if (samplers.find(seq_id) == samplers.end()) {
+ continue;
+ }
+
+ auto & sampler = samplers[seq_id];
+
+ if (sampler->iface->backend_set_input) {
+ sampler->iface->backend_set_input(sampler);
+ }
+ }
+}
+
+bool llm_graph_input_sampling::can_reuse(const llm_graph_params & params) {
+ if (samplers.size() != params.samplers.size()) {
+ return false;
+ }
+
+ for (const auto & [seq_id, sampler] : params.samplers) {
+ if (samplers[seq_id] != sampler) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+//
+// llm_graph_result
+//
+
+llm_graph_result::llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) {
+ reset();
+
+ const char * LLAMA_GRAPH_RESULT_DEBUG = getenv("LLAMA_GRAPH_RESULT_DEBUG");
+ debug = LLAMA_GRAPH_RESULT_DEBUG ? atoi(LLAMA_GRAPH_RESULT_DEBUG) : 0;
+}
+
+int64_t llm_graph_result::get_max_nodes() const {
+ return max_nodes;
+}
+
+void llm_graph_result::reset() {
+ t_inp_tokens = nullptr;
+ t_inp_embd = nullptr;
+ t_logits = nullptr;
+ t_embd = nullptr;
+ t_embd_pooled = nullptr;
+ t_sampled.clear();
+ t_sampled_probs.clear();
+ t_sampled_logits.clear();
+ t_candidates.clear();
+
+ params = {};
+
+ inputs.clear();
+
+ buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+
+    ggml_init_params init_params = {
+        /*.mem_size =*/ buf_compute_meta.size(),
+        /*.mem_buffer =*/ buf_compute_meta.data(),
+        /*.no_alloc =*/ true,
+    };
+
+    ctx_compute.reset(ggml_init(init_params));
+
+ gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false);
+}
+
+void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
+ for (auto & input : inputs) {
+ input->set_input(ubatch);
+ }
+}
+
+void llm_graph_result::set_outputs() {
+ if (t_logits != nullptr) {
+ ggml_set_output(t_logits);
+ }
+ if (t_embd != nullptr) {
+ ggml_set_output(t_embd);
+ }
+ if (t_embd_pooled != nullptr) {
+ ggml_set_output(t_embd_pooled);
+ }
+ for (auto & [seq_id, t] : t_sampled) {
+ if (t != nullptr) {
+ ggml_set_output(t);
+ }
+ }
+ for (auto & [seq_id, t] : t_sampled_probs) {
+ if (t != nullptr) {
+ ggml_set_output(t);
+ }
+ }
+ for (auto & [seq_id, t] : t_sampled_logits) {
+ if (t != nullptr) {
+ ggml_set_output(t);
+ }
+ }
+ for (auto & [seq_id, t] : t_candidates) {
+ if (t != nullptr) {
+ ggml_set_output(t);
+ }
+ }
+}
+
+bool llm_graph_result::can_reuse(const llm_graph_params & params) {
+ if (!this->params.allow_reuse(params)) {
+ if (debug > 1) {
+ LLAMA_LOG_DEBUG("%s: cannot reuse graph due to incompatible graph parameters\n", __func__);
+ }
+
+ return false;
+ }
+
+ if (debug > 1) {
+ LLAMA_LOG_DEBUG("%s: checking compatibility of %d inputs:\n", __func__, (int) inputs.size());
+ }
+
+ bool res = true;
+
+ for (auto & input : inputs) {
+ const bool cur = input->can_reuse(params);
+
+ if (debug > 1) {
+ LLAMA_LOG_DEBUG("%s: can_reuse = %d\n", "placeholder", cur);
+ }
+
+ res = res && cur;
+ }
+
+ if (debug > 0) {
+ LLAMA_LOG_DEBUG("%s: can reuse graph = %d\n", __func__, res);
+ }
+
+ return res;
+}
+
+llm_graph_input_i * llm_graph_result::add_input(llm_graph_input_ptr input) {
+ inputs.emplace_back(std::move(input));
+ return inputs.back().get();
+}
+
+void llm_graph_result::set_params(const llm_graph_params & params) {
+ this->params = params;
+}
+
+//
+// llm_graph_context
+//
+
+llm_graph_context::llm_graph_context(const llm_graph_params & params) :
+ arch (params.arch),
+ hparams (params.hparams),
+ cparams (params.cparams),
+ ubatch (params.ubatch),
+ n_embd (hparams.n_embd),
+ n_layer (hparams.n_layer),
+ n_rot (hparams.n_rot),
+ n_ctx (cparams.n_ctx),
+ n_head (hparams.n_head()),
+ n_head_kv (hparams.n_head_kv()),
+ n_embd_head_k (hparams.n_embd_head_k),
+ n_embd_k_gqa (hparams.n_embd_k_gqa()),
+ n_embd_head_v (hparams.n_embd_head_v),
+ n_embd_v_gqa (hparams.n_embd_v_gqa()),
+ n_expert (hparams.n_expert),
+ n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
+ freq_base (cparams.rope_freq_base),
+ freq_scale (cparams.rope_freq_scale),
+ ext_factor (cparams.yarn_ext_factor),
+ attn_factor (cparams.yarn_attn_factor),
+ beta_fast (cparams.yarn_beta_fast),
+ beta_slow (cparams.yarn_beta_slow),
+ norm_eps (hparams.f_norm_eps),
+ norm_rms_eps (hparams.f_norm_rms_eps),
+ n_tokens (ubatch.n_tokens),
+ n_outputs (params.n_outputs),
+ n_ctx_orig (cparams.n_ctx_orig_yarn),
+ pooling_type (cparams.pooling_type),
+ rope_type (hparams.rope_type),
+ sched (params.sched),
+ backend_cpu (params.backend_cpu),
+ cvec (params.cvec),
+ loras (params.loras),
+ mctx (params.mctx),
+ cross (params.cross),
+ samplers (params.samplers),
+ cb_func (params.cb),
+ res (params.res),
+ ctx0 (res->get_ctx()),
+ gf (res->get_gf()) {
+ res->set_params(params);
+ }
+
+void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
+ if (cb_func) {
+ cb_func(ubatch, cur, name, il);
+ }
+}
+
+ggml_tensor * llm_graph_context::build_cvec(
+ ggml_tensor * cur,
+ int il) const {
+ return cvec->apply_to(ctx0, cur, il);
+}
+
+ggml_tensor * llm_graph_context::build_lora_mm(
+ ggml_tensor * w,
+ ggml_tensor * cur) const {
+ ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
+
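+ // apply any active LoRA adapters: res += scale * B(A x), where the scale is
+ // derived from the adapter's alpha, its rank and the user-provided strength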
+ for (const auto & lora : *loras) {
+ llama_adapter_lora_weight * lw = lora.first->get_weight(w);
+ if (lw == nullptr) {
+ continue;
+ }
+
+ const float adapter_scale = lora.second;
+ const float scale = lw->get_scale(lora.first->alpha, adapter_scale);
+
+ ggml_tensor * ab_cur = ggml_mul_mat(
+ ctx0, lw->b,
+ ggml_mul_mat(ctx0, lw->a, cur)
+ );
+
+ ab_cur = ggml_scale(ctx0, ab_cur, scale);
+ res = ggml_add(ctx0, res, ab_cur);
+ }
+
+ return res;
+}
+
+ggml_tensor * llm_graph_context::build_lora_mm_id(
+ ggml_tensor * w, // ggml_tensor * as
+ ggml_tensor * cur, // ggml_tensor * b
+ ggml_tensor * ids) const {
+ ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
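+
+ // same as build_lora_mm, but routed through the expert ids so that each
+ // token's selected experts receive their own LoRA delta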
+ for (const auto & lora : *loras) {
+ llama_adapter_lora_weight * lw = lora.first->get_weight(w);
+ if (lw == nullptr) {
+ continue;
+ }
+
+ const float alpha = lora.first->alpha;
+ const float rank = (float) lw->b->ne[0];
+ const float scale = alpha ? lora.second * alpha / rank : lora.second;
+
+ ggml_tensor * ab_cur = ggml_mul_mat_id(
+ ctx0, lw->b,
+ ggml_mul_mat_id(ctx0, lw->a, cur, ids),
+ ids
+ );
+
+ ab_cur = ggml_scale(ctx0, ab_cur, scale);
+ res = ggml_add(ctx0, res, ab_cur);
+ }
+
+ return res;
+}
+
+ggml_tensor * llm_graph_context::build_norm(
+ ggml_tensor * cur,
+ ggml_tensor * mw,
+ ggml_tensor * mb,
+ llm_norm_type type,
+ int il) const {
+ switch (type) {
+ case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break;
+ case LLM_NORM_RMS: cur = ggml_rms_norm(ctx0, cur, hparams.f_norm_rms_eps); break;
+ case LLM_NORM_GROUP:
+ {
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]);
+ cur = ggml_group_norm(ctx0, cur, hparams.n_norm_groups, hparams.f_norm_group_eps);
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[2]);
+ } break;
+ }
+
+ if (mw || mb) {
+ cb(cur, "norm", il);
+ }
+
+ if (mw) {
+ cur = ggml_mul(ctx0, cur, mw);
+ if (mb) {
+ cb(cur, "norm_w", il);
+ }
+ }
+
+ if (mb) {
+ cur = ggml_add(ctx0, cur, mb);
+ }
+
+ return cur;
+}
+
+ggml_tensor * llm_graph_context::build_ffn(
+ ggml_tensor * cur,
+ ggml_tensor * up,
+ ggml_tensor * up_b,
+ ggml_tensor * up_s,
+ ggml_tensor * gate,
+ ggml_tensor * gate_b,
+ ggml_tensor * gate_s,
+ ggml_tensor * down,
+ ggml_tensor * down_b,
+ ggml_tensor * down_s,
+ ggml_tensor * act_scales,
+ llm_ffn_op_type type_op,
+ llm_ffn_gate_type type_gate,
+ int il) const {
+ ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
+ cb(tmp, "ffn_up", il);
+
+ if (up_b) {
+ tmp = ggml_add(ctx0, tmp, up_b);
+ cb(tmp, "ffn_up_b", il);
+ }
+
+ if (up_s) {
+ tmp = ggml_mul(ctx0, tmp, up_s);
+ cb(tmp, "ffn_up_s", il);
+ }
+
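+ // LLM_FFN_SEQ feeds the up-projection output into the gate, while LLM_FFN_PAR
+ // computes the gate from the layer input and multiplies the two branches after activation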
+ if (gate) {
+ switch (type_gate) {
+ case LLM_FFN_SEQ:
+ {
+ cur = build_lora_mm(gate, tmp);
+ cb(cur, "ffn_gate", il);
+ } break;
+ case LLM_FFN_PAR:
+ {
+ cur = build_lora_mm(gate, cur);
+ cb(cur, "ffn_gate", il);
+ } break;
+ }
+
+ if (gate_b) {
+ cur = ggml_add(ctx0, cur, gate_b);
+ cb(cur, "ffn_gate_b", il);
+ }
+
+ if (gate_s) {
+ cur = ggml_mul(ctx0, cur, gate_s);
+ cb(cur, "ffn_gate_s", il);
+ }
+
+ } else {
+ cur = tmp;
+ }
+
+ switch (type_op) {
+ case LLM_FFN_SILU:
+ if (gate && type_gate == LLM_FFN_PAR) {
+ // Step35: HF clamps gate (after SiLU) and up before multiplication
+ if (arch == LLM_ARCH_STEP35 && il >= 0) {
+ const float limit = hparams.swiglu_clamp_shexp[il];
+ constexpr float eps = 1e-6f;
+ if (limit > eps) {
+ ggml_tensor * gate_act = ggml_silu(ctx0, cur);
+ cb(gate_act, "ffn_silu", il);
+ gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit);
+ cb(gate_act, "ffn_silu_clamped", il);
+
+ tmp = ggml_clamp(ctx0, tmp, -limit, limit);
+ cb(tmp, "ffn_up_clamped", il);
+
+ cur = ggml_mul(ctx0, gate_act, tmp);
+ cb(cur, "ffn_swiglu_limited", il);
+ type_gate = LLM_FFN_SEQ;
+ break;
+ }
+ }
+
+ cur = ggml_swiglu_split(ctx0, cur, tmp);
+ cb(cur, "ffn_swiglu", il);
+ type_gate = LLM_FFN_SEQ;
+ } else {
+ cur = ggml_silu(ctx0, cur);
+ cb(cur, "ffn_silu", il);
+ } break;
+ case LLM_FFN_GELU:
+ if (gate && type_gate == LLM_FFN_PAR) {
+ cur = ggml_geglu_split(ctx0, cur, tmp);
+ cb(cur, "ffn_geglu", il);
+ type_gate = LLM_FFN_SEQ;
+ } else {
+ cur = ggml_gelu(ctx0, cur);
+ cb(cur, "ffn_gelu", il);
+ if (act_scales != NULL) {
+ cur = ggml_div(ctx0, cur, act_scales);
+ cb(cur, "ffn_act", il);
+ }
+ } break;
+ case LLM_FFN_RELU:
+ if (gate && type_gate == LLM_FFN_PAR) {
+ cur = ggml_reglu_split(ctx0, cur, tmp);
+ cb(cur, "ffn_reglu", il);
+ type_gate = LLM_FFN_SEQ;
+ } else {
+ cur = ggml_relu(ctx0, cur);
+ cb(cur, "ffn_relu", il);
+ } break;
+ case LLM_FFN_RELU_SQR:
+ {
+ cur = ggml_relu(ctx0, cur);
+ cb(cur, "ffn_relu", il);
+
+ cur = ggml_sqr(ctx0, cur);
+ cb(cur, "ffn_sqr(relu)", il);
+ } break;
+ case LLM_FFN_SWIGLU:
+ {
+ cur = ggml_swiglu(ctx0, cur);
+ cb(cur, "ffn_swiglu", il);
+ } break;
+ case LLM_FFN_GEGLU:
+ {
+ cur = ggml_geglu(ctx0, cur);
+ cb(cur, "ffn_geglu", il);
+ } break;
+ case LLM_FFN_REGLU:
+ {
+ cur = ggml_reglu(ctx0, cur);
+ cb(cur, "ffn_reglu", il);
+ } break;
+ default:
+ GGML_ABORT("fatal error");
+ }
+
+ if (gate && type_gate == LLM_FFN_PAR) {
+ cur = ggml_mul(ctx0, cur, tmp);
+ cb(cur, "ffn_gate_par", il);
+ }
+
+ if (down) {
+ cur = build_lora_mm(down, cur);
+ if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
+ // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
+ ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+ }
+ }
+
+ if (down_b) {
+ cb(cur, "ffn_down", il);
+ cur = ggml_add(ctx0, cur, down_b);
+ }
+
+ if (down_s) {
+ cur = ggml_mul(ctx0, cur, down_s);
+ cb(cur, "ffn_down_s", il);
+ }
+
+ return cur;
+}
+
+ggml_tensor * llm_graph_context::build_moe_ffn(
+ ggml_tensor * cur,
+ ggml_tensor * gate_inp,
+ ggml_tensor * up_exps,
+ ggml_tensor * gate_exps,
+ ggml_tensor * down_exps,
+ ggml_tensor * exp_probs_b,
+ int64_t n_expert,
+ int64_t n_expert_used,
+ llm_ffn_op_type type_op,
+ bool norm_w,
+ bool scale_w,
+ float w_scale,
+ llama_expert_gating_func_type gating_op,
+ int il,
+ ggml_tensor * probs_in) const {
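+ // convenience overload: forwards to the full variant with all expert bias tensors set to null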
+ return build_moe_ffn(
+ cur,
+ gate_inp, /* gate_inp_b */ nullptr,
+ up_exps, /* up_exps_b */ nullptr,
+ gate_exps, /* gate_exps_b */ nullptr,
+ down_exps, /* down_exps_b */ nullptr,
+ exp_probs_b,
+ n_expert,
+ n_expert_used,
+ type_op,
+ norm_w,
+ scale_w,
+ w_scale,
+ gating_op,
+ il,
+ probs_in
+ );
+}
+
+ggml_tensor * llm_graph_context::build_moe_ffn(
+ ggml_tensor * cur,
+ ggml_tensor * gate_inp,
+ ggml_tensor * gate_inp_b,
+ ggml_tensor * up_exps,
+ ggml_tensor * up_exps_b,
+ ggml_tensor * gate_exps,
+ ggml_tensor * gate_exps_b,
+ ggml_tensor * down_exps,
+ ggml_tensor * down_exps_b,
+ ggml_tensor * exp_probs_b,
+ int64_t n_expert,
+ int64_t n_expert_used,
+ llm_ffn_op_type type_op,
+ bool norm_w,
+ bool scale_w,
+ float w_scale,
+ llama_expert_gating_func_type gating_op,
+ int il,
+ ggml_tensor * probs_in) const {
+ const int64_t n_embd = cur->ne[0];
+ const int64_t n_tokens = cur->ne[1];
+ const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
+
+ ggml_tensor * logits = nullptr;
+
+ if (probs_in == nullptr) {
+ logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
+ cb(logits, "ffn_moe_logits", il);
+ } else {
+ logits = probs_in;
+ }
+
+ if (gate_inp_b) {
+ logits = ggml_add(ctx0, logits, gate_inp_b);
+ cb(logits, "ffn_moe_logits_biased", il);
+ }
+
+ ggml_tensor * probs = nullptr;
+ switch (gating_op) {
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX:
+ {
+ probs = ggml_soft_max(ctx0, logits); // [n_expert, n_tokens]
+ } break;
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID:
+ {
+ probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
+ } break;
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT:
+ {
+ probs = logits; // [n_expert, n_tokens]
+ } break;
+ default:
+ GGML_ABORT("fatal error");
+ }
+ cb(probs, "ffn_moe_probs", il);
+
+ // add the expert selection bias, introduced in DeepSeek V3
+ // leave probs unbiased as it is later used to get the expert weights
+ ggml_tensor * selection_probs = probs;
+ if (exp_probs_b != nullptr) {
+ selection_probs = ggml_add(ctx0, probs, exp_probs_b);
+ cb(selection_probs, "ffn_moe_probs_biased", il);
+ }
+
+ // llama4 doesn't have exp_probs_b, and sigmoid is only used after top_k
+ // see: https://github.com/meta-llama/llama-models/blob/699a02993512fb36936b1b0741e13c06790bcf98/models/llama4/moe.py#L183-L198
+ if (arch == LLM_ARCH_LLAMA4) {
+ selection_probs = logits;
+ }
+
+ if (arch == LLM_ARCH_GROVEMOE) {
+ selection_probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
+ cb(selection_probs, "ffn_moe_probs_biased", il);
+ }
+
+ // select top n_group_used expert groups
+ // https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/e815299b0bcbac849fa540c768ef21845365c9eb/modeling_deepseek.py#L440-L457
+ if (hparams.n_expert_groups > 1 && n_tokens > 0) {
+ const int64_t n_exp_per_group = n_expert / hparams.n_expert_groups;
+
+ // organize experts into n_expert_groups
+ ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
+
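+ // score each expert group by the sum of its top-2 expert scores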
+ ggml_tensor * group_scores = ggml_argsort_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
+ group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens]
+
+ // get top n_group_used expert groups
+ group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens]
+ group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens]
+
+ ggml_tensor * expert_groups = ggml_argsort_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
+ cb(expert_groups, "ffn_moe_group_topk", il);
+
+ // mask out the other groups
+ selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
+ selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
+ selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
+ cb(selection_probs, "ffn_moe_probs_masked", il);
+ }
+
+ // select experts
+ ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
+ cb(selected_experts, "ffn_moe_topk", il);
+
+ if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
+ // TODO: Use scalar div instead when/if implemented
+ ggml_tensor * f_sel = ggml_cast(ctx0, selected_experts, GGML_TYPE_F32);
+ selected_experts = ggml_cast(ctx0, ggml_scale(ctx0, f_sel, 1.0f / float(hparams.n_group_experts)), GGML_TYPE_I32);
+ probs = ggml_reshape_3d(ctx0, probs, 1, hparams.n_expert, n_tokens);
+ } else {
+ probs = ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens);
+ }
+
+ ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens]
+ cb(weights, "ffn_moe_weights", il);
+
+ if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
+ weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
+ weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
+ cb(weights, "ffn_moe_weights_softmax", il);
+ }
+
+ if (norm_w) {
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
+
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
+ cb(weights_sum, "ffn_moe_weights_sum", il);
+
+ // Avoid division by zero, clamp to smallest number representable by F16
+ weights_sum = ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY);
+ cb(weights_sum, "ffn_moe_weights_sum_clamped", il);
+
+ weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
+ cb(weights, "ffn_moe_weights_norm", il);
+
+ weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
+ }
+ if (scale_w) {
+ weights = ggml_scale(ctx0, weights, w_scale);
+ cb(weights, "ffn_moe_weights_scaled", il);
+ }
+
+ // call early so that topk-moe can be used
+ ggml_build_forward_expand(gf, weights);
+
+ cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
+
+ if (weight_before_ffn) {
+ // repeat cur to [n_embd, n_expert_used, n_tokens]
+ ggml_tensor * repeated = ggml_repeat_4d(ctx0, cur, n_embd, n_expert_used, n_tokens, 1);
+ cur = ggml_mul(ctx0, repeated, weights);
+ cb(cur, "ffn_moe_weighted", il);
+ }
+
+ ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+ cb(up, "ffn_moe_up", il);
+
+ if (up_exps_b) {
+ up = ggml_add_id(ctx0, up, up_exps_b, selected_experts);
+ cb(up, "ffn_moe_up_biased", il);
+ }
+
+ ggml_tensor * experts = nullptr;
+ if (gate_exps) {
+ cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+ cb(cur, "ffn_moe_gate", il);
+ } else {
+ cur = up;
+ }
+
+ if (gate_exps_b) {
+ cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
+ cb(cur, "ffn_moe_gate_biased", il);
+ }
+
+ switch (type_op) {
+ case LLM_FFN_SILU:
+ if (gate_exps) {
+ // Step35: per-layer clamp for routed experts
+ if (arch == LLM_ARCH_STEP35 && il >= 0) {
+ const float limit = hparams.swiglu_clamp_exp[il];
+ constexpr float eps = 1e-6f;
+ if (limit > eps) {
+ ggml_tensor * gate_act = ggml_silu(ctx0, cur);
+ cb(gate_act, "ffn_moe_silu", il);
+ gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit);
+ cb(gate_act, "ffn_moe_silu_clamped", il);
+
+ up = ggml_clamp(ctx0, up, -limit, limit);
+ cb(up, "ffn_moe_up_clamped", il);
+
+ cur = ggml_mul(ctx0, gate_act, up);
+ cb(cur, "ffn_moe_swiglu_limited", il);
+ break;
+ }
+ }
+
+ cur = ggml_swiglu_split(ctx0, cur, up);
+ cb(cur, "ffn_moe_swiglu", il);
+ } else {
+ cur = ggml_silu(ctx0, cur);
+ cb(cur, "ffn_moe_silu", il);
+ } break;
+ case LLM_FFN_GELU:
+ if (gate_exps) {
+ cur = ggml_geglu_split(ctx0, cur, up);
+ cb(cur, "ffn_moe_geglu", il);
+ } else {
+ cur = ggml_gelu(ctx0, cur);
+ cb(cur, "ffn_moe_gelu", il);
+ } break;
+ case LLM_FFN_SWIGLU_OAI_MOE:
+ {
+ // TODO: move to hparams?
+ constexpr float alpha = 1.702f;
+ constexpr float limit = 7.0f;
+ cur = ggml_swiglu_oai(ctx0, cur, up, alpha, limit);
+ cb(cur, "ffn_moe_swiglu_oai", il);
+ } break;
+ case LLM_FFN_RELU:
+ if (gate_exps) {
+ cur = ggml_reglu_split(ctx0, cur, up);
+ cb(cur, "ffn_moe_reglu", il);
+ } else {
+ cur = ggml_relu(ctx0, cur);
+ cb(cur, "ffn_moe_relu", il);
+ } break;
+ case LLM_FFN_RELU_SQR:
+ if (gate_exps) {
+ // TODO: add support for gated squared relu
+ GGML_ABORT("fatal error: gated squared relu not implemented");
+ } else {
+ cur = ggml_relu(ctx0, cur);
+ cur = ggml_sqr(ctx0, cur);
+ cb(cur, "ffn_moe_relu_sqr", il);
+ } break;
+ default:
+ GGML_ABORT("fatal error");
+ }
+
+ experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
+ cb(experts, "ffn_moe_down", il);
+
+ if (down_exps_b) {
+ experts = ggml_add_id(ctx0, experts, down_exps_b, selected_experts);
+ cb(experts, "ffn_moe_down_biased", il);
+ }
+
+ if (!weight_before_ffn) {
+ experts = ggml_mul(ctx0, experts, weights);
+ cb(cur, "ffn_moe_weighted", il);
+ }
+
+ ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
+
+ assert(n_expert_used > 0);
+
+ // order the views before the adds
+ for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
+ cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
+
+ ggml_build_forward_expand(gf, cur_experts[i]);
+ }
+
+ // aggregate experts
+ // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+ // to avoid potentially a large number of add nodes during warmup
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14753
+ ggml_tensor * moe_out = cur_experts[0];
+
+ for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
+ moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
+ }
+
+ if (hparams.n_expert_used == 1) {
+ // avoid returning a non-contiguous tensor
+ moe_out = ggml_cont(ctx0, moe_out);
+ }
+
+ cb(moe_out, "ffn_moe_out", il);
+
+ return moe_out;
+}
+
+// input embeddings with optional lora
+ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
+ const int64_t n_embd_inp = hparams.n_embd_inp();
+ const int64_t n_embd = hparams.n_embd;
+
+ assert(n_embd_inp >= n_embd);
+
+ auto inp = std::make_unique<llm_graph_input_embd>(n_embd_inp);
+
+ inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
+ cb(inp->tokens, "inp_tokens", -1);
+ ggml_set_input(inp->tokens);
+ res->t_inp_tokens = inp->tokens;
+
+ inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_inp, ubatch.n_tokens);
+ cb(inp->embd, "inp_embd", -1);
+ ggml_set_input(inp->embd);
+
+ // select one of the 2 inputs, based on the batch contents
+ // ref: https://github.com/ggml-org/llama.cpp/pull/18550
+ std::array<ggml_tensor *, 2> inps;
+
+ // token embeddings path (ubatch.token != nullptr)
+ {
+ auto & cur = inps[0];
+
+ cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
+
+ // apply lora for embedding tokens if needed
+ for (const auto & lora : *loras) {
+ llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd);
+ if (lw == nullptr) {
+ continue;
+ }
+
+ const float adapter_scale = lora.second;
+ const float scale = lw->get_scale(lora.first->alpha, adapter_scale);
+
+ ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat(
+ ctx0, lw->b, // non-transposed lora_b
+ ggml_get_rows(ctx0, lw->a, inp->tokens)
+ ), scale);
+
+ cur = ggml_add(ctx0, cur, inpL_delta);
+ }
+
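+ // zero-pad the token embeddings up to n_embd_inp so that both input paths
+ // produce tensors of identical shape (asserted below)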
+ if (n_embd_inp != n_embd) {
+ cur = ggml_pad(ctx0, cur, hparams.n_embd_inp() - n_embd, 0, 0, 0);
+ }
+ }
+
+ // vector embeddings path (ubatch.embd != nullptr)
+ {
+ auto & cur = inps[1];
+
+ cur = inp->embd;
+ }
+
+ assert(ggml_are_same_shape (inps[0], inps[1]));
+ assert(ggml_are_same_stride(inps[0], inps[1]));
+
+ ggml_tensor * cur = ggml_build_forward_select(gf, inps.data(), inps.size(), ubatch.token ? 0 : 1);
+
+ if (n_embd_inp != n_embd) {
+ cur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0);
+ }
+
+ res->t_inp_embd = cur;
+
+ // For Granite architecture
+ if (hparams.f_embedding_scale != 0.0f) {
+ cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);
+ }
+
+ cb(cur, "embd", -1);
+
+ res->add_input(std::move(inp));
+
+ // make sure the produced embeddings are immediately materialized in the ggml graph
+ // ref: https://github.com/ggml-org/llama.cpp/pull/18599
+ ggml_build_forward_expand(gf, cur);
+
+ return cur;
+}
+
+ggml_tensor * llm_graph_context::build_inp_pos() const {
+ auto inp = std::make_unique<llm_graph_input_pos>(hparams.n_pos_per_embd());
+
+ auto & cur = inp->pos;
+
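+ // models that use more than one position per embedding (e.g. M-RoPE) allocate
+ // n_pos_per_embd position entries per token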
+ cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd());
+ ggml_set_input(cur);
+
+ res->add_input(std::move(inp));
+
+ return cur;
+}
+
+ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
+ auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset);
+
+ auto & cur = inp->attn_scale;
+
+ // this needs to be 1x1xN for broadcasting
+ cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
+ ggml_set_input(cur);
+
+ res->add_input(std::move(inp));
+
+ return cur;
+}
+
+ggml_tensor * llm_graph_context::build_inp_out_ids() const {
+ // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
+ // but this would make the graph topology depend on the number of output tokens, which can interfere with
+ // features that require constant topology such as pipeline parallelism
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
+ //if (n_outputs < n_tokens) {
+ // return nullptr;
+ //}
+
+ auto inp = std::make_unique<llm_graph_input_out_ids>(hparams, cparams, n_outputs);
+
+ auto & cur = inp->out_ids;
+
+ cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
+ ggml_set_input(cur);
+
+ res->add_input(std::move(inp));
+
+ return cur;
+}
+
+ggml_tensor * llm_graph_context::build_inp_mean() const {
+ auto inp = std::make_unique<llm_graph_input_mean>(cparams);
+
+ auto & cur = inp->mean;
+
+ cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, ubatch.n_seqs_unq);
+ ggml_set_input(cur);
+
+ res->add_input(std::move(inp));
+
+ return cur;
+}
+
+ggml_tensor * llm_graph_context::build_inp_cls() const {
+ auto inp = std::make_unique<llm_graph_input_cls>(cparams, arch);
+
+ auto & cur = inp->cls;
+
+ cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_seqs_unq);
+ ggml_set_input(cur);
+
+ res->add_input(std::move(inp));
+
+ return cur;
+}
+
+ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
+ auto inp = std::make_unique<llm_graph_input_cross_embd>(cross);
+
+ auto & cur = inp->cross_embd;
+
+ // if we have the output embeddings from the encoder, use them directly
+ // TODO: needs more work to be correct, for now just use the tensor shape
+ //if (cross->t_embd) {
+ // cur = ggml_view_tensor(ctx0, cross->t_embd);
+
+ // return cur;
+ //}
+
+ const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
+ const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
+
+ cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
+ ggml_set_input(cur);
+
+ res->add_input(std::move(inp));
+
+ return cur;
+}
+
+ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const {
+ auto inp = std::make_unique<llm_graph_input_pos_bucket>(hparams);
+
+ auto & cur = inp->pos_bucket;
+
+ cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens);
+ ggml_set_input(cur);
+
+ res->add_input(std::move(inp));
+
+ return cur;
+}
+
+ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
+ const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
+
+ auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, mctx_cur);
+
+ const auto n_kv = mctx_cur->get_n_kv();
+
+ auto & cur = inp->pos_bucket;
+
+ cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens);
+ ggml_set_input(cur);
+
+ res->add_input(std::move(inp));
+
+ return cur;
+}
+
+ggml_tensor * llm_graph_context::build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const {
+ ggml_tensor * pos_bucket_1d = ggml_reshape_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1]);
+ cb(pos_bucket_1d, "pos_bucket_1d", -1);
+
+ ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d);
+
+ pos_bias = ggml_reshape_3d(ctx0, pos_bias, pos_bias->ne[0], pos_bucket->ne[0], pos_bucket->ne[1]);
+ pos_bias = ggml_permute (ctx0, pos_bias, 2, 0, 1, 3);
+ pos_bias = ggml_cont (ctx0, pos_bias);
+
+ cb(pos_bias, "pos_bias", -1);
+
+ return pos_bias;
+}
+
+ggml_tensor * llm_graph_context::build_attn_mha(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * kq_b,
+ ggml_tensor * kq_mask,
+ ggml_tensor * sinks,
+ ggml_tensor * v_mla,
+ float kq_scale,
+ int il) const {
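+ // a transposed V cache is detected via its strides: nb[1] > nb[2]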
+ const bool v_trans = v->nb[1] > v->nb[2];
+
+ // split the batch into streams if needed
+ const auto n_stream = k->ne[3];
+
+ q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);
+
+ q = ggml_permute(ctx0, q, 0, 2, 1, 3);
+ k = ggml_permute(ctx0, k, 0, 2, 1, 3);
+ v = ggml_permute(ctx0, v, 0, 2, 1, 3);
+
+ ggml_tensor * cur;
+
+ if (cparams.flash_attn && kq_b == nullptr) {
+ GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");
+
+ if (v_trans) {
+ v = ggml_transpose(ctx0, v);
+ }
+
+ // this can happen when KV cache is not used (e.g. an embedding model with non-causal attn)
+ if (k->type == GGML_TYPE_F32) {
+ k = ggml_cast(ctx0, k, GGML_TYPE_F16);
+ }
+
+ if (v->type == GGML_TYPE_F32) {
+ v = ggml_cast(ctx0, v, GGML_TYPE_F16);
+ }
+
+ cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
+ hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
+ cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
+
+ ggml_flash_attn_ext_add_sinks(cur, sinks);
+ ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
+
+ if (v_mla) {
+#if 0
+ // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
+ // However, the code is optimized for dimensions 0 and 1 being large, so this is inefficient.
+ cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
+ cur = ggml_mul_mat(ctx0, v_mla, cur);
+#else
+ // It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1.
+ // The permutations are noops and only change how the tensor data is interpreted.
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+ cur = ggml_mul_mat(ctx0, v_mla, cur);
+ cb(cur, "fattn_mla", il);
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+ cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
+#endif
+ }
+
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
+ } else {
+ ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+ cb(kq, "kq", il);
+
+ // note: this op tends to require high floating point range
+ // while for some models F16 is enough, for others it is not, so we default to F32 here
+ ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+
+ if (arch == LLM_ARCH_GROK) {
+ // scale by the attention output multiplier, then soft-cap the logits with
+ // kq = 30 * tanh(kq / 30) before the softmax below
+
+ kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, hparams.f_attn_out_scale / hparams.f_attn_logit_softcapping));
+ cb(kq, "kq_tanh", il);
+ kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
+ cb(kq, "kq_scaled", il);
+ }
+
+ if (hparams.attn_soft_cap) {
+ kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping);
+ cb(kq, "kq_scaled_1", il);
+ kq = ggml_tanh (ctx0, kq);
+ cb(kq, "kq_tanh", il);
+ kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
+ cb(kq, "kq_scaled_2", il);
+ }
+
+ if (kq_b) {
+ kq = ggml_add(ctx0, kq, kq_b);
+ cb(kq, "kq_plus_kq_b", il);
+ }
+
+ kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+ ggml_soft_max_add_sinks(kq, sinks);
+ cb(kq, "kq_soft_max", il);
+
+ if (!v_trans) {
+ // note: avoid this branch
+ v = ggml_cont(ctx0, ggml_transpose(ctx0, v));
+ cb(v, "v_cont", il);
+ }
+
+ ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
+ cb(kqv, "kqv", il);
+
+ // for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
+ if (v_mla) {
+ kqv = ggml_mul_mat(ctx0, v_mla, kqv);
+ cb(kqv, "kqv_mla", il);
+ }
+
+ cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+
+ // recombine streams
+ cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
+
+ if (!cparams.offload_kqv) {
+ // all nodes between the KV store and the attention output are run on the CPU
+ ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu);
+ }
+ }
+
+ ggml_build_forward_expand(gf, cur);
+
+ return cur;
+}
+
+llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() const {
+ auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
+
+ // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
+ inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
+ ggml_set_input(inp->self_kq_mask);
+
+ inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
+ ggml_set_input(inp->self_kq_mask_swa);
+
+ inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+ } else {
+ inp->self_kq_mask_swa = nullptr;
+ inp->self_kq_mask_swa_cnv = nullptr;
+ }
+
+ return (llm_graph_input_attn_no_cache *) res->add_input(std::move(inp));
+}
+
+ggml_tensor * llm_graph_context::build_attn(
+ llm_graph_input_attn_no_cache * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur,
+ ggml_tensor * k_cur,
+ ggml_tensor * v_cur,
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks,
+ ggml_tensor * v_mla,
+ float kq_scale,
+ int il) const {
+ GGML_UNUSED(n_tokens);
+
+ // these nodes are added to the graph together so that they are not reordered
+ // by doing so, the number of splits in the graph is reduced
+ ggml_build_forward_expand(gf, q_cur);
+ ggml_build_forward_expand(gf, k_cur);
+ ggml_build_forward_expand(gf, v_cur);
+
+ const bool is_swa = hparams.is_swa(il);
+
+ const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
+
+ // [TAG_NO_CACHE_PAD]
+ // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
+ // but it might not be worth it: https://github.com/ggml-org/llama.cpp/pull/15636
+ //assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
+
+ ggml_tensor * q = q_cur;
+ ggml_tensor * k = k_cur;
+ ggml_tensor * v = v_cur;
+
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
+ cb(cur, "kqv_out", il);
+
+ if (wo) {
+ cur = build_lora_mm(wo, cur);
+ }
+
+ if (wo_b) {
+ //cb(cur, "kqv_wo", il);
+ cur = ggml_add(ctx0, cur, wo_b);
+ }
+
+ return cur;
+}
+
+static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
+ ggml_context * ctx0,
+ const llama_ubatch & ubatch,
+ const llama_hparams & hparams,
+ const llama_cparams & cparams,
+ const llama_kv_cache_context * mctx_cur) {
+
+ auto inp = std::make_unique<llm_graph_input_attn_kv>(hparams, cparams, mctx_cur);
+
+ {
+ GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
+
+ const auto n_kv = mctx_cur->get_n_kv();
+ const auto n_tokens = ubatch.n_tokens;
+ const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
+
+ inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
+ inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
+
+ inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ ggml_set_input(inp->self_kq_mask);
+
+ inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+ }
+
+ return inp;
+}
+
+llm_graph_input_attn_kv * llm_graph_context::build_attn_inp_kv() const {
+ const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
+
+ auto inp = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
+
+ return (llm_graph_input_attn_kv *) res->add_input(std::move(inp));
+}
+
+ggml_tensor * llm_graph_context::build_attn(
+ llm_graph_input_attn_kv * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur,
+ ggml_tensor * k_cur,
+ ggml_tensor * v_cur,
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks,
+ ggml_tensor * v_mla, // TODO: remove
+ float kq_scale,
+ int il) const {
+ GGML_ASSERT(v_mla == nullptr);
+
+ // these nodes are added to the graph together so that they are not reordered
+ // by doing so, the number of splits in the graph is reduced
+ // expand k later to enable rope fusion which directly writes into k-v cache
+ ggml_build_forward_expand(gf, q_cur);
+ ggml_build_forward_expand(gf, v_cur);
+ ggml_build_forward_expand(gf, k_cur);
+
+ const auto * mctx_cur = inp->mctx;
+
+ // store to KV cache
+ {
+ const auto & k_idxs = inp->get_k_idxs();
+ const auto & v_idxs = inp->get_v_idxs();
+
+ ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
+ ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, v_idxs, il));
+ }
+
+ const auto & kq_mask = inp->get_kq_mask();
+
+ ggml_tensor * q = q_cur;
+ ggml_tensor * k = mctx_cur->get_k(ctx0, il);
+ ggml_tensor * v = mctx_cur->get_v(ctx0, il);
+
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
+ cb(cur, "kqv_out", il);
+
+ if (wo) {
+ cur = build_lora_mm(wo, cur);
+ if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
+ // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
+ ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+ }
+ }
+
+ if (wo_b) {
+ cur = ggml_add(ctx0, cur, wo_b);
+ }
+
+ return cur;
+}
+
+static std::unique_ptr<llm_graph_input_attn_k> build_attn_inp_k_impl(
+ ggml_context * ctx0,
+ const llama_ubatch & ubatch,
+ const llama_hparams & hparams,
+ const llama_cparams & cparams,
+ const llama_kv_cache_context * mctx_cur) {
+
+ auto inp = std::make_unique<llm_graph_input_attn_k>(hparams, cparams, mctx_cur);
+
+ {
+ GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
+
+ const auto n_kv = mctx_cur->get_n_kv();
+ const auto n_tokens = ubatch.n_tokens;
+ const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
+
+ inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
+
+ inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ ggml_set_input(inp->self_kq_mask);
+
+ inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+ }
+
+ return inp;
+}
+
+llm_graph_input_attn_k * llm_graph_context::build_attn_inp_k() const {
+ const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
+
+ auto inp = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
+
+ return (llm_graph_input_attn_k *) res->add_input(std::move(inp));
+}
+
+ggml_tensor * llm_graph_context::build_attn(
+ llm_graph_input_attn_k * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur,
+ ggml_tensor * k_cur,
+ ggml_tensor * v_cur,
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks,
+ ggml_tensor * v_mla,
+ float kq_scale,
+ int il) const {
+ // these nodes are added to the graph together so that they are not reordered
+ // by doing so, the number of splits in the graph is reduced
+ // expand k later to enable rope fusion which directly writes into k-v cache
+ ggml_build_forward_expand(gf, q_cur);
+ ggml_build_forward_expand(gf, v_cur);
+ ggml_build_forward_expand(gf, k_cur);
+
+ const auto * mctx_cur = inp->mctx;
+
+ // store to KV cache
+ {
+ const auto & k_idxs = inp->get_k_idxs();
+
+ ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
+ }
+
+ const auto & kq_mask = inp->get_kq_mask();
+
+ ggml_tensor * q = q_cur;
+ ggml_tensor * k = mctx_cur->get_k(ctx0, il);
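+ // there is no separate V cache here: V is read as a view into the K cache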
+ ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);
+
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
+ cb(cur, "kqv_out", il);
+
+ if (wo) {
+ cur = build_lora_mm(wo, cur);
+ if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
+ // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
+ ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+ }
+ }
+
+ if (wo_b) {
+ cur = ggml_add(ctx0, cur, wo_b);
+ }
+
+ return cur;
+}
+
+ggml_tensor * llm_graph_context::build_attn(
+ llm_graph_input_attn_kv_iswa * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur,
+ ggml_tensor * k_cur,
+ ggml_tensor * v_cur,
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks,
+ ggml_tensor * v_mla,
+ float kq_scale,
+ int il) const {
+ // these nodes are added to the graph together so that they are not reordered
+ // by doing so, the number of splits in the graph is reduced
+ ggml_build_forward_expand(gf, q_cur);
+
+ if (k_cur) {
+ ggml_build_forward_expand(gf, k_cur);
+ }
+
+ if (v_cur) {
+ ggml_build_forward_expand(gf, v_cur);
+ }
+
+ const auto * mctx_iswa = inp->mctx;
+
+ const bool is_swa = hparams.is_swa(il);
+
+ const auto * mctx_cur = is_swa ? mctx_iswa->get_swa() : mctx_iswa->get_base();
+
+ // optionally store to KV cache
+ if (k_cur) {
+ const auto & k_idxs = is_swa ? inp->get_k_idxs_swa() : inp->get_k_idxs();
+
+ ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
+ }
+
+ if (v_cur) {
+ const auto & v_idxs = is_swa ? inp->get_v_idxs_swa() : inp->get_v_idxs();
+
+ ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, v_idxs, il));
+ }
+
+ const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
+
+ ggml_tensor * q = q_cur;
+ ggml_tensor * k = mctx_cur->get_k(ctx0, il);
+ ggml_tensor * v = mctx_cur->get_v(ctx0, il);
+
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
+ cb(cur, "kqv_out", il);
+
+ if (wo) {
+ cur = build_lora_mm(wo, cur);
+ }
+
+ if (wo_b) {
+ //cb(cur, "kqv_wo", il);
+ cur = ggml_add(ctx0, cur, wo_b);
+ }
+
+ return cur;
+}
+
+llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
+ auto inp = std::make_unique<llm_graph_input_attn_cross>(cross);
+
+ const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
+
+ inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
+ ggml_set_input(inp->cross_kq_mask);
+
+ inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
+
+ return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
+}
+
+ggml_tensor * llm_graph_context::build_attn(
+ llm_graph_input_attn_cross * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur,
+ ggml_tensor * k_cur,
+ ggml_tensor * v_cur,
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks,
+ ggml_tensor * v_mla,
+ float kq_scale,
+ int il) const {
+ // these nodes are added to the graph together so that they are not reordered
+ // by doing so, the number of splits in the graph is reduced
+ ggml_build_forward_expand(gf, q_cur);
+ ggml_build_forward_expand(gf, k_cur);
+ ggml_build_forward_expand(gf, v_cur);
+
+ const auto & kq_mask = inp->get_kq_mask_cross();
+
+ ggml_tensor * q = q_cur;
+ ggml_tensor * k = k_cur;
+ ggml_tensor * v = v_cur;
+
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
+ cb(cur, "kqv_out", il);
+
+ if (wo) {
+ cur = build_lora_mm(wo, cur);
+ }
+
+ if (wo_b) {
+ //cb(cur, "kqv_wo", il);
+ cur = ggml_add(ctx0, cur, wo_b);
+ }
+
+ return cur;
+}
+
+// TODO: maybe separate the inner implementation into a separate function
+// like with the non-sliding window equivalent
+// once sliding-window hybrid caches are a thing.
+llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const {
+ const auto * mctx_cur = static_cast<const llama_kv_cache_iswa_context *>(mctx);
+
+ auto inp = std::make_unique<llm_graph_input_attn_kv_iswa>(hparams, cparams, mctx_cur);
+
+ const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
+
+ {
+ const auto n_kv = mctx_cur->get_base()->get_n_kv();
+
+ inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
+ inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
+
+ inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ ggml_set_input(inp->self_kq_mask);
+ ggml_set_name(inp->self_kq_mask, "self_kq_mask");
+
+ inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+ ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
+ }
+
+ {
+ GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache for non-SWA");
+
+ const auto n_kv = mctx_cur->get_swa()->get_n_kv();
+
+ inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
+ inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
+
+ inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ ggml_set_input(inp->self_kq_mask_swa);
+ ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");
+
+ inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+ ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
+ }
+
+ return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
+}
+
+ggml_tensor * llm_graph_context::build_rs(
+ ggml_tensor * s,
+ ggml_tensor * state_copy_main,
+ ggml_tensor * state_copy_extra,
+ int32_t state_size,
+ int32_t n_seqs,
+ uint32_t n_rs,
+ uint32_t rs_head,
+ uint32_t rs_size,
+ int32_t rs_zero,
+ const llm_graph_get_rows_fn & get_state_rows) const {
+
+ ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, rs_size);
+
+ // Clear a single state which will then be copied to the other cleared states.
+ // Note that this is a no-op when the view is zero-sized.
+ ggml_tensor * state_zero = ggml_view_1d(ctx0, states, state_size*(rs_zero >= 0), rs_zero*states->nb[1]*(rs_zero >= 0));
+ ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));
+
+ // copy states
+ // NOTE: assuming the copy destinations are ALL contained between rs_head and rs_head + n_rs
+ // {state_size, rs_size} -> {state_size, n_seqs}
+ ggml_tensor * output_states = get_state_rows(ctx0, states, state_copy_main);
+ ggml_build_forward_expand(gf, output_states);
+
+ // copy extra states which won't be changed further (between n_seqs and n_rs)
+ ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra);
+ ggml_build_forward_expand(gf,
+ ggml_cpy(ctx0,
+ states_extra,
+ ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s))));
+
+ return output_states;
+}
+
+static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
+ ggml_context * ctx0,
+ const llama_ubatch & ubatch,
+ const llama_memory_recurrent_context * mctx_cur) {
+
+ auto inp = std::make_unique<llm_graph_input_rs>(mctx_cur);
+
+ const int64_t n_rs = mctx_cur->get_n_rs();
+ const int64_t n_seqs = ubatch.n_seqs;
+
+ inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs);
+ ggml_set_input(inp->s_copy);
+
+ inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
+ inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
+
+ inp->head = mctx_cur->get_head();
+ inp->rs_z = mctx_cur->get_rs_z();
+
+ return inp;
+}
+
+llm_graph_input_rs * llm_graph_context::build_rs_inp() const {
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
+
+ auto inp = build_rs_inp_impl(ctx0, ubatch, mctx_cur);
+
+ return (llm_graph_input_rs *) res->add_input(std::move(inp));
+}
+
+ggml_tensor * llm_graph_context::build_rs(
+ llm_graph_input_rs * inp,
+ ggml_tensor * s,
+ int32_t state_size,
+ int32_t n_seqs,
+ const llm_graph_get_rows_fn & get_state_rows) const {
+ const auto * kv_state = inp->mctx;
+
+ return build_rs(s, inp->s_copy_main, inp->s_copy_extra, state_size, n_seqs,
+ kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(),
+ get_state_rows);
+}
+
+ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
+ llm_graph_input_rs * inp,
+ const llama_ubatch & ubatch,
+ int il) const {
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
+
+ const auto token_shift_count = hparams.token_shift_count;
+
+ const int64_t n_seqs = ubatch.n_seqs;
+
+ ggml_tensor * token_shift_all = mctx_cur->get_r_l(il);
+
+ ggml_tensor * token_shift = build_rs(
+ inp, token_shift_all,
+ hparams.n_embd_r(), n_seqs);
+
+ token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs);
+
+ return token_shift;
+}
+
+ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
+ ggml_tensor * token_shift,
+ const llama_ubatch & ubatch,
+ int il) const {
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
+
+ const auto token_shift_count = hparams.token_shift_count;
+ const auto n_embd = hparams.n_embd;
+
+ const int64_t n_seqs = ubatch.n_seqs;
+
+ const auto kv_head = mctx_cur->get_head();
+
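+ // write the token-shift states for this ubatch back into the recurrent cache at the current head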
+ return ggml_cpy(
+ ctx0,
+ ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0),
+ ggml_view_1d(ctx0, mctx_cur->get_r_l(il), hparams.n_embd_r()*n_seqs, hparams.n_embd_r()*kv_head*ggml_element_size(mctx_cur->get_r_l(il)))
+ );
+}
+
+llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
+ const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
+
+ auto inp_rs = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr());
+ auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
+
+ auto inp = std::make_unique<llm_graph_input_mem_hybrid>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
+
+ return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
+}
+
+llm_graph_input_mem_hybrid_k * llm_graph_context::build_inp_mem_hybrid_k() const {
+ const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
+
+ auto inp_rs = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr());
+ auto inp_attn = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
+
+ auto inp = std::make_unique<llm_graph_input_mem_hybrid_k>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
+
+ return (llm_graph_input_mem_hybrid_k *) res->add_input(std::move(inp));
+}
+
+llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa() const {
+ const auto * mctx_cur = static_cast<const llama_memory_hybrid_iswa_context *>(mctx);
+
+ auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
+
+ // build iswa attention input
+ const auto * attn_ctx = mctx_cur->get_attn();
+
+ auto inp_attn = std::make_unique<llm_graph_input_attn_kv_iswa>(hparams, cparams, attn_ctx);
+
+ const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
+
+ {
+ const auto n_kv = attn_ctx->get_base()->get_n_kv();
+
+ inp_attn->self_k_idxs = attn_ctx->get_base()->build_input_k_idxs(ctx0, ubatch);
+ inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch);
+
+ inp_attn->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ ggml_set_input(inp_attn->self_kq_mask);
+
+ inp_attn->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask;
+ }
+
+ {
+ const auto n_kv = attn_ctx->get_swa()->get_n_kv();
+
+ inp_attn->self_k_idxs_swa = attn_ctx->get_swa()->build_input_k_idxs(ctx0, ubatch);
+ inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch);
+
+ inp_attn->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ ggml_set_input(inp_attn->self_kq_mask_swa);
+
+ inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa;
+ }
+
+ auto inp = std::make_unique<llm_graph_input_mem_hybrid_iswa>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
+
+ return (llm_graph_input_mem_hybrid_iswa *) res->add_input(std::move(inp));
+}
+
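+ // apply the optional output dense layers to the pooled embeddings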
+void llm_graph_context::build_dense_out(
+ ggml_tensor * dense_2,
+ ggml_tensor * dense_3) const {
+ if (!cparams.embeddings || !(dense_2 || dense_3)) {
+ return;
+ }
+ ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
+ GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
+
+ if (dense_2) {
+ cur = ggml_mul_mat(ctx0, dense_2, cur);
+ }
+ if (dense_3) {
+ cur = ggml_mul_mat(ctx0, dense_3, cur);
+ }
+ cb(cur, "result_embd_pooled", -1);
+ res->t_embd_pooled = cur;
+ ggml_build_forward_expand(gf, cur);
+}
+
+
+void llm_graph_context::build_pooling(
+ ggml_tensor * cls,
+ ggml_tensor * cls_b,
+ ggml_tensor * cls_out,
+ ggml_tensor * cls_out_b) const {
+ if (!cparams.embeddings) {
+ return;
+ }
+
+ ggml_tensor * inp = res->t_embd;
+
+ //// find result_norm tensor for input
+ //for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+ // inp = ggml_graph_node(gf, i);
+ // if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+ // break;
+ // }
+
+ // inp = nullptr;
+ //}
+
+ GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
+
+ ggml_tensor * cur;
+
+ switch (pooling_type) {
+ case LLAMA_POOLING_TYPE_NONE:
+ {
+ cur = inp;
+ } break;
+ case LLAMA_POOLING_TYPE_MEAN:
+ {
+ ggml_tensor * inp_mean = build_inp_mean();
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
+ } break;
+ case LLAMA_POOLING_TYPE_CLS:
+ case LLAMA_POOLING_TYPE_LAST:
+ {
+ ggml_tensor * inp_cls = build_inp_cls();
+ cur = ggml_get_rows(ctx0, inp, inp_cls);
+ } break;
+ case LLAMA_POOLING_TYPE_RANK:
+ {
+ ggml_tensor * inp_cls = build_inp_cls();
+ cur = ggml_get_rows(ctx0, inp, inp_cls);
+
+ // classification head
+ // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
+ if (cls) {
+ cur = ggml_mul_mat(ctx0, cls, cur);
+ if (cls_b) {
+ cur = ggml_add(ctx0, cur, cls_b);
+ }
+ cur = ggml_tanh(ctx0, cur);
+ }
+
+ // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+ // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+ // Single layer classification head (direct projection)
+ // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
+ if (cls_out) {
+ cur = ggml_mul_mat(ctx0, cls_out, cur);
+ if (cls_out_b) {
+ cur = ggml_add(ctx0, cur, cls_out_b);
+ }
+ }
+
+ // softmax for qwen3 reranker
+ if (arch == LLM_ARCH_QWEN3) {
+ cur = ggml_soft_max(ctx0, cur);
+ }
+ } break;
+ default:
+ {
+ GGML_ABORT("unknown pooling type");
+ }
+ }
+
+ cb(cur, "result_embd_pooled", -1);
+ res->t_embd_pooled = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+void llm_graph_context::build_sampling() const {
+ if (samplers.empty() || !res->t_logits) {
+ return;
+ }
+
+ std::array<ggml_tensor *, 2> outs;
+ outs[0] = res->t_logits;
+
+ auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
+ res->add_input(std::move(inp_sampling));
+
+ std::map<llama_seq_id, int32_t> seq_to_logit_row;
+ int32_t logit_row_idx = 0;
+
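+ // map each sequence that produced an output token to its row in the logits tensor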
+ for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
+ if (ubatch.output[i]) {
+ llama_seq_id seq_id = ubatch.seq_id[i][0];
+ seq_to_logit_row[seq_id] = logit_row_idx;
+ logit_row_idx++;
+ }
+ }
+
+ // res->t_logits contains the logits for every token that was flagged for output (logits=1 or output=1)
+ GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
+
+ // add a dummy row of logits
+ // this trick makes the graph static, regardless of which samplers are activated
+ // this is important in order to minimize graph reallocations
+ ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
+
+ for (const auto & [seq_id, sampler] : samplers) {
+ const auto it = seq_to_logit_row.find(seq_id);
+
+ // inactive samplers always work on the first row
+ const auto row_idx = it != seq_to_logit_row.end() ? it->second : 0;
+ const int i_out = it != seq_to_logit_row.end() ? 1 : 0;
+
+ ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
+ ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
+
+ struct llama_sampler_data data = {
+ /*.logits =*/ logits_seq,
+ /*.probs =*/ nullptr,
+ /*.sampled =*/ nullptr,
+ /*.candidates =*/ nullptr,
+ };
+
+ assert(sampler->iface->backend_apply);
+ sampler->iface->backend_apply(sampler, ctx0, gf, &data);
+
+ if (data.sampled != nullptr) {
+ res->t_sampled[seq_id] = data.sampled;
+ outs[1] = data.sampled;
+ ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
+ }
+
+ if (data.probs != nullptr) {
+ res->t_sampled_probs[seq_id] = data.probs;
+ outs[1] = data.probs;
+ ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
+ }
+
+ if (data.logits != nullptr) {
+ res->t_sampled_logits[seq_id] = data.logits;
+ outs[1] = data.logits;
+ ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
+ }
+
+ if (data.candidates != nullptr) {
+ res->t_candidates[seq_id] = data.candidates;
+ outs[1] = data.candidates;
+ ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
+ }
+ }
+
+ // TODO: Call llama_sampler_accept_ggml after all samplers have been applied.
+ /*
+ for (const auto & [seq_id, sampler] : samplers) {
+ if (auto it = res->t_sampled.find(seq_id); it != res->t_sampled.end()) {
+ ggml_tensor * selected_token = it->second;
+ if (selected_token != nullptr) {
+ llama_sampler_accept_ggml(sampler, ctx0, gf, selected_token);
+ }
+ }
+ }
+ */
+}
+
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
+ // TODO move to hparams if a T5 variant appears that uses a different value
+ const int64_t max_distance = 128;
+
+ if (bidirectional) {
+ n_buckets >>= 1;
+ }
+
+ const int64_t max_exact = n_buckets >> 1;
+
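+ // e.g. with n_buckets = 32 (unidirectional): relative distances 0..15 each get
+ // their own bucket, larger distances are binned logarithmically up to max_distance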
+ int32_t relative_position = x - y;
+ int32_t relative_bucket = 0;
+
+ if (bidirectional) {
+ relative_bucket += (relative_position > 0) * n_buckets;
+ relative_position = std::abs(relative_position);
+ } else {
+ relative_position = -std::min<int32_t>(relative_position, 0);
+ }
+
+ int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
+ relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
+ relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
+
+ return relative_bucket;
+}
diff --git a/llama.cpp/src/llama-graph.h b/llama.cpp/src/llama-graph.h
new file mode 100644
index 0000000..1d69ff1
--- /dev/null
+++ b/llama.cpp/src/llama-graph.h
@@ -0,0 +1,1021 @@
+#pragma once
+
+#include "llama-arch.h"
+#include "llama-batch.h"
+#include "llama-hparams.h"
+#include "llama-adapter.h"
+
+#include <cstdint>
+#include <vector>
+#include <memory>
+#include <set>
+#include <functional>
+#include <map>
+
+struct ggml_cgraph;
+struct ggml_context;
+struct ggml_tensor;
+
+struct llama_cparams;
+
+struct llama_memory_context_i;
+
+class llama_kv_cache_context;
+class llama_kv_cache_iswa_context;
+class llama_memory_recurrent_context;
+class llama_memory_hybrid_context;
+class llama_memory_hybrid_iswa_context;
+
+// certain models (typically multi-modal) can produce different types of graphs
+enum llm_graph_type {
+ LLM_GRAPH_TYPE_DEFAULT,
+ LLM_GRAPH_TYPE_ENCODER,
+ LLM_GRAPH_TYPE_DECODER,
+};
+
+enum llm_ffn_op_type {
+ LLM_FFN_SILU,
+ LLM_FFN_GELU,
+ LLM_FFN_RELU,
+ LLM_FFN_RELU_SQR,
+ LLM_FFN_SWIGLU,
+ LLM_FFN_GEGLU,
+ LLM_FFN_REGLU,
+ LLM_FFN_SWIGLU_OAI_MOE,
+};
+
+enum llm_ffn_gate_type {
+ LLM_FFN_SEQ,
+ LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
+};
+
+enum llm_norm_type {
+ LLM_NORM,
+ LLM_NORM_RMS,
+ LLM_NORM_GROUP,
+};
+
+// TODO: tmp - need something better to pass the data from the encoder to the decoder
+struct llama_cross {
+ // the output embeddings from the encoder as a ggml tensor
+ // TODO: this needs more work to be correct, for now copy the embeddings data to host memory
+ // ref: https://github.com/ggml-org/llama.cpp/pull/11213#discussion_r1969892524
+ //ggml_tensor * t_embd = nullptr;
+
+ int64_t n_embd = 0;
+ int64_t n_enc = 0;
+
+ // embeddings data copied to host memory (tmp)
+ std::vector<float> v_embd;
+
+ // needed to construct the cross-attention mask in the decoder
+ std::vector<std::set<llama_seq_id>> seq_ids_enc;
+};
+
+struct llm_graph_params;
+
+//
+// llm_graph_input
+//
+
+class llm_graph_input_i {
+public:
+ llm_graph_input_i() {
+ const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
+ debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
+ }
+
+ virtual ~llm_graph_input_i() = default;
+
+ virtual void set_input(const llama_ubatch * ubatch) = 0;
+
+ // return true if the resulting input tensors using the provided graph parameters would be
+ // the same as the previous input tensors that we have currently stored in the object
+ virtual bool can_reuse(const llm_graph_params & params) {
+ // returning false here by default prevents reusing the graph if the check
+ // for the input type has not been implemented yet
+ GGML_UNUSED(params);
+ return false;
+ }
+protected:
+ // env: LLAMA_GRAPH_INPUT_DEBUG
+ int debug = 0;
+};
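+
+// A typical can_reuse() override (a sketch only; the actual overrides live in
+// llama-graph.cpp) simply compares the parameters captured at construction time
+// against the new graph parameters, e.g.:
+//
+//   bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
+//       return pos->ne[0] == params.ubatch.n_tokens * n_pos_per_embd;
+//   }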
+
+using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
+
+class llm_graph_input_embd : public llm_graph_input_i {
+public:
+ llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {}
+ virtual ~llm_graph_input_embd() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * tokens = nullptr; // I32 [n_batch]
+ ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
+
+ const int64_t n_embd = 0;
+};
+
+class llm_graph_input_pos : public llm_graph_input_i {
+public:
+ llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
+ virtual ~llm_graph_input_pos() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * pos = nullptr; // I32 [n_batch]
+
+ const uint32_t n_pos_per_embd = 1;
+};
+
+// temperature tuning, used by llama4
+class llm_graph_input_attn_temp : public llm_graph_input_i {
+public:
+ llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
+ : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
+ virtual ~llm_graph_input_attn_temp() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
+
+ const uint32_t n_attn_temp_floor_scale;
+ const float f_attn_temp_scale;
+ const float f_attn_temp_offset;
+};
+
+class llm_graph_input_pos_bucket : public llm_graph_input_i {
+public:
+ llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
+ virtual ~llm_graph_input_pos_bucket() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
+
+ const llama_hparams hparams;
+};
+
+class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
+public:
+ llm_graph_input_pos_bucket_kv(
+ const llama_hparams & hparams,
+ const llama_kv_cache_context * mctx) : hparams(hparams), mctx(mctx) {}
+ virtual ~llm_graph_input_pos_bucket_kv() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
+
+ const llama_hparams hparams;
+
+ const llama_kv_cache_context * mctx;
+};
+
+class llm_graph_input_out_ids : public llm_graph_input_i {
+public:
+ llm_graph_input_out_ids(
+ const llama_hparams & hparams,
+ const llama_cparams & cparams,
+ uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
+ virtual ~llm_graph_input_out_ids() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * out_ids; // I32 [n_outputs]
+
+ const llama_hparams hparams;
+ const llama_cparams cparams;
+
+ const uint32_t n_outputs;
+};
+
+class llm_graph_input_mean : public llm_graph_input_i {
+public:
+ llm_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {}
+ virtual ~llm_graph_input_mean() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * mean; // F32 [n_batch, n_batch]
+
+ const llama_cparams cparams;
+};
+
+class llm_graph_input_cls : public llm_graph_input_i {
+public:
+ llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
+ virtual ~llm_graph_input_cls() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * cls; // I32 [n_batch]
+
+ const llama_cparams cparams;
+ const llm_arch arch;
+};
+
+class llm_graph_input_rs : public llm_graph_input_i {
+public:
+ llm_graph_input_rs(const llama_memory_recurrent_context * mctx) : mctx(mctx) {}
+ virtual ~llm_graph_input_rs() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * s_copy; // I32 [n_rs]
+
+ // views of s_copy, computed once per graph
+ // and shared across layers which use build_rs
+ ggml_tensor * s_copy_main; // I32 [n_seqs]
+ ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
+
+ const llama_memory_recurrent_context * mctx;
+
+ // used in view offsets, need to match for valid graph reuse
+ uint32_t head;
+ int32_t rs_z;
+};
+
+class llm_graph_input_cross_embd : public llm_graph_input_i {
+public:
+ llm_graph_input_cross_embd(
+ const llama_cross * cross) : cross(cross) {}
+ virtual ~llm_graph_input_cross_embd() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc]
+
+ const llama_cross * cross;
+};
+
+class llm_graph_input_attn_no_cache : public llm_graph_input_i {
+public:
+ llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams) :
+ hparams(hparams),
+ cparams(cparams) {
+ }
+ ~llm_graph_input_attn_no_cache() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+ ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
+
+ // n_tokens == n_batch
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream]
+
+ const llama_hparams hparams;
+ const llama_cparams cparams;
+};
+
+class llm_graph_input_attn_kv : public llm_graph_input_i {
+public:
+ llm_graph_input_attn_kv(
+ const llama_hparams & hparams,
+ const llama_cparams & cparams,
+ const llama_kv_cache_context * mctx) :
+ hparams(hparams),
+ cparams(cparams),
+ mctx(mctx) {
+ }
+ ~llm_graph_input_attn_kv() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+ ggml_tensor * get_v_idxs() const { return self_v_idxs; }
+
+ ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+ ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+ ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
+
+ // note: these have to be copies because, in order to reuse a graph, its inputs
+ // need to carry these parameters with them. otherwise, they can point to freed
+ // llm_graph_params from a previous batch, causing stack-use-after-return
+ const llama_hparams hparams;
+ const llama_cparams cparams;
+
+ const llama_kv_cache_context * mctx;
+};
+
+// V-less input for the KV cache
+// ref: https://github.com/ggml-org/llama.cpp/pull/19067
+class llm_graph_input_attn_k : public llm_graph_input_i {
+public:
+ llm_graph_input_attn_k(
+ const llama_hparams & hparams,
+ const llama_cparams & cparams,
+ const llama_kv_cache_context * mctx) :
+ hparams(hparams),
+ cparams(cparams),
+ mctx(mctx) {
+ }
+ ~llm_graph_input_attn_k() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+
+ ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+ ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
+
+ const llama_hparams hparams;
+ const llama_cparams cparams;
+
+ const llama_kv_cache_context * mctx;
+};
+
+class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
+public:
+ llm_graph_input_attn_kv_iswa(
+ const llama_hparams & hparams,
+ const llama_cparams & cparams,
+ const llama_kv_cache_iswa_context * mctx) :
+ hparams(hparams),
+ cparams(cparams),
+ mctx(mctx) {
+ }
+ ~llm_graph_input_attn_kv_iswa() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+ ggml_tensor * get_v_idxs() const { return self_v_idxs; }
+ ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
+ ggml_tensor * get_v_idxs_swa() const { return self_v_idxs_swa; }
+
+ ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+ ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
+
+ ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+ ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+ ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch]
+ ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
+
+ const llama_hparams hparams;
+ const llama_cparams cparams;
+
+ const llama_kv_cache_iswa_context * mctx;
+};
+
+class llm_graph_input_attn_cross : public llm_graph_input_i {
+public:
+ llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {}
+ ~llm_graph_input_attn_cross() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; }
+
+ ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
+ ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
+
+ const llama_cross * cross = nullptr;
+};
+
+class llm_graph_input_mem_hybrid : public llm_graph_input_i {
+public:
+ llm_graph_input_mem_hybrid(
+ const llama_cparams & cparams,
+ std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
+ std::unique_ptr<llm_graph_input_rs> inp_rs,
+ const llama_memory_hybrid_context * mctx) :
+ inp_attn(std::move(inp_attn)),
+ inp_rs(std::move(inp_rs)),
+ cparams(cparams),
+ mctx(mctx) { }
+ virtual ~llm_graph_input_mem_hybrid() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
+ std::unique_ptr<llm_graph_input_rs> inp_rs;
+
+ llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
+ llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
+
+ const llama_cparams cparams;
+
+ const llama_memory_hybrid_context * mctx;
+};
+
+class llm_graph_input_mem_hybrid_k : public llm_graph_input_i {
+public:
+ llm_graph_input_mem_hybrid_k(
+ const llama_cparams & cparams,
+ std::unique_ptr<llm_graph_input_attn_k> inp_attn,
+ std::unique_ptr<llm_graph_input_rs> inp_rs,
+ const llama_memory_hybrid_context * mctx) :
+ inp_attn(std::move(inp_attn)),
+ inp_rs(std::move(inp_rs)),
+ cparams(cparams),
+ mctx(mctx) { }
+ virtual ~llm_graph_input_mem_hybrid_k() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ std::unique_ptr<llm_graph_input_attn_k> inp_attn;
+ std::unique_ptr<llm_graph_input_rs> inp_rs;
+
+ llm_graph_input_attn_k * get_attn() const { return inp_attn.get(); }
+ llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
+
+ const llama_cparams cparams;
+
+ const llama_memory_hybrid_context * mctx;
+};
+
+class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i {
+public:
+ llm_graph_input_mem_hybrid_iswa(
+ const llama_cparams & cparams,
+ std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn,
+ std::unique_ptr<llm_graph_input_rs> inp_rs,
+ const llama_memory_hybrid_iswa_context * mctx) :
+ inp_attn(std::move(inp_attn)),
+ inp_rs(std::move(inp_rs)),
+ cparams(cparams),
+ mctx(mctx) { }
+ virtual ~llm_graph_input_mem_hybrid_iswa() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn;
+ std::unique_ptr<llm_graph_input_rs> inp_rs;
+
+ llm_graph_input_attn_kv_iswa * get_attn() const { return inp_attn.get(); }
+ llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
+
+ const llama_cparams cparams;
+
+ const llama_memory_hybrid_iswa_context * mctx;
+};
+
+class llm_graph_input_sampling : public llm_graph_input_i {
+public:
+ llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
+ samplers(std::move(samplers)) { }
+ virtual ~llm_graph_input_sampling() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+ bool can_reuse(const llm_graph_params & params) override;
+
+ std::map<llama_seq_id, llama_sampler *> samplers;
+};
+
+//
+// llm_graph_result
+//
+
+// these objects deliver the result from the graph build process back to the llama_context
+// note that the input tensors created for the graph are referenced here - the goal is to be able to populate their
+ // specific data by calling the set_inputs() method
+ // along with the input tensors, the object also provides commonly used output tensors, such as logits, embeddings, etc.
+ // these are used by the llama_context to extract the relevant data, based on the compute parameters
+
+// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
+using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
+
+class llm_graph_result;
+
+struct llm_graph_params {
+ llm_arch arch = LLM_ARCH_UNKNOWN;
+
+ llama_hparams hparams;
+ llama_cparams cparams;
+
+ llama_ubatch ubatch; // note: intentionally make a copy
+
+ llm_graph_type gtype;
+
+ ggml_backend_sched_t sched;
+ ggml_backend_t backend_cpu;
+
+ const llama_adapter_cvec * cvec;
+ const llama_adapter_loras * loras;
+ const llama_memory_context_i * mctx;
+ const llama_cross * cross;
+
+ std::map<llama_seq_id, llama_sampler *> samplers;
+
+ static bool samplers_equal(
+ const std::map<llama_seq_id, llama_sampler *> & lhs,
+ const std::map<llama_seq_id, llama_sampler *> & rhs) {
+ if (lhs.size() != rhs.size()) {
+ return false;
+ }
+ for (const auto & [seq_id, sampler] : lhs) {
+ auto it = rhs.find(seq_id);
+ if (it == rhs.end() || it->second != sampler) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ uint32_t n_outputs;
+
+ llm_graph_cb cb;
+
+ llm_graph_result * res;
+
+ // return true if the "other" params would result in a graph with the same topology as the current params
+ // having the same topology allows us to reuse the graph in some cases
+ bool allow_reuse(const llm_graph_params & other) const {
+ // first check the ubatch
+ bool can_reuse_ubatch =
+ ubatch.equal_seqs() == other.ubatch.equal_seqs() &&
+ ubatch.n_tokens == other.ubatch.n_tokens &&
+ ubatch.n_seq_tokens == other.ubatch.n_seq_tokens &&
+ ubatch.n_seqs == other.ubatch.n_seqs &&
+ ubatch.n_seqs_unq == other.ubatch.n_seqs_unq &&
+ (
+ (!ubatch.token && !other.ubatch.token) ||
+ (!ubatch.embd && !other.ubatch.embd)
+ );
+
+ // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
+ // the reason is that the set of attention streams would be different for different sequences
+ if (can_reuse_ubatch && ubatch.equal_seqs()) {
+ if (!ubatch.data) {
+ // if the old ubatch does not own its data, then we cannot guarantee that it is still alive, and
+ // therefore we cannot perform the sequence id check. this should normally never happen
+ can_reuse_ubatch = false;
+ } else {
+ for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+ can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s];
+ }
+ }
+ }
+
+ if (!can_reuse_ubatch) {
+ return false;
+ }
+
+ if (n_outputs != other.n_outputs) {
+ return false;
+ }
+
+ if (!samplers_equal(samplers, other.samplers)) {
+ return false;
+ }
+
+ if (samplers.size() > 0) {
+ if (!ubatch.data || !other.ubatch.data) {
+ return false;
+ }
+
+ // check that the outputs are the same for all samplers
+ for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+ if (ubatch.output[i] != other.ubatch.output[i] ||
+ ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
+ return false;
+ }
+ }
+ }
+
+ return
+ cparams.embeddings == other.cparams.embeddings &&
+ cparams.causal_attn == other.cparams.causal_attn &&
+ arch == other.arch &&
+ gtype == other.gtype &&
+ cvec == other.cvec &&
+ loras == other.loras &&
+ cross == other.cross;
+ }
+};
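+
+// Typical reuse flow on the caller side (a sketch, assuming hypothetical helpers
+// make_params() and rebuild_graph(); the actual logic lives in llama_context):
+//
+//   llm_graph_params params = make_params(ubatch);
+//   if (res->can_reuse(params)) {
+//       res->set_inputs(&ubatch);  // same topology: only refresh the input tensors
+//   } else {
+//       rebuild_graph(params);     // topology changed: construct a new ggml graph
+//   }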
+
+class llm_graph_result {
+public:
+ llm_graph_result(int64_t max_nodes);
+
+ virtual ~llm_graph_result() = default;
+
+ ggml_tensor * get_inp_tokens() const { return t_inp_tokens; }
+ ggml_tensor * get_logits() const { return t_logits; }
+ ggml_tensor * get_embd() const { return t_embd; }
+ ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
+
+ ggml_cgraph * get_gf() const { return gf; }
+ ggml_context * get_ctx() const { return ctx_compute.get(); }
+
+ int64_t get_max_nodes() const;
+
+ void reset();
+
+ void set_inputs(const llama_ubatch * ubatch);
+ void set_outputs();
+
+ // try to update the existing graph result using the new graph parameters in order to reuse it
+ // this can only be done if we determine that the resulting graph using the new graph parameters
+ // would be identical to the existing graph. in that case, we simply have to update the memory
+ // contexts of the input tensors of the graph and we can reuse it for another computation
+ // return true if the graph was updated and can be reused
+ bool can_reuse(const llm_graph_params & params);
+
+ llm_graph_input_i * add_input(llm_graph_input_ptr input);
+
+ void set_params(const llm_graph_params & params);
+
+ // important graph nodes
+ ggml_tensor * t_inp_tokens = nullptr;
+ ggml_tensor * t_inp_embd = nullptr; // [n_embd_inp, n_tokens]
+ ggml_tensor * t_logits = nullptr;
+ ggml_tensor * t_embd = nullptr;
+ ggml_tensor * t_embd_pooled = nullptr;
+
+ std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
+ std::map<llama_seq_id, ggml_tensor*> t_candidates;
+ std::map<llama_seq_id, ggml_tensor*> t_sampled;
+ std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
+
+ std::vector<llm_graph_input_ptr> inputs;
+
+ ggml_context_ptr ctx_compute;
+
+ // memory buffers used to evaluate the model
+ std::vector<uint8_t> buf_compute_meta;
+
+ ggml_cgraph * gf;
+
+ int64_t max_nodes;
+
+private:
+ // keep a copy of the previous graph parameters
+ // we will use this to determine whether the graph can be reused by comparing them with the new parameters
+ // note: these are updated after constructing the new graph
+ llm_graph_params params;
+
+ // env: LLAMA_GRAPH_RESULT_DEBUG
+ int debug = 0;
+};
+
+using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
+
+//
+// llm_graph_context
+//
+
+// used in build_rs to properly order writes and avoid unnecessary copies
+using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
+
+struct llm_graph_context {
+ const llm_arch arch;
+
+ const llama_hparams & hparams;
+ const llama_cparams & cparams;
+ const llama_ubatch & ubatch;
+
+ const int64_t n_embd;
+ const int64_t n_layer;
+ const int64_t n_rot;
+ const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
+ const int64_t n_head;
+ const int64_t n_head_kv;
+ const int64_t n_embd_head_k;
+ const int64_t n_embd_k_gqa;
+ const int64_t n_embd_head_v;
+ const int64_t n_embd_v_gqa;
+ const int64_t n_expert;
+ const int64_t n_expert_used;
+
+ const float freq_base;
+ const float freq_scale;
+ const float ext_factor;
+ const float attn_factor;
+ const float beta_fast;
+ const float beta_slow;
+ const float norm_eps;
+ const float norm_rms_eps;
+
+ const int64_t n_tokens;
+ const int64_t n_outputs;
+ const int32_t n_ctx_orig; // yarn
+
+ const enum llama_pooling_type pooling_type;
+ const enum llama_rope_type rope_type;
+
+ ggml_backend_sched_t sched;
+
+ ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+
+ const llama_adapter_cvec * cvec;
+ const llama_adapter_loras * loras;
+ const llama_memory_context_i * mctx;
+ const llama_cross * cross;
+
+ std::map<llama_seq_id, llama_sampler *> samplers;
+
+ const llm_graph_cb & cb_func;
+
+ llm_graph_result * res;
+
+ ggml_context * ctx0 = nullptr;
+ ggml_cgraph * gf = nullptr;
+
+ llm_graph_context(const llm_graph_params & params);
+ virtual ~llm_graph_context() = default;
+
+ void cb(ggml_tensor * cur, const char * name, int il) const;
+
+ //
+ // common
+ //
+
+ ggml_tensor * build_cvec(
+ ggml_tensor * cur,
+ int il) const;
+
+ // do mat_mul, optionally applying lora
+ ggml_tensor * build_lora_mm(
+ ggml_tensor * w,
+ ggml_tensor * cur) const;
+
+ // do mat_mul_id, optionally applying lora
+ ggml_tensor * build_lora_mm_id(
+ ggml_tensor * w, // ggml_tensor * as
+ ggml_tensor * cur, // ggml_tensor * b
+ ggml_tensor * ids) const;
+
+ ggml_tensor * build_norm(
+ ggml_tensor * cur,
+ ggml_tensor * mw,
+ ggml_tensor * mb,
+ llm_norm_type type,
+ int il) const;
+
+ ggml_tensor * build_ffn(
+ ggml_tensor * cur,
+ ggml_tensor * up,
+ ggml_tensor * up_b,
+ ggml_tensor * up_s,
+ ggml_tensor * gate,
+ ggml_tensor * gate_b,
+ ggml_tensor * gate_s,
+ ggml_tensor * down,
+ ggml_tensor * down_b,
+ ggml_tensor * down_s,
+ ggml_tensor * act_scales,
+ llm_ffn_op_type type_op,
+ llm_ffn_gate_type type_gate,
+ int il) const;
+
+ // build MoE FFN without bias tensors
+ ggml_tensor * build_moe_ffn(
+ ggml_tensor * cur,
+ ggml_tensor * gate_inp,
+ ggml_tensor * up_exps,
+ ggml_tensor * gate_exps,
+ ggml_tensor * down_exps,
+ ggml_tensor * exp_probs_b,
+ int64_t n_expert,
+ int64_t n_expert_used,
+ llm_ffn_op_type type_op,
+ bool norm_w,
+ bool scale_w,
+ float w_scale,
+ llama_expert_gating_func_type gating_op,
+ int il,
+ ggml_tensor * probs_in = nullptr) const;
+
+ ggml_tensor * build_moe_ffn(
+ ggml_tensor * cur,
+ ggml_tensor * gate_inp,
+ ggml_tensor * gate_inp_b,
+ ggml_tensor * up_exps,
+ ggml_tensor * up_exps_b,
+ ggml_tensor * gate_exps,
+ ggml_tensor * gate_exps_b,
+ ggml_tensor * down_exps,
+ ggml_tensor * down_exps_b,
+ ggml_tensor * exp_probs_b,
+ int64_t n_expert,
+ int64_t n_expert_used,
+ llm_ffn_op_type type_op,
+ bool norm_w,
+ bool scale_w,
+ float w_scale,
+ llama_expert_gating_func_type gating_op,
+ int il,
+ ggml_tensor * probs_in = nullptr) const;
+
+ //
+ // inputs
+ //
+
+ ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
+ ggml_tensor * build_inp_pos() const;
+ ggml_tensor * build_inp_attn_scale() const;
+ ggml_tensor * build_inp_out_ids() const;
+ ggml_tensor * build_inp_mean() const;
+ ggml_tensor * build_inp_cls() const;
+
+ ggml_tensor * build_inp_cross_embd() const;
+ ggml_tensor * build_inp_pos_bucket_enc() const;
+ ggml_tensor * build_inp_pos_bucket_dec() const;
+ ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;
+
+ //
+ // attention
+ //
+
+ ggml_tensor * build_attn_mha(
+ ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
+ ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
+ ggml_tensor * kq_b,
+ ggml_tensor * kq_mask,
+ ggml_tensor * sinks, // [n_head_q]
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+ float kq_scale,
+ int il) const;
+
+ llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
+
+ ggml_tensor * build_attn(
+ llm_graph_input_attn_no_cache * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks, // [n_head_q]
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+ float kq_scale,
+ int il) const;
+
+ llm_graph_input_attn_kv * build_attn_inp_kv() const;
+
+ ggml_tensor * build_attn(
+ llm_graph_input_attn_kv * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks, // [n_head_q]
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] // TODO: remove
+ float kq_scale,
+ int il) const;
+
+ llm_graph_input_attn_k * build_attn_inp_k() const;
+
+ ggml_tensor * build_attn(
+ llm_graph_input_attn_k * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks, // [n_head_q]
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+ float kq_scale,
+ int il) const;
+
+ llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;
+
+ // note: if k_cur or v_cur are not provided, they will not be stored in memory
+ ggml_tensor * build_attn(
+ llm_graph_input_attn_kv_iswa * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks, // [n_head_q]
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+ float kq_scale,
+ int il) const;
+
+ llm_graph_input_attn_cross * build_attn_inp_cross() const;
+
+ ggml_tensor * build_attn(
+ llm_graph_input_attn_cross * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks, // [n_head_q]
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+ float kq_scale,
+ int il) const;
+
+ //
+ // recurrent
+ //
+
+ // TODO: move this implementation to llama_memory_recurrent.
+ // this is analogous to llama_kv_cache::cpy_k / cpy_v
+ // when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
+ // implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
+ // `llama_memory_recurrent`
+ ggml_tensor * build_rs(
+ ggml_tensor * s,
+ ggml_tensor * state_copy_main,
+ ggml_tensor * state_copy_extra,
+ int32_t state_size,
+ int32_t n_seqs,
+ uint32_t n_rs,
+ uint32_t rs_head,
+ uint32_t rs_size,
+ int32_t rs_zero,
+ const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
+
+ llm_graph_input_rs * build_rs_inp() const;
+
+ ggml_tensor * build_rs(
+ llm_graph_input_rs * inp,
+ ggml_tensor * s,
+ int32_t state_size,
+ int32_t n_seqs,
+ const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
+
+ ggml_tensor * build_rwkv_token_shift_load(
+ llm_graph_input_rs * inp,
+ const llama_ubatch & ubatch,
+ int il) const;
+
+ ggml_tensor * build_rwkv_token_shift_store(
+ ggml_tensor * token_shift,
+ const llama_ubatch & ubatch,
+ int il) const;
+ //
+ // hybrid
+ //
+
+ llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
+ llm_graph_input_mem_hybrid_k * build_inp_mem_hybrid_k() const;
+
+ llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const;
+
+ //
+ // pooling
+ //
+
+ void build_pooling(
+ ggml_tensor * cls,
+ ggml_tensor * cls_b,
+ ggml_tensor * cls_out,
+ ggml_tensor * cls_out_b) const;
+
+ //
+ // sampling (backend sampling)
+ //
+
+ void build_sampling() const;
+
+ //
+ // dense (out)
+ //
+
+ void build_dense_out(
+ ggml_tensor * dense_2,
+ ggml_tensor * dense_3) const;
+};
+
+// TODO: better name
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);
diff --git a/llama.cpp/src/llama-hparams.cpp b/llama.cpp/src/llama-hparams.cpp
new file mode 100644
index 0000000..756dda1
--- /dev/null
+++ b/llama.cpp/src/llama-hparams.cpp
@@ -0,0 +1,234 @@
+#include "llama-hparams.h"
+
+#include "ggml.h"
+
+#include <algorithm>
+#include <cassert>
+
+void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
+ if (dense_first) {
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ swa_layers[il] = n_pattern == 0 || (il % n_pattern != 0);
+ }
+ } else {
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+ }
+ }
+}
+
+bool llama_hparams::is_swa_any() const {
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ if (swa_layers[il]) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+uint32_t llama_hparams::n_head(uint32_t il) const {
+ if (il < n_layer) {
+ return n_head_arr[il];
+ }
+
+ GGML_ABORT("fatal error");
+}
+
+uint32_t llama_hparams::n_head_kv(uint32_t il) const {
+ if (il < n_layer) {
+ return n_head_kv_arr[il];
+ }
+
+ GGML_ABORT("fatal error");
+}
+
+uint32_t llama_hparams::n_ff(uint32_t il) const {
+ if (il < n_layer) {
+ return n_ff_arr[il];
+ }
+
+ GGML_ABORT("fatal error");
+}
+
+uint32_t llama_hparams::n_gqa(uint32_t il) const {
+ const uint32_t n_head = this->n_head(il);
+ const uint32_t n_head_kv = this->n_head_kv(il);
+
+ if (n_head_kv == 0) {
+ return 0;
+ }
+
+ return n_head/n_head_kv;
+}
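+
+// e.g. n_head = 32 query heads sharing n_head_kv = 8 key/value heads gives a GQA
+// factor of 32/8 = 4 (four query heads per KV head); n_head_kv == n_head is plain
+// MHA (factor 1) and n_head_kv == 1 is MQA.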
+
+uint32_t llama_hparams::n_embd_inp() const {
+ uint32_t n_embd_inp = n_embd;
+
+ if (n_deepstack_layers > 0) {
+ n_embd_inp += n_embd * n_deepstack_layers;
+ }
+
+ return n_embd_inp;
+}
+
+uint32_t llama_hparams::n_embd_out() const {
+ return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
+}
+
+uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
+ const uint32_t n_head_kv = this->n_head_kv(il);
+
+ return n_embd_head_k * n_head_kv;
+}
+
+uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
+ const uint32_t n_head_kv = this->n_head_kv(il);
+
+ return n_embd_head_v * n_head_kv;
+}
+
+bool llama_hparams::is_n_embd_k_gqa_variable() const {
+ const uint32_t val = n_embd_k_gqa();
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ if (val != n_embd_k_gqa(il)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool llama_hparams::is_n_embd_v_gqa_variable() const {
+ const uint32_t val = n_embd_v_gqa();
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ if (val != n_embd_v_gqa(il)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+uint32_t llama_hparams::n_embd_k_gqa_max() const {
+ uint32_t val = n_embd_k_gqa();
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ val = std::max(val, n_embd_k_gqa(il));
+ }
+
+ return val;
+}
+
+uint32_t llama_hparams::n_embd_v_gqa_max() const {
+ uint32_t val = n_embd_v_gqa();
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ val = std::max(val, n_embd_v_gqa(il));
+ }
+
+ return val;
+}
+
+uint32_t llama_hparams::n_embd_r() const {
+ if (wkv_head_size != 0) {
+ // for RWKV models
+ return token_shift_count * n_embd;
+ }
+
+ if (n_shortconv_l_cache != 0) {
+ // for LFM2 models
+ return n_embd * (n_shortconv_l_cache - 1);
+ }
+
+ if (n_embd_head_kda != 0) {
+ // for Kimi KDA layers
+ // Conv state for Q, K, V: 3 * (d_conv - 1) * n_head * head_dim
+ const uint32_t d_inner = n_head() * n_embd_head_kda; // 32 * 128 = 4096
+ return 3 * (ssm_d_conv > 0 ? ssm_d_conv - 1 : 3) * d_inner;
+ }
+
+ // TODO: maybe support other convolution strides than 1
+ // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+ // Corresponds to Mamba's conv_states size
+ return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state);
+}
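+
+// Sizing example (illustrative): for a Mamba-style layer with ssm_d_conv = 4,
+// ssm_d_inner = 2048 and ssm_n_group = 0, the rolling state is
+// (4 - 1) * (2048 + 0) = 6144 elements per sequence, since the column that is
+// shifted out on every step does not need to be stored.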
+
+uint32_t llama_hparams::n_embd_s() const {
+ if (wkv_head_size != 0) {
+ // corresponds to RWKV's wkv_states size
+ return n_embd * wkv_head_size;
+ }
+
+ if (n_embd_head_kda != 0) {
+ // for Kimi KDA layers
+ // Full recurrent state: head_dim * head_dim * n_head
+ // h tensor shape for delta attention: [head_dim, head_dim, n_head]
+ return n_embd_head_kda * n_embd_head_kda * n_head(); // 128 * 128 * 32 = 524288
+ }
+
+ // corresponds to Mamba's ssm_states size
+ return ssm_d_state * ssm_d_inner;
+}
+
+bool llama_hparams::is_recurrent(uint32_t il) const {
+ if (il < n_layer) {
+ return recurrent_layer_arr[il];
+ }
+
+ GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
+}
+
+uint32_t llama_hparams::n_pos_per_embd() const {
+ return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
+}
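+
+// note: M-RoPE/IM-RoPE track multiple position components per token (e.g. a temporal
+// index plus spatial indices for multimodal inputs), hence 4 positions per embedding
+// instead of the usual 1.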
+
+bool llama_hparams::is_swa(uint32_t il) const {
+ if (il < n_layer) {
+ return swa_layers[il];
+ }
+
+ GGML_ABORT("fatal error");
+}
+
+bool llama_hparams::is_mla() const {
+ assert((n_embd_head_k_mla_impl == 0 && n_embd_head_v_mla_impl == 0) ||
+ (n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0));
+
+ return n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0;
+}
+
+uint32_t llama_hparams::n_embd_head_k_mla() const {
+ return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k;
+}
+
+uint32_t llama_hparams::n_embd_head_v_mla() const {
+ return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v;
+}
+
+bool llama_hparams::has_kv(uint32_t il) const {
+ if (n_layer_kv_from_start >= 0) {
+ if (il < (uint32_t) n_layer_kv_from_start) {
+ return true;
+ }
+
+ return false;
+ }
+
+ // by default, all layers have kv
+ return true;
+}
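+
+// e.g. n_layer_kv_from_start = 20 means layers 0..19 keep a KV cache and the rest do
+// not (typical for hybrid models where the remaining layers are recurrent); the
+// default of -1 keeps a KV cache for every layer.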
+
+uint32_t llama_hparams::n_layer_kv() const {
+ uint32_t res = 0;
+
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ if (has_kv(il)) {
+ res++;
+ }
+ }
+
+ return res;
+}
+
+bool llama_hparams::use_mrope() const {
+ return rope_sections[0] > 0 && rope_sections[1] > 0;
+}
diff --git a/llama.cpp/src/llama-hparams.h b/llama.cpp/src/llama-hparams.h
new file mode 100644
index 0000000..706eda8
--- /dev/null
+++ b/llama.cpp/src/llama-hparams.h
@@ -0,0 +1,334 @@
+#pragma once
+
+#include "llama.h"
+
+#include <array>
+#include <cassert>
+
+// bump if necessary
+#define LLAMA_MAX_LAYERS 512
+#define LLAMA_MAX_EXPERTS 512 // Qwen3 Next
+
+enum llama_expert_gating_func_type {
+ LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
+};
+
+enum llama_swa_type {
+ LLAMA_SWA_TYPE_NONE = 0,
+ LLAMA_SWA_TYPE_STANDARD = 1,
+ LLAMA_SWA_TYPE_CHUNKED = 2,
+ LLAMA_SWA_TYPE_SYMMETRIC = 3,
+};
+
+struct llama_hparams_posnet {
+ uint32_t n_embd;
+ uint32_t n_layer;
+};
+
+struct llama_hparams_convnext {
+ uint32_t n_embd;
+ uint32_t n_layer;
+};
+
+struct llama_hparams {
+ bool vocab_only;
+ bool no_alloc;
+ bool rope_finetuned;
+ bool use_par_res;
+ bool swin_norm;
+
+ uint32_t n_ctx_train; // context size the model was trained on
+ uint32_t n_embd;
+ uint32_t n_layer;
+ int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
+ uint32_t n_rot;
+ uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+ uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
+ uint32_t n_expert = 0;
+ uint32_t n_expert_used = 0;
+ uint32_t n_rel_attn_bkts = 0;
+
+ // note: deepseek2 with MLA converts attention into MQA with larger heads, then decompresses back to MHA
+ uint32_t n_embd_head_k_mla_impl = 0;
+ uint32_t n_embd_head_v_mla_impl = 0;
+
+ // for WavTokenizer
+ struct llama_hparams_posnet posnet;
+ struct llama_hparams_convnext convnext;
+
+ uint32_t n_shortconv_l_cache = 0;
+
+ std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
+ std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
+ std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
+
+ uint32_t n_layer_dense_lead = 0;
+ uint32_t n_lora_q = 0;
+ uint32_t n_lora_kv = 0;
+ uint32_t n_ff_exp = 0;
+ uint32_t n_ff_shexp = 0;
+ uint32_t n_ff_chexp = 0;
+ uint32_t n_expert_shared = 0;
+ uint32_t n_norm_groups = 0;
+ uint32_t n_expert_groups = 0;
+ uint32_t n_group_used = 0;
+ uint32_t n_group_experts = 0;
+
+ float expert_group_scale = 0.05f;
+ float expert_weights_scale = 0.0f;
+ bool expert_weights_norm = false;
+ uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+ uint32_t moe_every_n_layers = 0;
+ uint32_t nextn_predict_layers = 0;
+
+ float f_norm_eps;
+ float f_norm_rms_eps;
+ float f_norm_group_eps;
+
+ float f_attn_logit_softcapping = 50.0f;
+ float f_router_logit_softcapping = 30.0f;
+ float f_final_logit_softcapping = 30.0f;
+
+ // for RWKV
+ uint32_t rescale_every_n_layers = 0;
+ uint32_t time_mix_extra_dim = 0;
+ uint32_t time_decay_extra_dim = 0;
+ uint32_t wkv_head_size = 0;
+ uint32_t token_shift_count = 2;
+ uint32_t n_lora_decay = 0;
+ uint32_t n_lora_iclr = 0;
+ uint32_t n_lora_value_res_mix = 0;
+ uint32_t n_lora_gate = 0;
+
+ float rope_attn_factor = 1.0f;
+ float rope_freq_base_train;
+ float rope_freq_base_train_swa = 10000.0f;
+ float rope_freq_scale_train;
+ float rope_freq_scale_train_swa = 1.0f;
+
+ uint32_t n_ctx_orig_yarn;
+ float rope_yarn_log_mul = 0.0f;
+
+ float yarn_ext_factor = -1.0f;
+ float yarn_attn_factor = 1.0f;
+ float yarn_beta_fast = 32.0f;
+ float yarn_beta_slow = 1.0f;
+
+ std::array<int, 4> rope_sections;
+
+ // Sliding Window Attention (SWA)
+ llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+ // the size of the sliding window (0 - no SWA)
+ uint32_t n_swa = 0;
+ // if swa_layers[il] == 1, then layer il is SWA
+ // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
+ // by default, all layers are dense
+ // note: using uint32_t type for compatibility reasons
+ std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
+
+ // for State Space Models
+ uint32_t ssm_d_conv = 0;
+ uint32_t ssm_d_inner = 0;
+ uint32_t ssm_d_state = 0;
+ uint32_t ssm_dt_rank = 0;
+ uint32_t ssm_n_group = 0;
+
+ // for Kimi Linear KDA
+ uint32_t n_embd_head_kda = 0;
+
+ // for hybrid state space models
+ std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;
+
+ bool ssm_dt_b_c_rms = false;
+
+ float f_clamp_kqv = 0.0f;
+ float f_max_alibi_bias = 0.0f;
+ float f_logit_scale = 0.0f;
+
+ // Additional scale factors (Granite/Granite MoE)
+ float f_residual_scale = 0.0f;
+ float f_embedding_scale = 0.0f;
+ float f_attention_scale = 0.0f;
+
+ // grok-2
+ float f_attn_out_scale = 0.0f;
+ uint32_t attn_temp_length = 0;
+
+ bool causal_attn = true;
+ bool use_alibi = false;
+ bool attn_soft_cap = false;
+ bool use_kq_norm = false;
+
+ // for Classifiers
+ uint32_t n_cls_out = 1;
+
+ // output embedding dimension (0 = use n_embd)
+ uint32_t n_embd_out_impl = 0;
+
+ // llama4 smallthinker
+ uint32_t n_moe_layer_step = 0;
+ uint32_t n_no_rope_layer_step = 4;
+ uint32_t n_attn_temp_floor_scale = 0;
+ float f_attn_temp_scale = 0.0f;
+ float f_attn_temp_offset = 0.0f; // offset position index
+
+ // gemma3n altup
+ uint32_t n_altup = 4; // altup_num_inputs
+ uint32_t i_altup_act = 0; // altup_active_idx
+ uint32_t laurel_rank = 64;
+ uint32_t n_embd_altup = 256;
+
+ // needed for sentence-transformers dense layers
+ uint32_t dense_2_feat_in = 0; // in_features of the 2_Dense
+ uint32_t dense_2_feat_out = 0; // out_features of the 2_Dense
+ uint32_t dense_3_feat_in = 0; // in_features of the 3_Dense
+ uint32_t dense_3_feat_out = 0; // out_features of the 3_Dense
+
+ // xIELU
+ std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_n;
+ std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_p;
+ std::array<float, LLAMA_MAX_LAYERS> xielu_beta;
+ std::array<float, LLAMA_MAX_LAYERS> xielu_eps;
+
+ // qwen3vl deepstack
+ uint32_t n_deepstack_layers = 0;
+
+ // needed by encoder-decoder models (e.g. T5, FLAN-T5)
+ // ref: https://github.com/ggml-org/llama.cpp/pull/8141
+ llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
+ uint32_t dec_n_layer = 0;
+
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
+ enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
+ enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
+
+ // Step35: optional per-layer clamps for (Swi)GLU
+ std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_exp; // clamping for expert FFN
+ std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_shexp; // shared expert
+
+ // a value of n_pattern means that every n-th layer is dense (i.e. non-SWA)
+ // dense_first indicates whether the pattern starts with a dense layer
+ // note that if n_pattern == 0, all layers are SWA
+ // if n_pattern == 1, all layers are dense
+ // example 1: n_pattern = 3, dense_first = false
+ // il == 0: swa
+ // il == 1: swa
+ // il == 2: dense
+ // il == 3: swa
+ // il == 4: swa
+ // il == 5: dense
+ // il == 6: swa
+ // etc ...
+ // example 2: n_pattern = 2, dense_first = true
+ // il == 0: dense
+ // il == 1: swa
+ // il == 2: dense
+ // il == 3: swa
+ // etc ...
+ void set_swa_pattern(uint32_t n_pattern, bool dense_first = false);
+
+ // return true if one of the layers is SWA
+ bool is_swa_any() const;
+
+ uint32_t n_head(uint32_t il = 0) const;
+
+ uint32_t n_head_kv(uint32_t il = 0) const;
+
+ uint32_t n_ff(uint32_t il = 0) const;
+
+ uint32_t n_gqa(uint32_t il = 0) const;
+
+ // dimension of main + auxiliary input embeddings
+ uint32_t n_embd_inp() const;
+
+ // dimension of output embeddings
+ uint32_t n_embd_out() const;
+
+ // dimension of key embeddings across all k-v heads
+ uint32_t n_embd_k_gqa(uint32_t il = 0) const;
+
+ // dimension of value embeddings across all k-v heads
+ uint32_t n_embd_v_gqa(uint32_t il = 0) const;
+
+ // true if any layer has a different n_embd_k_gqa/n_embd_v_gqa
+ bool is_n_embd_k_gqa_variable() const;
+ bool is_n_embd_v_gqa_variable() const;
+
+ // return the maximum n_embd_k_gqa/n_embd_v_gqa across all layers
+ uint32_t n_embd_k_gqa_max() const;
+ uint32_t n_embd_v_gqa_max() const;
+
+ // dimension of the rolling state embeddings
+ // corresponds to Mamba's conv_states size or RWKV's token_shift states size
+ uint32_t n_embd_r() const;
+
+ // dimension of the recurrent state embeddings
+ uint32_t n_embd_s() const;
+
+ // whether or not the given layer is recurrent (for hybrid models)
+ bool is_recurrent(uint32_t il) const;
+
+ uint32_t n_pos_per_embd() const;
+
+ bool is_swa(uint32_t il) const;
+
+ // note: currently only supported if either all or none of the layers are MLA
+ bool is_mla() const;
+
+ uint32_t n_embd_head_k_mla() const;
+ uint32_t n_embd_head_v_mla() const;
+
+ bool has_kv(uint32_t il) const;
+
+ // number of layers for which has_kv() returns true
+ uint32_t n_layer_kv() const;
+
+ // note that this function uses different SWA parameters from those in the hparams
+ // note: inlined on purpose for performance reasons
+ // TODO: think of a better place for this function
+ // TODO: pack the SWA params in a struct?
+ static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
+ assert(p0 >= 0 && p1 >= 0);
+
+ switch (swa_type) {
+ case LLAMA_SWA_TYPE_NONE:
+ {
+ } break;
+ case LLAMA_SWA_TYPE_STANDARD:
+ {
+ if (p1 - p0 >= (int32_t) n_swa) {
+ return true;
+ }
+ } break;
+ case LLAMA_SWA_TYPE_CHUNKED:
+ {
+ const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
+
+ if (p0 < pos_chunk_start) {
+ return true;
+ }
+ } break;
+ case LLAMA_SWA_TYPE_SYMMETRIC:
+ {
+ const int32_t half_n_swa = (int32_t) n_swa / 2;
+ const int32_t pos_diff = p1 - p0;
+
+ // Mask if outside the symmetric window
+ if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
+ return true;
+ }
+ } break;
+ }
+
+ return false;
+ }
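+
+    // Worked example (standard SWA, n_swa = 4): a token at p1 = 10 can attend to
+    // p0 in [7, 10] since p1 - p0 < 4; p0 = 6 gives p1 - p0 == 4 and is masked.
+    // In chunked mode the window is aligned instead: pos_chunk_start = (10 / 4) * 4 = 8,
+    // so only p0 in [8, 10] is visible regardless of how close p0 is to p1.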
+
+ bool use_mrope() const;
+};
+
+static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/llama.cpp/src/llama-impl.cpp b/llama.cpp/src/llama-impl.cpp
new file mode 100644
index 0000000..8e3e7b2
--- /dev/null
+++ b/llama.cpp/src/llama-impl.cpp
@@ -0,0 +1,171 @@
+#include "llama-impl.h"
+
+#include "gguf.h"
+#include "llama.h"
+
+#include <cinttypes>
+#include <climits>
+#include <cstdarg>
+#include <cstring>
+#include <vector>
+#include <sstream>
+
+struct llama_logger_state {
+ ggml_log_callback log_callback = llama_log_callback_default;
+ void * log_callback_user_data = nullptr;
+};
+
+static llama_logger_state g_logger_state;
+
+time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+
+time_meas::~time_meas() {
+ if (t_start_us >= 0) {
+ t_acc += ggml_time_us() - t_start_us;
+ }
+}
+
+void llama_log_get(ggml_log_callback * log_callback, void ** user_data) {
+ ggml_log_get(log_callback, user_data);
+}
+
+void llama_log_set(ggml_log_callback log_callback, void * user_data) {
+ ggml_log_set(log_callback, user_data);
+ g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+ g_logger_state.log_callback_user_data = user_data;
+}
+
+static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
+ va_list args_copy;
+ va_copy(args_copy, args);
+ char buffer[128];
+ int len = vsnprintf(buffer, 128, format, args);
+ if (len < 128) {
+ g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
+ } else {
+ char * buffer2 = new char[len + 1];
+ vsnprintf(buffer2, len + 1, format, args_copy);
+ buffer2[len] = 0;
+ g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
+ delete[] buffer2;
+ }
+ va_end(args_copy);
+}
+
+void llama_log_internal(ggml_log_level level, const char * format, ...) {
+ va_list args;
+ va_start(args, format);
+ llama_log_internal_v(level, format, args);
+ va_end(args);
+}
+
+void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+ (void) level;
+ (void) user_data;
+ fputs(text, stderr);
+ fflush(stderr);
+}
+
+void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+ if (search.empty()) {
+ return;
+ }
+ std::string builder;
+ builder.reserve(s.length());
+ size_t pos = 0;
+ size_t last_pos = 0;
+ while ((pos = s.find(search, last_pos)) != std::string::npos) {
+ builder.append(s, last_pos, pos - last_pos);
+ builder.append(replace);
+ last_pos = pos + search.length();
+ }
+ builder.append(s, last_pos, std::string::npos);
+ s = std::move(builder);
+}
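+
+// Usage sketch: replace_all() rewrites in place and never rescans the replacement
+// text, so expansions cannot recurse:
+//
+//   std::string path = "a/b/c";
+//   replace_all(path, "/", "::");  // path == "a::b::c"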
+
+std::string format(const char * fmt, ...) {
+ va_list ap;
+ va_list ap2;
+ va_start(ap, fmt);
+ va_copy(ap2, ap);
+ int size = vsnprintf(NULL, 0, fmt, ap);
+ GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+ std::vector<char> buf(size + 1);
+ int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+ GGML_ASSERT(size2 == size);
+ va_end(ap2);
+ va_end(ap);
+ return std::string(buf.data(), size);
+}
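+
+// Usage sketch: format() is a printf-style std::string builder, e.g.
+//
+//   std::string msg = format("loaded %d tensors in %.2f s", n_tensors, t_load_s);
+//
+// the first vsnprintf pass measures the required length and the second writes into
+// an exactly-sized buffer, so arbitrarily long results are handled safely.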
+
+std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
+ char buf[256];
+ snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
+ for (size_t i = 1; i < ne.size(); i++) {
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
+ }
+ return buf;
+}
+
+std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
+ char buf[256];
+ snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
+ }
+ return buf;
+}
+
+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+ switch (type) {
+ case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
+ case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
+ case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
+ case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
+ case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
+ case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
+ case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
+ case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
+ case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
+ case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
+ case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
+ default: return format("unknown type %d", type);
+ }
+}
+
+std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
+ const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+ switch (type) {
+ case GGUF_TYPE_STRING:
+ return gguf_get_val_str(ctx_gguf, i);
+ case GGUF_TYPE_ARRAY:
+ {
+ const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+ int arr_n = gguf_get_arr_n(ctx_gguf, i);
+ const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
+ std::stringstream ss;
+ ss << "[";
+ for (int j = 0; j < arr_n; j++) {
+ if (arr_type == GGUF_TYPE_STRING) {
+ std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+ // escape quotes
+ replace_all(val, "\\", "\\\\");
+ replace_all(val, "\"", "\\\"");
+ ss << '"' << val << '"';
+ } else if (arr_type == GGUF_TYPE_ARRAY) {
+ ss << "???";
+ } else {
+ ss << gguf_data_to_str(arr_type, data, j);
+ }
+ if (j < arr_n - 1) {
+ ss << ", ";
+ }
+ }
+ ss << "]";
+ return ss.str();
+ }
+ default:
+ return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+ }
+}
diff --git a/llama.cpp/src/llama-impl.h b/llama.cpp/src/llama-impl.h
new file mode 100644
index 0000000..dfd9fee
--- /dev/null
+++ b/llama.cpp/src/llama-impl.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include "ggml.h" // for ggml_log_level
+
+#include <string>
+#include <vector>
+
+#ifdef __GNUC__
+# if defined(__MINGW32__) && !defined(__clang__)
+# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+# else
+# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+# endif
+#else
+# define LLAMA_ATTRIBUTE_FORMAT(...)
+#endif
+
+//
+// logging
+//
+
+LLAMA_ATTRIBUTE_FORMAT(2, 3)
+void llama_log_internal (ggml_log_level level, const char * format, ...);
+void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
+
+#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
+#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#define LLAMA_LOG_CONT(...) llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
+
+//
+// helpers
+//
+
+template <typename T>
+struct no_init {
+ T value;
+ no_init() = default;
+};
+
+struct time_meas {
+ time_meas(int64_t & t_acc, bool disable = false);
+ ~time_meas();
+
+ const int64_t t_start_us;
+
+ int64_t & t_acc;
+};
+
+template <typename T>
+struct buffer_view {
+ T * data;
+ size_t size = 0;
+
+ bool has_data() const {
+ return data && size > 0;
+ }
+};
+
+void replace_all(std::string & s, const std::string & search, const std::string & replace);
+
+// TODO: rename to llama_format ?
+LLAMA_ATTRIBUTE_FORMAT(1, 2)
+std::string format(const char * fmt, ...);
+
+std::string llama_format_tensor_shape(const std::vector<int64_t> & ne);
+std::string llama_format_tensor_shape(const struct ggml_tensor * t);
+
+std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
+
+#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
diff --git a/llama.cpp/src/llama-io.cpp b/llama.cpp/src/llama-io.cpp
new file mode 100644
index 0000000..7ad70d1
--- /dev/null
+++ b/llama.cpp/src/llama-io.cpp
@@ -0,0 +1,15 @@
+#include "llama-io.h"
+
+void llama_io_write_i::write_string(const std::string & str) {
+ uint32_t str_size = str.size();
+
+ write(&str_size, sizeof(str_size));
+ write(str.data(), str_size);
+}
+
+void llama_io_read_i::read_string(std::string & str) {
+ uint32_t str_size;
+ read_to(&str_size, sizeof(str_size));
+
+ str.assign((const char *) read(str_size), str_size);
+}
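+
+// Strings are serialized as a 32-bit length prefix followed by the raw bytes with no
+// terminator, so e.g. "abc" occupies 4 + 3 = 7 bytes in the state stream.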
diff --git a/llama.cpp/src/llama-io.h b/llama.cpp/src/llama-io.h
new file mode 100644
index 0000000..ce9216b
--- /dev/null
+++ b/llama.cpp/src/llama-io.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+struct ggml_tensor;
+
+class llama_io_write_i {
+public:
+ llama_io_write_i() = default;
+ virtual ~llama_io_write_i() = default;
+
+ virtual void write(const void * src, size_t size) = 0;
+ virtual void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) = 0;
+
+ // bytes written so far
+ virtual size_t n_bytes() = 0;
+
+ void write_string(const std::string & str);
+};
+
+class llama_io_read_i {
+public:
+ llama_io_read_i() = default;
+ virtual ~llama_io_read_i() = default;
+
+ virtual const uint8_t * read(size_t size) = 0;
+ virtual void read_to(void * dst, size_t size) = 0;
+
+ // bytes read so far
+ virtual size_t n_bytes() = 0;
+
+ void read_string(std::string & str);
+};
diff --git a/llama.cpp/src/llama-kv-cache-iswa.cpp b/llama.cpp/src/llama-kv-cache-iswa.cpp
new file mode 100644
index 0000000..26e2cb4
--- /dev/null
+++ b/llama.cpp/src/llama-kv-cache-iswa.cpp
@@ -0,0 +1,330 @@
+#include "llama-kv-cache-iswa.h"
+
+#include "llama-impl.h"
+#include "llama-batch.h"
+#include "llama-model.h"
+
+#include <algorithm>
+#include <cassert>
+
+//
+// llama_kv_cache_iswa
+//
+
+llama_kv_cache_iswa::llama_kv_cache_iswa(
+ const llama_model & model,
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ bool swa_full,
+ bool unified,
+ uint32_t kv_size,
+ uint32_t n_seq_max,
+ uint32_t n_ubatch,
+ uint32_t n_pad,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) {
+
+ // chain filters
+ const layer_filter_cb filter_base = [&](int32_t il) {
+ if (filter && !filter(il)) {
+ return false;
+ }
+
+ return !model.hparams.is_swa(il);
+ };
+
+ const layer_filter_cb filter_swa = [&](int32_t il) {
+ if (filter && !filter(il)) {
+ return false;
+ }
+
+ return model.hparams.is_swa(il);
+ };
+
+ const uint32_t size_base = kv_size;
+
+ // note: the SWA cache is always padded to 256 for performance
+ // https://github.com/ggml-org/llama.cpp/issues/17037
+ uint32_t size_swa = GGML_PAD(std::min(size_base, hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch), 256);
+
+ // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
+ if (swa_full) {
+ LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+
+ size_swa = size_base;
+ }
+
+ LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
+
+ kv_base = std::make_unique<llama_kv_cache>(
+ model, type_k, type_v,
+ v_trans, offload, unified, size_base, n_seq_max, n_pad,
+ 0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);
+
+ LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
+
+ kv_swa = std::make_unique<llama_kv_cache>(
+ model, type_k, type_v,
+ v_trans, offload, unified, size_swa, n_seq_max, n_pad,
+ hparams.n_swa, hparams.swa_type, filter_swa, reuse);
+}
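+
+// Sizing example (illustrative): with hparams.n_swa = 4096, a unified cache over
+// n_seq_max = 1 and n_ubatch = 512, size_swa = GGML_PAD(min(size_base, 4096 + 512), 256)
+// = 4608 cells (assuming size_base >= 4608); the SWA cache only has to cover the
+// attention window plus one in-flight micro-batch, independent of the full context size.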
+
+void llama_kv_cache_iswa::clear(bool data) {
+ kv_base->clear(data);
+ kv_swa ->clear(data);
+}
+
+bool llama_kv_cache_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ bool res = true;
+
+ res = res & kv_base->seq_rm(seq_id, p0, p1);
+ res = res & kv_swa ->seq_rm(seq_id, p0, p1);
+
+ return res;
+}
+
+void llama_kv_cache_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+ kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+ kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_kv_cache_iswa::seq_keep(llama_seq_id seq_id) {
+ kv_base->seq_keep(seq_id);
+ kv_swa ->seq_keep(seq_id);
+}
+
+void llama_kv_cache_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+ kv_base->seq_add(seq_id, p0, p1, shift);
+ kv_swa ->seq_add(seq_id, p0, p1, shift);
+}
+
+void llama_kv_cache_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+ kv_base->seq_div(seq_id, p0, p1, d);
+ kv_swa ->seq_div(seq_id, p0, p1, d);
+}
+
+llama_pos llama_kv_cache_iswa::seq_pos_min(llama_seq_id seq_id) const {
+ // the base cache is a superset of the SWA cache, so we can just check the SWA cache
+ return kv_swa->seq_pos_min(seq_id);
+}
+
+llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
+ return kv_swa->seq_pos_max(seq_id);
+}
+
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+ for (const auto & buft_size : kv_swa->memory_breakdown()) {
+ mb[buft_size.first] += buft_size.second;
+ }
+ return mb;
+}
+
+llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+ GGML_UNUSED(embd_all);
+
+ // first try simple split
+ do {
+ if (!unified) {
+ // non-unified caches require equal splits, so skip the simple split
+ break;
+ }
+
+ balloc.split_reset();
+
+ std::vector<llama_ubatch> ubatches;
+ while (true) {
+ auto ubatch = balloc.split_simple(n_ubatch);
+
+ if (ubatch.n_tokens == 0) {
+ break;
+ }
+
+ ubatches.push_back(std::move(ubatch)); // NOLINT
+ }
+
+ if (balloc.get_n_used() < balloc.get_n_tokens()) {
+ // failed to find a suitable split
+ break;
+ }
+
+ auto sinfos_base = kv_base->prepare(ubatches);
+ if (sinfos_base.empty()) {
+ break;
+ }
+
+ auto sinfos_swa = kv_swa->prepare(ubatches);
+ if (sinfos_swa.empty()) {
+ break;
+ }
+
+ assert(sinfos_base.size() == sinfos_swa.size());
+
+ return std::make_unique<llama_kv_cache_iswa_context>(
+ this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
+ } while (false);
+
+ // if the simple split fails, try equal split
+ do {
+ balloc.split_reset();
+
+ std::vector<llama_ubatch> ubatches;
+ while (true) {
+ auto ubatch = balloc.split_equal(n_ubatch, !unified);
+
+ if (ubatch.n_tokens == 0) {
+ break;
+ }
+
+ ubatches.push_back(std::move(ubatch)); // NOLINT
+ }
+
+ if (balloc.get_n_used() < balloc.get_n_tokens()) {
+ // failed to find a suitable split
+ break;
+ }
+
+ auto sinfos_base = kv_base->prepare(ubatches);
+ if (sinfos_base.empty()) {
+ break;
+ }
+
+ auto sinfos_swa = kv_swa->prepare(ubatches);
+ if (sinfos_swa.empty()) {
+ break;
+ }
+
+ assert(sinfos_base.size() == sinfos_swa.size());
+
+ return std::make_unique<llama_kv_cache_iswa_context>(
+ this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
+ } while (false);
+
+ // TODO: if we fail again, we should attempt different splitting strategies
+ // but to do that properly, we first have to refactor the batches to be more flexible
+
+ return std::make_unique<llama_kv_cache_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+}
+
+llama_memory_context_ptr llama_kv_cache_iswa::init_full() {
+ return std::make_unique<llama_kv_cache_iswa_context>(this);
+}
+
+llama_memory_context_ptr llama_kv_cache_iswa::init_update(llama_context * lctx, bool optimize) {
+ return std::make_unique<llama_kv_cache_iswa_context>(this, lctx, optimize);
+}
+
+bool llama_kv_cache_iswa::get_can_shift() const {
+ return kv_base->get_can_shift() &&
+ kv_swa->get_can_shift() &&
+ kv_base->get_size() == kv_swa->get_size();
+}
+
+void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+ if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+ kv_base->state_write(io, seq_id, flags);
+ }
+
+ kv_swa->state_write(io, seq_id, flags);
+}
+
+void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+ if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+ kv_base->state_read(io, seq_id, flags);
+ }
+
+ kv_swa->state_read(io, seq_id, flags);
+}
+
+llama_kv_cache * llama_kv_cache_iswa::get_base() const {
+ return kv_base.get();
+}
+
+llama_kv_cache * llama_kv_cache_iswa::get_swa() const {
+ return kv_swa.get();
+}
+
+//
+// llama_kv_cache_iswa_context
+//
+
+llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(llama_memory_status status) : status(status) {}
+
+llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
+ llama_kv_cache_iswa * kv) :
+ ctx_base(kv->get_base()->init_full()),
+ ctx_swa (kv->get_swa ()->init_full()),
+ status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
+}
+
+llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
+ llama_kv_cache_iswa * kv,
+ llama_context * lctx,
+ bool optimize) :
+ ctx_base(kv->get_base()->init_update(lctx, optimize)),
+ ctx_swa (kv->get_swa ()->init_update(lctx, optimize)),
+ status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
+}
+
+llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
+ llama_kv_cache_iswa * kv,
+ slot_info_vec_t sinfos_base,
+ slot_info_vec_t sinfos_swa,
+ std::vector<llama_ubatch> ubatches) :
+ ubatches(std::move(ubatches)),
+ // note: here we copy the ubatches. not sure if this is ideal
+ ctx_base(new llama_kv_cache_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
+ ctx_swa (new llama_kv_cache_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)),
+ status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
+}
+
+llama_kv_cache_iswa_context::~llama_kv_cache_iswa_context() = default;
+
+bool llama_kv_cache_iswa_context::next() {
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+ ctx_base->next();
+ ctx_swa ->next();
+
+ if (++i_next >= ubatches.size()) {
+ return false;
+ }
+
+ return true;
+}
+
+bool llama_kv_cache_iswa_context::apply() {
+ assert(!llama_memory_status_is_fail(status));
+
+ bool res = true;
+
+ res = res & ctx_base->apply();
+ res = res & ctx_swa ->apply();
+
+ return res;
+}
+
+llama_memory_status llama_kv_cache_iswa_context::get_status() const {
+ return status;
+}
+
+const llama_ubatch & llama_kv_cache_iswa_context::get_ubatch() const {
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+ return ubatches[i_next];
+}
+
+const llama_kv_cache_context * llama_kv_cache_iswa_context::get_base() const {
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+ return static_cast<const llama_kv_cache_context *>(ctx_base.get());
+}
+
+const llama_kv_cache_context * llama_kv_cache_iswa_context::get_swa() const {
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+ return static_cast<const llama_kv_cache_context *>(ctx_swa.get());
+}
diff --git a/llama.cpp/src/llama-kv-cache-iswa.h b/llama.cpp/src/llama-kv-cache-iswa.h
new file mode 100644
index 0000000..70ab22f
--- /dev/null
+++ b/llama.cpp/src/llama-kv-cache-iswa.h
@@ -0,0 +1,137 @@
+#pragma once
+
+#include "llama-kv-cache.h"
+
+#include <vector>
+
+//
+// llama_kv_cache_iswa
+//
+
+// utilizes two instances of llama_kv_cache
+// the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
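+//
+// e.g. (illustrative): for a model where hparams.is_swa(il) is true on every odd layer,
+// layers 0, 2, 4, ... are routed to kv_base and layers 1, 3, 5, ... to kv_swa via the
+// two layer filters chained in the constructor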
+
+class llama_kv_cache_iswa : public llama_memory_i {
+public:
+ llama_kv_cache_iswa(
+ const llama_model & model,
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ bool swa_full,
+ bool unified,
+ uint32_t kv_size,
+ uint32_t n_seq_max,
+ uint32_t n_ubatch,
+ uint32_t n_pad,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse);
+
+ ~llama_kv_cache_iswa() = default;
+
+ //
+ // llama_memory_i
+ //
+
+ llama_memory_context_ptr init_batch(
+ llama_batch_allocr & balloc,
+ uint32_t n_ubatch,
+ bool embd_all) override;
+
+ llama_memory_context_ptr init_full() override;
+
+ llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
+
+ bool get_can_shift() const override;
+
+ void clear(bool data) override;
+
+ bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
+ void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+ void seq_keep(llama_seq_id seq_id) override;
+ void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
+ void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
+
+ llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+ llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+ std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
+ // state write/load
+
+ void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+ void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
+
+ //
+ // llama_kv_cache_iswa specific API
+ //
+
+ llama_kv_cache * get_base() const;
+ llama_kv_cache * get_swa () const;
+
+private:
+ const llama_hparams & hparams;
+
+ const bool unified;
+
+ std::unique_ptr<llama_kv_cache> kv_base;
+ std::unique_ptr<llama_kv_cache> kv_swa;
+};
+
+class llama_kv_cache_iswa_context : public llama_memory_context_i {
+public:
+ using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
+
+ // used for errors
+ llama_kv_cache_iswa_context(llama_memory_status status);
+
+ // used to create a full-cache context
+ llama_kv_cache_iswa_context(
+ llama_kv_cache_iswa * kv);
+
+ // used to create an update context
+ llama_kv_cache_iswa_context(
+ llama_kv_cache_iswa * kv,
+ llama_context * lctx,
+ bool optimize);
+
+ // used to create a batch processing context from a batch
+ llama_kv_cache_iswa_context(
+ llama_kv_cache_iswa * kv,
+ slot_info_vec_t sinfos_base,
+ slot_info_vec_t sinfos_swa,
+ std::vector<llama_ubatch> ubatches);
+
+ virtual ~llama_kv_cache_iswa_context();
+
+ //
+ // llama_memory_context_i
+ //
+
+ bool next() override;
+ bool apply() override;
+
+ llama_memory_status get_status() const override;
+ const llama_ubatch & get_ubatch() const override;
+
+ //
+ // llama_kv_cache_iswa_context specific API
+ //
+
+ const llama_kv_cache_context * get_base() const;
+ const llama_kv_cache_context * get_swa() const;
+
+private:
+ //llama_kv_cache_iswa * kv;
+
+ // the index of the next ubatch to process
+ size_t i_next = 0;
+
+ std::vector<llama_ubatch> ubatches;
+
+ const llama_memory_context_ptr ctx_base;
+ const llama_memory_context_ptr ctx_swa;
+
+ const llama_memory_status status;
+};
diff --git a/llama.cpp/src/llama-kv-cache.cpp b/llama.cpp/src/llama-kv-cache.cpp
new file mode 100644
index 0000000..cb702b2
--- /dev/null
+++ b/llama.cpp/src/llama-kv-cache.cpp
@@ -0,0 +1,2268 @@
+#include "llama-kv-cache.h"
+
+#include "llama-impl.h"
+#include "llama-io.h"
+#include "llama-model.h"
+#include "llama-context.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstring>
+#include <limits>
+#include <map>
+#include <stdexcept>
+
+//
+// llama_kv_cache
+//
+
+llama_kv_cache::llama_kv_cache(
+ const llama_model & model,
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ bool unified,
+ uint32_t kv_size,
+ uint32_t n_seq_max,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse) :
+ model(model), hparams(model.hparams), v_trans(v_trans),
+ n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
+
+ GGML_ASSERT(kv_size % n_pad == 0);
+
+ const uint32_t n_layer_kv = hparams.n_layer_kv();
+
+ // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+ struct ggml_backend_buft_comparator {
+ bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+ return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+ }
+ };
+ std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
+ // create a context for each buffer type
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+ auto it = ctx_map.find(buft);
+ if (it == ctx_map.end()) {
+ ggml_init_params params = {
+ /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context * ctx = ggml_init(params);
+ if (!ctx) {
+ return nullptr;
+ }
+
+ ctx_map.emplace(buft, ctx);
+
+ return ctx;
+ }
+
+ return it->second.get();
+ };
+
+ GGML_ASSERT(n_stream == 1 || n_stream == n_seq_max);
+
+ v_heads.resize(n_stream);
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ v_heads[s] = 0;
+ }
+
+ v_cells.resize(n_stream);
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ v_cells[s].resize(kv_size);
+ }
+
+ // by default, all sequence ids are mapped to the 0th stream
+ seq_to_stream.resize(LLAMA_MAX_SEQ, 0);
+
+ if (n_stream > 1) {
+ seq_to_stream.resize(n_stream, 0);
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ seq_to_stream[s] = s;
+ }
+ }
+
+ // [TAG_V_CACHE_VARIABLE]
+ if (v_trans && hparams.is_n_embd_v_gqa_variable()) {
+ LLAMA_LOG_WARN("%s: the V embeddings have different sizes across layers and FA is not enabled - padding V cache to %d\n",
+ __func__, hparams.n_embd_v_gqa_max());
+ }
+
+ const bool is_mla = hparams.is_mla();
+
+ for (uint32_t il = 0; il < hparams.n_layer; il++) {
+ if (!hparams.has_kv(il)) {
+ LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
+ continue;
+ }
+
+ if (filter && !filter(il)) {
+ LLAMA_LOG_DEBUG("%s: layer %3d: filtered\n", __func__, il);
+ continue;
+ }
+
+ // [TAG_V_CACHE_VARIABLE]
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+ const uint32_t n_embd_v_gqa = !v_trans ? hparams.n_embd_v_gqa(il) : hparams.n_embd_v_gqa_max();
+
+ const char * dev_name = "CPU";
+
+ ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
+
+ if (offload) {
+ auto * dev = model.dev_layer(il);
+ buft = ggml_backend_dev_buffer_type(dev);
+
+ dev_name = ggml_backend_dev_name(dev);
+ }
+
+ LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, il, dev_name);
+
+ ggml_context * ctx = ctx_for_buft(buft);
+ if (!ctx) {
+ throw std::runtime_error("failed to create ggml context for kv cache");
+ }
+
+ const bool has_k = true;
+ const bool has_v = !is_mla;
+
+ ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr;
+ ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr;
+
+ if (has_k) { ggml_format_name(k, "cache_k_l%d", il); }
+ if (has_v) { ggml_format_name(v, "cache_v_l%d", il); }
+
+ std::vector<ggml_tensor *> k_stream;
+ std::vector<ggml_tensor *> v_stream;
+
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ k_stream.push_back(has_k ? ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]) : nullptr);
+ v_stream.push_back(has_v ? ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]) : nullptr);
+ }
+
+ map_layer_ids[il] = layers.size();
+
+ layers.push_back({ il, k, v, k_stream, v_stream, });
+ }
+
+ if (reuse) {
+ LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);
+
+ for (uint32_t il = 0; il < hparams.n_layer; il++) {
+ const int32_t il_reuse = reuse(il);
+
+ if (il_reuse < 0) {
+ LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
+ continue;
+ }
+
+ if (filter && !filter(il)) {
+ LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
+ continue;
+ }
+
+ GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end());
+
+ map_layer_ids[il] = map_layer_ids[il_reuse];
+
+ LLAMA_LOG_DEBUG("%s: - layer %3d: reuse layer %d, is_swa = %d\n", __func__, il, il_reuse, hparams.is_swa(il));
+ }
+ }
+
+ // allocate tensors and initialize the buffers to avoid NaNs in the padding
+ for (auto & [buft, ctx] : ctx_map) {
+ ggml_backend_buffer_t buf;
+ if (model.hparams.no_alloc) {
+ buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
+ t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
+ }
+ } else {
+ buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
+ }
+ if (!buf) {
+ throw std::runtime_error("failed to allocate buffer for kv cache");
+ }
+
+ LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+
+ ggml_backend_buffer_clear(buf, 0);
+ ctxs_bufs.emplace_back(std::move(ctx), buf);
+ }
+
+ {
+ const size_t memory_size_k = size_k_bytes();
+ const size_t memory_size_v = size_v_bytes();
+
+ LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
+ ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
+ ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
+ }
+
+ const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
+ debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
+}
+
+void llama_kv_cache::clear(bool data) {
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ v_cells[s].reset();
+ v_heads[s] = 0;
+ }
+
+ if (data) {
+ for (auto & [_, buf] : ctxs_bufs) {
+ ggml_backend_buffer_clear(buf.get(), 0);
+ }
+ }
+}
+
+bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ if (seq_id >= 0) {
+ auto & cells = v_cells[seq_to_stream[seq_id]];
+ auto & head = v_heads[seq_to_stream[seq_id]];
+
+ uint32_t new_head = cells.size();
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (!cells.pos_in(i, p0, p1)) {
+ continue;
+ }
+
+ if (cells.seq_has(i, seq_id) && cells.seq_rm(i, seq_id)) {
+ if (new_head == cells.size()) {
+ new_head = i;
+ }
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != cells.size() && new_head < head) {
+ head = new_head;
+ }
+ } else {
+ // match any sequence
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ auto & cells = v_cells[s];
+ auto & head = v_heads[s];
+
+ uint32_t new_head = cells.size();
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (!cells.pos_in(i, p0, p1)) {
+ continue;
+ }
+
+ cells.rm(i);
+
+ if (new_head == cells.size()) {
+ new_head = i;
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != cells.size() && new_head < head) {
+ head = new_head;
+ }
+ }
+ }
+
+ return true;
+}
+
+void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+ GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size());
+ GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size());
+
+ const auto s0 = seq_to_stream[seq_id_src];
+ const auto s1 = seq_to_stream[seq_id_dst];
+
+ if (s0 == s1) {
+ // since both sequences are in the same stream, no data copy is necessary
+ // we just have to update the cells meta data
+
+ auto & cells = v_cells[s0];
+
+ if (seq_id_src == seq_id_dst) {
+ return;
+ }
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (!cells.pos_in(i, p0, p1)) {
+ continue;
+ }
+
+ if (cells.seq_has(i, seq_id_src)) {
+ cells.seq_add(i, seq_id_dst);
+ }
+ }
+
+ return;
+ }
+
+ // cross-stream sequence copies require copying the actual buffer data
+
+ bool is_full = true;
+
+ if (p0 > 0 && p0 + 1 < (int) get_size()) {
+ is_full = false;
+ }
+
+ if (p1 > 0 && p1 + 1 < (int) get_size()) {
+ is_full = false;
+ }
+
+ GGML_ASSERT(is_full && "seq_cp() is only supported for full KV buffers");
+
+ // enqueue the copy operation - the buffer copy will be performed during the next update
+ sc_info.ssrc.push_back(s0);
+ sc_info.sdst.push_back(s1);
+
+ v_cells[s1].reset();
+ for (uint32_t i = 0; i < v_cells[s0].size(); ++i) {
+ if (v_cells[s0].seq_has(i, seq_id_src)) {
+ llama_pos pos = v_cells[s0].pos_get(i);
+ llama_pos shift = v_cells[s0].get_shift(i);
+
+ llama_kv_cell_ext ext = v_cells[s0].ext_get(i);
+
+ if (shift != 0) {
+ pos -= shift;
+ assert(pos >= 0);
+ }
+
+ v_cells[s1].pos_set(i, pos);
+ v_cells[s1].seq_add(i, seq_id_dst);
+
+ if (shift != 0) {
+ v_cells[s1].pos_add(i, shift);
+ }
+
+ v_cells[s1].ext_set(i, ext);
+ }
+ }
+
+ v_heads[s1] = v_heads[s0];
+
+ //for (uint32_t s = 0; s < n_stream; ++s) {
+ // LLAMA_LOG_WARN("%s: seq %d: min = %d, max = %d\n", __func__, s, v_cells[s].seq_pos_min(s), v_cells[s].seq_pos_max(s));
+ //}
+}
+
+void llama_kv_cache::seq_keep(llama_seq_id seq_id) {
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
+
+ auto & cells = v_cells[seq_to_stream[seq_id]];
+ auto & head = v_heads[seq_to_stream[seq_id]];
+
+ uint32_t new_head = cells.size();
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (cells.seq_keep(i, seq_id)) {
+ if (new_head == cells.size()) {
+ new_head = i;
+ }
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != cells.size() && new_head < head) {
+ head = new_head;
+ }
+}
+
+void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
+ GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_add() is only supported for n_pos_per_embd() == 1");
+
+ auto & cells = v_cells[seq_to_stream[seq_id]];
+ auto & head = v_heads[seq_to_stream[seq_id]];
+
+ if (shift == 0) {
+ return;
+ }
+
+ uint32_t new_head = cells.size();
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ // If there is no range then return early to avoid looping over all cells.
+ if (p0 == p1) {
+ return;
+ }
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (!cells.pos_in(i, p0, p1)) {
+ continue;
+ }
+
+ if (cells.seq_has(i, seq_id)) {
+ if (cells.pos_add(i, shift)) {
+ if (new_head == cells.size()) {
+ new_head = i;
+ }
+ }
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ // Otherwise we just start the next search from the beginning.
+ head = new_head != cells.size() ? new_head : 0;
+}
+
+void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
+ GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_div() is only supported for n_pos_per_embd() == 1");
+
+ auto & cells = v_cells[seq_to_stream[seq_id]];
+
+ if (d == 1) {
+ return;
+ }
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ // If there is no range then return early to avoid looping over the cache.
+ if (p0 == p1) {
+ return;
+ }
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (!cells.pos_in(i, p0, p1)) {
+ continue;
+ }
+
+ if (cells.seq_has(i, seq_id)) {
+ cells.pos_div(i, d);
+ }
+ }
+}
+
+llama_pos llama_kv_cache::seq_pos_min(llama_seq_id seq_id) const {
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
+
+ const auto & cells = v_cells[seq_to_stream[seq_id]];
+
+ return cells.seq_pos_min(seq_id);
+}
+
+llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
+
+ const auto & cells = v_cells[seq_to_stream[seq_id]];
+
+ return cells.seq_pos_max(seq_id);
+}
+
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, size_t> ret;
+ for (const auto & [ctx, buf] : ctxs_bufs) {
+ ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get());
+
+ if (hparams.no_alloc) {
+ GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr);
+ ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+ } else {
+ // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+ ret[buft] += ggml_backend_buffer_get_size(buf.get());
+ }
+ }
+
+ return ret;
+}
+
+llama_memory_context_ptr llama_kv_cache::init_batch(
+ llama_batch_allocr & balloc,
+ uint32_t n_ubatch,
+ bool embd_all) {
+ GGML_UNUSED(embd_all);
+
+ do {
+ balloc.split_reset();
+
+ std::vector<llama_ubatch> ubatches;
+ while (true) {
+ auto ubatch = n_stream == 1 ? balloc.split_simple(n_ubatch) : balloc.split_equal(n_ubatch, true);
+
+ if (ubatch.n_tokens == 0) {
+ break;
+ }
+
+ ubatches.push_back(std::move(ubatch)); // NOLINT
+ }
+
+ if (balloc.get_n_used() < balloc.get_n_tokens()) {
+ // failed to find a suitable split
+ break;
+ }
+
+ auto sinfos = prepare(ubatches);
+ if (sinfos.empty()) {
+ break;
+ }
+
+ return std::make_unique<llama_kv_cache_context>(
+ this, std::move(sinfos), std::move(ubatches));
+ } while (false);
+
+ return std::make_unique<llama_kv_cache_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+}
+
+llama_memory_context_ptr llama_kv_cache::init_full() {
+ return std::make_unique<llama_kv_cache_context>(this);
+}
+
+llama_memory_context_ptr llama_kv_cache::init_update(llama_context * lctx, bool optimize) {
+ GGML_UNUSED(optimize);
+
+ bool do_shift = get_has_shift();
+
+ return std::make_unique<llama_kv_cache_context>(this, lctx, do_shift, std::move(sc_info));
+}
+
+llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_ubatch> & ubatches) {
+ llama_kv_cache::slot_info_vec_t res;
+
+ struct state_t {
+ slot_info sinfo; // slot info for the ubatch
+
+ std::vector<uint32_t> v_heads_old; // old positions of the heads, before placing the ubatch
+
+ std::vector<llama_kv_cells> v_cells; // copy of the old cells, before placing the ubatch
+ };
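+
+ // the loop below is effectively a dry run: each ubatch is placed with apply_ubatch() so
+ // that the next ubatch sees the cells it occupies, and the recorded states are unwound
+ // in reverse order afterwards, leaving the cache unchanged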
+
+ // remember the old state of the cells so we can restore it in the end
+ std::vector<state_t> states;
+
+ bool success = true;
+
+ for (const auto & ubatch : ubatches) {
+ // only find a suitable slot for the ubatch. don't modify the cells yet
+ const auto sinfo_new = find_slot(ubatch, false);
+ if (sinfo_new.empty()) {
+ success = false;
+ break;
+ }
+
+ // remember the slot that we found
+ res.push_back(sinfo_new);
+
+ // store the old state of the cells in the recovery stack
+ {
+ state_t state = { sinfo_new, v_heads, {} };
+
+ for (uint32_t s = 0; s < sinfo_new.n_stream(); ++s) {
+ auto & cells = v_cells[sinfo_new.strm[s]];
+
+ state.v_cells.push_back(cells.cp(sinfo_new.idxs[s]));
+ }
+
+ states.push_back(std::move(state));
+ }
+
+ // now emplace the ubatch
+ apply_ubatch(sinfo_new, ubatch);
+ }
+
+ GGML_ASSERT(!states.empty() || !success);
+
+ // iterate backwards and restore the cells to their original state
+ for (auto it = states.rbegin(); it != states.rend(); ++it) {
+ const auto & sinfo = it->sinfo;
+
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+ auto & cells = v_cells[sinfo.strm[s]];
+ auto & head = v_heads[sinfo.strm[s]];
+
+ cells.set(sinfo.idxs[s], it->v_cells[s]);
+ head = it->v_heads_old[s];
+ }
+ }
+
+ if (!success) {
+ return {};
+ }
+
+ return res;
+}
+
+bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info) {
+ bool updated = false;
+
+ auto * sched = lctx->get_sched();
+
+ if (!sc_info.empty()) {
+ assert(n_stream > 1 && "stream copy should never happen with a single stream");
+
+ llama_synchronize(lctx);
+
+ const size_t n_copy = sc_info.ssrc.size();
+
+ for (size_t i = 0; i < n_copy; ++i) {
+ const auto ssrc = sc_info.ssrc[i];
+ const auto sdst = sc_info.sdst[i];
+
+ assert(ssrc < n_stream);
+ assert(sdst < n_stream);
+
+ LLAMA_LOG_DEBUG("%s: copying KV buffer: stream %d to stream %d\n", __func__, ssrc, sdst);
+
+ assert(ssrc != sdst);
+
+ for (uint32_t il = 0; il < layers.size(); ++il) {
+ const auto & layer = layers[il];
+
+ ggml_backend_tensor_copy(layer.k_stream[ssrc], layer.k_stream[sdst]);
+
+ if (layer.v_stream[ssrc]) {
+ ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);
+ }
+ }
+ }
+ }
+
+ if (do_shift) {
+ if (!get_can_shift()) {
+ GGML_ABORT("The current KV cache / model configuration does not support K-shift");
+ }
+
+ LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__);
+
+ // apply K-shift if needed
+ if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
+ ggml_backend_sched_reset(sched);
+
+ auto * res = lctx->get_gf_res_reserve();
+
+ res->reset();
+
+ auto * gf = build_graph_shift(res, lctx);
+ if (!ggml_backend_sched_alloc_graph(sched, gf)) {
+ LLAMA_LOG_ERROR("%s: failed to allocate compute graph for K-shift\n", __func__);
+ return updated;
+ }
+
+ res->set_inputs(nullptr);
+
+ if (lctx->graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
+ LLAMA_LOG_ERROR("%s: failed to compute K-shift\n", __func__);
+ return updated;
+ }
+
+ updated = true;
+ }
+
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ auto & cells = v_cells[s];
+
+ cells.reset_shift();
+ }
+ }
+
+ return updated;
+}
+
+llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch, bool cont) const {
+
+ if (debug > 0) {
+ for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+ const auto seq_id = ubatch.seq_id_unq[s];
+ const auto stream_id = seq_to_stream[seq_id];
+ const auto & cells = v_cells[stream_id];
+ const uint32_t head_cur = v_heads[stream_id];
+
+ LLAMA_LOG_DEBUG("%s: stream[%d], n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
+ __func__, stream_id, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
+
+ if ((debug == 2 && n_swa > 0) || debug > 2) {
+ std::string ss;
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (cells.is_empty(i)) {
+ ss += '.';
+ } else {
+ assert(cells.seq_count(i) >= 1);
+
+ if (cells.seq_count(i) == 1) {
+ ss += std::to_string(cells.seq_get(i));
+ } else {
+ ss += 'M';
+ }
+ }
+ if (i%256 == 255) {
+ ss += " *";
+ ss += '\n';
+ }
+ }
+ LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
+ }
+
+ if ((debug == 2 && n_swa > 0) || debug > 2) {
+ std::string ss;
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ std::string cur;
+ if (cells.is_empty(i)) {
+ cur = '.';
+ } else {
+ cur = std::to_string(cells.pos_get(i));
+ }
+ const int n = cur.size();
+ for (int j = 0; j < 5 - n; ++j) {
+ cur += ' ';
+ }
+ ss += cur;
+ if (i%256 == 255) {
+ ss += " *";
+ }
+ if (i%64 == 63) {
+ ss += '\n';
+ }
+ }
+ LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
+ }
+
+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ if (cells.seq_pos_min(s) < 0) {
+ continue;
+ }
+
+ LLAMA_LOG_DEBUG("%s: stream[%d] min[%d] = %5d, max[%d] = %5d\n", __func__, stream_id, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
+ }
+ }
+ }
+
+ uint32_t n_tokens = ubatch.n_tokens;
+ uint32_t n_seqs = 1;
+
+ if (n_stream > 1) {
+ GGML_ASSERT(n_tokens % ubatch.n_seqs_unq == 0);
+
+ n_seqs = ubatch.n_seqs_unq;
+ n_tokens = n_tokens / n_seqs;
+ }
+
+ slot_info res = {
+ /*.s0 =*/ LLAMA_MAX_SEQ,
+ /*.s1 =*/ 0,
+ /*.strm =*/ { },
+ /*.idxs =*/ { },
+ };
+
+ res.resize(n_seqs);
+
+ for (uint32_t s = 0; s < n_seqs; ++s) {
+ const auto seq_id = ubatch.seq_id_unq[s];
+
+ if (n_stream > 1) {
+ GGML_ASSERT(ubatch.n_seq_id[s*n_tokens] == 1);
+ GGML_ASSERT(ubatch.seq_id [s*n_tokens][0] == seq_id);
+ }
+
+ res.s0 = std::min<uint32_t>(res.s0, seq_to_stream[seq_id]);
+ res.s1 = std::max<uint32_t>(res.s1, seq_to_stream[seq_id]);
+
+ res.strm[s] = seq_to_stream[seq_id];
+ res.idxs[s].reserve(n_tokens);
+
+ const auto & cells = v_cells[seq_to_stream[seq_id]];
+
+ uint32_t head_cur = v_heads[seq_to_stream[seq_id]];
+
+ // if we have enough unused cells before the current head ->
+ // better to start searching from the beginning of the cache, hoping to fill it
+ if (head_cur > cells.get_used() + 2*n_tokens) {
+ head_cur = 0;
+ }
+
+ if (n_tokens > cells.size()) {
+ LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size());
+ return { };
+ }
+
+ uint32_t n_tested = 0;
+
+ // for continuous slots, we test that all tokens in the ubatch fit, starting from the current head
+ // for non-continuous slots, we test the tokens one by one
+ const uint32_t n_test = cont ? n_tokens : 1;
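+
+ // e.g. (illustrative): with n_tokens = 4 and cont = true, a candidate head must provide
+ // 4 usable cells in a row; with cont = false, the 4 cells are collected one at a time
+ // and may end up scattered across the cache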
+
+ while (true) {
+ if (head_cur + n_test > cells.size()) {
+ n_tested += cells.size() - head_cur;
+ head_cur = 0;
+ continue;
+ }
+
+ for (uint32_t i = 0; i < n_test; i++) {
+ const auto idx = head_cur;
+
+ head_cur++;
+ n_tested++;
+
+ //const llama_pos pos = ubatch.pos[i];
+ //const llama_seq_id seq_id = ubatch.seq_id[i][0];
+
+ // can we use this cell? either:
+ // - the cell is empty
+ // - the cell is occupied only by one sequence:
+ // - (disabled) mask causally, if the sequence is the same as the one we are inserting
+ // - mask SWA, using current max pos for that sequence in the cache
+ // always insert in the cell with minimum pos
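+ //
+ // e.g. (illustrative, assuming the standard SWA rule that masks p1 - p0 >= n_swa):
+ // with n_swa = 8, a cell holding pos 100 for some sequence becomes reusable once
+ // that sequence's max pos reaches 107, since (107 + 1) - 100 >= 8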
+ bool can_use = cells.is_empty(idx);
+
+ if (!can_use && cells.seq_count(idx) == 1) {
+ const llama_pos pos_cell = cells.pos_get(idx);
+
+ // (disabled) causal mask
+ // note: it's better to purge any "future" tokens beforehand
+ //if (cells.seq_has(idx, seq_id)) {
+ // can_use = pos_cell >= pos;
+ //}
+
+ if (!can_use) {
+ const llama_seq_id seq_id_cell = cells.seq_get(idx);
+
+ // SWA mask
+ if (llama_hparams::is_masked_swa(n_swa, swa_type, pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
+ can_use = true;
+ }
+ }
+ }
+
+ if (can_use) {
+ res.idxs[s].push_back(idx);
+ } else {
+ if (cont) {
+ break;
+ }
+ }
+ }
+
+ if (res.idxs[s].size() == n_tokens) {
+ break;
+ }
+
+ if (cont) {
+ res.idxs[s].clear();
+ }
+
+ if (n_tested >= cells.size()) {
+ //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+ return { };
+ }
+ }
+
+ // we didn't find a suitable slot - return empty result
+ if (res.idxs[s].size() < n_tokens) {
+ return { };
+ }
+ }
+
+ assert(res.s1 >= res.s0);
+
+ return res;
+}
+
+void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) {
+ // keep track of the max sequence position that we would overwrite with this ubatch
+ // for a non-SWA cache, this will always stay empty
+ llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ];
+ for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ seq_pos_max_rm[s] = -1;
+ }
+
+ assert(ubatch.n_tokens == sinfo.n_stream()*sinfo.size());
+
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+ for (uint32_t ii = 0; ii < sinfo.size(); ++ii) {
+ const uint32_t i = s*sinfo.size() + ii;
+
+ auto & cells = v_cells[sinfo.strm[s]];
+
+ const auto idx = sinfo.idxs[s][ii];
+
+ if (!cells.is_empty(idx)) {
+ assert(cells.seq_count(idx) == 1);
+
+ const llama_seq_id seq_id = cells.seq_get(idx);
+ const llama_pos pos = cells.pos_get(idx);
+
+ seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);
+
+ cells.rm(idx);
+ }
+
+ cells.pos_set(idx, ubatch.pos[i]);
+
+ if (ubatch.is_pos_2d()) {
+ llama_kv_cell_ext ext {
+ /*.x =*/ ubatch.pos[i + ubatch.n_tokens*2],
+ /*.y =*/ ubatch.pos[i + ubatch.n_tokens],
+ };
+ cells.ext_set(idx, ext);
+ }
+
+ for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) {
+ cells.seq_add(idx, ubatch.seq_id[i][s]);
+ }
+ }
+ }
+
+ // note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence
+ // will be present in the cache. so we have to purge any position which is less than those we would overwrite
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
+ for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ if (seq_pos_max_rm[s] == -1) {
+ continue;
+ }
+
+ GGML_ASSERT(s < seq_to_stream.size());
+
+ auto & cells = v_cells[seq_to_stream[s]];
+
+ if (cells.seq_pos_min(s) <= seq_pos_max_rm[s]) {
+ LLAMA_LOG_DEBUG("%s: purging positions [%d, %d] of sequence %d from KV cache\n",
+ __func__, cells.seq_pos_min(s), seq_pos_max_rm[s], s);
+
+ seq_rm(s, cells.seq_pos_min(s), seq_pos_max_rm[s] + 1);
+ }
+ }
+
+ // move the head at the end of the slot
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+ auto & head = v_heads[sinfo.strm[s]];
+
+ head = sinfo.idxs[s].back() + 1;
+ }
+}
+
+bool llama_kv_cache::get_can_shift() const {
+ // Step35 uses per-layer RoPE dims; K-shift assumes a single global n_rot.
+ if (model.arch == LLM_ARCH_STEP35) {
+ return false;
+ }
+ return true;
+}
+
+uint32_t llama_kv_cache::get_size() const {
+ const auto & cells = v_cells[seq_to_stream[0]];
+
+ return cells.size();
+}
+
+uint32_t llama_kv_cache::get_n_stream() const {
+ return n_stream;
+}
+
+bool llama_kv_cache::get_has_shift() const {
+ bool result = false;
+
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ result |= v_cells[s].get_has_shift();
+ }
+
+ return result;
+}
+
+uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
+ uint32_t result = 0;
+
+ // pad the n_kv value so that the graph remains constant across batches and can be reused
+ // note: this also helps some backends with performance (f.ex https://github.com/ggml-org/llama.cpp/pull/16812#issuecomment-3455112220)
+ const uint32_t n_pad_cur = std::max(n_pad, 256u);
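+
+ // e.g. (illustrative): n_pad = 32, used_max_p1 = 300 -> n_pad_cur = 256,
+ // GGML_PAD(300, 256) = 512, so n_kv = min(cells.size(), 512)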
+
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+ const auto & cells = v_cells[sinfo.strm[s]];
+
+ result = std::max(std::min(cells.size(), std::max(n_pad_cur, GGML_PAD(cells.used_max_p1(), n_pad_cur))), result);
+ }
+
+ return result;
+}
+
+ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
+ const int32_t ikv = map_layer_ids.at(il);
+
+ auto * k = layers[ikv].k;
+
+ const uint64_t kv_size = get_size();
+ const uint64_t n_embd_k_gqa = k->ne[0];
+
+ assert(n_embd_k_gqa == hparams.n_embd_k_gqa(il));
+
+ const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
+
+ return ggml_view_4d(ctx, k,
+ hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, ns,
+ ggml_row_size(k->type, hparams.n_embd_head_k),
+ ggml_row_size(k->type, n_embd_k_gqa),
+ ggml_row_size(k->type, n_embd_k_gqa*kv_size),
+ ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
+}
+
+ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
+ const int32_t ikv = map_layer_ids.at(il);
+
+ auto * v = layers[ikv].v;
+
+ const uint64_t kv_size = get_size();
+ const uint64_t n_embd_v_gqa = v->ne[0];
+
+ // [TAG_V_CACHE_VARIABLE]
+ assert(n_embd_v_gqa >= hparams.n_embd_v_gqa(il));
+
+ const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
+
+ if (!v_trans) {
+ // note: v->nb[1] <= v->nb[2]
+ return ggml_view_4d(ctx, v,
+ hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
+ ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
+ ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2]
+ ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3]
+ ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
+ }
+
+ // note: v->nb[1] > v->nb[2]
+ return ggml_view_4d(ctx, v,
+ n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
+ ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
+ ggml_row_size(v->type, kv_size), // v->nb[2]
+ ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3]
+ ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
+}
+
+ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
+ GGML_UNUSED(sinfo);
+
+ const int32_t ikv = map_layer_ids.at(il);
+
+ ggml_tensor * k = layers[ikv].k;
+
+ const int64_t n_embd_head = k_cur->ne[0];
+ const int64_t n_head = k_cur->ne[1];
+ const int64_t n_tokens = k_cur->ne[2];
+
+ const int64_t n_embd_gqa = n_embd_head*n_head;
+
+ // we can merge dims 0 and 1
+ // TODO: add ggml helper function for this?
+ GGML_ASSERT(ggml_row_size(k_cur->type, n_embd_head) == k_cur->nb[1]);
+
+ k_cur = ggml_view_2d(ctx, k_cur, n_embd_gqa, n_tokens, k_cur->nb[2], 0);
+
+ const int64_t n_stream = k->ne[2];
+
+ if (n_stream > 1) {
+ const int64_t kv_size = get_size();
+
+ assert(n_embd_gqa == k->ne[0]);
+ assert(kv_size == k->ne[1]);
+
+ // merge the buffer across all streams because the idxs are global
+ k = ggml_reshape_2d(ctx, k, n_embd_gqa, kv_size*n_stream);
+ }
+
+ // store the current K values into the cache
+ return ggml_set_rows(ctx, k, k_cur, k_idxs);
+}
+
+ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
+ GGML_UNUSED(sinfo);
+
+ const int32_t ikv = map_layer_ids.at(il);
+
+ auto * v = layers[ikv].v;
+
+ const int64_t n_embd_head = v_cur->ne[0];
+ const int64_t n_head = v_cur->ne[1];
+ const int64_t n_tokens = v_cur->ne[2];
+
+ const int64_t n_embd_gqa = n_embd_head*n_head;
+
+ // we can merge dims 0 and 1
+ GGML_ASSERT(ggml_row_size(v_cur->type, n_embd_head) == v_cur->nb[1]);
+
+ const int64_t n_stream = v->ne[2];
+
+ // take this branch when FA is enabled (the V cache is not transposed)
+ if (!v_trans) {
+ v_cur = ggml_view_2d(ctx, v_cur, n_embd_gqa, n_tokens, v_cur->nb[2], 0);
+
+ if (n_stream > 1) {
+ const int64_t kv_size = get_size();
+
+ assert(n_embd_gqa == v->ne[0]);
+ assert(kv_size == v->ne[1]);
+
+ // merge the buffer across all streams because the idxs are global
+ v = ggml_reshape_2d(ctx, v, n_embd_gqa, kv_size*n_stream);
+ }
+
+ return ggml_set_rows(ctx, v, v_cur, v_idxs);
+ }
+
+ if (ggml_row_size(v_cur->type, n_embd_gqa) == v_cur->nb[2]) {
+ // we can merge dims 0, 1 and 2
+ v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens);
+ } else {
+ // otherwise -> make a copy to get contiguous data
+ v_cur = ggml_cont_2d (ctx, v_cur, n_embd_gqa, n_tokens);
+ }
+
+ // [TAG_V_CACHE_VARIABLE]
+ if (n_embd_gqa < v->ne[0]) {
+ v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_gqa, 0, 0, 0);
+ }
+
+ // in this branch the v_idxs are constructed in such a way that each row is a single head element
+ ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, ggml_nelements(v));
+
+ v_cur = ggml_reshape_2d(ctx, v_cur, 1, ggml_nelements(v_cur));
+
+ return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
+}
+
+ggml_tensor * llama_kv_cache::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ const uint32_t n_tokens = ubatch.n_tokens;
+
+ ggml_tensor * k_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens);
+
+ ggml_set_input(k_idxs);
+
+ return k_idxs;
+}
+
+ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ const uint32_t n_tokens = ubatch.n_tokens;
+
+ ggml_tensor * v_idxs;
+
+ if (!v_trans) {
+ v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens);
+ } else {
+ v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens*hparams.n_embd_v_gqa_max());
+ }
+
+ ggml_set_input(v_idxs);
+
+ return v_idxs;
+}
+
+void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
+ const uint32_t n_tokens = ubatch->n_tokens;
+ GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+ int64_t * data = (int64_t *) dst->data;
+
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+ const int64_t offs = sinfo.strm[s]*get_size();
+
+ for (uint32_t i = 0; i < sinfo.size(); ++i) {
+ data[s*sinfo.size() + i] = offs + sinfo.idxs[s][i];
+ }
+ }
+}
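+
+// e.g. (illustrative): with get_size() = 1024, a token placed in cell 7 of stream 2 gets the
+// global row index 2*1024 + 7 = 2055, matching the stream-merged 2D reshape in cpy_k()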
+
+void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
+ const uint32_t n_tokens = ubatch->n_tokens;
+ GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+ int64_t * data = (int64_t *) dst->data;
+
+ if (!v_trans) {
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+ const int64_t offs = sinfo.strm[s]*get_size();
+
+ for (uint32_t i = 0; i < sinfo.size(); ++i) {
+ data[s*sinfo.size() + i] = offs + sinfo.idxs[s][i];
+ }
+ }
+ } else {
+ // note: the V cache is transposed when not using flash attention
+ const int64_t kv_size = get_size();
+
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa_max();
+
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+ const int64_t offs = sinfo.strm[s]*kv_size*n_embd_v_gqa;
+
+ for (uint32_t i = 0; i < sinfo.size(); ++i) {
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+ data[s*sinfo.size()*n_embd_v_gqa + i*n_embd_v_gqa + j] = offs + j*kv_size + sinfo.idxs[s][i];
+ }
+ }
+ }
+ }
+}
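+
+// e.g. (illustrative): in the transposed layout, head element j of the token stored in cell i
+// of stream s lands at row s*kv_size*n_embd_v_gqa + j*kv_size + i of the single-column view
+// that cpy_v() scatters into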
+
+void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+
+ int32_t * data = (int32_t *) dst->data;
+
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ const auto & cells = v_cells[s];
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ data[s*cells.size() + i] = cells.is_empty(i) ? 0 : cells.get_shift(i);
+ }
+ }
+}
+
+struct args_set_input_kq_mask {
+ const llama_hparams & hparams;
+ const llama_ubatch * ubatch;
+
+ const std::vector<llama_kv_cells> & v_cells;
+ const std::vector<uint32_t> & seq_to_stream;
+
+ uint32_t n_swa;
+ llama_swa_type swa_type;
+
+ int64_t n_kv;
+ int64_t n_stream;
+ int64_t n_tps;
+};
+
+template<bool causal, bool swa, bool is_2d, bool alibi>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+ //const auto & hparams = args.hparams;
+ const auto & ubatch = args.ubatch;
+
+ const auto & v_cells = args.v_cells;
+ const auto & seq_to_stream = args.seq_to_stream;
+
+ const uint32_t n_swa = args.n_swa;
+ const llama_swa_type swa_type = args.swa_type;
+
+ const int64_t n_kv = args.n_kv;
+ const int64_t n_stream = args.n_stream;
+ const int64_t n_tps = args.n_tps;
+
+ // the min position in the batch for each sequence
+ llama_pos seq_pos_min[LLAMA_MAX_SEQ];
+ std::fill(seq_pos_min, seq_pos_min + LLAMA_MAX_SEQ, INT32_MAX);
+
+ for (uint32_t i = 0; i < ubatch->n_tokens; ++i) {
+ const llama_seq_id seq_id = ubatch->seq_id[i][0];
+
+ seq_pos_min[seq_id] = std::min(seq_pos_min[seq_id], ubatch->pos[i]);
+ }
+
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ // bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
+ std::unordered_map<llama_seq_id, uint32_t> seq_srct;
+ std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
+
+ for (uint32_t ii = 0; ii < n_tps; ++ii) {
+ const uint32_t i = s*n_tps + ii;
+
+ const llama_seq_id seq_id = ubatch->seq_id[i][0];
+
+ const auto & cells = v_cells.at(seq_to_stream[seq_id]);
+
+ llama_pos p0 = -1;
+ const llama_pos p1 = ubatch->pos[i];
+
+ // for M-RoPE
+ const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
+ const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
+
+ const uint64_t idst = n_kv*i;
+
+ // for tokens of the same sequence, the mask is mostly the same, so we can reuse it
+ // the only cells that can differ are those whose positions are close to the positions
+ // in the batch (i.e. due to causal masking, SWA, etc.)
+ // keep track of those cells and shortcut the loop to save time
+ // note: this optimization is not compatible with Alibi position encoding
+ // ref: https://github.com/ggml-org/llama.cpp/pull/18842
+ bool prev = false;
+
+ auto & idxs = seq_idxs[seq_id];
+
+ if (!alibi) {
+ if (seq_srct.find(seq_id) != seq_srct.end()) {
+ const uint32_t srct = seq_srct[seq_id];
+
+ const uint64_t idst_prev = n_kv*srct;
+
+ std::copy(data + idst_prev, data + idst_prev + n_kv, data + idst);
+
+ prev = true;
+ } else {
+ idxs.clear();
+ idxs.reserve(ubatch->n_tokens + n_swa + 32);
+
+ seq_srct[seq_id] = i;
+ }
+ }
+
+ for (uint32_t jj = 0; jj < n_kv; ++jj) {
+ uint32_t j = jj;
+
+ // we have an existing mask for this sequence -> only revisit the cells recorded in seq_idxs
+ if (!alibi) {
+ if (prev) {
+ if (jj >= idxs.size()) {
+ break;
+ }
+
+ j = idxs[jj];
+ }
+ }
+
+ if (cells.is_empty(j)) {
+ goto skip;
+ }
+
+ // mask the token if not the same sequence
+ if (!cells.seq_has(j, seq_id)) {
+ goto skip;
+ }
+
+ p0 = cells.pos_get(j);
+
+ if (!alibi) {
+ if (!prev) {
+ // record all cells for which: p0 >= seq_pos_min[seq_id] - n_swa - 32
+ if (p0 + (int32_t) (n_swa + 32) >= seq_pos_min[seq_id]) {
+ idxs.push_back(j);
+ }
+ }
+ }
+
+ if (causal) {
+ // mask future tokens
+ if (p0 > p1) {
+ goto skip;
+ }
+
+ // M-RoPE causal mask
+ if (is_2d) {
+ if (p0 == p1) {
+ const auto & p0_ext = cells.ext_get(j);
+
+ if (p0_ext.is_2d_gt(p1_x, p1_y)) {
+ goto skip;
+ }
+ }
+ }
+ }
+
+ // apply SWA if any
+ if (swa) {
+ if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
+ goto skip;
+ }
+ }
+
+ if (alibi) {
+ data[idst + j] = -std::abs(p0 - p1);
+ } else {
+ data[idst + j] = 0.0f;
+ }
+
+ continue;
+skip:
+ data[idst + j] = -INFINITY;
+ }
+ }
+ }
+}
+
+template<bool causal, bool swa, bool is_2d>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+ const bool alibi = args.hparams.use_alibi;
+ if (alibi) {
+ set_input_kq_mask_impl<causal, swa, is_2d, true> (args, data);
+ } else {
+ set_input_kq_mask_impl<causal, swa, is_2d, false>(args, data);
+ }
+}
+
+template<bool causal, bool swa>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+ const bool is_2d = args.ubatch->is_pos_2d();
+ if (is_2d) {
+ set_input_kq_mask_impl<causal, swa, true> (args, data);
+ } else {
+ set_input_kq_mask_impl<causal, swa, false>(args, data);
+ }
+}
+
+template<bool causal>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+ const bool swa = args.swa_type != LLAMA_SWA_TYPE_NONE;
+ if (swa) {
+ set_input_kq_mask_impl<causal, true> (args, data);
+ } else {
+ set_input_kq_mask_impl<causal, false>(args, data);
+ }
+}
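+
+// the boolean dispatch above instantiates all 2^4 = 16 <causal, swa, is_2d, alibi> variants at
+// compile time, e.g. a causal, non-SWA, non-2D, non-Alibi batch runs
+// set_input_kq_mask_impl<true, false, false, false>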
+
+void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+ const uint32_t n_tokens = ubatch->n_tokens;
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+ float * data = (float *) dst->data;
+
+ const int64_t n_kv = dst->ne[0];
+ const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch
+
+ GGML_ASSERT(n_tokens%n_stream == 0);
+
+ // n_tps == n_tokens_per_stream
+ const int64_t n_tps = n_tokens/n_stream;
+
+ //const int64_t t_start = ggml_time_us();
+
+ const args_set_input_kq_mask args = {
+ /*.hparams =*/ hparams,
+ /*.ubatch =*/ ubatch,
+ /*.v_cells =*/ v_cells,
+ /*.seq_to_stream =*/ seq_to_stream,
+ /*.n_swa =*/ n_swa,
+ /*.swa_type =*/ swa_type,
+ /*.n_kv =*/ n_kv,
+ /*.n_stream =*/ n_stream,
+ /*.n_tps =*/ n_tps,
+ };
+
+ if (causal_attn) {
+ set_input_kq_mask_impl<true> (args, data);
+ } else {
+ set_input_kq_mask_impl<false>(args, data);
+ }
+
+ //const int64_t t_end = ggml_time_us();
+
+ //LLAMA_LOG_ERROR("%s: kq mask time: %0.3f ms\n", __func__, (t_end - t_start)/1000.0);
+}
+
+void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+ const int64_t n_tokens = ubatch->n_tokens;
+
+ GGML_ASSERT(n_stream == 1 && "TODO: support multiple streams");
+ const auto & cells = v_cells[0];
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+ GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing
+
+ int32_t * data = (int32_t *) dst->data;
+
+ const int32_t n_kv = dst->ne[0];
+
+ for (int h = 0; h < 1; ++h) {
+ for (int i = 0; i < n_tokens; ++i) {
+ for (int j = 0; j < n_kv; ++j) {
+ // the position when the cell is empty is irrelevant - it will be masked out later in the attention
+ const llama_pos p0 = cells.is_empty(j) ? -1 : cells.pos_get(j);
+
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = llama_relative_position_bucket(p0, ubatch->pos[i], hparams.n_rel_attn_bkts, false);
+ }
+ }
+ }
+}
+
+size_t llama_kv_cache::total_size() const {
+ size_t size = 0;
+
+ for (const auto & [_, buf] : ctxs_bufs) {
+ size += ggml_backend_buffer_get_size(buf.get());
+ }
+
+ return size;
+}
+
+size_t llama_kv_cache::size_k_bytes() const {
+ size_t size_k_bytes = 0;
+
+ for (const auto & layer : layers) {
+ size_k_bytes += ggml_nbytes(layer.k);
+ }
+
+ return size_k_bytes;
+}
+
+size_t llama_kv_cache::size_v_bytes() const {
+ size_t size_v_bytes = 0;
+
+ for (const auto & layer : layers) {
+ size_v_bytes += layer.v ? ggml_nbytes(layer.v) : 0;
+ }
+
+ return size_v_bytes;
+}
+
+ggml_tensor * llama_kv_cache::build_rope_shift(
+ const llama_cparams & cparams,
+ ggml_context * ctx,
+ ggml_tensor * cur,
+ ggml_tensor * shift,
+ ggml_tensor * factors,
+ float freq_base,
+ float freq_scale) const {
+ const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
+
+ const auto & yarn_ext_factor = cparams.yarn_ext_factor;
+ const auto & yarn_beta_fast = cparams.yarn_beta_fast;
+ const auto & yarn_beta_slow = cparams.yarn_beta_slow;
+ const auto & yarn_attn_factor = cparams.yarn_attn_factor;
+
+ const auto & n_rot = hparams.n_rot;
+ const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
+ // @ngxson : this is a workaround
+ // for M-RoPE, we want to rotate the whole vector when doing KV shift
+ // a normal RoPE should work, we just need to use the correct ordering
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13870
+ ? LLAMA_ROPE_TYPE_NEOX
+ : hparams.rope_type;
+
+ ggml_tensor * tmp;
+
+ if (ggml_is_quantized(cur->type)) {
+ // dequantize to f32 -> RoPE -> quantize back
+ tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);
+
+ tmp = ggml_rope_ext(ctx, tmp,
+ shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
+
+ tmp = ggml_cpy(ctx, tmp, cur);
+ } else {
+ // we rotate only the first n_rot dimensions
+ tmp = ggml_rope_ext_inplace(ctx, cur,
+ shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
+ }
+
+ return tmp;
+}
+
+class llm_graph_input_k_shift : public llm_graph_input_i {
+public:
+ llm_graph_input_k_shift(const llama_kv_cache * kv_self) : kv_self(kv_self) {}
+ virtual ~llm_graph_input_k_shift() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * k_shift; // I32 [kv_size*n_stream]
+
+ const llama_kv_cache * kv_self;
+};
+
+void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
+ GGML_UNUSED(ubatch);
+
+ if (k_shift) {
+ kv_self->set_input_k_shift(k_shift);
+ }
+}
+
+ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
+ auto * ctx = res->get_ctx();
+ auto * gf = res->get_gf();
+
+ const auto & n_embd_head_k = hparams.n_embd_head_k;
+ //const auto & n_embd_head_v = hparams.n_embd_head_v;
+
+ const auto & n_rot = hparams.n_rot;
+
+ const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
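+
+ // note: for MLA-style caches (n_lora_kv > 0) the first n_embd_nope dims of each K head are
+ // not rotated, so the shifted view below is offset past them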
+
+ auto inp = std::make_unique<llm_graph_input_k_shift>(this);
+
+ inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
+ ggml_set_input(inp->k_shift);
+
+ const auto & cparams = lctx->get_cparams();
+
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
+ const int64_t n_head_kv = hparams.n_head_kv(il);
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ ggml_tensor * k =
+ ggml_view_3d(ctx, layer.k,
+ n_rot, n_head_kv, get_size()*n_stream,
+ ggml_row_size(layer.k->type, n_embd_head_k),
+ ggml_row_size(layer.k->type, n_embd_k_gqa),
+ ggml_row_size(layer.k->type, n_embd_nope));
+
+ ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
+
+ ggml_build_forward_expand(gf, cur);
+ }
+
+ res->add_input(std::move(inp));
+
+ return gf;
+}
+
+void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+ GGML_UNUSED(flags);
+
+ io.write(&n_stream, sizeof(n_stream));
+
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ cell_ranges_t cr { s, {} };
+
+ uint32_t cell_count = 0;
+
+ const auto & cells = v_cells[s];
+
+ // Count the number of cells with the specified seq_id
+ // Find all the ranges of cells with this seq id (or all, when -1)
+ uint32_t cell_range_begin = cells.size();
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) {
+ ++cell_count;
+ if (cell_range_begin == cells.size()) {
+ cell_range_begin = i;
+ }
+ } else {
+ if (cell_range_begin != cells.size()) {
+ cr.data.emplace_back(cell_range_begin, i);
+ cell_range_begin = cells.size();
+ }
+ }
+ }
+
+ if (cell_range_begin != cells.size()) {
+ cr.data.emplace_back(cell_range_begin, cells.size());
+ }
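+
+ // e.g. (illustrative): for 5 cells [., A, A, ., A] and seq_id = A this yields
+ // cell_count = 3 and ranges { [1, 3), [4, 5) }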
+
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
+ uint32_t cell_count_check = 0;
+ for (const auto & range : cr.data) {
+ cell_count_check += range.second - range.first;
+ }
+ GGML_ASSERT(cell_count == cell_count_check);
+
+ io.write(&cell_count, sizeof(cell_count));
+
+ // skip empty streams
+ if (cell_count == 0) {
+ continue;
+ }
+
+ state_write_meta(io, cr, seq_id);
+ state_write_data(io, cr);
+ }
+}
+
+void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+ GGML_UNUSED(flags);
+
+ GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
+
+ uint32_t n_stream_cur;
+ io.read_to(&n_stream_cur, sizeof(n_stream_cur));
+ if (n_stream_cur != n_stream) {
+ throw std::runtime_error("n_stream mismatch");
+ }
+
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ uint32_t cell_count;
+ io.read_to(&cell_count, sizeof(cell_count));
+
+ if (cell_count == 0) {
+ continue;
+ }
+
+ const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id];
+
+ slot_info sinfo;
+
+ bool res = true;
+ res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id);
+ res = res && state_read_data(io, strm, cell_count, sinfo);
+
+ if (!res) {
+ if (seq_id == -1) {
+ clear(true);
+ } else {
+ seq_rm(seq_id, -1, -1);
+ }
+ throw std::runtime_error("failed to restore kv cache");
+ }
+ }
+}
+
+void llama_kv_cache::state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id) const {
+ const auto & cells = v_cells[cr.strm];
+
+ for (const auto & range : cr.data) {
+ for (uint32_t i = range.first; i < range.second; ++i) {
+ std::vector<llama_seq_id> seq_ids;
+
+ for (llama_seq_id cur = 0; cur < (int) n_seq_max; ++cur) {
+ if (cur == seq_id || seq_id == -1) {
+ if (cells.seq_has(i, cur)) {
+ seq_ids.push_back(cur);
+ }
+ }
+ }
+
+ const llama_pos pos = cells.pos_get(i);
+ const uint32_t n_seq_id = seq_ids.size();
+
+ io.write(&pos, sizeof(pos));
+ io.write(&n_seq_id, sizeof(n_seq_id));
+
+ // TODO: we also need to save llama_kv_cell_ext when apply_ubatch() support loading it
+ // see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
+
+ for (const auto & seq_id : seq_ids) {
+ io.write(&seq_id, sizeof(seq_id));
+ }
+ }
+ }
+}
+
+void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const {
+ const auto & cells = v_cells[cr.strm];
+
+ const uint32_t v_trans = this->v_trans ? 1 : 0;
+ const uint32_t n_layer = layers.size();
+
+ io.write(&v_trans, sizeof(v_trans));
+ io.write(&n_layer, sizeof(n_layer));
+
+ // Iterate and write all the keys first - each row is a cell
+ // Write out whole ranges at a time
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+
+ auto * k = layer.k_stream[cr.strm];
+
+ // Write key type
+ const int32_t k_type_i = (int32_t) k->type;
+ io.write(&k_type_i, sizeof(k_type_i));
+
+ // Write row size of key
+ const uint64_t k_size_row = ggml_row_size(k->type, n_embd_k_gqa);
+ io.write(&k_size_row, sizeof(k_size_row));
+
+ // Read each range of cells of k_size_row length and write out
+ for (const auto & range : cr.data) {
+ const size_t range_size = range.second - range.first;
+ const size_t buf_size = range_size * k_size_row;
+ io.write_tensor(k, range.first * k_size_row, buf_size);
+ }
+ }
+
+ if (!v_trans) {
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+
+ auto * v = layer.v_stream[cr.strm];
+ if (!v) {
+ continue;
+ }
+
+ // Write value type
+ const int32_t v_type_i = (int32_t) v->type;
+ io.write(&v_type_i, sizeof(v_type_i));
+
+ // Write row size of value
+ const uint64_t v_size_row = ggml_row_size(v->type, n_embd_v_gqa);
+ io.write(&v_size_row, sizeof(v_size_row));
+
+ // Read each range of cells of v_size_row length and write out
+ for (const auto & range : cr.data) {
+ const size_t range_size = range.second - range.first;
+ const size_t buf_size = range_size * v_size_row;
+ io.write_tensor(v, range.first * v_size_row, buf_size);
+ }
+ }
+ } else {
+ // When v is transposed, we also need the element size, and we gather the element ranges from each row
+ const uint32_t kv_size = cells.size();
+
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+
+ auto * v = layer.v_stream[cr.strm];
+ if (!v) {
+ continue;
+ }
+
+ // Write value type
+ const int32_t v_type_i = (int32_t) v->type;
+ io.write(&v_type_i, sizeof(v_type_i));
+
+ // Write element size
+ const uint32_t v_size_el = ggml_type_size(v->type);
+ io.write(&v_size_el, sizeof(v_size_el));
+
+ // Write GQA embedding size
+ io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
+
+ // For each row, we get the element values of each cell
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+ // Read each range of cells of v_size_el length and write out
+ for (const auto & range : cr.data) {
+ const size_t range_size = range.second - range.first;
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
+ const size_t buf_size = range_size * v_size_el;
+ io.write_tensor(v, src_offset, buf_size);
+ }
+ }
+ }
+ }
+}
+
+bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id) {
+ auto & cells = v_cells[strm];
+ auto & head = v_heads[strm];
+
+ if (dest_seq_id != -1) {
+ // single sequence
+ seq_rm(dest_seq_id, -1, -1);
+
+ llama_batch_allocr balloc(hparams.n_pos_per_embd());
+
+ llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
+
+ ubatch.seq_id_unq[0] = dest_seq_id;
+
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ llama_pos pos;
+ uint32_t n_seq_id;
+
+ io.read_to(&pos, sizeof(pos));
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
+
+ if (n_seq_id != 1) {
+ LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
+ return false;
+ }
+
+ // read the sequence id, but directly discard it - we will use dest_seq_id instead
+ {
+ llama_seq_id seq_id;
+ io.read_to(&seq_id, sizeof(seq_id));
+ }
+
+ ubatch.pos[i] = pos;
+ ubatch.n_seq_id[i] = n_seq_id;
+ ubatch.seq_id[i] = &dest_seq_id;
+ }
+
+ sinfo = find_slot(ubatch, false);
+ if (sinfo.empty()) {
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+ return false;
+ }
+
+ // TODO: we cannot yet restore llama_kv_cell_ext as the apply_ubatch() does not support it yet
+ // see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
+ apply_ubatch(sinfo, ubatch);
+
+ LLAMA_LOG_DEBUG("%s: cell_count = %u, dest_seq_id = %d\n", __func__, cell_count, dest_seq_id);
+
+ // DEBUG CHECK: verify that all cells were allocated and have correct seq_id and pos values
+ GGML_ASSERT(sinfo.n_stream() == 1);
+ GGML_ASSERT(sinfo.idxs[0].size() == cell_count);
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ const uint32_t idx = sinfo.idxs[0][i];
+ GGML_ASSERT(cells.pos_get(idx) == ubatch.pos[i]);
+ GGML_ASSERT(cells.seq_has(idx, dest_seq_id));
+ }
+ } else {
+ // whole KV cache restore
+
+ if (cell_count > cells.size()) {
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
+ return false;
+ }
+
+ clear(true);
+
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ llama_pos pos;
+ uint32_t n_seq_id;
+
+ io.read_to(&pos, sizeof(pos));
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
+
+ cells.pos_set(i, pos);
+
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
+ llama_seq_id seq_id;
+ io.read_to(&seq_id, sizeof(seq_id));
+
+ if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) {
+ LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max);
+ return false;
+ }
+
+ cells.seq_add(i, seq_id);
+ }
+ }
+
+ // Create contiguous slot_info for whole cache restore
+ sinfo.s0 = strm;
+ sinfo.s1 = strm;
+ sinfo.resize(1);
+ sinfo.strm[0] = strm;
+ sinfo.idxs[0].resize(cell_count);
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ sinfo.idxs[0][i] = i;
+ }
+
+ head = 0;
+ }
+
+ return true;
+}
+
+bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo) {
+ auto & cells = v_cells[strm];
+
+ uint32_t v_trans;
+ uint32_t n_layer;
+
+ io.read_to(&v_trans, sizeof(v_trans));
+ io.read_to(&n_layer, sizeof(n_layer));
+
+ if (n_layer != layers.size()) {
+ LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, (uint32_t) layers.size());
+ return false;
+ }
+
+ if (cell_count > cells.size()) {
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, cells.size());
+ return false;
+ }
+
+ if (this->v_trans != (bool) v_trans) {
+ LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
+ return false;
+ }
+
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+
+ auto * k = layer.k_stream[strm];
+
+ // Read type of key
+ int32_t k_type_i_ref;
+ io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
+ const int32_t k_type_i = (int32_t) k->type;
+ if (k_type_i != k_type_i_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
+ return false;
+ }
+
+ // Read row size of key
+ uint64_t k_size_row_ref;
+ io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
+ const size_t k_size_row = ggml_row_size(k->type, n_embd_k_gqa);
+ if (k_size_row != k_size_row_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
+ return false;
+ }
+
+ if (cell_count) {
+ if (sinfo.is_contiguous()) {
+ // Fast path: contiguous cells, single memcpy
+ ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), sinfo.head() * k_size_row, cell_count * k_size_row);
+ } else {
+ // Slow path: scatter to non-contiguous positions
+ const void * src = io.read(cell_count * k_size_row);
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ const size_t dst_offset = sinfo.idxs[0][i] * k_size_row;
+ ggml_backend_tensor_set(k, (const char *) src + i * k_size_row, dst_offset, k_size_row);
+ }
+ }
+ }
+ }
+
+ if (!this->v_trans) {
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+
+ auto * v = layer.v_stream[strm];
+ if (!v) {
+ continue;
+ }
+
+ // Read type of value
+ int32_t v_type_i_ref;
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
+ const int32_t v_type_i = (int32_t) v->type;
+ if (v_type_i != v_type_i_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+ return false;
+ }
+
+ // Read row size of value
+ uint64_t v_size_row_ref;
+ io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
+ const size_t v_size_row = ggml_row_size(v->type, n_embd_v_gqa);
+ if (v_size_row != v_size_row_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
+ return false;
+ }
+
+ if (cell_count) {
+ if (sinfo.is_contiguous()) {
+ // Fast path: contiguous cells, single memcpy
+ ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), sinfo.head() * v_size_row, cell_count * v_size_row);
+ } else {
+ // Slow path: scatter to non-contiguous positions
+ const void * src = io.read(cell_count * v_size_row);
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ const size_t dst_offset = sinfo.idxs[0][i] * v_size_row;
+ ggml_backend_tensor_set(v, (const char *) src + i * v_size_row, dst_offset, v_size_row);
+ }
+ }
+ }
+ }
+ } else {
+ // For each layer, read the values for each cell (transposed)
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+
+ auto * v = layer.v_stream[strm];
+ if (!v) {
+ continue;
+ }
+
+ // Read type of value
+ int32_t v_type_i_ref;
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
+ const int32_t v_type_i = (int32_t) v->type;
+ if (v_type_i != v_type_i_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+ return false;
+ }
+
+ // Read element size of value
+ uint32_t v_size_el_ref;
+ io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
+ const size_t v_size_el = ggml_type_size(v->type);
+ if (v_size_el != v_size_el_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
+ return false;
+ }
+
+ // Read GQA embedding size
+ uint32_t n_embd_v_gqa_ref;
+ io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
+ return false;
+ }
+
+ if (cell_count) {
+ if (sinfo.is_contiguous()) {
+ // Fast path: contiguous cells
+ const uint32_t h = sinfo.head();
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+ const size_t dst_offset = (h + j * cells.size()) * v_size_el;
+ ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+ }
+ } else {
+ // Slow path: scatter to non-contiguous positions
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+ const void * src = io.read(cell_count * v_size_el);
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ const size_t dst_offset = (sinfo.idxs[0][i] + j * cells.size()) * v_size_el;
+ ggml_backend_tensor_set(v, (const char *) src + i * v_size_el, dst_offset, v_size_el);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+//
+// llama_kv_cache_context
+//
+
+llama_kv_cache_context::llama_kv_cache_context(llama_memory_status status) : status(status) {}
+
+llama_kv_cache_context::llama_kv_cache_context(
+ llama_kv_cache * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
+ n_kv = kv->get_size();
+
+ const uint32_t n_stream = kv->get_n_stream();
+
+ // create a dummy slot info - the actual data is irrelevant. we just need to build the graph
+ sinfos.resize(1);
+ sinfos[0].s0 = 0;
+ sinfos[0].s1 = n_stream - 1;
+ sinfos[0].idxs.resize(n_stream);
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ sinfos[0].strm.push_back(s);
+ sinfos[0].idxs[s].resize(1, 0);
+ }
+}
+
+llama_kv_cache_context::llama_kv_cache_context(
+ llama_kv_cache * kv,
+ llama_context * lctx,
+ bool do_shift,
+ stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), sc_info(std::move(sc_info)) {
+ if (!do_shift && this->sc_info.empty()) {
+ status = LLAMA_MEMORY_STATUS_NO_UPDATE;
+ }
+}
+
+llama_kv_cache_context::llama_kv_cache_context(
+ llama_kv_cache * kv,
+ llama_kv_cache::slot_info_vec_t sinfos,
+ std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), sinfos(std::move(sinfos)), ubatches(std::move(ubatches)) {
+}
+
+llama_kv_cache_context::~llama_kv_cache_context() = default;
+
+bool llama_kv_cache_context::next() {
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+ if (++i_cur >= ubatches.size()) {
+ return false;
+ }
+
+ return true;
+}
+
+bool llama_kv_cache_context::apply() {
+ assert(!llama_memory_status_is_fail(status));
+
+ // no ubatches -> this is a KV cache update
+ if (ubatches.empty()) {
+ kv->update(lctx, do_shift, sc_info);
+
+ return true;
+ }
+
+ kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]);
+ n_kv = kv->get_n_kv(sinfos[i_cur]);
+
+ return true;
+}
+
+llama_memory_status llama_kv_cache_context::get_status() const {
+ return status;
+}
+
+const llama_ubatch & llama_kv_cache_context::get_ubatch() const {
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+ return ubatches[i_cur];
+}
+
+uint32_t llama_kv_cache_context::get_n_kv() const {
+ return n_kv;
+}
+
+ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
+ return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
+}
+
+ggml_tensor * llama_kv_cache_context::get_v(ggml_context * ctx, int32_t il) const {
+ return kv->get_v(ctx, il, n_kv, sinfos[i_cur]);
+}
+
+ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
+ return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]);
+}
+
+ggml_tensor * llama_kv_cache_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const {
+ return kv->cpy_v(ctx, v_cur, v_idxs, il, sinfos[i_cur]);
+}
+
+ggml_tensor * llama_kv_cache_context::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ return kv->build_input_k_idxs(ctx, ubatch);
+}
+
+ggml_tensor * llama_kv_cache_context::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ return kv->build_input_v_idxs(ctx, ubatch);
+}
+
+void llama_kv_cache_context::set_input_k_shift(ggml_tensor * dst) const {
+ kv->set_input_k_shift(dst);
+}
+
+void llama_kv_cache_context::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+ kv->set_input_k_idxs(dst, ubatch, sinfos[i_cur]);
+}
+
+void llama_kv_cache_context::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+ kv->set_input_v_idxs(dst, ubatch, sinfos[i_cur]);
+}
+
+void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+ kv->set_input_kq_mask(dst, ubatch, causal_attn);
+}
+
+void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+ kv->set_input_pos_bucket(dst, ubatch);
+}
diff --git a/llama.cpp/src/llama-kv-cache.h b/llama.cpp/src/llama-kv-cache.h
new file mode 100644
index 0000000..e194bf3
--- /dev/null
+++ b/llama.cpp/src/llama-kv-cache.h
@@ -0,0 +1,388 @@
+#pragma once
+
+#include "llama-batch.h"
+#include "llama-graph.h"
+#include "llama-kv-cells.h"
+#include "llama-memory.h"
+
+#include <unordered_map>
+#include <vector>
+
+struct llama_cparams;
+struct llama_hparams;
+struct llama_model;
+struct llama_context;
+
+//
+// llama_kv_cache
+//
+
+class llama_kv_cache : public llama_memory_i {
+public:
+ struct stream_copy_info {
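+ // ssrc[i] -> sdst[i] describes a pending copy of stream ssrc[i] into stream sdst[i]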
+ bool empty() const {
+ assert(ssrc.size() == sdst.size());
+ return ssrc.empty();
+ }
+
+ std::vector<uint32_t> ssrc;
+ std::vector<uint32_t> sdst;
+ };
+
+// for each ubatch, create a slot_info that contains information about where the ubatch should be inserted in the
+// KV cells. for example, the cell indices for each token, such that token[i] goes to cells[idxs[i]]
+ struct slot_info {
+ // data for ggml_set_rows
+ using idx_vec_t = std::vector<uint32_t>;
+
+ // number of streams: ns = s1 - s0 + 1
+ uint32_t s0;
+ uint32_t s1;
+
+ std::vector<llama_seq_id> strm; // [ns]
+ std::vector<idx_vec_t> idxs; // [ns]
+
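+ // index of the first cell in the (single) stream - primarily meaningful for contiguous slots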
+ uint32_t head() const {
+ GGML_ASSERT(idxs.size() == 1);
+ GGML_ASSERT(!idxs[0].empty());
+
+ return idxs[0][0];
+ }
+
+ void resize(size_t n) {
+ strm.resize(n);
+ idxs.resize(n);
+ }
+
+ size_t size() const {
+ GGML_ASSERT(idxs.size() == strm.size());
+ GGML_ASSERT(!idxs.empty());
+
+ return idxs[0].size();
+ }
+
+ size_t n_stream() const {
+ return strm.size();
+ }
+
+ bool empty() const {
+ return idxs.empty();
+ }
+
+ void clear() {
+ idxs.clear();
+ }
+
+ // check if indices are contiguous starting from head()
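+ // e.g. idxs = { { 5, 6, 7 } } is contiguous (head() == 5),
+ // while idxs = { { 5, 7, 8 } } or any multi-stream slot is not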
+ bool is_contiguous() const {
+ if (idxs.empty() || idxs[0].empty()) {
+ return true;
+ }
+ if (idxs.size() > 1) {
+ return false;
+ }
+ const uint32_t h = idxs[0][0];
+ for (size_t i = 0; i < idxs[0].size(); ++i) {
+ if (idxs[0][i] != h + i) {
+ return false;
+ }
+ }
+ return true;
+ }
+ };
+
+ using slot_info_vec_t = std::vector<slot_info>;
+
+ llama_kv_cache(
+ const llama_model & model,
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ bool unified,
+ uint32_t kv_size,
+ uint32_t n_seq_max,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse);
+
+ ~llama_kv_cache() = default;
+
+ //
+ // llama_memory_i
+ //
+
+ llama_memory_context_ptr init_batch(
+ llama_batch_allocr & balloc,
+ uint32_t n_ubatch,
+ bool embd_all) override;
+
+ llama_memory_context_ptr init_full() override;
+
+ llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
+
+ bool get_can_shift() const override;
+
+ void clear(bool data) override;
+
+ bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
+ void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+ void seq_keep(llama_seq_id seq_id) override;
+ void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
+ void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
+
+ llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+ llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+ std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
+ // state write/load
+
+ void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+ void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
+
+ //
+ // llama_kv_cache specific API
+ //
+
+ uint32_t get_size() const;
+ uint32_t get_n_stream() const;
+
+ bool get_has_shift() const;
+
+ //
+ // graph_build API
+ //
+
+ uint32_t get_n_kv(const slot_info & sinfo) const;
+
+ // get views of the current state of the cache
+ ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
+ ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
+
+ // store k_cur and v_cur in the cache based on the provided head location
+ ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const;
+ ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const;
+
+ //
+ // preparation API
+ //
+
+ // find places for the provided ubatches in the cache, returns the slot infos
+ // return empty vector on failure
+ slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);
+
+ bool update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info);
+
+ // find a slot of kv cells that can hold the ubatch
+ // if cont == true, then the slot must be contiguous
+ // return empty slot_info on failure
+ slot_info find_slot(const llama_ubatch & ubatch, bool cont) const;
+
+ // emplace the ubatch context into slot: [sinfo.idxs[0...ubatch.n_tokens - 1]]
+ void apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch);
+
+ //
+ // input API
+ //
+
+ ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
+ ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
+
+ void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
+ void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
+
+ void set_input_k_shift(ggml_tensor * dst) const;
+
+ void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
+ void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
+
+private:
+ const llama_model & model;
+ const llama_hparams & hparams;
+
+ struct kv_layer {
+ // layer index in the model
+ // note: can be different from the layer index in the KV cache
+ uint32_t il;
+
+ ggml_tensor * k;
+ ggml_tensor * v;
+
+ std::vector<ggml_tensor *> k_stream;
+ std::vector<ggml_tensor *> v_stream;
+ };
+
+ bool v_trans = true; // the value tensor is transposed
+
+ const uint32_t n_seq_max = 1;
+ const uint32_t n_stream = 1;
+
+ // required padding
+ const uint32_t n_pad = 1;
+
+ // SWA
+ const uint32_t n_swa = 0;
+
+ // env: LLAMA_KV_CACHE_DEBUG
+ int debug = 0;
+
+ // this is the SWA type of the cache - not to be confused with the model SWA type
+ const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+
+ // ggml contexts for the KV cache along with the allocated backend buffers:
+ std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
+
+ // the current index from which we start searching for a free slot in the ring buffer of KV cells (see find_slot())
+ // note: this is not part of the KV state - it is only used to speed up the find_slot() method
+ std::vector<uint32_t> v_heads;
+
+ std::vector<llama_kv_cells> v_cells;
+
+ // maps from a sequence id to a stream id
+ std::vector<uint32_t> seq_to_stream;
+
+ // pending stream copies that will be applied during the next update
+ stream_copy_info sc_info;
+
+ std::vector<kv_layer> layers;
+
+ // model layer id -> KV cache layer id
+ std::unordered_map<int32_t, int32_t> map_layer_ids;
+
+ size_t total_size() const;
+
+ size_t size_k_bytes() const;
+ size_t size_v_bytes() const;
+
+ ggml_tensor * build_rope_shift(
+ const llama_cparams & cparams,
+ ggml_context * ctx,
+ ggml_tensor * cur,
+ ggml_tensor * shift,
+ ggml_tensor * factors,
+ float freq_base,
+ float freq_scale) const;
+
+ ggml_cgraph * build_graph_shift(
+ llm_graph_result * res,
+ llama_context * lctx) const;
+
+ struct cell_ranges_t {
+ uint32_t strm;
+
+ std::vector<std::pair<uint32_t, uint32_t>> data; // ranges, from inclusive, to exclusive
+ };
+
+ void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
+ void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;
+
+ bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id = -1);
+ bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo);
+};
+
+class llama_kv_cache_context : public llama_memory_context_i {
+public:
+ // some shorthands
+ using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
+ using stream_copy_info = llama_kv_cache::stream_copy_info;
+
+ // used for errors
+ llama_kv_cache_context(llama_memory_status status);
+
+ // used to create a full-cache context
+ llama_kv_cache_context(
+ llama_kv_cache * kv);
+
+ // used to create an update context
+ llama_kv_cache_context(
+ llama_kv_cache * kv,
+ llama_context * lctx,
+ bool do_shift,
+ stream_copy_info sc_info);
+
+ // used to create a batch processing context from a batch
+ llama_kv_cache_context(
+ llama_kv_cache * kv,
+ slot_info_vec_t sinfos,
+ std::vector<llama_ubatch> ubatches);
+
+ virtual ~llama_kv_cache_context();
+
+ //
+ // llama_memory_context_i
+ //
+
+ bool next() override;
+ bool apply() override;
+
+ llama_memory_status get_status() const override;
+ const llama_ubatch & get_ubatch() const override;
+
+ //
+ // llama_kv_cache_context specific API
+ //
+
+ uint32_t get_n_kv() const;
+
+ // get views of the current state of the cache
+ ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
+ ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
+
+ // store k_cur and v_cur in the cache based on the provided head location
+ // note: the heads in k_cur and v_cur should be laid out contiguously in memory
+ // - k_cur [n_embd_head_k, n_head_k, n_tokens]
+ // - k_idxs [n_tokens]
+ // - v_cur [n_embd_head_v, n_head_v, n_tokens]
+ // - v_idxs [n_tokens] or [n_tokens*n_embd_v_gqa] depending on whether the V cache is transposed
+ ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
+ ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;
+
+ // create destination indices for each head of the current batch, indicating where it will be written in the KV cache
+ // the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
+ // helps understand the implementation logic of cpy_k and cpy_v
+ ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
+ ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
+
+ void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const;
+ void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const;
+
+ void set_input_k_shift (ggml_tensor * dst) const;
+ void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
+ void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
+
+private:
+ llama_memory_status status;
+
+ llama_kv_cache * kv;
+ llama_context * lctx;
+
+ //
+ // update context
+ //
+
+ bool do_shift = false;
+
+ stream_copy_info sc_info;
+
+ //
+ // batch processing context
+ //
+
+ // the index of the cur ubatch to process
+ size_t i_cur = 0;
+
+ slot_info_vec_t sinfos;
+
+ std::vector<llama_ubatch> ubatches;
+
+ //
+ // data needed for building the compute graph for the current ubatch:
+ //
+
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
+ // as the cache gets filled, the benefit from this heuristic disappears
+ int32_t n_kv;
+};
diff --git a/llama.cpp/src/llama-kv-cells.h b/llama.cpp/src/llama-kv-cells.h
new file mode 100644
index 0000000..10063bf
--- /dev/null
+++ b/llama.cpp/src/llama-kv-cells.h
@@ -0,0 +1,533 @@
+#pragma once
+
+#include "llama.h"
+#include "llama-cparams.h"
+
+#include <bitset>
+#include <cassert>
+#include <cstring>
+#include <map>
+#include <set>
+#include <vector>
+
+struct llama_kv_cell_ext {
+ // 2D spatial positions, typically used for M-RoPE
+ llama_pos x = 0;
+ llama_pos y = 0;
+
+ // return true if the current 2D spatial position is greater than other
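+ // (lexicographic comparison with y as the major coordinate)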
+ bool is_2d_gt(llama_pos ox, llama_pos oy) const {
+ return (y > oy) || (y == oy && x > ox);
+ }
+
+ void reset() {
+ static_assert(std::is_trivially_copyable_v<llama_kv_cell_ext>);
+
+ memset(this, 0, sizeof(*this));
+ }
+};
+
+// meta information about KV cells that can be part of multiple sequences at the same time
+// TODO: add unit tests
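+//
+// example usage (a minimal sketch based on the API below):
+//
+//   llama_kv_cells cells;
+//   cells.resize(1024);
+//   cells.pos_set(0, 0); // place a token at cell 0, position 0
+//   cells.seq_add(0, 0); // assign it to sequence 0
+//   cells.pos_add(0, 5); // shift its position by +5 (sets has_shift)
+//   cells.rm(0);         // free the cell again
+//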
+class llama_kv_cells {
+public:
+ void reset() {
+ for (uint32_t i = 0; i < pos.size(); ++i) {
+ pos[i] = -1;
+ ext[i].reset();
+ shift[i] = 0;
+ seq[i].reset();
+ }
+
+ has_shift = false;
+
+ used.clear();
+
+ for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ seq_pos[s].clear();
+ }
+ }
+
+ void reset_shift() {
+ has_shift = false;
+
+ for (uint32_t i = 0; i < shift.size(); ++i) {
+ shift[i] = 0;
+ }
+ }
+
+ uint32_t size() const {
+ return pos.size();
+ }
+
+ void resize(uint32_t n) {
+ pos.resize(n);
+ ext.resize(n);
+ shift.resize(n);
+ seq.resize(n);
+
+ reset();
+ }
+
+ bool is_empty(uint32_t i) const {
+ assert(i < pos.size());
+ assert((pos[i] < 0 && pos[i] == -1) || pos[i] >= 0);
+
+ return pos[i] == -1;
+ }
+
+ uint32_t get_used() const {
+ return used.size();
+ }
+
+ // the index of the first cell that is used
+ // return 0 if no cells are used
+ uint32_t used_min() const {
+ return used.empty() ? 0 : *used.begin();
+ }
+
+ // the index of the last cell that is used + 1
+ // return 0 if no cells are used
+ uint32_t used_max_p1() const {
+ return used.empty() ? 0 : *used.rbegin() + 1;
+ }
+
+ bool get_has_shift() const {
+ return has_shift;
+ }
+
+ // move cell isrc to idst (used during defrag)
+ //void mv(uint32_t isrc, uint32_t idst) {
+ // assert(isrc < pos.size());
+ // assert(idst < pos.size());
+
+ // assert(pos[idst] == -1);
+ // assert(pos[isrc] != -1);
+
+ // pos [idst] = pos [isrc];
+ // shift[idst] = shift[isrc];
+ // seq [idst] = seq [isrc];
+
+ // pos [isrc] = -1;
+ // shift[isrc] = 0;
+ // seq [isrc].reset();
+
+ // used.erase (isrc);
+ // used.insert(idst);
+ //}
+
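+ // cp() and set() are intended as a pair, e.g. to snapshot and later restore a set of cells
+ // (both assert that no shift is pending):
+ //
+ //   llama_kv_cells bak = cells.cp(idxs); // snapshot
+ //   /* ... mutate the cells ... */
+ //   cells.set(idxs, bak);                // restore
+ //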
+ // copy the state of cells [i, i + n) (used to save/restore the state of the cells)
+ llama_kv_cells cp(uint32_t i, uint32_t n) const {
+ assert(i + n <= pos.size());
+
+ llama_kv_cells res;
+
+ res.resize(n);
+
+ for (uint32_t j = 0; j < n; ++j) {
+ const auto idx = i + j;
+
+ res.pos[j] = pos[idx];
+ res.ext[j] = ext[idx];
+ res.seq[j] = seq[idx];
+
+ assert(shift[idx] == 0);
+ }
+
+ return res;
+ }
+
+ // copy the state of cells {idxs[0], idxs[1], ..., idxs[idxs.size() - 1]}
+ llama_kv_cells cp(const std::vector<uint32_t> & idxs) const {
+ llama_kv_cells res;
+
+ res.resize(idxs.size());
+
+ for (uint32_t j = 0; j < idxs.size(); ++j) {
+ const auto idx = idxs[j];
+
+ res.pos[j] = pos[idx];
+ res.ext[j] = ext[idx];
+ res.seq[j] = seq[idx];
+
+ assert(shift[idx] == 0);
+ }
+
+ return res;
+ }
+
+ // set the state of cells [i, i + other.pos.size()) (used to save/restore the state of the cells)
+ void set(uint32_t i, const llama_kv_cells & other) {
+ assert(i + other.pos.size() <= pos.size());
+
+ for (uint32_t j = 0; j < other.pos.size(); ++j) {
+ const auto idx = i + j;
+
+ if (pos[idx] == -1 && other.pos[j] != -1) {
+ used.insert(i + j);
+ }
+
+ if (pos[idx] != -1 && other.pos[j] == -1) {
+ used.erase(i + j);
+ }
+
+ if (pos[idx] != -1) {
+ seq_pos_rm(i + j);
+ }
+
+ pos[idx] = other.pos[j];
+ ext[idx] = other.ext[j];
+ seq[idx] = other.seq[j];
+
+ if (pos[idx] != -1) {
+ seq_pos_add(i + j);
+ }
+
+ assert(shift[idx] == 0);
+ }
+ }
+
+ // set the state of cells {idxs[0], idxs[1], ..., idxs[idxs.size() - 1]}
+ void set(const std::vector<uint32_t> & idxs, const llama_kv_cells & other) {
+ assert(idxs.size() == other.pos.size());
+
+ for (uint32_t j = 0; j < other.pos.size(); ++j) {
+ const auto idx = idxs[j];
+
+ if (pos[idx] == -1 && other.pos[j] != -1) {
+ used.insert(idx);
+ }
+
+ if (pos[idx] != -1 && other.pos[j] == -1) {
+ used.erase(idx);
+ }
+
+ if (pos[idx] != -1) {
+ seq_pos_rm(idx);
+ }
+
+ pos[idx] = other.pos[j];
+ ext[idx] = other.ext[j];
+ seq[idx] = other.seq[j];
+
+ if (pos[idx] != -1) {
+ seq_pos_add(idx);
+ }
+
+ assert(shift[idx] == 0);
+ }
+ }
+
+ // clear a non-empty cell
+ void rm(uint32_t i) {
+ assert(i < pos.size());
+ assert(pos[i] != -1);
+
+ seq_pos_rm(i);
+ seq[i].reset();
+
+ pos[i] = -1;
+ ext[i].reset();
+ shift[i] = 0;
+
+ used.erase(i);
+ }
+
+ // note: call only if the cell has seq_id
+ // return true if the cell becomes empty
+ bool seq_rm(uint32_t i, llama_seq_id seq_id) {
+ assert(i < pos.size());
+ assert(seq[i].test(seq_id));
+ assert(pos[i] != -1);
+ assert(seq_id >= 0);
+
+ seq[i].reset(seq_id);
+ seq_pos_dec(seq_id, pos[i]);
+
+ if (seq[i].none()) {
+ pos[i] = -1;
+ ext[i].reset();
+ shift[i] = 0;
+
+ used.erase(i);
+
+ return true;
+ }
+
+ return false;
+ }
+
+ // keep only seq_id in the cell
+ // return true if the cell becomes empty (i.e. it was in use but did not contain seq_id)
+ bool seq_keep(uint32_t i, llama_seq_id seq_id) {
+ assert(i < pos.size());
+
+ if (seq[i].test(seq_id)) {
+ seq_pos_rm(i);
+ seq[i].reset();
+
+ seq[i].set(seq_id);
+ seq_pos_inc(seq_id, pos[i]);
+
+ return false;
+ }
+
+ if (seq[i].any()) {
+ seq_pos_rm(i);
+ seq[i].reset();
+
+ pos[i] = -1;
+ ext[i].reset();
+ shift[i] = 0;
+
+ used.erase(i);
+
+ return true;
+ }
+
+ assert(pos[i] == -1);
+
+ return false;
+ }
+
+ // number of different sequences in the cell
+ int seq_count(uint32_t i) const {
+ assert(i < pos.size());
+ assert(pos[i] != -1);
+
+ return seq[i].count();
+ }
+
+ // check if the cell contains seq_id
+ bool seq_has(uint32_t i, llama_seq_id seq_id) const {
+ assert(i < pos.size());
+ assert(seq_id >= 0);
+
+ return seq[i].test(seq_id);
+ }
+
+ // note: call only if the cell is not empty and the seq_id is not in the cell
+ void seq_add(uint32_t i, llama_seq_id seq_id) {
+ assert(i < pos.size());
+ assert(pos[i] != -1);
+ assert(!seq[i].test(seq_id));
+
+ seq[i].set(seq_id);
+ seq_pos_inc(seq_id, pos[i]);
+ }
+
+ // return the sequence id of this cell
+ // note: call only for cells with exactly one sequence
+ llama_seq_id seq_get(uint32_t i) const {
+ assert(seq[i].count() == 1);
+
+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ if (seq[i].test(s)) {
+ return s;
+ }
+ }
+
+ return -1;
+ }
+
+ // the minimum position of sequence seq_id currently present in any of the cells
+ // return -1 if the sequence is not present
+ llama_pos seq_pos_min(llama_seq_id seq_id) const {
+ assert(seq_id >= 0);
+ assert(seq_id < LLAMA_MAX_SEQ);
+
+ if (seq_pos[seq_id].empty()) {
+ return -1;
+ }
+
+ assert(seq_pos[seq_id].begin()->second > 0);
+
+ return seq_pos[seq_id].begin()->first;
+ }
+
+ // the maximum position of sequence seq_id currently present in any of the cells
+ // return -1 if the sequence is not present
+ llama_pos seq_pos_max(llama_seq_id seq_id) const {
+ assert(seq_id >= 0);
+ assert(seq_id < LLAMA_MAX_SEQ);
+
+ if (seq_pos[seq_id].empty()) {
+ return -1;
+ }
+
+ assert(seq_pos[seq_id].rbegin()->second > 0);
+
+ return seq_pos[seq_id].rbegin()->first;
+ }
+
+ // note: call only if the cell is not empty
+ llama_pos pos_get(uint32_t i) const {
+ assert(i < pos.size());
+ assert(pos[i] != -1);
+
+ return pos[i];
+ }
+
+ const llama_kv_cell_ext & ext_get(uint32_t i) const {
+ assert(i < pos.size());
+ assert(pos[i] != -1);
+
+ return ext[i];
+ }
+
+ // note: call only if the cell is not empty
+ llama_pos get_shift(uint32_t i) const {
+ assert(i < pos.size());
+ assert(pos[i] != -1);
+
+ return shift[i];
+ }
+
+ // check if a cell is not empty and its position is within [p0, p1)
+ bool pos_in(uint32_t i, llama_pos p0, llama_pos p1) const {
+ assert(i < pos.size());
+
+ return pos[i] >= p0 && pos[i] < p1;
+ }
+
+ // set the position of an empty cell
+ // does not modify "has_shift"
+ // note: call only if the cell is empty
+ void pos_set(uint32_t i, llama_pos p) {
+ assert(i < pos.size());
+ assert(pos[i] == -1);
+ assert(seq[i].none());
+
+ pos[i] = p;
+
+ used.insert(i);
+ }
+
+ void ext_set(uint32_t i, llama_kv_cell_ext p) {
+ assert(i < ext.size());
+ ext[i] = p;
+ }
+
+ // pos[i] = pos[i] + d
+ // sets "has_shift" to true
+ // note: call only if the cell is not empty
+ // return true if the cell becomes empty (i.e. the new position would be negative)
+ bool pos_add(uint32_t i, llama_pos d) {
+ assert(i < pos.size());
+ assert(pos[i] != -1);
+
+ seq_pos_rm(i);
+
+ pos[i] += d;
+ shift[i] += d;
+
+ has_shift = true;
+
+ if (pos[i] < 0) {
+ seq[i].reset();
+ pos[i] = -1;
+ shift[i] = 0;
+
+ used.erase(i);
+
+ return true;
+ }
+
+ seq_pos_add(i);
+
+ return false;
+ }
+
+ // pos[i] = pos[i] / d
+ // sets "has_shift" to true
+ // note: call only if the cell is not empty
+ void pos_div(uint32_t i, int d) {
+ assert(i < pos.size());
+ assert(pos[i] != -1);
+
+ const llama_pos p_old = pos[i];
+
+ seq_pos_rm(i);
+
+ pos[i] /= d;
+ shift[i] += p_old - pos[i];
+
+ seq_pos_add(i);
+
+ has_shift = true;
+ }
+
+private:
+ bool has_shift = false;
+
+ // set of indices of used cells (i.e. pos[i] != -1; a used cell is allowed to have no seq_id)
+ std::set<uint32_t> used;
+
+ std::vector<llama_pos> pos;
+
+ // stores extra info per cell
+ std::vector<llama_kv_cell_ext> ext;
+
+ // this array accumulates any applied shifts to the pos array since the last reset_shift() call
+ // this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
+ //
+ // cells.pos_add(x, shift_x);
+ // cells.pos_div(y, shift_y);
+ // ...
+ //
+ // if (cells.has_shift()) {
+ // for (int i = 0; i < n; ++i) {
+ // auto shift_i = cells.get_shift(i);
+ // ...
+ // }
+ // cells.reset_shift();
+ // }
+ //
+ std::vector<llama_pos> shift;
+
+ using seq_set_t = std::bitset<LLAMA_MAX_SEQ>;
+
+ // the bitset seq[i] tells us which sequences are currently occupying the i-th cell
+ std::vector<seq_set_t> seq;
+
+ // the set seq_pos[s][p] tells us how many times the position p is currently present for sequence s
+ // if the position p is not present, seq_pos[s][p] is not set
+ // this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
+ //
+ // note that we cannot use a std::set, because in some cases a position can occur more than once for the same seq:
+ //   - while performing cache reuse via (rm + add)
+ // - some vision models have input embeddings with repeating positions
+ //
+ std::map<llama_pos, int> seq_pos[LLAMA_MAX_SEQ];
+
+ // helper functions for updating `seq_pos`, one cell at a time:
+
+ void seq_pos_dec(llama_seq_id s, llama_pos p) {
+ auto it = seq_pos[s].find(p);
+ assert(it != seq_pos[s].end());
+
+ if (--it->second == 0) {
+ seq_pos[s].erase(it);
+ }
+ }
+
+ void seq_pos_inc(llama_seq_id s, llama_pos p) {
+ seq_pos[s][p]++;
+ }
+
+ // remove cell i
+ void seq_pos_rm(uint32_t i) {
+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ if (seq[i].test(s)) {
+ seq_pos_dec(s, pos[i]);
+ }
+ }
+ }
+
+ // add cell i
+ void seq_pos_add(uint32_t i) {
+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ if (seq[i].test(s)) {
+ seq_pos_inc(s, pos[i]);
+ }
+ }
+ }
+};
diff --git a/llama.cpp/src/llama-memory-hybrid-iswa.cpp b/llama.cpp/src/llama-memory-hybrid-iswa.cpp
new file mode 100644
index 0000000..4117696
--- /dev/null
+++ b/llama.cpp/src/llama-memory-hybrid-iswa.cpp
@@ -0,0 +1,275 @@
+#include "llama-memory-hybrid-iswa.h"
+
+#include "llama-impl.h"
+#include "llama-model.h"
+#include "llama-context.h"
+
+//
+// llama_memory_hybrid_iswa
+//
+
+llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
+ const llama_model & model,
+ /* attn */
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool swa_full,
+ uint32_t kv_size,
+ uint32_t n_ubatch,
+ uint32_t n_pad,
+ /* recurrent */
+ ggml_type type_r,
+ ggml_type type_s,
+ uint32_t rs_size,
+ /* common */
+ uint32_t n_seq_max,
+ bool offload,
+ bool unified,
+ /* layer filters */
+ const layer_filter_cb & filter_attn,
+ const layer_filter_cb & filter_recr) :
+ hparams(model.hparams),
+ mem_attn(new llama_kv_cache_iswa(
+ model,
+ type_k,
+ type_v,
+ v_trans,
+ offload,
+ swa_full,
+ unified,
+ kv_size,
+ n_seq_max,
+ n_ubatch,
+ n_pad,
+ filter_attn == nullptr ?
+ [&](int32_t il) { return !hparams.is_recurrent(il); }
+ : filter_attn,
+ nullptr
+ )),
+ mem_recr(new llama_memory_recurrent(
+ model,
+ type_r,
+ type_s,
+ offload,
+ rs_size,
+ n_seq_max,
+ filter_recr == nullptr ?
+ [&](int32_t il) { return hparams.is_recurrent(il); }
+ : filter_recr
+ )) {}
+
+llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+ do {
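+        // single-iteration loop - the breaks below jump to the shared failure path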
+ balloc.split_reset();
+
+ // follow the recurrent pattern for creating the ubatch splits
+ std::vector<llama_ubatch> ubatches;
+
+ while (true) {
+ llama_ubatch ubatch;
+
+ if (embd_all) {
+ // if all tokens are output, split by sequence
+ ubatch = balloc.split_seq(n_ubatch);
+ } else {
+ // TODO: non-sequential equal split can be done if using unified KV cache
+ // for simplicity, we always use sequential equal split for now
+ ubatch = balloc.split_equal(n_ubatch, true);
+ }
+
+ if (ubatch.n_tokens == 0) {
+ break;
+ }
+
+ ubatches.push_back(std::move(ubatch)); // NOLINT
+ }
+
+ if (balloc.get_n_used() < balloc.get_n_tokens()) {
+ // failed to find a suitable split
+ break;
+ }
+
+ // prepare the recurrent batches first
+ if (!mem_recr->prepare(ubatches)) {
+ // TODO: will the recurrent cache be in an undefined state at this point?
+ LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
+ return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+ }
+
+ // prepare the attention cache (iswa version returns both base and swa slot infos)
+ auto sinfos_base = mem_attn->get_base()->prepare(ubatches);
+ if (sinfos_base.empty()) {
+ LLAMA_LOG_ERROR("%s: failed to prepare attention base ubatches\n", __func__);
+ return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+ }
+
+ auto sinfos_swa = mem_attn->get_swa()->prepare(ubatches);
+ if (sinfos_swa.empty()) {
+ LLAMA_LOG_ERROR("%s: failed to prepare attention swa ubatches\n", __func__);
+ return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+ }
+
+ return std::make_unique<llama_memory_hybrid_iswa_context>(
+ this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
+ } while (false);
+
+ return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+}
+
+llama_memory_context_ptr llama_memory_hybrid_iswa::init_full() {
+ return std::make_unique<llama_memory_hybrid_iswa_context>(this);
+}
+
+llama_memory_context_ptr llama_memory_hybrid_iswa::init_update(llama_context * lctx, bool optimize) {
+ return std::make_unique<llama_memory_hybrid_iswa_context>(this, lctx, optimize);
+}
+
+bool llama_memory_hybrid_iswa::get_can_shift() const {
+ // Shifting is trivially supported for the recurrent cache, so only the attention cache matters here
+ return mem_attn->get_can_shift();
+}
+
+void llama_memory_hybrid_iswa::clear(bool data) {
+ mem_attn->clear(data);
+ mem_recr->clear(data);
+}
+
+bool llama_memory_hybrid_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ // Try removing from the recurrent cache first since it may fail. If it does
+ // fail, the cache will not have been mutated.
+ if (!mem_recr->seq_rm(seq_id, p0, p1)) {
+ return false;
+ }
+ return mem_attn->seq_rm(seq_id, p0, p1);
+}
+
+void llama_memory_hybrid_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+ mem_attn->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+ mem_recr->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_memory_hybrid_iswa::seq_keep(llama_seq_id seq_id) {
+ mem_attn->seq_keep(seq_id);
+ mem_recr->seq_keep(seq_id);
+}
+
+void llama_memory_hybrid_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+ mem_attn->seq_add(seq_id, p0, p1, shift);
+ mem_recr->seq_add(seq_id, p0, p1, shift);
+}
+
+void llama_memory_hybrid_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+ mem_attn->seq_div(seq_id, p0, p1, d);
+ mem_recr->seq_div(seq_id, p0, p1, d);
+}
+
+llama_pos llama_memory_hybrid_iswa::seq_pos_min(llama_seq_id seq_id) const {
+ // the min of the total cache is the max of the two caches' min values
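+    // e.g. if attn holds positions [10, 20] and recr holds [5, 20], only [10, 20] is present in both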
+ return std::max(mem_attn->seq_pos_min(seq_id), mem_recr->seq_pos_min(seq_id));
+}
+
+llama_pos llama_memory_hybrid_iswa::seq_pos_max(llama_seq_id seq_id) const {
+ // the max of the total cache is the min of the two caches' max values
+ return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
+}
+
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid_iswa::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+ for (const auto & buft_size : mem_recr->memory_breakdown()) {
+ mb[buft_size.first] += buft_size.second;
+ }
+ return mb;
+}
+
+void llama_memory_hybrid_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+ mem_attn->state_write(io, seq_id, flags);
+ mem_recr->state_write(io, seq_id, flags);
+}
+
+void llama_memory_hybrid_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+ mem_attn->state_read(io, seq_id, flags);
+ mem_recr->state_read(io, seq_id, flags);
+}
+
+llama_kv_cache_iswa * llama_memory_hybrid_iswa::get_mem_attn() const {
+ return mem_attn.get();
+}
+
+llama_memory_recurrent * llama_memory_hybrid_iswa::get_mem_recr() const {
+ return mem_recr.get();
+}
+
+//
+// llama_memory_hybrid_iswa_context
+//
+
+llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_status status) : status(status) {}
+
+llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem) :
+ ctx_attn(mem->get_mem_attn()->init_full()),
+ ctx_recr(mem->get_mem_recr()->init_full()),
+ status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
+}
+
+llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
+ llama_memory_hybrid_iswa * mem,
+ llama_context * lctx,
+ bool optimize) :
+ ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
+ ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
+ status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
+}
+
+llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
+ llama_memory_hybrid_iswa * mem,
+ slot_info_vec_t sinfos_base,
+ slot_info_vec_t sinfos_swa,
+ std::vector<llama_ubatch> ubatches) :
+ ubatches(std::move(ubatches)),
+ // note: here we copy the ubatches. not sure if this is ideal
+ ctx_attn(new llama_kv_cache_iswa_context(mem->get_mem_attn(), std::move(sinfos_base), std::move(sinfos_swa), this->ubatches)),
+ ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
+ status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
+}
+
+bool llama_memory_hybrid_iswa_context::next() {
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+ ctx_attn->next();
+ ctx_recr->next();
+
+ if (++i_next >= ubatches.size()) {
+ return false;
+ }
+
+ return true;
+}
+
+bool llama_memory_hybrid_iswa_context::apply() {
+ assert(!llama_memory_status_is_fail(status));
+
+ bool res = true;
+
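+    // note: use a non-short-circuiting & so that both contexts are applied even if the first one fails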
+ res = res & ctx_attn->apply();
+ res = res & ctx_recr->apply();
+
+ return res;
+}
+
+llama_memory_status llama_memory_hybrid_iswa_context::get_status() const {
+ return status;
+}
+
+const llama_ubatch & llama_memory_hybrid_iswa_context::get_ubatch() const {
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+ return ubatches[i_next];
+}
+
+const llama_kv_cache_iswa_context * llama_memory_hybrid_iswa_context::get_attn() const {
+ return static_cast<const llama_kv_cache_iswa_context *>(ctx_attn.get());
+}
+
+const llama_memory_recurrent_context * llama_memory_hybrid_iswa_context::get_recr() const {
+ return static_cast<const llama_memory_recurrent_context *>(ctx_recr.get());
+}
diff --git a/llama.cpp/src/llama-memory-hybrid-iswa.h b/llama.cpp/src/llama-memory-hybrid-iswa.h
new file mode 100644
index 0000000..807c8aa
--- /dev/null
+++ b/llama.cpp/src/llama-memory-hybrid-iswa.h
@@ -0,0 +1,140 @@
+#pragma once
+
+#include "llama-batch.h"
+#include "llama-graph.h"
+#include "llama-kv-cache-iswa.h"
+#include "llama-memory.h"
+#include "llama-memory-recurrent.h"
+
+#include <memory>
+#include <vector>
+
+//
+// llama_memory_hybrid_iswa
+//
+
+// utilizes instances of llama_memory_recurrent and llama_kv_cache_iswa to
+// support models where each layer may be either attention-based (with SWA support) or recurrent
+
+class llama_memory_hybrid_iswa : public llama_memory_i {
+public:
+ llama_memory_hybrid_iswa(
+ const llama_model & model,
+ /* attn */
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool swa_full,
+ uint32_t kv_size,
+ uint32_t n_ubatch,
+ uint32_t n_pad,
+ /* recurrent */
+ ggml_type type_r,
+ ggml_type type_s,
+ uint32_t rs_size,
+ /* common */
+ uint32_t n_seq_max,
+ bool offload,
+ bool unified,
+ /* layer filters */
+ const layer_filter_cb & filter_attn = nullptr,
+ const layer_filter_cb & filter_recr = nullptr);
+
+ ~llama_memory_hybrid_iswa() = default;
+
+ //
+ // llama_memory_i
+ //
+
+ llama_memory_context_ptr init_batch(
+ llama_batch_allocr & balloc,
+ uint32_t n_ubatch,
+ bool embd_all) override;
+
+ llama_memory_context_ptr init_full() override;
+
+ llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
+
+ bool get_can_shift() const override;
+
+ void clear(bool data) override;
+
+ bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
+ void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+ void seq_keep(llama_seq_id seq_id) override;
+ void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
+ void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
+
+ llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+ llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+ std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
+ // state write/load
+
+ void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+ void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
+
+ //
+ // llama_memory_hybrid_iswa specific API
+ //
+
+ llama_kv_cache_iswa * get_mem_attn() const;
+ llama_memory_recurrent * get_mem_recr() const;
+
+private:
+ const llama_hparams & hparams;
+
+ const std::unique_ptr<llama_kv_cache_iswa> mem_attn;
+ const std::unique_ptr<llama_memory_recurrent> mem_recr;
+};
+
+class llama_memory_hybrid_iswa_context : public llama_memory_context_i {
+public:
+ using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
+
+ // init failure
+ explicit llama_memory_hybrid_iswa_context(llama_memory_status status);
+
+ // init full
+ explicit llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem);
+
+ // init update
+ explicit llama_memory_hybrid_iswa_context(
+ llama_memory_hybrid_iswa * mem,
+ llama_context * lctx,
+ bool optimize);
+
+ // init success
+ llama_memory_hybrid_iswa_context(
+ llama_memory_hybrid_iswa * mem,
+ slot_info_vec_t sinfos_base,
+ slot_info_vec_t sinfos_swa,
+ std::vector<llama_ubatch> ubatches);
+
+ ~llama_memory_hybrid_iswa_context() = default;
+
+ bool next() override;
+ bool apply() override;
+
+ llama_memory_status get_status() const override;
+ const llama_ubatch & get_ubatch() const override;
+
+ //
+ // llama_memory_hybrid_iswa_context
+ //
+
+ const llama_kv_cache_iswa_context * get_attn() const;
+ const llama_memory_recurrent_context * get_recr() const;
+
+private:
+ // the index of the next ubatch to process
+ size_t i_next = 0;
+
+ std::vector<llama_ubatch> ubatches;
+
+ const llama_memory_context_ptr ctx_attn;
+ const llama_memory_context_ptr ctx_recr;
+
+ const llama_memory_status status;
+};
diff --git a/llama.cpp/src/llama-memory-hybrid.cpp b/llama.cpp/src/llama-memory-hybrid.cpp
new file mode 100644
index 0000000..a1b45e4
--- /dev/null
+++ b/llama.cpp/src/llama-memory-hybrid.cpp
@@ -0,0 +1,268 @@
+#include "llama-memory-hybrid.h"
+
+#include "llama-impl.h"
+#include "llama-model.h"
+#include "llama-context.h"
+
+//
+// llama_memory_hybrid
+//
+
+llama_memory_hybrid::llama_memory_hybrid(
+ const llama_model & model,
+ /* attn */
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ uint32_t kv_size,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type,
+ /* recurrent */
+ ggml_type type_r,
+ ggml_type type_s,
+ uint32_t rs_size,
+ /* common */
+ uint32_t n_seq_max,
+ bool offload,
+ bool unified,
+ /* layer filters */
+ const layer_filter_cb & filter_attn,
+ const layer_filter_cb & filter_recr) :
+ hparams(model.hparams),
+ mem_attn(new llama_kv_cache(
+ model,
+ type_k,
+ type_v,
+ v_trans,
+ offload,
+ unified,
+ kv_size,
+ n_seq_max,
+ n_pad,
+ n_swa,
+ swa_type,
+ filter_attn == nullptr ?
+ [&](int32_t il) { return !hparams.is_recurrent(il); }
+ : filter_attn,
+ nullptr
+ )),
+ mem_recr(new llama_memory_recurrent(
+ model,
+ type_r,
+ type_s,
+ offload,
+ rs_size,
+ n_seq_max,
+ filter_recr == nullptr ?
+ [&](int32_t il) { return hparams.is_recurrent(il); }
+ : filter_recr
+ )) {}
+
+llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+ do {
+ balloc.split_reset();
+
+ // follow the recurrent pattern for creating the ubatch splits
+ std::vector<llama_ubatch> ubatches;
+
+ while (true) {
+ llama_ubatch ubatch;
+
+ if (embd_all) {
+ // if all tokens are output, split by sequence
+ ubatch = balloc.split_seq(n_ubatch);
+ } else {
+ // TODO: non-sequential equal split can be done if using unified KV cache
+ // for simplicity, we always use sequential equal split for now
+ ubatch = balloc.split_equal(n_ubatch, true);
+ }
+
+ if (ubatch.n_tokens == 0) {
+ break;
+ }
+
+ ubatches.push_back(std::move(ubatch)); // NOLINT
+ }
+
+ if (balloc.get_n_used() < balloc.get_n_tokens()) {
+ // failed to find a suitable split
+ break;
+ }
+
+ // prepare the recurrent batches first
+ if (!mem_recr->prepare(ubatches)) {
+ // TODO: will the recurrent cache be in an undefined state at this point?
+ LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
+ return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+ }
+
+ // prepare the attention cache
+        auto sinfos_attn = mem_attn->prepare(ubatches);
+        if (sinfos_attn.empty()) {
+            LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__);
+            return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+        }
+
+        return std::make_unique<llama_memory_hybrid_context>(
+            this, std::move(sinfos_attn), std::move(ubatches));
+ } while (false);
+
+ return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+}
+
+llama_memory_context_ptr llama_memory_hybrid::init_full() {
+ return std::make_unique<llama_memory_hybrid_context>(this);
+}
+
+llama_memory_context_ptr llama_memory_hybrid::init_update(llama_context * lctx, bool optimize) {
+ return std::make_unique<llama_memory_hybrid_context>(this, lctx, optimize);
+}
+
+bool llama_memory_hybrid::get_can_shift() const {
+ // Shifting is trivially supported for the recurrent cache, so only the attention cache matters here
+ return mem_attn->get_can_shift();
+}
+
+void llama_memory_hybrid::clear(bool data) {
+ mem_attn->clear(data);
+ mem_recr->clear(data);
+}
+
+bool llama_memory_hybrid::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ // Try removing from the recurrent cache first since it may fail. If it does
+ // fail, the cache will not have been mutated.
+ if (!mem_recr->seq_rm(seq_id, p0, p1)) {
+ return false;
+ }
+ return mem_attn->seq_rm(seq_id, p0, p1);
+}
+
+void llama_memory_hybrid::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+ mem_attn->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+ mem_recr->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_memory_hybrid::seq_keep(llama_seq_id seq_id) {
+ mem_attn->seq_keep(seq_id);
+ mem_recr->seq_keep(seq_id);
+}
+
+void llama_memory_hybrid::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+ mem_attn->seq_add(seq_id, p0, p1, shift);
+ mem_recr->seq_add(seq_id, p0, p1, shift);
+}
+
+void llama_memory_hybrid::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+ mem_attn->seq_div(seq_id, p0, p1, d);
+ mem_recr->seq_div(seq_id, p0, p1, d);
+}
+
+llama_pos llama_memory_hybrid::seq_pos_min(llama_seq_id seq_id) const {
+ // the min of the total cache is the max of the two caches' min values
+ return std::max(mem_attn->seq_pos_min(seq_id), mem_recr->seq_pos_min(seq_id));
+}
+
+llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
+ // the max of the total cache is the min of the two caches' max values
+ return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
+}
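+
+// Worked example (illustrative numbers): if the attention cache holds
+// positions [0, 100] of a sequence while the recurrent cache only tracks the
+// tail state at position 100, then seq_pos_min() = max(0, 100) = 100 and
+// seq_pos_max() = min(100, 100) = 100; only the intersection of the two
+// caches is usable by the hybrid memory.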
+
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+ for (const auto & buft_size : mem_recr->memory_breakdown()) {
+ mb[buft_size.first] += buft_size.second;
+ }
+ return mb;
+}
+
+void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+ if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+ mem_attn->state_write(io, seq_id, flags);
+ }
+ mem_recr->state_write(io, seq_id, flags);
+}
+
+void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+ if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+ mem_attn->state_read(io, seq_id, flags);
+ }
+ mem_recr->state_read(io, seq_id, flags);
+}
+
+llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
+ return mem_attn.get();
+}
+
+llama_memory_recurrent * llama_memory_hybrid::get_mem_recr() const {
+ return mem_recr.get();
+}
+
+llama_memory_hybrid_context::llama_memory_hybrid_context(llama_memory_status status) : status(status) {}
+
+llama_memory_hybrid_context::llama_memory_hybrid_context(llama_memory_hybrid * mem) :
+ ctx_attn(mem->get_mem_attn()->init_full()),
+ ctx_recr(mem->get_mem_recr()->init_full()),
+ status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
+}
+
+llama_memory_hybrid_context::llama_memory_hybrid_context(
+ llama_memory_hybrid * mem,
+ llama_context * lctx,
+ bool optimize) :
+ ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
+ ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
+ status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
+}
+
+llama_memory_hybrid_context::llama_memory_hybrid_context(
+ llama_memory_hybrid * mem,
+ slot_info_vec_t sinfos_attn,
+ std::vector<llama_ubatch> ubatches) :
+ ubatches(std::move(ubatches)),
+ // note: here we copy the ubatches. not sure if this is ideal
+ ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
+ ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
+ status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
+}
+
+bool llama_memory_hybrid_context::next() {
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+ ctx_attn->next();
+ ctx_recr->next();
+
+ if (++i_next >= ubatches.size()) {
+ return false;
+ }
+
+ return true;
+}
+
+bool llama_memory_hybrid_context::apply() {
+ assert(!llama_memory_status_is_fail(status));
+
+ bool res = true;
+
+ res = res & ctx_attn->apply();
+ res = res & ctx_recr->apply();
+
+ return res;
+}
+
+llama_memory_status llama_memory_hybrid_context::get_status() const {
+ return status;
+}
+
+const llama_ubatch & llama_memory_hybrid_context::get_ubatch() const {
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+ return ubatches[i_next];
+}
+
+const llama_kv_cache_context * llama_memory_hybrid_context::get_attn() const {
+ return static_cast<const llama_kv_cache_context *>(ctx_attn.get());
+}
+
+const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const {
+ return static_cast<const llama_memory_recurrent_context *>(ctx_recr.get());
+}
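+
+// Illustrative consumer sketch (hedged; the actual call sites live in the
+// graph-building code, and mctx is an assumed variable holding the context
+// returned by init_batch): a hybrid-aware consumer downcasts the combined
+// context and pulls out the two sub-contexts, e.g.:
+//
+//   const auto * hctx = static_cast<const llama_memory_hybrid_context *>(mctx.get());
+//   const llama_kv_cache_context         * attn = hctx->get_attn();
+//   const llama_memory_recurrent_context * recr = hctx->get_recr();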
diff --git a/llama.cpp/src/llama-memory-hybrid.h b/llama.cpp/src/llama-memory-hybrid.h
new file mode 100644
index 0000000..558cafd
--- /dev/null
+++ b/llama.cpp/src/llama-memory-hybrid.h
@@ -0,0 +1,139 @@
+#pragma once
+
+#include "llama-batch.h"
+#include "llama-graph.h"
+#include "llama-kv-cache.h"
+#include "llama-memory.h"
+#include "llama-memory-recurrent.h"
+
+#include <memory>
+#include <vector>
+
+//
+// llama_memory_hybrid
+//
+
+// utilizes instances of llama_memory_recurrent and llama_kv_cache to
+// support models where each layer may be either attention-based or recurrent
+
+class llama_memory_hybrid : public llama_memory_i {
+public:
+ llama_memory_hybrid(
+ const llama_model & model,
+ /* attn */
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ uint32_t kv_size,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type,
+ /* recurrent */
+ ggml_type type_r,
+ ggml_type type_s,
+ uint32_t rs_size,
+ /* common */
+ uint32_t n_seq_max,
+ bool offload,
+ bool unified,
+ /* layer filters */
+ const layer_filter_cb & filter_attn = nullptr,
+ const layer_filter_cb & filter_recr = nullptr);
+
+ ~llama_memory_hybrid() = default;
+
+ //
+ // llama_memory_i
+ //
+
+ llama_memory_context_ptr init_batch(
+ llama_batch_allocr & balloc,
+ uint32_t n_ubatch,
+ bool embd_all) override;
+
+ llama_memory_context_ptr init_full() override;
+
+ llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
+
+ bool get_can_shift() const override;
+
+ void clear(bool data) override;
+
+ bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
+ void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+ void seq_keep(llama_seq_id seq_id) override;
+ void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
+ void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
+
+ llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+ llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+ std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
+ // state write/load
+
+ void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+ void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
+
+ //
+ // llama_memory_hybrid specific API
+ //
+
+ llama_kv_cache * get_mem_attn() const;
+ llama_memory_recurrent * get_mem_recr() const;
+
+private:
+ const llama_hparams & hparams;
+
+ const std::unique_ptr<llama_kv_cache> mem_attn;
+ const std::unique_ptr<llama_memory_recurrent> mem_recr;
+};
+
+class llama_memory_hybrid_context : public llama_memory_context_i {
+public:
+ using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
+
+ // init failure
+ explicit llama_memory_hybrid_context(llama_memory_status status);
+
+ // init full
+ explicit llama_memory_hybrid_context(llama_memory_hybrid * mem);
+
+ // init update
+ explicit llama_memory_hybrid_context(
+ llama_memory_hybrid * mem,
+ llama_context * lctx,
+ bool optimize);
+
+ // init success
+ llama_memory_hybrid_context(
+ llama_memory_hybrid * mem,
+ slot_info_vec_t sinfos_attn,
+ std::vector<llama_ubatch> ubatches);
+
+ ~llama_memory_hybrid_context() = default;
+
+ bool next() override;
+ bool apply() override;
+
+ llama_memory_status get_status() const override;
+ const llama_ubatch & get_ubatch() const override;
+
+ //
+ // llama_memory_hybrid_context
+ //
+
+ const llama_kv_cache_context * get_attn() const;
+ const llama_memory_recurrent_context * get_recr() const;
+
+private:
+ // the index of the next ubatch to process
+ size_t i_next = 0;
+
+ std::vector<llama_ubatch> ubatches;
+
+ const llama_memory_context_ptr ctx_attn;
+ const llama_memory_context_ptr ctx_recr;
+
+ const llama_memory_status status;
+};
diff --git a/llama.cpp/src/llama-memory-recurrent.cpp b/llama.cpp/src/llama-memory-recurrent.cpp
new file mode 100644
index 0000000..f003803
--- /dev/null
+++ b/llama.cpp/src/llama-memory-recurrent.cpp
@@ -0,0 +1,1165 @@
+#include "llama-memory-recurrent.h"
+
+#include "llama-impl.h"
+#include "llama-io.h"
+#include "llama-batch.h"
+#include "llama-model.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <limits>
+#include <map>
+#include <stdexcept>
+
+//
+// llama_memory_recurrent
+//
+
+llama_memory_recurrent::llama_memory_recurrent(
+ const llama_model & model,
+ ggml_type type_r,
+ ggml_type type_s,
+ bool offload,
+ uint32_t mem_size,
+ uint32_t n_seq_max,
+ const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
+ const int32_t n_layer = hparams.n_layer;
+
+ head = 0;
+ size = mem_size;
+ used = 0;
+
+ cells.clear();
+ cells.resize(mem_size);
+
+ // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+ struct ggml_backend_buft_comparator {
+ bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+ return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+ }
+ };
+ std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
+ // create a context for each buffer type
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+ auto it = ctx_map.find(buft);
+ if (it == ctx_map.end()) {
+ ggml_init_params params = {
+ /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context * ctx = ggml_init(params);
+ if (!ctx) {
+ return nullptr;
+ }
+
+ ctx_map.emplace(buft, ctx);
+
+ return ctx;
+ }
+
+ return it->second.get();
+ };
+
+ r_l.resize(n_layer);
+ s_l.resize(n_layer);
+
+ for (int i = 0; i < n_layer; i++) {
+ if (filter && !filter(i)) {
+ LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, i);
+ continue;
+ }
+
+ const char * dev_name = "CPU";
+
+ ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
+
+ if (offload) {
+ auto * dev = model.dev_layer(i);
+ buft = ggml_backend_dev_buffer_type(dev);
+
+ dev_name = ggml_backend_dev_name(dev);
+ }
+
+ LLAMA_LOG_DEBUG("%s, layer %3d: dev = %s\n", __func__, i, dev_name);
+
+ ggml_context * ctx = ctx_for_buft(buft);
+ if (!ctx) {
+ throw std::runtime_error("failed to create ggml context for rs cache");
+ }
+
+ ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size);
+ ggml_tensor * s = ggml_new_tensor_1d(ctx, type_s, hparams.n_embd_s()*mem_size);
+ ggml_format_name(r, "cache_r_l%d", i);
+ ggml_format_name(s, "cache_s_l%d", i);
+ r_l[i] = r;
+ s_l[i] = s;
+ }
+
+ // allocate tensors and initialize the buffers to avoid NaNs in the padding
+ for (auto & [buft, ctx] : ctx_map) {
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
+ if (!buf) {
+ throw std::runtime_error("failed to allocate buffer for rs cache");
+ }
+ ggml_backend_buffer_clear(buf, 0);
+ LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+ ctxs_bufs.emplace_back(std::move(ctx), buf);
+ }
+
+ {
+ const size_t memory_size_r = size_r_bytes();
+ const size_t memory_size_s = size_s_bytes();
+
+ LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__,
+ (float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), mem_size, n_layer, n_seq_max,
+ ggml_type_name(type_r), (float)memory_size_r / (1024.0f * 1024.0f),
+ ggml_type_name(type_s), (float)memory_size_s / (1024.0f * 1024.0f));
+ }
+}
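+
+// Sizing note (derived from the tensors above): each included layer allocates
+// n_embd_r()*mem_size elements of type_r and n_embd_s()*mem_size elements of
+// type_s, so the total footprint is roughly
+//
+//   n_layer * mem_size * (ggml_row_size(type_r, n_embd_r()) + ggml_row_size(type_s, n_embd_s()))
+//
+// i.e. one R row and one S row per cell per layer.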
+
+void llama_memory_recurrent::clear(bool data) {
+ for (int32_t i = 0; i < (int32_t) size; ++i) {
+ cells[i].pos = -1;
+ cells[i].seq_id.clear();
+ cells[i].src = -1;
+ cells[i].tail = -1;
+ }
+
+ head = 0;
+ used = 0;
+
+ if (data) {
+ for (auto & [_, buf] : ctxs_bufs) {
+ ggml_backend_buffer_clear(buf.get(), 0);
+ }
+ }
+}
+
+bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ //printf("[DEBUG] calling llama_memory_recurrent::seq_rm` with `seq_id=%d, p0=%d, p1=%d`\n", seq_id, p0, p1);
+ uint32_t new_head = size;
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ // models like Mamba or RWKV can't have a state partially erased at the end
+ // of the sequence because their state isn't preserved for previous tokens
+ if (seq_id >= (int64_t) size) {
+ // could be fatal
+ return false;
+ }
+ if (0 <= seq_id) {
+ int32_t & tail_id = cells[seq_id].tail;
+ if (tail_id >= 0) {
+ const auto & cell = cells[tail_id];
+ // partial intersection is invalid if it includes the final pos
+ if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
+ //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
+ return false;
+ }
+ // invalidate tails which will be cleared
+ if (p0 <= cell.pos && cell.pos < p1) {
+ tail_id = -1;
+ }
+ }
+ } else {
+        // when seq_id is negative, the range should include everything or nothing
+ if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+ //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: `seq_id` is negative, so returning false\n");
+ return false;
+ }
+ }
+
+ for (uint32_t i = 0; i < size; ++i) {
+ if (cells[i].pos >= p0 && cells[i].pos < p1) {
+ if (seq_id < 0) {
+ cells[i].seq_id.clear();
+ } else if (cells[i].has_seq_id(seq_id)) {
+ cells[i].seq_id.erase(seq_id);
+ } else {
+ continue;
+ }
+ if (cells[i].is_empty()) {
+ // keep count of the number of used cells
+ if (cells[i].pos >= 0) {
+ used--;
+ }
+ cells[i].pos = -1;
+ cells[i].src = -1;
+ if (new_head == size) {
+ new_head = i;
+ }
+ }
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != size && new_head < head) {
+ head = new_head;
+ }
+
+ return true;
+}
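+
+// Worked example (illustrative): if the tail cell of seq 0 sits at pos 9,
+// then seq_rm(0, 5, 10) must fail, because it would erase the final state
+// and the state for earlier positions is not preserved, while seq_rm(0, 0, 10)
+// succeeds and clears the whole sequence.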
+
+void llama_memory_recurrent::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+ if (seq_id_src == seq_id_dst) {
+ return;
+ }
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
+ auto & tail_src = cells[seq_id_src];
+ auto & tail_dst = cells[seq_id_dst];
+ if (tail_dst.tail >= 0) {
+ // clear destination seq_id if it wasn't empty
+ auto & cell_dst = cells[tail_dst.tail];
+
+ cell_dst.seq_id.erase(seq_id_dst);
+ tail_dst.tail = -1;
+ if (cell_dst.seq_id.empty()) {
+ cell_dst.pos = -1;
+ cell_dst.src = -1;
+ used -= 1;
+ }
+ }
+ if (tail_src.tail >= 0) {
+ auto & cell_src = cells[tail_src.tail];
+
+ cell_src.seq_id.insert(seq_id_dst);
+ tail_dst.tail = tail_src.tail;
+ }
+ }
+}
+
+void llama_memory_recurrent::seq_keep(llama_seq_id seq_id) {
+ uint32_t new_head = size;
+
+ for (uint32_t i = 0; i < size; ++i) {
+ if ((llama_seq_id) i != seq_id) {
+ cells[i].tail = -1;
+ }
+
+ if (!cells[i].has_seq_id(seq_id)) {
+ if (cells[i].pos >= 0) {
+ used--;
+ }
+
+ cells[i].pos = -1;
+ cells[i].src = -1;
+ cells[i].seq_id.clear();
+
+ if (new_head == size){
+ new_head = i;
+ }
+ } else {
+ cells[i].seq_id.clear();
+ cells[i].seq_id.insert(seq_id);
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != size && new_head < head) {
+ head = new_head;
+ }
+}
+
+void llama_memory_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+ if (shift == 0) {
+ return;
+ }
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+    // If there is no range then return early to avoid looping over the cache.
+ if (p0 == p1) {
+ return;
+ }
+
+ // for Mamba-like or RWKV models, only the pos needs to be shifted
+ if (0 <= seq_id && seq_id < (int64_t) size) {
+ const int32_t tail_id = cells[seq_id].tail;
+ if (tail_id >= 0) {
+ auto & cell = cells[tail_id];
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+ cell.pos += shift;
+ }
+ }
+ }
+}
+
+void llama_memory_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+ if (d == 1) {
+ return;
+ }
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ // If there is no range then return early to avoid looping over the cache.
+ if (p0 == p1) {
+ return;
+ }
+
+ // for Mamba-like or RWKV models, only the pos needs to be changed
+ if (0 <= seq_id && seq_id < (int64_t) size) {
+ const int32_t tail_id = cells[seq_id].tail;
+ if (tail_id >= 0) {
+ auto & cell = cells[tail_id];
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+ cell.pos /= d;
+ }
+ }
+ }
+}
+
+llama_pos llama_memory_recurrent::seq_pos_min(llama_seq_id seq_id) const {
+ llama_pos result = std::numeric_limits<llama_pos>::max();
+
+ for (uint32_t i = 0; i < size; ++i) {
+ if (cells[i].has_seq_id(seq_id)) {
+ result = std::min(result, cells[i].pos);
+ }
+ }
+
+ if (result == std::numeric_limits<llama_pos>::max()) {
+ result = -1;
+ }
+
+ return result;
+}
+
+llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
+ llama_pos result = -1;
+
+ for (uint32_t i = 0; i < size; ++i) {
+ if (cells[i].has_seq_id(seq_id)) {
+ result = std::max(result, cells[i].pos);
+ }
+ }
+
+ return result;
+}
+
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, size_t> ret;
+ for (const auto & [_, buf] : ctxs_bufs) {
+ ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+ }
+ return ret;
+}
+
+llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+ do {
+ balloc.split_reset();
+
+ std::vector<llama_ubatch> ubatches;
+ while (true) {
+ llama_ubatch ubatch;
+
+ if (embd_all) {
+ // if all tokens are output, split by sequence
+ ubatch = balloc.split_seq(n_ubatch);
+ } else {
+ // TODO: non-sequential equal split can be done if using unified KV cache
+ // for simplicity, we always use sequential equal split for now
+ ubatch = balloc.split_equal(n_ubatch, true);
+ }
+
+ if (ubatch.n_tokens == 0) {
+ break;
+ }
+
+ ubatches.push_back(std::move(ubatch)); // NOLINT
+ }
+
+ if (balloc.get_n_used() < balloc.get_n_tokens()) {
+ // failed to find a suitable split
+ break;
+ }
+
+ if (!prepare(ubatches)) {
+ break;
+ }
+
+ return std::make_unique<llama_memory_recurrent_context>(this, std::move(ubatches));
+ } while (false);
+
+ return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+}
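+
+// Illustrative split behavior (hedged; the exact packing is up to the batch
+// allocator): split_equal(n_ubatch, true) emits ubatches in which every
+// sequence contributes the same number of new tokens, which is what
+// find_slot() requires (see the equal_seqs() assert there), while
+// split_seq() packs each sequence separately and is used when all tokens
+// are output.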
+
+llama_memory_context_ptr llama_memory_recurrent::init_full() {
+ return std::make_unique<llama_memory_recurrent_context>(this);
+}
+
+llama_memory_context_ptr llama_memory_recurrent::init_update(llama_context * lctx, bool optimize) {
+ GGML_UNUSED(lctx);
+ GGML_UNUSED(optimize);
+
+ return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_NO_UPDATE);
+}
+
+bool llama_memory_recurrent::prepare(const std::vector<llama_ubatch> & ubatches) {
+ // simply remember the full state because it is very small for this type of cache
+ // TODO: optimize
+ auto org_cells = cells;
+ auto org_used = used;
+ auto org_head = head;
+
+ bool success = true;
+
+ for (const auto & ubatch : ubatches) {
+ if (!find_slot(ubatch)) {
+ success = false;
+ break;
+ }
+ }
+
+ // restore the original state
+ cells = std::move(org_cells);
+ used = org_used;
+ head = org_head;
+
+ return success;
+}
+
+bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
+ const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
+ const uint32_t n_seqs = ubatch.n_seqs;
+
+ // if we have enough unused cells before the current head ->
+ // better to start searching from the beginning of the cache, hoping to fill it
+ if (head > used + 2*n_seqs) {
+ head = 0;
+ }
+
+ // For recurrent state architectures (like Mamba or RWKV),
+ // each cache cell can store the state for a whole sequence.
+    // A slot should always be contiguous.
+
+ // can only process batches with an equal number of new tokens in each sequence
+ GGML_ASSERT(ubatch.equal_seqs());
+
+ int32_t min = size - 1;
+ int32_t max = 0;
+
+ // everything should fit if all seq_ids are smaller than the max
+ for (uint32_t s = 0; s < n_seqs; ++s) {
+ const uint32_t i = s*n_seq_tokens; // first token of sequence set s
+ const uint32_t n_seq_id = ubatch.n_seq_id[i];
+
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
+ const llama_seq_id seq_id = ubatch.seq_id[i][j];
+
+ if (seq_id < 0 || (uint32_t) seq_id >= size) {
+ // too big seq_id
+ // TODO: would it be possible to resize the cache instead?
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u Try using a bigger --parallel value\n", __func__, seq_id, n_seq_max);
+ return false;
+ }
+ if (j > 0) {
+ auto & seq = cells[seq_id];
+ if (seq.tail >= 0) {
+ auto & cell = cells[seq.tail];
+ // clear cells from seq_ids that become shared
+ // (should not normally happen, but let's handle it anyway)
+ cell.seq_id.erase(seq_id);
+ seq.tail = -1;
+ if (cell.seq_id.empty()) {
+ cell.pos = -1;
+ cell.src = -1;
+ used -= 1;
+ }
+ }
+ }
+ }
+ }
+
+#ifndef NDEBUG
+ {
+ std::vector<int32_t> tails_verif;
+ tails_verif.assign(size, -1);
+ for (uint32_t i = 0; i < size; ++i) {
+ auto & cell = cells[i];
+ for (llama_seq_id seq_id : cell.seq_id) {
+ if (tails_verif[seq_id] != -1) {
+ LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
+ }
+ tails_verif[seq_id] = i;
+ }
+ }
+ for (uint32_t i = 0; i < size; ++i) {
+ if (tails_verif[i] != cells[i].tail) {
+ LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]);
+ }
+ }
+ }
+#endif
+
+ // find next empty cell
+ uint32_t next_empty_cell = head;
+
+ for (uint32_t i = 0; i < size; ++i) {
+ if (next_empty_cell >= size) { next_empty_cell -= size; }
+ auto & cell = cells[next_empty_cell];
+ if (cell.is_empty()) { break; }
+ next_empty_cell += 1;
+ }
+
+ // find usable cell range
+ for (uint32_t s = 0; s < n_seqs; ++s) {
+ const uint32_t i = s*n_seq_tokens;
+ const llama_seq_id seq_id = ubatch.seq_id[i][0];
+ auto & seq_meta = cells[seq_id];
+ bool has_cell = false;
+ if (seq_meta.tail >= 0) {
+ auto & cell = cells[seq_meta.tail];
+ GGML_ASSERT(cell.has_seq_id(seq_id));
+ // does this seq_id "own" the cell?
+ if (cell.seq_id.size() == 1) { has_cell = true; }
+ }
+ if (!has_cell) {
+ auto & empty_cell = cells[next_empty_cell];
+ GGML_ASSERT(empty_cell.is_empty());
+ // copy old tail into the empty cell
+ if (seq_meta.tail >= 0) {
+ auto & orig_cell = cells[seq_meta.tail];
+ empty_cell.pos = orig_cell.pos;
+ empty_cell.src = orig_cell.src;
+ orig_cell.seq_id.erase(seq_id);
+ empty_cell.seq_id.insert(seq_id); // will be overwritten
+ GGML_ASSERT(!orig_cell.is_empty()); // has at least one remaining seq_id
+ }
+ seq_meta.tail = next_empty_cell;
+ // find next empty cell
+ if (s + 1 < n_seqs) {
+ for (uint32_t j = 0; j < size; ++j) {
+ next_empty_cell += 1;
+ if (next_empty_cell >= size) { next_empty_cell -= size; }
+ auto & cell = cells[next_empty_cell];
+ if (cell.is_empty()) { break; }
+ }
+ }
+ }
+ if (min > seq_meta.tail) { min = seq_meta.tail; }
+ if (max < seq_meta.tail) { max = seq_meta.tail; }
+ }
+
+ // gather and re-order
+ for (uint32_t s = 0; s < n_seqs; ++s) {
+ const uint32_t i = s*n_seq_tokens;
+ const int32_t dst_id = s + min;
+ const int32_t src_id = cells[ubatch.seq_id[i][0]].tail;
+ if (dst_id != src_id) {
+ auto & dst_cell = cells[dst_id];
+ auto & src_cell = cells[src_id];
+
+ std::swap(dst_cell.pos, src_cell.pos);
+ std::swap(dst_cell.src, src_cell.src);
+ std::swap(dst_cell.seq_id, src_cell.seq_id);
+
+ // swap tails
+ for (uint32_t j = 0; j < size; ++j) {
+ int32_t & tail = cells[j].tail;
+ if (tail == src_id) {
+ tail = dst_id;
+ } else if (tail == dst_id) {
+ tail = src_id;
+ }
+ }
+ }
+ }
+
+ // update the pos of the used seqs
+ for (uint32_t s = 0; s < n_seqs; ++s) {
+ const uint32_t i = s*n_seq_tokens;
+ const llama_pos last_pos = ubatch.pos[i + n_seq_tokens - 1];
+ const int32_t cell_id = s + min;
+ auto & cell = cells[cell_id];
+
+ if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
+ // What should happen when the pos backtracks or skips a value?
+ // Clearing the state mid-batch would require special-casing which isn't done.
+ LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
+ __func__, last_pos, cell.pos, ubatch.seq_id[i][0], n_seq_tokens);
+ }
+ cell.pos = last_pos;
+ cell.seq_id.clear();
+ for (int32_t j = 0; j < ubatch.n_seq_id[i]; ++j) {
+ const llama_seq_id seq_id = ubatch.seq_id[i][j];
+ cell.seq_id.insert(seq_id);
+ cells[seq_id].tail = cell_id;
+ }
+ }
+
+ // Find first cell without src refs, to use as the zero-ed state
+ {
+ // TODO: bake-in src refcounts in the cell metadata
+ std::vector<int32_t> refcounts(size, 0);
+ for (size_t i = 0; i < size; ++i) {
+ const int32_t src = cells[i].src;
+ if (src >= 0) {
+ refcounts[src] += 1;
+ }
+ }
+
+ rs_z = -1;
+ for (int i = min; i <= max; ++i) {
+ if (refcounts[i] == 0) {
+ rs_z = i;
+ break;
+ }
+ }
+
+ for (int i = min; i <= max; ++i) {
+ if (cells[i].src < 0) {
+ GGML_ASSERT(rs_z >= 0);
+ cells[i].src0 = rs_z;
+ } else {
+ // Stage the source ids for all used cells to allow correct seq_* behavior
+ // and still make these values available when setting the inputs
+ cells[i].src0 = cells[i].src;
+ }
+ cells[i].src = i; // avoid moving or clearing twice
+ }
+ }
+
+ // allow getting the range of used cells, from head to head + n
+ head = min;
+ n = max - min + 1;
+ used = std::count_if(cells.begin(), cells.end(),
+ [](const mem_cell & cell){ return !cell.is_empty(); });
+
+ // sanity check
+ return n >= n_seqs;
+}
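+
+// Post-state sketch (illustrative): after a successful find_slot() the used
+// cells form the contiguous range [head, head + n), ordered to match the
+// ubatch's sequence sets, and rs_z indexes the first cell in that range with
+// no incoming src references; it serves as the zero-ed source state for
+// cells whose own src is unset.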
+
+bool llama_memory_recurrent::get_can_shift() const {
+ // shifting the pos is trivial for recurrent models
+ return true;
+}
+
+size_t llama_memory_recurrent::total_size() const {
+ size_t size = 0;
+ for (const auto & [_, buf] : ctxs_bufs) {
+ size += ggml_backend_buffer_get_size(buf.get());
+ }
+
+ return size;
+}
+
+size_t llama_memory_recurrent::size_r_bytes() const {
+ size_t size_r_bytes = 0;
+
+ for (const auto & r : r_l) {
+ if (r != nullptr) {
+ size_r_bytes += ggml_nbytes(r);
+ }
+ }
+
+ return size_r_bytes;
+}
+
+size_t llama_memory_recurrent::size_s_bytes() const {
+ size_t size_s_bytes = 0;
+
+ for (const auto & s : s_l) {
+ if (s != nullptr) {
+ size_s_bytes += ggml_nbytes(s);
+ }
+ }
+
+ return size_s_bytes;
+}
+
+void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+ GGML_UNUSED(flags);
+
+ std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
+ uint32_t cell_count = 0;
+
+ // Count the number of cells with the specified seq_id
+ // Find all the ranges of cells with this seq id (or all, when -1)
+ uint32_t cell_range_begin = size;
+ for (uint32_t i = 0; i < size; ++i) {
+ const auto & cell = cells[i];
+ if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
+ ++cell_count;
+ if (cell_range_begin == size) {
+ cell_range_begin = i;
+ }
+ } else {
+ if (cell_range_begin != size) {
+ cell_ranges.emplace_back(cell_range_begin, i);
+ cell_range_begin = size;
+ }
+ }
+ }
+ if (cell_range_begin != size) {
+ cell_ranges.emplace_back(cell_range_begin, size);
+ }
+
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
+ uint32_t cell_count_check = 0;
+ for (const auto & range : cell_ranges) {
+ cell_count_check += range.second - range.first;
+ }
+ GGML_ASSERT(cell_count == cell_count_check);
+
+ io.write(&cell_count, sizeof(cell_count));
+
+ state_write_meta(io, cell_ranges, seq_id);
+ state_write_data(io, cell_ranges);
+}
+
+void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+ GGML_UNUSED(flags);
+
+ uint32_t cell_count;
+ io.read_to(&cell_count, sizeof(cell_count));
+
+ bool res = true;
+
+ res = res && state_read_meta(io, cell_count, seq_id);
+ res = res && state_read_data(io, cell_count);
+
+ if (!res) {
+ if (seq_id == -1) {
+ clear(true);
+ } else {
+ seq_rm(seq_id, -1, -1);
+ }
+ throw std::runtime_error("failed to restore kv cache");
+ }
+}
+
+void llama_memory_recurrent::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
+ for (const auto & range : cell_ranges) {
+ for (uint32_t i = range.first; i < range.second; ++i) {
+ const auto & cell = cells[i];
+ const llama_pos pos = cell.pos;
+ const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
+
+ io.write(&pos, sizeof(pos));
+ io.write(&n_seq_id, sizeof(n_seq_id));
+
+ if (n_seq_id) {
+ for (auto seq_id : cell.seq_id) {
+ io.write(&seq_id, sizeof(seq_id));
+ }
+ }
+ }
+ }
+}
+
+void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
+ const uint32_t s_trans = 0;
+ const uint32_t n_layer = hparams.n_layer;
+
+ io.write(&s_trans, sizeof(s_trans));
+ io.write(&n_layer, sizeof(n_layer));
+
+ // Iterate and write all the R tensors first, each row is a cell
+ // Get whole range at a time
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+ if (r_l[il] == nullptr) continue;
+
+ // Write R tensor type
+ const int32_t r_type_i = (int32_t)r_l[il]->type;
+ io.write(&r_type_i, sizeof(r_type_i));
+
+ // Write row size of R tensor
+ const uint64_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r());
+ io.write(&r_size_row, sizeof(r_size_row));
+
+ // Write each range of cells of r_size_row length
+ for (const auto & range : cell_ranges) {
+ const size_t range_size = range.second - range.first;
+ const size_t buf_size = range_size * r_size_row;
+ io.write_tensor(r_l[il], range.first * r_size_row, buf_size);
+ }
+ }
+
+ if (!s_trans) {
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+ if (s_l[il] == nullptr) continue;
+
+ // Write S tensor type
+ const int32_t s_type_i = (int32_t)s_l[il]->type;
+ io.write(&s_type_i, sizeof(s_type_i));
+
+ // Write row size of S tensor
+ const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s());
+ io.write(&s_size_row, sizeof(s_size_row));
+
+ // Write each range of S tensor rows
+ for (const auto & range : cell_ranges) {
+ const size_t range_size = range.second - range.first;
+ const size_t buf_size = range_size * s_size_row;
+ io.write_tensor(s_l[il], range.first * s_size_row, buf_size);
+ }
+ }
+ } else {
+ // When S tensor is transposed, we also need the element size and get the element ranges from each row
+ const uint32_t mem_size = size;
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+ if (s_l[il] == nullptr) continue;
+
+ const uint32_t n_embd_s = hparams.n_embd_s();
+
+ // Write S tensor type
+ const int32_t s_type_i = (int32_t)s_l[il]->type;
+ io.write(&s_type_i, sizeof(s_type_i));
+
+ // Write element size
+ const uint32_t s_size_el = ggml_type_size(s_l[il]->type);
+ io.write(&s_size_el, sizeof(s_size_el));
+
+            // Write state embedding size
+ io.write(&n_embd_s, sizeof(n_embd_s));
+
+ // For each row, we get the element values of each cell
+ for (uint32_t j = 0; j < n_embd_s; ++j) {
+ // Write each range of cells of s_size_el length
+ for (const auto & range : cell_ranges) {
+ const size_t range_size = range.second - range.first;
+ const size_t src_offset = (range.first + j * mem_size) * s_size_el;
+ const size_t buf_size = range_size * s_size_el;
+ io.write_tensor(s_l[il], src_offset, buf_size);
+ }
+ }
+ }
+ }
+}
+
+bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
+ if (dest_seq_id != -1) {
+ // single sequence
+ seq_rm(dest_seq_id, -1, -1);
+
+ if (cell_count == 0) {
+ return true;
+ }
+
+ llama_batch_allocr balloc(hparams.n_pos_per_embd());
+
+ llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
+
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ llama_pos pos;
+ uint32_t n_seq_id;
+
+ io.read_to(&pos, sizeof(pos));
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
+
+ if (n_seq_id != 0) {
+ LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
+ return false;
+ }
+
+ ubatch.pos[i] = pos;
+ }
+ ubatch.n_seq_id[0] = 1;
+ ubatch.seq_id[0] = &dest_seq_id;
+
+ if (!find_slot(ubatch)) {
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+ return false;
+ }
+
+        // DEBUG CHECK: head should be our first cell, head + cell_count - 1 should be our last cell (verify seq_id and pos values)
+ // Assume that this is one contiguous block of cells
+ GGML_ASSERT(head + cell_count <= size);
+ GGML_ASSERT(cells[head].pos == ubatch.pos[0]);
+ GGML_ASSERT(cells[head + cell_count - 1].pos == ubatch.pos[cell_count - 1]);
+ GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
+ GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
+ } else {
+ // whole KV cache restore
+
+ if (cell_count > size) {
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
+ return false;
+ }
+
+ clear(true);
+
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ auto & cell = cells[i];
+
+ llama_pos pos;
+ uint32_t n_seq_id;
+
+ io.read_to(&pos, sizeof(pos));
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
+
+ cell.pos = pos;
+
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
+ llama_seq_id seq_id;
+ io.read_to(&seq_id, sizeof(seq_id));
+
+ // TODO: llama_memory_recurrent should have a notion of max sequences
+ //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
+ if (seq_id < 0) {
+ //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
+ LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
+ return false;
+ }
+
+ cell.seq_id.insert(seq_id);
+
+ int32_t & tail = cells[seq_id].tail;
+ if (tail != -1) {
+ LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
+ return false;
+ }
+ tail = i;
+ }
+ }
+
+ head = 0;
+ used = cell_count;
+ }
+
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ uint32_t cell_id = head + i;
+ // make sure the recurrent states will keep their restored state
+ cells[cell_id].src = cell_id;
+ }
+
+ return true;
+}
+
+bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
+ uint32_t s_trans;
+ uint32_t n_layer;
+ io.read_to(&s_trans, sizeof(s_trans));
+ io.read_to(&n_layer, sizeof(n_layer));
+
+ if (n_layer != hparams.n_layer) {
+ LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
+ return false;
+ }
+ if (cell_count > size) {
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
+ return false;
+ }
+    if (s_trans) {
+ LLAMA_LOG_ERROR("%s: incompatible s transposition\n", __func__);
+ return false;
+ }
+
+    // For each layer, read the R states for each cell, one row is one cell, read as one contiguous block
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ // skip null layers
+ if (r_l[il] == nullptr) continue;
+
+        // Read R tensor type
+ int32_t r_type_i_ref;
+ io.read_to(&r_type_i_ref, sizeof(r_type_i_ref));
+ const int32_t r_type_i = (int32_t) r_l[il]->type;
+ if (r_type_i != r_type_i_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched r type (%d != %d, layer %d)\n", __func__, r_type_i, r_type_i_ref, il);
+ return false;
+ }
+
+        // Read row size of R tensor
+ uint64_t r_size_row_ref;
+ io.read_to(&r_size_row_ref, sizeof(r_size_row_ref));
+ const size_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r());
+ if (r_size_row != r_size_row_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched r row size (%zu != %zu, layer %d)\n", __func__, r_size_row, (size_t) r_size_row_ref, il);
+ return false;
+ }
+
+ if (cell_count) {
+            // Read and set the R states for the whole cell range
+ ggml_backend_tensor_set(r_l[il], io.read(cell_count * r_size_row), head * r_size_row, cell_count * r_size_row);
+ }
+ }
+
+ if (!s_trans) {
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ // skip null layers
+ if (s_l[il] == nullptr) continue;
+
+            // Read S tensor type
+ int32_t s_type_i_ref;
+ io.read_to(&s_type_i_ref, sizeof(s_type_i_ref));
+ const int32_t s_type_i = (int32_t)s_l[il]->type;
+
+ if (s_type_i != s_type_i_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched s type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il);
+ return false;
+ }
+
+            // Read row size of S tensor
+ uint64_t s_size_row_ref;
+ io.read_to(&s_size_row_ref, sizeof(s_size_row_ref));
+ const size_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s());
+ if (s_size_row != s_size_row_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched s row size (%zu != %zu, layer %d)\n", __func__, s_size_row, (size_t) s_size_row_ref, il);
+ return false;
+ }
+
+ if (cell_count) {
+                // Read and set the S states for the whole cell range
+ ggml_backend_tensor_set(s_l[il], io.read(cell_count * s_size_row), head * s_size_row, cell_count * s_size_row);
+ }
+ }
+ } else {
+        // For each layer, read the S states for each cell (transposed)
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ // skip null layers
+ if (s_l[il] == nullptr) continue;
+
+ const uint32_t n_embd_s = hparams.n_embd_s();
+
+            // Read S tensor type
+ int32_t s_type_i_ref;
+ io.read_to(&s_type_i_ref, sizeof(s_type_i_ref));
+ const int32_t s_type_i = (int32_t)s_l[il]->type;
+ if (s_type_i != s_type_i_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched s type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il);
+ return false;
+ }
+
+            // Read S element size
+ uint32_t s_size_el_ref;
+ io.read_to(&s_size_el_ref, sizeof(s_size_el_ref));
+ const size_t s_size_el = ggml_type_size(s_l[il]->type);
+ if (s_size_el != s_size_el_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched s element size (%zu != %zu, layer %d)\n", __func__, s_size_el, (size_t) s_size_el_ref, il);
+ return false;
+ }
+
+ // Read state embedding size
+ uint32_t n_embd_s_ref;
+ io.read_to(&n_embd_s_ref, sizeof(n_embd_s_ref));
+ if (n_embd_s != n_embd_s_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched s embedding size (%u != %u, layer %d)\n", __func__, n_embd_s, n_embd_s_ref, il);
+ return false;
+ }
+
+ if (cell_count) {
+ // For each row in the transposed matrix, read the values for the whole cell range
+ for (uint32_t j = 0; j < n_embd_s; ++j) {
+ const size_t dst_offset = (head + j * size) * s_size_el;
+ ggml_backend_tensor_set(s_l[il], io.read(cell_count * s_size_el), dst_offset, cell_count * s_size_el);
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+//
+// llama_memory_recurrent_context
+//
+
+llama_memory_recurrent_context::llama_memory_recurrent_context(llama_memory_status status) : status(status) {}
+
+llama_memory_recurrent_context::llama_memory_recurrent_context(
+ llama_memory_recurrent * mem) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem), is_full(true) {
+}
+
+llama_memory_recurrent_context::llama_memory_recurrent_context(
+ llama_memory_recurrent * mem,
+ std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem), ubatches(std::move(ubatches)) {}
+
+llama_memory_recurrent_context::~llama_memory_recurrent_context() = default;
+
+bool llama_memory_recurrent_context::next() {
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+ if (++i_next >= ubatches.size()) {
+ return false;
+ }
+
+ return true;
+}
+
+bool llama_memory_recurrent_context::apply() {
+ assert(!llama_memory_status_is_fail(status));
+
+ // no ubatches -> this is an update
+ if (ubatches.empty()) {
+ // recurrent cache never performs updates
+ assert(status == LLAMA_MEMORY_STATUS_NO_UPDATE);
+
+ return true;
+ }
+
+ mem->find_slot(ubatches[i_next]);
+
+ return true;
+}
+
+llama_memory_status llama_memory_recurrent_context::get_status() const {
+ return status;
+}
+
+const llama_ubatch & llama_memory_recurrent_context::get_ubatch() const {
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+ return ubatches[i_next];
+}
+
+uint32_t llama_memory_recurrent_context::get_n_rs() const {
+ return is_full ? mem->size : mem->n;
+}
+
+uint32_t llama_memory_recurrent_context::get_head() const {
+ return is_full ? 0 : mem->head;
+}
+
+int32_t llama_memory_recurrent_context::get_rs_z() const {
+ return is_full ? 0 : mem->rs_z;
+}
+
+uint32_t llama_memory_recurrent_context::get_size() const {
+ return mem->size;
+}
+
+ggml_tensor * llama_memory_recurrent_context::get_r_l(int32_t il) const {
+ return mem->r_l[il];
+}
+
+ggml_tensor * llama_memory_recurrent_context::get_s_l(int32_t il) const {
+ return mem->s_l[il];
+}
+
+int32_t llama_memory_recurrent_context::s_copy(int i) const {
+ return mem->cells[i + mem->head].src0;
+}
diff --git a/llama.cpp/src/llama-memory-recurrent.h b/llama.cpp/src/llama-memory-recurrent.h
new file mode 100644
index 0000000..47f01d7
--- /dev/null
+++ b/llama.cpp/src/llama-memory-recurrent.h
@@ -0,0 +1,182 @@
+#pragma once
+
+#include "llama-batch.h"
+#include "llama-graph.h"
+#include "llama-memory.h"
+
+#include <map>
+#include <set>
+#include <vector>
+
+//
+// llama_memory_recurrent
+//
+
+// TODO: extract the cache state used for graph computation into llama_memory_recurrent_context_i
+// see the implementation of llama_kv_cache_context for an example of how to do it
+class llama_memory_recurrent : public llama_memory_i {
+public:
+ llama_memory_recurrent(
+ const llama_model & model,
+ ggml_type type_r,
+ ggml_type type_s,
+ bool offload,
+ uint32_t mem_size,
+ uint32_t n_seq_max,
+ const layer_filter_cb & filter);
+
+ ~llama_memory_recurrent() = default;
+
+ //
+ // llama_memory_i
+ //
+
+ llama_memory_context_ptr init_batch(
+ llama_batch_allocr & balloc,
+ uint32_t n_ubatch,
+ bool embd_all) override;
+
+ llama_memory_context_ptr init_full() override;
+
+ llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
+
+ void clear(bool data) override;
+
+ bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
+ void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+ void seq_keep(llama_seq_id seq_id) override;
+ void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
+ void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
+
+ llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+ llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+ std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
+ bool prepare(const std::vector<llama_ubatch> & ubatches);
+
+ // find a contiguous slot of memory cells and emplace the ubatch there
+ bool find_slot(const llama_ubatch & ubatch);
+
+ bool get_can_shift() const override;
+
+ // state write/load
+
+ void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+ void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
+
+ uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+ uint32_t size = 0; // total number of cells, shared across all sequences
+ uint32_t used = 0; // used cells (i.e. at least one seq_id)
+
+ // computed before each graph build
+ uint32_t n = 0;
+
+ // first zero-ed state
+ int32_t rs_z = -1;
+
+ // TODO: optimize for recurrent state needs
+ struct mem_cell {
+ llama_pos pos = -1;
+ int32_t src = -1; // used to know where states should be copied from
+ int32_t src0 = -1; // like src, but only used when setting the inputs (allowing to copy once)
+ int32_t tail = -1;
+
+ std::set<llama_seq_id> seq_id;
+
+ bool has_seq_id(const llama_seq_id & id) const {
+ return seq_id.find(id) != seq_id.end();
+ }
+
+ bool is_empty() const {
+ return seq_id.empty();
+ }
+
+ bool is_same_seq(const mem_cell & other) const {
+ return seq_id == other.seq_id;
+ }
+ };
+
+ std::vector<mem_cell> cells;
+
+ // per layer
+ std::vector<ggml_tensor *> r_l;
+ std::vector<ggml_tensor *> s_l;
+
+private:
+ //const llama_model & model;
+ const llama_hparams & hparams;
+
+ const uint32_t n_seq_max = 1;
+
+    // ggml contexts for the recurrent state cache along with the allocated backend buffers:
+ std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
+
+ size_t total_size() const;
+
+ size_t size_r_bytes() const;
+ size_t size_s_bytes() const;
+
+ void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
+ void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
+
+ bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
+ bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
+};
+
+class llama_memory_recurrent_context : public llama_memory_context_i {
+public:
+ // used for errors
+ llama_memory_recurrent_context(llama_memory_status status);
+
+ // used to create a full-cache or update context
+ llama_memory_recurrent_context(
+ llama_memory_recurrent * mem);
+
+ // used to create a batch processing context from a batch
+ llama_memory_recurrent_context(
+ llama_memory_recurrent * mem,
+ std::vector<llama_ubatch> ubatches);
+
+ virtual ~llama_memory_recurrent_context();
+
+ //
+ // llama_memory_context_i
+ //
+
+ bool next() override;
+ bool apply() override;
+
+ llama_memory_status get_status() const override;
+ const llama_ubatch & get_ubatch() const override;
+
+ //
+ // llama_memory_recurrent_context specific API
+ //
+
+ uint32_t get_n_rs() const;
+ uint32_t get_head() const;
+ int32_t get_rs_z() const;
+ uint32_t get_size() const;
+
+ ggml_tensor * get_r_l(int32_t il) const;
+ ggml_tensor * get_s_l(int32_t il) const;
+
+ int32_t s_copy(int i) const;
+
+private:
+ const llama_memory_status status;
+
+ llama_memory_recurrent * mem;
+
+ size_t i_next = 0;
+
+ std::vector<llama_ubatch> ubatches;
+
+ //
+ // data needed for building the compute graph for the current ubatch:
+ // TODO: extract all the state like `head` and `n` here
+ //
+
+ const bool is_full = false;
+};
diff --git a/llama.cpp/src/llama-memory.cpp b/llama.cpp/src/llama-memory.cpp
new file mode 100644
index 0000000..ca6844c
--- /dev/null
+++ b/llama.cpp/src/llama-memory.cpp
@@ -0,0 +1,59 @@
+#include "llama-memory.h"
+
+llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1) {
+ bool has_update = false;
+
+ switch (s0) {
+ case LLAMA_MEMORY_STATUS_SUCCESS:
+ {
+ has_update = true;
+ break;
+ }
+ case LLAMA_MEMORY_STATUS_NO_UPDATE:
+ {
+ break;
+ }
+ case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+ case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+ {
+ return s0;
+ }
+ }
+
+ switch (s1) {
+ case LLAMA_MEMORY_STATUS_SUCCESS:
+ {
+ has_update = true;
+ break;
+ }
+ case LLAMA_MEMORY_STATUS_NO_UPDATE:
+ {
+ break;
+ }
+ case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+ case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+ {
+ return s1;
+ }
+ }
+
+ // if either status has an update, then the combined status has an update
+ return has_update ? LLAMA_MEMORY_STATUS_SUCCESS : LLAMA_MEMORY_STATUS_NO_UPDATE;
+}
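+
+// Illustrative truth table for the combinator above (s0 is checked first, so
+// its failure wins):
+//
+//   SUCCESS   + SUCCESS   -> SUCCESS
+//   SUCCESS   + NO_UPDATE -> SUCCESS
+//   NO_UPDATE + NO_UPDATE -> NO_UPDATE
+//   FAILED_*  + anything  -> FAILED_* (s0's failure)
+//   SUCCESS   + FAILED_*  -> FAILED_* (s1's failure)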
+
+bool llama_memory_status_is_fail(llama_memory_status status) {
+ switch (status) {
+ case LLAMA_MEMORY_STATUS_SUCCESS:
+ case LLAMA_MEMORY_STATUS_NO_UPDATE:
+ {
+ return false;
+ }
+ case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+ case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+ {
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/llama.cpp/src/llama-memory.h b/llama.cpp/src/llama-memory.h
new file mode 100644
index 0000000..4a157b9
--- /dev/null
+++ b/llama.cpp/src/llama-memory.h
@@ -0,0 +1,122 @@
+#pragma once
+
+#include "llama.h"
+
+#include <map>
+#include <memory>
+#include <functional>
+
+struct llama_ubatch;
+
+class llama_batch_allocr;
+
+class llama_io_write_i;
+class llama_io_read_i;
+
+struct llama_memory_params {
+ // kv cache
+ ggml_type type_k;
+ ggml_type type_v;
+
+ // use full-size SWA cache
+ bool swa_full;
+};
+
+enum llama_memory_status {
+ LLAMA_MEMORY_STATUS_SUCCESS = 0,
+ LLAMA_MEMORY_STATUS_NO_UPDATE,
+ LLAMA_MEMORY_STATUS_FAILED_PREPARE,
+ LLAMA_MEMORY_STATUS_FAILED_COMPUTE,
+};
+
+// helper function for combining the status of two memory contexts
+// useful for implementing hybrid memory types (e.g. iSWA)
+llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1);
+
+// helper function for checking if a memory status indicates a failure
+bool llama_memory_status_is_fail(llama_memory_status status);
+
+// the interface for managing the memory context during batch processing
+// this interface is implemented per memory type. see:
+// - llama_kv_cache_context
+// - llama_kv_cache_iswa_context
+// ...
+//
+// the only method that should mutate the memory and the memory context is llama_memory_context_i::apply()
+struct llama_memory_context_i {
+ virtual ~llama_memory_context_i() = default;
+
+ // consume the current ubatch from the context and proceed to the next one
+ // return false if we are done
+ virtual bool next() = 0;
+
+ // apply the memory state for the current ubatch to the memory object
+ // return false on failure
+ virtual bool apply() = 0;
+
+ // get the current ubatch
+ virtual const llama_ubatch & get_ubatch() const = 0;
+
+ // get the status of the memory context - used for error handling and checking if any updates would be applied
+ virtual llama_memory_status get_status() const = 0;
+};
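+
+// Illustrative driver loop (hedged sketch; the real one lives in
+// llama_context, and mem/balloc/n_ubatch/embd_all are assumed names):
+// a context is consumed by alternating apply() and next() until the
+// ubatches are exhausted, e.g.:
+//
+//   auto mctx = mem->init_batch(balloc, n_ubatch, embd_all);
+//   if (mctx->get_status() == LLAMA_MEMORY_STATUS_SUCCESS) {
+//       do {
+//           if (!mctx->apply()) { /* handle failure */ }
+//           const llama_ubatch & ub = mctx->get_ubatch();
+//           // ... build and compute the graph for ub ...
+//       } while (mctx->next());
+//   }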
+
+using llama_memory_context_ptr = std::unique_ptr<llama_memory_context_i>;
+
+// general concept of LLM memory
+// the KV cache is a type of LLM memory, but there can be other types
+struct llama_memory_i {
+ // this callback is used to filter out layers that should not be included in the cache
+ using layer_filter_cb = std::function<bool(int32_t il)>;
+
+ // this callback is used to specify which layers should reuse memory from other layers
+ // return negative value to indicate that the layer il should not reuse memory
+ using layer_reuse_cb = std::function<int32_t(int32_t il)>;
+
+ virtual ~llama_memory_i() = default;
+
+ // split the input batch into a set of ubatches and verify that they can fit into the cache
+ // return a context object containing the ubatches and memory state required to process them
+ // check the llama_memory_context_i::get_status() for the result
+ virtual llama_memory_context_ptr init_batch(
+ llama_batch_allocr & balloc,
+ uint32_t n_ubatch,
+ bool embd_all) = 0;
+
+ // simulate full cache, used for allocating worst-case compute buffers
+ virtual llama_memory_context_ptr init_full() = 0;
+
+ // prepare for any pending memory updates, such as shifts, copies, etc.
+ // status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
+ virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0;
+
+ // getters
+ virtual bool get_can_shift() const = 0;
+
+ //
+ // ops
+ //
+
+ // if data == true, the data buffers will also be cleared together with the metadata
+ virtual void clear(bool data) = 0;
+
+ virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
+ virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
+ virtual void seq_keep(llama_seq_id seq_id) = 0;
+ virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) = 0;
+ virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0;
+
+ virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
+ virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
+
+ virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;
+
+ //
+ // state write/read
+ //
+
+ virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const = 0;
+ virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) = 0;
+};
+
+using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
diff --git a/llama.cpp/src/llama-mmap.cpp b/llama.cpp/src/llama-mmap.cpp
new file mode 100644
index 0000000..0261e4c
--- /dev/null
+++ b/llama.cpp/src/llama-mmap.cpp
@@ -0,0 +1,742 @@
+#include "llama-mmap.h"
+
+#include "llama-impl.h"
+
+#include "ggml.h"
+
+#include <cstring>
+#include <climits>
+#include <stdexcept>
+#include <cerrno>
+#include <algorithm>
+
+#ifdef __has_include
+ #if __has_include(<unistd.h>)
+ #include <unistd.h>
+ #include <fcntl.h>
+ #include <sys/stat.h>
+ #if defined(_POSIX_MAPPED_FILES)
+ #include <sys/mman.h>
+ #endif
+ #if defined(_POSIX_MEMLOCK_RANGE)
+ #include <sys/resource.h>
+ #endif
+ #endif
+#endif
+
+#if defined(_WIN32)
+ #define WIN32_LEAN_AND_MEAN
+ #ifndef NOMINMAX
+ #define NOMINMAX
+ #endif
+ #include <windows.h>
+ #ifndef PATH_MAX
+ #define PATH_MAX MAX_PATH
+ #endif
+ #include <io.h>
+#endif
+
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
+// TODO: consider moving to llama-impl.h if needed in more places
+#if defined(_WIN32)
+static std::string llama_format_win_err(DWORD err) {
+ LPSTR buf;
+ size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+ NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
+ if (!size) {
+ return "FormatMessageA failed";
+ }
+ std::string ret(buf, size);
+ LocalFree(buf);
+ return ret;
+}
+#endif
+
+// llama_file
+
+struct llama_file::impl {
+#if defined(_WIN32)
+ HANDLE fp_win32;
+ std::string GetErrorMessageWin32(DWORD error_code) const {
+ std::string ret;
+ LPSTR lpMsgBuf = NULL;
+ DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+ NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+ if (!bufLen) {
+ ret = format("Win32 error code: %lx", error_code);
+ } else {
+ ret = lpMsgBuf;
+ LocalFree(lpMsgBuf);
+ }
+
+ return ret;
+ }
+
+ impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
+ fp = ggml_fopen(fname, mode);
+ if (fp == NULL) {
+ throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+ }
+ fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+ seek(0, SEEK_END);
+ size = tell();
+ seek(0, SEEK_SET);
+ }
+
+ size_t tell() const {
+ LARGE_INTEGER li;
+ li.QuadPart = 0;
+ BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+ if (!ret) {
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+ }
+
+ return li.QuadPart;
+ }
+
+ void seek(size_t offset, int whence) const {
+ static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
+ static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
+ static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
+
+ LARGE_INTEGER li;
+ li.QuadPart = offset;
+ BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+ if (!ret) {
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+ }
+ }
+
+ void read_raw(void * ptr, size_t len) {
+ size_t bytes_read = 0;
+ while (bytes_read < len) {
+ size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+ DWORD chunk_read = 0;
+ BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+ if (!result) {
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+ }
+ if (chunk_read < chunk_size || chunk_read == 0) {
+ throw std::runtime_error("unexpectedly reached end of file");
+ }
+
+ bytes_read += chunk_read;
+ }
+ }
+
+ uint32_t read_u32() {
+ uint32_t val;
+ read_raw(&val, sizeof(val));
+ return val;
+ }
+
+ void write_raw(const void * ptr, size_t len) const {
+ size_t bytes_written = 0;
+ while (bytes_written < len) {
+ size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+ DWORD chunk_written = 0;
+ BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+ if (!result) {
+ throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+ }
+ if (chunk_written < chunk_size || chunk_written == 0) {
+ throw std::runtime_error("unexpectedly failed to write bytes");
+ }
+
+ bytes_written += chunk_written;
+ }
+ }
+
+ void write_u32(uint32_t val) const {
+ write_raw(&val, sizeof(val));
+ }
+
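+    // NOTE (assumption based on the code above): reads on this path always go
+    // through ReadFile on the raw handle, bypassing the CRT stdio buffer, so
+    // the Win32 implementation reports direct I/O as available unconditionally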
+ bool has_direct_io() const {
+ return true;
+ }
+
+ ~impl() {
+ if (fp) {
+ std::fclose(fp);
+ }
+ }
+#else
+ impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
+#ifdef __linux__
+        // Try unbuffered (O_DIRECT) I/O for read-only access
+ if (use_direct_io && std::strcmp(mode, "rb") == 0) {
+ if (init_fd()) {
+ return;
+ }
+ LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
+ fname, strerror(errno));
+ }
+#endif
+ init_fp(mode);
+ }
+
+#ifdef __linux__
+ bool init_fd() {
+ fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
+
+ if (fd != -1) {
+ struct stat file_stats{};
+ fstat(fd, &file_stats);
+
+ size = file_stats.st_size;
+ alignment = file_stats.st_blksize;
+
+ off_t ret = lseek(fd, 0, SEEK_SET);
+ if (ret == -1) {
+ throw std::runtime_error(format("seek error: %s", strerror(errno)));
+ }
+ return true;
+ }
+ return false;
+ }
+#endif
+
+ void init_fp(const char * mode) {
+ fp = ggml_fopen(fname.c_str(), mode);
+ if (fp == NULL) {
+ throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
+ }
+ seek(0, SEEK_END);
+ size = tell();
+ seek(0, SEEK_SET);
+ }
+
+ size_t tell() const {
+ if (fd == -1) {
+ long ret = std::ftell(fp);
+ if (ret == -1) {
+ throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+ }
+
+ return (size_t) ret;
+ }
+
+ off_t pos = lseek(fd, 0, SEEK_CUR);
+ if (pos == -1) {
+ throw std::runtime_error(format("lseek error: %s", strerror(errno)));
+ }
+ return (size_t) pos;
+ }
+
+ void seek(size_t offset, int whence) const {
+ off_t ret = 0;
+ if (fd == -1) {
+ ret = std::fseek(fp, (long) offset, whence);
+ } else {
+ ret = lseek(fd, offset, whence);
+ }
+ if (ret == -1) {
+ throw std::runtime_error(format("seek error: %s", strerror(errno)));
+ }
+ }
+
+ void read_raw_unsafe(void * ptr, size_t len) {
+ if (len == 0) {
+ return;
+ }
+ errno = 0;
+ if (fd == -1) {
+ const size_t curr_off = tell();
+ const size_t to_read = std::min(len, size - curr_off);
+
+ std::size_t ret = std::fread(ptr, to_read, 1, fp);
+ if (ferror(fp)) {
+ throw std::runtime_error(format("read error: %s", strerror(errno)));
+ }
+ if (to_read > 0 && ret != 1) {
+ throw std::runtime_error("unexpectedly reached end of file");
+ }
+ } else {
+ size_t bytes_read = 0;
+ while (bytes_read < len) {
+ const size_t to_read = len - bytes_read;
+ ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);
+
+ if (ret == -1) {
+ if (errno == EINTR) {
+ continue; // Interrupted by signal, retry
+ }
+ // Fallback to std::fread in case the DMA controller cannot access the buffer
+ if (errno == EFAULT || errno == EINVAL) {
+ LLAMA_LOG_WARN("%s: Falling back to buffered IO due to %s\n", __func__, strerror(errno));
+ auto curr_off = tell();
+ close(fd);
+ fd = -1;
+ alignment = 1;
+ init_fp("rb");
+ seek(curr_off, SEEK_SET);
+ read_raw_unsafe(ptr, len);
+ return;
+ }
+ throw std::runtime_error(format("read error: %s", strerror(errno)));
+ }
+ if (ret == 0) {
+ // EOF: allow if this read was only pulling alignment padding past file end
+ off_t pos = lseek(fd, 0, SEEK_CUR);
+ if (pos != -1 && (size_t) pos == size) {
+ std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
+ return;
+ }
+ throw std::runtime_error("unexpectedly reached end of file");
+ }
+
+ bytes_read += (size_t) ret;
+ }
+ }
+ }
+
+ void read_aligned_chunk(void * dest, size_t size) {
+ size_t offset = tell();
+ off_t aligned_offset = offset & ~(alignment - 1);
+ off_t offset_from_alignment = offset - aligned_offset;
+ size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
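+        // illustrative example: with alignment = 4096, offset = 10000 and size = 500,
+        // aligned_offset = 8192, offset_from_alignment = 1808 and bytes_to_read = 4096,
+        // i.e. a single aligned block covering the requested range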
+
+ void * raw_buffer = nullptr;
+ int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
+ if (ret != 0) {
+ throw std::runtime_error(format("posix_memalign failed with error %d", ret));
+ }
+
+ struct aligned_buffer_deleter {
+ void operator()(void * p) const { free(p); }
+ };
+ std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
+
+ seek(aligned_offset, SEEK_SET);
+ read_raw_unsafe(buffer.get(), bytes_to_read);
+
+ uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
+ memcpy(dest, reinterpret_cast<void *>(actual_data), size);
+ }
+
+ void read_raw(void * ptr, size_t len) {
+ if (has_direct_io()) {
+ read_aligned_chunk(ptr, len);
+ } else {
+ read_raw_unsafe(ptr, len);
+ }
+ }
+
+ uint32_t read_u32() {
+ uint32_t ret;
+ read_raw(&ret, sizeof(ret));
+ return ret;
+ }
+
+ void write_raw(const void * ptr, size_t len) const {
+ if (len == 0) {
+ return;
+ }
+ errno = 0;
+ size_t ret = std::fwrite(ptr, len, 1, fp);
+ if (ret != 1) {
+ throw std::runtime_error(format("write error: %s", strerror(errno)));
+ }
+ }
+
+ void write_u32(uint32_t val) const {
+ write_raw(&val, sizeof(val));
+ }
+
+ bool has_direct_io() const {
+ return fd != -1 && alignment > 1;
+ }
+
+ ~impl() {
+ if (fd != -1) {
+ close(fd);
+ } else {
+ std::fclose(fp);
+ }
+ }
+ int fd = -1;
+ std::string fname;
+#endif
+
+ size_t read_alignment() const {
+ return alignment;
+ }
+
+ size_t alignment = 1;
+
+ FILE * fp{};
+ size_t size{};
+};
+
+llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
+ pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
+llama_file::~llama_file() = default;
+
+size_t llama_file::tell() const { return pimpl->tell(); }
+size_t llama_file::size() const { return pimpl->size; }
+
+size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
+bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
+
+int llama_file::file_id() const {
+#ifdef _WIN32
+ return _fileno(pimpl->fp);
+#else
+ if (pimpl->fd != -1) {
+ return pimpl->fd;
+ }
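+    // some libcs define fileno as a macro, in which case the unqualified form
+    // must be used; otherwise call the global function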
+#if defined(fileno)
+ return fileno(pimpl->fp);
+#else
+ return ::fileno(pimpl->fp);
+#endif
+#endif
+}
+
+void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
+void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
+#ifdef _WIN32
+void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
+#else
+void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
+#endif
+
+uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
+
+void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
+void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
+
+// llama_mmap
+
+struct llama_mmap::impl {
+#ifdef _POSIX_MAPPED_FILES
+ std::vector<std::pair<size_t, size_t>> mapped_fragments;
+
+ impl(struct llama_file * file, size_t prefetch, bool numa) {
+ size = file->size();
+ int fd = file->file_id();
+ int flags = MAP_SHARED;
+ if (numa) { prefetch = 0; }
+#ifdef __linux__
+ if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
+ LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",
+ strerror(errno));
+ }
+ if (prefetch) { flags |= MAP_POPULATE; }
+#endif
+ addr = mmap(NULL, file->size(), PROT_READ, flags, fd, 0);
+ if (addr == MAP_FAILED) {
+ throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
+ }
+
+ if (prefetch > 0) {
+ if (posix_madvise(addr, std::min(file->size(), prefetch), POSIX_MADV_WILLNEED)) {
+ LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
+ strerror(errno));
+ }
+ }
+ if (numa) {
+ if (posix_madvise(addr, file->size(), POSIX_MADV_RANDOM)) {
+ LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
+ strerror(errno));
+ }
+ }
+
+ mapped_fragments.emplace_back(0, file->size());
+ }
+
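+    // shrink [*first, *last) to the largest page-aligned subrange;
+    // e.g. with a 4096-byte page, [5000, 12000) becomes the empty range
+    // [8192, 8192) because no full page lies within it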
+ static void align_range(size_t * first, size_t * last, size_t page_size) {
+ size_t offset_in_page = *first & (page_size - 1);
+ size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
+ *first += offset_to_page;
+
+ *last = *last & ~(page_size - 1);
+
+ if (*last <= *first) {
+ *last = *first;
+ }
+ }
+
+ void unmap_fragment(size_t first, size_t last) {
+ int page_size = sysconf(_SC_PAGESIZE);
+ align_range(&first, &last, page_size);
+ size_t len = last - first;
+
+ if (len == 0) {
+ return;
+ }
+
+ GGML_ASSERT(first % page_size == 0);
+ GGML_ASSERT(last % page_size == 0);
+ GGML_ASSERT(last > first);
+
+ void * next_page_start = (uint8_t *) addr + first;
+
+ if (munmap(next_page_start, len)) {
+ LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+ }
+
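+        // rebuild the fragment list: a fragment fully containing [first, last) is split
+        // in two, fragments overlapping one end are trimmed, fragments fully inside the
+        // unmapped range are dropped, and all others are kept unchanged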
+ std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
+ for (const auto & frag : mapped_fragments) {
+ if (frag.first < first && frag.second > last) {
+ new_mapped_fragments.emplace_back(frag.first, first);
+ new_mapped_fragments.emplace_back(last, frag.second);
+ } else if (frag.first < first && frag.second > first) {
+ new_mapped_fragments.emplace_back(frag.first, first);
+ } else if (frag.first < last && frag.second > last) {
+ new_mapped_fragments.emplace_back(last, frag.second);
+ } else if (frag.first >= first && frag.second <= last) {
+ } else {
+ new_mapped_fragments.push_back(frag);
+ }
+ }
+ mapped_fragments = std::move(new_mapped_fragments);
+ }
+
+ ~impl() {
+ for (const auto & frag : mapped_fragments) {
+ if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
+ LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+ }
+ }
+ }
+#elif defined(_WIN32)
+ impl(struct llama_file * file, size_t prefetch, bool numa) {
+ GGML_UNUSED(numa);
+
+ size = file->size();
+
+ HANDLE hFile = (HANDLE) _get_osfhandle(file->file_id());
+
+ HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+
+ if (hMapping == NULL) {
+ DWORD error = GetLastError();
+ throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
+ }
+
+ addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+ DWORD error = GetLastError();
+ CloseHandle(hMapping);
+
+ if (addr == NULL) {
+ throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
+ }
+
+ if (prefetch > 0) {
+#if _WIN32_WINNT >= 0x602
+ BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
+ HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
+
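+        // resolve at runtime so the binary still loads on Windows versions that
+        // do not export PrefetchVirtualMemory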
+ pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
+
+ if (pPrefetchVirtualMemory) {
+ WIN32_MEMORY_RANGE_ENTRY range;
+ range.VirtualAddress = addr;
+ range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
+ if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+ LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
+ llama_format_win_err(GetLastError()).c_str());
+ }
+ }
+#else
+ LLAMA_LOG_DEBUG("skipping PrefetchVirtualMemory because _WIN32_WINNT < 0x602\n");
+#endif
+ }
+ }
+
+ void unmap_fragment(size_t first, size_t last) {
+ GGML_UNUSED(first);
+ GGML_UNUSED(last);
+ }
+
+ ~impl() {
+ if (!UnmapViewOfFile(addr)) {
+ LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
+ llama_format_win_err(GetLastError()).c_str());
+ }
+ }
+#else
+ impl(struct llama_file * file, size_t prefetch, bool numa) {
+ GGML_UNUSED(file);
+ GGML_UNUSED(prefetch);
+ GGML_UNUSED(numa);
+
+ throw std::runtime_error("mmap not supported");
+ }
+
+ void unmap_fragment(size_t first, size_t last) {
+ GGML_UNUSED(first);
+ GGML_UNUSED(last);
+
+ throw std::runtime_error("mmap not supported");
+ }
+#endif
+
+ void * addr;
+ size_t size;
+};
+
+llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch, bool numa) : pimpl(std::make_unique<impl>(file, prefetch, numa)) {}
+llama_mmap::~llama_mmap() = default;
+
+size_t llama_mmap::size() const { return pimpl->size; }
+void * llama_mmap::addr() const { return pimpl->addr; }
+
+void llama_mmap::unmap_fragment(size_t first, size_t last) { pimpl->unmap_fragment(first, last); }
+
+#if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32)
+const bool llama_mmap::SUPPORTED = true;
+#else
+const bool llama_mmap::SUPPORTED = false;
+#endif
+
+// llama_mlock
+
+struct llama_mlock::impl {
+#ifdef _POSIX_MEMLOCK_RANGE
+ static size_t lock_granularity() {
+ return (size_t) sysconf(_SC_PAGESIZE);
+ }
+
+ bool raw_lock(const void * addr, size_t size) const {
+ if (!mlock(addr, size)) {
+ return true;
+ }
+
+#ifdef __APPLE__
+#define MLOCK_SUGGESTION \
+ "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+ "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
+#else
+#define MLOCK_SUGGESTION \
+ "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
+#endif
+
+ char* errmsg = std::strerror(errno);
+ bool suggest = (errno == ENOMEM);
+#if (defined(TARGET_OS_VISION) && TARGET_OS_VISION) || (defined(TARGET_OS_TV) && TARGET_OS_TV) || defined(_AIX) || defined(__HAIKU__)
+        // visionOS/tvOS/AIX/Haiku don't support RLIMIT_MEMLOCK
+        // (TargetConditionals.h always defines the TARGET_OS_* macros as 0 or 1, so their values must be tested, not just their presence)
+        // Skip resource limit checks on these platforms
+ suggest = false;
+#else
+ struct rlimit lock_limit;
+ if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
+ suggest = false;
+ }
+ if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
+ suggest = false;
+ }
+#endif
+
+ LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+ size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
+ return false;
+ }
+
+ static void raw_unlock(void * addr, size_t size) {
+ if (munlock(addr, size)) {
+ LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
+ }
+ }
+#elif defined(_WIN32)
+ static size_t lock_granularity() {
+ SYSTEM_INFO si;
+ GetSystemInfo(&si);
+ return (size_t) si.dwPageSize;
+ }
+
+ bool raw_lock(void * ptr, size_t len) const {
+ for (int tries = 1; ; tries++) {
+ if (VirtualLock(ptr, len)) {
+ return true;
+ }
+ if (tries == 2) {
+ LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+ len, size, llama_format_win_err(GetLastError()).c_str());
+ return false;
+ }
+
+ SIZE_T min_ws_size, max_ws_size;
+ if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
+ LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
+ llama_format_win_err(GetLastError()).c_str());
+ return false;
+ }
+ size_t increment = len + 1048576;
+ min_ws_size += increment;
+ max_ws_size += increment;
+ if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
+ LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
+ llama_format_win_err(GetLastError()).c_str());
+ return false;
+ }
+ }
+ }
+
+ static void raw_unlock(void * ptr, size_t len) {
+ if (!VirtualUnlock(ptr, len)) {
+ LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
+ llama_format_win_err(GetLastError()).c_str());
+ }
+ }
+#else
+ static size_t lock_granularity() {
+ return (size_t) 65536;
+ }
+
+ bool raw_lock(const void * addr, size_t len) const {
+ LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
+ return false;
+ }
+
+ static void raw_unlock(const void * addr, size_t len) {}
+#endif
+
+ impl() : addr(NULL), size(0), failed_already(false) {}
+
+ void init(void * ptr) {
+ GGML_ASSERT(addr == NULL && size == 0);
+ addr = ptr;
+ }
+
+ void grow_to(size_t target_size) {
+ GGML_ASSERT(addr);
+ if (failed_already) {
+ return;
+ }
+ size_t granularity = lock_granularity();
+ target_size = (target_size + granularity - 1) & ~(granularity - 1);
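+        // e.g. with a 4096-byte granularity, a request to grow to 10000 bytes
+        // locks up to 12288 bytes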
+ if (target_size > size) {
+ if (raw_lock((uint8_t *) addr + size, target_size - size)) {
+ size = target_size;
+ } else {
+ failed_already = true;
+ }
+ }
+ }
+
+ void * addr;
+ size_t size;
+
+ bool failed_already;
+};
+
+llama_mlock::llama_mlock() : pimpl(std::make_unique<impl>()) {}
+llama_mlock::~llama_mlock() = default;
+
+void llama_mlock::init(void * ptr) { pimpl->init(ptr); }
+void llama_mlock::grow_to(size_t target_size) { pimpl->grow_to(target_size); }
+
+#if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32)
+const bool llama_mlock::SUPPORTED = true;
+#else
+const bool llama_mlock::SUPPORTED = false;
+#endif
+
+size_t llama_path_max() {
+ return PATH_MAX;
+}
diff --git a/llama.cpp/src/llama-mmap.h b/llama.cpp/src/llama-mmap.h
new file mode 100644
index 0000000..29ce4d2
--- /dev/null
+++ b/llama.cpp/src/llama-mmap.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+#include <cstdio>
+
+struct llama_file;
+struct llama_mmap;
+struct llama_mlock;
+
+using llama_files = std::vector<std::unique_ptr<llama_file>>;
+using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
+using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
+
+struct llama_file {
+ llama_file(const char * fname, const char * mode, bool use_direct_io = false);
+ ~llama_file();
+
+ size_t tell() const;
+ size_t size() const;
+
+ int file_id() const; // fileno overload
+
+ void seek(size_t offset, int whence) const;
+
+ void read_raw(void * ptr, size_t len);
+ void read_raw_unsafe(void * ptr, size_t len);
+ void read_aligned_chunk(void * dest, size_t size);
+ uint32_t read_u32();
+
+ void write_raw(const void * ptr, size_t len) const;
+ void write_u32(uint32_t val) const;
+
+ size_t read_alignment() const;
+ bool has_direct_io() const;
+private:
+ struct impl;
+ std::unique_ptr<impl> pimpl;
+};
+
+struct llama_mmap {
+ llama_mmap(const llama_mmap &) = delete;
+ llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false);
+ ~llama_mmap();
+
+ size_t size() const;
+ void * addr() const;
+
+ void unmap_fragment(size_t first, size_t last);
+
+ static const bool SUPPORTED;
+
+private:
+ struct impl;
+ std::unique_ptr<impl> pimpl;
+};
+
+struct llama_mlock {
+ llama_mlock();
+ ~llama_mlock();
+
+ void init(void * ptr);
+ void grow_to(size_t target_size);
+
+ static const bool SUPPORTED;
+
+private:
+ struct impl;
+ std::unique_ptr<impl> pimpl;
+};
+
+size_t llama_path_max();
diff --git a/llama.cpp/src/llama-model-loader.cpp b/llama.cpp/src/llama-model-loader.cpp
new file mode 100644
index 0000000..1501e39
--- /dev/null
+++ b/llama.cpp/src/llama-model-loader.cpp
@@ -0,0 +1,1261 @@
+#include "llama-model-loader.h"
+
+#include "ggml.h"
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <cstring>
+#include <future>
+
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
+
+const char * llama_file_version_name(llama_fver version) {
+ switch (version) {
+        case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until Nov 2023)";
+ case GGUF_FILE_VERSION_V2: return "GGUF V2";
+ case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
+ }
+
+ return "unknown";
+}
+
+static std::string llama_model_ftype_name(llama_ftype ftype) {
+ if (ftype & LLAMA_FTYPE_GUESSED) {
+ return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+ }
+
+ switch (ftype) {
+ case LLAMA_FTYPE_ALL_F32: return "all F32";
+ case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+ case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
+ case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+ case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
+ case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
+
+ default: return "unknown, may not work";
+ }
+}
+
+// return a list of splits for a given path
+// for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits
+static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) {
+ std::vector<std::string> paths;
+ std::string split_prefix;
+ std::vector<char> buf(llama_path_max(), 0);
+
+ {
+ int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split);
+ if (!ret) {
+ throw std::runtime_error(format("invalid split file name: %s", path.c_str()));
+ }
+ split_prefix = std::string(buf.data(), ret);
+ }
+
+ if (split_prefix.empty()) {
+ throw std::runtime_error(format("invalid split file: %s", path.c_str()));
+ }
+
+ for (int idx = 0; idx < n_split; ++idx) {
+ int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split);
+ paths.push_back(std::string(buf.data(), ret));
+ }
+
+ return paths;
+}
+
+namespace GGUFMeta {
+ template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
+ struct GKV_Base_Type {
+ static constexpr gguf_type gt = gt_;
+
+ static T getter(const gguf_context * ctx, const int kid) {
+ return gfun(ctx, kid);
+ }
+ };
+
+ template<typename T> struct GKV_Base;
+
+ template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {};
+ template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {};
+ template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {};
+ template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {};
+ template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {};
+ template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {};
+ template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {};
+ template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {};
+ template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {};
+ template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
+ template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
+ template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {};
+
+ template<> struct GKV_Base<std::string> {
+ static constexpr gguf_type gt = GGUF_TYPE_STRING;
+
+ static std::string getter(const gguf_context * ctx, const int kid) {
+ return gguf_get_val_str(ctx, kid);
+ }
+ };
+
+ struct ArrayInfo {
+ const gguf_type gt;
+ const size_t length;
+ const void * data;
+ };
+
+ template<> struct GKV_Base<ArrayInfo> {
+ public:
+ static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
+ static ArrayInfo getter(const gguf_context *ctx, const int k) {
+ const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
+ return ArrayInfo {
+ arr_type,
+ size_t(gguf_get_arr_n(ctx, k)),
+ arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
+ };
+ }
+ };
+
+ template<typename T>
+ class GKV : public GKV_Base<T> {
+ GKV() = delete;
+
+ public:
+ static T get_kv(const gguf_context * ctx, const int k) {
+ const enum gguf_type kt = gguf_get_kv_type(ctx, k);
+
+ if (kt != GKV::gt) {
+ throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
+ gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
+ }
+ return GKV::getter(ctx, k);
+ }
+
+ static const char * override_type_to_str(const llama_model_kv_override_type ty) {
+ switch (ty) {
+ case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
+ case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
+ case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
+ case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
+ }
+ return "unknown";
+ }
+
+ static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
+ if (!ovrd) { return false; }
+ if (ovrd->tag == expected_type) {
+ LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
+ __func__, override_type_to_str(ovrd->tag), ovrd->key);
+ switch (ovrd->tag) {
+ case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
+ LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
+ } break;
+ case LLAMA_KV_OVERRIDE_TYPE_INT: {
+ LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
+ } break;
+ case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
+ LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
+ } break;
+ case LLAMA_KV_OVERRIDE_TYPE_STR: {
+ LLAMA_LOG_INFO("%s\n", ovrd->val_str);
+ } break;
+ default:
+ // Shouldn't be possible to end up here, but just in case...
+ throw std::runtime_error(
+ format("Unsupported attempt to override %s type for metadata key %s\n",
+ override_type_to_str(ovrd->tag), ovrd->key));
+ }
+ return true;
+ }
+ LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
+ __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
+ return false;
+ }
+
+ template<typename OT>
+ static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
+ try_override(OT & target, const struct llama_model_kv_override * ovrd) {
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
+ target = ovrd->val_bool;
+ return true;
+ }
+ return false;
+ }
+
+ template<typename OT>
+ static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
+ try_override(OT & target, const struct llama_model_kv_override * ovrd) {
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
+ target = ovrd->val_i64;
+ return true;
+ }
+ return false;
+ }
+
+ template<typename OT>
+ static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
+    try_override(OT & target, const struct llama_model_kv_override * ovrd) {
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
+ target = ovrd->val_f64;
+ return true;
+ }
+ return false;
+ }
+
+ template<typename OT>
+ static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
+    try_override(OT & target, const struct llama_model_kv_override * ovrd) {
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
+ target = ovrd->val_str;
+ return true;
+ }
+ return false;
+ }
+
+ static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+ if (try_override<T>(target, ovrd)) {
+ return true;
+ }
+ if (k < 0) { return false; }
+ target = get_kv(ctx, k);
+ return true;
+ }
+
+ static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+ return set(ctx, gguf_find_key(ctx, key), target, ovrd);
+ }
+
+ static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+ return set(ctx, key.c_str(), target, ovrd);
+ }
+ };
+}
+
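+// usage sketch (illustrative): read a typed value, honoring an optional override, e.g.
+//   std::string name;
+//   GGUFMeta::GKV<std::string>::set(ctx, "general.name", name, /*ovrd=*/nullptr);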
+ template<typename T>
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
+ llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
+ const int kid = gguf_find_key(meta.get(), key.c_str());
+
+ if (kid < 0) {
+ if (required) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ struct GGUFMeta::ArrayInfo arr_info =
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
+
+ result = arr_info.length;
+ return true;
+ }
+
+ template<typename T>
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
+ llama_model_loader::get_arr_n(enum llm_kv kid, T & result, bool required) {
+ return get_arr_n(llm_kv(kid), result, required);
+ }
+
+ template bool llama_model_loader::get_arr_n(enum llm_kv kid, uint32_t & result, bool required);
+
+ template<typename T>
+ bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
+ const gguf_context * ctx = meta.get();
+ const int kid = gguf_find_key(ctx, key.c_str());
+
+ if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
+ if (required) {
+ throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ struct GGUFMeta::ArrayInfo arr_info =
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
+
+ switch (arr_info.gt) {
+ case GGUF_TYPE_UINT32:
+ case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
+ (std::is_same<T, uint32_t>::value)); break;
+ case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+ case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same<T, std::string>::value)); break;
+ default:
+ throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
+ }
+
+ if constexpr (std::is_same<T, std::string>::value) {
+ const size_t n_items = gguf_get_arr_n(ctx, kid);
+ result.clear();
+
+ for (size_t i = 0; i < n_items; i++) {
+ const T value = gguf_get_arr_str(ctx, kid, i);
+ result.emplace_back(value);
+ }
+ } else {
+ result.resize(arr_info.length);
+ result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+ }
+
+ return true;
+ }
+
+ template<typename T, size_t N_MAX>
+ bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
+ const gguf_context * ctx = meta.get();
+ const int kid = gguf_find_key(ctx, key.c_str());
+
+ if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
+ if (required) {
+ throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ struct GGUFMeta::ArrayInfo arr_info =
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
+
+ switch (arr_info.gt) {
+ case GGUF_TYPE_BOOL:
+ case GGUF_TYPE_UINT32:
+ case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
+ (std::is_same<T, uint32_t>::value)); break;
+ case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+ case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same<T, std::string>::value)); break;
+ default:
+ throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
+ }
+
+ if (arr_info.length > N_MAX) {
+ throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
+ }
+
+ if constexpr (std::is_same<T, std::string>::value) {
+ const size_t n_items = gguf_get_arr_n(ctx, kid);
+
+ for (size_t i = 0; i < n_items; i++) {
+ const T value = gguf_get_arr_str(ctx, kid, i);
+ result[i] = value;
+ }
+ } else {
+ if (arr_info.gt == GGUF_TYPE_BOOL) {
+ std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
+ return static_cast<T>(x);
+ });
+ } else {
+ std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
+ }
+ }
+
+ return true;
+ }
+
+ template<typename T>
+ bool llama_model_loader::get_arr(enum llm_kv kid, T & result, bool required) {
+ return get_arr(llm_kv(kid), result, required);
+ }
+
+ template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
+
+ template<typename T>
+ bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
+ auto it = kv_overrides.find(key);
+
+ const struct llama_model_kv_override * override =
+ it != kv_overrides.end() ? &it->second : nullptr;
+
+ const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);
+
+ if (required && !found) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+
+ return found;
+ }
+
+ template<typename T>
+ bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) {
+ return get_key(llm_kv(kid), result, required);
+ }
+
+ template bool llama_model_loader::get_key<bool> (enum llm_kv kid, bool & result, bool required);
+ template bool llama_model_loader::get_key<float> (enum llm_kv kid, float & result, bool required);
+ template bool llama_model_loader::get_key<uint32_t> (enum llm_kv kid, uint32_t & result, bool required);
+ template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required);
+
+ template<>
+ bool llama_model_loader::get_key(enum llm_kv kid, enum llama_pooling_type & result, bool required) {
+ uint32_t tmp;
+ const bool found = get_key(kid, tmp, required);
+ if (found) {
+ result = (enum llama_pooling_type) tmp;
+ } else {
+ result = LLAMA_POOLING_TYPE_UNSPECIFIED;
+ }
+ return found;
+ }
+
+ // get array of n <= N_MAX elements, or a single element repeated n times
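+    // e.g. a per-layer hyperparameter may be stored either as an array with one entry
+    // per layer or as a single scalar applied to every layer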
+ template<typename T, size_t N_MAX>
+ bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
+ const int kid = gguf_find_key(meta.get(), key.c_str());
+
+ if (kid < 0) {
+ if (required) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ if (n > N_MAX) {
+ throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
+ }
+
+ if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) {
+ struct GGUFMeta::ArrayInfo arr_info =
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
+
+ if (n != arr_info.length) {
+ throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
+ }
+
+ return get_arr(key, result, required);
+ }
+
+ T value;
+
+ bool ok = get_key(key, value, required);
+ if (!ok) {
+ return false;
+ }
+
+ for (uint32_t i = 0; i < n; i++) {
+ result[i] = value;
+ }
+
+ return true;
+ }
+
+ template<typename T>
+ bool llama_model_loader::get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required) {
+ return get_key_or_arr(llm_kv(kid), result, n, required);
+ }
+
+ bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
+ const std::string key = llm_kv(kid);
+
+ const int id = gguf_find_key(meta.get(), key.c_str());
+
+ if (id < 0) {
+ if (required) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+    // throw an error if the type is an array
+ if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
+ if (required) {
+ throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ return get_key(key, result, required);
+ }
+
+ // TODO: this is not very clever - figure out something better
+ template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
+ template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+ template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
+
+
+llama_model_loader::llama_model_loader(
+ const std::string & fname,
+ std::vector<std::string> & splits,
+ bool use_mmap,
+ bool use_direct_io,
+ bool check_tensors,
+ bool no_alloc,
+ const llama_model_kv_override * param_overrides_p,
+ const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
+ int trace = 0;
+ if (getenv("LLAMA_TRACE")) {
+ trace = atoi(getenv("LLAMA_TRACE"));
+ }
+
+ if (param_overrides_p != nullptr) {
+ for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
+ kv_overrides.insert({std::string(p->key), *p});
+ }
+ }
+
+ tensor_buft_overrides = param_tensor_buft_overrides_p;
+
+ // Load the main GGUF
+ struct ggml_context * ctx = NULL;
+ struct gguf_init_params params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &ctx,
+ };
+
+ meta.reset(gguf_init_from_file(fname.c_str(), params));
+ if (!meta) {
+ throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
+ }
+
+ get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+ llm_kv = LLM_KV(llm_arch_from_string(arch_name));
+
+ files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
+ contexts.emplace_back(ctx);
+
+ if (use_mmap && use_direct_io) {
+ if (files.back()->has_direct_io()) {
+ LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+ use_mmap = false;
+ } else {
+ LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
+ use_direct_io = false;
+
+ // reopen file using std::fopen for mmap
+ files.pop_back();
+ files.emplace_back(new llama_file(fname.c_str(), "rb", false));
+ }
+ }
+
+    // Save the tensor data offsets of the main file.
+    // For subsidiary files, the tensor data offsets in `meta` must not be used,
+    // so we build a unified tensor index for the weights.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+ std::string tensor_name = std::string(cur->name);
+        // make sure there are no duplicated tensor names
+ if (weights_map.find(tensor_name) != weights_map.end()) {
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+ }
+ n_elements += ggml_nelements(cur);
+ n_bytes += ggml_nbytes(cur);
+ weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
+ }
+
+    uint16_t n_split = 0;
+ get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+
+ // Load additional GGML contexts
+ if (n_split > 1) {
+ // make sure the main file is loaded first
+ uint16_t idx = 0;
+ const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
+ get_key(kv_split_no, idx);
+ if (idx != 0) {
+ throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
+ }
+
+ // generate list of splits if needed
+ if (splits.empty()) {
+ splits = llama_get_list_splits(fname, idx, n_split);
+ }
+
+        // in case the user gives a custom list of splits, check that it matches the expected number
+ if (n_split != (uint16_t)splits.size()) {
+ throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
+ }
+
+ if (trace > 0) {
+ LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
+ }
+
+ // load other splits
+ for (idx = 1; idx < n_split; idx++) {
+ const char * fname_split = splits[idx].c_str();
+
+ struct gguf_init_params split_params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &ctx,
+ };
+ gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
+ if (!ctx_gguf) {
+ throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
+ }
+
+ // check idx
+ {
+ const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
+ if (kid < 0) {
+ throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
+ }
+ int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
+ if (idx_gguf != idx) {
+ throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
+ }
+ }
+
+ files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
+ contexts.emplace_back(ctx);
+
+            // Save the tensor data offset info of the shard.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+ std::string tensor_name = std::string(cur->name);
+                // make sure there are no duplicated tensor names
+ if (weights_map.find(tensor_name) != weights_map.end()) {
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+ }
+ n_elements += ggml_nelements(cur);
+ n_bytes += ggml_nbytes(cur);
+ weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
+ }
+ }
+
+ get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
+
+ // sanity check
+ {
+ const int n_tensors_loaded = (int) weights_map.size();
+ if (n_tensors != n_tensors_loaded) {
+ throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+ }
+ }
+
+ LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
+ }
+
+ n_kv = gguf_get_n_kv(meta.get());
+ n_tensors = weights_map.size();
+
+ fver = (enum llama_fver) gguf_get_version(meta.get());
+
+ LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
+ __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
+
+ // determine file type based on the number of tensors for each quantization and print meta data
+ // TODO: make optional
+ {
+ std::map<enum ggml_type, uint32_t> n_type;
+
+ uint32_t n_type_max = 0;
+ enum ggml_type type_max = GGML_TYPE_F32;
+
+ for (const auto & it : weights_map) {
+ const llama_tensor_weight & w = it.second;
+ const ggml_tensor * tensor = w.tensor;
+
+ enum ggml_type type = tensor->type;
+
+ n_type[type]++;
+
+ if (n_type_max < n_type[type]) {
+ n_type_max = n_type[type];
+ type_max = type;
+ }
+
+ if (trace > 0) {
+ const uint16_t sid = w.idx;
+ LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__,
+ sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(),
+ ggml_nbytes(tensor)/1024.0f/1024.0f);
+ }
+ }
+
+ switch (type_max) {
+ case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
+ case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
+ case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
+ case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
+ case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
+ case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
+ case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
+ case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
+ case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
+ case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
+ case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
+ case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
+ case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+ case GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break;
+ case GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break;
+ case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
+ case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
+ case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
+ case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
+ case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
+ case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
+ case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
+ case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
+ case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
+ default:
+ {
+ LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+ ftype = LLAMA_FTYPE_ALL_F32;
+ } break;
+ }
+
+ // this is a way to mark that we have "guessed" the file type
+ ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
+
+ {
+ uint32_t ftype_val = 0;
+ if (get_key(LLM_KV_GENERAL_FILE_TYPE, ftype_val, false)) {
+ ftype = (llama_ftype) ftype_val;
+ }
+ }
+
+ LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+
+ for (int i = 0; i < n_kv; i++) {
+ const char * name = gguf_get_key(meta.get(), i);
+ const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
+ const std::string type_name =
+ type == GGUF_TYPE_ARRAY
+ ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
+ : gguf_type_name(type);
+
+ std::string value = gguf_kv_to_str(meta.get(), i);
+ const size_t MAX_VALUE_LEN = 40;
+ if (value.size() > MAX_VALUE_LEN) {
+ value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+ }
+ replace_all(value, "\n", "\\n");
+
+ LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+ }
+
+ // print type counts
+ for (auto & kv : n_type) {
+ if (kv.second == 0) {
+ continue;
+ }
+
+ LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+ }
+ }
+
+ if (!llama_mmap::SUPPORTED) {
+ LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
+ use_mmap = false;
+ }
+
+ this->use_mmap = use_mmap;
+ this->use_direct_io = use_direct_io;
+ this->check_tensors = check_tensors;
+ this->no_alloc = no_alloc;
+}
+
+std::string llama_model_loader::get_arch_name() const {
+ return arch_name;
+}
+
+enum llm_arch llama_model_loader::get_arch() const {
+ return llm_kv.arch;
+}
+
+const llama_model_loader::llama_tensor_weight * llama_model_loader::get_weight(const char * name) const {
+ auto pos = weights_map.find(name);
+ if (pos != weights_map.end()) {
+ return &pos->second;
+ }
+
+ return nullptr;
+}
+
+const llama_model_loader::llama_tensor_weight & llama_model_loader::require_weight(const char * name) const {
+ const llama_tensor_weight * weight = get_weight(name);
+ if (!weight) {
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+ }
+ return *weight;
+}
+
+struct ggml_tensor * llama_model_loader::get_tensor_meta(const char * name) const {
+ const auto * weight = get_weight(name);
+ if (!weight) {
+ return nullptr;
+ }
+ return weight->tensor;
+}
+
+struct ggml_tensor * llama_model_loader::require_tensor_meta(const std::string & name) const {
+ struct ggml_tensor * tensor = get_tensor_meta(name.c_str());
+ if (!tensor) {
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
+ }
+ return tensor;
+}
+
+const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
+ const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
+
+ if (cur == NULL) {
+ if (!required) {
+ return NULL;
+ }
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
+ }
+
+ {
+ bool is_ok = true;
+ for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+ if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
+ is_ok = false;
+ break;
+ }
+ }
+ if (!is_ok) {
+ throw std::runtime_error(
+ format("%s: tensor '%s' has wrong shape; expected %s, got %s",
+ __func__, name.c_str(),
+ llama_format_tensor_shape(ne).c_str(),
+ llama_format_tensor_shape(cur).c_str()));
+ }
+ }
+
+ return cur;
+}
+
+struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
+ LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
+
+ if (cur == NULL) {
+ return NULL;
+ }
+
+ bool duplicated = flags & TENSOR_DUPLICATED;
+
+ struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
+ ggml_set_name(tensor, ggml_get_name(cur));
+
+ if (duplicated) {
+ size_data += ggml_nbytes(cur);
+ } else {
+ n_created++;
+ }
+
+ return tensor;
+}
+
+struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+
+ if (cur == NULL) {
+ return NULL;
+ }
+
+ if (cur->type != base->type) {
+ throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
+ }
+
+ std::array<int64_t, GGML_MAX_DIMS> dims;
+ for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+ dims[i] = i < ne.size() ? ne.begin()[i] : 1;
+ }
+
+ struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
+ dims[0], dims[1], dims[2], dims[3],
+ cur->nb[1], cur->nb[2], cur->nb[3],
+ offset);
+
+ ggml_set_name(tensor, name.c_str());
+
+ n_created++;
+
+ return tensor;
+}
+
+void llama_model_loader::done_getting_tensors() const {
+ if (n_created != n_tensors) {
+ throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
+ }
+}
+
+void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
+ if (use_mmap) {
+ mappings.reserve(files.size());
+ mmaps_used.reserve(files.size());
+ for (const auto & file : files) {
+ bool is_numa = false;
+
+ auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (dev) {
+ auto * reg = ggml_backend_dev_backend_reg(dev);
+ auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
+ if (is_numa_fn) {
+ is_numa = is_numa_fn();
+ }
+ }
+
+ std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa);
+ mmaps_used.emplace_back(mapping->size(), 0);
+ if (mlock_mmaps) {
+ std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
+ mlock_mmap->init(mapping->addr());
+ mlock_mmaps->emplace_back(std::move(mlock_mmap));
+ }
+ mappings.emplace_back(std::move(mapping));
+ }
+ }
+
+ // compute the total size of all tensors for progress reporting
+ for (const auto & it : weights_map) {
+ size_data += ggml_nbytes(it.second.tensor);
+ }
+}
+
+void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
+ GGML_ASSERT(!mappings.empty());
+ const auto & mapping = mappings.at(idx);
+
+ *first = mapping->size();
+ *last = 0;
+ *addr = mapping->addr();
+ for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
+ const auto * weight = get_weight(ggml_get_name(tensor));
+ if (!weight || weight->idx != idx) {
+ continue;
+ }
+ *first = std::min(*first, weight->offs);
+ *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
+ }
+}
+
+void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
+ const auto & w = require_weight(ggml_get_name(cur));
+
+ if (use_mmap) {
+ const auto & mapping = mappings.at(w.idx);
+ if (cur->data == nullptr) {
+ cur->data = (uint8_t *)mapping->addr() + w.offs;
+ } else {
+ memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur));
+ }
+ } else {
+ GGML_ASSERT(cur->data != nullptr);
+ GGML_ASSERT(w.idx < files.size());
+ const auto & file = files.at(w.idx);
+ file->seek(w.offs, SEEK_SET);
+ file->read_raw(cur->data, ggml_nbytes(cur));
+ }
+
+ if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+ }
+}
+
+bool llama_model_loader::load_all_data(
+ struct ggml_context * ctx,
+ llama_buf_map & bufs,
+ llama_mlocks * lmlocks,
+ llama_progress_callback progress_callback,
+ void * progress_callback_user_data) {
+ GGML_ASSERT(size_data != 0 && "call init_mappings() first");
+
+ std::vector<no_init<uint8_t>> read_buf;
+ std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
+
+    // 4 staging buffers for async uploads; 1 MiB each seems to be a good default for single NVMe drives.
+    // NVMe RAID configurations might require more / larger buffers.
+ constexpr size_t n_buffers = 4;
+
+ size_t alignment = 1;
+ for (const auto & file : files) {
+ alignment = std::max(file->read_alignment(), alignment);
+ }
+
+ // Buffer size: balance between memory usage and I/O efficiency
+ // 64MB works well for NVMe drives
+ const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
+
+ std::vector<ggml_backend_buffer_t> host_buffers;
+ std::vector<ggml_backend_event_t> events;
+ std::vector<void *> host_ptrs;
+ size_t buffer_idx = 0; // buffer to use for async loads
+ ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t {
+ if (use_mmap || check_tensors) {
+ return nullptr;
+ }
+        // When not using mmapped I/O, use async uploads from pinned memory to GPU memory.
+ // First determine if the backend supports the necessary features for async uploads.
+ auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
+ if (!buf) {
+ LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func);
+ return nullptr;
+ }
+
+ auto * buft = ggml_backend_buffer_get_type(buf);
+ auto * dev = ggml_backend_buft_get_device(buft);
+ if (!dev) {
+ LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func,
+ ggml_backend_buft_name(buft));
+ return nullptr;
+ }
+
+ if (buft != ggml_backend_dev_buffer_type(dev)) {
+ LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func,
+ ggml_backend_buft_name(buft), ggml_backend_dev_name(dev));
+ return nullptr;
+ }
+
+ ggml_backend_dev_props props;
+ ggml_backend_dev_get_props(dev, &props);
+ if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
+ LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func,
+ ggml_backend_dev_name(dev));
+ return nullptr;
+ }
+
+ auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
+ if (!host_buft) {
+ LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func,
+ ggml_backend_dev_name(dev));
+ return nullptr;
+ }
+
+ // If the backend is supported, create pinned memory buffers and events for synchronisation.
+ for (size_t idx = 0; idx < n_buffers; ++idx) {
+ auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+
+ if (!buf) {
+ LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
+ ggml_backend_dev_name(dev));
+ return nullptr;
+ }
+
+ host_buffers.emplace_back(buf);
+ host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
+
+ auto * event = ggml_backend_event_new(dev);
+ if (!event) {
+ LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func,
+ ggml_backend_dev_name(dev));
+ return nullptr;
+ }
+
+ events.emplace_back(event);
+ }
+
+ ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+ if (!backend) {
+ LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func,
+ ggml_backend_dev_name(dev));
+ return nullptr;
+ }
+
+ return backend;
+ }(__func__);
+
+ if (upload_backend) {
+ LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
+ ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
+ ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
+ ggml_backend_name(upload_backend));
+ }
+
+ for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+ const auto * weight = get_weight(ggml_get_name(cur));
+ if (weight == nullptr) {
+ // this can happen with split experts models
+ continue;
+ }
+
+ if (progress_callback) {
+ if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+ return false;
+ }
+ }
+
+ size_t n_size = ggml_nbytes(cur);
+
+ if (use_mmap) {
+ const auto & mapping = mappings.at(weight->idx);
+ ggml_backend_buffer_t buf_mmap = nullptr;
+ if (bufs.count(weight->idx)) {
+ buf_mmap = bufs.at(weight->idx);
+ }
+ uint8_t * data = (uint8_t *) mapping->addr() + weight->offs;
+
+ if (check_tensors) {
+ validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+ }));
+ }
+
+ GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
+ if (buf_mmap && cur->data == nullptr) {
+ ggml_backend_tensor_alloc(buf_mmap, cur, data);
+ if (lmlocks) {
+ const auto & lmlock = lmlocks->at(weight->idx);
+ lmlock->grow_to(weight->offs + n_size);
+ }
+
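+                // widen the span of this mapping that was actually used; bytes outside it are unmapped once loading completes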
+ auto & mmap_used = mmaps_used[weight->idx];
+ mmap_used.first = std::min(mmap_used.first, weight->offs);
+ mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
+ } else {
+ ggml_backend_tensor_set(cur, data, 0, n_size);
+ }
+ } else {
+ const auto & file = files.at(weight->idx);
+
+ if (ggml_backend_buffer_is_host(cur->buffer)) {
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(cur->data, n_size);
+ if (check_tensors) {
+ validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
+ }));
+ }
+ } else {
+                // If upload_backend is valid, load the tensor in chunks into pinned memory and upload the buffers to the GPU asynchronously.
+ if (upload_backend) {
+ size_t offset = weight->offs;
+ alignment = file->read_alignment();
+ size_t aligned_offset = offset & ~(alignment - 1);
+ size_t offset_from_alignment = offset - aligned_offset;
+ file->seek(aligned_offset, SEEK_SET);
+
+ // Calculate aligned read boundaries
+ size_t read_start = aligned_offset;
+ size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
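+                    // e.g. with alignment = 4096, offset = 10000, n_size = 5000:
+                    //   aligned_offset = 8192, offset_from_alignment = 1808,
+                    //   read_start = 8192, read_end = 16384 (8192 aligned bytes cover the 5000 tensor bytes)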
+
+ size_t bytes_read = 0;
+ size_t data_read = 0; // Actual tensor data copied (excluding padding)
+
+ while (bytes_read < read_end - read_start) {
+ size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);
+
+ // Align the destination pointer within the pinned buffer
+ uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
+
+ // Wait for previous upload to complete before reusing buffer
+ ggml_backend_event_synchronize(events[buffer_idx]);
+
+ // Read aligned chunk from file
+ file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
+
+ // Calculate actual data portion (excluding alignment padding)
+ uintptr_t ptr_data = ptr_dest_aligned;
+ size_t data_to_copy = read_size;
+
+ // Skip alignment padding at start of first chunk
+ if (bytes_read == 0) {
+ ptr_data += offset_from_alignment;
+ data_to_copy -= offset_from_alignment;
+ }
+
+ // Trim alignment padding at end of last chunk
+ if (aligned_offset + bytes_read + read_size > offset + n_size) {
+ data_to_copy -= (read_end - (offset + n_size));
+ }
+
+ // Async upload actual data to GPU
+ ggml_backend_tensor_set_async(upload_backend, cur,
+ reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
+ ggml_backend_event_record(events[buffer_idx], upload_backend);
+
+ data_read += data_to_copy;
+ bytes_read += read_size;
+
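+                        // advance to the next pinned buffer so file reads and GPU uploads can overlap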
+ ++buffer_idx;
+ buffer_idx %= n_buffers;
+ }
+ } else {
+ read_buf.resize(n_size);
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(read_buf.data(), n_size);
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+ if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+ }
+ }
+ }
+ }
+
+ size_done += n_size;
+ }
+
+ // free temporary resources used for async uploads
+ for (auto * event : events) {
+ ggml_backend_event_synchronize(event);
+ ggml_backend_event_free(event);
+ }
+ for (auto * buf : host_buffers) {
+ ggml_backend_buffer_free(buf);
+ }
+ ggml_backend_free(upload_backend);
+
+ // check validation results
+ bool validation_failed = false;
+ for (auto & future : validation_result) {
+ auto result = future.get();
+ if (!result.second) {
+ LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
+ validation_failed = true;
+ }
+ }
+ if (validation_failed) {
+ throw std::runtime_error("found tensors with invalid data");
+ }
+
+ // check if this is the last call and do final cleanup
+ if (size_done >= size_data) {
+ // unmap offloaded tensors and metadata
+ if (use_mmap) {
+ for (uint32_t idx = 0; idx < mappings.size(); idx++) {
+ const auto & mmap_used = mmaps_used.at(idx);
+ auto & mapping = mappings.at(idx);
+ mapping->unmap_fragment(0, mmap_used.first);
+ if (mmap_used.second != 0) {
+ mapping->unmap_fragment(mmap_used.second, mapping->size());
+ }
+ }
+ }
+ if (progress_callback) {
+ // Even though the model is done loading, we still honor
+ // cancellation since we need to free allocations.
+ return progress_callback(1.0f, progress_callback_user_data);
+ }
+ }
+
+ return true;
+}
+
+std::string llama_model_loader::ftype_name() const {
+ return llama_model_ftype_name(ftype);
+}
+
+void llama_model_loader::print_info() const {
+ LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
+ LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
+ if (n_bytes < GiB) {
+ LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
+ } else {
+ LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
+ }
+}
diff --git a/llama.cpp/src/llama-model-loader.h b/llama.cpp/src/llama-model-loader.h
new file mode 100644
index 0000000..65953dd
--- /dev/null
+++ b/llama.cpp/src/llama-model-loader.h
@@ -0,0 +1,176 @@
+#pragma once
+
+#include "llama.h"
+
+#include "llama-impl.h"
+#include "llama-arch.h"
+#include "llama-mmap.h"
+
+#include "ggml-cpp.h"
+
+#include <cstddef>
+#include <map>
+#include <stdexcept>
+#include <unordered_map>
+
+using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+
+enum llama_fver {
+ GGUF_FILE_VERSION_V1 = 1,
+ GGUF_FILE_VERSION_V2 = 2,
+ GGUF_FILE_VERSION_V3 = 3,
+};
+
+const char * llama_file_version_name(llama_fver version);
+
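+// typical usage (sketch): construct the loader, read hparams via get_key()/get_arr(),
+// create the weight tensors, call done_getting_tensors() and init_mappings(),
+// then call load_all_data() to stream the tensor data in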
+struct llama_model_loader {
+ // Holds information on a model weight
+ struct llama_tensor_weight {
+ uint16_t idx; // source file index
+ size_t offs; // tensor data offset in the original file
+
+ ggml_tensor * tensor;
+
+ llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+ const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
+ if (tensor_idx < 0) {
+ throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
+ }
+
+ offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
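+            // the first comparison catches size_t overflow, the second a truncated or corrupted file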
+ if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) {
+ throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
+ }
+ }
+ };
+
+ // custom comparator to sort weights more nicely by layer
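+    // (plain lexicographic order would sort "blk.10." before "blk.2.")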
+ struct weight_name_comparer {
+ bool operator()(const std::string & a, const std::string & b) const {
+ int a_layer = -1;
+ int b_layer = -1;
+ sscanf(a.c_str(), "blk.%d.", &a_layer);
+ sscanf(b.c_str(), "blk.%d.", &b_layer);
+ if (a_layer != b_layer) {
+ return a_layer < b_layer;
+ }
+ return a < b;
+ }
+ };
+
+ static const int TENSOR_NOT_REQUIRED = 1 << 0;
+ static const int TENSOR_DUPLICATED = 1 << 1;
+ static const int TENSOR_SKIP = 1 << 2;
+
+ int n_kv = 0;
+ int n_tensors = 0;
+ int n_created = 0;
+
+ uint64_t n_elements = 0;
+ size_t n_bytes = 0;
+
+ bool use_mmap = false;
+ bool use_direct_io = false;
+ bool check_tensors;
+ bool no_alloc;
+
+ llama_files files;
+ llama_ftype ftype;
+ llama_fver fver;
+
+ llama_mmaps mappings;
+
+ std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
+ std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
+ const llama_model_tensor_buft_override * tensor_buft_overrides;
+
+ gguf_context_ptr meta;
+ std::vector<ggml_context_ptr> contexts;
+
+ std::string arch_name;
+ LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+ size_t size_done = 0;
+ size_t size_data = 0;
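+    // per file index: [min, max) byte range of each mapping that was actually used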
+ std::vector<std::pair<size_t, size_t>> mmaps_used;
+
+ llama_model_loader(
+ const std::string & fname,
+        std::vector<std::string> & splits, // optional, only needed if the splits do not follow the naming scheme
+ bool use_mmap,
+ bool use_direct_io,
+ bool check_tensors,
+ bool no_alloc,
+ const llama_model_kv_override * param_overrides_p,
+ const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
+
+ template<typename T>
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
+ get_arr_n(const std::string & key, T & result, bool required = true);
+
+ template<typename T>
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
+ get_arr_n(enum llm_kv kid, T & result, bool required = true);
+
+ template<typename T>
+ bool get_arr(const std::string & key, std::vector<T> & result, bool required = true);
+
+ template<typename T, size_t N_MAX>
+ bool get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required = true);
+
+ template<typename T>
+ bool get_arr(enum llm_kv kid, T & result, bool required = true);
+
+ template<typename T>
+ bool get_key(const std::string & key, T & result, bool required = true);
+
+ template<typename T>
+ bool get_key(enum llm_kv kid, T & result, bool required = true);
+
+ template<typename T, size_t N_MAX>
+ bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required = true);
+
+ template<typename T>
+ bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);
+
+ bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
+
+ std::string get_arch_name() const;
+
+ enum llm_arch get_arch() const;
+
+ const llama_tensor_weight * get_weight(const char * name) const;
+
+ const llama_tensor_weight & require_weight(const char * name) const;
+
+ struct ggml_tensor * get_tensor_meta(const char * name) const;
+
+ struct ggml_tensor * require_tensor_meta(const std::string & name) const;
+
+ const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;
+
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);
+
+ struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);
+
+ void done_getting_tensors() const;
+
+ void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr);
+
+ void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const;
+
+ // for backwards compatibility, does not support ggml-backend
+ void load_data_for(struct ggml_tensor * cur) const;
+
+ // Returns false if cancelled by progress_callback
+ bool load_all_data(
+ struct ggml_context * ctx,
+ llama_buf_map & bufs,
+ llama_mlocks * lmlocks,
+ llama_progress_callback progress_callback,
+ void * progress_callback_user_data);
+
+ std::string ftype_name() const;
+
+ void print_info() const;
+};
diff --git a/llama.cpp/src/llama-model-saver.cpp b/llama.cpp/src/llama-model-saver.cpp
new file mode 100644
index 0000000..36e3530
--- /dev/null
+++ b/llama.cpp/src/llama-model-saver.cpp
@@ -0,0 +1,285 @@
+#include "llama-model-saver.h"
+
+#include "gguf.h"
+
+#include "llama.h"
+#include "llama-hparams.h"
+#include "llama-model.h"
+#include "llama-vocab.h"
+
+#include <string>
+
+llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
+ gguf_ctx = gguf_init_empty();
+}
+
+llama_model_saver::~llama_model_saver() {
+ gguf_free(gguf_ctx);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
+ gguf_set_val_u32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const int32_t value) {
+ gguf_set_val_i32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const float value) {
+ gguf_set_val_f32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const bool value) {
+ gguf_set_val_bool(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const char * value) {
+ gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+[[noreturn]]
+void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
+ GGML_UNUSED(key);
+ GGML_UNUSED(value);
+ GGML_ABORT("fatal error"); // this should never be called, only needed to make the template below compile
+}
+
+template <typename Container>
+void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
+ const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
+ GGML_ASSERT(n_values <= value.size());
+
+ if (n_values == 0) {
+ return;
+ }
+
+ if (per_layer) {
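+        // if every layer shares the same value, write a single scalar KV instead of an array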
+ bool all_values_the_same = true;
+ for (size_t i = 1; i < n_values; ++i) {
+ if (value[i] != value[0]) {
+ all_values_the_same = false;
+ break;
+ }
+ }
+ if (all_values_the_same) {
+ add_kv(key, value[0]);
+ return;
+ }
+ }
+
+ if (std::is_same<typename Container::value_type, uint8_t>::value) {
+ gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT8, value.data(), n_values);
+ } else if (std::is_same<typename Container::value_type, int8_t>::value) {
+ gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values);
+ } else if (std::is_same<typename Container::value_type, uint32_t>::value) {
+ gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values);
+ } else if (std::is_same<typename Container::value_type, int32_t>::value) {
+ gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values);
+ } else if (std::is_same<typename Container::value_type, float>::value) {
+ gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_FLOAT32, value.data(), n_values);
+ } else if (std::is_same<Container, std::string>::value) {
+ gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), reinterpret_cast<const char *>(value.data()));
+ } else {
+ GGML_ABORT("fatal error");
+ }
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
+ std::vector<const char *> tmp(value.size());
+ for (size_t i = 0; i < value.size(); ++i) {
+ tmp[i] = value[i].c_str();
+ }
+ gguf_set_arr_str(gguf_ctx, llm_kv(key).c_str(), tmp.data(), tmp.size());
+}
+
+void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
+ if (!tensor) {
+ return;
+ }
+ if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) {
+ GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME
+ return;
+ }
+ gguf_add_tensor(gguf_ctx, tensor);
+}
+
+void llama_model_saver::add_kv_from_model() {
+ const llama_hparams & hparams = model.hparams;
+ const llama_vocab & vocab = model.vocab;
+
+ const int32_t n_vocab = vocab.n_tokens();
+ std::vector<std::string> tokens(n_vocab);
+ std::vector<float> scores(n_vocab);
+ std::vector<int32_t> token_types(n_vocab);
+
+ for (int32_t id = 0; id < n_vocab; ++id) {
+ const llama_vocab::token_data & token_data = vocab.get_token_data(id);
+
+ tokens[id] = token_data.text;
+ scores[id] = token_data.score;
+
+        switch (token_data.attr) {
+ case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
+ case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
+ case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
+ case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
+ case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
+ case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
+ case LLAMA_TOKEN_ATTR_UNDEFINED:
+ default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
+ }
+ }
+
+ // add_kv(LLM_KV_GENERAL_TYPE, ???);
+ add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name());
+ // add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
+ // add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
+ add_kv(LLM_KV_GENERAL_NAME, model.name);
+ // add_kv(LLM_KV_GENERAL_AUTHOR, ???);
+ // add_kv(LLM_KV_GENERAL_VERSION, ???);
+ // add_kv(LLM_KV_GENERAL_URL, ???);
+ // add_kv(LLM_KV_GENERAL_DESCRIPTION, ???);
+ // add_kv(LLM_KV_GENERAL_LICENSE, ???);
+ // add_kv(LLM_KV_GENERAL_SOURCE_URL, ???);
+ // add_kv(LLM_KV_GENERAL_SOURCE_HF_REPO, ???);
+
+ add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
+ add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
+ add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+ if (hparams.n_embd_out_impl > 0) {
+ add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl);
+ }
+ add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
+ add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
+ add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+    add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+ add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+ // add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???);
+ add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert);
+ add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
+ add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type));
+ add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id);
+ add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping);
+ add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping);
+ add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm);
+ add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers);
+ add_kv(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+ add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
+ add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+ add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+
+ add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true);
+ add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
+ add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+ add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
+ add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k);
+ add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v);
+ add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+ add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+ add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+
+ const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
+
+ add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
+ add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
+ // add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
+ add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
+ add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor);
+ add_kv(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor);
+ add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn);
+ add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned);
+ add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+
+ // TODO: implement split file support
+ // add_kv(LLM_KV_SPLIT_NO, ???);
+ // add_kv(LLM_KV_SPLIT_COUNT, ???);
+ // add_kv(LLM_KV_SPLIT_TENSORS_COUNT, ???);
+
+ add_kv(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms);
+
+ add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+
+ add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model());
+ add_kv(LLM_KV_TOKENIZER_PRE, vocab.get_tokenizer_pre());
+ add_kv(LLM_KV_TOKENIZER_LIST, tokens);
+ add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE, token_types);
+ add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, vocab.n_token_types());
+ add_kv(LLM_KV_TOKENIZER_SCORES, scores);
+ add_kv(LLM_KV_TOKENIZER_MERGES, vocab.get_bpe_merges());
+    // FIXME: llama_token is i32, but u32 is expected when reading it back from a GGUF file; not an issue for writing, though
+ add_kv(LLM_KV_TOKENIZER_BOS_ID, uint32_t(vocab.token_bos()));
+ add_kv(LLM_KV_TOKENIZER_EOS_ID, uint32_t(vocab.token_eos()));
+ add_kv(LLM_KV_TOKENIZER_EOT_ID, uint32_t(vocab.token_eot()));
+ add_kv(LLM_KV_TOKENIZER_EOM_ID, uint32_t(vocab.token_eom()));
+ add_kv(LLM_KV_TOKENIZER_UNK_ID, uint32_t(vocab.token_unk()));
+ add_kv(LLM_KV_TOKENIZER_SEP_ID, uint32_t(vocab.token_sep()));
+ add_kv(LLM_KV_TOKENIZER_PAD_ID, uint32_t(vocab.token_pad()));
+ // add_kv(LLM_KV_TOKENIZER_CLS_ID, uint32_t(vocab.token_bos())); // deprecated
+ // add_kv(LLM_KV_TOKENIZER_MASK_ID, ???);
+ add_kv(LLM_KV_TOKENIZER_ADD_BOS, vocab.get_add_bos());
+ add_kv(LLM_KV_TOKENIZER_ADD_EOS, vocab.get_add_eos());
+ add_kv(LLM_KV_TOKENIZER_ADD_SEP, vocab.get_add_sep());
+ add_kv(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.get_add_space_prefix());
+ add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.get_remove_extra_whitespaces());
+ add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, vocab.get_precompiled_charsmap());
+ // add_kv(LLM_KV_TOKENIZER_HF_JSON, ???);
+ // add_kv(LLM_KV_TOKENIZER_RWKV, ???);
+ add_kv(LLM_KV_TOKENIZER_FIM_PRE_ID, uint32_t(vocab.token_fim_pre()));
+ add_kv(LLM_KV_TOKENIZER_FIM_SUF_ID, uint32_t(vocab.token_fim_suf()));
+ add_kv(LLM_KV_TOKENIZER_FIM_MID_ID, uint32_t(vocab.token_fim_mid()));
+ add_kv(LLM_KV_TOKENIZER_FIM_PAD_ID, uint32_t(vocab.token_fim_pad()));
+ add_kv(LLM_KV_TOKENIZER_FIM_REP_ID, uint32_t(vocab.token_fim_rep()));
+ add_kv(LLM_KV_TOKENIZER_FIM_SEP_ID, uint32_t(vocab.token_fim_sep()));
+
+ // TODO: implement LoRA support
+ // add_kv(LLM_KV_ADAPTER_TYPE, ???);
+ // add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???);
+
+ // deprecated
+ // add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???);
+ // add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???);
+ // add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???);
+}
+
+void llama_model_saver::add_tensors_from_model() {
+ if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
+ add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
+ }
+ add_tensor(model.type_embd);
+ add_tensor(model.pos_embd);
+ add_tensor(model.tok_norm);
+ add_tensor(model.tok_norm_b);
+ add_tensor(model.output_norm);
+ add_tensor(model.output_norm_b);
+ add_tensor(model.output);
+ add_tensor(model.output_b);
+ add_tensor(model.output_norm_enc);
+ add_tensor(model.cls);
+ add_tensor(model.cls_b);
+ add_tensor(model.cls_out);
+ add_tensor(model.cls_out_b);
+
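+    // llama_layer holds nothing but ggml_tensor pointers, so it can be scanned as a flat pointer array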
+ for (const struct llama_layer & layer : model.layers) {
+ for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
+ add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
+ }
+ }
+}
+
+void llama_model_saver::save(const std::string & path_model) {
+ gguf_write_to_file(gguf_ctx, path_model.c_str(), false);
+}
+
diff --git a/llama.cpp/src/llama-model-saver.h b/llama.cpp/src/llama-model-saver.h
new file mode 100644
index 0000000..a5a434c
--- /dev/null
+++ b/llama.cpp/src/llama-model-saver.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "llama.h"
+#include "llama-arch.h"
+
+#include <vector>
+
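+// usage sketch: llama_model_saver saver(model); saver.add_kv_from_model();
+// saver.add_tensors_from_model(); saver.save("model-out.gguf"); (path is illustrative)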
+struct llama_model_saver {
+ struct gguf_context * gguf_ctx = nullptr;
+ const struct llama_model & model;
+ const struct LLM_KV llm_kv;
+
+ llama_model_saver(const struct llama_model & model);
+ ~llama_model_saver();
+
+ void add_kv(enum llm_kv key, uint32_t value);
+ void add_kv(enum llm_kv key, int32_t value);
+ void add_kv(enum llm_kv key, float value);
+ void add_kv(enum llm_kv key, bool value);
+ void add_kv(enum llm_kv key, const char * value);
+
+ [[noreturn]]
+ void add_kv(enum llm_kv key, char value); // needed to make the template below compile
+
+ template <typename Container>
+ void add_kv(enum llm_kv key, const Container & value, bool per_layer = false);
+
+ void add_kv(enum llm_kv key, const std::vector<std::string> & value);
+
+ void add_tensor(const struct ggml_tensor * tensor);
+
+ void add_kv_from_model();
+
+ void add_tensors_from_model();
+
+ void save(const std::string & path_model);
+};
diff --git a/llama.cpp/src/llama-model.cpp b/llama.cpp/src/llama-model.cpp
new file mode 100644
index 0000000..5816e9a
--- /dev/null
+++ b/llama.cpp/src/llama-model.cpp
@@ -0,0 +1,8953 @@
+#include "llama-model.h"
+
+#include "llama-impl.h"
+#include "llama-mmap.h"
+#include "llama-cparams.h"
+#include "llama-model-loader.h"
+
+#include "llama-kv-cache.h"
+#include "llama-kv-cache-iswa.h"
+#include "llama-memory-hybrid.h"
+#include "llama-memory-hybrid-iswa.h"
+#include "llama-memory-recurrent.h"
+
+#include "ggml-cpp.h"
+
+#include "models/models.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cfloat>
+#include <cstring>
+#include <cmath>
+#include <functional>
+#include <map>
+#include <regex>
+#include <sstream>
+#include <stdexcept>
+
+const char * llm_type_name(llm_type type) {
+ switch (type) {
+ case LLM_TYPE_14M: return "14M";
+ case LLM_TYPE_17M: return "17M";
+ case LLM_TYPE_22M: return "22M";
+ case LLM_TYPE_33M: return "33M";
+ case LLM_TYPE_47M: return "47M";
+ case LLM_TYPE_60M: return "60M";
+ case LLM_TYPE_70M: return "70M";
+ case LLM_TYPE_80M: return "80M";
+ case LLM_TYPE_109M: return "109M";
+ case LLM_TYPE_137M: return "137M";
+ case LLM_TYPE_140M: return "140M";
+ case LLM_TYPE_149M: return "149M";
+ case LLM_TYPE_160M: return "160M";
+ case LLM_TYPE_190M: return "190M";
+ case LLM_TYPE_220M: return "220M";
+ case LLM_TYPE_250M: return "250M";
+ case LLM_TYPE_256M: return "256M";
+ case LLM_TYPE_270M: return "270M";
+ case LLM_TYPE_335M: return "335M";
+ case LLM_TYPE_350M: return "350M";
+ case LLM_TYPE_360M: return "360M";
+ case LLM_TYPE_395M: return "395M";
+ case LLM_TYPE_410M: return "410M";
+ case LLM_TYPE_450M: return "450M";
+ case LLM_TYPE_475M: return "475M";
+ case LLM_TYPE_558M: return "558M";
+ case LLM_TYPE_700M: return "700M";
+ case LLM_TYPE_770M: return "770M";
+ case LLM_TYPE_780M: return "780M";
+ case LLM_TYPE_950M: return "950M";
+ case LLM_TYPE_0_3B: return "0.3B";
+ case LLM_TYPE_0_5B: return "0.5B";
+ case LLM_TYPE_0_6B: return "0.6B";
+ case LLM_TYPE_1B: return "1B";
+ case LLM_TYPE_1_2B: return "1.2B";
+ case LLM_TYPE_1_3B: return "1.3B";
+ case LLM_TYPE_1_4B: return "1.4B";
+ case LLM_TYPE_1_5B: return "1.5B";
+ case LLM_TYPE_1_6B: return "1.6B";
+ case LLM_TYPE_1_7B: return "1.7B";
+ case LLM_TYPE_1_8B: return "1.8B";
+ case LLM_TYPE_2B: return "2B";
+ case LLM_TYPE_2_6B: return "2.6B";
+ case LLM_TYPE_2_8B: return "2.8B";
+ case LLM_TYPE_2_9B: return "2.9B";
+ case LLM_TYPE_3B: return "3B";
+ case LLM_TYPE_4B: return "4B";
+ case LLM_TYPE_6B: return "6B";
+ case LLM_TYPE_6_9B: return "6.9B";
+ case LLM_TYPE_7B: return "7B";
+ case LLM_TYPE_8B: return "8B";
+ case LLM_TYPE_9B: return "9B";
+ case LLM_TYPE_11B: return "11B";
+ case LLM_TYPE_12B: return "12B";
+ case LLM_TYPE_13B: return "13B";
+ case LLM_TYPE_14B: return "14B";
+ case LLM_TYPE_15B: return "15B";
+ case LLM_TYPE_16B: return "16B";
+ case LLM_TYPE_20B: return "20B";
+ case LLM_TYPE_26B: return "26B";
+ case LLM_TYPE_27B: return "27B";
+ case LLM_TYPE_30B: return "30B";
+ case LLM_TYPE_32B: return "32B";
+ case LLM_TYPE_34B: return "34B";
+ case LLM_TYPE_35B: return "35B";
+ case LLM_TYPE_36B: return "36B";
+ case LLM_TYPE_40B: return "40B";
+ case LLM_TYPE_65B: return "65B";
+ case LLM_TYPE_70B: return "70B";
+ case LLM_TYPE_120B: return "120B";
+ case LLM_TYPE_142B: return "142B";
+ case LLM_TYPE_236B: return "236B";
+ case LLM_TYPE_290B: return "290B";
+ case LLM_TYPE_314B: return "314B";
+ case LLM_TYPE_405B: return "405B";
+ case LLM_TYPE_671B: return "671B";
+ case LLM_TYPE_SMALL: return "0.1B";
+ case LLM_TYPE_MEDIUM: return "0.4B";
+ case LLM_TYPE_LARGE: return "0.8B";
+ case LLM_TYPE_XL: return "1.5B";
+ case LLM_TYPE_A1_7B: return "A1.7B";
+ case LLM_TYPE_A2_7B: return "A2.7B";
+ case LLM_TYPE_8x7B: return "8x7B";
+ case LLM_TYPE_8x22B: return "8x22B";
+ case LLM_TYPE_16x12B: return "16x12B";
+ case LLM_TYPE_16x3_8B: return "16x3.8B";
+ case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
+ case LLM_TYPE_57B_A14B: return "57B.A14B";
+ case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
+ case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
+ case LLM_TYPE_A13B: return "A13B";
+ case LLM_TYPE_7B_A1B: return "7B.A1B";
+ case LLM_TYPE_8B_A1B: return "8B.A1B";
+ case LLM_TYPE_16B_A1B: return "16B.A1B";
+ case LLM_TYPE_21B_A3B: return "21B.A3B";
+ case LLM_TYPE_30B_A3B: return "30B.A3B";
+ case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
+ case LLM_TYPE_35B_A3B: return "35B.A3B";
+ case LLM_TYPE_48B_A3B: return "48B.A3B";
+ case LLM_TYPE_80B_A3B: return "80B.A3B";
+ case LLM_TYPE_100B_A6B: return "100B.A6B";
+ case LLM_TYPE_102B_A12B: return "102B.A12B";
+ case LLM_TYPE_106B_A12B: return "106B.A12B";
+ case LLM_TYPE_196B_A11B: return "196B.A11B";
+ case LLM_TYPE_230B_A10B: return "230B.A10B";
+ case LLM_TYPE_235B_A22B: return "235B.A22B";
+ case LLM_TYPE_300B_A47B: return "300B.A47B";
+ case LLM_TYPE_310B_A15B: return "310B.A15B";
+ case LLM_TYPE_355B_A32B: return "355B.A32B";
+ case LLM_TYPE_E2B: return "E2B";
+ case LLM_TYPE_E4B: return "E4B";
+ default: return "?B";
+ }
+}
+
+static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
+ switch (type) {
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
+ default: return "unknown";
+ }
+}
+
+static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
+ { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
+ { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+ { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
+ { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
+};
+
+std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
+ return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
+}
+
+static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
+ for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
+ if (kv.second == name) {
+ return (llama_rope_scaling_type) kv.first;
+ }
+ }
+
+ return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+}
+
+// checks if the weight tensor can be used with the specified buffer type and device
+static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
+ GGML_ASSERT(w != nullptr);
+
+ if (op == GGML_OP_NONE) {
+ return true;
+ }
+
+ ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+ ggml_context_ptr ctx_ptr { ggml_init(params) };
+ if (!ctx_ptr) {
+ throw std::runtime_error(format("failed to create ggml context"));
+ }
+ ggml_context * ctx = ctx_ptr.get();
+
+ ggml_tensor * op_tensor = nullptr;
+
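+    // build a minimal dummy op that consumes w the same way the real graph would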
+ switch (op) {
+ case GGML_OP_GET_ROWS:
+ {
+ ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+ op_tensor = ggml_get_rows(ctx, w, b);
+ } break;
+ case GGML_OP_MUL_MAT:
+ {
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
+ op_tensor = ggml_mul_mat(ctx, w, b);
+ } break;
+ case GGML_OP_MUL_MAT_ID:
+ {
+ int n_expert_used = hparams.n_expert_used;
+ ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+ ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+ op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
+ } break;
+ case GGML_OP_ADD:
+ {
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
+ op_tensor = ggml_add(ctx, a, w);
+ } break;
+ case GGML_OP_ADD_ID:
+ {
+ int n_expert_used = hparams.n_expert_used;
+ ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+ ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+ op_tensor = ggml_add_id(ctx, a, w, c);
+ } break;
+ case GGML_OP_MUL:
+ {
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
+ op_tensor = ggml_mul(ctx, a, w);
+ } break;
+ case GGML_OP_DIV:
+ {
+ ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
+ op_tensor = ggml_div(ctx, a, w);
+ } break;
+ case GGML_OP_ROPE:
+ {
+ int n_embd_head = hparams.n_embd_head_v;
+ int n_head = hparams.n_head();
+ ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
+ ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+ op_tensor = ggml_rope_ext(
+ ctx, a, b, w,
+ 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+ );
+
+ } break;
+ case GGML_OP_SSM_CONV:
+ {
+ const int64_t n_seq_tokens = 512;
+ const int64_t n_seqs = 3;
+ ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
+ op_tensor = ggml_ssm_conv(ctx, conv_x, w);
+ } break;
+ case GGML_OP_SSM_SCAN:
+ {
+ // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
+ const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
+ const int64_t n_head = w->ne[1];
+ const int64_t head_dim = hparams.ssm_d_inner / n_head;
+ const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
+ const int64_t n_seq_tokens = 512;
+ const int64_t n_seqs = 3;
+ ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
+ ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
+ ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
+ ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
+ ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
+ ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
+ op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
+ } break;
+ case GGML_OP_RWKV_WKV6:
+ {
+ // FIXME
+ const int64_t S = 123;
+ const int64_t H = 123;
+ const int64_t n_tokens = 123;
+ const int64_t n_seqs = 123;
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * tf = w;
+ ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
+ op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
+ } break;
+ case GGML_OP_IM2COL:
+ {
+ const int n_embd_inp = hparams.n_embd_inp();
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
+ op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
+ } break;
+ case GGML_OP_SCALE:
+ {
+ op_tensor = ggml_scale(ctx, w, 1.0f);
+ } break;
+ default:
+ GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
+ }
+
+ // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
+ GGML_ASSERT(w->buffer == nullptr);
+ w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+ ggml_backend_buffer_free(w->buffer);
+ w->buffer = nullptr;
+
+ return op_supported;
+}
+
+// lists of buffer types used for each layer
+using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
+
+// find the first buffer type in the list that can use the tensor
+static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
+ GGML_ASSERT(!buft_list.empty());
+ for (const auto & cur : buft_list) {
+ ggml_backend_dev_t cur_dev = cur.first;
+ ggml_backend_buffer_type_t cur_buft = cur.second;
+ if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
+ return cur_buft;
+ }
+ }
+
+ return nullptr;
+}
+
+// CPU: ACCEL -> GPU host -> CPU extra -> CPU
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
+ buft_list_t buft_list;
+
+ // add ACCEL buffer types
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+ if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+ auto * buft = ggml_backend_dev_buffer_type(dev);
+            // skip the CPU buffer type here; it is appended at the end of the list
+ if (buft != ggml_backend_cpu_buffer_type()) {
+ buft_list.emplace_back(dev, buft);
+ }
+ }
+ }
+
+ // add a host buffer type
+ // storing the tensors in a host buffer is useful when the processing of large batches
+ // is offloaded to a GPU device, since it reduces the time spent on data transfers
+ // generally, this will be done using the first device in the list
+ // a better approach would be to handle this on a weight-by-weight basis using the offload_op
+ // function of the device to determine if it would benefit from being stored in a host buffer
+ if (!no_host) {
+ for (auto * dev : devices) {
+ ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
+ if (buft) {
+ buft_list.emplace_back(dev, buft);
+ break;
+ }
+ }
+ }
+
+ // add extra buffer types
+ if (use_extra_bufts) {
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (cpu_dev == nullptr) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
+
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+ ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+ if (ggml_backend_dev_get_extra_bufts_fn) {
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+ while (extra_bufts && *extra_bufts) {
+ buft_list.emplace_back(cpu_dev, *extra_bufts);
+ ++extra_bufts;
+ }
+ }
+ }
+
+ // add the CPU buffer type
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+ if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+ buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
+ }
+ }
+
+ return buft_list;
+}
+
+// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
+static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
+ buft_list_t buft_list;
+
+ // add the device split buffer type if requested and available
+ if (split_mode == LLAMA_SPLIT_MODE_ROW) {
+ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+ auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
+ ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
+ if (ggml_backend_split_buffer_type_fn) {
+ size_t dev_index = [&]() {
+ auto * reg = ggml_backend_dev_backend_reg(dev);
+ for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
+ if (ggml_backend_reg_dev_get(reg, i) == dev) {
+ return i;
+ }
+ }
+ throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
+ }();
+ auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
+ if (buft != nullptr) {
+ buft_list.emplace_back(dev, buft);
+ }
+ }
+ }
+
+ // add the device default buffer type
+ buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
+
+ // add the device extra buffer type (if any)
+ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+ ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
+
+ if (ggml_backend_dev_get_extra_bufts_fn) {
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
+ while (extra_bufts && *extra_bufts) {
+ buft_list.emplace_back(dev, *extra_bufts);
+ ++extra_bufts;
+ }
+ }
+
+ return buft_list;
+}
+
+struct llama_model::impl {
+ impl() = default;
+ ~impl() = default;
+
+ uint64_t n_elements = 0;
+
+ size_t n_bytes = 0;
+
+ std::string desc_str;
+
+ // model memory mapped files
+ llama_mmaps mappings;
+
+ // objects representing data potentially being locked in memory
+ llama_mlocks mlock_bufs;
+ llama_mlocks mlock_mmaps;
+
+ // contexts where the model tensors metadata is stored as well as the corresponding buffers:
+    // contexts where the model tensor metadata is stored, along with the corresponding buffers:
+
+ buft_list_t cpu_buft_list;
+ std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
+
+ struct layer_dev {
+ ggml_backend_dev_t dev;
+ buft_list_t * buft_list;
+ };
+
+ layer_dev dev_input = {};
+ layer_dev dev_output = {};
+ std::vector<layer_dev> dev_layer;
+
+ bool has_tensor_overrides;
+};
+
+llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
+ pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
+}
+
+llama_model::~llama_model() {
+ for (auto * lora : loras) {
+ delete lora;
+ }
+}
+
+void llama_model::load_stats(llama_model_loader & ml) {
+ pimpl->n_elements = ml.n_elements;
+ pimpl->n_bytes = ml.n_bytes;
+}
+
+void llama_model::load_arch(llama_model_loader & ml) {
+ arch = ml.get_arch();
+ if (arch == LLM_ARCH_UNKNOWN) {
+ throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
+ }
+}
+
+void llama_model::load_hparams(llama_model_loader & ml) {
+ const gguf_context * ctx = ml.meta.get();
+
+ // get metadata as string
+ for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+ gguf_type type = gguf_get_kv_type(ctx, i);
+ if (type == GGUF_TYPE_ARRAY) {
+ continue;
+ }
+ const char * name = gguf_get_key(ctx, i);
+ const std::string value = gguf_kv_to_str(ctx, i);
+ gguf_kv.emplace(name, value);
+ }
+
+ // get general kv
+ ml.get_key(LLM_KV_GENERAL_NAME, name, false);
+
+ // everything past this point is not vocab-related
+ // for CLIP models, we only need to load tensors, no hparams
+ if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
+ return;
+ }
+
+ ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false);
+ ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+ ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false);
+ ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false);
+
+ if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
+ ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd);
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl);
+
+ ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
+ ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
+
+ ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
+ ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
+ }
+
+ GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
+ GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
+ if (hparams.n_expert > 0) {
+ GGML_ASSERT(hparams.n_expert_used > 0);
+ GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert);
+ if (hparams.n_expert_groups > 1) {
+ GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0);
+ GGML_ASSERT(hparams.n_group_used > 0);
+ GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups);
+ }
+ } else {
+ GGML_ASSERT(hparams.n_expert_used == 0);
+ GGML_ASSERT(hparams.n_expert_groups == 0);
+ }
+
+ std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
+ std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
+ std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(
+ hparams.recurrent_layer_arr.begin(),
+ hparams.recurrent_layer_arr.end(),
+ llm_arch_is_recurrent(ml.get_arch()));
+
+ std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
+ std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
+
+ std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
+ std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
+ std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
+ std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
+ std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f);
+ std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f);
+
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+
+ // n_head_kv is optional, default to n_head
+ hparams.n_head_kv_arr = hparams.n_head_arr;
+
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
+
+ bool rope_finetuned = false;
+ ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
+ hparams.rope_finetuned = rope_finetuned;
+
+ hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
+ ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
+
+ // rope_freq_base (optional)
+ hparams.rope_freq_base_train = 10000.0f;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
+
+ std::string rope_scaling("linear");
+ ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
+ hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
+ GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
+
+ // TODO: Handle SWA metadata similarly when models start implementing it
+ // rope_freq_scale (inverse of the kv) is optional
+ float ropescale = 0.0f;
+ if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
+ // try the old key name
+ ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
+ }
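+    // a stored factor of 0.0f means "unset"; otherwise the train-time freq scale is its reciprocal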
+ hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
+
+ ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
+ // non-transformer models do not have attention heads
+ if (hparams.n_head() > 0) {
+ // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+ // gpt-j n_rot = rotary_dim
+
+ hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+
+ hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+
+ // sanity check for n_rot (optional)
+ hparams.n_rot = hparams.n_embd_head_k;
+
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
+
+ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
+ if (hparams.n_rot != hparams.n_embd_head_k) {
+ throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
+ }
+ }
+ } else {
+ hparams.n_rot = 0;
+ hparams.n_embd_head_k = 0;
+ hparams.n_embd_head_v = 0;
+ }
+
+ // for differentiating model types
+ uint32_t n_vocab = 0;
+ ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
+
+ // for classifier models
+ ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
+ if (!classifier_labels.empty()) {
+ hparams.n_cls_out = classifier_labels.size();
+ }
+
+ // arch-specific KVs
+ switch (arch) {
+ case LLM_ARCH_LLAMA:
+ case LLM_ARCH_LLAMA_EMBED:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ if (hparams.n_expert == 8) {
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_8x7B; break;
+ case 56: type = LLM_TYPE_8x22B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } else {
+ switch (hparams.n_layer) {
+ case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
+ case 22: type = LLM_TYPE_1B; break;
+ case 26: type = LLM_TYPE_3B; break;
+ case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
+ case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
+ // granite uses a vocab with len 49152
+ case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
+ case 36: type = LLM_TYPE_8B; break; // granite
+ case 40: type = LLM_TYPE_13B; break;
+ case 48: type = LLM_TYPE_34B; break;
+ case 60: type = LLM_TYPE_30B; break;
+ case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ }
+ } break;
+ case LLM_ARCH_LLAMA4:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
+
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa == 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+ hparams.n_swa = 8192;
+ hparams.n_attn_temp_floor_scale = 8192;
+ hparams.f_attn_temp_scale = 0.1f;
+ hparams.f_attn_temp_offset = 1.0f;
+ hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ }
+
+ switch (hparams.n_expert) {
+ case 0: {
+ // MobileLLM (no MoE)
+ switch (hparams.n_embd) {
+ case 2048: type = LLM_TYPE_140M; break;
+ case 4096: type = LLM_TYPE_360M; break;
+ case 6144: type = LLM_TYPE_950M; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case 16: type = LLM_TYPE_17B_16E; break;
+ case 128: type = LLM_TYPE_17B_128E; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+
+ hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
+ } break;
+ case LLM_ARCH_ARCEE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ // Arcee uses the same structure as Llama
+ switch (hparams.n_layer) {
+ case 36: type = LLM_TYPE_4B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_AFMOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+ // Set up interleaved sliding window attention (ISWA)
+ // Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
+ if (hparams.n_swa > 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
+ // Default to sigmoid if not set
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ }
+
+ switch (hparams.n_layer) {
+ case 56: type = LLM_TYPE_6B; break;
+ case 32: type = LLM_TYPE_26B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_DECI:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_7B; break;
+ case 80: type = LLM_TYPE_70B; break;
+ case 162: type = LLM_TYPE_405B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_MINICPM:
+ {
+ // Backward-compatible defaults for older MiniCPM GGUFs
+ hparams.f_embedding_scale = 12.0f;
+ hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer));
+ hparams.f_logit_scale = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ // Optional KV reads, override defaults if present in newer GGUF exports
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false);
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false);
+
+ // MiniCPM uses rope by default, unlike Granite which uses it as a switch
+ hparams.rope_finetuned = true;
+
+ switch (hparams.n_layer) {
+ case 52: type = LLM_TYPE_1B; break;
+ case 40: type = LLM_TYPE_2B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_MINICPM3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+
+ switch (hparams.n_layer) {
+ case 62: type = LLM_TYPE_4B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GROK:
+ {
+ // defaults for old GGUFs
+ hparams.yarn_beta_fast = 8.0f;
+ hparams.f_logit_scale = 0.5773502691896257f;
+ hparams.f_embedding_scale = 78.38367176906169f;
+ hparams.f_attn_out_scale = 0.08838834764831845f;
+ hparams.f_attn_logit_softcapping = 30.0f;
+ hparams.f_router_logit_softcapping = 30.0f;
+ // no final_logit_softcapping in grok-1
+ hparams.f_final_logit_softcapping = 0.0f;
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
+ ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false);
+ ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
+ ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false);
+ ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+
+ ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+
+ switch (hparams.n_layer) {
+ case 64: type = LLM_TYPE_314B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_FALCON:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_7B; break;
+ case 60: type = LLM_TYPE_40B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_BAICHUAN:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_7B; break;
+ case 40: type = LLM_TYPE_13B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+
+ if (type == LLM_TYPE_13B) {
+ // TODO: become GGUF KV parameter
+ hparams.f_max_alibi_bias = 8.0f;
+ }
+ } break;
+ case LLM_ARCH_STARCODER:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_1B; break;
+ case 36: type = LLM_TYPE_3B; break;
+ case 42: type = LLM_TYPE_7B; break;
+ case 40: type = LLM_TYPE_15B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_REFACT:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_1B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+
+ // TODO: become GGUF KV parameter
+ hparams.f_max_alibi_bias = 8.0f;
+ } break;
+ case LLM_ARCH_BERT:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+ switch (hparams.n_layer) {
+ case 3:
+ type = LLM_TYPE_17M; break; // bge-micro
+ case 6:
+ type = LLM_TYPE_22M; break; // MiniLM-L6
+ case 12:
+ switch (hparams.n_embd) {
+ case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
+ case 768: type = LLM_TYPE_109M; break; // bge-base
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 24:
+ type = LLM_TYPE_335M; break; // bge-large
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ uint32_t swa_period = 3;
+ hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
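+                        // symmetric window: attention is limited in both directions (non-causal encoder)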
+
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
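+                        // with swa_period == 3, every 3rd layer uses full attention and the rest use the sliding window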
+ hparams.set_swa_pattern(swa_period);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+ switch (hparams.n_layer) {
+ case 12:
+ type = LLM_TYPE_47M; break; // granite-embedding-small
+ case 22:
+ type = LLM_TYPE_149M; break; // modern-bert-base
+ case 28:
+ type = LLM_TYPE_395M; break; // modern-bert-large
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_JINA_BERT_V2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+ hparams.f_max_alibi_bias = 8.0f;
+
+ switch (hparams.n_layer) {
+ case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
+ case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_JINA_BERT_V3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+ switch (hparams.n_layer) {
+ case 24:
+ type = LLM_TYPE_558M; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                    ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, false);
+
+ if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+ if (arch == LLM_ARCH_NOMIC_BERT) {
+ type = LLM_TYPE_137M;
+ } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
+ type = LLM_TYPE_475M;
+ }
+ }
+ } break;
+ case LLM_ARCH_NEO_BERT:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+ if (hparams.n_layer == 28) {
+ type = LLM_TYPE_250M;
+ }
+ } break;
+ case LLM_ARCH_BLOOM:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_1B; break;
+ case 30:
+ switch (hparams.n_embd) {
+ case 2560: type = LLM_TYPE_3B; break;
+ case 4096: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+
+                    // TODO: this should become a GGUF KV parameter
+ hparams.f_max_alibi_bias = 8.0f;
+ } break;
+ case LLM_ARCH_MPT:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
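+                    // clip_qkv: when set, the Q/K/V activations are clamped to [-f_clamp_kqv, +f_clamp_kqv]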
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_7B; break;
+ case 48: type = LLM_TYPE_30B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_STABLELM:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_1B; break;
+ case 32: type = LLM_TYPE_3B; break;
+ case 40: type = LLM_TYPE_12B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_QWEN:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_7B; break;
+ case 40: type = LLM_TYPE_13B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_QWEN2VL:
+ {
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+ }
+ // fall through
+ case LLM_ARCH_QWEN2:
+ {
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
+ case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
+ case 32: type = LLM_TYPE_7B; break;
+ case 36: type = LLM_TYPE_3B; break;
+ case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
+ case 48: type = LLM_TYPE_14B; break;
+ case 64: type = LLM_TYPE_32B; break;
+ case 80: type = LLM_TYPE_70B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_DREAM:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ // Dream models are primarily 7B with 28 layers
+ switch (hparams.n_layer) {
+ case 28:
+ type = LLM_TYPE_7B;
+ break;
+ default:
+ type = LLM_TYPE_UNKNOWN;
+ }
+ // Set non-causal attention for diffusion models
+ hparams.causal_attn = false;
+ }
+ break;
+ case LLM_ARCH_LLADA:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
+ switch (hparams.n_layer) {
+ case 32:
+ type = LLM_TYPE_8B;
+ break;
+ default:
+ type = LLM_TYPE_UNKNOWN;
+ }
+ // Set non-causal attention for diffusion models
+ hparams.causal_attn = false;
+ }
+ break;
+ case LLM_ARCH_LLADA_MOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ // diffusion language model uses non-causal attention
+ hparams.causal_attn = false;
+ switch (hparams.n_layer) {
+ case 16: type = LLM_TYPE_A1_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_RND1:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_30B_A3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ // Set non-causal attention for diffusion models
+ hparams.causal_attn = false;
+ } break;
+ case LLM_ARCH_QWEN2MOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_A2_7B; break;
+ case 28: type = LLM_TYPE_57B_A14B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_QWEN3:
+ {
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+ case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+ case 40: type = LLM_TYPE_14B; break;
+ case 64: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_MAINCODER:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_1B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_QWEN3VL:
+ {
+ ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 28: type = LLM_TYPE_1_7B; break;
+ case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+ case 64: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_QWEN3MOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_30B_A3B; break;
+ case 94: type = LLM_TYPE_235B_A22B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_QWEN3VLMOE:
+ {
+ ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_30B_A3B; break;
+ case 94: type = LLM_TYPE_235B_A22B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_PHI2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_1B; break;
+ case 32: type = LLM_TYPE_3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_PHI3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_1B; break;
+ case 32: type = LLM_TYPE_3B; break;
+ case 40: type = LLM_TYPE_14B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+ if (found_swa && hparams.n_swa > 0) {
+ LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
+
+ // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+ hparams.n_swa = 0;
+ hparams.set_swa_pattern(1);
+ }
+ } break;
+ case LLM_ARCH_PHIMOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_16x3_8B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_PLAMO:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 40: type = LLM_TYPE_13B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_PLAMO2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ // Load Mamba SSM parameters
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
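+                    // layers without KV heads are the recurrent (Mamba) layers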
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+ hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
+ }
+
+ switch (hparams.n_layer) {
+ case 16: type = LLM_TYPE_1B; break;
+ case 32:
+ if (hparams.n_embd == 2048) {
+ type = LLM_TYPE_2B;
+ } else if (hparams.n_embd == 4096) {
+ type = LLM_TYPE_8B;
+ }
+ break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+
+ // Load attention parameters
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+ } break;
+ case LLM_ARCH_PLAMO3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ uint32_t swa_period = 8;
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_2B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GPT2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 12: type = LLM_TYPE_SMALL; break;
+ case 24: type = LLM_TYPE_MEDIUM; break;
+ case 36: type = LLM_TYPE_LARGE; break;
+ case 48: type = LLM_TYPE_XL; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_CODESHELL:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 42: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_ORION:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+ switch (hparams.n_layer) {
+ case 40: type = LLM_TYPE_14B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_INTERNLM2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_7B; break;
+ case 48: type = LLM_TYPE_20B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GEMMA:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 18: type = LLM_TYPE_2B; break;
+ case 28: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GEMMA2:
+ {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.n_swa = 4096; // default value of gemma 2
+ hparams.set_swa_pattern(2);
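+                    // alternate sliding-window and full-attention layers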
+ hparams.attn_soft_cap = true;
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
+ ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+
+ switch (hparams.n_layer) {
+ case 26: type = LLM_TYPE_2B; break;
+ case 42: type = LLM_TYPE_9B; break;
+ case 46: type = LLM_TYPE_27B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
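+                    // e.g. 27B: n_embd = 4608, n_head = 32 -> 1/sqrt(144); 2B and 9B use 1/sqrt(n_embd_head_k) = 1/sqrt(256)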
+ hparams.f_attention_scale = type == LLM_TYPE_27B
+ ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ } break;
+ case LLM_ARCH_GEMMA3:
+ {
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(6);
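+                        // 5 sliding-window layers for every full-attention layer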
+
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
+ hparams.f_final_logit_softcapping = 0.0f;
+ ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 18: type = LLM_TYPE_270M; break;
+ case 26: type = LLM_TYPE_1B; break;
+ case 32: type = LLM_TYPE_8B; break; // Rnj-1
+ case 34: type = LLM_TYPE_4B; break;
+ case 48: type = LLM_TYPE_12B; break;
+ case 62: type = LLM_TYPE_27B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
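+                    // e.g. 27B: n_embd = 5376, n_head = 32 -> 1/sqrt(168)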
+ hparams.f_attention_scale = type == LLM_TYPE_27B
+ ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ } break;
+ case LLM_ARCH_GEMMA3N:
+ {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(5);
+
+ hparams.n_layer_kv_from_start = 20;
+ hparams.f_attention_scale = 1.0f;
+
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 30: type = LLM_TYPE_E2B; break;
+ case 35: type = LLM_TYPE_E4B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GEMMA_EMBEDDING:
+ {
+ hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
+ hparams.set_swa_pattern(6);
+
+ hparams.causal_attn = false; // embeddings do not use causal attention
+
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+                    // applied only if the model was converted with --sentence-transformers-dense-modules
+ ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
+ ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
+ ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
+ ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);
+
+ GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
+ GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_0_3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ } break;
+ case LLM_ARCH_STARCODER2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 30: type = LLM_TYPE_3B; break;
+ case 32: type = LLM_TYPE_7B; break;
+ case 40: type = LLM_TYPE_15B; break;
+ case 52: type = LLM_TYPE_20B; break; // granite
+ case 88: type = LLM_TYPE_34B; break; // granite
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_MAMBA:
+ {
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 24:
+ switch (hparams.n_embd) {
+ case 768: type = LLM_TYPE_SMALL; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 48:
+ switch (hparams.n_embd) {
+ case 1024: type = LLM_TYPE_MEDIUM; break;
+ case 1536: type = LLM_TYPE_LARGE; break;
+ case 2048: type = LLM_TYPE_XL; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 64:
+ switch (hparams.n_embd) {
+ case 2560: type = LLM_TYPE_3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_MAMBA2:
+ {
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 24:
+ switch (hparams.n_embd) {
+ case 768: type = LLM_TYPE_SMALL; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 48:
+ switch (hparams.n_embd) {
+ case 1024: type = LLM_TYPE_MEDIUM; break;
+ case 1536: type = LLM_TYPE_LARGE; break;
+ case 2048: type = LLM_TYPE_XL; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 64:
+ switch (hparams.n_embd) {
+ case 2560: type = LLM_TYPE_3B; break;
+ case 4096: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_JAMBA:
+ {
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+ hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
+ }
+
+ switch (hparams.n_layer) {
+                        // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
+ case 12: // 900M 8x???M
+ case 32: // 51B 16x?B
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_XVERSE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_7B; break;
+ case 40: type = LLM_TYPE_13B; break;
+ case 80: type = LLM_TYPE_65B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_COMMAND_R:
+ {
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 40: type = LLM_TYPE_35B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_COHERE2:
+ {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(4);
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_8B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_DBRX:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
+
+ switch (hparams.n_layer) {
+ case 40: type = LLM_TYPE_16x12B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_OLMO:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
+
+ switch (hparams.n_layer) {
+ case 22: type = LLM_TYPE_1B; break;
+ case 32: type = LLM_TYPE_7B; break;
+ case 80: type = LLM_TYPE_70B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_OLMO2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
+ switch (hparams.n_layer) {
+ case 16: type = LLM_TYPE_1B; break;
+ case 32: type = LLM_TYPE_7B; break;
+ case 40: type = LLM_TYPE_13B; break;
+ case 64: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_SEED_OSS:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 64: type = LLM_TYPE_36B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_OLMOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 16: type = LLM_TYPE_A1_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_OPENELM:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 16: type = LLM_TYPE_270M; break;
+ case 20: type = LLM_TYPE_450M; break;
+ case 28: type = LLM_TYPE_1B; break;
+ case 36: type = LLM_TYPE_3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GPTNEOX:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+ switch (hparams.n_layer) {
+ case 6:
+ switch (hparams.n_ff()) {
+ case 512: type = LLM_TYPE_14M; break;
+ case 2048: type = LLM_TYPE_70M; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 12:
+ switch (hparams.n_ff()) {
+ case 3072: type = LLM_TYPE_160M; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 16:
+ switch (hparams.n_ff()) {
+ case 8192: type = LLM_TYPE_1B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 24:
+ switch (hparams.n_ff()) {
+ case 4096: type = LLM_TYPE_410M; break;
+ case 8192: type = LLM_TYPE_1_4B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 32:
+ switch (hparams.n_ff()) {
+ case 10240: type = LLM_TYPE_2_8B; break;
+ case 16384: type = LLM_TYPE_6_9B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 36:
+ switch (hparams.n_ff()) {
+ case 20480: type = LLM_TYPE_12B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 44:
+ switch (hparams.n_ff()) {
+ case 24576: type = LLM_TYPE_20B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_ARCTIC:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ if (hparams.n_expert == 128) {
+ switch (hparams.n_layer) {
+ case 35: type = LLM_TYPE_10B_128x3_66B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } else {
+ type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_DEEPSEEK:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+
+ switch (hparams.n_ff_exp) {
+ case 1408: type = LLM_TYPE_16B; break;
+ case 1792: type = LLM_TYPE_20B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_DEEPSEEK2:
+ {
+ // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
+ const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ if (!is_lite) {
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+ }
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+ // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
+ // that have no expert_gating_func model parameter set
+ if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) {
+ // GLM 4.7 Lite
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ } else {
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+ }
+ }
+
+ if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
+ // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+ // cancel the factor from the convert script
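+                        // (the convert script stores 0.1 * mscale_all_dim, so this recovers mscale_all_dim)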
+ hparams.rope_yarn_log_mul /= 0.1f;
+ }
+
+ // (optional) temperature tuning - used by mistral-large
+ ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+ ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
+
+ hparams.f_attn_temp_offset = 0.0f;
+
+ switch (hparams.n_layer) {
+ case 27: type = LLM_TYPE_16B; break;
+ case 47: type = LLM_TYPE_30B_A3B; break;
+ case 60: type = LLM_TYPE_236B; break;
+ case 61: type = LLM_TYPE_671B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_PLM:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_1_8B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_CHATGLM:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 28: {
+ if (hparams.n_head(0) == 16) {
+ type = LLM_TYPE_1_5B;
+ } else {
+ type = LLM_TYPE_6B;
+ }
+ } break;
+ case 40: {
+ if (hparams.n_head(0) == 24) {
+ type = LLM_TYPE_4B;
+ } else {
+ type = LLM_TYPE_9B;
+ }
+ } break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GLM4:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+ switch (hparams.n_layer) {
+ case 40: type = LLM_TYPE_9B; break;
+ case 61: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GLM4_MOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+ // MoE parameters
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+ // Expert gating function (GLM-4.5 uses sigmoid)
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ }
+
+ // NextN/MTP parameters
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+                    // TODO: revisit once MTP is implemented
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
+ switch (hparams.n_layer) {
+ case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
+ case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
+ case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_BITNET:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 26: type = LLM_TYPE_3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_T5:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+
+ uint32_t dec_start_token_id;
+ if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
+ hparams.dec_start_token_id = dec_start_token_id;
+ }
+
+ hparams.dec_n_layer = hparams.n_layer;
+ ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
+
+ switch (hparams.n_layer) {
+ case 6: type = LLM_TYPE_60M; break; // t5-small
+ case 8: type = LLM_TYPE_80M; break; // flan-t5-small
+ case 12:
+ switch (hparams.n_ff()) {
+ case 3072: type = LLM_TYPE_220M; break; // t5-base
+ case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 24:
+ switch (hparams.n_ff()) {
+ case 4096: type = LLM_TYPE_770M; break; // t5-large
+ case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
+ case 16384: type = LLM_TYPE_3B; break; // t5-3b
+ case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
+ case 65536: type = LLM_TYPE_11B; break; // t5-11b
+ case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+ type = LLM_TYPE_UNKNOWN;
+ } break;
+ case LLM_ARCH_JAIS:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_1_3B; break;
+ case 40: type = LLM_TYPE_13B; break;
+ /* TODO: add variants */
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_NEMOTRON:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_4B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_NEMOTRON_H:
+ case LLM_ARCH_NEMOTRON_H_MOE:
+ {
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+ // A layer is recurrent IFF the n_head_kv value is set to 0 and
+ // the n_ff value is set to 0
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+ hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
+ }
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+
+ switch (hparams.n_layer) {
+ case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
+ case 56: type = LLM_TYPE_9B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_EXAONE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_8B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_EXAONE4:
+ {
+ if (hparams.n_layer == 64) { // 32B
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.n_swa = 4096;
+ hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ }
+
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 30: type = LLM_TYPE_1_2B; break;
+ case 64: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_EXAONE_MOE:
+ {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.n_swa = 128;
+ hparams.set_swa_pattern(4);
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_30B_A3B; break;
+ case 48:
+ case 49: type = LLM_TYPE_235B_A22B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_RWKV6:
+ case LLM_ARCH_RWKV6QWEN2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
+ ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+ ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+ ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
+ ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
+ ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_1_6B; break;
+ case 32:
+ switch (hparams.n_embd) {
+ case 2560: type = LLM_TYPE_3B; break;
+ case 4096: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 61: type = LLM_TYPE_14B; break;
+ case 64: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_RWKV7:
+ case LLM_ARCH_ARWKV7:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
+ ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+ ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
+ ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
+ ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false);
+ ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
+
+ switch (hparams.n_layer) {
+ case 12:
+ switch (hparams.n_embd) {
+ case 768: type = LLM_TYPE_190M; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 24:
+ switch (hparams.n_embd) {
+ case 1024: type = LLM_TYPE_450M; break;
+ case 2048: type = LLM_TYPE_1_5B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 28:
+ switch (hparams.n_embd) {
+ case 1536: type = LLM_TYPE_1_5B; break;
+ case 3584: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 32:
+ switch (hparams.n_embd) {
+ case 2560: type = LLM_TYPE_2_9B; break;
+ case 4096: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 61:
+ switch (hparams.n_embd) {
+ case 4096: type = LLM_TYPE_14B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+ ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+
+ // Granite uses rope_finetuned as a switch for rope, so default to true
+ bool rope_finetuned = true;
+ ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
+ hparams.rope_finetuned = rope_finetuned;
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_3B; break;
+ case 40: type = LLM_TYPE_3B; break;
+ // Add additional layer/vocab/etc checks here for other model sizes
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+
+ // For Granite MoE Shared
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
+ } break;
+ case LLM_ARCH_GRANITE_HYBRID:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false);
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false);
+ ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false);
+
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+ // Granite uses rope_finetuned as a switch for rope, so default to true
+ bool rope_finetuned = true;
+ ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
+ hparams.rope_finetuned = rope_finetuned;
+
+ // A layer is recurrent IFF the n_head_kv value is set to 0
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+ hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
+ }
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_embd) {
+ case 768: type = LLM_TYPE_350M; break;
+                        case 1536: type = (hparams.n_expert > 0 ? LLM_TYPE_7B_A1B : LLM_TYPE_1B); break; // MoE (7B-A1B) and dense 1B share n_embd == 1536
+ case 2048: case 2560: type = LLM_TYPE_3B; break;
+ case 4096: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+
+ // For Granite MoE Shared
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
+ } break;
+ case LLM_ARCH_CHAMELEON:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
+ ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_7B; break;
+ case 48: type = LLM_TYPE_34B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_WAVTOKENIZER_DEC:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
+ ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ } break;
+ case LLM_ARCH_BAILINGMOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+ switch (hparams.n_layer) {
+ case 28: type = LLM_TYPE_16B; break;
+ case 88: type = LLM_TYPE_290B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_BAILINGMOE2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+                    // TODO: revisit once MTP is implemented
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
+ switch (hparams.n_layer) {
+                        case 20:
+                        case 21: type = LLM_TYPE_16B_A1B; break;
+                        case 32:
+                        case 33: type = LLM_TYPE_100B_A6B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_DOTS1:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+ switch (hparams.n_layer) {
+ case 62: type = LLM_TYPE_142B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_ERNIE4_5:
+ case LLM_ARCH_ERNIE4_5_MOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ if (arch == LLM_ARCH_ERNIE4_5_MOE) {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+ ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ }
+
+ switch (hparams.n_layer) {
+ case 18: type = LLM_TYPE_0_3B; break;
+ case 28: type = LLM_TYPE_21B_A3B; break;
+ case 54: type = LLM_TYPE_300B_A47B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_FALCON_H1:
+ {
+ // Common parameters
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ // SSM parameters
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+ std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
+
+ switch (hparams.n_layer) {
+ case 36:
+ type = LLM_TYPE_0_5B; break;
+ case 24:
+ type = LLM_TYPE_1_5B; break;
+ case 66:
+ type = LLM_TYPE_1B; break;
+ case 32:
+ type = LLM_TYPE_3B; break;
+ case 44:
+ type = LLM_TYPE_7B; break;
+ case 72:
+ type = LLM_TYPE_34B; break;
+ default:
+ type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_HUNYUAN_MOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_A13B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_HUNYUAN_DENSE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_embd) {
+ case 1024: type = LLM_TYPE_0_5B; break;
+ case 2048: type = LLM_TYPE_1_8B; break;
+ case 3072: type = LLM_TYPE_4B; break;
+ case 4096: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_SMOLLM3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ hparams.n_no_rope_layer_step = 4;
+
+ switch (hparams.n_layer) {
+ case 36: type = LLM_TYPE_3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_OPENAI_MOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(2);
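+                    // alternating sliding-window / full-attention layers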
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_20B; break;
+ case 36: type = LLM_TYPE_120B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_LFM2:
+ {
+ ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+ hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
+ }
+                    hparams.n_layer_dense_lead = hparams.n_layer; // all layers are dense (LFM2 has no MoE experts)
+ switch (hparams.n_ff()) {
+ case 4608: type = LLM_TYPE_350M; break;
+ case 6912: type = LLM_TYPE_700M; break;
+ case 8192: type = LLM_TYPE_1_2B; break;
+ case 10752: type = LLM_TYPE_2_6B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_LFM2MOE:
+ {
+ ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+
+ for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+ hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
+ }
+
+ type = LLM_TYPE_8B_A1B;
+ } break;
+ case LLM_ARCH_SMALLTHINKER:
+ {
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+ if (found_swa && hparams.n_swa > 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.n_swa = 4096;
+ hparams.set_swa_pattern(4, true);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ hparams.n_no_rope_layer_step = hparams.n_layer;
+ }
+
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_4B; break;
+ case 52: type = LLM_TYPE_20B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GROVEMOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
+ ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
+ ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_30B_A3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_APERTUS:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_8B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_MINIMAX_M2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+
+ switch (hparams.n_layer) {
+ case 62: type = LLM_TYPE_230B_A10B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_COGVLM:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_13B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_PANGU_EMBED:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1
+ case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_QWEN3NEXT:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ // Load linear attention (gated delta net) parameters
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+ // Mark recurrent layers (linear attention layers)
+ {
+ uint32_t full_attn_interval = 4;
+ ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
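+                        // e.g. full_attn_interval == 4 -> layers 3, 7, 11, ... (0-based) use full attention, the rest are recurrent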
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+ hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
+ }
+ }
+
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_80B_A3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_QWEN35:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+
+ // Load linear attention (gated delta net) parameters
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+ // Mark recurrent layers (linear attention layers)
+ {
+ uint32_t full_attn_interval = 4;
+ ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+ hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
+ }
+ }
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_2B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_QWEN35MOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+
+ // Load linear attention (gated delta net) parameters
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+ // Mark recurrent layers (linear attention layers)
+ {
+ uint32_t full_attn_interval = 4;
+ ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+ hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
+ }
+ }
+
+ switch (hparams.n_layer) {
+ case 28: type = LLM_TYPE_35B_A3B; break;
+ case 48: type = LLM_TYPE_80B_A3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_MISTRAL3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
+
+ hparams.f_attn_temp_offset = 0.0f;
+
+ // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
+ if (hparams.f_attn_temp_scale != 0.0f) {
+ hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
+ if (hparams.n_attn_temp_floor_scale == 0) {
+ throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
+ }
+ }
+
+ switch (hparams.n_layer) {
+ case 26: type = LLM_TYPE_3B; break;
+ case 34: type = LLM_TYPE_8B; break;
+ case 40: type = LLM_TYPE_14B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_MIMO2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
+
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_310B_A15B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_KIMI_LINEAR:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl);
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda);
+
+ // MLA qk_rope_head_dim (for reference)
+ // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
+
+                // Mark KDA layers as recurrent using the n_head_kv pattern (like Jamba):
+                // the GGUF stores n_head_kv = 0 for KDA layers (recurrent) and n_head_kv = n_head for MLA layers (attention)
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+ hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent
+ }
+
+ // MoE parameters - Kimi uses moe_intermediate_size = 1024
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+
+ switch (hparams.n_layer) {
+ case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_STEP35:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
+ // MoE + SWA parameters
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+ // Step35 uses sigmoid gating by default (if not set in GGUF)
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ }
+
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
+
+ switch (hparams.n_layer) {
+ case 45: type = LLM_TYPE_196B_A11B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ default: throw std::runtime_error("unsupported model architecture");
+ }
+
+ pimpl->n_bytes = ml.n_bytes;
+
+ pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
+
+ if (hparams.f_max_alibi_bias > 0.0f) {
+ hparams.use_alibi = true;
+ }
+
+ hparams.rope_type = llama_model_rope_type(this);
+}
+
+void llama_model::load_vocab(llama_model_loader & ml) {
+ const auto kv = LLM_KV(arch);
+
+ vocab.load(ml, kv);
+}
+
+bool llama_model::load_tensors(llama_model_loader & ml) {
+ const auto & split_mode = params.split_mode;
+ const auto & use_mlock = params.use_mlock;
+ const auto & tensor_split = params.tensor_split;
+
+ const int n_layer = hparams.n_layer;
+ const int n_gpu_layers = this->n_gpu_layers();
+
+ const bool use_mmap_buffer = true;
+
+ LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
+ __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
+
+ // build a list of buffer types for the CPU and GPU devices
+ pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
+ for (auto * dev : devices) {
+ buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
+ // add CPU buffer types as a fallback
+ buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
+ pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
+ }
+
+ ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (cpu_dev == nullptr) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
+
+ // calculate the split points
+ bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
+ std::vector<float> splits(n_devices());
+ if (all_zero) {
+ // default split, by free memory
+ for (size_t i = 0; i < n_devices(); ++i) {
+ ggml_backend_dev_t dev = devices[i];
+ size_t total;
+ size_t free;
+ ggml_backend_dev_memory(dev, &free, &total);
+
+ // devices can return 0 bytes for free and total memory if they do not
+ // have any to report. in this case, we will use the host memory as a fallback
+ // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+ if (free == 0 && total == 0) {
+ ggml_backend_dev_memory(cpu_dev, &free, &total);
+ }
+ splits[i] = free;
+ }
+ } else {
+ std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
+ }
+
+ // sum and normalize the splits to get the split points
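+    // e.g. per-device free memory of {8, 8, 16} GiB becomes cumulative split points {0.25, 0.50, 1.00}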
+ float split_sum = 0.0f;
+ for (size_t i = 0; i < n_devices(); ++i) {
+ split_sum += splits[i];
+ splits[i] = split_sum;
+ }
+ for (size_t i = 0; i < n_devices(); ++i) {
+ splits[i] /= split_sum;
+ }
+
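+    // the output layer counts as one extra layer on top of n_layer, hence the n_layer + 1 terms below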
+ const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
+ const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
+ auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
+ const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
+ if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
+ LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
+ return {cpu_dev, &pimpl->cpu_buft_list};
+ }
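+        // map the layer's relative position among the GPU layers to a device via the cumulative split points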
+ const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
+ auto * dev = devices.at(layer_gpu);
+ LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
+ return {dev, &pimpl->gpu_buft_list.at(dev)};
+ };
+
+ // assign the input layer
+ // there is very little benefit to offloading the input layer, so always keep it on the CPU
+ pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
+
+ // assign the repeating layers to the devices according to the splits
+ pimpl->dev_layer.resize(n_layer);
+ for (int il = 0; il < n_layer; ++il) {
+ pimpl->dev_layer[il] = get_layer_buft_list(il);
+ }
+
+ // assign the output layer
+ pimpl->dev_output = get_layer_buft_list(n_layer);
+
+ // one ggml context per buffer type
+ int max_n_tensors = ml.n_tensors;
+ max_n_tensors += 1; // duplicated output tensor
+ max_n_tensors += n_layer*2; // duplicated rope freq tensors
+ const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
+
+ // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+ struct ggml_backend_buft_comparator {
+ bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+ return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+ }
+ };
+ std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+ auto it = ctx_map.find(buft);
+ if (it == ctx_map.end()) {
+ ggml_init_params params = {
+ /*.mem_size =*/ ctx_size,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context * ctx = ggml_init(params);
+ if (!ctx) {
+                throw std::runtime_error("failed to create ggml context");
+ }
+
+ ctx_map.emplace(buft, ctx);
+
+ return ctx;
+ }
+ return it->second.get();
+ };
+
+ const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
+ const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+ const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
+
+ // create tensors for the weights
+ {
+ // note: cast to int64_t since we will use these for the tensor dimensions
+ const int64_t n_head = hparams.n_head();
+ const int64_t n_head_kv = hparams.n_head_kv();
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ const int64_t n_ff = hparams.n_ff();
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ const int64_t n_vocab = vocab.n_tokens();
+ const int64_t n_token_types = vocab.n_token_types();
+ const int64_t n_rot = hparams.n_rot;
+ const int64_t n_expert = hparams.n_expert;
+ const int64_t n_expert_used = hparams.n_expert_used;
+ const int64_t n_ctx_train = hparams.n_ctx_train;
+
+ if (n_expert > 0 && hparams.n_expert_used == 0) {
+ throw std::runtime_error("model has expert layers but no expert layers are used");
+ }
+
+ int n_moved_tensors = 0;
+ ggml_tensor * first_moved_tensor = nullptr;
+ ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
+ ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
+
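+        // resolve the tensor's metadata and target buffer type (honoring user overrides and the
+        // per-layer buffer type lists), then create the tensor in the ggml context for that buffer type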
+ auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
+ ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
+
+ if (!t_meta) {
+ if (flags & TENSOR_NOT_REQUIRED) {
+ return nullptr;
+ }
+ throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
+ }
+
+        // some models reuse the token embedding tensor as the output tensor; since the two are used
+        // in different layers and with different ops, the tensor has to be duplicated.
+        // to handle this, we check the TENSOR_DUPLICATED flag and, if set, assume the tensor is being
+        // loaded as the output tensor
+ llm_tensor tn_tensor = tn.tensor;
+ if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
+ tn_tensor = LLM_TENSOR_OUTPUT;
+ }
+
+ llm_tensor_info info;
+ try {
+ info = llm_tensor_info_for(tn_tensor);
+ } catch (const std::out_of_range & e) {
+ throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
+ }
+
+ // skip unused tensors
+ if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
+ const size_t nbytes = ggml_nbytes(t_meta);
+ LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
+
+ ml.size_data -= nbytes;
+ ml.n_created++;
+
+ return nullptr;
+ }
+
+ // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
+ ggml_op op;
+ bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
+ if (bias) {
+ if (info.op == GGML_OP_MUL_MAT_ID) {
+ op = GGML_OP_ADD_ID;
+ } else {
+ op = GGML_OP_ADD;
+ }
+ } else {
+ op = info.op;
+ }
+
+ // sanity checks
+ if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
+ if (tn.bid != -1) {
+ GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
+ }
+ } else {
+ if (tn.bid == -1) {
+ GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
+ }
+ }
+
+ // select the buffer type for this tensor
+ buft_list_t * buft_list;
+ switch (info.layer) {
+ case LLM_TENSOR_LAYER_INPUT:
+ buft_list = pimpl->dev_input.buft_list;
+ break;
+ case LLM_TENSOR_LAYER_OUTPUT:
+ buft_list = pimpl->dev_output.buft_list;
+ break;
+ case LLM_TENSOR_LAYER_REPEATING:
+ buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
+ break;
+ default:
+ GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
+ }
+
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ // check overrides
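+            // e.g. a pattern such as "ffn_.*_exps" can pin all expert FFN tensors to a chosen buffer type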
+ if (ml.tensor_buft_overrides) {
+ std::string tensor_name = tn.str();
+ for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
+ std::regex pattern(overrides->pattern);
+ if (std::regex_search(tensor_name, pattern)) {
+ if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+ // when overriding to a CPU buffer, consider the extra buffer types
+ buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
+ } else {
+ buft = overrides->buft;
+ }
+
+ LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
+ tensor_name.c_str(),
+ ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
+ ggml_backend_buft_name(buft));
+ break;
+ }
+ }
+ }
+
+ if (!buft) {
+ buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+ if (!buft) {
+ throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+ }
+ }
+
+ // avoid using a host buffer when using mmap
+ auto * buft_dev = ggml_backend_buft_get_device(buft);
+ if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!cpu_dev) {
+ throw std::runtime_error("no CPU backend found");
+ }
+ buft = ggml_backend_dev_buffer_type(cpu_dev);
+ }
+
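+            // track tensors that end up on a non-default buffer type (e.g. spilled from GPU to CPU)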
+ if (buft != buft_list->front().second) {
+ n_moved_tensors++;
+ if (!first_moved_tensor) {
+ first_moved_tensor = t_meta;
+ first_moved_from_buft = buft_list->front().second;
+ first_moved_to_buft = buft;
+ }
+ }
+
+ ggml_context * ctx = ctx_for_buft(buft);
+
+ // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
+ if (flags & TENSOR_DUPLICATED) {
+ ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
+ if (t) {
+ return t;
+ }
+ }
+ return ml.create_tensor(ctx, tn, ne, flags);
+ };
+
+ layers.resize(n_layer);
+
+ // TODO: move to a separate function
+ const auto tn = LLM_TN(arch);
+ switch (arch) {
+ case LLM_ARCH_LLAMA:
+ case LLM_ARCH_REFACT:
+ case LLM_ARCH_MINICPM:
+ case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
+ case LLM_ARCH_MISTRAL3:
+ case LLM_ARCH_LLAMA_EMBED:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ // optional bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
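+                            // the rope factor tensors are shared across layers, so layers after the
+                            // first are marked TENSOR_DUPLICATED and reuse the first instance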
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+ else {
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+
+ if (n_expert == 0) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+ // optional MLP bias
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ } else {
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+ // For Granite MoE Shared
+ if (hparams.n_ff_shexp > 0) {
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+ }
+ }
+ }
+ } break;
+ case LLM_ARCH_LLADA:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output =
+ create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+ // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
+ layer.wq =
+ create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+ // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
+ layer.wo =
+ create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
+ TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+
+ // optional MLP bias
+ layer.ffn_gate_b =
+ create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_b =
+ create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
+ }
+ }
+ break;
+ case LLM_ARCH_LLADA_MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
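+                        // if the expert FFN width is not stored in the GGUF, fall back to splitting
+                        // the dense FFN width evenly across the active experts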
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ }
+ } break;
+ case LLM_ARCH_LLAMA4:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
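+                        // e.g. with n_moe_layer_step = 2, every second layer (i = 1, 3, 5, ...) is a MoE layer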
+ bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
+
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+ if (is_moe_layer) {
+ int n_ff_exp = hparams.n_ff_exp;
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert
+ const int64_t n_ff_shexp = n_ff_exp;
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+ } else {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ }
+ } break;
+ case LLM_ARCH_DECI:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
+ const int64_t n_ff = hparams.n_ff(i);
+ const int64_t n_head = hparams.n_head(i);
+ const int64_t n_head_kv = hparams.n_head_kv(i);
+
+ if (n_head_kv == 0 && n_head > 0) {
+ // linear attention for DeciLMCausalModel
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ }
+ else if (n_head_kv > 0) {
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+ }
+
+ // optional bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ if (n_ff > 0) {
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ }
+
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+ else {
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+
+ if (n_ff > 0) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+
+ // optional MLP bias
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_MINICPM3:
+ {
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+
+ const int64_t q_lora_rank = hparams.n_lora_q;
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
+
+ layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+
+ layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
+
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+ } break;
+ case LLM_ARCH_GROK:
+ {
+ if (n_expert == 0) {
+ throw std::runtime_error("Grok model cannot have zero experts");
+ }
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ if (!layer.ffn_post_norm) {
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ }
+ } break;
+ case LLM_ARCH_DBRX:
+ {
+ if (n_expert == 0) {
+ throw std::runtime_error("DBRX model cannot have zero experts");
+ }
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ }
+ } break;
+ case LLM_ARCH_BAICHUAN:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ {
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_FALCON:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_STARCODER:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
+
+ // output
+ {
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ // needs to be on GPU
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+                    }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_BERT:
+ case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
+ case LLM_ARCH_JINA_BERT_V3:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
+
+ if (arch == LLM_ARCH_BERT) {
+ pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
+
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+ cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ }
+
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ if (!layer.wqkv) {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
+
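+                        // e.g. with moe_every_n_layers = 2, layers 1, 3, 5, ... use the MoE FFN branch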
+ if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ } else {
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ if (arch == LLM_ARCH_NOMIC_BERT) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ }
+ }
+
+ layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+ layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        if (i != 0) {
+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        } else {
+                            // layer 0 uses identity
+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        }
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 2 * n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ }
+
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+                } break;
+ case LLM_ARCH_NEO_BERT:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+ cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
+ output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_JINA_BERT_V2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
+ type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
+
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
+                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias
+
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
+ cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i]; // JinaBertLayer
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); // output_dense
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); // output_dense
+
+                        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); // output_norm
+ layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
+
+ layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
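+                        // some checkpoints fuse the gate and up projections into a single up tensor of
+                        // width 2*n_ff, so probe the tensor metadata for the actual width before creating it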
+ const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
+ ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
+ const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
+
+ GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
+ layer.ffn_up = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+ layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_BLOOM:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_MPT:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ // AWQ ScaleActivation layer
+ layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_STABLELM:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ // optional bias tensors, present in Stable LM 2 1.6B
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ // optional q and k layernorms, present in StableLM 2 12B
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
+
+ // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_QWEN:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
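+                        // the stored feed-forward length appears to span both SwiGLU halves here, hence n_ff/2 per projection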
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
+ }
+ } break;
+ case LLM_ARCH_QWEN2:
+ case LLM_ARCH_QWEN2VL:
+ case LLM_ARCH_DREAM:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ // optional bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_QWEN2MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ // optional bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
+ }
+
+ // MoE branch
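+ // per-expert FFN width; falls back to an even split of n_ff across the experts used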
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert branch
+ const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
+
+ layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+ }
+ } break;
+ case LLM_ARCH_QWEN3:
+ case LLM_ARCH_QWEN3VL:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ // output rerank head
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_QWEN3MOE:
+ case LLM_ARCH_QWEN3VLMOE:
+ case LLM_ARCH_RND1:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0 for QWEN3MOE-family architectures");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE-family architectures");
+ }
+
+ // MoE branch
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ }
+ } break;
+ case LLM_ARCH_PHI2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
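+ // fall back to separate Q/K/V projections when the fused QKV tensor is absent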
+ if (layer.wqkv == nullptr) {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_PHI3:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+
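+ // gate and up projections are fused into a single 2*n_ff tensor and split at graph build time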
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
+
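+ // long/short rope factors are shared across layers; layers after the first reuse the layer-0 tensor (TENSOR_DUPLICATED)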
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+ } break;
+ case LLM_ARCH_PHIMOE:
+ {
+ const int64_t n_embd_head = n_embd / n_head;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+ output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
+ if (layer.wqkv == nullptr) {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+ }
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+ } break;
+ case LLM_ARCH_PLAMO:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_PLAMO2:
+ {
+ // mamba parameters
+ const uint32_t d_conv = hparams.ssm_d_conv;
+ const uint32_t d_state = hparams.ssm_d_state;
+ const uint32_t num_heads = hparams.ssm_dt_rank;
+ const uint32_t intermediate_size = hparams.ssm_d_inner;
+ const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
+
+ // attention parameters
+ const uint32_t qk_dim = hparams.n_embd_head_k;
+ const uint32_t v_dim = hparams.n_embd_head_v;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+ bool is_mamba_layer = hparams.is_recurrent(i);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ if (is_mamba_layer) {
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
+
+ layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
+ layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
+
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
+
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
+
+ layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
+ layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
+ layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
+ } else {
+ const int64_t num_attention_heads = hparams.n_head(i);
+ const int64_t q_num_heads = num_attention_heads;
+ const int64_t num_key_value_heads = hparams.n_head_kv(i);
+ const int64_t k_num_heads = num_key_value_heads;
+ const int64_t v_num_heads = num_key_value_heads;
+ const int64_t q_proj_dim = q_num_heads * qk_dim;
+ const int64_t k_proj_dim = k_num_heads * qk_dim;
+ const int64_t v_proj_dim = v_num_heads * v_dim;
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
+ }
+
+ // All layers have post-attention norm, FFN norm, and FFN tensors
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
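+ // ffn_up holds fused gate+up halves, hence the 2*n_ff width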
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_PLAMO3:
+ {
+ const int64_t head_dim_q = hparams.n_embd_head_k;
+ const int64_t head_dim_v = hparams.n_embd_head_v;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ const int64_t num_attention_heads = hparams.n_head(i);
+ const int64_t num_key_value_heads = hparams.n_head_kv(i);
+ const int64_t q_proj_dim = num_attention_heads * head_dim_q;
+ const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
+ const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
+ const int64_t n_ff_cur = hparams.n_ff(i);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_cur * 2}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_GPT2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
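+ // learned absolute position embeddings, one per training-context position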
+ pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_CODESHELL:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if tok embd is NULL, init from output
+ if (tok_embd == NULL) {
+ tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_ORION:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_INTERNLM2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_GEMMA:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_GEMMA2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_GEMMA3:
+ case LLM_ARCH_GEMMA_EMBEDDING:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ // optional dense projection weights (embedding variant only)
+ dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
+ dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_GEMMA3N:
+ {
+ const int64_t n_altup = hparams.n_altup;
+ const int64_t laurel_rank = hparams.laurel_rank;
+ const int64_t n_embd_altup = hparams.n_embd_altup;
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
+
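+ // n_altup - 1 projection matrices: one per altup stream besides the activated one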
+ altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
+ altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
+ per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
+ per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ // altup & laurel
+ layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
+ layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
+ layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
+ layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
+ layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
+ layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
+ layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
+ layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
+ layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
+ layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
+ layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_STARCODER2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ // bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+ // bias tensors
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_MAMBA:
+ {
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t dt_rank = hparams.ssm_dt_rank;
+
+ // only an expansion factor of 2 is supported for now
+ if (2 * n_embd != d_inner) {
+ throw std::runtime_error("only an expansion factor of 2 is supported for now");
+ }
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ // norm
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
+
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
+
+ layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
+
+ layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
+
+ // no "weight" suffix for these
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
+
+ // out_proj
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_MAMBA2:
+ {
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t n_head = hparams.ssm_dt_rank;
+ const int64_t n_group = hparams.ssm_n_group;
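+ // in_proj packs z, x, B, C and dt: 2*d_inner + 2*n_group*d_state + n_head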
+ const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
+
+ // only an expansion factor of 2 is supported for now
+ GGML_ASSERT(2 * n_embd == d_inner);
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ // norm
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
+
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
+
+ // no "weight" suffix for these
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
+
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
+
+ // out_proj
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_JAMBA:
+ {
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t dt_rank = hparams.ssm_dt_rank;
+
+ // only an expansion factor of 2 is supported for now
+ GGML_ASSERT(2 * n_embd == d_inner);
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ const int64_t n_head_kv = hparams.n_head_kv(i);
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
+
+ auto & layer = layers[i];
+
+ // norm
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ if (n_head_kv == 0) {
+ // Mamba layer
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
+
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
+
+ layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
+
+ layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
+
+ layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
+
+ layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
+ layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
+
+ // no "weight" suffix for these
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
+
+ // out_proj
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+ } else {
+ // Attention layers
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ }
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+
+ if (layer.ffn_gate_inp) {
+ // MoE
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ } else {
+ // FFN (no MoE)
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ }
+ } break;
+ case LLM_ARCH_GRANITE_HYBRID:
+ {
+ // mamba2 Mixer SSM params
+ // NOTE: int64_t for tensor dimensions
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t n_ssm_head = hparams.ssm_dt_rank;
+ const int64_t n_group = hparams.ssm_n_group;
+ const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
+
+ // only an expansion factor of 2 is supported for now
+ GGML_ASSERT(2 * n_embd == d_inner);
+
+ // embeddings
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ // norm
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ if (hparams.is_recurrent(i)) {
+ // ssm layers
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
+
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
+
+ // no "weight" suffix for these
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
+
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
+
+ // out_proj
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+ } else {
+ // attention layers (with optional bias)
+ const int64_t n_head_i = hparams.n_head(i);
+ const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
+ const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ }
+
+ // feed forward (w/ optional biases)
+ if (n_expert > 0) {
+ // MoE FFN
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+ // For Granite MoE Shared
+ if (hparams.n_ff_shexp > 0) {
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+ }
+ } else {
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ }
+ }
+ } break;
+ case LLM_ARCH_XVERSE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_COMMAND_R:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ // init output from the input tok embed
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
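+ // per-head q/k norms only ship with the deeper Command R models (e.g. the 64-layer Command R+)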
+ if (n_layer >= 64){
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
+ }
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_COHERE2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ // init output from the input tok embed
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+ }
+ } break;
+ case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_OLMO2:
+ {
+ const int64_t n_embd_head = n_embd / n_head;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_SEED_OSS:
+ {
+ const uint32_t head_dim = hparams.n_embd_head_k;
+ const int64_t n_qo_dim = n_head * head_dim;
+ const int64_t n_kv_dim = n_head_kv * head_dim;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
+
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_OLMOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0");
+ }
+
+ // MoE branch
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
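+ // note: the per-expert FFN weights are stacked along a third dimension,
+ // so a single 3D tensor holds all n_expert experts of the layer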
+ }
+ } break;
+ case LLM_ARCH_OPENELM:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ // init output from the input tok embed
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+
+ for (int i = 0; i < n_layer; ++i) {
+ const int64_t n_head = hparams.n_head(i);
+ const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
+ const int64_t n_ff = hparams.n_ff(i);
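+
+ // note: OpenELM varies n_head, n_head_kv and n_ff per layer, so the dims are
+ // looked up per layer index; the fused QKV width below is
+ // n_head_qkv * n_embd_head_k with n_head_qkv = n_head + 2*n_head_kv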
+
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_GPTNEOX:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
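+
+ // note: the fused QKV projection packs Q (n_embd columns) plus K and V
+ // (n_embd_gqa columns each), giving the n_embd + 2*n_embd_gqa width above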
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_ARCTIC:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ }
+ } break;
+ case LLM_ARCH_DEEPSEEK:
+ {
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ // try to load output.weight; if not found, use token_embd (tied embeddings)
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ if (i < (int) hparams.n_layer_dense_lead) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ } else {
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0");
+ }
+
+ // MoE branch
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert branch
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ }
+ }
+ } break;
+ case LLM_ARCH_DEEPSEEK2:
+ {
+ const bool is_mla = hparams.is_mla();
+
+ // note: these are the actual head sizes obtained when treating the weights as MHA, or after "decompression" via wv_b when using MLA
+ const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
+ const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
+
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
+
+ const int64_t q_lora_rank = hparams.n_lora_q;
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
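+
+ // rough sketch of the MLA dims: Q is compressed to q_lora_rank and K/V to
+ // kv_lora_rank before re-expansion, and each K head splits into a RoPE'd
+ // part (n_embd_head_qk_rope) and a non-RoPE part (n_embd_head_qk_nope)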
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ // try to load output.weight; if not found, use token_embd (tied embeddings)
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ if (q_lora_rank > 0) {
+ layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
+ }
+
+ layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+
+ if (q_lora_rank > 0) {
+ layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
+ } else {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
+ }
+
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
+
+ // note: only old, legacy GGUF files will still contain the unsplit wkv_b tensor
+ if (is_mla) {
+ layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
+ layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
+ } else {
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
+ }
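+
+ // with the split tensors, attention can presumably operate directly in the
+ // compressed kv_lora_rank space: wk_b maps each head's nope part into the
+ // latent, and wv_b expands the latent back to n_embd_head_v_mla per head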
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ if (i < (int) hparams.n_layer_dense_lead) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ } else {
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0");
+ }
+
+ // MoE branch
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert branch
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ }
+ }
+ } break;
+ case LLM_ARCH_PLM:
+ {
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ // PLM always ties the output head to the input tok embed
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+ layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_BITNET:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_T5:
+ {
+ const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ // n_layer: number of encoder layers
+ // dec_n_layer: number of decoder layers
+ const int dec_n_layer = hparams.dec_n_layer;
+ if (dec_n_layer > n_layer) {
+ layers.resize(dec_n_layer);
+ }
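+
+ // the shared `layers` vector serves both stacks: encoder tensors fill
+ // [0, n_layer) and decoder tensors fill [0, dec_n_layer), hence the
+ // resize when the decoder stack is deeper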
+
+ // load encoder layers
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+ layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+ layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+
+ // load decoder layers
+ for (int i = 0; i < dec_n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+ layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
+ // this tensor seems to be unused in the HF transformers implementation
+ layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+ layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+ layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+ layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_JAIS:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_CHATGLM:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ if (layer.wqkv == nullptr) {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
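+
+ // note: ChatGLM stores the SwiGLU gate and up projections fused in one
+ // tensor, hence the n_ff * 2 width; the halves are presumably split
+ // again when the graph is built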
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_GLM4:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ if (layer.wqkv == nullptr) {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
+
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_GLM4_MOE:
+ {
+ const int64_t n_expert = hparams.n_expert;
+ const int64_t n_expert_used = hparams.n_expert_used;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
+ GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+ }
+
+ // load ALL tensors, including the NextN layers, to satisfy the total tensor count,
+ // but only PROCESS up to the last non-NextN layer in the forward pass
+ for (int i = 0; i < n_layer; ++i) {
+ int flags = 0;
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ // skip all tensors in the NextN layers
+ flags |= TENSOR_SKIP;
+ }
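+
+ // TENSOR_SKIP appears to create the tensor (so the file's total tensor
+ // count still matches) without loading it for compute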
+
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
+
+ // GLM-style attention with bias terms
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
+
+ // K/Q norm tensors (optional for GLM-4.5 355B variant)
+ layer.attn_q_norm = create_tensor(
+ tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
+ layer.attn_k_norm = create_tensor(
+ tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
+
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
+
+ // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
+ // GLM-4.5 uses a hybrid architecture: layer 0 is dense, layers 1+ are MoE
+ const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
+
+ if (use_moe) {
+ // MoE layers
+ layer.ffn_gate_inp =
+ create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
+
+ // MoE branch
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
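+
+ // fallback heuristic: if the GGUF carries no explicit expert FFN width,
+ // assume the dense n_ff was split evenly across the n_expert_used experts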
+
+ layer.ffn_gate_exps = create_tensor(
+ tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
+ layer.ffn_down_exps = create_tensor(
+ tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
+ layer.ffn_up_exps = create_tensor(
+ tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
+
+ // Shared expert
+ if (n_expert_shared > 0) {
+ const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
+ layer.ffn_gate_shexp = create_tensor(
+ tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+ layer.ffn_down_shexp = create_tensor(
+ tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
+ layer.ffn_up_shexp = create_tensor(
+ tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+ }
+ } else {
+ // dense layers (the first n_layer_dense_lead layers); GLM uses separate gate/up projections
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
+ }
+
+ // NextN/MTP tensors (preserved but unused): only loaded for the last nextn_predict_layers layers
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+
+ // Optional tensors
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
+ }
+ }
+ } break;
+ case LLM_ARCH_NEMOTRON:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ // optional bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+ // optional MLP bias
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_NEMOTRON_H:
+ case LLM_ARCH_NEMOTRON_H_MOE:
+ {
+ // mamba2 Mixer SSM params
+ // NOTE: int64_t for tensor dimensions
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t n_ssm_head = hparams.ssm_dt_rank;
+ const int64_t n_group = hparams.ssm_n_group;
+ const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
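+
+ // rough mamba2 layout: in_proj packs z and x (2*d_inner), B and C
+ // (2*n_group*d_state) and dt (n_ssm_head) into one projection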
+
+ // embeddings
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ // all blocks use the attn norm
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ if (hparams.is_recurrent(i)) {
+ // ssm layers
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
+
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
+
+ // no "weight" suffix for these
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
+
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
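+ // note: the SSM norm is stored grouped: n_group slices of d_inner/n_group each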
+
+ // out_proj
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+ } else if (hparams.n_ff(i) == 0) {
+ // attention layers (with optional bias)
+ const int64_t n_head_i = hparams.n_head(i);
+ const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
+ const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ } else {
+ if (n_expert != 0) {
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+ const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
+
+ // MoE branch
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert branch
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+
+ } else {
+ // mlp layers
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+ }
+ }
+ }
+ } break;
+ case LLM_ARCH_EXAONE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_EXAONE4:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_EXAONE_MOE:
+ {
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert = hparams.n_expert;
+ const int64_t n_expert_used = hparams.n_expert_used;
+ const int64_t n_ff_shexp = hparams.n_ff_shexp;
+ const int64_t head_dim = hparams.n_embd_head_k;
+ const int64_t n_qo_dim = n_head * head_dim;
+ const int64_t n_kv_dim = n_head_kv * head_dim;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ // try to load output.weight; if not found, fall back to token_embd below
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ int flags = 0;
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ // skip all tensors in the NextN layers
+ flags |= TENSOR_SKIP;
+ }
+
+ auto & layer = layers[i];
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, flags);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, flags);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, flags);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, flags);
+
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0) | flags);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+ // dense FFN for the first n_layer_dense_lead layers, and for the trailing nextn_predict_layers NextN layers
+ if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
+ } else {
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0");
+ }
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
+
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+ }
+
+ // NextN/MTP tensors (preserved but unused): only loaded for the last nextn_predict_layers layers
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags);
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags);
+
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
+ }
+ }
+ } break;
+ case LLM_ARCH_RWKV6:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // Block 0, LN0
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+ const int time_decay_extra_dim = hparams.time_decay_extra_dim;
+ const int head_size = hparams.wkv_head_size;
+ const int attn_hidden_size = n_embd;
+ const int ffn_size = hparams.n_ff_arr[0];
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
+ layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
+
+ layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
+ layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
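+
+ // low-rank token-shift: w1/w2 produce all 5 lerp coefficients (w, k, v,
+ // r, g) in one pass, hence the time_mix_extra_dim * 5 width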
+
+ layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
+ layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
+ GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
+
+ layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
+ layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
+ layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
+ layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
+ layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
+ layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
+ layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+ layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
+
+ layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
+ layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
+ layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
+
+ layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
+ layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
+
+ layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
+ layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
+ layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
+ }
+
+ } break;
+ case LLM_ARCH_RWKV6QWEN2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+ const int time_decay_extra_dim = hparams.time_decay_extra_dim;
+ const int head_size = hparams.wkv_head_size;
+ const int attn_hidden_size = n_embd;
+ const int n_head_kv = hparams.n_head_kv();
+ int attn_key_value_size;
+ if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
+ attn_key_value_size = attn_hidden_size;
+ } else {
+ attn_key_value_size = n_head_kv * head_size;
+ }
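+
+ // GQA-style sizing: with fewer K/V heads than attention heads, the K/V
+ // projections shrink to n_head_kv * head_size instead of the full n_embd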
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
+ layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
+
+ layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
+ layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
+
+ layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
+ layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
+ layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
+ layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
+ layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
+ layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+ layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
+ // optional bias tensors
+ layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
+
+ layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_RWKV7:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // Block 0, LN0
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ const int n_lora_decay = hparams.n_lora_decay;
+ const int n_lora_iclr = hparams.n_lora_iclr;
+ const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
+ const int n_lora_gate = hparams.n_lora_gate;
+ const int attn_hidden_size = n_embd;
+ const int ffn_size = hparams.n_ff_arr[0];
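+
+ // RWKV7 uses small low-rank adapters; n_lora_decay, n_lora_iclr (in-context
+ // learning rate), n_lora_value_res_mix and n_lora_gate are their inner ranks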
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
+ layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
+
+ layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
+ layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
+ layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
+
+ layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
+ layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
+ layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
+
+ if (i == 0) {
+ // note: present in the file for the first layer, but not actually used
+ layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
+ layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
+ layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
+ } else {
+ layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
+ layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
+ layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
+ }
+
+ layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
+ layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
+
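+ // fused token-shift interpolation coefficients: one n_embd vector per mixed quantity
+ // (r, w, k, v, a, g in the RWKV-7 reference), stored as a single {n_embd, 1, 1, 6} tensor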
+ layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
+
+ layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
+ layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
+ layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
+
+ layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
+ layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
+ layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+
+ layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
+ layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
+ layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
+
+ layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
+
+ layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
+ layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
+ }
+
+ } break;
+ case LLM_ARCH_ARWKV7:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ const int n_lora_decay = hparams.n_lora_decay;
+ const int n_lora_iclr = hparams.n_lora_iclr;
+ const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
+ const int n_lora_gate = hparams.n_lora_gate;
+ const int attn_hidden_size = n_embd;
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
+ layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
+ layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
+
+ layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
+ layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
+ layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
+
+ if (i == 0) {
+ // layer 0 has no previous layer's value to mix in, so these tensors are loaded but never used
+ layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
+ layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
+ layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
+ } else {
+ layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
+ layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
+ layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
+ }
+
+ layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
+
+ try {
+ layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
+ } catch (const std::runtime_error &) {
+ // ARWKV models without gate tensors fuse only 5 lerp coefficients instead of 6
+ layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
+ }
+
+ layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
+ layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
+ layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
+
+ layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
+ layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
+ layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+
+ layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+
+ } break;
+ case LLM_ARCH_CHAMELEON:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_WAVTOKENIZER_DEC:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd, n_vocab}, 0);
+
+ conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd, hparams.posnet.n_embd}, 0);
+ conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
+
+ // posnet
+ {
+ const int64_t n_embd = hparams.posnet.n_embd;
+
+ for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
+ auto & layer = layers[i].posnet;
+
+ // posnet:
+ //
+ // - resnet
+ // - resnet
+ // - attn
+ // - resnet
+ // - resnet
+ // - norm
+ //
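+ // i.e. indices 0,1,3,4 are resnet blocks, 2 is the attention block, 5 the final norm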
+ switch (i) {
+ case 0:
+ case 1:
+ case 3:
+ case 4:
+ {
+ layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
+ layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
+
+ layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
+ layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
+
+ layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
+ layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
+
+ layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
+ layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
+ } break;
+ case 2:
+ {
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
+
+ layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
+ layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
+
+ layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
+ layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
+
+ layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
+ layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
+
+ layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
+ layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
+ } break;
+ case 5:
+ {
+ layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+ layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
+ } break;
+ default: GGML_ABORT("unknown posnet layer");
+ };
+ }
+ }
+
+ GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
+
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
+
+ // convnext
+ {
+ const int64_t n_embd = hparams.convnext.n_embd;
+
+ for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
+ auto & layer = layers[i].convnext;
+
+ layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
+ layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
+
+ layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
+ layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
+
+ layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
+ layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
+
+ layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
+ layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
+
+ layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
+ }
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ }
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, hparams.n_embd_out()}, 0);
+ output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {hparams.n_embd_out()}, 0);
+ } break;
+ case LLM_ARCH_BAILINGMOE:
+ {
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0");
+ }
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ }
+ } break;
+ case LLM_ARCH_BAILINGMOE2:
+ {
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
+
+ for (int i = 0; i < n_layer; ++i) {
+ int flags = 0;
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ // skip all tensors in the NextN layers
+ flags |= TENSOR_SKIP;
+ }
+
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
+
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+ if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
+ const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
+
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+ } else { // Dense layers
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
+ }
+
+ // NextN/MTP tensors (preserved but unused): loaded only for the final nextn_predict_layers layers
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
+ layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
+ }
+ }
+ } break;
+ case LLM_ARCH_DOTS1:
+ {
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ if (i < (int) hparams.n_layer_dense_lead) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ } else {
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0");
+ }
+
+ // MoE branch
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert branch
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ }
+ }
+ } break;
+ case LLM_ARCH_ARCEE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
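+ // note: no ffn_gate here - Arcee's AFM appears to use a non-gated (ReLU-squared) MLP,
+ // so only down/up projections are expected (treated as an assumption from the model family)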
+ }
+ } break;
+ case LLM_ARCH_AFMOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ // dual attention normalization
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ // attention projections
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ // Q/K normalization
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ // attention gating
+ layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+
+ // dual ffn normalization
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) {
+ // MoE layers
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+
+ // grouped expert weights
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+ // shared expert
+ if (n_expert_shared > 0) {
+ const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+ }
+ } else {
+ // Dense layers
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ }
+ } break;
+ case LLM_ARCH_ERNIE4_5:
+ case LLM_ARCH_ERNIE4_5_MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ // optional bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
+ int n_ff_exp = hparams.n_ff_exp;
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert (if present)
+ if (hparams.n_ff_shexp > 0) {
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd }, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
+ }
+ } else { // Dense layers
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ }
+ } break;
+ case LLM_ARCH_FALCON_H1:
+ {
+ // Common
+ const int64_t hidden_size = hparams.n_embd; // hidden_size
+
+ // mamba2 Mixer SSM params
+ const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
+ const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
+ const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
+ const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
+ const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
+ const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
+ const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
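+ // sanity of the split (Mamba-2 style in_proj, as an illustration):
+ //   in_proj -> [z: ssm_intermediate_size | x,B,C: ssm_conv_dim | dt: ssm_num_heads]
+ // which is exactly ssm_intermediate_size + ssm_conv_dim + ssm_num_heads above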
+
+ // attn params
+ const int64_t attn_num_attention_head = hparams.n_head(0); // num_attention_heads
+ const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
+
+ // ffn params
+ const int64_t ffn_intermediate_size = hparams.n_ff(0);
+
+ // embeddings
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
+
+ // output
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ // SSM layers
+ // ssm in
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
+ // ssm 1d conv
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
+ // ssm_dt
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
+ // no "weight" suffix for these
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
+ // ssm_norm
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
+ // out_proj
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
+
+ // attention layers (with optional bias)
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
+
+ // feed forward (w/ optional biases)
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {hidden_size}, 0);
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
+
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_HUNYUAN_MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_HUNYUAN_DENSE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_SMOLLM3:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_OPENAI_MOE:
+ {
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
+
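+ // per-head attention sink logits: an extra slot in the attention softmax that can
+ // absorb probability mass (gpt-oss style; description inferred from the tensor name)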
+ layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // bias
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
+ layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
+ layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
+ }
+ } break;
+ case LLM_ARCH_LFM2:
+ case LLM_ARCH_LFM2MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);
+
+ // ffn/moe is same for transformer and conv layers
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ if (is_moe_layer) {
+ GGML_ASSERT(n_expert && n_expert_used);
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+ } else { // dense
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+
+ // operator norm (applied before both the attention and shortconv mixers)
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ if (!hparams.is_recurrent(i)) {
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ } else {
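+ // the {n_embd, 3*n_embd} in_proj is assumed to split into three streams (B, C, x),
+ // giving roughly y = out_proj(C * conv(B * x)) - a sketch of LFM2's gated short convolution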
+ layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
+ layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
+ layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
+ }
+ }
+
+ // for LFM2-ColBert-350M
+ dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
+ } break;
+ case LLM_ARCH_SMALLTHINKER:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
+
+ // MoE branch
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+ }
+ } break;
+ case LLM_ARCH_GROVEMOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
+ GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ // MoE branch
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+ const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
+ const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
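+ // GroveMoE groups the n_expert routed experts into chunks of n_group_experts and
+ // attaches one small adjugate ("chunk") expert of width n_ff_chexp to each chunk
+ // (per the GroveMoE design; treated as an assumption here)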
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
+ layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp, n_embd, n_chunk_expert}, 0);
+ layer.ffn_up_chexps = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
+ }
+ } break;
+ case LLM_ARCH_APERTUS:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ } else {
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+ // optional bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
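+ // note: no ffn_gate here - Apertus reportedly uses a non-gated MLP with the xIELU
+ // activation, so only down/up projections are expected (assumption from the Apertus report)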
+
+ // Q and K layernorms for Apertus
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+ layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_MINIMAX_M2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k * n_head}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_k_gqa}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+ }
+ } break;
+ case LLM_ARCH_KIMI_LINEAR:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ // determine the layer type by probing for KDA-specific tensors:
+ // treat the layer as KDA (linear attention) if its q-conv tensor is present, otherwise as MLA
+
+ // KDA uses head_dim = 128 (from linear_attn_config.head_dim)
+ const int64_t n_embd_head_k_kda = hparams.n_embd_head_kda;
+ const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda;
+ const int64_t ssm_d_conv = hparams.ssm_d_conv;
+
+ // Try loading KDA specific tensors (using SSM_ prefix)
+ // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
+ // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
+ layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
+ if (!layer.ssm_q_conv) {
+ layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED);
+ }
+
+ if (layer.ssm_q_conv) {
+ // KDA Layer - Conv1d weights may be 3D or 4D
+ layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
+ if (!layer.ssm_k_conv) {
+ layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
+ }
+ layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
+ if (!layer.ssm_v_conv) {
+ layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head}, 0);
+ }
+
+ // q, k, v projections
+ // Python: q_proj, k_proj, v_proj
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_v_kda * n_head}, 0);
+
+ // KDA specific projections
+ // f_a_proj, f_b_proj
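+ // f_a/f_b form a bottleneck pair n_embd -> head_dim -> n_head*head_dim, presumably
+ // producing the per-head decay gates (mirrors the g_a/g_b output gate pair below)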
+ layer.ssm_f_a = create_tensor(tn(LLM_TENSOR_SSM_F_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); // head_dim
+ layer.ssm_f_b = create_tensor(tn(LLM_TENSOR_SSM_F_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0); // projection_size
+
+ // b_proj (beta mixing coefficient)
+ layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), {n_embd, n_head}, 0);
+
+ // A_log - shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization)
+ // note: -exp(A_log) is applied in convert_hf_to_gguf.py
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED);
+ if (!layer.ssm_a) {
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
+ }
+
+ // dt_bias - shape [n_embd_head_k_kda * n_head] = [4096]
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_embd_head_k_kda * n_head}, 0);
+
+ // g_a_proj, g_b_proj (output gate)
+ layer.ssm_g_a = create_tensor(tn(LLM_TENSOR_SSM_G_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0);
+ layer.ssm_g_b = create_tensor(tn(LLM_TENSOR_SSM_G_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0);
+
+ // o_norm (reusing SSM_NORM)
+ layer.ssm_o_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {n_embd_head_k_kda}, 0); // FusedRMSNormGated
+
+ // o_proj
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v_kda * n_head, n_embd}, 0);
+
+ } else {
+ // MLA Layer - use MLA-specific head dimensions
+ const int64_t q_lora_rank = hparams.n_lora_q;
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
+ const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
+ const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
+
+ layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED);
+ layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+
+ if (layer.attn_q_a_norm) {
+ layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
+ } else {
+ // Kimi MLA without Q compression: wq = [n_embd, n_head * n_embd_head_k_mla]
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
+ }
+
+ // qk_rope_head_dim: the RoPE dimension actually used by MLA (64 for Kimi)
+ // note: the converter may store 72 in hparams.n_rot even though the actual value is 64
+ const int64_t qk_rope_head_dim = hparams.n_rot; // from config: qk_rope_head_dim
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
+ // support legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED);
+ if (!layer.wkv_b) { // MLA KV cache enabled
+ layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0);
+ layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
+ }
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
+ }
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ // MoE intermediate size (different from dense FFN)
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+
+ // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE
+ // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE
+ if (i < (int) hparams.n_layer_dense_lead) {
+ // Dense FFN layer - use normal n_ff
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ } else {
+ // MoE layer - use n_ff_exp instead of n_ff (Kimi: 1024 vs 9216)
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared experts use moe_intermediate_size * num_shared_experts
+ // Kimi: shared_expert_intermediate_size = 1024 * 1 = 1024
+ // Tensors are 2D: [n_embd, n_ff_shexp] or [n_ff_shexp, n_embd]
+ const int64_t n_ff_shexp_actual = n_ff_exp * (hparams.n_expert_shared > 0 ? hparams.n_expert_shared : 1);
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+ }
+ }
+ } break;
+ case LLM_ARCH_COGVLM:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.visexp_attn_wqkv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
+ layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+ layer.visexp_ffn_gate = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.visexp_ffn_down = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.visexp_ffn_up = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_PANGU_EMBED:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ // weight tensors
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ // bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd_head_k * n_head}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ } else {
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_QWEN3NEXT:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+ }
+
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+ // Calculate dimensions from hyperparameters
+ const int64_t head_k_dim = hparams.ssm_d_state;
+ const int64_t head_v_dim = hparams.ssm_d_state;
+ const int64_t n_k_heads = hparams.ssm_n_group;
+ const int64_t n_v_heads = hparams.ssm_dt_rank;
+ const int64_t key_dim = head_k_dim * n_k_heads;
+ const int64_t value_dim = head_v_dim * n_v_heads;
+ const int64_t conv_dim = key_dim * 2 + value_dim;
+
+ // Calculate projection sizes
+ const int64_t qkvz_dim = key_dim * 2 + value_dim * 2;
+ const int64_t ba_dim = n_v_heads * 2;
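+ // worked example for a hypothetical Qwen3-Next-like config with head_k_dim = head_v_dim = 128,
+ // n_k_heads = 16, n_v_heads = 32:
+ //   key_dim  = 128 * 16 = 2048, value_dim = 128 * 32 = 4096
+ //   conv_dim = 2 * 2048 + 4096 = 8192      (q, k and v all pass through the conv)
+ //   qkvz_dim = 2 * 2048 + 2 * 4096 = 12288 (q, k, v plus the z gate)
+ //   ba_dim   = 2 * 32 = 64                 (beta and alpha, one scalar per value head)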
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+ if (!hparams.is_recurrent(i)) {
+ // Attention layers
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+ // Q/K normalization for attention layers
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+ } else {
+ // Linear attention (gated delta net) specific tensors
+ // Create tensors with calculated dimensions
+ // note: ssm_in is used by legacy GGUF
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, TENSOR_NOT_REQUIRED);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
+ layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
+ layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
+ layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
+ }
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+
+ // Shared experts
+ layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
+ }
+ } break;
+ case LLM_ARCH_QWEN35MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+ }
+
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+ // Calculate dimensions from hyperparameters
+ const int64_t head_k_dim = hparams.ssm_d_state;
+ const int64_t head_v_dim = hparams.ssm_d_state;
+ const int64_t n_k_heads = hparams.ssm_n_group;
+ const int64_t n_v_heads = hparams.ssm_dt_rank;
+ const int64_t key_dim = head_k_dim * n_k_heads;
+ const int64_t value_dim = head_v_dim * n_v_heads;
+ const int64_t conv_dim = key_dim * 2 + value_dim;
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+ if (!hparams.is_recurrent(i)) {
+ // Attention layers
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+ // Q/K normalization for attention layers
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+ } else {
+ // Linear attention (gated delta net) specific tensors
+ // Create tensors with calculated dimensions
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
+ layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
+ layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
+ layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
+ layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
+ }
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+
+ // Shared experts
+ const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
+
+ layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
+ }
+ } break;
+ case LLM_ARCH_QWEN35:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+ }
+
+ // Calculate dimensions from hyperparameters
+ const int64_t head_k_dim = hparams.ssm_d_state;
+ const int64_t head_v_dim = hparams.ssm_d_state;
+ const int64_t n_k_heads = hparams.ssm_n_group;
+ const int64_t n_v_heads = hparams.ssm_dt_rank;
+ const int64_t key_dim = head_k_dim * n_k_heads;
+ const int64_t value_dim = head_v_dim * n_v_heads;
+ const int64_t conv_dim = key_dim * 2 + value_dim;
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+ if (!hparams.is_recurrent(i)) {
+ // Attention layers
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+ // Q/K normalization for attention layers
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+ } else {
+ // Linear attention (gated delta net) specific tensors
+ // Create tensors with calculated dimensions
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
+ layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
+ layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
+ layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
+ layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
+ }
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_MIMO2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+ uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
+ uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
+ uint32_t n_head = hparams.n_head(i);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ // non-MoE branch
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
+ // MoE branch
+ int64_t n_ff_exp = hparams.n_ff_exp;
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_STEP35:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ // STEP35 supports per-layer partial RoPE dims; rope factors are stored as a single shared tensor
+ // ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
+ uint32_t n_rot_max = 0;
+ for (int i = 0; i < n_layer; ++i) {
+ n_rot_max = std::max(n_rot_max, hparams.n_rot);
+ }
+ if (n_rot_max == 0) {
+ n_rot_max = n_rot;
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ const uint32_t n_head_l = hparams.n_head(i);
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
+
+ // optional rope factors (llama3) / longrope tensors
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ } else {
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_l}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
+
+ // head-wise attention gate (Step35 self_attn.g_proj)
+ layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ // dense MLP (leading dense blocks)
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
+ // MoE routed experts + selection bias (router_bias)
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+ // shared expert MLP
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_MAINCODER:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ default:
+ throw std::runtime_error("unknown architecture");
+ }
+
+ if (n_moved_tensors > 0) {
+ LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
+ __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
+ ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
+ }
+ }
+
+ ml.done_getting_tensors();
+
+ ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
+ pimpl->mappings.reserve(ml.mappings.size());
+
+ // create the backend buffers
+ std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
+ ctx_buf_maps.reserve(ctx_map.size());
+
+ // Ensure we have enough capacity for the maximum backend buffer we will potentially create
+ const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
+ pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
+
+ for (auto & [buft, ctx_ptr] : ctx_map) {
+ ggml_context * ctx = ctx_ptr.get();
+
+ // skip contexts without tensors
+ if (ggml_get_first_tensor(ctx) == nullptr) {
+ continue;
+ }
+
+ llama_buf_map buf_map;
+ buf_map.reserve(n_max_backend_buffer);
+
+ // check if it is possible to use buffer_from_host_ptr with this buffer type
+ ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+ if (!dev) {
+ // FIXME: workaround for CPU backend buft having a NULL device
+ dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!dev) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
+ }
+ ggml_backend_dev_props props;
+ ggml_backend_dev_get_props(dev, &props);
+ bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
+ bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
+
+ std::vector<ggml_backend_buffer_ptr> bufs;
+ if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
+ GGML_ASSERT(!ml.no_alloc);
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+ // only the mmap region containing the tensors in the model is mapped to the backend buffer
+ // this is important for metal with apple silicon: mapping the whole model into a single
+ // metal buffer would require the entire model to fit in it, whereas mapping only the
+ // needed region allows partial offloading when the model size exceeds the metal buffer
+ // size, but not the RAM size
+ void * addr = nullptr;
+ size_t first, last; // NOLINT
+ ml.get_mapping_range(&first, &last, &addr, idx, ctx);
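+ // skip files that contain no tensor data for this context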
+ if (first >= last) {
+ continue;
+ }
+ const size_t max_size = ggml_get_max_tensor_size(ctx);
+ ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+ if (buf == nullptr) {
+ throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+ }
+ bufs.emplace_back(buf);
+ buf_map.emplace(idx, buf);
+ }
+ } else {
+ ggml_backend_buffer_t buf;
+ if (ml.no_alloc) {
+ buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+ t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
+ }
+ } else {
+ buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+ }
+ if (buf == nullptr) {
+ throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+ }
+ if (use_mlock && ggml_backend_buffer_is_host(buf)) {
+ pimpl->mlock_bufs.emplace_back(new llama_mlock);
+ auto & mlock_buf = pimpl->mlock_bufs.back();
+ mlock_buf->init (ggml_backend_buffer_get_base(buf));
+ mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
+ }
+ bufs.emplace_back(buf);
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+ buf_map.emplace(idx, buf);
+ }
+ }
+ pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
+
+ for (auto & buf : buf_map) {
+ // indicate that this buffer contains weights
+ // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
+ ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+ }
+
+ ctx_buf_maps.emplace_back(ctx, buf_map);
+ }
+
+ if (llama_supports_gpu_offload()) {
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+ int n_repeating = n_gpu;
+ if (n_repeating > 0) {
+ LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
+ n_repeating--;
+ }
+ LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);
+
+ const int max_backend_supported_layers = hparams.n_layer + 1;
+ const int max_offloadable_layers = hparams.n_layer + 1;
+
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+ }
+
+ // print memory requirements per buffer type
+ for (auto & [_, bufs] : pimpl->ctxs_bufs) {
+ for (auto & buf: bufs) {
+ LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
+ __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+ }
+ }
+
+ // populate tensors_by_name
+ for (auto & [ctx, _] : pimpl->ctxs_bufs) {
+ for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
+ tensors_by_name.emplace_back(ggml_get_name(cur), cur);
+ }
+ }
+
+ if (ml.no_alloc) {
+ return true;
+ }
+
+ // load tensor data
+ for (auto & [ctx, buf_map] : ctx_buf_maps) {
+ if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+ return false;
+ }
+ }
+
+ if (use_mmap_buffer) {
+ for (auto & mapping : ml.mappings) {
+ pimpl->mappings.emplace_back(std::move(mapping));
+ }
+ }
+
+ return true;
+}
+
+std::string llama_model::arch_name() const {
+ return llm_arch_name(arch);
+}
+
+std::string llama_model::type_name() const {
+ return llm_type_name(type);
+}
+
+std::string llama_model::desc() const {
+ return pimpl->desc_str;
+}
+
+size_t llama_model::size() const {
+ return pimpl->n_bytes;
+}
+
+size_t llama_model::n_tensors() const {
+ return tensors_by_name.size();
+}
+
+size_t llama_model::n_devices() const {
+ return devices.size();
+}
+
+uint32_t llama_model::n_gpu_layers() const {
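+ // a negative value means "offload everything": all repeating layers plus the output layer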
+ return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
+}
+
+llama_split_mode llama_model::split_mode() const {
+ return params.split_mode;
+}
+
+std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, size_t> ret;
+ for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
+ if (hparams.no_alloc) {
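+ // with no_alloc the buffers are dummies, so report the size the tensors would
+ // need instead of the (zero) size of the dummy buffer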
+ GGML_ASSERT(bufs.size() == 1);
+ ggml_backend_buffer_t buf = bufs[0].get();
+ GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
+ ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
+ ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+ } else {
+ for (const auto & buf : bufs) {
+ // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+ ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+ }
+ }
+ }
+ return ret;
+}
+
+uint64_t llama_model::n_elements() const {
+ return pimpl->n_elements;
+}
+
+void llama_model::print_info() const {
+ const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
+
+ auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
+ bool is_var = false;
+
+ std::vector<uint32_t> v;
+ for (uint32_t i = 0; i < n; ++i) {
+ v.push_back(f(i));
+ if (v[i] != v[0]) {
+ is_var = true;
+ }
+ }
+
+ std::stringstream ss;
+
+ if (is_var) {
+ ss << "[";
+ for (uint32_t i = 0; i < n; ++i) {
+ ss << v[i];
+ if (i < n - 1) {
+ ss << ", ";
+ }
+ }
+ ss << "]";
+ } else {
+ ss << v[0];
+ }
+
+ return ss.str();
+ };
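+ // e.g. a per-layer value that is constant across layers prints as "32", while a
+ // varying one prints as the full per-layer list "[32, 32, 16, 16]"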
+
+ // hparams
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
+ LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
+ LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
+
+ if (!hparams.vocab_only) {
+ LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+ LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
+ LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
+ LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
+ LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
+ LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
+ LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
+ LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
+ LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
+ LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
+ LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
+ LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
+ LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
+ LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
+ LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+ LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
+ LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
+ LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
+ LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
+ LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
+ LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
+ LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
+ LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
+ }
+ LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
+ LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
+ LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
+ // MRoPE (Multi-axis Rotary Position Embedding) sections
+ if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
+ LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
+ }
+ if (!classifier_labels.empty()) {
+ LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
+
+ size_t i = 0;
+ for (auto label : classifier_labels) {
+ LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
+ }
+ }
+ }
+
+ if (arch == LLM_ARCH_MAMBA ||
+ arch == LLM_ARCH_MAMBA2 ||
+ arch == LLM_ARCH_JAMBA ||
+ arch == LLM_ARCH_FALCON_H1 ||
+ arch == LLM_ARCH_PLAMO2 ||
+ arch == LLM_ARCH_GRANITE_HYBRID ||
+ arch == LLM_ARCH_QWEN3NEXT ||
+ arch == LLM_ARCH_QWEN35 ||
+ arch == LLM_ARCH_QWEN35MOE ||
+ arch == LLM_ARCH_NEMOTRON_H ||
+ arch == LLM_ARCH_NEMOTRON_H_MOE) {
+ LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
+ LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
+ LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
+ LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
+ LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
+ LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
+ }
+
+ LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
+ if (pimpl->n_elements >= 1e12) {
+ LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
+ } else if (pimpl->n_elements >= 1e9) {
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
+ } else if (pimpl->n_elements >= 1e6) {
+ LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
+ } else {
+ LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
+ }
+
+ // general kv
+ LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
+
+ if (arch == LLM_ARCH_DEEPSEEK) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ }
+
+ if (arch == LLM_ARCH_DEEPSEEK2) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
+ LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+ LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla());
+ LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla());
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+ }
+
+ if (arch == LLM_ARCH_QWEN2MOE) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+ }
+
+ if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ }
+
+ if (arch == LLM_ARCH_MINICPM ||
+ arch == LLM_ARCH_GRANITE ||
+ arch == LLM_ARCH_GRANITE_MOE ||
+ arch == LLM_ARCH_GRANITE_HYBRID ||
+ arch == LLM_ARCH_NEMOTRON_H_MOE) {
+ LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+ LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+ LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+ }
+
+ if (arch == LLM_ARCH_BAILINGMOE) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+ }
+
+ if (arch == LLM_ARCH_BAILINGMOE2) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+ LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
+ }
+
+ if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+ }
+
+ if (arch == LLM_ARCH_GROVEMOE) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
+ LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
+ LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
+ }
+
+ vocab.print_info();
+}
+
+ggml_backend_dev_t llama_model::dev_layer(int il) const {
+ return pimpl->dev_layer.at(il).dev;
+}
+
+ggml_backend_dev_t llama_model::dev_output() const {
+ return pimpl->dev_output.dev;
+}
+
+template<typename F>
+static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+ ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context_ptr ctx { ggml_init(params) };
+ if (!ctx) {
+ throw std::runtime_error(format("failed to create ggml context"));
+ }
+
+ ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+ ggml_tensor * op_tensor = fn(ctx.get());
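+ // attach the zero-sized buffer to the op's sources so that the device can judge
+ // support as if they were real allocations of this buffer type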
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (op_tensor->src[i] != nullptr) {
+ assert(op_tensor->src[i]->buffer == nullptr);
+ op_tensor->src[i]->buffer = buf.get();
+ }
+ }
+
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+ return op_supported;
+}
+
+template<typename F>
+static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
+ for (const auto & cur : buft_list) {
+ ggml_backend_dev_t cur_dev = cur.first;
+ ggml_backend_buffer_type_t cur_buft = cur.second;
+ if (buft_supported(cur_buft, cur_dev, fn)) {
+ return cur_buft;
+ }
+ }
+
+ throw std::runtime_error(format("no suitable buffer type found"));
+}
+
+ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
+ return ::select_buft(
+ *pimpl->dev_layer.at(il).buft_list,
+ [&](ggml_context * ctx) {
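+ // probe with a simple f32 add over n_embd elements as a representative op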
+ ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+ ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+ return ggml_add(ctx, cur, layer_dir);
+ });
+}
+
+bool llama_model::has_tensor_overrides() const {
+ return pimpl->has_tensor_overrides;
+}
+
+const ggml_tensor * llama_model::get_tensor(const char * name) const {
+ auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
+ [name](const std::pair<std::string, ggml_tensor *> & it) {
+ return it.first == name;
+ });
+ if (it == tensors_by_name.end()) {
+ return nullptr;
+ }
+
+ return it->second;
+}
+
+float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
+ return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
+}
+
+float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
+ return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+}
+
+ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
+ const uint32_t n_ctx_seq = cparams.n_ctx_seq;
+
+ // models with a single rope_freqs tensor use it regardless of the context size
+ if (layers[il].rope_freqs != nullptr) {
+ return layers[il].rope_freqs;
+ }
+
+ // otherwise choose long/short freq factors based on the context size
+ if (n_ctx_seq > hparams.n_ctx_orig_yarn) {
+ return layers[il].rope_long;
+ }
+
+ return layers[il].rope_short;
+}
+
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
+ llama_memory_i * res;
+
+ switch (arch) {
+ // Models that need specific instantiation should be handled in the
+ // switch statement
+ case LLM_ARCH_BERT:
+ case LLM_ARCH_JINA_BERT_V2:
+ case LLM_ARCH_JINA_BERT_V3:
+ case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
+ case LLM_ARCH_NEO_BERT:
+ case LLM_ARCH_WAVTOKENIZER_DEC:
+ case LLM_ARCH_MODERN_BERT:
+ case LLM_ARCH_GEMMA_EMBEDDING:
+ case LLM_ARCH_DREAM:
+ case LLM_ARCH_LLADA:
+ case LLM_ARCH_LLADA_MOE:
+ case LLM_ARCH_RND1:
+ {
+ res = nullptr;
+ } break;
+ // Models that need standard caching should rely on recurrent/hybrid
+ // checks
+ default:
+ {
+ if (llm_arch_is_recurrent(arch)) {
+ res = new llama_memory_recurrent(
+ /* model */ *this,
+ /* type_r */ GGML_TYPE_F32,
+ /* type_s */ GGML_TYPE_F32,
+ /* offload */ cparams.offload_kqv,
+ /* rs_size */ std::max((uint32_t) 1, cparams.n_seq_max),
+ /* n_seq_max */ cparams.n_seq_max,
+ /* filter */ nullptr);
+ } else if (llm_arch_is_hybrid(arch)) {
+
+ // The main difference between hybrid architectures is the
+ // layer filters, so pick the right one here
+ llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
+ llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
+ if (arch == LLM_ARCH_FALCON_H1) {
+ filter_attn = [&](int32_t) { return true; };
+ filter_recr = [&](int32_t) { return true; };
+ } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
+ filter_attn = [&](int32_t il) {
+ return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+ };
+ filter_recr = [&](int32_t il) {
+ return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+ };
+ }
+
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ // Use hybrid-iswa for hybrid models with SWA
+ res = new llama_memory_hybrid_iswa(
+ /* model */ *this,
+ /* attn_type_k */ params.type_k,
+ /* attn_type_v */ params.type_v,
+ /* attn_v_trans */ !cparams.flash_attn,
+ /* attn_swa_full */ params.swa_full,
+ /* attn_kv_size */ cparams.n_ctx,
+ /* attn_n_ubatch */ cparams.n_ubatch,
+ /* attn_n_pad */ 1,
+ /* recurrent_type_r */ GGML_TYPE_F32,
+ /* recurrent_type_s */ GGML_TYPE_F32,
+ /* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max),
+ /* n_seq_max */ cparams.n_seq_max,
+ /* offload */ cparams.offload_kqv,
+ /* unified */ cparams.kv_unified,
+ /* filter_attn */ std::move(filter_attn),
+ /* filter_recr */ std::move(filter_recr));
+ } else {
+ res = new llama_memory_hybrid(
+ /* model */ *this,
+ /* attn_type_k */ params.type_k,
+ /* attn_type_v */ params.type_v,
+ /* attn_v_trans */ !cparams.flash_attn,
+ /* attn_kv_size */ cparams.n_ctx,
+ /* attn_n_pad */ 1,
+ /* attn_n_swa */ hparams.n_swa,
+ /* attn_swa_type */ hparams.swa_type,
+ /* recurrent_type_k */ GGML_TYPE_F32,
+ /* recurrent_type_v */ GGML_TYPE_F32,
+ /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
+ /* n_seq_max */ cparams.n_seq_max,
+ /* offload */ cparams.offload_kqv,
+ /* unified */ cparams.kv_unified,
+ /* filter_attn */ std::move(filter_attn),
+ /* filter_recr */ std::move(filter_recr));
+ }
+ } else {
+ llama_memory_i::layer_reuse_cb reuse = nullptr;
+
+ if (arch == LLM_ARCH_GEMMA3N) {
+ reuse = [&](int32_t il) {
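+ // layers at or beyond n_layer_kv_from_start reuse an earlier layer's KV cache:
+ // SWA layers reuse layer n_layer_kv_from_start - 2, full-attention layers
+ // reuse layer n_layer_kv_from_start - 1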
+ if (il >= (int32_t) hparams.n_layer_kv_from_start) {
+ return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
+ }
+
+ return -1;
+ };
+ }
+
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ GGML_ASSERT(hparams.is_swa_any());
+
+ res = new llama_kv_cache_iswa(
+ *this,
+ params.type_k,
+ params.type_v,
+ !cparams.flash_attn,
+ cparams.offload_kqv,
+ params.swa_full,
+ cparams.kv_unified,
+ cparams.n_ctx_seq,
+ cparams.n_seq_max,
+ cparams.n_ubatch,
+ 1,
+ nullptr,
+ reuse);
+ } else {
+ GGML_ASSERT(!hparams.is_swa_any());
+
+ res = new llama_kv_cache(
+ *this,
+ params.type_k,
+ params.type_v,
+ !cparams.flash_attn,
+ cparams.offload_kqv,
+ cparams.kv_unified,
+ cparams.n_ctx_seq,
+ cparams.n_seq_max,
+ 1,
+ hparams.n_swa,
+ hparams.swa_type,
+ nullptr,
+ nullptr);
+ }
+ }
+ }
+ }
+
+ return res;
+}
+
+ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
+ std::unique_ptr<llm_graph_context> llm;
+
+ switch (arch) {
+ case LLM_ARCH_LLAMA:
+ {
+ llm = std::make_unique<llm_build_llama<false>>(*this, params);
+ } break;
+ case LLM_ARCH_LLAMA4:
+ {
+ if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
+ llm = std::make_unique<llm_build_llama<false>>(*this, params);
+ } else {
+ llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+ }
+ } break;
+ case LLM_ARCH_LLAMA_EMBED:
+ {
+ llm = std::make_unique<llm_build_llama<true>>(*this, params);
+ } break;
+ case LLM_ARCH_MAINCODER:
+ {
+ llm = std::make_unique<llm_build_maincoder>(*this, params);
+ } break;
+ case LLM_ARCH_DECI:
+ {
+ llm = std::make_unique<llm_build_deci>(*this, params);
+ } break;
+ case LLM_ARCH_BAICHUAN:
+ {
+ llm = std::make_unique<llm_build_baichuan>(*this, params);
+ } break;
+ case LLM_ARCH_FALCON:
+ {
+ llm = std::make_unique<llm_build_falcon>(*this, params);
+ } break;
+ case LLM_ARCH_GROK:
+ {
+ llm = std::make_unique<llm_build_grok>(*this, params);
+ } break;
+ case LLM_ARCH_STARCODER:
+ {
+ llm = std::make_unique<llm_build_starcoder>(*this, params);
+ } break;
+ case LLM_ARCH_REFACT:
+ {
+ llm = std::make_unique<llm_build_refact>(*this, params);
+ } break;
+ case LLM_ARCH_BERT:
+ case LLM_ARCH_JINA_BERT_V2:
+ case LLM_ARCH_JINA_BERT_V3:
+ case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
+ {
+ llm = std::make_unique<llm_build_bert>(*this, params);
+ } break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ llm = std::make_unique<llm_build_modern_bert>(*this, params);
+ } break;
+ case LLM_ARCH_NEO_BERT:
+ {
+ llm = std::make_unique<llm_build_neo_bert>(*this, params);
+ } break;
+ case LLM_ARCH_BLOOM:
+ {
+ llm = std::make_unique<llm_build_bloom>(*this, params);
+ } break;
+ case LLM_ARCH_MPT:
+ {
+ llm = std::make_unique<llm_build_mpt>(*this, params);
+ } break;
+ case LLM_ARCH_STABLELM:
+ {
+ llm = std::make_unique<llm_build_stablelm>(*this, params);
+ } break;
+ case LLM_ARCH_QWEN:
+ {
+ llm = std::make_unique<llm_build_qwen>(*this, params);
+ } break;
+ case LLM_ARCH_QWEN2:
+ {
+ llm = std::make_unique<llm_build_qwen2>(*this, params);
+ } break;
+ case LLM_ARCH_DREAM:
+ {
+ llm = std::make_unique<llm_build_dream>(*this, params);
+ } break;
+ case LLM_ARCH_LLADA:
+ {
+ llm = std::make_unique<llm_build_llada>(*this, params);
+ } break;
+ case LLM_ARCH_LLADA_MOE:
+ {
+ llm = std::make_unique<llm_build_llada_moe>(*this, params);
+ } break;
+ case LLM_ARCH_RND1:
+ {
+ llm = std::make_unique<llm_build_rnd1>(*this, params);
+ } break;
+ case LLM_ARCH_QWEN2VL:
+ {
+ llm = std::make_unique<llm_build_qwen2vl>(*this, params);
+ } break;
+ case LLM_ARCH_QWEN2MOE:
+ {
+ llm = std::make_unique<llm_build_qwen2moe>(*this, params);
+ } break;
+ case LLM_ARCH_QWEN3:
+ {
+ llm = std::make_unique<llm_build_qwen3>(*this, params);
+ } break;
+ case LLM_ARCH_QWEN3MOE:
+ {
+ llm = std::make_unique<llm_build_qwen3moe>(*this, params);
+ } break;
+ case LLM_ARCH_QWEN3VL:
+ {
+ llm = std::make_unique<llm_build_qwen3vl>(*this, params);
+ } break;
+ case LLM_ARCH_QWEN3VLMOE:
+ {
+ llm = std::make_unique<llm_build_qwen3vlmoe>(*this, params);
+ } break;
+ case LLM_ARCH_PHI2:
+ {
+ llm = std::make_unique<llm_build_phi2>(*this, params);
+ } break;
+ case LLM_ARCH_PHI3:
+ case LLM_ARCH_PHIMOE:
+ {
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ llm = std::make_unique<llm_build_phi3<true>> (*this, params);
+ } else {
+ llm = std::make_unique<llm_build_phi3<false>>(*this, params);
+ }
+ } break;
+ case LLM_ARCH_PLAMO:
+ {
+ llm = std::make_unique<llm_build_plamo>(*this, params);
+ } break;
+ case LLM_ARCH_PLAMO2:
+ {
+ llm = std::make_unique<llm_build_plamo2>(*this, params);
+ } break;
+ case LLM_ARCH_PLAMO3:
+ {
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ llm = std::make_unique<llm_build_plamo3<true>> (*this, params);
+ } else {
+ llm = std::make_unique<llm_build_plamo3<false>>(*this, params);
+ }
+ } break;
+ case LLM_ARCH_GPT2:
+ {
+ llm = std::make_unique<llm_build_gpt2>(*this, params);
+ } break;
+ case LLM_ARCH_CODESHELL:
+ {
+ llm = std::make_unique<llm_build_codeshell>(*this, params);
+ } break;
+ case LLM_ARCH_ORION:
+ {
+ llm = std::make_unique<llm_build_orion>(*this, params);
+ } break;
+ case LLM_ARCH_INTERNLM2:
+ {
+ llm = std::make_unique<llm_build_internlm2>(*this, params);
+ } break;
+ case LLM_ARCH_MINICPM3:
+ {
+ llm = std::make_unique<llm_build_minicpm3>(*this, params);
+ } break;
+ case LLM_ARCH_GEMMA:
+ {
+ llm = std::make_unique<llm_build_gemma>(*this, params);
+ } break;
+ case LLM_ARCH_GEMMA2:
+ {
+ llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
+ } break;
+ case LLM_ARCH_GEMMA3:
+ {
+ if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+ llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
+ } else {
+ llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
+ }
+ } break;
+ case LLM_ARCH_GEMMA3N:
+ {
+ llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
+ } break;
+ case LLM_ARCH_GEMMA_EMBEDDING:
+ {
+ llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
+ } break;
+ case LLM_ARCH_STARCODER2:
+ {
+ llm = std::make_unique<llm_build_starcoder2>(*this, params);
+ } break;
+ case LLM_ARCH_MAMBA:
+ case LLM_ARCH_MAMBA2:
+ {
+ llm = std::make_unique<llm_build_mamba>(*this, params);
+ } break;
+ case LLM_ARCH_JAMBA:
+ {
+ llm = std::make_unique<llm_build_jamba>(*this, params);
+ } break;
+ case LLM_ARCH_XVERSE:
+ {
+ llm = std::make_unique<llm_build_xverse>(*this, params);
+ } break;
+ case LLM_ARCH_COMMAND_R:
+ {
+ llm = std::make_unique<llm_build_command_r>(*this, params);
+ } break;
+ case LLM_ARCH_COHERE2:
+ {
+ llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
+ } break;
+ case LLM_ARCH_DBRX:
+ {
+ llm = std::make_unique<llm_build_dbrx>(*this, params);
+ } break;
+ case LLM_ARCH_OLMO:
+ {
+ llm = std::make_unique<llm_build_olmo>(*this, params);
+ } break;
+ case LLM_ARCH_OLMO2:
+ {
+ if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+ llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
+ } else {
+ llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
+ }
+ } break;
+ case LLM_ARCH_OLMOE:
+ {
+ llm = std::make_unique<llm_build_olmoe>(*this, params);
+ } break;
+ case LLM_ARCH_OPENELM:
+ {
+ llm = std::make_unique<llm_build_openelm>(*this, params);
+ } break;
+ case LLM_ARCH_GPTNEOX:
+ {
+ llm = std::make_unique<llm_build_gptneox>(*this, params);
+ } break;
+ case LLM_ARCH_ARCTIC:
+ {
+ llm = std::make_unique<llm_build_arctic>(*this, params);
+ } break;
+ case LLM_ARCH_DEEPSEEK:
+ {
+ llm = std::make_unique<llm_build_deepseek>(*this, params);
+ } break;
+ case LLM_ARCH_DEEPSEEK2:
+ {
+ llm = std::make_unique<llm_build_deepseek2>(*this, params);
+ } break;
+ case LLM_ARCH_CHATGLM:
+ {
+ llm = std::make_unique<llm_build_chatglm>(*this, params);
+ } break;
+ case LLM_ARCH_GLM4:
+ {
+ llm = std::make_unique<llm_build_glm4>(*this, params);
+ } break;
+ case LLM_ARCH_GLM4_MOE:
+ {
+ llm = std::make_unique<llm_build_glm4_moe>(*this, params);
+ } break;
+ case LLM_ARCH_BITNET:
+ {
+ llm = std::make_unique<llm_build_bitnet>(*this, params);
+ } break;
+ case LLM_ARCH_T5:
+ {
+ switch (params.gtype) {
+ case LLM_GRAPH_TYPE_ENCODER:
+ llm = std::make_unique<llm_build_t5_enc>(*this, params);
+ break;
+ case LLM_GRAPH_TYPE_DEFAULT:
+ case LLM_GRAPH_TYPE_DECODER:
+ llm = std::make_unique<llm_build_t5_dec>(*this, params);
+ break;
+ default:
+ GGML_ABORT("invalid graph type");
+ }
+ } break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ llm = std::make_unique<llm_build_t5_enc>(*this, params);
+ } break;
+ case LLM_ARCH_JAIS:
+ {
+ llm = std::make_unique<llm_build_jais>(*this, params);
+ } break;
+ case LLM_ARCH_NEMOTRON:
+ {
+ llm = std::make_unique<llm_build_nemotron>(*this, params);
+ } break;
+ case LLM_ARCH_NEMOTRON_H:
+ case LLM_ARCH_NEMOTRON_H_MOE:
+ {
+ llm = std::make_unique<llm_build_nemotron_h>(*this, params);
+ } break;
+ case LLM_ARCH_EXAONE:
+ {
+ llm = std::make_unique<llm_build_exaone>(*this, params);
+ } break;
+ case LLM_ARCH_EXAONE4:
+ {
+ if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+ llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
+ } else {
+ llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
+ }
+ } break;
+ case LLM_ARCH_EXAONE_MOE:
+ {
+ llm = std::make_unique<llm_build_exaone_moe>(*this, params);
+ } break;
+ case LLM_ARCH_RWKV6:
+ {
+ llm = std::make_unique<llm_build_rwkv6>(*this, params);
+ } break;
+ case LLM_ARCH_RWKV6QWEN2:
+ {
+ llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
+ } break;
+ case LLM_ARCH_RWKV7:
+ {
+ llm = std::make_unique<llm_build_rwkv7>(*this, params);
+ } break;
+ case LLM_ARCH_ARWKV7:
+ {
+ llm = std::make_unique<llm_build_arwkv7>(*this, params);
+ } break;
+ case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
+ case LLM_ARCH_MINICPM:
+ {
+ llm = std::make_unique<llm_build_granite>(*this, params);
+ } break;
+ case LLM_ARCH_GRANITE_HYBRID:
+ {
+ llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
+ } break;
+ case LLM_ARCH_CHAMELEON:
+ {
+ llm = std::make_unique<llm_build_chameleon>(*this, params);
+ } break;
+ case LLM_ARCH_WAVTOKENIZER_DEC:
+ {
+ llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
+ } break;
+ case LLM_ARCH_PLM:
+ {
+ llm = std::make_unique<llm_build_plm>(*this, params);
+ } break;
+ case LLM_ARCH_BAILINGMOE:
+ {
+ llm = std::make_unique<llm_build_bailingmoe>(*this, params);
+ } break;
+ case LLM_ARCH_BAILINGMOE2:
+ {
+ llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
+ } break;
+ case LLM_ARCH_SEED_OSS:
+ {
+ llm = std::make_unique<llm_build_seed_oss>(*this, params);
+ } break;
+ case LLM_ARCH_DOTS1:
+ {
+ llm = std::make_unique<llm_build_dots1>(*this, params);
+ } break;
+ case LLM_ARCH_ARCEE:
+ {
+ llm = std::make_unique<llm_build_arcee>(*this, params);
+ } break;
+ case LLM_ARCH_AFMOE:
+ {
+ llm = std::make_unique<llm_build_afmoe>(*this, params);
+ } break;
+ case LLM_ARCH_ERNIE4_5:
+ {
+ llm = std::make_unique<llm_build_ernie4_5>(*this, params);
+ } break;
+ case LLM_ARCH_ERNIE4_5_MOE:
+ {
+ llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
+ } break;
+ case LLM_ARCH_HUNYUAN_MOE:
+ {
+ llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
+ } break;
+ case LLM_ARCH_HUNYUAN_DENSE:
+ {
+ llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
+ } break;
+ case LLM_ARCH_SMOLLM3:
+ {
+ llm = std::make_unique<llm_build_smollm3>(*this, params);
+ } break;
+ case LLM_ARCH_OPENAI_MOE:
+ {
+ llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
+ } break;
+ case LLM_ARCH_FALCON_H1:
+ {
+ llm = std::make_unique<llm_build_falcon_h1>(*this, params);
+ } break;
+ case LLM_ARCH_LFM2:
+ case LLM_ARCH_LFM2MOE:
+ {
+ llm = std::make_unique<llm_build_lfm2>(*this, params);
+ } break;
+ case LLM_ARCH_SMALLTHINKER:
+ {
+ if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+ llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
+ } else {
+ llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
+ }
+ } break;
+ case LLM_ARCH_GROVEMOE:
+ {
+ llm = std::make_unique<llm_build_grovemoe>(*this, params);
+ } break;
+ case LLM_ARCH_APERTUS:
+ {
+ llm = std::make_unique<llm_build_apertus>(*this, params);
+ } break;
+ case LLM_ARCH_MINIMAX_M2:
+ {
+ llm = std::make_unique<llm_build_minimax_m2>(*this, params);
+ } break;
+ case LLM_ARCH_COGVLM:
+ {
+ llm = std::make_unique<llm_build_cogvlm>(*this, params);
+ } break;
+ case LLM_ARCH_PANGU_EMBED:
+ {
+ llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
+ } break;
+ case LLM_ARCH_QWEN3NEXT:
+ {
+ llm = std::make_unique<llm_build_qwen3next>(*this, params);
+ } break;
+ case LLM_ARCH_QWEN35:
+ {
+ llm = std::make_unique<llm_build_qwen35>(*this, params);
+ } break;
+ case LLM_ARCH_QWEN35MOE:
+ {
+ llm = std::make_unique<llm_build_qwen35moe>(*this, params);
+ } break;
+ case LLM_ARCH_MISTRAL3:
+ {
+ llm = std::make_unique<llm_build_mistral3>(*this, params);
+ } break;
+ case LLM_ARCH_MIMO2:
+ {
+ llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
+ } break;
+ case LLM_ARCH_KIMI_LINEAR:
+ {
+ llm = std::make_unique<llm_build_kimi_linear>(*this, params);
+ } break;
+ case LLM_ARCH_STEP35:
+ {
+ llm = std::make_unique<llm_build_step35_iswa>(*this, params);
+ } break;
+ default:
+ GGML_ABORT("fatal error");
+ }
+
+ // add on pooling layer
+ llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
+
+ // add backend sampling layers (if any)
+ llm->build_sampling();
+
+ // if the gguf model was converted with --sentence-transformers-dense-modules,
+ // there will be two additional dense projection layers;
+ // these dense linear projections are applied after pooling
+ // TODO: move reranking logic here and generalize
+ llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
+
+ llm->res->set_outputs();
+
+ return llm->res->get_gf();
+}
+
+
+//
+// interface implementation
+//
+
+llama_model_params llama_model_default_params() {
+ llama_model_params result = {
+ /*.devices =*/ nullptr,
+ /*.tensor_buft_overrides =*/ nullptr,
+ /*.n_gpu_layers =*/ -1,
+ /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
+ /*.main_gpu =*/ 0,
+ /*.tensor_split =*/ nullptr,
+ /*.progress_callback =*/ nullptr,
+ /*.progress_callback_user_data =*/ nullptr,
+ /*.kv_overrides =*/ nullptr,
+ /*.vocab_only =*/ false,
+ /*.use_mmap =*/ true,
+ /*.use_direct_io =*/ false,
+ /*.use_mlock =*/ false,
+ /*.check_tensors =*/ false,
+ /*.use_extra_bufts =*/ true,
+ /*.no_host =*/ false,
+ /*.no_alloc =*/ false,
+ };
+
+ return result;
+}
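+
+ // Example (sketch, not part of the library): the usual pattern is to start from the
+ // defaults and override selected fields before loading; the model path is a placeholder
+ // and llama_model_load_from_file() is the loader declared in llama.h.
+ //
+ //   llama_model_params mparams = llama_model_default_params();
+ //   mparams.n_gpu_layers = 32; // offload up to 32 layers to the GPU
+ //   llama_model * model = llama_model_load_from_file("model.gguf", mparams);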
+
+const llama_vocab * llama_model_get_vocab(const llama_model * model) {
+ return &model->vocab;
+}
+
+void llama_free_model(llama_model * model) {
+ llama_model_free(model);
+}
+
+void llama_model_free(llama_model * model) {
+ delete model;
+}
+
+int32_t llama_model_n_ctx_train(const llama_model * model) {
+ return model->hparams.n_ctx_train;
+}
+
+int32_t llama_model_n_embd(const llama_model * model) {
+ return model->hparams.n_embd;
+}
+
+int32_t llama_model_n_embd_inp(const llama_model * model) {
+ return model->hparams.n_embd_inp();
+}
+
+int32_t llama_model_n_embd_out(const llama_model * model) {
+ return model->hparams.n_embd_out();
+}
+
+int32_t llama_model_n_layer(const llama_model * model) {
+ return model->hparams.n_layer;
+}
+
+int32_t llama_model_n_head(const llama_model * model) {
+ return model->hparams.n_head();
+}
+
+int32_t llama_model_n_head_kv(const llama_model * model) {
+ return model->hparams.n_head_kv();
+}
+
+int32_t llama_model_n_swa(const llama_model * model) {
+ return model->hparams.n_swa;
+}
+
+uint32_t llama_model_n_cls_out(const struct llama_model * model) {
+ return model->hparams.n_cls_out;
+}
+
+const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
+ if (i < model->classifier_labels.size()) {
+ return model->classifier_labels[i].c_str();
+ }
+
+ return nullptr;
+}
+
+// deprecated
+int32_t llama_n_ctx_train(const llama_model * model) {
+ return llama_model_n_ctx_train(model);
+}
+
+// deprecated
+int32_t llama_n_embd(const llama_model * model) {
+ return llama_model_n_embd(model);
+}
+
+// deprecated
+int32_t llama_n_layer(const llama_model * model) {
+ return llama_model_n_layer(model);
+}
+
+// deprecated
+int32_t llama_n_head(const llama_model * model) {
+ return llama_model_n_head(model);
+}
+
+llama_rope_type llama_model_rope_type(const llama_model * model) {
+ switch (model->arch) {
+ // these models do not use RoPE
+ case LLM_ARCH_CLIP:
+ case LLM_ARCH_GPT2:
+ case LLM_ARCH_GPTJ:
+ case LLM_ARCH_MPT:
+ case LLM_ARCH_REFACT:
+ case LLM_ARCH_BLOOM:
+ case LLM_ARCH_MAMBA:
+ case LLM_ARCH_MAMBA2:
+ case LLM_ARCH_JAMBA:
+ case LLM_ARCH_JINA_BERT_V2:
+ case LLM_ARCH_T5:
+ case LLM_ARCH_T5ENCODER:
+ case LLM_ARCH_JAIS:
+ case LLM_ARCH_RWKV6:
+ case LLM_ARCH_RWKV6QWEN2:
+ case LLM_ARCH_RWKV7:
+ case LLM_ARCH_ARWKV7:
+ case LLM_ARCH_WAVTOKENIZER_DEC:
+ case LLM_ARCH_NEMOTRON_H:
+ case LLM_ARCH_NEMOTRON_H_MOE:
+ case LLM_ARCH_KIMI_LINEAR:
+ return LLAMA_ROPE_TYPE_NONE;
+
+ // use what we call a normal RoPE, operating on pairs of consecutive head values
+ case LLM_ARCH_LLAMA:
+ case LLM_ARCH_LLADA:
+ case LLM_ARCH_LLAMA4:
+ case LLM_ARCH_DECI:
+ case LLM_ARCH_BAICHUAN:
+ case LLM_ARCH_STARCODER:
+ case LLM_ARCH_INTERNLM2:
+ case LLM_ARCH_MINICPM:
+ case LLM_ARCH_XVERSE:
+ case LLM_ARCH_COMMAND_R:
+ case LLM_ARCH_COHERE2:
+ case LLM_ARCH_OLMO:
+ case LLM_ARCH_ARCTIC:
+ case LLM_ARCH_DEEPSEEK:
+ case LLM_ARCH_DEEPSEEK2:
+ case LLM_ARCH_PLM:
+ case LLM_ARCH_CHATGLM:
+ case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
+ case LLM_ARCH_GRANITE_HYBRID:
+ case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_BAILINGMOE:
+ case LLM_ARCH_NEO_BERT:
+ case LLM_ARCH_SMOLLM3:
+ case LLM_ARCH_ARCEE:
+ case LLM_ARCH_ERNIE4_5:
+ case LLM_ARCH_ERNIE4_5_MOE:
+ case LLM_ARCH_MISTRAL3:
+ case LLM_ARCH_LLAMA_EMBED:
+ case LLM_ARCH_MAINCODER:
+ return LLAMA_ROPE_TYPE_NORM;
+
+ // the pairs of head values are offset by n_rot/2
+ case LLM_ARCH_FALCON:
+ case LLM_ARCH_FALCON_H1:
+ case LLM_ARCH_GROK:
+ case LLM_ARCH_DBRX:
+ case LLM_ARCH_BERT:
+ case LLM_ARCH_JINA_BERT_V3:
+ case LLM_ARCH_MODERN_BERT:
+ case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
+ case LLM_ARCH_STABLELM:
+ case LLM_ARCH_BITNET:
+ case LLM_ARCH_QWEN:
+ case LLM_ARCH_QWEN2:
+ case LLM_ARCH_DREAM:
+ case LLM_ARCH_QWEN2MOE:
+ case LLM_ARCH_QWEN3:
+ case LLM_ARCH_QWEN3MOE:
+ case LLM_ARCH_LLADA_MOE:
+ case LLM_ARCH_RND1:
+ case LLM_ARCH_OLMO2:
+ case LLM_ARCH_OLMOE:
+ case LLM_ARCH_PHI2:
+ case LLM_ARCH_PHI3:
+ case LLM_ARCH_PHIMOE:
+ case LLM_ARCH_PLAMO:
+ case LLM_ARCH_PLAMO2:
+ case LLM_ARCH_PLAMO3:
+ case LLM_ARCH_GEMMA:
+ case LLM_ARCH_GEMMA2:
+ case LLM_ARCH_GEMMA3:
+ case LLM_ARCH_GEMMA3N:
+ case LLM_ARCH_GEMMA_EMBEDDING:
+ case LLM_ARCH_STARCODER2:
+ case LLM_ARCH_OPENELM:
+ case LLM_ARCH_GPTNEOX:
+ case LLM_ARCH_CODESHELL:
+ case LLM_ARCH_ORION:
+ case LLM_ARCH_NEMOTRON:
+ case LLM_ARCH_EXAONE:
+ case LLM_ARCH_EXAONE4:
+ case LLM_ARCH_EXAONE_MOE:
+ case LLM_ARCH_MINICPM3:
+ case LLM_ARCH_BAILINGMOE2:
+ case LLM_ARCH_DOTS1:
+ case LLM_ARCH_HUNYUAN_MOE:
+ case LLM_ARCH_OPENAI_MOE:
+ case LLM_ARCH_HUNYUAN_DENSE:
+ case LLM_ARCH_LFM2:
+ case LLM_ARCH_LFM2MOE:
+ case LLM_ARCH_SMALLTHINKER:
+ case LLM_ARCH_SEED_OSS:
+ case LLM_ARCH_GROVEMOE:
+ case LLM_ARCH_APERTUS:
+ case LLM_ARCH_MINIMAX_M2:
+ case LLM_ARCH_COGVLM:
+ case LLM_ARCH_PANGU_EMBED:
+ case LLM_ARCH_AFMOE:
+ case LLM_ARCH_QWEN3NEXT:
+ case LLM_ARCH_MIMO2:
+ case LLM_ARCH_STEP35:
+ return LLAMA_ROPE_TYPE_NEOX;
+
+ case LLM_ARCH_QWEN2VL:
+ return LLAMA_ROPE_TYPE_MROPE;
+ case LLM_ARCH_QWEN3VL:
+ case LLM_ARCH_QWEN3VLMOE:
+ case LLM_ARCH_QWEN35:
+ case LLM_ARCH_QWEN35MOE:
+ return LLAMA_ROPE_TYPE_IMROPE;
+
+ case LLM_ARCH_GLM4:
+ return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
+ case LLM_ARCH_GLM4_MOE:
+ return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+
+ // all model arches should be listed explicitly here
+ case LLM_ARCH_UNKNOWN:
+ GGML_ABORT("unknown architecture");
+ }
+
+ return LLAMA_ROPE_TYPE_NONE;
+}
+
+float llama_model_rope_freq_scale_train(const llama_model * model) {
+ return model->hparams.rope_freq_scale_train;
+}
+
+int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
+ const auto & it = model->gguf_kv.find(key);
+ if (it == model->gguf_kv.end()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int32_t llama_model_meta_count(const llama_model * model) {
+ return (int)model->gguf_kv.size();
+}
+
+const char * llama_model_meta_key_str(llama_model_meta_key key) {
+ switch (key) {
+ case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE: return "general.sampling.sequence";
+ case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K: return "general.sampling.top_k";
+ case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P: return "general.sampling.top_p";
+ case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P: return "general.sampling.min_p";
+ case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: return "general.sampling.xtc_probability";
+ case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD: return "general.sampling.xtc_threshold";
+ case LLAMA_MODEL_META_KEY_SAMPLING_TEMP: return "general.sampling.temp";
+ case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N: return "general.sampling.penalty_last_n";
+ case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT: return "general.sampling.penalty_repeat";
+ case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT: return "general.sampling.mirostat";
+ case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU: return "general.sampling.mirostat_tau";
+ case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA: return "general.sampling.mirostat_eta";
+ default: return nullptr;
+ }
+}
+
+int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)model->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = model->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)model->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = model->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
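+
+ // Example (sketch): enumerating all GGUF metadata of a loaded model with the
+ // index-based accessors above; `model` is assumed to be a valid llama_model pointer,
+ // and the fixed-size buffers may truncate long values (snprintf semantics).
+ //
+ //   char key[256], val[256];
+ //   for (int32_t i = 0; i < llama_model_meta_count(model); ++i) {
+ //       llama_model_meta_key_by_index    (model, i, key, sizeof(key));
+ //       llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
+ //       printf("%s = %s\n", key, val);
+ //   }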
+
+int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
+ return snprintf(buf, buf_size, "%s", model->desc().c_str());
+}
+
+uint64_t llama_model_size(const llama_model * model) {
+ return model->size();
+}
+
+const char * llama_model_chat_template(const llama_model * model, const char * name) {
+ const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
+ : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
+ const auto & it = model->gguf_kv.find(key);
+ if (it == model->gguf_kv.end()) {
+ // one-off fix for very popular models (so we are not flooded with issues)
+ // do not extend this list unless absolutely necessary
+ // Mistral-Small-2503 does not have a built-in chat template
+ llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
+ if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
+ return "mistral-v7-tekken";
+ }
+
+ return nullptr;
+ }
+
+ return it->second.c_str();
+}
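+
+ // Example (sketch): fetching the default chat template, falling back to application
+ // logic when the GGUF carries none; `model` is assumed valid.
+ //
+ //   const char * tmpl = llama_model_chat_template(model, /*name=*/nullptr);
+ //   if (tmpl == nullptr) {
+ //       // no template in the GGUF and no known one-off fallback - the caller
+ //       // has to supply its own template
+ //   }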
+
+uint64_t llama_model_n_params(const llama_model * model) {
+ return model->n_elements();
+}
+
+bool llama_model_has_encoder(const llama_model * model) {
+ switch (model->arch) {
+ case LLM_ARCH_T5: return true;
+ case LLM_ARCH_T5ENCODER: return true;
+ default: return false;
+ }
+}
+
+bool llama_model_has_decoder(const llama_model * model) {
+ switch (model->arch) {
+ case LLM_ARCH_T5ENCODER: return false;
+ default: return true;
+ }
+}
+
+llama_token llama_model_decoder_start_token(const llama_model * model) {
+ return model->hparams.dec_start_token_id;
+}
+
+bool llama_model_is_recurrent(const llama_model * model) {
+ return llm_arch_is_recurrent(model->arch);
+}
+
+bool llama_model_is_hybrid(const llama_model * model) {
+ return llm_arch_is_hybrid(model->arch);
+}
+
+bool llama_model_is_diffusion(const llama_model * model) {
+ return llm_arch_is_diffusion(model->arch);
+}
+
+const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
+ return model->tensors_by_name;
+}
diff --git a/llama.cpp/src/llama-model.h b/llama.cpp/src/llama-model.h
new file mode 100644
index 0000000..adc8ff6
--- /dev/null
+++ b/llama.cpp/src/llama-model.h
@@ -0,0 +1,563 @@
+#pragma once
+
+#include "llama.h"
+#include "llama-arch.h"
+#include "llama-graph.h"
+#include "llama-hparams.h"
+#include "llama-memory.h"
+#include "llama-vocab.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+struct llama_cparams;
+struct llama_ubatch;
+struct llama_model_loader;
+
+// available models
+enum llm_type {
+ LLM_TYPE_UNKNOWN,
+ LLM_TYPE_14M,
+ LLM_TYPE_17M,
+ LLM_TYPE_22M,
+ LLM_TYPE_33M,
+ LLM_TYPE_47M,
+ LLM_TYPE_60M,
+ LLM_TYPE_70M,
+ LLM_TYPE_80M,
+ LLM_TYPE_109M,
+ LLM_TYPE_137M,
+ LLM_TYPE_140M,
+ LLM_TYPE_149M,
+ LLM_TYPE_160M,
+ LLM_TYPE_190M,
+ LLM_TYPE_220M,
+ LLM_TYPE_250M,
+ LLM_TYPE_256M,
+ LLM_TYPE_270M,
+ LLM_TYPE_335M,
+ LLM_TYPE_350M,
+ LLM_TYPE_360M,
+ LLM_TYPE_395M,
+ LLM_TYPE_410M,
+ LLM_TYPE_450M,
+ LLM_TYPE_475M,
+ LLM_TYPE_558M,
+ LLM_TYPE_700M,
+ LLM_TYPE_770M,
+ LLM_TYPE_780M,
+ LLM_TYPE_950M,
+ LLM_TYPE_0_3B,
+ LLM_TYPE_0_5B,
+ LLM_TYPE_0_6B,
+ LLM_TYPE_1B,
+ LLM_TYPE_1_2B,
+ LLM_TYPE_1_3B,
+ LLM_TYPE_1_4B,
+ LLM_TYPE_1_5B,
+ LLM_TYPE_1_6B,
+ LLM_TYPE_1_7B,
+ LLM_TYPE_1_8B,
+ LLM_TYPE_2B,
+ LLM_TYPE_2_6B,
+ LLM_TYPE_2_8B,
+ LLM_TYPE_2_9B,
+ LLM_TYPE_3B,
+ LLM_TYPE_4B,
+ LLM_TYPE_6B,
+ LLM_TYPE_6_9B,
+ LLM_TYPE_7B,
+ LLM_TYPE_8B,
+ LLM_TYPE_9B,
+ LLM_TYPE_11B,
+ LLM_TYPE_12B,
+ LLM_TYPE_13B,
+ LLM_TYPE_14B,
+ LLM_TYPE_15B,
+ LLM_TYPE_16B,
+ LLM_TYPE_20B,
+ LLM_TYPE_26B,
+ LLM_TYPE_27B,
+ LLM_TYPE_30B,
+ LLM_TYPE_32B,
+ LLM_TYPE_34B,
+ LLM_TYPE_35B,
+ LLM_TYPE_36B,
+ LLM_TYPE_40B,
+ LLM_TYPE_65B,
+ LLM_TYPE_70B,
+ LLM_TYPE_120B,
+ LLM_TYPE_142B,
+ LLM_TYPE_236B,
+ LLM_TYPE_290B,
+ LLM_TYPE_314B,
+ LLM_TYPE_405B,
+ LLM_TYPE_671B,
+ LLM_TYPE_SMALL,
+ LLM_TYPE_MEDIUM,
+ LLM_TYPE_LARGE,
+ LLM_TYPE_XL,
+ LLM_TYPE_A1_7B,
+ LLM_TYPE_A2_7B,
+ LLM_TYPE_8x7B,
+ LLM_TYPE_8x22B,
+ LLM_TYPE_16x12B,
+ LLM_TYPE_16x3_8B,
+ LLM_TYPE_10B_128x3_66B,
+ LLM_TYPE_57B_A14B,
+ LLM_TYPE_17B_16E, // llama4 Scout
+ LLM_TYPE_17B_128E, // llama4 Maverick
+ LLM_TYPE_A13B,
+ LLM_TYPE_7B_A1B,
+ LLM_TYPE_8B_A1B, // lfm2moe
+ LLM_TYPE_16B_A1B,
+ LLM_TYPE_21B_A3B, // Ernie MoE small
+ LLM_TYPE_30B_A3B,
+ LLM_TYPE_31B_A3_5B,
+ LLM_TYPE_35B_A3B, // Qwen3.5
+ LLM_TYPE_48B_A3B, // Kimi Linear
+ LLM_TYPE_80B_A3B, // Qwen3 Next
+ LLM_TYPE_100B_A6B,
+ LLM_TYPE_102B_A12B, // Solar-Open
+ LLM_TYPE_106B_A12B, // GLM-4.5-Air
+ LLM_TYPE_196B_A11B, // Step3.5-Flash
+ LLM_TYPE_230B_A10B, // Minimax M2
+ LLM_TYPE_235B_A22B,
+ LLM_TYPE_300B_A47B, // Ernie MoE big
+ LLM_TYPE_310B_A15B, // MiMo-V2-Flash
+ LLM_TYPE_355B_A32B, // GLM-4.5
+ LLM_TYPE_E2B,
+ LLM_TYPE_E4B,
+};
+
+std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
+
+struct llama_layer_posnet {
+ // resnet
+ struct ggml_tensor * norm1 = nullptr;
+ struct ggml_tensor * norm1_b = nullptr;
+
+ struct ggml_tensor * conv1 = nullptr;
+ struct ggml_tensor * conv1_b = nullptr;
+
+ struct ggml_tensor * norm2 = nullptr;
+ struct ggml_tensor * norm2_b = nullptr;
+
+ struct ggml_tensor * conv2 = nullptr;
+ struct ggml_tensor * conv2_b = nullptr;
+
+ // attention
+ struct ggml_tensor * attn_norm = nullptr;
+ struct ggml_tensor * attn_norm_b = nullptr;
+
+ struct ggml_tensor * attn_q = nullptr;
+ struct ggml_tensor * attn_q_b = nullptr;
+
+ struct ggml_tensor * attn_k = nullptr;
+ struct ggml_tensor * attn_k_b = nullptr;
+
+ struct ggml_tensor * attn_v = nullptr;
+ struct ggml_tensor * attn_v_b = nullptr;
+
+ struct ggml_tensor * attn_o = nullptr;
+ struct ggml_tensor * attn_o_b = nullptr;
+
+ // normalize
+ struct ggml_tensor * norm = nullptr;
+ struct ggml_tensor * norm_b = nullptr;
+};
+
+struct llama_layer_convnext {
+ struct ggml_tensor * dw = nullptr;
+ struct ggml_tensor * dw_b = nullptr;
+
+ struct ggml_tensor * norm = nullptr;
+ struct ggml_tensor * norm_b = nullptr;
+
+ struct ggml_tensor * pw1 = nullptr;
+ struct ggml_tensor * pw1_b = nullptr;
+
+ struct ggml_tensor * pw2 = nullptr;
+ struct ggml_tensor * pw2_b = nullptr;
+
+ struct ggml_tensor * gamma = nullptr;
+};
+
+struct llama_layer_shortconv {
+ struct ggml_tensor * in_proj = nullptr;
+ struct ggml_tensor * conv = nullptr;
+ struct ggml_tensor * out_proj = nullptr;
+};
+
+struct llama_layer_nextn {
+ struct ggml_tensor * eh_proj = nullptr;
+ struct ggml_tensor * embed_tokens = nullptr;
+ struct ggml_tensor * enorm = nullptr;
+ struct ggml_tensor * hnorm = nullptr;
+ struct ggml_tensor * shared_head_head = nullptr;
+ struct ggml_tensor * shared_head_norm = nullptr;
+};
+
+struct llama_layer {
+ // normalization
+ struct ggml_tensor * attn_norm = nullptr;
+ struct ggml_tensor * attn_norm_b = nullptr;
+ struct ggml_tensor * attn_norm_2 = nullptr;
+ struct ggml_tensor * attn_norm_2_b = nullptr;
+ struct ggml_tensor * attn_q_norm = nullptr;
+ struct ggml_tensor * attn_q_norm_b = nullptr;
+ struct ggml_tensor * attn_k_norm = nullptr;
+ struct ggml_tensor * attn_k_norm_b = nullptr;
+ struct ggml_tensor * attn_out_norm = nullptr;
+ struct ggml_tensor * attn_out_norm_b = nullptr;
+ struct ggml_tensor * attn_q_a_norm = nullptr;
+ struct ggml_tensor * attn_kv_a_norm = nullptr;
+ struct ggml_tensor * attn_sub_norm = nullptr;
+ struct ggml_tensor * attn_post_norm = nullptr;
+ struct ggml_tensor * ffn_sub_norm = nullptr;
+ struct ggml_tensor * attn_norm_cross = nullptr;
+ struct ggml_tensor * attn_norm_enc = nullptr;
+ struct ggml_tensor * ssm_norm = nullptr;
+ struct ggml_tensor * ssm_dt_norm = nullptr;
+ struct ggml_tensor * ssm_b_norm = nullptr;
+ struct ggml_tensor * ssm_c_norm = nullptr;
+
+ // attention
+ struct ggml_tensor * wq = nullptr;
+ struct ggml_tensor * wk = nullptr;
+ struct ggml_tensor * wv = nullptr;
+ struct ggml_tensor * wo = nullptr;
+ struct ggml_tensor * wqkv = nullptr;
+ struct ggml_tensor * wq_a = nullptr;
+ struct ggml_tensor * wq_b = nullptr;
+ struct ggml_tensor * wkv_a_mqa = nullptr;
+ struct ggml_tensor * wkv_b = nullptr;
+ struct ggml_tensor * wk_b = nullptr;
+ struct ggml_tensor * wv_b = nullptr;
+ struct ggml_tensor * wq_cross = nullptr;
+ struct ggml_tensor * wk_cross = nullptr;
+ struct ggml_tensor * wv_cross = nullptr;
+ struct ggml_tensor * wo_cross = nullptr;
+ struct ggml_tensor * wq_enc = nullptr;
+ struct ggml_tensor * wk_enc = nullptr;
+ struct ggml_tensor * wv_enc = nullptr;
+ struct ggml_tensor * wo_enc = nullptr;
+ struct ggml_tensor * wqkv_gate = nullptr;
+
+ // attention bias
+ struct ggml_tensor * bq = nullptr;
+ struct ggml_tensor * bk = nullptr;
+ struct ggml_tensor * bv = nullptr;
+ struct ggml_tensor * bo = nullptr;
+ struct ggml_tensor * bqkv = nullptr;
+
+ // relative position bias
+ struct ggml_tensor * attn_rel_b = nullptr;
+ struct ggml_tensor * attn_rel_b_enc = nullptr;
+ struct ggml_tensor * attn_rel_b_cross = nullptr;
+
+ // normalization
+ struct ggml_tensor * ffn_norm = nullptr;
+ struct ggml_tensor * ffn_norm_b = nullptr;
+ struct ggml_tensor * ffn_post_norm = nullptr;
+ struct ggml_tensor * layer_out_norm = nullptr;
+ struct ggml_tensor * layer_out_norm_b = nullptr;
+ struct ggml_tensor * ffn_norm_exps = nullptr;
+ struct ggml_tensor * ffn_norm_enc = nullptr;
+
+ // ff
+ struct ggml_tensor * ffn_gate = nullptr; // w1
+ struct ggml_tensor * ffn_down = nullptr; // w2
+ struct ggml_tensor * ffn_up = nullptr; // w3
+ struct ggml_tensor * ffn_gate_enc = nullptr;
+ struct ggml_tensor * ffn_down_enc = nullptr;
+ struct ggml_tensor * ffn_up_enc = nullptr;
+
+ // ff MoE
+ struct ggml_tensor * ffn_gate_inp = nullptr;
+ struct ggml_tensor * ffn_gate_exps = nullptr;
+ struct ggml_tensor * ffn_down_exps = nullptr;
+ struct ggml_tensor * ffn_up_exps = nullptr;
+ struct ggml_tensor * ffn_gate_inp_b = nullptr;
+ struct ggml_tensor * ffn_gate_exps_b = nullptr;
+ struct ggml_tensor * ffn_down_exps_b = nullptr;
+ struct ggml_tensor * ffn_up_exps_b = nullptr;
+
+ // ff shared expert (shexp)
+ struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
+ struct ggml_tensor * ffn_gate_shexp = nullptr;
+ struct ggml_tensor * ffn_down_shexp = nullptr;
+ struct ggml_tensor * ffn_up_shexp = nullptr;
+
+ // ff adjugate experts (chexps)
+ struct ggml_tensor * ffn_gate_chexps = nullptr;
+ struct ggml_tensor * ffn_down_chexps = nullptr;
+ struct ggml_tensor * ffn_up_chexps = nullptr;
+
+ // ff bias
+ struct ggml_tensor * ffn_gate_b = nullptr;
+ struct ggml_tensor * ffn_down_b = nullptr; // b2
+ struct ggml_tensor * ffn_up_b = nullptr; // b3
+ struct ggml_tensor * ffn_act = nullptr;
+ struct ggml_tensor * ffn_exp_probs_b = nullptr;
+
+ // mamba proj
+ struct ggml_tensor * ssm_in = nullptr;
+ struct ggml_tensor * ssm_x = nullptr;
+ struct ggml_tensor * ssm_dt = nullptr;
+ struct ggml_tensor * ssm_out = nullptr;
+
+ // mamba
+ struct ggml_tensor * ssm_conv1d = nullptr;
+ struct ggml_tensor * ssm_a = nullptr;
+ struct ggml_tensor * ssm_d = nullptr;
+
+ // mamba bias
+ struct ggml_tensor * ssm_conv1d_b = nullptr;
+ struct ggml_tensor * ssm_dt_b = nullptr;
+
+ // qwen3next
+ struct ggml_tensor * ssm_beta_alpha = nullptr;
+
+ // qwen3.5
+ struct ggml_tensor * ssm_alpha = nullptr;
+
+ // rwkv
+ struct ggml_tensor * time_mix_w1 = nullptr;
+ struct ggml_tensor * time_mix_w2 = nullptr;
+ struct ggml_tensor * time_mix_lerp_x = nullptr;
+ struct ggml_tensor * time_mix_lerp_w = nullptr;
+ struct ggml_tensor * time_mix_lerp_k = nullptr;
+ struct ggml_tensor * time_mix_lerp_v = nullptr;
+ struct ggml_tensor * time_mix_lerp_r = nullptr;
+ struct ggml_tensor * time_mix_lerp_g = nullptr;
+ struct ggml_tensor * time_mix_lerp_fused = nullptr;
+
+ struct ggml_tensor * time_mix_first = nullptr;
+ struct ggml_tensor * time_mix_decay = nullptr;
+ struct ggml_tensor * time_mix_decay_w1 = nullptr;
+ struct ggml_tensor * time_mix_decay_w2 = nullptr;
+ struct ggml_tensor * time_mix_key = nullptr;
+ struct ggml_tensor * time_mix_key_b = nullptr;
+ struct ggml_tensor * time_mix_value = nullptr;
+ struct ggml_tensor * time_mix_value_b = nullptr;
+ struct ggml_tensor * time_mix_receptance = nullptr;
+ struct ggml_tensor * time_mix_receptance_b = nullptr;
+ struct ggml_tensor * time_mix_gate = nullptr;
+
+ // rwkv7
+ struct ggml_tensor * time_mix_w0 = nullptr;
+ struct ggml_tensor * time_mix_a0 = nullptr;
+ struct ggml_tensor * time_mix_a1 = nullptr;
+ struct ggml_tensor * time_mix_a2 = nullptr;
+ struct ggml_tensor * time_mix_v0 = nullptr;
+ struct ggml_tensor * time_mix_v1 = nullptr;
+ struct ggml_tensor * time_mix_v2 = nullptr;
+ struct ggml_tensor * time_mix_g1 = nullptr;
+ struct ggml_tensor * time_mix_g2 = nullptr;
+ struct ggml_tensor * time_mix_k_k = nullptr;
+ struct ggml_tensor * time_mix_k_a = nullptr;
+ struct ggml_tensor * time_mix_r_k = nullptr;
+
+ struct ggml_tensor * time_mix_ln = nullptr;
+ struct ggml_tensor * time_mix_ln_b = nullptr;
+ struct ggml_tensor * time_mix_output = nullptr;
+
+ struct ggml_tensor * channel_mix_lerp_k = nullptr;
+ struct ggml_tensor * channel_mix_lerp_r = nullptr;
+
+ struct ggml_tensor * channel_mix_key = nullptr;
+ struct ggml_tensor * channel_mix_receptance = nullptr;
+ struct ggml_tensor * channel_mix_value = nullptr;
+
+ // long rope factors
+ struct ggml_tensor * rope_long = nullptr;
+ struct ggml_tensor * rope_short = nullptr;
+ struct ggml_tensor * rope_freqs = nullptr;
+
+ // bitnet scale
+ struct ggml_tensor * wq_scale = nullptr;
+ struct ggml_tensor * wk_scale = nullptr;
+ struct ggml_tensor * wv_scale = nullptr;
+ struct ggml_tensor * wo_scale = nullptr;
+ struct ggml_tensor * ffn_gate_scale = nullptr;
+ struct ggml_tensor * ffn_up_scale = nullptr;
+ struct ggml_tensor * ffn_down_scale = nullptr;
+
+ // altup & laurel
+ struct ggml_tensor * per_layer_inp_gate = nullptr;
+ struct ggml_tensor * per_layer_proj = nullptr;
+ struct ggml_tensor * per_layer_post_norm = nullptr;
+ struct ggml_tensor * altup_correct_coef = nullptr;
+ struct ggml_tensor * altup_correct_scale = nullptr;
+ struct ggml_tensor * altup_predict_coef = nullptr;
+ struct ggml_tensor * altup_router = nullptr;
+ struct ggml_tensor * altup_router_norm = nullptr;
+ struct ggml_tensor * laurel_l = nullptr;
+ struct ggml_tensor * laurel_r = nullptr;
+ struct ggml_tensor * laurel_post_norm = nullptr;
+
+ // openai-moe
+ struct ggml_tensor * attn_sinks = nullptr;
+
+ // cogvlm
+ struct ggml_tensor * visexp_attn_wqkv = nullptr;
+ struct ggml_tensor * visexp_attn_wo = nullptr;
+ struct ggml_tensor * visexp_ffn_gate = nullptr;
+ struct ggml_tensor * visexp_ffn_down = nullptr;
+ struct ggml_tensor * visexp_ffn_up = nullptr;
+
+ // xIELU activation parameters for Apertus
+ struct ggml_tensor * ffn_act_alpha_n = nullptr;
+ struct ggml_tensor * ffn_act_alpha_p = nullptr;
+ struct ggml_tensor * ffn_act_beta = nullptr;
+ struct ggml_tensor * ffn_act_eps = nullptr;
+
+ // Kimi Linear KDA (using ssm_ prefix for consistency)
+ // Note: ssm_dt_b already exists above (mamba bias), reused for Kimi dt_bias
+ struct ggml_tensor * ssm_q_conv = nullptr;
+ struct ggml_tensor * ssm_k_conv = nullptr;
+ struct ggml_tensor * ssm_v_conv = nullptr;
+ struct ggml_tensor * ssm_f_a = nullptr;
+ struct ggml_tensor * ssm_f_b = nullptr;
+ struct ggml_tensor * ssm_beta = nullptr;
+ struct ggml_tensor * ssm_g_a = nullptr;
+ struct ggml_tensor * ssm_g_b = nullptr;
+ struct ggml_tensor * ssm_o_norm = nullptr;
+
+ struct llama_layer_posnet posnet;
+
+ struct llama_layer_convnext convnext;
+
+ struct llama_layer_shortconv shortconv;
+
+ struct llama_layer_nextn nextn;
+};
+
+struct llama_model {
+ llm_type type = LLM_TYPE_UNKNOWN;
+ llm_arch arch = LLM_ARCH_UNKNOWN;
+
+ std::string name = "n/a";
+
+ llama_hparams hparams = {};
+ llama_vocab vocab;
+
+ // for classifier models
+ std::vector<std::string> classifier_labels;
+
+ struct ggml_tensor * tok_embd = nullptr;
+ struct ggml_tensor * type_embd = nullptr;
+ struct ggml_tensor * pos_embd = nullptr;
+ struct ggml_tensor * tok_norm = nullptr;
+ struct ggml_tensor * tok_norm_b = nullptr;
+
+ struct ggml_tensor * output_norm = nullptr;
+ struct ggml_tensor * output_norm_b = nullptr;
+ struct ggml_tensor * output = nullptr;
+ struct ggml_tensor * output_b = nullptr;
+ struct ggml_tensor * output_norm_enc = nullptr;
+
+ // classifier
+ struct ggml_tensor * cls = nullptr;
+ struct ggml_tensor * cls_b = nullptr;
+ struct ggml_tensor * cls_out = nullptr;
+ struct ggml_tensor * cls_out_b = nullptr;
+
+ struct ggml_tensor * conv1d = nullptr;
+ struct ggml_tensor * conv1d_b = nullptr;
+
+ // gemma3n altup
+ struct ggml_tensor * tok_embd_per_layer = nullptr;
+ struct ggml_tensor * altup_proj = nullptr;
+ struct ggml_tensor * altup_unembd_proj = nullptr;
+ struct ggml_tensor * per_layer_model_proj = nullptr;
+ struct ggml_tensor * per_layer_proj_norm = nullptr;
+
+ std::vector<llama_layer> layers;
+
+ // Dense linear projections for SentenceTransformers models like embeddinggemma
+ // For Sentence Transformers models structure see
+ // https://sbert.net/docs/sentence_transformer/usage/custom_models.html#structure-of-sentence-transformer-models
+ struct ggml_tensor * dense_2_out_layers = nullptr;
+ struct ggml_tensor * dense_3_out_layers = nullptr;
+
+ // gguf metadata
+ std::unordered_map<std::string, std::string> gguf_kv;
+
+ // list of devices used in this model
+ std::vector<ggml_backend_dev_t> devices;
+
+ // for quantize-stats only
+ std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
+
+ // for keeping track of associated LoRA adapters
+ std::unordered_set<llama_adapter_lora *> loras;
+
+ int64_t t_load_us = 0;
+ int64_t t_start_us = 0;
+
+ explicit llama_model(const struct llama_model_params & params);
+ ~llama_model();
+
+ void load_stats (llama_model_loader & ml);
+ void load_arch (llama_model_loader & ml);
+ void load_hparams(llama_model_loader & ml);
+ void load_vocab (llama_model_loader & ml);
+ bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
+
+ std::string arch_name() const;
+ std::string type_name() const;
+
+ std::string desc() const;
+
+ size_t size() const; // file size
+ size_t n_tensors() const;
+ size_t n_devices() const;
+
+ uint32_t n_gpu_layers() const;
+ llama_split_mode split_mode() const;
+
+ std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
+
+ // total number of parameters in the model
+ uint64_t n_elements() const;
+
+ void print_info() const;
+
+ ggml_backend_dev_t dev_layer(int il) const;
+ ggml_backend_dev_t dev_output() const;
+
+ ggml_backend_buffer_type_t select_buft(int il) const;
+
+ bool has_tensor_overrides() const;
+
+ const struct ggml_tensor * get_tensor(const char * name) const;
+
+ float get_rope_freq_base (const llama_cparams & cparams, int il) const;
+ float get_rope_freq_scale(const llama_cparams & cparams, int il) const;
+
+ ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
+
+ // TODO: move this to new llm_arch_model_i interface
+ llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;
+
+ // TODO: move this to new llm_arch_model_i interface
+ ggml_cgraph * build_graph(const llm_graph_params & params) const;
+
+private:
+ llama_model_params params;
+
+ struct impl;
+ std::unique_ptr<impl> pimpl;
+};
+
+const char * llm_type_name(llm_type type);
+
+// For internal test use
+// TODO: remove
+const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
diff --git a/llama.cpp/src/llama-quant.cpp b/llama.cpp/src/llama-quant.cpp
new file mode 100644
index 0000000..a789164
--- /dev/null
+++ b/llama.cpp/src/llama-quant.cpp
@@ -0,0 +1,1069 @@
+#include "llama-quant.h"
+#include "llama-impl.h"
+#include "llama-model.h"
+#include "llama-model-loader.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <cinttypes>
+#include <fstream>
+#include <mutex>
+#include <regex>
+#include <thread>
+#include <unordered_map>
+
+// Quantization types. Changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+ std::string name;
+ ggml_type quant = GGML_TYPE_COUNT;
+};
+
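+// write n zero bytes to the stream - used for padding (e.g. to the GGUF alignment)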
+static void zeros(std::ofstream & file, size_t n) {
+ char zero = 0;
+ for (size_t i = 0; i < n; ++i) {
+ file.write(&zero, 1);
+ }
+}
+
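+// remap_layer: given a tensor name like "blk.<i>." and a list of block indices to prune,
+// renumber the block index so the remaining blocks stay contiguous; returns an empty
+// string for tensors that belong to a pruned block. `mapped` caches the old -> new
+// index mapping and `next_id` tracks the next free index across calls.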
+static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune, std::map<int, std::string> & mapped, int & next_id) {
+ if (prune.empty()) {
+ return orig_name;
+ }
+
+ static const std::regex pattern(R"(blk\.(\d+)\.)");
+ if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
+ const int blk = std::stoi(match[1]);
+ std::string new_name = orig_name;
+
+ if (mapped.count(blk)) {
+ // Already mapped, do nothing
+ } else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
+ mapped[blk] = "";
+ } else if (blk < prune.front()) {
+ mapped[blk] = std::to_string(blk);
+ next_id = blk + 1;
+ } else {
+ mapped[blk] = std::to_string(next_id);
+ ++next_id;
+ }
+
+ return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]);
+ }
+
+ return orig_name;
+}
+
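+// remap_imatrix: inverse of remap_layer - map a renumbered tensor name back to its
+// original block index so the imatrix entry collected on the unpruned model can be found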
+static std::string remap_imatrix(const std::string & orig_name, const std::map<int, std::string> & mapped) {
+ if (mapped.empty()) {
+ return orig_name;
+ }
+
+ static const std::regex pattern(R"(blk\.(\d+)\.)");
+ if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
+ const std::string blk(match[1]);
+ std::string new_name = orig_name;
+
+ for (const auto & p : mapped) {
+ if (p.second == blk) {
+ LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
+ return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
+ }
+ }
+ GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str());
+ }
+
+ return orig_name;
+}
+
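+// bookkeeping for one quantization run: per-tensor-kind totals (n_*) and running
+// indices (i_*) that drive the per-layer type-selection heuristics below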
+struct quantize_state_impl {
+ const llama_model & model;
+ const llama_model_quantize_params * params;
+
+ int n_attention_wv = 0;
+ int n_ffn_down = 0;
+ int n_ffn_gate = 0;
+ int n_ffn_up = 0;
+ int i_attention_wv = 0;
+ int i_ffn_down = 0;
+ int i_ffn_gate = 0;
+ int i_ffn_up = 0;
+
+ int n_k_quantized = 0;
+ int n_fallback = 0;
+
+ bool has_imatrix = false;
+
+ // used to figure out if a model shares tok_embd with the output weight
+ bool has_output = false;
+
+ quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
+ : model(model)
+ , params(params)
+ {}
+};
+
+static void llama_tensor_dequantize_impl(
+ ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+ const size_t nelements, const int nthread
+) {
+ if (output.size() < nelements) {
+ output.resize(nelements);
+ }
+ float * f32_output = (float *) output.data();
+
+ const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
+ if (ggml_is_quantized(tensor->type)) {
+ if (qtype->to_float == NULL) {
+ throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
+ }
+ } else if (tensor->type != GGML_TYPE_F16 &&
+ tensor->type != GGML_TYPE_BF16) {
+ throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
+ }
+
+ if (nthread < 2) {
+ if (tensor->type == GGML_TYPE_F16) {
+ ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
+ } else if (tensor->type == GGML_TYPE_BF16) {
+ ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
+ } else if (ggml_is_quantized(tensor->type)) {
+ qtype->to_float(tensor->data, f32_output, nelements);
+ } else {
+ GGML_ABORT("fatal error"); // unreachable
+ }
+ return;
+ }
+
+ size_t block_size;
+ if (tensor->type == GGML_TYPE_F16 ||
+ tensor->type == GGML_TYPE_BF16) {
+ block_size = 1;
+ } else {
+ block_size = (size_t)ggml_blck_size(tensor->type);
+ }
+
+ size_t block_size_bytes = ggml_type_size(tensor->type);
+
+ GGML_ASSERT(nelements % block_size == 0);
+ size_t nblocks = nelements / block_size;
+ size_t blocks_per_thread = nblocks / nthread;
+ size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+ size_t in_buff_offs = 0;
+ size_t out_buff_offs = 0;
+
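+ // split the tensor into contiguous runs of whole blocks, one run per thread;
+ // the last thread additionally takes the remainder blocks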
+ for (int tnum = 0; tnum < nthread; tnum++) {
+ size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+ size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
+ size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+
+ auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
+ if (typ == GGML_TYPE_F16) {
+ ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+ } else if (typ == GGML_TYPE_BF16) {
+ ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
+ } else {
+ qtype->to_float(inbuf, outbuf, nels);
+ }
+ };
+ workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
+ in_buff_offs += thr_block_bytes;
+ out_buff_offs += thr_elems;
+ }
+ for (auto & w : workers) { w.join(); }
+ workers.clear();
+}
+
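+// choose the quantization type for one tensor: start from the ftype's default type and
+// adjust it based on the tensor's role (attn_v, ffn_down, output, ...), the model shape
+// (GQA factor, expert count) and whether an importance matrix is available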
+static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+ const std::string name = ggml_get_name(tensor);
+
+ // TODO: avoid hardcoded tensor names - use the TN_* constants
+ const llm_arch arch = qs.model.arch;
+ const auto tn = LLM_TN(arch);
+
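+ // heuristic: spend more bits on the first eighth and last eighth of the layers,
+ // and on every third layer in between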
+ auto use_more_bits = [](int i_layer, int n_layers) -> bool {
+ return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
+ };
+ const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+ auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
+ if (n_expert > 1) {
+ // Believe it or not, the "experts" in the FFN of Mixtral-8x7B are not consecutive, but are
+ // occasionally sprinkled throughout the model. Hence, simply dividing i_ffn_down by n_expert
+ // does not give the current layer as I initially thought, and we need to resort to parsing
+ // the tensor name.
+ if (sscanf(name, "blk.%d.", &i_layer) != 1) {
+ throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
+ }
+ if (i_layer < 0 || i_layer >= n_layer) {
+ throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
+ }
+ }
+ return std::make_pair(i_layer, n_layer);
+ };
+
+ // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+ // with the quantization of the output tensor
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
+ if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
+ new_type = qs.params->output_tensor_type;
+ } else {
+ const int64_t nx = tensor->ne[0];
+ const int64_t qk_k = ggml_blck_size(new_type);
+
+ if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+ new_type = GGML_TYPE_Q8_0;
+ }
+ else if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
+ new_type = GGML_TYPE_Q8_0;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+ new_type = GGML_TYPE_Q5_K;
+ }
+ else if (new_type != GGML_TYPE_Q8_0) {
+ new_type = GGML_TYPE_Q6_K;
+ }
+ }
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+ // MoE tensors -> MXFP4
+ // other tensors -> Q8_0
+ if (tensor->ne[2] > 1) {
+ new_type = GGML_TYPE_MXFP4;
+ } else {
+ new_type = GGML_TYPE_Q8_0;
+ }
+ } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
+ if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
+ new_type = qs.params->token_embedding_type;
+ } else {
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+ new_type = GGML_TYPE_Q2_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+ new_type = GGML_TYPE_IQ3_S;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+ new_type = GGML_TYPE_IQ3_S;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
+ new_type = GGML_TYPE_Q4_K;
+ }
+ }
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+ ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+ if (name.find("attn_v.weight") != std::string::npos) {
+ if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
+ else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+ ++qs.i_attention_wv;
+ }
+ else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
+ new_type = GGML_TYPE_Q4_K;
+ }
+ else if (name.find("ffn_down") != std::string::npos) {
+ if (qs.i_ffn_down < qs.n_ffn_down/8) {
+ new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+ }
+ ++qs.i_ffn_down;
+ }
+ else if (name.find("attn_output.weight") != std::string::npos) {
+ if (qs.model.hparams.n_expert == 8) {
+ new_type = GGML_TYPE_Q5_K;
+ } else {
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
+ }
+ }
+ } else if (name.find("attn_v.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+ new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
+ new_type = GGML_TYPE_Q4_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+ new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+ }
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
+ new_type = GGML_TYPE_Q4_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+ new_type = GGML_TYPE_Q4_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+ new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
+ new_type = GGML_TYPE_Q5_K;
+ }
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+ use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+ if (qs.model.type == LLM_TYPE_70B) {
+ // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+ // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+ // nearly negligible increase in model size by quantizing this tensor with more bits:
+ if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+ }
+ if (qs.model.hparams.n_expert == 8) {
+ // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+ // TODO: explore better strategies
+ new_type = GGML_TYPE_Q8_0;
+ }
+ ++qs.i_attention_wv;
+ } else if (name.find("attn_k.weight") != std::string::npos) {
+ if (qs.model.hparams.n_expert == 8) {
+ // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+ // TODO: explore better strategies
+ new_type = GGML_TYPE_Q8_0;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+ new_type = GGML_TYPE_IQ3_XXS;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+ new_type = GGML_TYPE_IQ2_S;
+ }
+ } else if (name.find("attn_q.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+ new_type = GGML_TYPE_IQ3_XXS;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+ new_type = GGML_TYPE_IQ2_S;
+ }
+ } else if (name.find("ffn_down") != std::string::npos) {
+ auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+ int i_layer = info.first, n_layer = info.second;
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+ if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
+ new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+ new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
+ : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
+ : GGML_TYPE_Q3_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
+ (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
+ new_type = GGML_TYPE_Q4_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+ new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+ if (arch == LLM_ARCH_FALCON) {
+ new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
+ use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ } else {
+ if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+ }
+ }
+ else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+ new_type = GGML_TYPE_Q5_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
+ new_type = GGML_TYPE_Q5_K;
+ }
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
+ && qs.has_imatrix && i_layer < n_layer/8) {
+ // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+ // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+ // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+ new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+ }
+ ++qs.i_ffn_down;
+ } else if (name.find("attn_output.weight") != std::string::npos) {
+ if (arch != LLM_ARCH_FALCON) {
+ if (qs.model.hparams.n_expert == 8) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+ ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
+ ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+ ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
+ new_type = GGML_TYPE_Q5_K;
+ }
+ } else {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
+ }
+ } else {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+ }
+ }
+ else if (name.find("attn_qkv.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+ new_type = GGML_TYPE_Q4_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+ }
+ else if (name.find("ffn_gate") != std::string::npos) {
+ auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
+ int i_layer = info.first, n_layer = info.second;
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+ new_type = GGML_TYPE_IQ3_XXS;
+ }
+ ++qs.i_ffn_gate;
+ }
+ else if (name.find("ffn_up") != std::string::npos) {
+ auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
+ int i_layer = info.first, n_layer = info.second;
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+ new_type = GGML_TYPE_IQ3_XXS;
+ }
+ ++qs.i_ffn_up;
+ }
+
+ return new_type;
+}
+
+static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+ if (nthread < 2) {
+ // single-thread
+ size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
+ if (!ggml_validate_row_data(new_type, new_data, new_size)) {
+ throw std::runtime_error("quantized data validation failed");
+ }
+ return new_size;
+ }
+
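+ // multi-threaded path: workers pull chunks of rows from a shared counter until all
+ // rows are quantized, validating each chunk and accumulating the total size under
+ // the mutex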
+ std::mutex mutex;
+ int64_t counter = 0;
+ size_t new_size = 0;
+ bool valid = true;
+ auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
+ nrows, n_per_row, imatrix]() {
+ const int64_t nrows_per_chunk = chunk_size / n_per_row;
+ size_t local_size = 0;
+ while (true) {
+ std::unique_lock<std::mutex> lock(mutex);
+ int64_t first_row = counter; counter += nrows_per_chunk;
+ if (first_row >= nrows) {
+ if (local_size > 0) {
+ new_size += local_size;
+ }
+ break;
+ }
+ lock.unlock();
+ const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+ size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+ local_size += this_size;
+
+ // validate the quantized data
+ const size_t row_size = ggml_row_size(new_type, n_per_row);
+ void * this_data = (char *) new_data + first_row * row_size;
+ if (!ggml_validate_row_data(new_type, this_data, this_size)) {
+ std::unique_lock<std::mutex> lock(mutex);
+ valid = false;
+ break;
+ }
+ }
+ };
+ for (int it = 0; it < nthread - 1; ++it) {
+ workers.emplace_back(compute);
+ }
+ compute();
+ for (auto & w : workers) { w.join(); }
+ workers.clear();
+ if (!valid) {
+ throw std::runtime_error("quantized data validation failed");
+ }
+ return new_size;
+}
+
+static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+ ggml_type default_type;
+ llama_ftype ftype = params->ftype;
+
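+ // map the requested file type to the baseline tensor type; individual tensors may still
+ // be promoted or demoted later by llama_tensor_get_type()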
+ switch (params->ftype) {
+ case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
+ case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
+ case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
+ case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
+ case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
+
+ case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break;
+
+ // K-quants
+ case LLAMA_FTYPE_MOSTLY_Q2_K_S:
+ case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break;
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_S; break;
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S:
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M:
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S:
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break;
+ case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break;
+ case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break;
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
+ case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
+
+ default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
+ }
+
+ int nthread = params->nthread;
+
+ if (nthread <= 0) {
+ nthread = std::thread::hardware_concurrency();
+ }
+
+ // mmap consistently increases speed on Linux, and also increases speed on Windows with
+ // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+ constexpr bool use_mmap = true;
+#else
+ constexpr bool use_mmap = false;
+#endif
+
+ llama_model_kv_override * kv_overrides = nullptr;
+ if (params->kv_overrides) {
+ auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+ kv_overrides = v->data();
+ }
+
+ std::vector<std::string> splits = {};
+ llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+ ml.init_mappings(false); // no prefetching
+
+ llama_model model(llama_model_default_params());
+
+ model.load_arch (ml);
+ model.load_hparams(ml);
+ model.load_stats (ml);
+
+ quantize_state_impl qs(model, params);
+
+ if (params->only_copy) {
+ ftype = ml.ftype;
+ }
+ const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
+ if (params->imatrix) {
+ imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
+ if (imatrix_data) {
+ LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+ qs.has_imatrix = true;
+ // check imatrix for nans or infs
+ for (const auto & kv : *imatrix_data) {
+ for (float f : kv.second) {
+ if (!std::isfinite(f)) {
+ throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+ }
+ }
+ }
+ }
+ }
+
+ const size_t align = GGUF_DEFAULT_ALIGNMENT;
+ gguf_context_ptr ctx_out { gguf_init_empty() };
+
+ std::vector<int> prune_list = {};
+ if (params->prune_layers) {
+ prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
+ }
+
+ // copy the KV pairs from the input file
+ gguf_set_kv (ctx_out.get(), ml.meta.get());
+ gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
+ gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
+
+ // Remove split metadata
+ gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
+ gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
+ gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
+
+ if (params->kv_overrides) {
+ const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
+ for (const auto & o : overrides) {
+ if (o.key[0] == 0) break;
+ if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
+ gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
+ // Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
+ gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)std::abs(o.val_i64));
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
+ gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
+ gguf_set_val_str(ctx_out.get(), o.key, o.val_str);
+ } else {
+ LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
+ }
+ }
+ }
+
+ std::map<int, std::string> mapped;
+ int blk_id = 0;
+
+ // make a list of weights
+ std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
+ tensors.reserve(ml.weights_map.size());
+ for (const auto & it : ml.weights_map) {
+ const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
+ if (remapped_name.empty()) {
+ LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
+ continue;
+ }
+
+ if (remapped_name != it.first) {
+ ggml_set_name(it.second.tensor, remapped_name.c_str());
+ LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
+ }
+ tensors.push_back(&it.second);
+ }
+ if (!prune_list.empty()) {
+ gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
+ }
+
+ // keep_split requires that the weights are sorted by split index
+ if (params->keep_split) {
+ std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
+ if (a->idx == b->idx) {
+ return a->offs < b->offs;
+ }
+ return a->idx < b->idx;
+ });
+ }
+
+ for (const auto * it : tensors) {
+ const struct ggml_tensor * tensor = it->tensor;
+
+ const std::string name = ggml_get_name(tensor);
+
+ // TODO: avoid hardcoded tensor names - use the TN_* constants
+ if (name.find("attn_v.weight") != std::string::npos ||
+ name.find("attn_qkv.weight") != std::string::npos ||
+ name.find("attn_kv_b.weight")!= std::string::npos) {
+ ++qs.n_attention_wv;
+ } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+ qs.has_output = true;
+ }
+ }
+
+ qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
+
+ size_t total_size_org = 0;
+ size_t total_size_new = 0;
+
+ std::vector<std::thread> workers;
+ workers.reserve(nthread);
+
+ int idx = 0;
+
+ std::vector<no_init<uint8_t>> read_data;
+ std::vector<no_init<uint8_t>> work;
+ std::vector<no_init<float>> f32_conv_buf;
+
+ uint16_t n_split = 1;
+
+ // Assume split index is continuous
+ if (params->keep_split) {
+ for (const auto * it : tensors) {
+ n_split = std::max(uint16_t(it->idx + 1), n_split);
+ }
+ }
+ std::vector<gguf_context_ptr> ctx_outs(n_split);
+ ctx_outs[0] = std::move(ctx_out);
+
+ // populate the original tensors so we get an initial meta data
+ for (const auto * it : tensors) {
+ uint16_t i_split = params->keep_split ? it->idx : 0;
+ ggml_tensor * tensor = it->tensor;
+ if (!ctx_outs[i_split]) {
+ ctx_outs[i_split].reset(gguf_init_empty());
+ }
+ gguf_add_tensor(ctx_outs[i_split].get(), tensor);
+ }
+
+ // Set split info if needed
+ if (n_split > 1) {
+ for (size_t i = 0; i < ctx_outs.size(); ++i) {
+ gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+ gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+ gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
+ }
+ }
+
+ int cur_split = -1;
+ std::ofstream fout;
+ auto close_ofstream = [&]() {
+ // Write metadata and close file handler
+ if (fout.is_open()) {
+ fout.seekp(0);
+ std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split].get()));
+ gguf_get_meta_data(ctx_outs[cur_split].get(), data.data());
+ fout.write((const char *) data.data(), data.size());
+ fout.close();
+ }
+ };
+ auto new_ofstream = [&](int index) {
+ cur_split = index;
+ GGML_ASSERT(ctx_outs[cur_split] && "Found uninitialized gguf_context");
+ std::string fname = fname_out;
+ if (params->keep_split) {
+ std::vector<char> split_path(llama_path_max(), 0);
+ llama_split_path(split_path.data(), split_path.size(), fname_out.c_str(), cur_split, n_split);
+ fname = std::string(split_path.data());
+ }
+
+ fout = std::ofstream(fname, std::ios::binary);
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+ const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split].get());
+ // placeholder for the meta data
+ ::zeros(fout, meta_size);
+ };
+
+ const auto tn = LLM_TN(model.arch);
+ new_ofstream(0);
+ for (const auto * it : tensors) {
+ const auto & weight = *it;
+ ggml_tensor * tensor = weight.tensor;
+ if (weight.idx != cur_split && params->keep_split) {
+ close_ofstream();
+ new_ofstream(weight.idx);
+ }
+
+ const std::string name = ggml_get_name(tensor);
+
+ if (!ml.use_mmap) {
+ if (read_data.size() < ggml_nbytes(tensor)) {
+ read_data.resize(ggml_nbytes(tensor));
+ }
+ tensor->data = read_data.data();
+ }
+ ml.load_data_for(tensor);
+
+ LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
+ ++idx, ml.n_tensors,
+ ggml_get_name(tensor),
+ llama_format_tensor_shape(tensor).c_str(),
+ ggml_type_name(tensor->type));
+
+ // This used to be a regex, but <regex> has an extreme cost to compile times.
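+ // (for names shorter than 6 chars, name.size() - 6 wraps around as size_t,
+ // so rfind() can never match and the comparison is safely false)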
+ bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
+
+ // quantize only 2D and 3D tensors (experts)
+ quantize &= (ggml_n_dims(tensor) >= 2);
+
+ // do not quantize norm tensors
+ quantize &= name.find("_norm.weight") == std::string::npos;
+
+ quantize &= params->quantize_output_tensor || name != "output.weight";
+ quantize &= !params->only_copy;
+
+ // do not quantize expert gating tensors
+ // NOTE: can't use LLM_TN here because the layer number is not known
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
+ // these are very small (e.g. 4x4)
+ quantize &= name.find("altup") == std::string::npos;
+ quantize &= name.find("laurel") == std::string::npos;
+
+ // these are not too big so keep them as they are
+ quantize &= name.find("per_layer_model_proj") == std::string::npos;
+
+ // do not quantize positional embeddings and token types (BERT)
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
+
+ // do not quantize Mamba's / Kimi's small conv1d weights
+ // NOTE: can't use LLM_TN here because the layer number is not known
+ quantize &= name.find("ssm_conv1d") == std::string::npos;
+ quantize &= name.find("shortconv.conv.weight") == std::string::npos;
+
+ // do not quantize RWKV's small yet 2D weights
+ quantize &= name.find("time_mix_first.weight") == std::string::npos;
+ quantize &= name.find("time_mix_w0.weight") == std::string::npos;
+ quantize &= name.find("time_mix_w1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_v0.weight") == std::string::npos;
+ quantize &= name.find("time_mix_v1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_v2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_a0.weight") == std::string::npos;
+ quantize &= name.find("time_mix_a1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_a2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_g1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_g2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
+
+ // do not quantize relative position bias (T5)
+ quantize &= name.find("attn_rel_b.weight") == std::string::npos;
+
+ // do not quantize specific multimodal tensors
+ quantize &= name.find(".position_embd.") == std::string::npos;
+
+ ggml_type new_type;
+ void * new_data;
+ size_t new_size;
+
+ if (quantize) {
+ new_type = default_type;
+
+ // get more optimal quantization type based on the tensor shape, layer, etc.
+ if (!params->pure && ggml_is_quantized(default_type)) {
+ // if the user provided tensor types - use those
+ bool manual = false;
+ if (params->tensor_types) {
+ const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+ const std::string tensor_name(tensor->name);
+ for (const auto & [tname, qtype] : tensor_types) {
+ if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
+ if (qtype != new_type) {
+ LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype));
+ new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
+ manual = true;
+ break;
+ }
+ }
+ }
+ }
+
+ // if not manual - use the standard logic for choosing the quantization type based on the selected mixture
+ if (!manual) {
+ new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+ }
+
+ // incompatible tensor shapes are handled here - fallback to a compatible type
+ {
+ bool convert_incompatible_tensor = false;
+
+ const int64_t nx = tensor->ne[0];
+ const int64_t ny = tensor->ne[1];
+ const int64_t qk_k = ggml_blck_size(new_type);
+
+ if (nx % qk_k != 0) {
+ LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
+ convert_incompatible_tensor = true;
+ } else {
+ ++qs.n_k_quantized;
+ }
+
+ if (convert_incompatible_tensor) {
+ switch (new_type) {
+ case GGML_TYPE_TQ1_0:
+ case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
+ case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ3_S:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+ case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+ case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+ case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+ default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+ }
+ if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
+ new_type = GGML_TYPE_F16;
+ }
+ LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+ ++qs.n_fallback;
+ }
+ }
+ }
+ if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
+ new_type = params->token_embedding_type;
+ }
+ if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
+ new_type = params->output_tensor_type;
+ }
+
+ // If we've decided to quantize to the same type the tensor is already
+ // in then there's nothing to do.
+ quantize = tensor->type != new_type;
+ }
+
+ if (!quantize) {
+ new_type = tensor->type;
+ new_data = tensor->data;
+ new_size = ggml_nbytes(tensor);
+ LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+ } else {
+ const int64_t nelements = ggml_nelements(tensor);
+
+ const float * imatrix = nullptr;
+ if (imatrix_data) {
+ auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
+ if (it == imatrix_data->end()) {
+ LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
+ } else {
+ if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
+ imatrix = it->second.data();
+ } else {
+ LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
+
+ // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
+ // this is a significant error and it may be a good idea to abort the process if this happens,
+ // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
+ // tok_embd should be ignored in this case, since it always causes this warning
+ if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+ throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
+ }
+ }
+ }
+ }
+ if ((new_type == GGML_TYPE_IQ2_XXS ||
+ new_type == GGML_TYPE_IQ2_XS ||
+ new_type == GGML_TYPE_IQ2_S ||
+ new_type == GGML_TYPE_IQ1_S ||
+ (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") != 0 && strcmp(tensor->name, "output.weight") != 0) ||
+ (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
+ LLAMA_LOG_ERROR("\n\n============================================================\n");
+ LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
+ LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
+ LLAMA_LOG_ERROR("============================================================\n\n");
+ throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
+ }
+
+ float * f32_data;
+
+ if (tensor->type == GGML_TYPE_F32) {
+ f32_data = (float *) tensor->data;
+ } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
+ throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
+ } else {
+ llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
+ f32_data = (float *) f32_conv_buf.data();
+ }
+
+ LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
+ fflush(stdout);
+
+ if (work.size() < (size_t)nelements * 4) {
+ work.resize(nelements * 4); // upper bound on size
+ }
+ new_data = work.data();
+
+ const int64_t n_per_row = tensor->ne[0];
+ const int64_t nrows = tensor->ne[1];
+
+ static const int64_t min_chunk_size = 32 * 512;
+ const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
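+ // e.g. with n_per_row = 4096 the chunk is rounded up to 4 rows (16384 values)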
+
+ const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
+ const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+ const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
+
+ // quantize each expert separately since they have different importance matrices
+ new_size = 0;
+ for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
+ const float * f32_data_03 = f32_data + i03 * nelements_matrix;
+ void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
+ const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
+
+ new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+
+ // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
+#if 0
+ if (new_type == GGML_TYPE_MXFP4) {
+ auto * x = f32_data_03;
+
+ //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
+ std::vector<float> deq(nrows*n_per_row);
+ const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
+ qtype->to_float(new_data_03, deq.data(), deq.size());
+
+ double err = 0.0f;
+ for (int i = 0; i < (int) deq.size(); ++i) {
+ err += fabsf(deq[i] - x[i]);
+ //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
+ if (deq[i] != x[i]) {
+ LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
+ }
+ }
+ //LLAMA_LOG_INFO("err = %f\n", err);
+ GGML_ASSERT(err == 0.00000);
+ }
+#endif
+ }
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+ }
+ total_size_org += ggml_nbytes(tensor);
+ total_size_new += new_size;
+
+ // update the gguf meta data as we go
+ gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
+ GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
+ gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
+
+ // write tensor data + padding
+ fout.write((const char *) new_data, new_size);
+ zeros(fout, GGML_PAD(new_size, align) - new_size);
+ }
+ close_ofstream();
+
+ LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
+
+ if (qs.n_fallback > 0) {
+ LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
+ __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+ }
+}
+
+//
+// interface implementation
+//
+
+llama_model_quantize_params llama_model_quantize_default_params() {
+ llama_model_quantize_params result = {
+ /*.nthread =*/ 0,
+ /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+ /*.output_tensor_type =*/ GGML_TYPE_COUNT,
+ /*.token_embedding_type =*/ GGML_TYPE_COUNT,
+ /*.allow_requantize =*/ false,
+ /*.quantize_output_tensor =*/ true,
+ /*.only_copy =*/ false,
+ /*.pure =*/ false,
+ /*.keep_split =*/ false,
+ /*.imatrix =*/ nullptr,
+ /*.kv_overrides =*/ nullptr,
+ /*.tensor_types =*/ nullptr,
+ /*.prune_layers =*/ nullptr
+ };
+
+ return result;
+}
+
+uint32_t llama_model_quantize(
+ const char * fname_inp,
+ const char * fname_out,
+ const llama_model_quantize_params * params) {
+ try {
+ llama_model_quantize_impl(fname_inp, fname_out, params);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/llama.cpp/src/llama-quant.h b/llama.cpp/src/llama-quant.h
new file mode 100644
index 0000000..6f70f09
--- /dev/null
+++ b/llama.cpp/src/llama-quant.h
@@ -0,0 +1 @@
+#pragma once
diff --git a/llama.cpp/src/llama-sampler.cpp b/llama.cpp/src/llama-sampler.cpp
new file mode 100644
index 0000000..9bbc5db
--- /dev/null
+++ b/llama.cpp/src/llama-sampler.cpp
@@ -0,0 +1,3885 @@
+#include "llama-sampler.h"
+
+#include "llama-impl.h"
+#include "llama-vocab.h"
+#include "llama-grammar.h"
+
+#include "ggml-cpp.h"
+
+#include <array>
+#include <algorithm>
+#include <cassert>
+#include <cfloat>
+#include <chrono>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <numeric>
+#include <random>
+#include <unordered_map>
+#include <stdexcept>
+
+// the ring buffer works similarly to std::deque, but with a fixed capacity
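+// illustrative example: with capacity 3, pushing 1, 2, 3, 4 keeps {2, 3, 4};
+// front() == 2 (the oldest), rat(0) == 4 (the most recent), rat(2) == 2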
+template<typename T>
+struct ring_buffer {
+ ring_buffer(size_t cap) : capacity(cap), data(cap) {}
+
+ T & front() {
+ if (sz == 0) {
+ throw std::runtime_error("ring buffer is empty");
+ }
+ return data[first];
+ }
+
+ const T & front() const {
+ if (sz == 0) {
+ throw std::runtime_error("ring buffer is empty");
+ }
+ return data[first];
+ }
+
+ T & back() {
+ if (sz == 0) {
+ throw std::runtime_error("ring buffer is empty");
+ }
+ return data[pos];
+ }
+
+ const T & back() const {
+ if (sz == 0) {
+ throw std::runtime_error("ring buffer is empty");
+ }
+ return data[pos];
+ }
+
+ void push_back(const T & value) {
+ if (capacity == 0) {
+ throw std::runtime_error("ring buffer: capacity is zero");
+ }
+
+ if (sz == capacity) {
+ // advance the start when buffer is full
+ first = (first + 1) % capacity;
+ } else {
+ sz++;
+ }
+ data[pos] = value;
+ pos = (pos + 1) % capacity;
+ }
+
+ T pop_front() {
+ if (sz == 0) {
+ throw std::runtime_error("ring buffer is empty");
+ }
+ T value = data[first];
+ first = (first + 1) % capacity;
+ sz--;
+ return value;
+ }
+
+ //T & operator[](size_t i) {
+ // if (i >= sz) {
+ // throw std::runtime_error("ring buffer: index out of bounds");
+ // }
+ // return data[(first + i) % capacity];
+ //}
+
+ //const T & at(size_t i) const {
+ // if (i >= sz) {
+ // throw std::runtime_error("ring buffer: index out of bounds");
+ // }
+ // return data[(first + i) % capacity];
+ //}
+
+ const T & rat(size_t i) const {
+ if (i >= sz) {
+ throw std::runtime_error("ring buffer: index out of bounds");
+ }
+ return data[(first + sz - i - 1) % capacity];
+ }
+
+ std::vector<T> to_vector() const {
+ std::vector<T> result;
+ result.reserve(sz);
+ for (size_t i = 0; i < sz; i++) {
+ result.push_back(data[(first + i) % capacity]);
+ }
+ return result;
+ }
+
+ void clear() {
+ // here only reset the status of the buffer
+ sz = 0;
+ first = 0;
+ pos = 0;
+ }
+
+ bool empty() const {
+ return sz == 0;
+ }
+
+ size_t size() const {
+ return sz;
+ }
+
+ size_t capacity = 0;
+ size_t sz = 0;
+ size_t first = 0;
+ size_t pos = 0;
+
+ std::vector<T> data;
+};
+
+// writes result in res, does not mutate cur
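+// strategy: histogram the logits into 128 buckets over [-10, 10], walk the
+// buckets from the top until they cover at least npartial elements, and sort
+// only that subset - much cheaper than sorting the full candidate list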
+static void llama_token_data_array_partial_sort(const llama_token_data_array & cur, int npartial, std::vector<llama_token_data> & res) {
+ static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+ return a.logit > b.logit;
+ };
+
+ constexpr int nbuckets = 128;
+ constexpr float bucket_low = -10.0f;
+ constexpr float bucket_high = 10.0f;
+ constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
+ constexpr float bucket_inter = -bucket_low * bucket_scale;
+
+ std::vector<int> bucket_idx;
+ std::vector<int> histo(nbuckets, 0);
+
+ std::vector<llama_token_data*> bucket_ptrs;
+
+ bucket_idx.reserve(cur.size);
+
+ for (int i = 0; i < (int)cur.size; ++i) {
+ const float val = cur.data[i].logit;
+ int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
+ ib = std::max(0, std::min(nbuckets - 1, ib));
+ bucket_idx.push_back(ib);
+ ++histo[ib];
+ }
+ int nhave = 0;
+ int ib = nbuckets - 1;
+ for ( ; ib >= 0; --ib) {
+ nhave += histo[ib];
+ if (nhave >= npartial) {
+ break;
+ }
+ }
+ res.resize(nhave);
+ auto * ptr = res.data();
+ bucket_ptrs.reserve(nbuckets - ib);
+ for (int j = nbuckets - 1; j >= ib; --j) {
+ bucket_ptrs.push_back(ptr);
+ ptr += histo[j];
+ }
+ for (int i = 0; i < (int)cur.size; ++i) {
+ int j = bucket_idx[i];
+ if (j >= ib) {
+ *bucket_ptrs[nbuckets - 1 - j]++ = cur.data[i];
+ }
+ }
+
+ ptr = res.data();
+ int ndone = 0;
+ for (int j = nbuckets - 1; j > ib; --j) {
+ std::sort(ptr, ptr + histo[j], comp);
+ ptr += histo[j];
+ ndone += histo[j];
+ }
+ std::partial_sort(ptr, ptr + npartial - ndone, ptr + histo[ib], comp);
+}
+
+// reduces the size of cur_p to npartial, keeping only the top npartial elements
+static void llama_token_data_array_partial_sort_inplace(llama_token_data_array * cur_p, int npartial) {
+ static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+ return a.logit > b.logit;
+ };
+
+ if (npartial <= 128) {
+ std::partial_sort(cur_p->data, cur_p->data + npartial, cur_p->data + cur_p->size, comp);
+
+ cur_p->size = npartial;
+ cur_p->sorted = true;
+
+ return;
+ }
+
+ std::vector<llama_token_data> tmp;
+
+ llama_token_data_array_partial_sort(*cur_p, npartial, tmp);
+
+ std::copy(tmp.data(), tmp.data() + npartial, cur_p->data);
+
+ cur_p->size = npartial;
+ cur_p->sorted = true;
+}
+
+static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
+ // iterator for the probabilities
+#ifdef __GNUC__
+ #pragma GCC diagnostic push
+ #pragma GCC diagnostic ignored "-Wunused-local-typedefs"
+#endif
+
+ struct probs_iterator {
+ typedef std::input_iterator_tag iterator_category;
+ typedef float value_type;
+ typedef float * pointer;
+ typedef float & reference;
+ typedef ptrdiff_t difference_type;
+
+ const llama_token_data * data;
+
+ bool operator==(const probs_iterator & other) const { return data == other.data; }
+ bool operator!=(const probs_iterator & other) const { return data != other.data; }
+ const float & operator*() const { return data->p; }
+ probs_iterator & operator++() { ++data; return *this; }
+ probs_iterator operator++(int) { probs_iterator tmp = *this; ++data; return tmp; }
+ };
+
+#ifdef __GNUC__
+ #pragma GCC diagnostic pop
+#endif
+
+ std::discrete_distribution<int> dist(probs_iterator{cur_p->data}, probs_iterator{cur_p->data + cur_p->size});
+
+ return dist(rng);
+}
+
+/*
+static void llama_log_softmax(float * array, size_t size) {
+ float max_l = *std::max_element(array, array + size);
+ float sum = 0.f;
+ for (size_t i = 0; i < size; ++i) {
+ float p = expf(array[i] - max_l);
+ sum += p;
+ array[i] = p;
+ }
+
+ for (size_t i = 0; i < size; ++i) {
+ array[i] = logf(array[i] / sum);
+ }
+}
+*/
+
+static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) {
+ if (temp <= 0.0f) {
+ // find the token with the highest logit and set the rest to -inf
+ size_t max_i = 0;
+ float max_l = cur_p->data[0].logit;
+
+ for (size_t i = 1; i < cur_p->size; ++i) {
+ if (cur_p->data[i ].logit > max_l) {
+ cur_p->data[max_i].logit = -INFINITY;
+ max_i = i;
+ max_l = cur_p->data[i].logit;
+ } else {
+ cur_p->data[i].logit = -INFINITY;
+ }
+ }
+
+ return;
+ }
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ cur_p->data[i].logit /= temp;
+ }
+}
+
+static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_sort) {
+ GGML_ASSERT(cur_p->size > 0);
+
+ // Sort the logits in descending order if requested
+ if (do_sort && !cur_p->sorted) {
+ llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
+ }
+
+ float max_l = cur_p->data[0].logit;
+ if (!cur_p->sorted) {
+ for (size_t i = 1; i < cur_p->size; ++i) {
+ max_l = std::max(max_l, cur_p->data[i].logit);
+ }
+ }
+
+ float cum_sum = 0.0f;
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ float p = expf(cur_p->data[i].logit - max_l);
+ cur_p->data[i].p = p;
+ cum_sum += p;
+ }
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ cur_p->data[i].p /= cum_sum;
+ }
+}
+
+static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
+ // if (k >= (int32_t)cur_p->size) {
+ // return;
+ // }
+
+ if (k <= 0) {
+ return;
+ }
+
+ k = std::min(k, (int) cur_p->size);
+
+ // Sort scores in descending order
+ if (!cur_p->sorted) {
+ llama_token_data_array_partial_sort_inplace(cur_p, k);
+ }
+
+ cur_p->size = k;
+}
+
+static uint32_t get_rng_seed(uint32_t seed) {
+ if (seed == LLAMA_DEFAULT_SEED) {
+ // use system clock if std::random_device is not a true RNG
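+ // (entropy() == 0 indicates a deterministic implementation, e.g. some MinGW builds)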
+ static bool is_rd_prng = std::random_device().entropy() == 0;
+ if (is_rd_prng) {
+ return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count();
+ }
+ std::random_device rd;
+ return rd();
+ }
+ return seed;
+}
+
+// llama_sampler API
+
+struct llama_sampler * llama_sampler_init(
+ struct llama_sampler_i * iface,
+ llama_sampler_context_t ctx) {
+ return new llama_sampler {
+ /* .iface = */ iface,
+ /* .ctx = */ ctx,
+ };
+}
+
+const char * llama_sampler_name(const struct llama_sampler * smpl) {
+ if (!smpl->iface) {
+ return "(null)";
+ }
+
+ return smpl->iface->name(smpl);
+}
+
+void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) {
+ if (!smpl) {
+ return;
+ }
+
+ if (smpl->iface->accept) {
+ smpl->iface->accept(smpl, token);
+ }
+}
+
+void llama_sampler_apply(struct llama_sampler * smpl, struct llama_token_data_array * cur_p) {
+ if (!smpl) {
+ return;
+ }
+
+ GGML_ASSERT(smpl->iface->apply);
+ smpl->iface->apply(smpl, cur_p);
+}
+
+void llama_sampler_reset(struct llama_sampler * smpl) {
+ if (!smpl) {
+ return;
+ }
+
+ if (smpl->iface->reset) {
+ smpl->iface->reset(smpl);
+ }
+}
+
+struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
+ if (!smpl) {
+ return nullptr;
+ }
+
+ if (smpl->iface->clone) {
+ return smpl->iface->clone(smpl);
+ }
+
+ if (smpl->ctx == nullptr) {
+ return llama_sampler_init(
+ /* .iface = */ smpl->iface,
+ /* .ctx = */ nullptr
+ );
+ }
+
+ GGML_ABORT("the sampler does not support cloning");
+}
+
+void llama_sampler_free(struct llama_sampler * smpl) {
+ if (smpl == nullptr) {
+ return;
+ }
+
+ if (smpl->iface->free) {
+ smpl->iface->free(smpl);
+ }
+
+ delete smpl;
+}
+
+// empty sampler
+
+struct llama_sampler_empty {
+ const char * name;
+};
+
+static struct llama_sampler * llama_sampler_init_empty(const char * name);
+
+static const char * llama_sampler_empty_name(const struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_empty *) smpl->ctx;
+ return ctx->name;
+}
+
+static void llama_sampler_empty_accept(struct llama_sampler * smpl, llama_token token) {
+ GGML_UNUSED(smpl);
+ GGML_UNUSED(token);
+}
+
+static void llama_sampler_empty_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ GGML_UNUSED(smpl);
+ GGML_UNUSED(cur_p);
+}
+
+static void llama_sampler_empty_reset(struct llama_sampler * smpl) {
+ GGML_UNUSED(smpl);
+}
+
+static struct llama_sampler * llama_sampler_empty_clone(const struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_empty *) smpl->ctx;
+ return llama_sampler_init_empty(ctx->name);
+}
+
+static void llama_sampler_empty_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_empty *) smpl->ctx;
+}
+
+static bool llama_sampler_empty_backend_init(
+ struct llama_sampler * smpl,
+ ggml_backend_buffer_type_t buft) {
+ GGML_UNUSED(smpl);
+ GGML_UNUSED(buft);
+
+ return true;
+}
+
+static void llama_sampler_empty_backend_accept(
+ struct llama_sampler * smpl,
+ ggml_context * ctx,
+ ggml_cgraph * gf,
+ struct ggml_tensor * selected_token) {
+ GGML_UNUSED(smpl);
+ GGML_UNUSED(ctx);
+ GGML_UNUSED(gf);
+ GGML_UNUSED(selected_token);
+}
+
+static void llama_sampler_empty_backend_apply(
+ struct llama_sampler * smpl,
+ struct ggml_context * ctx,
+ struct ggml_cgraph * gf,
+ struct llama_sampler_data * data) {
+ GGML_UNUSED(smpl);
+ GGML_UNUSED(ctx);
+ GGML_UNUSED(gf);
+ GGML_UNUSED(data);
+}
+
+static void llama_sampler_empty_backend_set_input(struct llama_sampler * smpl) {
+ GGML_UNUSED(smpl);
+}
+
+static struct llama_sampler_i llama_sampler_empty_i = {
+ /* .name = */ llama_sampler_empty_name,
+ /* .accept = */ llama_sampler_empty_accept,
+ /* .apply = */ llama_sampler_empty_apply,
+ /* .reset = */ llama_sampler_empty_reset,
+ /* .clone = */ llama_sampler_empty_clone,
+ /* .free = */ llama_sampler_empty_free,
+ /* .backend_init = */ llama_sampler_empty_backend_init,
+ /* .backend_accept = */ llama_sampler_empty_backend_accept,
+ /* .backend_apply = */ llama_sampler_empty_backend_apply,
+ /* .backend_set_input = */ llama_sampler_empty_backend_set_input,
+};
+
+struct llama_sampler * llama_sampler_init_empty(const char * name) {
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_empty_i,
+ /* .ctx = */ new llama_sampler_empty {
+ /* .name = */ name,
+ }
+ );
+}
+
+// common backend sampler functionality
+//
+// +name : means that the sampler is supported and will run on the backend
+// -name : means that a ggml operator is not supported by the backend
+//
+struct llama_sampler_backend {
+ llama_sampler_backend(const char * name) : name(name), name_ext(name), is_init(false), support(false) {}
+
+ const char * get_name() {
+ if (!is_init) {
+ return name.c_str();
+ }
+
+ if (support) {
+ name_ext = "+" + name;
+ } else {
+ name_ext = "-" + name;
+ }
+
+ return name_ext.c_str();
+ }
+
+ void init(bool support) {
+ GGML_ASSERT(this->is_init == false);
+
+ this->is_init = true;
+ this->support = support;
+ }
+
+private:
+ std::string name;
+ std::string name_ext;
+
+ bool is_init;
+ bool support;
+};
+
+// check if all ggml ops used by the sampler are supported by the backend
+static bool llama_sampler_backend_support(
+ llama_sampler * smpl,
+ ggml_backend_buffer_type_t buft) {
+ auto * device = ggml_backend_buft_get_device(buft);
+ if (!device) {
+ // CPU backend always supported
+ return true;
+ }
+
+ ggml_init_params params = {
+ /*.mem_size =*/ 128*ggml_tensor_overhead() + ggml_graph_overhead(),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context_ptr ctx_ptr { ggml_init(params) };
+ if (!ctx_ptr) {
+ throw std::runtime_error(format("failed to create ggml context"));
+ }
+
+ ggml_context * ctx = ctx_ptr.get();
+
+ const int64_t n = 1024*1024;
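+ // dummy vocab-sized inputs, used only to build a probe graph for the
+ // op-support checks below; with no_alloc = true no data is allocated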
+
+ llama_sampler_data data = {
+ /*.logits = */ ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n),
+ /*.probs = */ nullptr,
+ /*.sampled = */ nullptr,
+ /*.candidates = */ ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n),
+ };
+
+ ggml_cgraph * gf = ggml_new_graph(ctx);
+
+ smpl->iface->backend_apply(smpl, ctx, gf, &data);
+
+ if (data.logits) {
+ ggml_build_forward_expand(gf, data.logits);
+ }
+
+ if (data.probs) {
+ ggml_build_forward_expand(gf, data.probs);
+ }
+
+ if (data.sampled) {
+ ggml_build_forward_expand(gf, data.sampled);
+ }
+
+ if (data.candidates) {
+ ggml_build_forward_expand(gf, data.candidates);
+ }
+
+ for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+ struct ggml_tensor * op = ggml_graph_node(gf, i);
+
+ if (!ggml_backend_dev_supports_op(device, op)) {
+ LLAMA_LOG_WARN("%s: device '%s' does not have support for op %s needed for sampler '%s'\n",
+ __func__, ggml_backend_dev_name(device), ggml_op_name(op->op), smpl->iface->name(smpl));
+
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// sampler chain
+
+static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) {
+ return "chain";
+}
+
+static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token token) {
+ auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+ time_meas tm(chain->t_sample_us, chain->params.no_perf);
+
+ for (auto & smpl : chain->samplers) {
+ llama_sampler_accept(smpl.ptr, token);
+ }
+
+ chain->n_sample++;
+}
+
+static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+ time_meas tm(chain->t_sample_us, chain->params.no_perf);
+
+ bool is_backend = chain->is_init;
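+ // samplers at the head of the chain that already ran on the backend are
+ // skipped; once a CPU-only sampler is reached, it and all samplers after
+ // it run on the CPU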
+
+ for (auto & smpl : chain->samplers) {
+ if (is_backend && smpl.is_backend) {
+ continue;
+ }
+
+ is_backend = false;
+
+ if (smpl.ptr->iface->apply == nullptr) {
+ continue;
+ }
+
+ llama_sampler_apply(smpl.ptr, cur_p);
+ }
+}
+
+static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
+ auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+ for (auto & smpl : chain->samplers) {
+ llama_sampler_reset(smpl.ptr);
+ }
+}
+
+static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
+ const auto * chain_src = (const llama_sampler_chain *) smpl->ctx;
+
+ auto * result = llama_sampler_chain_init(chain_src->params);
+
+ for (const auto & smpl : chain_src->samplers) {
+ llama_sampler_chain_add(result, llama_sampler_clone(smpl.ptr));
+ }
+
+ return result;
+}
+
+static void llama_sampler_chain_free(struct llama_sampler * smpl) {
+ auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+ for (auto & smpl : chain->samplers) {
+ llama_sampler_free(smpl.ptr);
+ }
+
+ delete chain;
+}
+
+static bool llama_sampler_chain_backend_init(
+ struct llama_sampler * smpl,
+ ggml_backend_buffer_type_t buft) {
+ auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+ GGML_ASSERT(chain->is_init == false && "llama_sampler_chain_backend_init() called twice");
+
+ chain->is_init = true;
+
+ bool res = true;
+
+ for (auto & smpl : chain->samplers) {
+ bool res_cur = true;
+
+ // to be able to run a sampler on the backend, it has to:
+ // - have the .backend_init() API implemented
+ // - return true during .backend_init()
+ if (smpl.ptr->iface->backend_init) {
+ if (!smpl.ptr->iface->backend_init(smpl.ptr, buft)) {
+ res_cur = false;
+ }
+ } else {
+ res_cur = false;
+ }
+
+ smpl.is_backend = res_cur;
+
+ res = res && res_cur;
+ }
+
+ return res;
+}
+
+static void llama_sampler_chain_backend_accept(
+ struct llama_sampler * smpl,
+ ggml_context * ctx,
+ ggml_cgraph * gf,
+ struct ggml_tensor * selected_token) {
+ auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+ for (auto & smpl : chain->samplers) {
+ if (!smpl.is_backend) {
+ break;
+ }
+
+ if (smpl.ptr->iface->backend_accept) {
+ smpl.ptr->iface->backend_accept(smpl.ptr, ctx, gf, selected_token);
+ }
+ }
+}
+
+static void llama_sampler_chain_backend_apply(
+ struct llama_sampler * smpl,
+ struct ggml_context * ctx,
+ struct ggml_cgraph * gf,
+ struct llama_sampler_data * data) {
+ auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+ GGML_ASSERT(chain->is_init && "llama_sampler_chain_backend_init() not called");
+
+ for (auto & smpl : chain->samplers) {
+ if (!smpl.is_backend) {
+ break;
+ }
+
+ if (smpl.ptr->iface->backend_apply) {
+ smpl.ptr->iface->backend_apply(smpl.ptr, ctx, gf, data);
+ }
+ }
+}
+
+static void llama_sampler_chain_backend_set_input(struct llama_sampler * smpl) {
+ auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+ for (auto & smpl : chain->samplers) {
+ if (!smpl.is_backend) {
+ break;
+ }
+
+ if (smpl.ptr->iface->backend_set_input) {
+ smpl.ptr->iface->backend_set_input(smpl.ptr);
+ }
+ }
+}
+
+static struct llama_sampler_i llama_sampler_chain_i = {
+ /* .name = */ llama_sampler_chain_name,
+ /* .accept = */ llama_sampler_chain_accept,
+ /* .apply = */ llama_sampler_chain_apply,
+ /* .reset = */ llama_sampler_chain_reset,
+ /* .clone = */ llama_sampler_chain_clone,
+ /* .free = */ llama_sampler_chain_free,
+ /* .backend_init = */ llama_sampler_chain_backend_init,
+ /* .backend_accept = */ llama_sampler_chain_backend_accept,
+ /* .backend_apply = */ llama_sampler_chain_backend_apply,
+ /* .backend_set_input = */ llama_sampler_chain_backend_set_input,
+};
+
+struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_chain_i,
+ /* .ctx = */ new llama_sampler_chain {
+ /* .params = */ params,
+ /* .is_init = */ false,
+ /* .samplers = */ {},
+ /* .cur = */ {},
+ /* .t_sample_us = */ 0,
+ /* .n_sample = */ 0,
+ }
+ );
+}
+
+llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
+ const llama_token sampled_token = llama_get_sampled_token_ith (ctx, idx);
+ const float * sampled_probs = llama_get_sampled_probs_ith (ctx, idx);
+ const float * sampled_logits = llama_get_sampled_logits_ith (ctx, idx);
+ const llama_token * sampled_ids = llama_get_sampled_candidates_ith(ctx, idx);
+
+ // If a backend sampler has already sampled a token, return it.
+ if (sampled_token != LLAMA_TOKEN_NULL) {
+ LLAMA_LOG_DEBUG("%s: Backend sampler selected token for idx %d. Skipping CPU samplers\n", __func__, idx);
+ return sampled_token;
+ }
+
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ const int n_vocab = llama_vocab_n_tokens(vocab);
+
+ // use pre-allocated buffer from chain if available, otherwise allocate locally
+ std::vector<llama_token_data> * cur_ptr;
+ std::vector<llama_token_data> cur_local;
+
+ if (smpl->iface == &llama_sampler_chain_i) {
+ auto * chain = (llama_sampler_chain *) smpl->ctx;
+ cur_ptr = &chain->cur;
+ } else {
+ cur_ptr = &cur_local;
+ }
+
+ auto & cur = *cur_ptr;
+
+ if (sampled_probs) {
+ const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
+ cur.resize(sampled_probs_count);
+ for (uint32_t i = 0; i < sampled_probs_count; ++i) {
+ cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
+ }
+ } else if (sampled_logits) {
+ const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
+ cur.resize(sampled_logits_count);
+ for (llama_token i = 0; i < (int)sampled_logits_count; i++) {
+ cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
+ }
+ } else {
+ const auto * logits = llama_get_logits_ith(ctx, idx);
+ GGML_ASSERT(logits != nullptr);
+ cur.resize(n_vocab);
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+ cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+ }
+ }
+
+ llama_token_data_array cur_p = {
+ /* .data = */ cur.data(),
+ /* .size = */ cur.size(),
+ /* .selected = */ -1,
+ /* .sorted = */ false,
+ };
+
+ llama_sampler_apply(smpl, &cur_p);
+
+ GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size);
+
+ auto token = cur_p.data[cur_p.selected].id;
+
+ llama_sampler_accept(smpl, token);
+
+ return token;
+}
+
+
+void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
+ auto * p = (llama_sampler_chain *) chain->ctx;
+ p->samplers.push_back({
+ /* .is_backend = */ false,
+ /* .ptr = */ smpl,
+ });
+}
+
+struct llama_sampler * llama_sampler_chain_get(struct llama_sampler * chain, int32_t i) {
+ if (chain == nullptr) {
+ return nullptr;
+ }
+
+ if (chain->iface != &llama_sampler_chain_i) {
+ return nullptr;
+ }
+
+ if (i == -1) {
+ return chain;
+ }
+
+ const auto * p = (const llama_sampler_chain *) chain->ctx;
+
+ if (i < 0 || (size_t) i >= p->samplers.size()) {
+ return nullptr;
+ }
+
+ return p->samplers[i].ptr;
+}
+
+struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) {
+ auto * p = (llama_sampler_chain *) chain->ctx;
+
+ if (i < 0 || (size_t) i >= p->samplers.size()) {
+ return nullptr;
+ }
+
+ auto * result = p->samplers[i].ptr;
+ p->samplers.erase(p->samplers.begin() + i);
+
+ return result;
+}
+
+int llama_sampler_chain_n(const struct llama_sampler * chain) {
+ const auto * p = (const llama_sampler_chain *) chain->ctx;
+
+ return p->samplers.size();
+}
+
+//
+// samplers
+//
+
+// greedy
+
+struct llama_sampler_greedy : public llama_sampler_backend {
+};
+
+static const char * llama_sampler_greedy_name(const struct llama_sampler * smpl) {
+ auto * sctx = (llama_sampler_greedy *) smpl->ctx;
+ return sctx->get_name();
+}
+
+static void llama_sampler_greedy_reset(struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_greedy *) smpl->ctx;
+ GGML_UNUSED(ctx);
+}
+
+static struct llama_sampler * llama_sampler_greedy_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_greedy *) smpl->ctx;
+ auto * result = llama_sampler_init_greedy();
+
+ // copy the state
+ {
+ auto * result_ctx = (llama_sampler_greedy *) result->ctx;
+
+ GGML_UNUSED(ctx);
+ GGML_UNUSED(result_ctx);
+ }
+
+ return result;
+}
+
+static void llama_sampler_greedy_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_greedy *) smpl->ctx;
+}
+
+static void llama_sampler_greedy_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
+ cur_p->selected = 0;
+ for (size_t i = 1; i < cur_p->size; ++i) {
+ if (cur_p->data[i].logit > cur_p->data[cur_p->selected].logit) {
+ cur_p->selected = i;
+ }
+ }
+}
+
+static bool llama_sampler_greedy_backend_init(
+ struct llama_sampler * smpl,
+ ggml_backend_buffer_type_t buft) {
+ auto * sctx = (llama_sampler_greedy *) smpl->ctx;
+
+ const bool res = llama_sampler_backend_support(smpl, buft);
+
+ sctx->init(res);
+
+ return res;
+}
+
+static void llama_sampler_greedy_backend_apply(
+ struct llama_sampler * smpl,
+ struct ggml_context * ctx,
+ struct ggml_cgraph * gf,
+ struct llama_sampler_data * data) {
+ GGML_UNUSED(gf);
+ GGML_UNUSED(smpl);
+
+ struct ggml_tensor * curl = ggml_argmax(ctx, data->logits);
+ ggml_set_name(curl, "greedy_argmax");
+
+ data->sampled = curl;
+}
+
+static struct llama_sampler_i llama_sampler_greedy_i = {
+ /* .name = */ llama_sampler_greedy_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sampler_greedy_apply,
+ /* .reset = */ llama_sampler_greedy_reset,
+ /* .clone = */ llama_sampler_greedy_clone,
+ /* .free = */ llama_sampler_greedy_free,
+ /* .backend_init = */ llama_sampler_greedy_backend_init,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ llama_sampler_greedy_backend_apply,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_greedy() {
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_greedy_i,
+ /* .ctx = */ new llama_sampler_greedy {
+ ("greedy"),
+ }
+ );
+}
+
+// dist
+
+struct llama_sampler_dist : public llama_sampler_backend {
+ const uint32_t seed;
+ uint32_t seed_cur;
+
+ std::mt19937 rng;
+
+ ggml_tensor * inp_uniform;
+};
+
+static const char * llama_sampler_dist_name(const struct llama_sampler * smpl) {
+ auto * sctx = (llama_sampler_dist *) smpl->ctx;
+ return sctx->get_name();
+}
+
+static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_dist *) smpl->ctx;
+
+ // edge cases
+ if (cur_p->size == 0) {
+ cur_p->selected = -1;
+ return;
+ }
+
+ cur_p->selected = 0;
+
+ if (cur_p->size == 1) {
+ cur_p->data[0].p = 1.0f;
+ return;
+ }
+
+ // max logit for numerical stability
+ float max_l = cur_p->data[0].logit;
+ if (!cur_p->sorted) {
+ for (size_t i = 1; i < cur_p->size; ++i) {
+ max_l = std::max(max_l, cur_p->data[i].logit);
+ }
+ }
+
+ // apply softmax to obtain the probabilities
+ double sum_cum = 0.0f;
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ float p = expf(cur_p->data[i].logit - max_l);
+ cur_p->data[i].p = p;
+ sum_cum += p;
+ }
+
+#if 1
+ // sample from the obtained probabilities and normalize the probs in a single pass
+ // this is ~3x faster on Mac with full gpt-oss vocab than the version below
+ //
+ std::uniform_real_distribution<double> dist(0.0f, 1.0f);
+ const double rnd = dist(ctx->rng);
+
+ double sum_run = 0.0f;
+ const double sum_tgt = sum_cum*rnd;
+
+ bool found = false;
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ if (!found) {
+ // accumulate probs until we reach the target sum
+ sum_run += cur_p->data[i].p;
+ if (sum_run >= sum_tgt) {
+ cur_p->selected = i;
+ found = true;
+ }
+ }
+
+ // normalize probs
+ cur_p->data[i].p /= sum_cum;
+ }
+
+ // fallback to the last token (don't think this can happen)
+ assert(found);
+ if (!found) {
+ cur_p->selected = cur_p->size - 1;
+ }
+#else
+ // for clarity, this is the same as above but does one pass for normalization and one extra pass for sampling
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ cur_p->data[i].p /= sum_cum;
+ }
+
+ cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
+#endif
+}
+
+static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_dist *) smpl->ctx;
+ ctx->seed_cur = get_rng_seed(ctx->seed);
+ ctx->rng.seed(ctx->seed_cur);
+}
+
+static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_dist *) smpl->ctx;
+ auto * result = llama_sampler_init_dist(ctx->seed);
+
+ // copy the state
+ {
+ auto * result_ctx = (llama_sampler_dist *) result->ctx;
+
+ result_ctx->rng = ctx->rng;
+ }
+
+ return result;
+}
+
+static void llama_sampler_dist_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_dist *) smpl->ctx;
+}
+
+static bool llama_sampler_dist_backend_init(
+ struct llama_sampler * smpl,
+ ggml_backend_buffer_type_t buft) {
+ auto * sctx = (llama_sampler_dist *) smpl->ctx;
+
+ const bool res = llama_sampler_backend_support(smpl, buft);
+
+ sctx->init(res);
+
+ return res;
+}
+
+static void llama_sampler_dist_backend_apply(
+ struct llama_sampler * smpl,
+ struct ggml_context * ctx,
+ struct ggml_cgraph * gf,
+ struct llama_sampler_data * data) {
+ GGML_UNUSED(gf);
+
+ auto * sctx = (llama_sampler_dist *) smpl->ctx;
+
+ sctx->inp_uniform = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+ ggml_set_name (sctx->inp_uniform, "uniform");
+ ggml_set_input(sctx->inp_uniform);
+
+ struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits);
+ ggml_set_name(probs, "dist_probs");
+
+ struct ggml_tensor * cumsum = ggml_cumsum(ctx, probs);
+ ggml_set_name(cumsum, "dist_cumsum");
+
+ // The uniform tensor holds a random value which we subtract from the
+ // cumsum tensor (the uniform tensor is broadcast by ggml_sub).
+ // Recall that each entry in cumsum is the cumulative probability up to that
+ // index so values stay negative while the cumulative total is below the
+ // random value, and become zero/positive once the threshold is crossed.
+ struct ggml_tensor * diff = ggml_sub(ctx, cumsum, sctx->inp_uniform);
+ ggml_set_name(diff, "dist_cumsum");
+
+ // The ggml_step function produces a tensor where entries are 1 if the
+ // corresponding entry in diff is > 0, and 0 otherwise. So all values up to
+ // the index where the cumulative probability exceeds the random value are 0,
+ // and all entries after that are 1.
+ struct ggml_tensor * mask = ggml_step(ctx, diff);
+ ggml_set_name(mask, "dist_mask");
+
+ // Taking the sum of the mask gives the number of elements past the
+ // threshold we are interested in.
+ struct ggml_tensor * idxf = ggml_sum(ctx, mask);
+ ggml_set_name(idxf, "dist_index_f32");
+
+ // Use ggml_scale_bias to scale the index value by -1 and then add the size
+ // of the mask to that value so we get the correct index ((-1 * idxf) + n).
+ struct ggml_tensor * idx = ggml_cast(ctx, ggml_scale_bias(ctx, idxf, -1.0f, mask->ne[0]), GGML_TYPE_I32);
+ ggml_set_name(idx, "dist_index_i32");
+
+ // Map back to original vocab ids if a candidates tensor is available.
+ struct ggml_tensor * sampled_token = idx;
+ if (data->candidates != nullptr) {
+ struct ggml_tensor * candidates = ggml_reshape_2d(ctx, data->candidates, 1, ggml_nelements(data->candidates));
+
+ sampled_token = ggml_get_rows(ctx, candidates, idx);
+ ggml_set_name(sampled_token, "dist_sampled_token");
+ }
+
+ data->sampled = sampled_token;
+ data->probs = probs;
+}
+
+static void llama_sampler_dist_backend_set_input(struct llama_sampler * smpl) {
+ auto * sctx = (llama_sampler_dist *) smpl->ctx;
+
+ GGML_ASSERT(sctx->inp_uniform != nullptr);
+
+ // We sample in double precision and cast to float to match the random
+ // numbers of llama_sampler_dist, which uses double precision (sampling from
+ // std::uniform_real_distribution<double> and
+ // std::uniform_real_distribution<float> with same rng will produce
+ // different sequences).
+ std::uniform_real_distribution<double> dist(0.0f, 1.0f);
+ const float rnd = dist(sctx->rng);
+
+ ggml_backend_tensor_set(sctx->inp_uniform, &rnd, 0, sizeof(float));
+}
+
+static struct llama_sampler_i llama_sampler_dist_i = {
+ /* .name = */ llama_sampler_dist_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sampler_dist_apply,
+ /* .reset = */ llama_sampler_dist_reset,
+ /* .clone = */ llama_sampler_dist_clone,
+ /* .free = */ llama_sampler_dist_free,
+ /* .backend_init = */ llama_sampler_dist_backend_init,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ llama_sampler_dist_backend_apply,
+ /* .backend_set_input = */ llama_sampler_dist_backend_set_input,
+};
+
+struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
+ auto seed_cur = get_rng_seed(seed);
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_dist_i,
+ /* .ctx = */ new llama_sampler_dist {
+ ("dist"),
+ /* .seed = */ seed,
+ /* .seed_cur = */ seed_cur,
+ /* .rng = */ std::mt19937(seed_cur),
+ /* .inp_uniform = */ nullptr,
+ }
+ );
+}
+
+// top-k
+
+struct llama_sampler_top_k : public llama_sampler_backend {
+ const int32_t k;
+};
+
+static const char * llama_sampler_top_k_name(const struct llama_sampler * smpl) {
+ auto * sctx = (llama_sampler_top_k *) smpl->ctx;
+ return sctx->get_name();
+}
+
+static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_top_k *) smpl->ctx;
+ llama_sampler_top_k_impl(cur_p, ctx->k);
+}
+
+static struct llama_sampler * llama_sampler_top_k_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_top_k *) smpl->ctx;
+ return llama_sampler_init_top_k(ctx->k);
+}
+
+static void llama_sampler_top_k_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_top_k *) smpl->ctx;
+}
+
+static bool llama_sampler_top_k_backend_init(
+ struct llama_sampler * smpl,
+ ggml_backend_buffer_type_t buft) {
+ auto * sctx = (llama_sampler_top_k *) smpl->ctx;
+
+ const bool res = llama_sampler_backend_support(smpl, buft);
+
+ sctx->init(res);
+
+ return res;
+}
+
+static void llama_sampler_top_k_backend_apply(
+ struct llama_sampler * smpl,
+ struct ggml_context * ctx,
+ struct ggml_cgraph * gf,
+ struct llama_sampler_data * data) {
+ auto * sctx = (llama_sampler_top_k *) smpl->ctx;
+
+ struct ggml_tensor * top_k = ggml_top_k(ctx, data->logits, sctx->k);
+ ggml_set_name(top_k, "top_k");
+
+ if (data->candidates) {
+ struct ggml_tensor * candidates_rows = ggml_reshape_2d(ctx, data->candidates, 1, data->candidates->ne[0]);
+ data->candidates = ggml_get_rows(ctx, candidates_rows, top_k);
+ data->candidates = ggml_reshape_1d(ctx, data->candidates, sctx->k);
+ ggml_set_name(data->candidates, "top_k_candidates");
+ } else {
+ data->candidates = top_k;
+ }
+
+ struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]);
+ struct ggml_tensor * top_k_rows = ggml_get_rows(ctx, logits_rows, top_k);
+ data->logits = ggml_reshape_1d(ctx, top_k_rows, sctx->k);
+ ggml_set_name(top_k_rows, "top_k_rows");
+
+ GGML_UNUSED(gf);
+}
+
+static struct llama_sampler_i llama_sampler_top_k_i = {
+ /* .name = */ llama_sampler_top_k_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sampler_top_k_apply,
+ /* .reset = */ nullptr,
+ /* .clone = */ llama_sampler_top_k_clone,
+ /* .free = */ llama_sampler_top_k_free,
+ /* .backend_init = */ llama_sampler_top_k_backend_init,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ llama_sampler_top_k_backend_apply,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
+ const bool is_empty = (k <= 0);
+
+ if (is_empty) {
+ return llama_sampler_init_empty("?top-k");
+ }
+
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_top_k_i,
+ /* .ctx = */ new llama_sampler_top_k {
+ ("top-k"),
+ /* .k = */ k,
+ }
+ );
+}
+
+// top-p
+
+struct llama_sampler_top_p : public llama_sampler_backend {
+ const float p;
+ const size_t min_keep;
+
+ std::vector<llama_token_data> buf_sort;
+};
+
+static const char * llama_sampler_top_p_name(const struct llama_sampler * smpl) {
+ auto * sctx = (llama_sampler_top_p *) smpl->ctx;
+ return sctx->get_name();
+}
+
+static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_top_p *) smpl->ctx;
+
+ if (ctx->p >= 1.0f) {
+ return;
+ }
+
+ llama_sampler_softmax_impl(cur_p, false);
+
+ size_t k = cur_p->size;
+ auto * pdata = cur_p->data;
+
+ auto & buf_sort = ctx->buf_sort;
+
+ // if not sorted, try adaptive top-k sorting
+ if (!cur_p->sorted && cur_p->size > 1024) {
+ k = std::min<size_t>(256, cur_p->size);
+ llama_token_data_array_partial_sort(*cur_p, k, buf_sort);
+ pdata = buf_sort.data();
+ } else if (!cur_p->sorted) {
+ // small candidates -> sort inplace
+ llama_token_data_array_partial_sort_inplace(cur_p, k);
+ }
+
+ // Compute the cumulative probabilities
+ float cum_sum = 0.0f;
+ size_t last_idx = cur_p->size;
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ cum_sum += pdata[i].p;
+
+        // Check if the running sum is at least p and we have kept at least min_keep tokens;
+        // the last index is set to i+1 so that the current token is included in the kept set
+ if (cum_sum >= ctx->p && i + 1 >= ctx->min_keep) {
+ last_idx = i + 1;
+ break;
+ }
+
+ // we exceeded the current top-k heuristic -> increase k and continue
+ if (!cur_p->sorted && i == k - 1) {
+ k = cur_p->size;
+ llama_token_data_array_partial_sort(*cur_p, k, buf_sort);
+ pdata = buf_sort.data();
+ }
+ }
+
+ // Resize the output vector to keep only the top-p tokens
+ if (!cur_p->sorted) {
+ std::copy(buf_sort.data(), buf_sort.data() + last_idx, cur_p->data);
+ cur_p->sorted = true;
+ }
+
+ cur_p->size = last_idx;
+}
+
+static struct llama_sampler * llama_sampler_top_p_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_top_p *) smpl->ctx;
+ return llama_sampler_init_top_p(ctx->p, ctx->min_keep);
+}
+
+static void llama_sampler_top_p_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_top_p *) smpl->ctx;
+}
+
+static bool llama_sampler_top_p_backend_init(
+ struct llama_sampler * smpl,
+ ggml_backend_buffer_type_t buft) {
+ auto * sctx = (llama_sampler_top_p *) smpl->ctx;
+
+ const bool res = llama_sampler_backend_support(smpl, buft);
+
+ sctx->init(res);
+
+ return res;
+}
+
+static void llama_sampler_top_p_backend_apply(
+ struct llama_sampler * smpl,
+ struct ggml_context * ctx,
+ struct ggml_cgraph * gf,
+ struct llama_sampler_data * data) {
+ auto * sctx = (llama_sampler_top_p *) smpl->ctx;
+
+ auto ggml_sort = [ctx](struct ggml_tensor * a, struct ggml_tensor * b) {
+ GGML_ASSERT(ggml_nrows(a) == 1);
+ struct ggml_tensor * a_reshaped = ggml_reshape_2d(ctx, a, 1, a->ne[0]);
+ struct ggml_tensor * a_sorted = ggml_get_rows(ctx, a_reshaped, b);
+ return ggml_reshape_1d(ctx, a_sorted, a->ne[0]);
+ };
+
+ // Get the sorted logits in descending order.
+ struct ggml_tensor * sorted_idx = ggml_argsort(ctx, data->logits, GGML_SORT_ORDER_DESC);
+ ggml_set_name(sorted_idx, "top_p_sorted_idx");
+
+ // Do the sorting via reshape + get_rows
+ struct ggml_tensor * sorted_logits = ggml_sort(data->logits, sorted_idx);
+ ggml_set_name(sorted_logits, "top_p_sorted_logits");
+
+ struct ggml_tensor * softmax = ggml_soft_max(ctx, sorted_logits);
+ ggml_set_name(softmax, "top_p_softmax");
+
+ // If candidates are provided, sort them as well. Otherwise, set sorted indices as candidates.
+ if (data->candidates) {
+ data->candidates = ggml_sort(data->candidates, sorted_idx);
+ } else {
+ data->candidates = sorted_idx;
+ }
+ ggml_set_name(data->candidates, "top_p_candidates");
+
+ // Compute Cumulative Distribution Function (CDF) by means of GGML_OP_CUMSUM.
+ struct ggml_tensor * cdf = ggml_cumsum(ctx, softmax);
+ ggml_set_name(cdf, "top_p_cdf");
+
+ // Invert CDF and add top-p value so that ggml_step yields 1 for values we want to keep
+ struct ggml_tensor * cdf_scaled = ggml_scale_bias(ctx, cdf, -1.0f, sctx->p);
+ ggml_set_name(cdf_scaled, "top_p_cdf_scaled");
+
+ struct ggml_tensor * mask = ggml_step(ctx, cdf_scaled);
+ ggml_set_name(mask, "top_p_mask");
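+
+    // Worked example (hypothetical values, p = 0.9):
+    //   softmax    = [0.50, 0.30, 0.15, 0.05]
+    //   cdf        = [0.50, 0.80, 0.95, 1.00]
+    //   cdf_scaled = p - cdf = [0.40, 0.10, -0.05, -0.10]
+    //   mask       = step(cdf_scaled) = [1, 1, 0, 0]
+    // The token that crosses the threshold (cdf = 0.95) is re-enabled below,
+    // making the cutoff inclusive: the final mask is [1, 1, 1, 0].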
+
+    // Summing the mask gives the number of tokens with cdf < p, which is also
+    // the index of the first token at or past the threshold.
+ struct ggml_tensor * idxf = ggml_sum(ctx, mask);
+ ggml_set_name(idxf, "top_p_index_f32");
+
+ // prevent out-of-bounds access
+ idxf = ggml_clamp(ctx, idxf, 0.0f, mask->ne[0] - 1);
+
+ // construct ones tensor to set the value in the mask
+ struct ggml_tensor * ones = ggml_scale_bias(ctx, idxf, 0.0f, 1.0f);
+ ggml_set_name(ones, "top_p_ones");
+
+    // Make top-p inclusive: also keep the first token whose cumulative probability reaches p
+ struct ggml_tensor * mask_reshaped = ggml_reshape_2d(ctx, mask, 1, mask->ne[0]);
+
+ mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, idxf, GGML_TYPE_I32));
+ mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]);
+
+ // Apply -INFINITY bias for masked-out tokens
+ // log(1) = 0 (keep), log(0) = -INF (discard)
+ struct ggml_tensor * top_p_bias = ggml_log(ctx, mask);
+ ggml_set_name(top_p_bias, "top_p_bias");
+
+ data->logits = ggml_add(ctx, sorted_logits, top_p_bias);
+ ggml_set_name(data->logits, "top_p_logits");
+
+ GGML_UNUSED(gf);
+}
+
+static struct llama_sampler_i llama_sampler_top_p_i = {
+ /* .name = */ llama_sampler_top_p_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sampler_top_p_apply,
+ /* .reset = */ nullptr,
+ /* .clone = */ llama_sampler_top_p_clone,
+ /* .free = */ llama_sampler_top_p_free,
+ /* .backend_init = */ llama_sampler_top_p_backend_init,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ llama_sampler_top_p_backend_apply,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
+ const bool is_empty = p >= 1.0f;
+
+ if (is_empty) {
+ return llama_sampler_init_empty("?top-p");
+ }
+
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_top_p_i,
+ /* .ctx = */ new llama_sampler_top_p {
+ ("top-p"),
+ /* .p = */ p,
+ /* .min_keep = */ min_keep,
+ /* .buf_sort = */ {},
+ }
+ );
+}
+
+// min-p
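+//
+// Discards tokens whose probability is less than p times the probability of
+// the most likely token, always keeping at least min_keep tokens.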
+
+struct llama_sampler_min_p : public llama_sampler_backend {
+ const float p;
+ const size_t min_keep;
+};
+
+static const char * llama_sampler_min_p_name(const struct llama_sampler * smpl) {
+ auto * sctx = (llama_sampler_min_p *) smpl->ctx;
+ return sctx->get_name();
+}
+
+static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_min_p *) smpl->ctx;
+
+ if (ctx->p <= 0.0f || !cur_p->size) {
+ return;
+ }
+
+ bool min_p_applied = false;
+
+ // if the cur_p aren't sorted, try the unsorted implementation first
+ if (!cur_p->sorted) {
+ std::vector<llama_token_data> filtered_tokens;
+
+ float max_logit = -FLT_MAX;
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ max_logit = std::max(max_logit, cur_p->data[i].logit);
+ }
+ const float min_logit = max_logit + logf(ctx->p); // min logit for p_i >= p * p_max
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ if (cur_p->data[i].logit >= min_logit) {
+ filtered_tokens.push_back(cur_p->data[i]);
+ }
+ }
+
+ // if we have enough values the operation was a success
+ if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
+ std::copy(filtered_tokens.begin(), filtered_tokens.end(), cur_p->data);
+ cur_p->size = filtered_tokens.size();
+ min_p_applied = true;
+ }
+ }
+
+ // if the cur_p are sorted or the unsorted implementation failed, use this implementation
+ if (!min_p_applied) {
+ // Sort the logits in descending order
+ if (!cur_p->sorted) {
+ llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
+ }
+
+ const float min_logit = cur_p->data[0].logit + logf(ctx->p); // min logit for p_i >= p * p_max
+ size_t i = 1; // first token always matches
+
+ for (; i < cur_p->size; ++i) {
+ if (cur_p->data[i].logit < min_logit && i >= ctx->min_keep) {
+ break; // prob too small
+ }
+ }
+
+ // Resize the output vector to keep only the matching tokens
+ cur_p->size = i;
+ }
+}
+
+static struct llama_sampler * llama_sampler_min_p_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_min_p *) smpl->ctx;
+ return llama_sampler_init_min_p(ctx->p, ctx->min_keep);
+}
+
+static void llama_sampler_min_p_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_min_p *) smpl->ctx;
+}
+
+static bool llama_sampler_min_p_backend_init(
+ struct llama_sampler * smpl,
+ ggml_backend_buffer_type_t buft) {
+ auto * sctx = (llama_sampler_min_p *) smpl->ctx;
+
+ const bool res = llama_sampler_backend_support(smpl, buft);
+
+ sctx->init(res);
+
+ return res;
+}
+
+static void llama_sampler_min_p_backend_apply(
+ struct llama_sampler * smpl,
+ struct ggml_context * ctx,
+ struct ggml_cgraph * gf,
+ struct llama_sampler_data * data) {
+ auto * sctx = (llama_sampler_min_p *) smpl->ctx;
+
+ struct ggml_tensor * max_idx = ggml_argmax(ctx, data->logits);
+ ggml_set_name(max_idx, "max_idx");
+
+ struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]);
+ ggml_set_name(logits_rows, "logits_rows");
+
+ struct ggml_tensor * max_logit = ggml_get_rows(ctx, logits_rows, max_idx);
+ ggml_set_name(max_logit, "max_logit");
+
+ // Calculate the threshold value.
+ struct ggml_tensor * threshold = ggml_scale_bias(ctx, max_logit, 1.0f, logf(sctx->p));
+ ggml_set_name(threshold, "min_p_threshold");
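+
+    // The min-p rule keeps tokens with p_i >= p * p_max. Softmax is monotone
+    // in the logits, so this is equivalent to keeping logit_i >= max_logit + log(p),
+    // which avoids computing the softmax here.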
+
+ // Subtract the threshold from logits.
+ struct ggml_tensor * sub = ggml_sub(ctx, data->logits, threshold);
+
+ // Create a mask where logits below the threshold are 0 (discard),
+ // and others are 1 (keep).
+ struct ggml_tensor * mask = ggml_step(ctx, sub);
+ ggml_set_name(mask, "min_p_mask");
+
+ // Apply -INFINITY bias for masked-out tokens
+ // log(1) = 0 (keep), log(0) = -INF (discard)
+ struct ggml_tensor * min_p_bias = ggml_log(ctx, mask);
+ ggml_set_name(min_p_bias, "min_p_bias");
+
+ data->logits = ggml_add(ctx, data->logits, min_p_bias);
+ ggml_set_name(data->logits, "min_p_logits");
+
+ GGML_UNUSED(gf);
+}
+
+static struct llama_sampler_i llama_sampler_min_p_i = {
+ /* .name = */ llama_sampler_min_p_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sampler_min_p_apply,
+ /* .reset = */ nullptr,
+ /* .clone = */ llama_sampler_min_p_clone,
+ /* .free = */ llama_sampler_min_p_free,
+ /* .backend_init = */ llama_sampler_min_p_backend_init,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ llama_sampler_min_p_backend_apply,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
+ const bool is_empty = (p <= 0.0f);
+
+ if (is_empty) {
+ return llama_sampler_init_empty("?min-p");
+ }
+
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_min_p_i,
+ /* .ctx = */ new llama_sampler_min_p {
+ ("min-p"),
+ /* .p = */ p,
+ /* .min_keep = */ min_keep,
+ }
+ );
+}
+
+// typical
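+//
+// Locally typical sampling: ranks tokens by how close their surprise (-log p)
+// is to the entropy of the distribution and keeps the smallest such set with
+// cumulative probability at least p.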
+
+struct llama_sampler_typical {
+ const float p;
+ const size_t min_keep;
+};
+
+static const char * llama_sampler_typical_name(const struct llama_sampler * /*smpl*/) {
+ return "typical";
+}
+
+static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_typical *) smpl->ctx;
+
+ // Reference implementation:
+ // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
+ if (ctx->p >= 1.0f) {
+ return;
+ }
+
+ // Compute the softmax of logits and calculate entropy
+ llama_sampler_softmax_impl(cur_p, true);
+
+ float entropy = 0.0f;
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ entropy += -cur_p->data[i].p * logf(cur_p->data[i].p);
+ }
+
+ // Compute the absolute difference between negative log probability and entropy for each candidate
+ std::vector<float> shifted_scores;
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy);
+ shifted_scores.push_back(shifted_score);
+ }
+
+ // Sort tokens based on the shifted_scores and their corresponding indices
+ std::vector<size_t> indices(cur_p->size);
+ std::iota(indices.begin(), indices.end(), 0);
+
+ std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
+ return shifted_scores[a] < shifted_scores[b];
+ });
+
+ // Compute the cumulative probabilities
+ float cum_sum = 0.0f;
+ size_t last_idx = indices.size();
+
+ for (size_t i = 0; i < indices.size(); ++i) {
+ size_t idx = indices[i];
+ cum_sum += cur_p->data[idx].p;
+
+ // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
+ if (cum_sum > ctx->p && (ctx->min_keep == 0 || i >= ctx->min_keep - 1)) {
+ last_idx = i + 1;
+ break;
+ }
+ }
+
+ // Resize the output vector to keep only the locally typical tokens
+ std::vector<llama_token_data> cur_p_new;
+ for (size_t i = 0; i < last_idx; ++i) {
+ size_t idx = indices[i];
+ cur_p_new.push_back(cur_p->data[idx]);
+ }
+
+ // Replace the data in cur_p with the cur_p_new data
+ std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data);
+ cur_p->size = cur_p_new.size();
+ cur_p->sorted = false;
+}
+
+static struct llama_sampler * llama_sampler_typical_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_typical *) smpl->ctx;
+ return llama_sampler_init_typical(ctx->p, ctx->min_keep);
+}
+
+static void llama_sampler_typical_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_typical *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_typical_i = {
+ /* .name = */ llama_sampler_typical_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sampler_typical_apply,
+ /* .reset = */ nullptr,
+ /* .clone = */ llama_sampler_typical_clone,
+ /* .free = */ llama_sampler_typical_free,
+ /* .backend_init = */ nullptr,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ nullptr,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
+ const bool is_empty = (p >= 1.0f);
+
+ if (is_empty) {
+ return llama_sampler_init_empty("?typical");
+ }
+
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_typical_i,
+ /* .ctx = */ new llama_sampler_typical {
+ /* .p = */ p,
+ /* .min_keep = */ min_keep,
+ }
+ );
+}
+
+// temp
+
+struct llama_sampler_temp : public llama_sampler_backend {
+ const float temp;
+};
+
+static const char * llama_sampler_temp_name(const struct llama_sampler * smpl) {
+ auto * sctx = (llama_sampler_temp *) smpl->ctx;
+ return sctx->get_name();
+}
+
+static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ const auto * ctx = (llama_sampler_temp *) smpl->ctx;
+
+ llama_sampler_temp_impl(cur_p, ctx->temp);
+}
+
+static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_temp *) smpl->ctx;
+ return llama_sampler_init_temp(ctx->temp);
+}
+
+static void llama_sampler_temp_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_temp *) smpl->ctx;
+}
+
+static void llama_sampler_backend_temp_sampling(
+ struct ggml_context * ctx,
+ struct ggml_cgraph * gf,
+ struct llama_sampler_data * data,
+ float temp) {
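+    // A non-positive temperature is treated as greedy decoding: only the
+    // argmax token (and its logit) is kept. Otherwise the logits are scaled
+    // by 1/temp, which sharpens (temp < 1) or flattens (temp > 1) the distribution.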
+ if (temp <= 0.0f) {
+ // Find the most probable token index.
+ struct ggml_tensor * max_idx = ggml_argmax(ctx, data->logits);
+ ggml_set_name(max_idx, "temp_max_idx");
+
+ if (data->candidates) {
+ struct ggml_tensor * candidates_rows = ggml_reshape_2d(ctx, data->candidates, 1, data->candidates->ne[0]);
+ data->candidates = ggml_get_rows(ctx, candidates_rows, max_idx);
+ } else {
+ data->candidates = max_idx;
+ }
+
+ struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]);
+ data->logits = ggml_get_rows(ctx, logits_rows, max_idx);
+
+ return;
+ }
+
+ data->logits = ggml_scale(ctx, data->logits, 1.0f / temp);
+
+ GGML_UNUSED(gf);
+}
+
+static bool llama_sampler_temp_backend_init(
+ struct llama_sampler * smpl,
+ ggml_backend_buffer_type_t buft) {
+ auto * sctx = (llama_sampler_temp *) smpl->ctx;
+
+ const bool res = llama_sampler_backend_support(smpl, buft);
+
+ sctx->init(res);
+
+ return res;
+}
+
+static void llama_sampler_temp_backend_apply(
+ struct llama_sampler * smpl,
+ struct ggml_context * ctx,
+ struct ggml_cgraph * gf,
+ struct llama_sampler_data * data) {
+ auto * sctx = (llama_sampler_temp *) smpl->ctx;
+ llama_sampler_backend_temp_sampling(ctx, gf, data, sctx->temp);
+}
+
+static struct llama_sampler_i llama_sampler_temp_i = {
+ /* .name = */ llama_sampler_temp_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sampler_temp_apply,
+ /* .reset = */ nullptr,
+ /* .clone = */ llama_sampler_temp_clone,
+ /* .free = */ llama_sampler_temp_free,
+ /* .backend_init = */ llama_sampler_temp_backend_init,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ llama_sampler_temp_backend_apply,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_temp(float temp) {
+ const bool is_empty = temp == 1.0f;
+
+ if (is_empty) {
+ return llama_sampler_init_empty("?temp");
+ }
+
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_temp_i,
+ /* .ctx = */ new llama_sampler_temp {
+ ("temp"),
+            /* .temp = */ temp,
+ }
+ );
+}
+
+// temp-ext
+
+struct llama_sampler_temp_ext : public llama_sampler_backend {
+ const float temp;
+ const float delta;
+ const float exponent;
+};
+
+static const char * llama_sampler_temp_ext_name(const struct llama_sampler * smpl) {
+ auto * sctx = (llama_sampler_temp_ext *) smpl->ctx;
+ return sctx->get_name();
+}
+
+static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
+ if (ctx->delta > 0) {
+ const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
+ const float max_temp = ctx->temp + ctx->delta;
+
+ float exponent_val = ctx->exponent;
+
+ // no need to do anything if there is only one (or zero) candidates
+ if (cur_p->size <= 1) {
+ return;
+ }
+
+ // Calculate maximum possible entropy
+ float max_entropy = -logf(1.0f / cur_p->size);
+
+ llama_sampler_softmax_impl(cur_p, true);
+
+ // Calculate entropy of the softmax probabilities
+ float entropy = 0.0f;
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ float prob = cur_p->data[i].p;
+ if (prob > 0.0f) { // Ensure no log(0)
+ entropy -= prob * logf(prob);
+ }
+ }
+
+ // Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above)
+ float normalized_entropy = entropy / max_entropy;
+
+ // Map the normalized entropy to the desired temperature range using the power function
+ float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
+
+ #ifdef DEBUG
+ LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
+ LLAMA_LOG_INFO("Entropy: %f\n", entropy);
+ LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
+ LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
+ LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
+ LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
+ #endif
+
+ // Apply the dynamically calculated temperature scaling
+ llama_sampler_temp_impl(cur_p, dyn_temp);
+
+ // Re-compute softmax probabilities after scaling logits with dynamic temperature
+ const double max_l_double = cur_p->data[0].logit;
+
+ double cum_sum_double = 0.0;
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ double p = exp(cur_p->data[i].logit - max_l_double);
+ cur_p->data[i].p = p; // Store the scaled probability
+ cum_sum_double += p;
+ }
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
+ }
+
+ #ifdef DEBUG
+ // Print the updated top 25 probabilities after temperature scaling
+ LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
+ for (size_t i = 0; i < 25 && i < cur_p->size; ++i) {
+ LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, cur_p->data[i].p * 100.0f);
+ }
+ #endif
+ } else {
+ llama_sampler_temp_impl(cur_p, ctx->temp);
+ }
+}
+
+static struct llama_sampler * llama_sampler_temp_ext_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_temp_ext *) smpl->ctx;
+ return llama_sampler_init_temp_ext(ctx->temp, ctx->delta, ctx->exponent);
+}
+
+static void llama_sampler_temp_ext_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_temp_ext *) smpl->ctx;
+}
+
+static bool llama_sampler_temp_ext_backend_init(
+ struct llama_sampler * smpl,
+ ggml_backend_buffer_type_t buft) {
+ auto * sctx = (llama_sampler_temp_ext *) smpl->ctx;
+
+ const bool res = llama_sampler_backend_support(smpl, buft);
+
+ sctx->init(res);
+
+ return res;
+}
+
+static void llama_sampler_temp_ext_backend_apply(
+ struct llama_sampler * smpl,
+ struct ggml_context * ctx,
+ struct ggml_cgraph * gf,
+ struct llama_sampler_data * data) {
+ auto * sctx = (llama_sampler_temp_ext *) smpl->ctx;
+
+ // Revert to standard temperature scaling if delta or temp are non-positive.
+ if (sctx->delta <= 0.0f || sctx->temp <= 0.0f) {
+ llama_sampler_backend_temp_sampling(ctx, gf, data, sctx->temp);
+ return;
+ }
+
+ // Calculate min_temp, max_temp, and max_entropy.
+ const float min_temp = std::max(0.0f, sctx->temp - sctx->delta);
+ const float max_temp = sctx->temp + sctx->delta;
+ const float max_entropy = logf(data->logits->ne[0]);
+
+ // Calculate the probabilities.
+ struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits);
+ ggml_set_name(probs, "temp_ext_softmax_probs");
+
+ // Clamp probabilities to avoid log(0) which would give -inf
+ struct ggml_tensor * probs_clamped = ggml_clamp(ctx, probs, 1e-10f, 1.0f);
+ ggml_set_name(probs_clamped, "temp_ext_probs_clamped");
+
+ // Calculate the entropy, entropy = -Σ(p * log(p)).
+ struct ggml_tensor * log_probs = ggml_log(ctx, probs_clamped);
+ struct ggml_tensor * p_log_p = ggml_mul(ctx, probs_clamped, log_probs);
+ struct ggml_tensor * sum_p_log_p = ggml_sum(ctx, p_log_p);
+ struct ggml_tensor * entropy = ggml_scale(ctx, sum_p_log_p, -1.0f);
+ ggml_set_name(log_probs, "temp_ext_log_probs");
+ ggml_set_name(p_log_p, "temp_ext_p_log_p");
+ ggml_set_name(sum_p_log_p, "temp_ext_sum_p_log_p");
+ ggml_set_name(entropy, "temp_ext_entropy");
+
+ // Normalize the entropy, norm_entropy = entropy / max_entropy
+ struct ggml_tensor * norm_entropy = ggml_scale(ctx, entropy, 1.0f / max_entropy);
+ ggml_set_name(norm_entropy, "temp_ext_norm_entropy");
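+
+    // norm_entropy lies in [0, 1]: values near 0 (a peaked distribution) pull
+    // dyn_temp towards min_temp, values near 1 (a flat distribution) towards max_temp.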
+
+ // Calculate the dynamic temperature:
+ // dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent);
+ //
+ // Calculate powf(normalized_entropy, exponent) as
+ // norm_entropy^exponent = exp(exponent * log(norm_entropy))
+ struct ggml_tensor * log_norm_entropy = ggml_log(ctx, norm_entropy);
+ struct ggml_tensor * scaled_log = ggml_scale(ctx, log_norm_entropy, sctx->exponent);
+ struct ggml_tensor * pow_entropy = ggml_exp(ctx, scaled_log);
+ // With pow_entropy computed we can now compute dyn_temp, scaling by
+ // (max_temp - min_temp) and then adding min_temp.
+ struct ggml_tensor * dyn_temp = ggml_scale_bias(ctx, pow_entropy, max_temp - min_temp, min_temp);
+ ggml_set_name(log_norm_entropy, "temp_ext_log_norm_entropy");
+ ggml_set_name(scaled_log, "temp_ext_scaled_log");
+ ggml_set_name(pow_entropy, "temp_ext_pow_entropy");
+ ggml_set_name(dyn_temp, "temp_ext_dyn_temp");
+
+ // Scale the logits by the dynamic temperature
+ struct ggml_tensor * scaled_logits = ggml_div(ctx, data->logits, dyn_temp);
+ ggml_set_name(scaled_logits, "temp_ext_scaled_logits");
+
+ data->logits = scaled_logits;
+}
+
+static struct llama_sampler_i llama_sampler_temp_ext_i = {
+ /* .name = */ llama_sampler_temp_ext_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sampler_temp_ext_apply,
+ /* .reset = */ nullptr,
+ /* .clone = */ llama_sampler_temp_ext_clone,
+ /* .free = */ llama_sampler_temp_ext_free,
+ /* .backend_init = */ llama_sampler_temp_ext_backend_init,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ llama_sampler_temp_ext_backend_apply,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
+ const bool is_empty = temp == 1.0f && delta <= 0.0f;
+
+ if (is_empty) {
+ return llama_sampler_init_empty("?temp-ext");
+ }
+
+ auto * res = llama_sampler_init(
+ /* .iface = */ &llama_sampler_temp_ext_i,
+ /* .ctx = */ new llama_sampler_temp_ext {
+ ("temp-ext"),
+ /* .temp = */ temp,
+ /* .delta = */ delta,
+ /* .exponent = */ exponent,
+ }
+ );
+
+ return res;
+}
+
+// xtc
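+//
+// XTC ("exclude top choices"): with the configured probability, removes every
+// token whose probability is at or above the threshold except the least likely
+// of them, shifting mass to lower-probability continuations.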
+
+struct llama_sampler_xtc {
+ const float probability;
+ const float threshold;
+ const size_t min_keep;
+
+ const uint32_t seed;
+ uint32_t seed_cur;
+
+ std::mt19937 rng;
+};
+
+static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
+ return "xtc";
+}
+
+static void llama_sampler_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_xtc *) smpl->ctx;
+
+ if (ctx->probability <= 0.0f
+ || ctx->threshold > 0.5f
+ || cur_p->size < 2) {
+ return;
+ }
+
+ std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
+ float chance = distribution(ctx->rng);
+ if (chance > ctx->probability) {
+ return;
+ }
+
+ llama_sampler_softmax_impl(cur_p, true);
+
+ int pos_last = 0;
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ if (cur_p->data[i].p >= ctx->threshold) {
+ pos_last = i;
+ } else {
+ break;
+ }
+ }
+
+ if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
+ cur_p->data += pos_last;
+ cur_p->size -= pos_last;
+ }
+}
+
+static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_xtc *) smpl->ctx;
+ auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed);
+
+ // copy the state
+ {
+ auto * result_ctx = (llama_sampler_xtc *) result->ctx;
+
+ result_ctx->rng = ctx->rng;
+ }
+
+ return result;
+}
+
+static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_xtc *) smpl->ctx;
+}
+
+static void llama_sampler_xtc_reset(struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_xtc *) smpl->ctx;
+ ctx->seed_cur = get_rng_seed(ctx->seed);
+ ctx->rng.seed(ctx->seed_cur);
+}
+
+static struct llama_sampler_i llama_sampler_xtc_i = {
+ /* .name = */ llama_sampler_xtc_name,
+ /* .accept = */ nullptr,
+    /* .apply             = */ llama_sampler_xtc_apply,
+ /* .reset = */ llama_sampler_xtc_reset,
+ /* .clone = */ llama_sampler_xtc_clone,
+ /* .free = */ llama_sampler_xtc_free,
+ /* .backend_init = */ nullptr,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ nullptr,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
+ const bool is_empty = (p <= 0.0f || t > 0.5f);
+
+ if (is_empty) {
+ return llama_sampler_init_empty("?xtc");
+ }
+
+ const auto seed_cur = get_rng_seed(seed);
+
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_xtc_i,
+ /* .ctx = */ new llama_sampler_xtc {
+ /* .probability = */ p,
+ /* .threshold = */ t,
+ /* .min_keep = */ min_keep,
+ /* .seed = */ seed,
+ /* .seed_cur = */ seed_cur,
+ /* .rng = */ std::mt19937(seed_cur),
+ }
+ );
+}
+
+// mirostat
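+//
+// Mirostat v1 (https://arxiv.org/abs/2007.14966): adaptively truncates the
+// distribution so that the surprise of the sampled tokens tracks the target
+// tau, updating the running estimate mu with learning rate eta.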
+
+struct llama_sampler_mirostat {
+ const int32_t n_vocab;
+
+ const uint32_t seed;
+ uint32_t seed_cur;
+
+ const float tau;
+ const float eta;
+
+ const int32_t m;
+
+ float mu;
+
+ std::mt19937 rng;
+};
+
+static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) {
+ return "mirostat";
+}
+
+static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
+
+ llama_sampler_softmax_impl(cur_p, true);
+
+ // Estimate s_hat using the most probable m tokens
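+    // (s_hat is a least-squares estimate of the Zipf exponent, fitted from
+    // t_i = log((i+2)/(i+1)) and b_i = log(p_i / p_{i+1}) over the top m tokens)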
+ float s_hat = 0.0;
+ float sum_ti_bi = 0.0;
+ float sum_ti_sq = 0.0;
+ for (size_t i = 0; i < size_t(ctx->m - 1) && i < cur_p->size - 1; ++i) {
+ float t_i = logf(float(i + 2) / float(i + 1));
+ float b_i = logf(cur_p->data[i].p / cur_p->data[i + 1].p);
+ sum_ti_bi += t_i * b_i;
+ sum_ti_sq += t_i * t_i;
+ }
+ s_hat = sum_ti_bi / sum_ti_sq;
+
+ // Compute k from the estimated s_hat and target surprise value
+ float epsilon_hat = s_hat - 1;
+ float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat);
+
+ llama_sampler_top_k_impl(cur_p, std::max(int(k), 1));
+
+ llama_sampler_softmax_impl(cur_p, true);
+
+ const int idx = llama_sample_dist(cur_p, ctx->rng);
+
+ cur_p->selected = idx;
+
+ float observed_surprise = -log2f(cur_p->data[idx].p);
+ float e = observed_surprise - ctx->tau;
+
+ // Update mu using the learning rate and error
+ ctx->mu = ctx->mu - ctx->eta * e;
+}
+
+static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_mirostat *) smpl->ctx;
+ auto * result = llama_sampler_init_mirostat(ctx->n_vocab, ctx->seed, ctx->tau, ctx->eta, ctx->m);
+
+ // copy the state
+ {
+        auto * result_ctx = (llama_sampler_mirostat *) result->ctx;
+
+ result_ctx->mu = ctx->mu;
+ result_ctx->rng = ctx->rng;
+ }
+
+ return result;
+}
+
+static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
+ ctx->mu = 2.0f*ctx->tau;
+ ctx->seed_cur = get_rng_seed(ctx->seed);
+ ctx->rng.seed(ctx->seed_cur);
+}
+
+static void llama_sampler_mirostat_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_mirostat *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_mirostat_i = {
+ /* .name = */ llama_sampler_mirostat_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sampler_mirostat_apply,
+ /* .reset = */ llama_sampler_mirostat_reset,
+ /* .clone = */ llama_sampler_mirostat_clone,
+ /* .free = */ llama_sampler_mirostat_free,
+ /* .backend_init = */ nullptr,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ nullptr,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
+ const auto seed_cur = get_rng_seed(seed);
+
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_mirostat_i,
+ /* .ctx = */ new llama_sampler_mirostat {
+ /* .n_vocab = */ n_vocab,
+ /* .seed = */ seed,
+ /* .seed_cur = */ seed_cur,
+ /* .tau = */ tau,
+ /* .eta = */ eta,
+ /* .m = */ m,
+ /* .mu = */ 2.0f*tau,
+ /* .rng = */ std::mt19937(seed_cur),
+ }
+ );
+}
+
+// mirostat v2
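+//
+// Simplified Mirostat: instead of estimating a Zipf exponent, it directly
+// truncates all tokens whose surprise -log2(p) exceeds mu, then updates mu
+// from the surprise of the sampled token.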
+
+struct llama_sampler_mirostat_v2 {
+ const uint32_t seed;
+ uint32_t seed_cur;
+
+ const float tau;
+ const float eta;
+
+ float mu;
+
+ std::mt19937 rng;
+};
+
+static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) {
+ return "mirostat-v2";
+}
+
+static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
+
+ llama_sampler_softmax_impl(cur_p, true);
+
+ // Truncate the words with surprise values greater than mu
+ cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
+ return -log2f(candidate.p) > ctx->mu;
+ }));
+
+ if (cur_p->size == 0) {
+ cur_p->size = 1;
+ }
+
+ // Normalize the probabilities of the remaining words
+ llama_sampler_softmax_impl(cur_p, true);
+
+ const int idx = llama_sample_dist(cur_p, ctx->rng);
+
+ cur_p->selected = idx;
+
+ float observed_surprise = -log2f(cur_p->data[idx].p);
+ float e = observed_surprise - ctx->tau;
+
+ // Update mu using the learning rate and error
+ ctx->mu = ctx->mu - ctx->eta * e;
+}
+
+static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
+ ctx->mu = 2.0f*ctx->tau;
+ ctx->seed_cur = get_rng_seed(ctx->seed);
+ ctx->rng.seed(ctx->seed_cur);
+}
+
+static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_mirostat_v2 *) smpl->ctx;
+
+ auto * result = llama_sampler_init_mirostat_v2(ctx->seed, ctx->tau, ctx->eta);
+
+ // copy the state
+ {
+ auto * result_ctx = (llama_sampler_mirostat_v2 *) result->ctx;
+
+ result_ctx->mu = ctx->mu;
+ result_ctx->rng = ctx->rng;
+ }
+
+ return result;
+}
+
+static void llama_sampler_mirostat_v2_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_mirostat_v2 *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
+ /* .name = */ llama_sampler_mirostat_v2_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sampler_mirostat_v2_apply,
+ /* .reset = */ llama_sampler_mirostat_v2_reset,
+ /* .clone = */ llama_sampler_mirostat_v2_clone,
+ /* .free = */ llama_sampler_mirostat_v2_free,
+ /* .backend_init = */ nullptr,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ nullptr,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
+ auto seed_cur = get_rng_seed(seed);
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_mirostat_v2_i,
+ /* .ctx = */ new llama_sampler_mirostat_v2 {
+ /* .seed = */ seed,
+ /* .seed_cur = */ seed_cur,
+ /* .tau = */ tau,
+ /* .eta = */ eta,
+ /* .mu = */ 2.0f*tau,
+ /* .rng = */ std::mt19937(seed_cur),
+ }
+ );
+}
+
+// grammar
+
+struct llama_sampler_grammar {
+ const struct llama_vocab * vocab;
+
+ std::string grammar_str;
+ std::string grammar_root;
+
+ struct llama_grammar * grammar;
+};
+
+static const char * llama_sampler_grammar_name(const struct llama_sampler * /*smpl*/) {
+ return "grammar";
+}
+
+static void llama_sampler_grammar_accept_impl(struct llama_sampler * smpl, llama_token token) {
+ auto * ctx = (llama_sampler_grammar *) smpl->ctx;
+ if (ctx->grammar) {
+ llama_grammar_accept_impl(*ctx->grammar, token);
+ }
+}
+
+static void llama_sampler_grammar_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_grammar *) smpl->ctx;
+ if (ctx->grammar) {
+ llama_grammar_apply_impl(*ctx->grammar, cur_p);
+ }
+}
+
+// Fwd declare to break reset --> init_impl --> llama_sampler_grammar_i --> reset cycle.
+static struct llama_sampler * llama_sampler_init_grammar_impl(
+ const struct llama_vocab * vocab,
+ const char * grammar_str,
+ const char * grammar_root,
+ bool lazy,
+ const char ** trigger_words,
+ size_t num_trigger_words,
+ const llama_token * trigger_tokens,
+ size_t num_trigger_tokens,
+ const char ** trigger_patterns,
+ size_t num_trigger_patterns);
+
+static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_grammar *) smpl->ctx;
+ if (!ctx->grammar) {
+ return;
+ }
+
+ std::vector<const char *> trigger_patterns_c;
+ trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size());
+ for (auto & trigger_pattern : ctx->grammar->trigger_patterns) {
+ trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
+ }
+
+ auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
+ ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
+ ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
+
+ llama_grammar_free_impl(ctx->grammar);
+ ctx->grammar = grammar_new;
+}
+
+static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
+
+ auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);
+ GGML_ASSERT(result);
+
+ // copy the state
+ {
+ auto * result_ctx = (llama_sampler_grammar *) result->ctx;
+
+ if (ctx->grammar) {
+ result_ctx->grammar_str = ctx->grammar_str;
+ result_ctx->grammar_root = ctx->grammar_root;
+
+ result_ctx->grammar = llama_grammar_clone_impl(*ctx->grammar);
+ }
+ }
+
+ return result;
+}
+
+static void llama_sampler_grammar_free(struct llama_sampler * smpl) {
+ const auto * ctx = (llama_sampler_grammar *) smpl->ctx;
+
+ if (ctx->grammar) {
+ llama_grammar_free_impl(ctx->grammar);
+ }
+
+ delete ctx;
+}
+
+static struct llama_sampler_i llama_sampler_grammar_i = {
+ /* .name = */ llama_sampler_grammar_name,
+ /* .accept = */ llama_sampler_grammar_accept_impl,
+ /* .apply = */ llama_sampler_grammar_apply,
+ /* .reset = */ llama_sampler_grammar_reset,
+ /* .clone = */ llama_sampler_grammar_clone,
+ /* .free = */ llama_sampler_grammar_free,
+ /* .backend_init = */ nullptr,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ nullptr,
+ /* .backend_set_input = */ nullptr,
+};
+
+static struct llama_sampler * llama_sampler_init_grammar_impl(
+ const struct llama_vocab * vocab,
+ const char * grammar_str,
+ const char * grammar_root,
+ bool lazy,
+ const char ** trigger_words,
+ size_t num_trigger_words,
+ const llama_token * trigger_tokens,
+ size_t num_trigger_tokens,
+ const char ** trigger_patterns,
+ size_t num_trigger_patterns) {
+ auto * ctx = new llama_sampler_grammar;
+
+ if (grammar_str != nullptr && grammar_str[0] != '\0') {
+ std::string trigger_pattern;
+ llama_grammar * grammar = nullptr;
+ // TODO: remove trigger_words support.
+ if (trigger_words != nullptr && num_trigger_words > 0) {
+ GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
+ trigger_pattern = "[\\s\\S]*?(";
+ for (size_t i = 0; i < num_trigger_words; ++i) {
+ static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
+ if (i > 0) {
+ trigger_pattern += "|";
+ }
+ trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
+ }
+ trigger_pattern += ")[\\s\\S]*";
+
+ std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() };
+ grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, tmp_trigger_patterns.data(), tmp_trigger_patterns.size(), trigger_tokens, num_trigger_tokens);
+ } else {
+ grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens);
+ }
+ *ctx = {
+ /* .vocab = */ vocab,
+ /* .grammar_str = */ grammar_str,
+ /* .grammar_root = */ grammar_root,
+ /* .grammar = */ grammar,
+ };
+ if (!ctx->grammar) {
+ delete ctx;
+ return nullptr;
+ }
+ } else {
+ *ctx = {
+ /* .vocab = */ vocab,
+ /* .grammar_str = */ {},
+ /* .grammar_root = */ {},
+ /* .grammar = */ nullptr,
+ };
+ }
+
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_grammar_i,
+ /* .ctx = */ ctx
+ );
+}
+
+struct llama_sampler * llama_sampler_init_grammar(
+ const struct llama_vocab * vocab,
+ const char * grammar_str,
+ const char * grammar_root) {
+ return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0);
+}
+
+struct llama_sampler * llama_sampler_init_grammar_lazy(
+ const struct llama_vocab * vocab,
+ const char * grammar_str,
+ const char * grammar_root,
+ const char ** trigger_words,
+ size_t num_trigger_words,
+ const llama_token * trigger_tokens,
+ size_t num_trigger_tokens) {
+ return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens, nullptr, 0);
+}
+
+struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+ const struct llama_vocab * vocab,
+ const char * grammar_str,
+ const char * grammar_root,
+ const char ** trigger_patterns,
+ size_t num_trigger_patterns,
+ const llama_token * trigger_tokens,
+ size_t num_trigger_tokens) {
+ return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns);
+}
+
+// penalties
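+//
+// Combines three repetition penalties over the last penalty_last_n tokens:
+// penalty_repeat divides positive logits and multiplies negative ones (either
+// way reducing the token's likelihood), penalty_freq subtracts in proportion
+// to the occurrence count, and penalty_present subtracts a flat amount for
+// any token that occurred at all.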
+
+struct llama_sampler_penalties {
+ const int32_t penalty_last_n;
+ const float penalty_repeat;
+ const float penalty_freq;
+ const float penalty_present;
+
+ ring_buffer<llama_token> prev;
+
+ // a frequency map to count token occurrences
+ std::unordered_map<llama_token, int> token_count;
+};
+
+static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
+ return "penalties";
+}
+
+static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_token token) {
+ auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+ if (ctx->penalty_last_n == 0) {
+ return;
+ }
+
+ ctx->token_count[token]++;
+
+    // if the ring buffer is full, push_back below will evict the oldest token,
+    // so remove it from the frequency map first
+ if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
+ const auto old = ctx->prev.front();
+
+ ctx->token_count[old]--;
+ if (ctx->token_count[old] == 0) {
+ ctx->token_count.erase(old);
+ }
+ }
+
+ ctx->prev.push_back(token);
+
+#if 0
+ // sanity check
+ std::unordered_map<llama_token, int> tmp;
+ for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
+ tmp[ctx->prev.rat(i)]++;
+ }
+
+ assert(ctx->token_count == tmp);
+#endif
+}
+
+static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+
+ if ((ctx->penalty_last_n == 0) ||
+ (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
+ return;
+ }
+
+ // Apply frequency and presence penalties to the cur_p
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
+ if (token_iter == ctx->token_count.end()) {
+ continue;
+ }
+
+ const int count = token_iter->second;
+
+ assert(count > 0 && count <= ctx->penalty_last_n);
+
+        // The academic publication that described this technique only divided by the penalty,
+        // but that would make tokens with negative logits more likely, which is obviously wrong.
+        // The common fix is to multiply negative logits by the penalty instead of dividing them.
+ if (cur_p->data[i].logit <= 0) {
+ cur_p->data[i].logit *= ctx->penalty_repeat;
+ } else {
+ cur_p->data[i].logit /= ctx->penalty_repeat;
+ }
+
+ cur_p->data[i].logit -= float(count) * ctx->penalty_freq + float(count > 0) * ctx->penalty_present;
+ }
+
+ cur_p->sorted = false;
+}
+
+static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+ ctx->prev.clear();
+ ctx->token_count.clear();
+}
+
+static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
+ auto * result = llama_sampler_init_penalties(
+ ctx->penalty_last_n,
+ ctx->penalty_repeat,
+ ctx->penalty_freq,
+ ctx->penalty_present);
+
+ // copy the state
+ {
+ auto * result_ctx = (llama_sampler_penalties *) result->ctx;
+
+ result_ctx->prev = ctx->prev;
+ }
+
+ return result;
+}
+
+static void llama_sampler_penalties_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_penalties *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_penalties_i = {
+ /* .name = */ llama_sampler_penalties_name,
+ /* .accept = */ llama_sampler_penalties_accept,
+ /* .apply = */ llama_sampler_penalties_apply,
+ /* .reset = */ llama_sampler_penalties_reset,
+ /* .clone = */ llama_sampler_penalties_clone,
+ /* .free = */ llama_sampler_penalties_free,
+ /* .backend_init = */ nullptr,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ nullptr,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_penalties(
+ int32_t penalty_last_n,
+ float penalty_repeat,
+ float penalty_freq,
+ float penalty_present) {
+ penalty_last_n = std::max(penalty_last_n, 0);
+
+ const bool is_empty = (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f));
+
+ if (is_empty) {
+ return llama_sampler_init_empty("?penalties");
+ }
+
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_penalties_i,
+ /* .ctx = */ new llama_sampler_penalties {
+ /* .penalty_last_n = */ penalty_last_n,
+ /* .penalty_repeat = */ penalty_repeat,
+ /* .penalty_freq = */ penalty_freq,
+ /* .penalty_present = */ penalty_present,
+ /* .prev = */ ring_buffer<llama_token>(penalty_last_n),
+ /* .token_count = */ {},
+ }
+ );
+}
+
+// top-n-sigma
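+//
+// Keeps only tokens whose logit lies within n standard deviations of the
+// maximum logit (statistics computed over the non -INFINITY entries); all
+// other tokens are masked out before re-normalizing.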
+
+struct llama_sampler_top_n_sigma {
+ const float n;
+};
+
+static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler * /*smpl*/) {
+ return "top-n-sigma";
+}
+
+static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
+
+ if (ctx->n <= 0.0f || cur_p->size <= 1) {
+ return;
+ }
+
+ // find max logit and calculate mean
+ float max = cur_p->data[0].logit;
+ float logits_sum = 0;
+ size_t valid_count = 0;
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ // Only count non-negative infinity values
+ if (cur_p->data[i].logit != -INFINITY) {
+ max = std::max(max, cur_p->data[i].logit);
+ logits_sum += cur_p->data[i].logit;
+ valid_count++;
+ }
+ }
+ float mean = valid_count > 0 ? logits_sum/valid_count : 0;
+
+ // calculate standard deviation
+ float acc = 0;
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ // Skip -infinity in std calculation
+ if (cur_p->data[i].logit != -INFINITY) {
+ acc += pow(cur_p->data[i].logit - mean, 2);
+ }
+ }
+    float sigma = valid_count > 0 ? sqrt(acc/valid_count) : 0;
+
+    // apply mask
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (cur_p->data[i].logit < max - (ctx->n * sigma)) {
+ cur_p->data[i].logit = -INFINITY;
+ }
+ }
+
+ llama_sampler_softmax_impl(cur_p, true);
+}
+
+static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_top_n_sigma *) smpl->ctx;
+ return llama_sampler_init_top_n_sigma(ctx->n);
+}
+
+static void llama_sampler_top_n_sigma_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_top_n_sigma *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_top_n_sigma_i = {
+ /* .name = */ llama_sampler_top_n_sigma_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sampler_top_n_sigma_apply,
+ /* .reset = */ nullptr,
+ /* .clone = */ llama_sampler_top_n_sigma_clone,
+ /* .free = */ llama_sampler_top_n_sigma_free,
+ /* .backend_init = */ nullptr,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ nullptr,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_top_n_sigma(float n) {
+ const bool is_empty = (n <= 0.0f);
+
+ if (is_empty) {
+ return llama_sampler_init_empty("?top-n-sigma");
+ }
+
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_top_n_sigma_i,
+ /* .ctx = */ new llama_sampler_top_n_sigma {
+ /* .n = */ n,
+ }
+ );
+}
+
+// DRY
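+//
+// DRY ("don't repeat yourself") penalizes tokens that would extend a sequence
+// already present in the recent context, with
+//   penalty = dry_multiplier * dry_base^(repeat_len - dry_allowed_length)
+// where repeat_len is the length of the repetition the token would continue.
+// Sequence breakers ("restart sequences") reset the match length.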
+
+struct llama_sampler_dry {
+ int32_t total_context_size;
+
+ const float dry_multiplier;
+ const float dry_base;
+ const int32_t dry_allowed_length;
+ const int32_t dry_penalty_last_n;
+
+ std::unordered_multimap<llama_token, std::vector<llama_token>> dry_processed_breakers;
+ std::vector<int> dry_repeat_count;
+ std::unordered_map<llama_token, int> dry_max_token_repeat;
+ ring_buffer<llama_token> last_tokens;
+};
+
+// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
+static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
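+    // For each vocab token, record how the breaker string `str` could continue
+    // past it: if the token's text contains `str` the tail is empty; otherwise,
+    // if a suffix of the token's text matches a prefix of `str`, the remaining
+    // part of `str` is tokenized and stored as the tail (clamped to max_tail_len).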
+ for (llama_token token_id = 0; token_id < (llama_token) vocab.n_tokens(); token_id++) {
+ std::string word = vocab.detokenize({token_id}, true);
+ if (word.find(str) != std::string::npos) {
+ token_sequences.emplace(token_id, std::vector<llama_token>());
+ } else {
+ size_t word_len = word.size();
+ size_t str_len = str.size();
+            size_t pos = std::string::npos; // npos + 1 == 0, so the first find starts at index 0
+ while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
+ bool match = true;
+ size_t i;
+ for (i = 1; i < str_len && i + pos < word_len; ++i) {
+ if (word[pos + i] != str[i]) {
+ match = false;
+ break;
+ }
+ }
+ if (match) {
+ std::vector<llama_token> tokenization = vocab.tokenize(str.substr(i), false, false);
+ if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
+ tokenization.resize(max_tail_len);
+ }
+
+ // Ensure we don't already have a duplicate matching tokenization
+ auto its = token_sequences.equal_range(token_id);
+ bool found = false;
+ for (auto it = its.first; it != its.second; ++it) {
+ if (tokenization == it->second) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ token_sequences.emplace(token_id, tokenization);
+ }
+ }
+ }
+ }
+ }
+}
+
+static const char * llama_sampler_dry_name(const struct llama_sampler * /*smpl*/) {
+ return "dry";
+}
+
+static void llama_sampler_dry_accept(struct llama_sampler * smpl, llama_token token) {
+ auto * ctx = (llama_sampler_dry *) smpl->ctx;
+ if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
+ return;
+ }
+
+ ctx->last_tokens.push_back(token);
+}
+
+// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
+static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_dry *) smpl->ctx;
+
+ if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
+ return;
+ }
+
+ int32_t effective_dry_penalty_last_n = (ctx->dry_penalty_last_n == -1) ? ctx->total_context_size : std::max(ctx->dry_penalty_last_n, 0);
+ int last_n_repeat = std::min(std::min((int)ctx->last_tokens.size(), effective_dry_penalty_last_n), ctx->total_context_size);
+
+ if (last_n_repeat <= ctx->dry_allowed_length) {
+ return;
+ }
+
+ ctx->dry_repeat_count.assign(last_n_repeat, 0);
+ ctx->dry_max_token_repeat.clear();
+
+ // Step 1: Look for restart sequences to limit the maximum repetition length.
+ // Work backwards through the context looking for any token that begins a restart sequence.
+ //
+    // The collection `dry_processed_breakers` is a mapping from a "head" token to all "tail"
+ // sequences that together comprise a restart sequence. This allows us to quickly check
+ // whether each token is the head of a complete sequence. Most restart sequences are actually
+ // a single token, and for these the "tail" is an empty vector.
+ //
+ // If the token is a "head", test all restart sequences that begin with this token
+ // (there will often only be one sequence for each token, but if sequences like 'aaaq1' and
+ // 'aaa1' are used as restart strings, both could start with 'aaa' when tokenized). The
+ // longest matching sequence (if any) is used to limit the maximum repetition length.
+ //
+    // Note that in the case of a short sequence contained in a longer one, this might fail to
+ // find the smallest value for `rep_limit`. For example, if 'amniotic' and 'ni' are both used as
+ // restart sequences, 'ni' will be found first, and since it's shorter it will fail to suppress
+ // 'otic'. This is a minor issue since fully contained restart sequences are likely to be rare.
+ //
+ // This is theoretically worst-case O(N^2) for arbitrary restart sequences, which is why we
+    // have already clamped the maximum tail sequence length when generating `dry_processed_breakers`.
+ // With clamping, this scan is O(N) in the context length.
+
+ int rep_limit = last_n_repeat;
+ for (int i = 0; i < last_n_repeat; ++i) {
+ llama_token token = ctx->last_tokens.rat(i);
+ auto its = ctx->dry_processed_breakers.equal_range(token);
+ if (its.first == ctx->dry_processed_breakers.end()) {
+ continue;
+ }
+ int longest_match = -1;
+ for (auto it = its.first; it != its.second; ++it) {
+ // Note that (*it) does not contain the head character, so seq_len will be
+ // the restart sequence length minus 1.
+ // In the common case of a single-token restart sequence, (*it) will be empty
+ // and we will trivially match.
+ int seq_len = (int)it->second.size();
+ if (seq_len > longest_match && seq_len <= (int)i) {
+ bool match = true;
+ for (int offset = 0; offset < seq_len; ++offset) {
+ // The -1 when indexing `last_tokens` is because we already matched the head.
+ if (it->second[offset] != ctx->last_tokens.rat(i - offset - 1)) {
+ match = false;
+ break;
+ }
+ }
+ if (match) {
+ longest_match = seq_len;
+ }
+ }
+ }
+ if (longest_match >= 0) {
+ // We found a restart sequence starting `i` tokens from the end and continuing for
+ // `longest_match` tokens.
+ rep_limit = i - longest_match;
+ break;
+ }
+ }
+ if (rep_limit < ctx->dry_allowed_length) {
+ return;
+ }
+
+ // Step 2: Iterate in reverse over the last N tokens of the context, using the "Z-algorithm" (in
+ // the reverse direction) to efficiently compute the positions and lengths of suffixes appearing
+ // elsewhere in the context. We limit the suffix length to `rep_limit` to respect restart sequences.
+ //
+ // This algorithm is not currently documented on Wikipedia, but there is a clear description here:
+ // https://ivanyu.me/blog/2014/10/15/z-algorithm/
+ //
+ // The code below is adapted from the public domain implementation by the same author here:
+ // https://github.com/ivanyu/string-algorithms/blob/master/z_algorithm.py
+ //
+ // Example:
+ // Last N tokens: a b c c b c y a b c
+ // Repeat counts: 0 0 3 1 0 2 0 0 0 0
+ // ^
+ // This `3` means that the last three tokens of the context (a b c) also appear here.
+ //
+ // This step is worst case O(N) since the Z-algorithm is linear, despite the appearance of nested
+ // for/while loops. This can be seen by observing that the `lt` and `rt` bounds are set after each
+ // repeated suffix is detected (i.e. after each while loop when n > 0). These bound variables
+ // ensure that the inner while loops only examine each token in the context once as the outer
+ // for loop iterates over the context.
+
+ {
+ const int last = last_n_repeat - 1;
+
+ int rt = 0;
+ int lt = 0;
+
+ for (int k = 1; k < last_n_repeat; ++k) {
+ if (k > rt) {
+ // If k is outside the current Z-box, do naive computation.
+ int n = 0;
+ while (n + k < last_n_repeat && ctx->last_tokens.rat(n) == ctx->last_tokens.rat(n+k)) {
+ ++n;
+ }
+ ctx->dry_repeat_count[last - k] = std::min(n, rep_limit);
+ if (n > 0) {
+ lt = k;
+ rt = k + n - 1;
+ }
+ } else {
+ // If k is inside the current Z-box, consider two cases.
+
+ int p = k - lt; // Pair index.
+ int right_part_len = rt - k + 1;
+
+ if (ctx->dry_repeat_count[last - p] < right_part_len) {
+ int n = std::min(ctx->dry_repeat_count[last - p], rep_limit);
+ ctx->dry_repeat_count[last - k] = n;
+ } else {
+ int i = rt + 1;
+ while (i < last_n_repeat && ctx->last_tokens.rat(i) == ctx->last_tokens.rat(i - k)) {
+ i += 1;
+ }
+
+ int n = std::min(i - k, rep_limit);
+ ctx->dry_repeat_count[last - k] = n;
+ lt = k;
+ rt = i - 1;
+ }
+ }
+ }
+ }
+
+ // Step 3: Iterate over dry_repeat_count and last_tokens, examining the maximum repeat length
+ // that would be generated by emitting each new token that would extend a sequence.
+ //
+ // Following the same example as above:
+ // Last N tokens: a b c c b c y a b c
+ // Repeat counts: 0 0 3 1 0 2 0 0 0 0
+ //
+ // For each non-zero, look ahead one token. This token, if emitted, would extend the repetition.
+ // c: 3 -> 4 (from `a b c` to `a b c c`)
+ // b: 1 -> 2 (from `c` to `c b`)
+ // y: 2 -> 3 (from `b c` to `b c y`)
+
+ for (int i = 0; i < last_n_repeat - 1; ++i) {
+ int repeat_len = ctx->dry_repeat_count[i];
+ if (repeat_len >= ctx->dry_allowed_length) {
+ // This token ends a repeat, so the next token would continue one.
+ // By convention, the value of `repeat_len` only includes the tokens currently
+ // in the context, not the new token that would be added.
+ llama_token token = ctx->last_tokens.rat(last_n_repeat - 2 - i);
+ // Track the maximum sequence ending in this token.
+ const auto& it = ctx->dry_max_token_repeat.find(token);
+ if (it == ctx->dry_max_token_repeat.end() || it->second < repeat_len) {
+ ctx->dry_max_token_repeat[token] = repeat_len;
+ }
+ }
+ }
+
+ // Step 4: Apply logit penalties based on the maximum repeat length for relevant tokens.
+
+ // Prevent floating point overflow in `pow(penalty_base, exponent)` by clamping to `max_exponent`.
+ // Compute it from `penalty_base` and the approximate log of `std::numeric_limits<float>::max()`
+ const float FLOAT_MAX_LOG = 88.7228391f;
+ int max_exponent = 0;
+ if (ctx->dry_base > 1.000001f) {
+ max_exponent = FLOAT_MAX_LOG / std::log(ctx->dry_base);
+ }
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ const auto& af_kvp = ctx->dry_max_token_repeat.find(cur_p->data[i].id);
+ if (af_kvp != ctx->dry_max_token_repeat.end()) {
+ // Check all sequence breakers starting with this token
+ auto range = ctx->dry_processed_breakers.equal_range(cur_p->data[i].id);
+ bool is_single_token_breaker = false;
+
+ for (auto it = range.first; it != range.second; ++it) {
+ if (it->second.empty()) {
+ is_single_token_breaker = true;
+ break;
+ }
+ }
+
+ // Apply penalty only if it's not a single-token sequence breaker
+ if (!is_single_token_breaker) {
+ int repeat_exp = af_kvp->second - ctx->dry_allowed_length;
+ if (max_exponent > 0 && repeat_exp > max_exponent) {
+ repeat_exp = max_exponent;
+ }
+ float penalty = ctx->dry_multiplier * std::pow(ctx->dry_base, repeat_exp);
+ cur_p->data[i].logit -= penalty;
+ }
+ }
+ }
+
+ cur_p->sorted = false;
+}
+
+static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_dry *) smpl->ctx;
+ ctx->last_tokens.clear();
+ ctx->dry_repeat_count.clear();
+ ctx->dry_max_token_repeat.clear();
+}
+
+static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (llama_sampler_dry *) smpl->ctx;
+
+ llama_vocab dummy_vocab;
+
+    // a dummy vocab suffices here: the vocab is only needed to process raw sequence breakers,
+    // which has already been done - the processed breakers are copied below
+ auto * result = llama_sampler_init_dry(&dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
+
+ // Copy the state, including the processed breakers
+ {
+ auto * result_ctx = (llama_sampler_dry *) result->ctx;
+ result_ctx->dry_processed_breakers = ctx->dry_processed_breakers;
+ result_ctx->dry_repeat_count = ctx->dry_repeat_count;
+ result_ctx->dry_max_token_repeat = ctx->dry_max_token_repeat;
+ result_ctx->last_tokens = ctx->last_tokens;
+ }
+
+ return result;
+}
+
+static void llama_sampler_dry_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_dry *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_dry_i = {
+ /* .name = */ llama_sampler_dry_name,
+ /* .accept = */ llama_sampler_dry_accept,
+ /* .apply = */ llama_sampler_dry_apply,
+ /* .reset = */ llama_sampler_dry_reset,
+ /* .clone = */ llama_sampler_dry_clone,
+ /* .free = */ llama_sampler_dry_free,
+ /* .backend_init = */ nullptr,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ nullptr,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t n_ctx_train, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
+ int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? n_ctx_train : std::max(dry_penalty_last_n, 0);
+ std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
+ const int MAX_CHAR_LEN = 40;
+ const int MAX_SEQ_LEN = 20;
+
+ const bool dry_enabled = (dry_multiplier != 0.0f && dry_base >= 1.0f && dry_penalty_last_n != 0);
+
+ if (!dry_enabled) {
+ return llama_sampler_init_empty("?dry");
+ }
+
+    if (seq_breakers != nullptr && num_breakers > 0) { // DRY is guaranteed enabled past the early return above
+ // Process sequence breakers
+ for (size_t i = 0; i < num_breakers; ++i) {
+ if (seq_breakers[i] == nullptr || std::strlen(seq_breakers[i]) == 0) {
+ LLAMA_LOG_WARN("skipping null or empty DRY sequence breaker at index %zu\n", i);
+ continue;
+ }
+
+            std::string sequence_break(seq_breakers[i]); // non-empty: checked via strlen above
+
+ if (sequence_break.size() > MAX_CHAR_LEN) {
+ LLAMA_LOG_WARN("truncating DRY sequence breaker to %d characters\n", MAX_CHAR_LEN);
+ sequence_break.resize(MAX_CHAR_LEN);
+ }
+
+ get_overlapping_token_sequences(*vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
+ }
+ }
+
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_dry_i,
+ /* .ctx = */ new llama_sampler_dry {
+ /* .total_context_size = */ n_ctx_train,
+ /* .dry_multiplier = */ dry_multiplier,
+ /* .dry_base = */ dry_base,
+ /* .dry_allowed_length = */ dry_allowed_length,
+ /* .dry_penalty_last_n = */ dry_penalty_last_n,
+ /* .dry_processed_breakers = */ std::move(processed_breakers),
+            /* .dry_repeat_count = */ std::vector<int>(effective_dry_penalty_last_n, 0),
+            /* .dry_max_token_repeat = */ {},
+            /* .last_tokens = */ ring_buffer<llama_token>(effective_dry_penalty_last_n),
+ }
+ );
+}
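+
+// usage sketch (illustrative values, not prescriptive): enable DRY over the whole
+// training context with the commonly suggested strength, no extra sequence breakers
+//
+//   llama_sampler * smpl = llama_sampler_init_dry(vocab, n_ctx_train,
+//       /*dry_multiplier=*/0.8f, /*dry_base=*/1.75f, /*dry_allowed_length=*/2,
+//       /*dry_penalty_last_n=*/-1, /*seq_breakers=*/nullptr, /*num_breakers=*/0);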
+
+// wrapper for test-sampling.cpp
+struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
+ llama_vocab dummy_vocab;
+ auto * result = llama_sampler_init_dry(&dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
+ auto * ctx = (llama_sampler_dry *) result->ctx;
+
+ // Process the token-based sequence breakers
+ ctx->dry_processed_breakers.clear();
+ if (seq_breakers.empty()) {
+ LLAMA_LOG_WARN("empty DRY sequence breakers list in llama_sampler_init_dry_testing\n");
+ } else {
+ for (const auto& breaker : seq_breakers) {
+ if (breaker.empty()) {
+ LLAMA_LOG_WARN("skipping DRY empty sequence breaker\n");
+ continue;
+ }
+ llama_token head_token = breaker[0];
+ std::vector<llama_token> tail_tokens(breaker.begin() + 1, breaker.end());
+ ctx->dry_processed_breakers.emplace(head_token, std::move(tail_tokens));
+ }
+
+ if (ctx->dry_processed_breakers.empty()) {
+ LLAMA_LOG_WARN("no valid DRY sequence breakers processed in llama_sampler_init_dry_testing\n");
+ }
+ }
+
+ return result;
+}
+
+// adaptive-p sampler state
+//
+// maintains an exponential moving average of the *ORIGINAL* probabilities
+// of selected tokens, used to compute an adapted target at each sampling step.
+//
+// see llama.h for a full description of the sampler
+//
+// ref: https://github.com/ggml-org/llama.cpp/pull/17927
+//
+struct llama_sampler_adaptive_p {
+ const float target; // target probability (0.0 - 1.0; negative = disabled)
+ const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99)
+ const uint32_t seed; // original RNG seed
+ uint32_t seed_cur; // actual RNG seed
+ std::mt19937 rng; // RNG state
+ float weighted_sum; // sum(p_i * decay^i)
+ float total_weight; // sum(decay^i), converges to 1/(1-decay)
+ std::vector<float> original_probs; // pre-transform probs, cached for EMA update
+ llama_token pending_token_id; // token ID of selected token
+ int32_t pending_token_idx; // index of orig. prob. of selected token in original_probs
+};
+
+// adaptive probability transformation constants
+static constexpr float DISTRIBUTION_WIDTH = 0.3f;
+static constexpr float PEAK_LOGIT_VALUE = 5.0f;
+static constexpr float SHARPNESS = 10.0f;
+static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH;
+
+static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) {
+ return "adaptive-p";
+}
+
+static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
+
+ llama_sampler_softmax_impl(cur_p, false);
+
+ if (ctx->target < 0.0f) {
+        // at negative target values, adaptive-p is a no-op:
+        // we simply sample from the existing distribution
+ cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
+ return;
+ }
+
+ // store the original probabilities
+ ctx->original_probs.resize(cur_p->size);
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ ctx->original_probs[i] = cur_p->data[i].p;
+ }
+
+ // using the EMA, compute the adapted target probability for the current sampling step
+ auto target = std::clamp(ctx->target, 0.0f, 1.0f);
+ float adapted_target = std::clamp(
+ ctx->total_weight == 0.0f ? target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight),
+ 0.0f, 1.0f
+ );
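+    // e.g. with target = 0.5: if recently selected tokens averaged p ~= 0.6 under the
+    // original distribution, the adapted target becomes 2*0.5 - 0.6 = 0.4, steering the
+    // next selection back toward the long-run target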
+
+ // adaptive probability transform
+ //
+ // quadratic near target for fine differentiation, transitioning to linear decay in the
+ // tails. unbounded negative logits ensure proper suppression of far-from-target tokens
+ // after the softmax.
+ //
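+    // e.g. a candidate exactly at the adapted target keeps the peak logit 5.0, while one
+    // a full width away (|p - adapted_target| = 0.3, i.e. dist = 1.0) maps to
+    // 5.0 - 10.0 * 1.0 / (1.0 + 1.0) = 0.0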
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ if (cur_p->data[i].logit == -INFINITY) {
+ // don't transform logits that are -INFINITY
+ // (as masked out by e.g. min-p and top-p when using backend sampling)
+ continue;
+ }
+ float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH);
+ cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist);
+ }
+
+ // softmax and sample from the transformed distribution
+ llama_sampler_softmax_impl(cur_p, false);
+ const int idx = llama_sample_dist(cur_p, ctx->rng);
+ cur_p->selected = idx;
+
+ // store the selected token ID for acceptance later
+ ctx->pending_token_id = cur_p->data[idx].id;
+ ctx->pending_token_idx = idx;
+}
+
+static void llama_sampler_adaptive_p_accept(struct llama_sampler * smpl, llama_token token) {
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
+ if (ctx->pending_token_id == token) {
+ GGML_ASSERT(ctx->pending_token_id != LLAMA_TOKEN_NULL);
+ GGML_ASSERT(ctx->pending_token_idx != -1);
+ // update EMA with the original probability of the selected token
+ ctx->weighted_sum = ctx->original_probs[ctx->pending_token_idx] + ctx->decay * ctx->weighted_sum;
+ ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight;
+ }
+ ctx->pending_token_id = LLAMA_TOKEN_NULL;
+ ctx->pending_token_idx = -1;
+}
+
+static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
+    // ctx->target and ctx->decay never change after init, so it's safe to keep them as is.
+    // original_probs is completely overwritten on every call to _apply, so we only need
+    // to reset the EMA state and the pending token.
+ ctx->weighted_sum = ctx->target / (1.0f - ctx->decay);
+ ctx->total_weight = 1.0f / (1.0f - ctx->decay);
+ ctx->pending_token_id = LLAMA_TOKEN_NULL;
+ ctx->pending_token_idx = -1;
+ ctx->seed_cur = get_rng_seed(ctx->seed);
+ ctx->rng.seed(ctx->seed_cur);
+}
+
+static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx;
+ auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed);
+ auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx;
+
+ // copy everything (target, decay, seed, and RNG are already set)
+ result_ctx->weighted_sum = ctx->weighted_sum;
+ result_ctx->total_weight = ctx->total_weight;
+ result_ctx->pending_token_id = ctx->pending_token_id;
+ result_ctx->pending_token_idx = ctx->pending_token_idx;
+
+ return result;
+}
+
+static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_adaptive_p *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_adaptive_p_i = {
+ /* .name = */ llama_sampler_adaptive_p_name,
+ /* .accept = */ llama_sampler_adaptive_p_accept,
+ /* .apply = */ llama_sampler_adaptive_p_apply,
+ /* .reset = */ llama_sampler_adaptive_p_reset,
+ /* .clone = */ llama_sampler_adaptive_p_clone,
+ /* .free = */ llama_sampler_adaptive_p_free,
+ /* .backend_init = */ nullptr,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ nullptr,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_adaptive_p(
+ float target,
+ float decay,
+ uint32_t seed
+) {
+ auto seed_cur = get_rng_seed(seed);
+ float clamped_decay = std::clamp(decay, 0.0f, 0.99f);
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_adaptive_p_i,
+ /* .ctx = */ new llama_sampler_adaptive_p {
+ /* .target = */ target,
+ /* .decay = */ clamped_decay,
+ /* .seed = */ seed,
+ /* .seed_cur = */ seed_cur,
+ /* .rng = */ std::mt19937(seed_cur),
+ /* .weighted_sum = */ target / (1.0f - clamped_decay),
+ /* .total_weight = */ 1.0f / (1.0f - clamped_decay),
+ /* .original_probs = */ {},
+ /* .pending_token_id = */ LLAMA_TOKEN_NULL,
+ /* .pending_token_idx = */ -1
+ }
+ );
+}
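+
+// e.g. llama_sampler_init_adaptive_p(0.35f, 0.95f, LLAMA_DEFAULT_SEED) targets tokens
+// around p = 0.35 and averages over roughly 1/(1 - 0.95) = 20 recent tokens
+// (illustrative values)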
+
+// logit-bias
+
+struct llama_sampler_logit_bias : public llama_sampler_backend {
+ const int32_t n_vocab;
+
+ const std::vector<llama_logit_bias> logit_bias;
+
+ std::vector<llama_logit_bias> to_search;
+
+ struct ggml_tensor * inp_logit_bias;
+ struct ggml_tensor * inp_logit_idxs;
+};
+
+static const char * llama_sampler_logit_bias_name(const struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_logit_bias *) smpl->ctx;
+ return ctx->get_name();
+}
+
+static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_logit_bias *) smpl->ctx;
+
+ if (ctx->logit_bias.empty()) {
+ return;
+ }
+
+ ctx->to_search.clear();
+
+ // update the candidates that have not been shuffled in the vocabulary (i.e. idx == id)
+ for (const auto & lb : ctx->logit_bias) {
+ if (lb.token >= 0 && cur_p->size > (size_t) lb.token && cur_p->data[lb.token].id == lb.token) {
+ cur_p->data[lb.token].logit += lb.bias;
+ } else {
+ ctx->to_search.push_back(lb);
+ }
+ }
+
+ if (ctx->to_search.empty()) {
+ return;
+ }
+
+ // search for the remaining candidates that were not found in the previous step
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ for (const auto & lb : ctx->to_search) {
+ if (cur_p->data[i].id == lb.token) {
+ cur_p->data[i].logit += lb.bias;
+ break;
+ }
+ }
+ }
+}
+
+static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx;
+ return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data());
+}
+
+static void llama_sampler_logit_bias_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_logit_bias *) smpl->ctx;
+}
+
+static void llama_sampler_logit_bias_backend_apply(
+ struct llama_sampler * smpl,
+ struct ggml_context * ctx,
+ struct ggml_cgraph * gf,
+ struct llama_sampler_data * data) {
+ GGML_UNUSED(gf);
+ GGML_UNUSED(ctx);
+
+ auto * sctx = (llama_sampler_logit_bias *) smpl->ctx;
+ if (sctx->logit_bias.empty()) {
+ return;
+ }
+
+ const size_t n = sctx->logit_bias.size();
+
+ sctx->inp_logit_bias = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n);
+ ggml_set_name(sctx->inp_logit_bias, "logit_bias");
+ ggml_set_input(sctx->inp_logit_bias);
+
+ sctx->inp_logit_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n);
+ ggml_set_name(sctx->inp_logit_idxs, "logit_idxs");
+ ggml_set_input(sctx->inp_logit_idxs);
+
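+    // scatter the biases into a zeroed clone of the logits: view it as n_vocab rows of
+    // width 1, write bias[i] into row idx[i] via set_rows, then add the result back onto
+    // the original logits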
+ ggml_tensor * cur = ggml_fill(ctx, data->logits, 0.0f);
+
+ cur = ggml_reshape_2d(ctx, cur, 1, ggml_nelements(cur));
+ cur = ggml_set_rows(ctx, cur, sctx->inp_logit_bias, sctx->inp_logit_idxs);
+ cur = ggml_reshape_1d(ctx, cur, ggml_nelements(cur));
+
+ data->logits = ggml_add(ctx, data->logits, cur);
+}
+
+static void llama_sampler_logit_bias_backend_set_input(struct llama_sampler * smpl) {
+ auto * sctx = (llama_sampler_logit_bias *) smpl->ctx;
+ if (sctx->logit_bias.empty()) {
+ return;
+ }
+
+ GGML_ASSERT(sctx->inp_logit_bias != nullptr);
+ GGML_ASSERT(sctx->inp_logit_idxs != nullptr);
+
+ const size_t n = sctx->logit_bias.size();
+
+ std::vector<float> data_logit_bias(n, 0.0f);
+ std::vector<int32_t> data_logit_idxs(n, 0);
+ for (size_t i = 0; i < n; ++i) {
+ const auto & lb = sctx->logit_bias[i];
+ GGML_ASSERT(lb.token >= 0 && lb.token < (int32_t) sctx->n_vocab);
+ data_logit_bias[i] = lb.bias;
+ data_logit_idxs[i] = lb.token;
+ }
+
+ ggml_backend_tensor_set(sctx->inp_logit_bias, data_logit_bias.data(), 0, ggml_nbytes(sctx->inp_logit_bias));
+ ggml_backend_tensor_set(sctx->inp_logit_idxs, data_logit_idxs.data(), 0, ggml_nbytes(sctx->inp_logit_idxs));
+}
+
+static bool llama_sampler_logit_bias_backend_init(
+ struct llama_sampler * smpl,
+ ggml_backend_buffer_type_t buft) {
+ GGML_UNUSED(buft);
+
+ auto * sctx = (llama_sampler_logit_bias *) smpl->ctx;
+
+    sctx->init(true);
+
+    // nothing else to prepare here - the input tensors are created per-graph in backend_apply
+    return true;
+}
+
+static struct llama_sampler_i llama_sampler_logit_bias_i = {
+ /* .name = */ llama_sampler_logit_bias_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sampler_logit_bias_apply,
+ /* .reset = */ nullptr,
+ /* .clone = */ llama_sampler_logit_bias_clone,
+ /* .free = */ llama_sampler_logit_bias_free,
+ /* .backend_init = */ llama_sampler_logit_bias_backend_init,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ llama_sampler_logit_bias_backend_apply,
+ /* .backend_set_input = */ llama_sampler_logit_bias_backend_set_input,
+};
+
+struct llama_sampler * llama_sampler_init_logit_bias(
+ int32_t n_vocab,
+ int32_t n_logit_bias,
+ const llama_logit_bias * logit_bias) {
+ const bool is_empty = n_logit_bias <= 0;
+
+ if (is_empty) {
+ return llama_sampler_init_empty("?logit-bias");
+ }
+
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_logit_bias_i,
+ /* .ctx = */ new llama_sampler_logit_bias {
+ ("logit-bias"),
+ /* .n_vocab = */ n_vocab,
+ /* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
+ /* .to_search = */ {},
+ /* .inp_logit_bias = */ nullptr,
+ /* .inp_logit_idxs = */ nullptr,
+ }
+ );
+}
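+
+// usage sketch (illustrative): suppress a single token id entirely
+//   llama_logit_bias bias[] = { { some_token_id, -INFINITY } };
+//   llama_sampler * smpl = llama_sampler_init_logit_bias(n_vocab, 1, bias);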
+
+// infill
+
+//#define GGML_DEBUG_SAMPLER_INFILL
+
+struct llama_sampler_infill {
+ const struct llama_vocab * vocab;
+
+ std::vector<char> buf0;
+ std::vector<char> buf1;
+};
+
+static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
+ return "infill";
+}
+
+static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_infill *) smpl->ctx;
+
+ llama_sampler_softmax_impl(cur_p, true);
+
+#if defined(GGML_DEBUG_SAMPLER_INFILL)
+#define LOG_DBG_CUR LLAMA_LOG_DEBUG
+#else
+#define LOG_DBG_CUR(...)
+#endif
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+ }
+
+ float p_txt_sum = 0.0f;
+ float p_eog_sum = 0.0f;
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ if (ctx->vocab->is_eog(cur_p->data[i].id)) {
+ p_eog_sum += cur_p->data[i].p;
+ } else {
+ p_txt_sum += cur_p->data[i].p;
+ }
+ }
+
+ const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);
+
+ LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
+
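+    // heuristic: prefer ending generation once the EOG mass exceeds one third of the
+    // mean per-candidate text mass (3 * p_eog_sum * n > p_txt_sum)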
+ if (3*p_eog_sum*cur_p->size > p_txt_sum) {
+ LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
+
+ // keep just the EOG tokens
+ const auto size_org = cur_p->size;
+
+ cur_p->size = 0;
+
+ float p_sum = 0.0f;
+
+ for (size_t i = 0; i < size_org; ++i) {
+ if (ctx->vocab->is_eog(cur_p->data[i].id)) {
+ p_sum += cur_p->data[i].p;
+
+ cur_p->data[cur_p->size++] = cur_p->data[i];
+ }
+ }
+
+ // normalize probs
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ cur_p->data[i].p /= p_sum;
+ }
+
+ return;
+ }
+
+ size_t n_combined = 0; GGML_UNUSED(n_combined);
+
+ // combine tokens with common prefix
+ for (size_t i0 = 0; i0 < cur_p->size; ++i0) {
+ for (size_t i1 = 0; i1 < cur_p->size; ++i1) {
+ if (cur_p->data[i0].logit == -INFINITY) {
+ break;
+ }
+
+ if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) {
+ continue;
+ }
+
+            int len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+            if (len0 < 0) {
+                // a negative result is -(required buffer size) - grow the buffer and retry
+                ctx->buf0.resize(-len0);
+                len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+                assert(len0 > 0);
+            }
+
+            int len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+            if (len1 < 0) {
+                // a negative result is -(required buffer size) - grow the buffer and retry
+                ctx->buf1.resize(-len1);
+                len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+                assert(len1 > 0);
+            }
+
+ // token i0 is a prefix of token i1
+ if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) {
+ int dst = i0;
+ int src = i1;
+
+ // merge into the token with higher probability
+ if (cur_p->data[i1].p > cur_p->data[i0].p) {
+ std::swap(dst, src);
+ }
+
+ cur_p->data[dst].p += cur_p->data[src].p;
+ cur_p->data[src].logit = -INFINITY;
+ cur_p->data[src].p = 0.0f;
+
+ n_combined++;
+ }
+ }
+ }
+
+ size_t n_non_eog = 0;
+
+ size_t size_org = cur_p->size;
+
+ float p_sum = 0.0f;
+ float thold = 0.2f;
+
+ cur_p->size = 0;
+
+ LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
+
+ for (size_t i = 0; i < size_org; ++i) {
+ const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
+
+ if (cur_p->data[i].p < thold && !is_eog) {
+ continue;
+ }
+
+ if (!is_eog) {
+ ++n_non_eog;
+ }
+
+ p_sum += cur_p->data[i].p;
+
+ // keep this token
+ cur_p->data[cur_p->size++] = cur_p->data[i];
+ }
+
+ LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog);
+
+ // if no non-EOG tokens are left -> reduce cur_p to single EOT token
+ if (n_non_eog == 0) {
+ cur_p->size = 1;
+ cur_p->data[0].id = ctx->vocab->token_eot();
+ if (cur_p->data[0].id == LLAMA_TOKEN_NULL) {
+ cur_p->data[0].id = ctx->vocab->token_eos();
+ }
+ cur_p->data[0].logit = 1.0f;
+
+ GGML_ASSERT(cur_p->data[0].id != LLAMA_TOKEN_NULL);
+
+ return;
+ }
+
+ // normalize probs
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ cur_p->data[i].p /= p_sum;
+
+ LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+ }
+
+ size_org = cur_p->size;
+ p_sum = 0.0f;
+ thold = 1.0/(n_non_eog + 1);
+
+ cur_p->size = 0;
+
+ LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
+
+ for (size_t i = 0; i < size_org; ++i) {
+ const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
+
+ if (cur_p->data[i].p < thold && !is_eog) {
+ continue;
+ }
+
+ p_sum += cur_p->data[i].p;
+
+ cur_p->data[cur_p->size++] = cur_p->data[i];
+ }
+
+ // normalize probs
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ cur_p->data[i].p /= p_sum;
+
+ LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+ }
+
+#undef LOG_DBG_CUR
+}
+
+static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
+ return llama_sampler_init_infill(ctx->vocab);
+}
+
+static void llama_sampler_infill_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_infill *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_infill_i = {
+ /* .name = */ llama_sampler_infill_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sampler_infill_apply,
+ /* .reset = */ nullptr,
+ /* .clone = */ llama_sampler_infill_clone,
+ /* .free = */ llama_sampler_infill_free,
+    /* .backend_init      = */ nullptr,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ nullptr,
+    /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_infill_i,
+ /* .ctx = */ new llama_sampler_infill {
+ /* .vocab = */ vocab,
+ /* .buf0 = */ std::vector<char>(512),
+ /* .buf1 = */ std::vector<char>(512),
+ }
+ );
+}
+
+// utils
+
+uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
+ if (smpl->iface == &llama_sampler_dist_i) {
+ return ((const llama_sampler_dist *) smpl->ctx)->seed_cur;
+ }
+
+ if (smpl->iface == &llama_sampler_mirostat_i) {
+ return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur;
+ }
+
+ if (smpl->iface == &llama_sampler_mirostat_v2_i) {
+ return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur;
+ }
+
+ if (smpl->iface == &llama_sampler_chain_i) {
+ const auto * ctx = (const llama_sampler_chain *) smpl->ctx;
+ for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) {
+ const uint32_t seed = llama_sampler_get_seed(it->ptr);
+ if (seed != LLAMA_DEFAULT_SEED) {
+ return seed;
+ }
+ }
+ }
+
+ return LLAMA_DEFAULT_SEED;
+}
+
+// perf
+
+struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
+ struct llama_perf_sampler_data data = {};
+
+ if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
+ GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
+ }
+
+ const auto * ctx = (const struct llama_sampler_chain *) chain->ctx;
+
+ data.t_sample_ms = 1e-3 * ctx->t_sample_us;
+ data.n_sample = std::max(0, ctx->n_sample);
+
+ return data;
+}
+
+void llama_perf_sampler_print(const struct llama_sampler * chain) {
+ const auto data = llama_perf_sampler(chain);
+
+ LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
+}
+
+void llama_perf_sampler_reset(struct llama_sampler * chain) {
+ if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
+ GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
+ }
+
+ auto * ctx = (struct llama_sampler_chain *) chain->ctx;
+
+ ctx->t_sample_us = 0;
+ ctx->n_sample = 0;
+}
diff --git a/llama.cpp/src/llama-sampler.h b/llama.cpp/src/llama-sampler.h
new file mode 100644
index 0000000..b9bfc20
--- /dev/null
+++ b/llama.cpp/src/llama-sampler.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "llama.h"
+
+#include <vector>
+
+struct llama_vocab;
+struct llama_grammar;
+
+// sampler chain
+
+struct llama_sampler_chain {
+ llama_sampler_chain_params params;
+
+ // has .backend_init() been called?
+ bool is_init = false;
+
+ struct info {
+ bool is_backend;
+
+ llama_sampler * ptr;
+ };
+
+ std::vector<info> samplers;
+
+ // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
+ std::vector<llama_token_data> cur;
+
+ // timing
+
+ mutable int64_t t_sample_us;
+
+ mutable int32_t n_sample;
+};
+
+struct llama_sampler * llama_sampler_init_dry_testing(
+ int32_t context_size,
+ float dry_multiplier,
+ float dry_base,
+ int32_t dry_allowed_length,
+ int32_t dry_penalty_last_n,
+ const std::vector<std::vector<llama_token>> & seq_breakers);
diff --git a/llama.cpp/src/llama-vocab.cpp b/llama.cpp/src/llama-vocab.cpp
new file mode 100644
index 0000000..62e137f
--- /dev/null
+++ b/llama.cpp/src/llama-vocab.cpp
@@ -0,0 +1,3938 @@
+#include "llama-vocab.h"
+
+#include "ggml.h"
+#include "gguf.h"
+#include "llama-impl.h"
+#include "llama-model-loader.h"
+
+#include "unicode.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cfloat>
+#include <cmath>
+#include <cstdarg>
+#include <cstring>
+#include <forward_list>
+#include <limits>
+#include <map>
+#include <queue>
+#include <set>
+#include <unordered_map>
+
+//
+// helpers
+//
+
+struct naive_trie {
+ naive_trie() : has_value(false), value(0) {
+ }
+ void insert(const char * key, size_t len, int32_t value = 0) {
+ if (len == 0) {
+ this->has_value = true;
+ this->value = value;
+ return;
+ }
+ char c = key[0];
+ auto res = children.find(c);
+ if (res != children.end()) {
+ res->second.insert(key + 1, len - 1, value);
+ } else {
+ auto res = children.insert(std::make_pair(c, naive_trie()));
+ res.first->second.insert(key + 1, len - 1, value);
+ }
+ }
+ std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
+ if (len == 0 || offset == len) {
+ return std::make_pair(key, offset);
+ }
+ char c = key[offset];
+ auto res = children.find(c);
+ if (res != children.end()) {
+ return res->second.get_longest_prefix(key, len, offset + 1);
+ }
+
+ return std::make_pair(key, offset);
+ }
+ const struct naive_trie * traverse(const char c) const {
+ auto res = children.find(c);
+ if (res != children.end()) {
+ return &res->second;
+ }
+
+ return NULL;
+ }
+ std::map<char, struct naive_trie> children;
+ bool has_value;
+ llama_token value;
+};
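+
+// e.g. after insert("ab", 2) and insert("abc", 3), get_longest_prefix("abcd", 4) walks
+// a -> b -> c and returns ("abcd", 3) - the length of the longest stored prefix path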
+
+//
+// tokenizers
+//
+
+struct llm_tokenizer {
+ llm_tokenizer() {}
+ virtual ~llm_tokenizer() = default;
+};
+
+struct llm_symbol {
+ using index = int;
+ index prev;
+ index next;
+ const char * text;
+ size_t n;
+};
+
+static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
+
+//
+// SPM tokenizer
+// original implementation:
+// https://github.com/ggml-org/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
+//
+
+struct llm_bigram_spm {
+ struct comparator {
+        bool operator()(const llm_bigram_spm & l, const llm_bigram_spm & r) const {
+ return (l.score < r.score) || (l.score == r.score && l.left > r.left);
+ }
+ };
+ using queue_storage = std::vector<llm_bigram_spm>;
+ using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
+ llm_symbol::index left;
+ llm_symbol::index right;
+ float score;
+ size_t size;
+};
+
+struct llm_tokenizer_spm : llm_tokenizer {
+ llm_tokenizer_spm(const llama_vocab & /*vocab*/) {}
+};
+
+struct llm_tokenizer_spm_session {
+ llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
+
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
+ // split string into utf8 chars
+ int index = 0;
+ size_t offs = 0;
+ while (offs < text.size()) {
+ llm_symbol sym;
+ size_t len = unicode_len_utf8(text[offs]);
+ sym.text = text.c_str() + offs;
+ sym.n = std::min(len, text.size() - offs);
+ offs += sym.n;
+ sym.prev = index - 1;
+ sym.next = offs == text.size() ? -1 : index + 1;
+ index++;
+ symbols.emplace_back(sym);
+ }
+
+ // seed the work queue with all possible 2-character tokens.
+ for (int i = 1; i < (int) symbols.size(); ++i) {
+ try_add_bigram(i - 1, i);
+ }
+
+ // keep substituting the highest frequency pairs for as long as we can.
+ while (!work_queue.empty()) {
+ auto bigram = work_queue.top();
+ work_queue.pop();
+
+ auto & left_sym = symbols[bigram.left];
+ auto & right_sym = symbols[bigram.right];
+
+ // if one of the symbols already got merged, skip it.
+ if (left_sym.n == 0 || right_sym.n == 0 ||
+ left_sym.n + right_sym.n != bigram.size) {
+ continue;
+ }
+
+ // merge the right sym into the left one
+ left_sym.n += right_sym.n;
+ right_sym.n = 0;
+
+ //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
+
+ // remove the right sym from the chain
+ left_sym.next = right_sym.next;
+ if (right_sym.next >= 0) {
+ symbols[right_sym.next].prev = bigram.left;
+ }
+
+ // find more substitutions
+ try_add_bigram(left_sym.prev, bigram.left);
+ try_add_bigram(bigram.left, left_sym.next);
+ }
+
+ for (int i = 0; i != -1; i = symbols[i].next) {
+ auto & symbol = symbols[i];
+ resegment(symbol, output);
+ }
+ }
+
+private:
+ void resegment(llm_symbol & symbol, std::vector<llama_token> & output) {
+ auto text = std::string(symbol.text, symbol.n);
+ auto token = vocab.text_to_token(text);
+
+ // Do we need to support is_unused?
+ if (token != LLAMA_TOKEN_NULL) {
+ output.push_back(token);
+ return;
+ }
+
+ const auto p = rev_merge.find(text);
+
+ if (p == rev_merge.end()) {
+ // output any symbols that did not form tokens as bytes.
+ output.reserve(output.size() + symbol.n);
+ for (int j = 0; j < (int)symbol.n; ++j) {
+ llama_token id = vocab.byte_to_token(symbol.text[j]);
+ output.push_back(id);
+ }
+ return;
+ }
+
+ resegment(symbols[p->second.first], output);
+ resegment(symbols[p->second.second], output);
+ }
+
+ void try_add_bigram(int left, int right) {
+ if (left == -1 || right == -1) {
+ return;
+ }
+ const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
+ auto token = vocab.text_to_token(text);
+
+ if (token == LLAMA_TOKEN_NULL) {
+ return;
+ }
+
+ if (static_cast<uint32_t>(token) >= vocab.n_tokens()) {
+ return;
+ }
+
+ const auto & tok_data = vocab.get_token_data(token);
+
+ llm_bigram_spm bigram;
+ bigram.left = left;
+ bigram.right = right;
+ bigram.score = tok_data.score;
+ bigram.size = text.size();
+
+ work_queue.push(bigram);
+
+ // Do we need to support is_unused?
+ rev_merge[text] = std::make_pair(left, right);
+ }
+
+ const llama_vocab & vocab;
+ // currently unused
+ // const llm_tokenizer_spm * spm_tokenizer;
+
+ std::vector<llm_symbol> symbols;
+ llm_bigram_spm::queue work_queue;
+ std::map<std::string, std::pair<int, int>> rev_merge;
+};
+
+//
+// BPE tokenizer
+// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
+// tried to simplify unicode stuff, so most likely does not work 100% correctly!
+//
+
+// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
+
+template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
+class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
+public:
+ using std::priority_queue<T, Container, Compare>::priority_queue;
+
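+    // move the top element out before re-heapifying; the deleted pop() below forces
+    // callers through this path instead of the top()+pop() copy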
+ T pop_move() {
+ T item = std::move(this->c.front());
+ std::pop_heap(this->c.begin(), this->c.end(), this->comp);
+ this->c.pop_back();
+ return item;
+ }
+
+ void pop() = delete;
+};
+
+struct llm_bigram_bpe {
+ struct comparator {
+ bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
+ return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
+ }
+ };
+
+ using queue_storage = std::vector<llm_bigram_bpe>;
+ using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
+ llm_symbol::index left;
+ llm_symbol::index right;
+ std::string text;
+ int rank;
+ size_t size;
+};
+
+struct llm_tokenizer_bpe : llm_tokenizer {
+ llm_tokenizer_bpe(const llama_vocab & vocab) {
+ GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE);
+ switch (vocab.get_pre_type()) {
+ case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+
+ // adapted: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2080233989
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
+ case LLAMA_VOCAB_PRE_TYPE_SMAUG:
+ regex_exprs = {
+ // same as llama3
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+ regex_exprs = {
+ "[\r\n]",
+ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+ "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
+ "\\s+$",
+ "[一-龥ࠀ-一가-퟿]+",
+ "\\p{N}+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+ case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
+ regex_exprs = {
+ "\\p{N}{1,3}",
+ "[一-龥぀-ゟ゠-ヿ]+",
+ "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_YOUTU:
+ regex_exprs = {
+ "[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-﹏]+|[ㄅ-ㄯ]+|[一-龥぀-ゟ゠-ヿ]+",
+ "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+ regex_exprs = {
+ "[\r\n]",
+ "\\s?\\p{L}+",
+ "\\s?\\p{P}+",
+ "[一-龥ࠀ-一가-퟿]+",
+ "\\p{N}",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_FALCON:
+ regex_exprs = {
+ "[\\p{P}\\$\\+<=>\\^~\\|`]+",
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ "[0-9][0-9][0-9]",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+ case LLAMA_VOCAB_PRE_TYPE_REFACT:
+ case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+ case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
+ case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
+ case LLAMA_VOCAB_PRE_TYPE_EXAONE:
+ case LLAMA_VOCAB_PRE_TYPE_MINERVA:
+ regex_exprs = {
+ "\\p{N}",
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_GPT2:
+ case LLAMA_VOCAB_PRE_TYPE_MPT:
+ case LLAMA_VOCAB_PRE_TYPE_OLMO:
+ case LLAMA_VOCAB_PRE_TYPE_JAIS:
+ case LLAMA_VOCAB_PRE_TYPE_TRILLION:
+ case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
+ regex_exprs = {
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
+ case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+ case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
+ case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_QWEN35:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_PORO:
+ case LLAMA_VOCAB_PRE_TYPE_BLOOM:
+ case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
+ regex_exprs = {
+ " ?[^(\\s|.,!?…。,、।۔،)]+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
+ regex_exprs = {
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_VIKING:
+ regex_exprs = {
+ " ?[^(\\s|.,!?…。,、।۔،)]+",
+ "\\p{N}",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
+ // original regex from tokenizer.json
+ // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+ regex_exprs = {
+ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
+ // Note: in theory, the special token (sentinel and image token) regex_exprs below
+ // are unnecessary, as they are split in `tokenizer_st_partition` anyway.
+ // However, since the upstream pre-tokenizer uses them, they are also
+ // included here (see https://huggingface.co/facebook/chameleon-7b).
+ regex_exprs = {
+ "<sentinel:[0-9]+>", // Sentinel tokens
+ "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens
+ "([\\t\\n]| | )", // directly from tokenizer.json
+ "\\p{N}", // Individual digits
+ "[\\p{P}!-/:-@\\[-`{-~]", // Punctuation, Isolated
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_GPT4O:
+ case LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
+ regex_exprs = {
+ // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
+ // The custom handler implements all K2 patterns with proper Han character exclusion
+ "\\p{Han}+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
+ regex_exprs = {
+ "\\p{N}+",
+ "(?=(\\d{3})+(?!\\d))",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
+ // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
+ "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_GROK_2:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_AFMOE:
+ regex_exprs = {
+ // Digit handling - uses custom implementation in unicode.cpp
+ // Groups digits with leading 1-2 based on total length modulo 3
+ "\\p{AFMoE_digits}",
+ // CJK and Asian scripts (using direct Unicode literals)
+ "[一-鿿㐀-䶿豈-﫿぀-ゟ゠-ヿ・-゚⼀-⿟เ-๿຀-໿ក-៿က-႟ꩠ-ꩿꧠ-꧿가-힯ᄀ-ᇿ]+",
+ // Main BPE pattern
+ "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ default:
+ // default regex for BPE tokenization pre-processing
+ regex_exprs = {
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ "\\p{N}+",
+ "[0-9][0-9][0-9]",
+ };
+ break;
+ }
+ }
+
+ std::vector<std::string> regex_exprs;
+};
+
+struct llm_tokenizer_bpe_session {
+ llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
+
+ static void append(const llama_token token_id, std::vector<llama_token> & output) {
+ output.push_back(token_id);
+ }
+
+ bool append_bos(std::vector<llama_token> & output) const {
+ if (vocab.get_add_bos()) {
+ GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL);
+ output.push_back(vocab.token_bos());
+ return true;
+ }
+ return false;
+ }
+
+ bool append_eos(std::vector<llama_token> & output) const {
+ if (vocab.get_add_eos()) {
+ GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL);
+ output.push_back(vocab.token_eos());
+ return true;
+ }
+ return false;
+ }
+
+ void check_double_bos_eos(const std::vector<llama_token> & output) const {
+ if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) {
+ LLAMA_LOG_WARN(
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+ if (vocab.get_add_eos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) {
+ LLAMA_LOG_WARN(
+ "%s: Added a EOS token to the prompt as specified by the model but the prompt "
+ "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+ }
+
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
+ int final_prev_index = -1;
+ const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
+
+ symbols_final.clear();
+
+ for (const auto & word : word_collection) {
+ work_queue = llm_bigram_bpe::queue();
+ symbols.clear();
+
+ int index = 0;
+ size_t offset = 0;
+
+ //if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+ if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
+ symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+ offset = word.size();
+ }
+
+ while (offset < word.size()) {
+ llm_symbol sym;
+ size_t char_len = std::min(word.size() - offset, (size_t) unicode_len_utf8(word[offset]));
+ sym.text = word.c_str() + offset;
+ sym.n = char_len;
+ offset += sym.n;
+ sym.prev = index - 1;
+ sym.next = offset == word.size() ? -1 : index + 1;
+ index++;
+ symbols.emplace_back(sym);
+ }
+ for (int i = 1; i < (int) symbols.size(); ++i) {
+ add_new_bigram(i - 1, i);
+ }
+
+ // build token(s)
+ while (!work_queue.empty()) {
+ auto bigram = work_queue.pop_move();
+
+ auto & left_symbol = symbols[bigram.left];
+ auto & right_symbol = symbols[bigram.right];
+
+ if (left_symbol.n == 0 || right_symbol.n == 0) {
+ continue;
+ }
+ std::string left_token = std::string(left_symbol.text, left_symbol.n);
+ std::string right_token = std::string(right_symbol.text, right_symbol.n);
+ if (left_token + right_token != bigram.text) {
+ continue; // Skip this bigram if it's outdated
+ }
+
+ // merge the right sym into the left one
+ left_symbol.n += right_symbol.n;
+ right_symbol.n = 0;
+
+ // remove the right sym from the chain
+ left_symbol.next = right_symbol.next;
+ if (right_symbol.next >= 0) {
+ symbols[right_symbol.next].prev = bigram.left;
+ }
+
+ add_new_bigram(left_symbol.prev, bigram.left); // left side of current symbol
+ add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
+ }
+
+ // add the finished tokens to the final list keeping correct order for next and prev
+ for (auto & sym : symbols) {
+ if (sym.n > 0) {
+ sym.prev = final_prev_index;
+ sym.next = -1;
+ if (final_prev_index != -1) {
+ symbols_final[final_prev_index].next = symbols_final.size();
+ }
+ symbols_final.emplace_back(sym);
+ final_prev_index = symbols_final.size() - 1;
+ }
+ }
+ }
+
+ symbols = symbols_final;
+
+ if (!symbols.empty()) {
+ for (int i = 0; i != -1; i = symbols[i].next) {
+ auto & symbol = symbols[i];
+ if (symbol.n == 0) {
+ continue;
+ }
+
+ const std::string str = std::string(symbol.text, symbol.n);
+ const auto token = vocab.text_to_token(str);
+
+ if (token == LLAMA_TOKEN_NULL) {
+ for (auto j = str.begin(); j != str.end(); ++j) {
+ std::string byte_str(1, *j);
+ auto token_multibyte = vocab.text_to_token(byte_str);
+ if (token_multibyte != LLAMA_TOKEN_NULL) {
+ output.push_back(token_multibyte);
+ }
+ }
+ } else {
+ output.push_back(token);
+ }
+ }
+ }
+ }
+
+private:
+ void add_new_bigram(int left, int right) {
+ if (left == -1 || right == -1) {
+ return;
+ }
+ std::string left_token = std::string(symbols[left].text, symbols[left].n);
+ std::string right_token = std::string(symbols[right].text, symbols[right].n);
+
+ int rank_found = -1;
+
+ rank_found = vocab.find_bpe_rank(left_token, right_token);
+
+ if (rank_found < 0) {
+ return;
+ }
+
+ llm_bigram_bpe bigram;
+
+ bigram.left = left;
+ bigram.right = right;
+ bigram.text = left_token + right_token;
+ bigram.size = left_token.size() + right_token.size();
+ bigram.rank = rank_found;
+
+ work_queue.push(bigram);
+ }
+
+ const llama_vocab & vocab;
+ const llm_tokenizer_bpe & tokenizer;
+
+ std::vector<llm_symbol> symbols;
+ std::vector<llm_symbol> symbols_final;
+ llm_bigram_bpe::queue work_queue;
+};
+
+//
+// WPM tokenizer
+//
+
+struct llm_tokenizer_wpm : llm_tokenizer {
+ llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {}
+};
+
+struct llm_tokenizer_wpm_session {
+ llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
+
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
+ // normalize and split by whitespace
+ std::vector<std::string> words = preprocess(text);
+ // bos token prepended already
+
+ // find the longest tokens that form the words
+ for (const std::string & word : words) {
+ // skip empty words
+ if (word.size() == 0) {
+ continue;
+ }
+
+ // prepend phantom space
+ const std::string word1 = "\xe2\x96\x81" + word;
+ const int n = word1.size();
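+
+            // illustrative: "▁unaffable" might split into "▁un", "aff", "able" - at each
+            // position the longest piece present in the vocabulary wins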
+
+ const size_t current_tokens = output.size();
+
+ // we're at the start of a new word
+ // move through character position in word
+ for (int i = 0; i < n; ++i) {
+ // loop through possible match length
+ bool match = false;
+ for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
+ auto id = vocab.text_to_token(word1.substr(i, j - i));
+ if (id != LLAMA_TOKEN_NULL) {
+ output.push_back(id);
+ match = true;
+ i = j - 1;
+ break;
+ }
+ }
+
+ if (!match) { // discard all
+ output.resize(current_tokens);
+ break; // and discard next tokens
+ }
+ }
+
+ // we didn't find any matches for this word
+ if (current_tokens == output.size()) {
+ output.push_back(vocab.token_unk());
+ }
+ }
+ }
+
+ // TODO: reduce string copies by using cpts_offs array
+ static std::vector<std::string> preprocess(const std::string & text) {
+ const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
+ std::vector<std::string> words(1, "");
+
+ for (const uint32_t cpt : cpts_nfd) {
+ const auto flags = unicode_cpt_flags_from_cpt(cpt);
+
+ if (flags.is_whitespace) {
+ if (words.back().size()) { // finish previous word if any
+ words.emplace_back();
+ }
+ continue;
+ }
+
+ assert (!flags.is_separator);
+ if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
+ continue;
+ }
+
+ const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
+ if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
+ if (words.back().size()) { // finish previous word if any
+ words.emplace_back();
+ }
+ words.back() = s; // single char word
+ words.emplace_back(); // start a new word
+ } else {
+ words.back() += s; // append char to word
+ }
+ }
+
+ if (!words.back().size()) {
+ words.pop_back();
+ }
+
+ return words;
+ }
+
+ static bool is_chinese_char(uint32_t cpt) {
+ return
+ (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
+ (cpt >= 0x03400 && cpt <= 0x04DBF) ||
+ (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
+ (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
+ (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
+ (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
+ (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
+ (cpt >= 0x2F800 && cpt <= 0x2FA1F);
+ //(cpt >= 0x3000 && cpt <= 0x303F) ||
+ //(cpt >= 0xFF00 && cpt <= 0xFFEF);
+ }
+
+private:
+ const llama_vocab & vocab;
+ // currently unused
+ // const llm_tokenizer_wpm * wpm_tokenizer;
+};
+
+//
+// UGM tokenizer
+//
+
+struct llm_tokenizer_ugm : llm_tokenizer {
+ llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
+ if (precompiled_charsmap.size() > 0) {
+ size_t charsmap_offset = 0;
+
+ // First four bytes of precompiled_charsmap contains length of binary
+ // blob containing XOR-compressed compact double array (XCDA) entries
+ uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
+ charsmap_offset += sizeof(xcda_blob_size);
+ if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
+ throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
+ }
+
+ // Next xcda_blob_size bytes contain entries of XOR-compressed compact
+ // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
+ xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
+ xcda_array_size = xcda_blob_size / sizeof(uint32_t);
+ charsmap_offset += xcda_blob_size;
+
+ // Remaining bytes of precompiled charsmap contain null-terminated
+ // replacement strings for prefixes matched by the XCDA.
+ prefix_replacements = &precompiled_charsmap[charsmap_offset];
+ prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
+ }
+
+ for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
+ const auto & token_data = vocab.get_token_data(id);
+
+ if (vocab.is_normal(id)) {
+ min_score = std::min<float>(min_score, token_data.score);
+ max_score = std::max<float>(max_score, token_data.score);
+ }
+
+ if (vocab.is_normal(id) ||
+ vocab.is_user_defined(id) ||
+ vocab.is_unused(id)) {
+ token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
+ }
+
+ if (vocab.is_user_defined(id)) {
+ user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
+ }
+ }
+
+ unknown_token_score = min_score - unknown_token_score_penalty;
+ }
+
+ // escaped space symbol - U+2581 (Lower One Eighth Block)
+ const std::string escaped_space = "\xE2\x96\x81";
+
+ const char * prefix_replacements = NULL;
+ size_t prefix_replacements_size = 0;
+
+ const uint32_t * xcda_array = NULL;
+ size_t xcda_array_size = 0;
+
+ struct naive_trie user_defined_token_matcher;
+
+ float min_score = FLT_MAX;
+ float max_score = -FLT_MAX;
+
+ float unknown_token_score_penalty = 10.0;
+ float unknown_token_score;
+
+ struct naive_trie token_matcher;
+};
+
+struct llm_tokenizer_ugm_session {
+ llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
+
+ /* This implementation is based on SentencePiece optimized Viterbi algorithm for
+ * unigram language models. The general idea is to:
+ * - move along the input sequence in steps of one UTF code point,
+ * - at each step find all possible tokenizations of the prefix by
+ * traversing the tokens trie,
+ * - for each tokenization store the best one so far (by higher score)
+ * - use the position in sequence after given token as an index to store
+ * results
+ * - if there was no valid tokenization of the current UTF code point
+ * then use unknown token with additional score penalty
+ * After processing the whole sequence we backtrack from the end to get
+ * the best tokenization.
+ */
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
+ // get current size of output (for reversal later)
+ size_t output_size = output.size();
+
+ // normalize the input first
+ std::string normalized;
+ normalize(text, &normalized);
+ size_t input_len = normalized.size();
+ if (input_len == 0) {
+ return;
+ }
+
+        // initialize score_sum to -DBL_MAX so it is always lower than any sum of token scores
+ std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
+ // at the beginning tokenization score is zero
+ tokenization_results[0] = { vocab.token_unk(), 0, 0 };
+
+ for (size_t input_offset = 0; input_offset < input_len;) {
+ size_t prefix_offset = input_offset;
+ // calculate how many code units are in the currently processed UTF code point
+ size_t n_utf8_code_units = std::min<size_t>(unicode_len_utf8(normalized[input_offset]), input_len - input_offset);
+
+ // traverse the token matcher trie to find a matching token
+ bool single_codepoint_token_found = false;
+ const struct best_tokenization & current_best = tokenization_results[input_offset];
+ const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);
+
+ while (prefix_offset <= input_len && node != NULL) {
+ // check if we found valid token in prefix
+ if (node->has_value) {
+ // check if it corresponds to the whole UTF code point
+ if (prefix_offset - input_offset == n_utf8_code_units) {
+ single_codepoint_token_found = true;
+ }
+ llama_token token_id = node->value;
+ const auto & token_data = vocab.get_token_data(token_id);
+
+ // we set the user-defined token scores to 0 to make them more likely to be selected
+ // (normal token scores are log probabilities, so they are negative)
+ // score type is double here to make tokenization results exactly
+ // the same as in the HF tokenizer using SentencePiece
+ const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
+ const double challenger_score = current_best.score_sum + token_score;
+ struct best_tokenization & current_champ = tokenization_results[prefix_offset];
+ if (challenger_score > current_champ.score_sum) {
+ struct best_tokenization challenger = { token_id, input_offset, challenger_score };
+ current_champ = challenger;
+ }
+ }
+ node = node->traverse(normalized[prefix_offset++]);
+ }
+
+ // if we didn't find a valid token corresponding to the whole UTF code point
+ // then use unknown token as the tokenization of this UTF code point
+ if (!single_codepoint_token_found) {
+ const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score;
+ prefix_offset = input_offset + n_utf8_code_units;
+ struct best_tokenization & current_champ = tokenization_results[prefix_offset];
+ if (challenger_score > current_champ.score_sum) {
+ struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
+ current_champ = challenger;
+ }
+ }
+
+ // move to the next UTF code point
+ input_offset += n_utf8_code_units;
+ }
+
+ // now backtrack from the end to gather token ids of the best tokenization
+ // merge sequences of consecutive unknown tokens into single unknown tokens
+ bool is_prev_unknown = false;
+ for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
+ bool is_unknown = tokenization.token_id == vocab.token_unk();
+ if (!(is_prev_unknown && is_unknown)) {
+ output.push_back(tokenization.token_id);
+ }
+ if (tokenization.input_offset == 0) {
+ break;
+ }
+ is_prev_unknown = is_unknown;
+ }
+
+ // reverse the output since we added tokens starting from the end of the input
+ std::reverse(output.begin() + output_size, output.end());
+ }
+
+private:
+
+ // helper structure for returning normalization results
+ struct normalization_result {
+ const char * normalized;
+ size_t normalized_len;
+ size_t consumed_input;
+ };
+
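+    // Applies SentencePiece-style normalization: the input is rewritten prefix by
+    // prefix via normalize_prefix() (user-defined token passthrough + precompiled
+    // charsmap), then whitespace is handled according to the vocab flags: optional
+    // escaping of spaces (U+2581 in standard SentencePiece vocabs), optional space
+    // prefix/suffix, and optional merging of consecutive spaces.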
+    void normalize(const std::string & input, std::string * normalized) {
+ normalized->clear();
+ normalized->reserve(input.size() * 3);
+
+ const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";
+
+ const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
+ const bool shall_append_space = vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
+ const bool shall_merge_spaces = vocab.get_remove_extra_whitespaces();
+
+ bool is_space_prepended = false;
+ bool processing_non_ws = false;
+
+ size_t input_len = input.size();
+
+ for (size_t input_offset = 0; input_offset < input_len; ) {
+ auto norm_res = normalize_prefix(input, input_offset);
+ for (size_t i = 0; i < norm_res.normalized_len; i++) {
+ char c = norm_res.normalized[i];
+ if (c != ' ') {
+ if (!processing_non_ws) {
+ processing_non_ws = true;
+ if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
+ normalized->append(space);
+ is_space_prepended = true;
+ }
+ }
+ normalized->push_back(c);
+ } else {
+ if (processing_non_ws) {
+ processing_non_ws = false;
+ }
+ if (!shall_merge_spaces) {
+ normalized->append(space);
+ }
+ }
+ }
+
+ input_offset += norm_res.consumed_input;
+ }
+
+ if (shall_append_space) {
+ normalized->append(space);
+ }
+ }
+
+ /*
+ * This structure is a view wrapper for XOR-compressed double array (XCDA)
+ * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
+     * Each bit-packed entry contains:
+     * - BASE array value in bits 10-30 (shifted left by 8 when the scale flag in bit 9 is set)
+     * - LEAF flag in bit 8
+     * - LCHECK array value in bits 0-7
+     * Entries containing indexes of replacement sequences have bit 31 set
+ */
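+    // Traversal follows the usual double-array scheme: from state s with input byte c,
+    // the candidate child index is BASE[s] ^ c, and the move is valid only when
+    // LCHECK at that index equals c (see the walk in normalize_prefix() below).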
+ struct xcda_array_view {
+ public:
+ xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
+ }
+ uint32_t get_base(size_t index) {
+ uint32_t packed_node = get_node(index);
+ return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
+ }
+ uint32_t get_lcheck(size_t index) {
+ uint32_t packed_node = get_node(index);
+ return packed_node & ((1U << 31) | 0xff);
+ }
+ bool get_leaf(size_t index) {
+ uint32_t packed_node = get_node(index);
+ return (packed_node >> 8) & 1;
+ }
+ uint32_t get_value(size_t index) {
+ uint32_t packed_node = get_node(index);
+ return packed_node & ((1U << 31) - 1);
+ }
+ private:
+ uint32_t get_node(size_t index) {
+ if (index >= xcda_array_size) {
+ throw std::runtime_error("Index out of array bounds in XCDA array!");
+ }
+ return xcda_array[index];
+ }
+ const uint32_t * xcda_array;
+ size_t xcda_array_size;
+ };
+
+ // this structure stores the best tokenization so far at input_offset
+ struct best_tokenization {
+ llama_token token_id;
+ size_t input_offset;
+ double score_sum;
+ };
+
+ struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
+ if (input_offset == input.size()) {
+ return { &input[input_offset], 0, 0 };
+ }
+
+ // if input prefix matches some user-defined token return this token as normalization result
+ auto user_defined_token_match =
+ tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
+ if (user_defined_token_match.second > 0) {
+ return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
+ }
+
+ size_t longest_prefix_length = 0;
+ size_t longest_prefix_offset = 0;
+
+ if (tokenizer.xcda_array_size > 0) {
+ struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size);
+
+ // Find the longest normalized sequence matching the input prefix by walking
+ // the XOR-compressed compact double array (XCDA) starting from the root node
+ // We find the index of the next node by calculating BASE[s] ^ c where s is
+ // the index of the previous node and c is a numerical character value
+ uint32_t node_index = 0;
+ // get BASE of the root node
+ node_index = xcda_view.get_base(node_index);
+ for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
+ unsigned char c = input[prefix_offset];
+ if (c == 0) {
+ break;
+ }
+ node_index ^= c;
+ // if value of LCHECK is not c it means that this is not a child of
+ // the previous node, so we stop matching
+ if (xcda_view.get_lcheck(node_index) != c) {
+ break;
+ }
+ bool is_leaf = xcda_view.get_leaf(node_index);
+ // get BASE of the current node
+ node_index ^= xcda_view.get_base(node_index);
+ // if LEAF of the current node is true, it means that its BASE points to the node
+ // containing index of replacement sequence for currently matched input prefix
+                if (is_leaf) {
+ longest_prefix_length = prefix_offset - input_offset + 1;
+ // get index of replacement sequence for currently matched input prefix
+ longest_prefix_offset = xcda_view.get_value(node_index);
+ }
+ }
+ }
+
+ if (longest_prefix_length > 0) {
+ // we have a match, so return the replacement sequence
+ if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
+ throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
+ }
+ const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
+ return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
+ }
+
+ // check if the input prefix contains a valid sequence of UTF-8 code units
+ try {
+ // if yes, return this sequence unmodified
+ size_t prefix_offset = input_offset;
+ unicode_cpt_from_utf8(input, prefix_offset);
+ return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
+ } catch (std::invalid_argument & /*ex*/) {
+ // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
+ return { "\xEF\xBF\xBD", 3, 1 };
+ }
+ }
+
+ const llama_vocab & vocab;
+ const llm_tokenizer_ugm & tokenizer;
+};
+
+//
+// RWKV tokenizer
+//
+
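+// Decodes the printable escape format used by the RWKV vocab into raw bytes:
+// "\t", "\n" and "\r" become control characters, "\xhh" (the parser assumes two
+// lowercase hex digits) becomes the byte 0xhh, and any other escaped character is
+// passed through verbatim, so "\\" yields a literal backslash.
+// Example: the escaped string "a\x62" decodes to the bytes { 'a', 'b' }.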
+static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) {
+ std::vector<uint8_t> output;
+ output.reserve(escaped.size());
+
+ // Parser state
+ bool escaping = false;
+ uint8_t hex_remaining = 0;
+ uint8_t hex_acc = 0;
+
+ // Step through characters, performing parsing
+ for (const char & c : escaped) {
+ // If we're parsing a hex code, interpret the next character
+ if (hex_remaining != 0) {
+ uint8_t value = (c >= 'a') ? (c - 'a' + 10) : (c - '0');
+ hex_acc = (hex_acc << 4) + value;
+
+ hex_remaining -= 1;
+ if (hex_remaining == 0) {
+ output.push_back(hex_acc);
+ hex_acc = 0;
+ }
+
+ continue;
+ }
+
+ // If we got an escape character, interpret it
+ if (escaping) {
+ if (c == 't') {
+ output.push_back('\t');
+ } else if (c == 'n') {
+ output.push_back('\n');
+ } else if (c == 'r') {
+ output.push_back('\r');
+ } else if (c == 'x') {
+ hex_remaining = 2;
+ } else {
+ output.push_back(c);
+ }
+
+ escaping = false;
+ continue;
+ }
+
+ if (c == '\\') {
+ escaping = true;
+ continue;
+ }
+
+ output.push_back(c);
+ }
+
+ return output;
+}
+
+struct llm_tokenizer_rwkv : llm_tokenizer {
+ llm_tokenizer_rwkv(const llama_vocab & vocab) {
+ // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
+ // For now, we decode the vocab here into the lookup we'll use for tokenization.
+
+ // build trie
+ for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
+ const auto & data = vocab.get_token_data(id);
+ const auto text = llama_unescape_rwkv_token(data.text);
+ token_matcher.insert((const char *) text.data(), text.size(), id);
+ }
+ }
+
+ struct naive_trie token_matcher;
+};
+
+struct llm_tokenizer_rwkv_session {
+ llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
+
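+    // Greedy longest-match tokenization: at each position emit the longest vocab entry
+    // matching the remaining text and continue right after it (falling back to the
+    // unknown token for unmatched bytes).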
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
+ uint32_t position = 0;
+ while (position < text.size()) {
+ const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
+ if (node == NULL) {
+ // no matching token found, add unknown token
+ output.push_back(vocab.token_unk());
+ position += 1;
+ continue;
+ }
+
+            // traverse the trie to find the longest matching token
+            uint32_t token_id  = 0;
+            uint32_t match_end = 0; // position just past the end of the longest match
+            while (node != NULL) {
+                if (node->has_value) {
+                    token_id  = node->value;
+                    match_end = position + 1;
+                }
+                node = node->traverse(text[++position]);
+            }
+
+            // add the longest matching token and continue right after it
+            output.push_back(token_id);
+            position = match_end;
+ }
+ }
+
+private:
+ const llama_vocab & vocab;
+ const llm_tokenizer_rwkv & tokenizer;
+};
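+
+// Usage sketch (hypothetical; in practice these objects are created internally by the
+// vocab's tokenize path rather than by outside callers):
+//
+//     llm_tokenizer_rwkv tokenizer(vocab);
+//     llm_tokenizer_rwkv_session session(vocab, tokenizer);
+//     std::vector<llama_token> out;
+//     session.tokenize("Hello world", out);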
+
+struct llm_tokenizer_plamo2 : llm_tokenizer {
+ llm_tokenizer_plamo2(const llama_vocab & vocab) {
+ build(vocab);
+ }
+
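+    // Builds the PLaMo-2 tokenization tables from the vocab: a byte-fallback map
+    // (bytes_), a map from (leading code point, shorter-suffix id) to the id of the
+    // extended suffix (to_suffix_id_), and a flattened trie (table_) where every
+    // distinct suffix gets one row per known piece (token or token suffix) that
+    // prefixes it, plus a sentinel row terminating the group.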
+ void build(const llama_vocab & vocab) {
+ // Reset internal structures
+ tokens_.clear();
+ bytes_.assign(256, 0);
+ to_suffix_id_.clear();
+ table_.clear();
+
+ // Build token list and byte mapping
+ std::unordered_map<std::string, float> suffix_to_score;
+ std::unordered_map<std::string, llama_token> token_to_id;
+
+ for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
+ const auto & entry = vocab.get_token_data(token_id);
+ tokens_.push_back(entry.text);
+ token_to_id[entry.text] = static_cast<llama_token>(token_id);
+
+ // Handle byte tokens
+ if (vocab.is_byte(token_id)) {
+ if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
+ std::string hex_str = entry.text.substr(3, 2);
+ int byte_val = std::stoi(hex_str, nullptr, 16);
+ bytes_[byte_val] = static_cast<llama_token>(token_id);
+ }
+ continue;
+ }
+
+ // Add token and all its suffixes to suffix_to_score
+ suffix_to_score[entry.text] = entry.score;
+
+ // Extract suffixes character by character (UTF-8 aware)
+ std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
+ for (size_t i = 1; i < cpts.size(); ++i) {
+ std::string suffix;
+ for (size_t j = i; j < cpts.size(); ++j) {
+ suffix += unicode_cpt_to_utf8(cpts[j]);
+ }
+ if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
+ suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
+ }
+ }
+ }
+
+ // Check that all byte tokens are set
+ for (int i = 0; i < 256; ++i) {
+ if (bytes_[i] == 0) {
+ throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
+ }
+ }
+
+ // Build suffix list in lexicographical order of reversed strings
+ std::vector<std::string> suffixes;
+ suffixes.reserve(suffix_to_score.size() + 1);
+ for (const auto & pair : suffix_to_score) {
+ suffixes.push_back(pair.first);
+ }
+ suffixes.push_back(""); // Empty suffix
+
+ std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
+ std::string rev_a(a.rbegin(), a.rend());
+ std::string rev_b(b.rbegin(), b.rend());
+ return rev_a < rev_b;
+ });
+
+ // Build suffix_to_id and to_suffix_id_
+ std::unordered_map<std::string, int32_t> suffix_to_id;
+ int32_t num_pieces = 0;
+
+ for (const auto & suffix : suffixes) {
+ suffix_to_id[suffix] = num_pieces;
+ if (!suffix.empty()) {
+ std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
+
+ std::string remaining;
+ for (size_t i = 1; i < cpts.size(); ++i) {
+ remaining += unicode_cpt_to_utf8(cpts[i]);
+ }
+
+ int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
+ to_suffix_id_[piece_code] = num_pieces;
+
+ // Count number of pieces for this suffix
+ int32_t pieces_for_suffix = 1; // sentinel row
+ for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
+ std::string piece;
+ for (int32_t i = 0; i < piece_length; ++i) {
+ piece += unicode_cpt_to_utf8(cpts[i]);
+ }
+ if (suffix_to_score.find(piece) != suffix_to_score.end()) {
+ pieces_for_suffix++;
+ }
+ }
+ num_pieces += pieces_for_suffix;
+ } else {
+ num_pieces++; // Empty suffix contributes one piece (sentinel row)
+ }
+ }
+
+ // Build flattened table
+ table_.resize(num_pieces, std::vector<int32_t>(4, 0));
+ int32_t table_idx = 0;
+
+ for (const auto & suffix : suffixes) {
+ // Add all prefixes of the suffix to the table (in decreasing order of length)
+ std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
+ for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
+ std::string piece;
+ for (int32_t i = 0; i < piece_length; ++i) {
+ piece += unicode_cpt_to_utf8(cpts[i]);
+ }
+
+ auto score_it = suffix_to_score.find(piece);
+ if (score_it == suffix_to_score.end()) {
+ continue;
+ }
+
+ table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
+ auto token_it = token_to_id.find(piece);
+ table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;
+
+ float score = score_it->second;
+ table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
+ static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
+ table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];
+
+ table_idx++;
+ }
+
+ // Add sentinel row
+ table_[table_idx][TABLE_PIECE_LENGTH] = 1;
+ table_[table_idx][TABLE_TOKEN_ID] = -1;
+ table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
+ table_idx++;
+ }
+ }
+
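+    // Viterbi-style dynamic programming over code points, processed right to left:
+    // scores[i] holds the minimal negated cumulative score for tokenizing data[i..end),
+    // so minimizing it maximizes the summed piece scores; path[] records the chosen
+    // piece at each position for backtracking, plus a running token count used to
+    // reserve the output vector.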
+ std::vector<llama_token> encode(const std::string & text) const {
+ std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
+ // Skip the first code point if it is a BOM (Byte Order Mark)
+ if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
+ unicode_data.erase(unicode_data.begin());
+ }
+
+ if (unicode_data.empty()) {
+ return {};
+ }
+
+ const size_t data_len = unicode_data.size();
+
+ // Initialize scores array (dynamic programming)
+ std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
+ scores[data_len] = 0;
+
+ // Path array to track best tokenization
+ std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));
+
+ int32_t suffix_id = 0;
+
+ // Process from end to beginning
+ for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
+ uint32_t c = unicode_data[i];
+
+ // Find next suffix ID
+ for (size_t p = suffix_id; p < table_.size(); ++p) {
+ int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
+ auto it = to_suffix_id_.find(piece_code);
+ suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;
+
+ if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
+ break;
+ }
+ }
+
+ // Update best path
+ for (size_t p = suffix_id; p < table_.size(); ++p) {
+ int32_t score = table_[p][TABLE_SCORE];
+ if (score > INVALID_SCORE) {
+ int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
+ int64_t s = scores[i + piece_length] - score;
+
+ if (s < scores[i]) {
+ scores[i] = s;
+ path[i][PATH_TOKEN_LENGTH] = piece_length;
+ path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
+ path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;
+
+ if (score == UNKNOWN_SCORE) {
+ // Add UTF-8 byte count
+ path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
+ }
+ }
+ }
+
+ if (score == UNKNOWN_SCORE) {
+ break;
+ }
+ }
+ }
+
+ // Decode the best path
+ std::vector<llama_token> token_ids;
+ token_ids.reserve(path[0][PATH_NUM_TOKENS]);
+
+ int pos = 0;
+ while (pos < static_cast<int>(data_len)) {
+ if (path[pos][PATH_TOKEN_ID] >= 0) {
+ token_ids.push_back(path[pos][PATH_TOKEN_ID]);
+ } else {
+ // Fall back to byte tokens
+ uint32_t c = unicode_data[pos];
+ int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
+
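+                // re-encode the code point as s UTF-8 bytes by hand: the leading byte
+                // carries the length prefix ((0xF00 >> s) & 0xFF yields 0xC0/0xE0/0xF0
+                // for s = 2/3/4), continuation bytes start at 0x80, and each byte of a
+                // multi-byte sequence takes 6 payload bits of c before the bytes_[] lookup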
+ for (int i = 0; i < s; ++i) {
+ uint8_t b;
+ if (s == 1) {
+ b = c;
+ } else {
+ if (i == 0) {
+ b = (0xF00 >> s) & 0xFF;
+ } else {
+ b = 0x80;
+ }
+ }
+ token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
+ }
+ }
+
+ assert(path[pos][PATH_TOKEN_LENGTH] > 0);
+ pos += path[pos][PATH_TOKEN_LENGTH];
+ }
+
+ return token_ids;
+ }
+private:
+ // Constants for table structure
+ static constexpr int32_t TABLE_PIECE_LENGTH = 0;
+ static constexpr int32_t TABLE_TOKEN_ID = 1;
+ static constexpr int32_t TABLE_SCORE = 2;
+ static constexpr int32_t TABLE_PIECE_ID = 3;
+
+ // Constants for path array
+ static constexpr int32_t PATH_TOKEN_LENGTH = 0;
+ static constexpr int32_t PATH_TOKEN_ID = 1;
+ static constexpr int32_t PATH_NUM_TOKENS = 2;
+
+ // Score constants
+ static constexpr int32_t INVALID_SCORE = -20000000;
+ static constexpr int32_t UNKNOWN_SCORE = -10000000;
+
+ // List of tokens in the vocabulary
+ std::vector<std::string> tokens_;
+
+ // Mapping from byte code point to token ID (for byte fallback)
+ std::vector<llama_token> bytes_;
+
+ // Mapping from piece code to suffix ID
+ std::unordered_map<int64_t, int32_t> to_suffix_id_;
+
+ // Flattened table representing the Trie structure
+ // Each row contains: [piece_length, token_id, score, piece_id]
+ std::vector<std::vector<int32_t>> table_;
+};
+
+struct llm_tokenizer_plamo2_session {
+ llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}
+
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
+ std::vector<llama_token> tokens = tokenizer.encode(text);
+ output.insert(output.end(), tokens.begin(), tokens.end());
+ }
+
+private:
+ const llm_tokenizer_plamo2 & tokenizer;
+};
+
+//
+// impl
+//
+
+typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
+ FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
+ FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
+} FRAGMENT_BUFFER_VARIANT_TYPE;
+
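+// A fragment of the input being tokenized: either an already-resolved special token id,
+// or an (offset, length) view into the raw text that still awaits tokenization.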
+struct fragment_buffer_variant {
+ fragment_buffer_variant(llama_token _token)
+ :
+ type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
+ token(_token),
+ raw_text(_dummy),
+ offset(0),
+ length(0) {}
+
+ fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
+ :
+ type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
+        token((llama_token) -1),
+        raw_text(_raw_text),
+        offset(_offset),
+        length(_length) {
+ GGML_ASSERT(_offset >= 0);
+ GGML_ASSERT(_length >= 1);
+ GGML_ASSERT(offset + length <= raw_text.length());
+ }
+
+ const FRAGMENT_BUFFER_VARIANT_TYPE type;
+ const llama_token token;
+ const std::string _dummy;
+ const std::string & raw_text;
+ const uint64_t offset;
+ const uint64_t length;
+};
+
+struct llama_vocab::impl {
+ uint32_t n_token_types = 0; // for BERT-style token types
+
+ std::string tokenizer_model;
+ std::string tokenizer_pre;
+
+ enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+ enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+
+ int max_token_len = 0; // used for optimizing longest token search
+
+ // default LLaMA special tokens
+ // TODO: should we set all of these to LLAMA_TOKEN_NULL?
+ llama_token special_bos_id = 1;
+ llama_token special_eos_id = 2;
+ llama_token special_eot_id = LLAMA_TOKEN_NULL;
+ llama_token special_eom_id = LLAMA_TOKEN_NULL;
+ llama_token special_unk_id = 0;
+ llama_token special_sep_id = LLAMA_TOKEN_NULL;
+ llama_token special_pad_id = LLAMA_TOKEN_NULL;
+ llama_token special_mask_id = LLAMA_TOKEN_NULL;
+
+ llama_token linefeed_id = 13;
+
+ // fim tokens
+ llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
+ llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
+ llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
+ llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
+ llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
+ llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
+
+ // tokenizer flags
+ bool add_space_prefix = false;
+ bool add_bos = false;
+ bool add_eos = false;
+ bool add_sep = false;
+ bool ignore_merges = false;
+ bool clean_spaces = false; // clean_up_tokenization_spaces
+ bool remove_extra_whitespaces = false;
+ bool escape_whitespaces = true;
+ bool treat_whitespace_as_suffix = false;
+
+ std::unordered_map<std::string, llama_token> token_to_id;
+ std::vector<token_data> id_to_token;
+
+ std::vector<llama_token> cache_special_tokens;
+ std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
+ struct pair_hash {
+ size_t operator()(const std::pair<std::string, std::string> & p) const {
+            return std::hash<std::string>{}(p.first) ^ // combine the two string hashes;
+                   (std::hash<std::string>{}(p.second) << 1); // the shift breaks symmetry between (a,b) and (b,a)
+ }
+ };
+ std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;
+
+ // set of all tokens that cause "end of generation"
+ std::set<llama_token> special_eog_ids;
+
+ std::unique_ptr<llm_tokenizer> tokenizer;
+
+ std::vector<char> precompiled_charsmap;
+
+ impl(const llama_vocab & vocab) : vocab(vocab) {
+ }
+
+ ~impl() = default;
+
+ void load(llama_model_loader & ml, const LLM_KV & kv);
+
+ enum llama_vocab_type get_type() const;
+
+ std::string type_name() const;
+
+ bool is_normal (llama_token id) const;
+ bool is_unknown (llama_token id) const;
+ bool is_control (llama_token id) const;
+ bool is_byte (llama_token id) const;
+ bool is_user_defined(llama_token id) const;
+ bool is_unused (llama_token id) const;
+ bool is_eog (llama_token id) const;
+
+ uint8_t token_to_byte(llama_token id) const;
+
+ llama_token_attr token_get_attr(llama_token id) const;
+
+ void init_tokenizer(enum llama_vocab_type type);
+
+ void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;
+
+ std::string token_to_piece_for_cache(
+ llama_token token,
+ bool special) const;
+
+
+ std::vector<llama_token> tokenize(
+ const std::string & raw_text,
+ bool add_special,
+ bool parse_special = false) const;
+
+ int32_t tokenize(
+ const char * text,
+ int32_t text_len,
+ llama_token * tokens,
+ int32_t n_tokens_max,
+ bool add_special,
+ bool parse_special) const;
+
+ // does not write null-terminator to buf
+ int32_t token_to_piece(
+ llama_token token,
+ char * buf,
+ int32_t length,
+ int32_t lstrip,
+ bool special) const;
+
+ // use cached data
+ const std::string & token_to_piece(llama_token token) const;
+
+ int32_t detokenize(
+ const llama_token * tokens,
+ int32_t n_tokens,
+ char * text,
+ int32_t text_len_max,
+ bool remove_special,
+ bool unparse_special) const;
+
+ std::string detokenize(
+ const std::vector<llama_token> & tokens,
+ bool special) const;
+
+ void print_info() const;
+
+private:
+ const llama_vocab & vocab;
+};
+
+void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+ struct gguf_context * ctx = ml.meta.get();
+
+ // determine vocab type
+ {
+ ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
+
+ if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
+ type = LLAMA_VOCAB_TYPE_NONE;
+
+ // default special tokens
+ special_bos_id = LLAMA_TOKEN_NULL;
+ special_eos_id = LLAMA_TOKEN_NULL;
+ special_unk_id = LLAMA_TOKEN_NULL;
+ special_sep_id = LLAMA_TOKEN_NULL;
+ special_pad_id = LLAMA_TOKEN_NULL;
+ special_mask_id = LLAMA_TOKEN_NULL;
+ linefeed_id = LLAMA_TOKEN_NULL;
+
+ // read vocab size from metadata
+ uint32_t n_tokens = 0;
+ if (ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) {
+ LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens);
+ id_to_token.resize(n_tokens);
+ }
+
+ return;
+ }
+
+ if (tokenizer_model == "llama") {
+ type = LLAMA_VOCAB_TYPE_SPM;
+
+ // default special tokens
+ special_bos_id = 1;
+ special_eos_id = 2;
+ special_unk_id = 0;
+ special_sep_id = LLAMA_TOKEN_NULL;
+ special_pad_id = LLAMA_TOKEN_NULL;
+ special_mask_id = LLAMA_TOKEN_NULL;
+ } else if (tokenizer_model == "bert") {
+ type = LLAMA_VOCAB_TYPE_WPM;
+
+ // default special tokens
+ special_bos_id = 101;
+ special_eos_id = LLAMA_TOKEN_NULL;
+ special_unk_id = 100;
+ special_sep_id = 102;
+ special_pad_id = 0;
+ special_mask_id = 103;
+
+ add_sep = true;
+ } else if (tokenizer_model == "gpt2") {
+ type = LLAMA_VOCAB_TYPE_BPE;
+
+ // read bpe merges and populate bpe ranks
+ const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
+ // Kimi-K2 uses custom tokenization without traditional BPE merges
+ const bool is_kimi_k2 = (tokenizer_pre == "kimi-k2");
+
+ if (merges_keyidx == -1) {
+ if (!is_kimi_k2) {
+ throw std::runtime_error("cannot find tokenizer merges in model file\n");
+ }
+ // Kimi-K2 doesn't need merges, skip
+ LLAMA_LOG_INFO("%s: Kimi-K2 tokenizer detected, skipping BPE merges\n", __func__);
+ } else {
+ const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
+ for (int i = 0; i < n_merges; i++) {
+ const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+ //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
+
+ std::string first;
+ std::string second;
+
+ const size_t pos = word.find(' ', 1);
+
+ if (pos != std::string::npos) {
+ first = word.substr(0, pos);
+ second = word.substr(pos + 1);
+ }
+
+ bpe_ranks.emplace(std::make_pair(first, second), i);
+ }
+ }
+
+ // default special tokens
+ special_bos_id = 11;
+ special_eos_id = 11;
+ special_unk_id = LLAMA_TOKEN_NULL;
+ special_sep_id = LLAMA_TOKEN_NULL;
+ special_pad_id = LLAMA_TOKEN_NULL;
+ special_mask_id = LLAMA_TOKEN_NULL;
+ } else if (tokenizer_model == "t5") {
+ type = LLAMA_VOCAB_TYPE_UGM;
+
+ // default special tokens
+ special_bos_id = LLAMA_TOKEN_NULL;
+ special_eos_id = 1;
+ special_unk_id = 2;
+ special_sep_id = LLAMA_TOKEN_NULL;
+ special_pad_id = 0;
+ special_mask_id = LLAMA_TOKEN_NULL;
+
+ const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
+ if (precompiled_charsmap_keyidx != -1) {
+ const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
+ GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
+
+ const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+ const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
+ precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
+#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+            // correct endianness of data in the precompiled_charsmap binary blob
+ uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
+ *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
+ assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
+ size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
+ uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
+ for (size_t i = 0; i < xcda_array_size; ++i) {
+ xcda_array[i] = __builtin_bswap32(xcda_array[i]);
+ }
+#endif
+ }
+ } else if (tokenizer_model == "rwkv") {
+ type = LLAMA_VOCAB_TYPE_RWKV;
+
+ // default special tokens
+ special_bos_id = LLAMA_TOKEN_NULL;
+ special_eos_id = LLAMA_TOKEN_NULL;
+ special_unk_id = LLAMA_TOKEN_NULL;
+ special_sep_id = LLAMA_TOKEN_NULL;
+ special_pad_id = LLAMA_TOKEN_NULL;
+ } else if (tokenizer_model == "plamo2") {
+ type = LLAMA_VOCAB_TYPE_PLAMO2;
+
+ // PLaMo-2 default special tokens (these will be overridden by model config)
+ special_bos_id = 1; // <|plamo:bos|>
+ special_eos_id = 2; // <|plamo:eos|>
+ special_unk_id = 0; // <|plamo:unk|>
+ special_sep_id = LLAMA_TOKEN_NULL;
+ special_pad_id = 3; // <|plamo:pad|>
+ special_mask_id = LLAMA_TOKEN_NULL;
+ } else {
+ throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
+ }
+
+ // for now, only BPE models have pre-tokenizers
+ if (type == LLAMA_VOCAB_TYPE_BPE) {
+ add_space_prefix = false;
+ clean_spaces = true;
+ if (tokenizer_pre.empty()) {
+ LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+ LLAMA_LOG_WARN("%s: \n", __func__);
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+ LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
+ LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+ LLAMA_LOG_WARN("%s: \n", __func__);
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ } else if (tokenizer_pre == "default") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ } else if (
+ tokenizer_pre == "llama3" ||
+ tokenizer_pre == "llama-v3" ||
+ tokenizer_pre == "llama-bpe"||
+ tokenizer_pre == "falcon3" ||
+ tokenizer_pre == "falcon-h1" ||
+ tokenizer_pre == "pixtral" ||
+ tokenizer_pre == "midm-2.0" ||
+ tokenizer_pre == "lfm2") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+ ignore_merges = true;
+ add_bos = true;
+ } else if (
+ tokenizer_pre == "deepseek-llm") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "deepseek-coder") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "deepseek-v3") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "youtu") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
+ clean_spaces = false;
+ ignore_merges = true;
+ } else if (
+ tokenizer_pre == "falcon") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
+ } else if (
+ tokenizer_pre == "mpt") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_MPT;
+ } else if (
+ tokenizer_pre == "starcoder") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+ } else if (
+ tokenizer_pre == "gpt-2" ||
+ tokenizer_pre == "phi-2" ||
+ tokenizer_pre == "jina-es" ||
+ tokenizer_pre == "jina-de" ||
+ tokenizer_pre == "gigachat" ||
+ tokenizer_pre == "jina-v2-es" ||
+ tokenizer_pre == "jina-v2-de" ||
+ tokenizer_pre == "a.x-4.0" ||
+ tokenizer_pre == "mellum" ||
+ tokenizer_pre == "modern-bert" ) {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+ } else if (
+ tokenizer_pre == "jina-v1-en" ||
+ tokenizer_pre == "jina-v2-code" ||
+ tokenizer_pre == "roberta-bpe") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+ add_sep = true;
+ } else if (
+ tokenizer_pre == "refact") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
+ } else if (
+ tokenizer_pre == "command-r") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "qwen2" ||
+ tokenizer_pre == "deepseek-r1-qwen" ||
+ tokenizer_pre == "kormo") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "qwen35") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN35;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "stablelm2") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
+ } else if (
+ tokenizer_pre == "olmo") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
+ } else if (
+ tokenizer_pre == "dbrx") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
+ } else if (
+ tokenizer_pre == "smaug-bpe") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+ } else if (
+ tokenizer_pre == "poro-chat") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "glm4" ||
+ tokenizer_pre == "chatglm-bpe") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
+ special_bos_id = LLAMA_TOKEN_NULL;
+ } else if (
+ tokenizer_pre == "viking") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "jais") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS;
+ } else if (
+ tokenizer_pre == "tekken") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+ clean_spaces = false;
+ ignore_merges = true;
+ add_bos = true;
+ } else if (
+ tokenizer_pre == "smollm") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "codeshell") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+ } else if (
+ tokenizer_pre == "bloom") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM;
+ } else if (
+ tokenizer_pre == "gpt3-finnish") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
+ } else if (
+ tokenizer_pre == "exaone") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+ } else if (
+ tokenizer_pre == "exaone4") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+ } else if (
+ tokenizer_pre == "exaone-moe") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE;
+ } else if (
+ tokenizer_pre == "chameleon") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
+ add_bos = true;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "minerva-7b") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA;
+ } else if (
+ tokenizer_pre == "megrez") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ } else if (
+ tokenizer_pre == "gpt-4o" ||
+ tokenizer_pre == "llama4") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "superbpe") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "trillion") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "granite-docling") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "bailingmoe" ||
+ tokenizer_pre == "bailingmoe2" ||
+ tokenizer_pre == "llada-moe") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "seed-coder") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "hunyuan") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "hunyuan-dense") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "kimi-k2") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "grok-2") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "afmoe") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_AFMOE;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "minimax-m2") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "solar-open") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
+ clean_spaces = false;
+ } else {
+ throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ }
+ } else if (type == LLAMA_VOCAB_TYPE_SPM) {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ add_space_prefix = true;
+ clean_spaces = false;
+ add_bos = true;
+ add_eos = false;
+ } else if (type == LLAMA_VOCAB_TYPE_WPM) {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ add_space_prefix = false;
+ clean_spaces = true;
+ add_bos = true;
+ add_eos = false;
+ add_sep = true;
+ } else if (type == LLAMA_VOCAB_TYPE_UGM) {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ add_bos = false;
+ add_eos = true;
+ } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ add_space_prefix = false;
+ clean_spaces = false;
+ add_bos = false;
+ add_eos = false;
+ } else {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ }
+
+ ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
+ ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
+ }
+
+ const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
+ if (token_idx == -1) {
+ throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+ }
+
+ const float * scores = nullptr;
+ const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
+ if (score_idx != -1) {
+ scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+ }
+
+ const int * toktypes = nullptr;
+ const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
+ if (toktype_idx != -1) {
+ toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+ }
+
+ uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
+ id_to_token.resize(n_tokens);
+
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ std::string word = gguf_get_arr_str(ctx, token_idx, i);
+ if (word.empty()) {
+ LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
+ word = "[EMPTY_" + std::to_string(i) + "]";
+ }
+
+ token_to_id[word] = i;
+ max_token_len = std::max(max_token_len, (int) word.size());
+
+ auto & token_data = id_to_token[i];
+ token_data.text = std::move(word);
+ token_data.score = scores ? scores[i] : 0.0f;
+ token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
+
+ if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
+ switch(toktypes[i]) {
+ case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
+ case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
+ case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
+ case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
+ case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
+ case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
+ case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+ default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+ }
+ }
+ }
+ GGML_ASSERT(id_to_token.size() == token_to_id.size());
+
+ init_tokenizer(type);
+
+ // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
+ if (type == LLAMA_VOCAB_TYPE_SPM) {
+ try {
+ linefeed_id = vocab.byte_to_token('\n');
+ } catch (const std::exception & e) {
+ LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+ linefeed_id = special_pad_id;
+ }
+ } else if (type == LLAMA_VOCAB_TYPE_WPM) {
+ linefeed_id = special_pad_id;
+ } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
+ const std::vector<int> ids = tokenize("\n", false);
+ GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+ linefeed_id = ids[0];
+ } else {
+ const std::vector<int> ids = tokenize("\n", false);
+
+ //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+ if (ids.empty()) {
+ LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
+ linefeed_id = special_pad_id;
+ } else {
+ linefeed_id = ids[0];
+ }
+ }
+
+ // special tokens
+ {
+ const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
+ { LLM_KV_TOKENIZER_BOS_ID, special_bos_id },
+ { LLM_KV_TOKENIZER_EOS_ID, special_eos_id },
+ { LLM_KV_TOKENIZER_EOT_ID, special_eot_id },
+ { LLM_KV_TOKENIZER_EOM_ID, special_eom_id },
+ { LLM_KV_TOKENIZER_UNK_ID, special_unk_id },
+ { LLM_KV_TOKENIZER_SEP_ID, special_sep_id },
+ { LLM_KV_TOKENIZER_PAD_ID, special_pad_id },
+ { LLM_KV_TOKENIZER_MASK_ID, special_mask_id },
+ { LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
+ { LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
+ { LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id },
+ { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
+ { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
+ { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
+
+ // deprecated
+ { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id },
+ };
+
+ for (const auto & it : special_token_types) {
+ const std::string & key = kv(std::get<0>(it));
+ int32_t & id = std::get<1>(it);
+
+ uint32_t new_id;
+ if (!ml.get_key(std::get<0>(it), new_id, false)) {
+ continue;
+ }
+ if (new_id >= id_to_token.size()) {
+ LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
+ __func__, key.c_str(), new_id, id);
+ } else {
+ id = new_id;
+ }
+ }
+
+ // Handle add_bos, add_eos and add_sep
+ {
+ bool temp = true;
+
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
+ add_bos = temp;
+ }
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
+ add_eos = temp;
+ }
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
+ add_sep = temp;
+ }
+ }
+
+ // auto-detect special tokens by text
+ // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
+ // for now, we apply this workaround to find the tokens based on their text
+
+ for (const auto & t : token_to_id) {
+ auto & attr = id_to_token[t.second].attr;
+
+ // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+ if (special_eot_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|eot_id|>"
+ || t.first == "<|im_end|>"
+ || t.first == "<|end|>"
+ || t.first == "<end_of_turn>"
+ || t.first == "<|endoftext|>"
+ || t.first == "<|end_of_text|>" // granite
+ || t.first == "<EOT>"
+ || t.first == "_<EOT>"
+ || t.first == "[EOT]" // Kimi-K2
+ || t.first == "<|end▁of▁sentence|>" // DeepSeek
+ || t.first == "<end_of_utterance>" // smoldocling
+ ) {
+ special_eot_id = t.second;
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+ }
+ }
+ }
+
+ // find EOM token: "<|eom_id|>"
+ if (special_eom_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|eom_id|>"
+ ) {
+ special_eom_id = t.second;
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+ }
+ }
+ }
+
+ // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
+ if (special_fim_pre_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_prefix|>" // Qwen
+ || t.first == "<fim-prefix>"
+ || t.first == "<fim_prefix>" // Granite
+ || t.first == "<|fim▁begin|>" // DeepSeek
+ || t.first == "<PRE>"
+ || t.first == "▁<PRE>" // CodeLlama
+ || t.first == "<|code_prefix|>" // GLM-4.5
+ || t.first == "<|prefix|>" // Falcon-H1-Tiny-Coder
+ ) {
+ special_fim_pre_id = t.second;
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+ }
+ }
+ }
+
+ // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
+ if (special_fim_suf_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_suffix|>" // Qwen
+ || t.first == "<fim-suffix>"
+ || t.first == "<fim_suffix>" // Granite
+ || t.first == "<|fim▁hole|>" // DeepSeek
+ || t.first == "<SUF>"
+ || t.first == "▁<SUF>" // CodeLlama
+ || t.first == "<|code_suffix|>" // GLM-4.5
+ || t.first == "<|suffix|>" // Falcon-H1-Tiny-Coder
+ ) {
+ special_fim_suf_id = t.second;
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+ }
+ }
+ }
+
+ // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
+ if (special_fim_mid_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_middle|>" // Qwen
+ || t.first == "<fim-middle>"
+ || t.first == "<fim_middle>" // Granite
+ || t.first == "<|fim▁end|>" // DeepSeek
+ || t.first == "<MID>"
+ || t.first == "▁<MID>" // CodeLlama
+ || t.first == "<|code_middle|>" // GLM-4.5
+ || t.first == "<|middle|>" // Falcon-H1-Tiny-Coder
+ ) {
+ special_fim_mid_id = t.second;
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+ }
+ }
+ }
+
+ // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
+ if (special_fim_pad_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_pad|>" // Qwen
+ || t.first == "<fim-pad>"
+ || t.first == "<fim_pad>" // Granite
+ || t.first == "<PAD>"
+ || t.first == "[PAD]" // Kimi-K2
+ ) {
+ special_fim_pad_id = t.second;
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+ }
+ }
+ }
+
+ // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
+ if (special_fim_rep_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_repo|>" // Qwen
+ || t.first == "<|repo_name|>"
+ || t.first == "<fim-repo>"
+ || t.first == "<REPO>"
+ || t.first == "<reponame>" // Granite
+ ) {
+ special_fim_rep_id = t.second;
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+ }
+ }
+ }
+
+ // find FIM_SEP token: "<|file_sep|>"
+ if (special_fim_sep_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|file_sep|>" // Qwen
+ ) {
+ special_fim_sep_id = t.second;
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+ }
+ }
+ }
+ }
+
+ // auto-detect unused tokens: e.g. control tokens with the word "unused"
+ // ideally, these tokens should be marked as unused during conversion
+ {
+ uint32_t n_unused = 0;
+
+ for (const auto & t : token_to_id) {
+ auto & attr = id_to_token[t.second].attr;
+
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ continue;
+ }
+
+ if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
+ if (strstr(t.first.c_str(), "unused") != NULL) {
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
+ }
+ }
+
+ if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
+ n_unused++;
+ }
+ }
+
+ LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
+ }
+
+ // maintain a list of tokens that cause end-of-generation
+ // this is currently determined based on the token text, which is obviously not ideal
+ // ref: https://github.com/ggml-org/llama.cpp/issues/9606
+ special_eog_ids.clear();
+
+ if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
+ special_eog_ids.insert(special_fim_pad_id);
+ }
+
+ if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_rep_id) == 0) {
+ special_eog_ids.insert(special_fim_rep_id);
+ }
+
+ if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_sep_id) == 0) {
+ special_eog_ids.insert(special_fim_sep_id);
+ }
+
+ for (const auto & t : token_to_id) {
+ auto & attr = id_to_token[t.second].attr;
+
+ if (false
+ || t.first == "<|eot_id|>"
+ || t.first == "<|im_end|>"
+ || t.first == "<|end|>"
+ || t.first == "<|return|>" // o200k_harmony
+ || t.first == "<|call|>" // o200k_harmony
+ || t.first == "<|flush|>" // solar-open
+ || t.first == "<|calls|>" // solar-open
+ || t.first == "<end_of_turn>"
+ || t.first == "<|endoftext|>"
+ || t.first == "<|eom_id|>"
+ || t.first == "<EOT>"
+ || t.first == "_<EOT>"
+ || t.first == "[EOT]" // Kimi-K2
+ || t.first == "[EOS]" // Kimi-K2
+ || t.first == "<|end_of_text|>"
+ || t.first == "<end_of_utterance>" // smoldocling
+ ) {
+ special_eog_ids.insert(t.second);
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+ }
+ } else {
+ if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) {
+ // token is control, but not marked as EOG -> print a debug log
+ if (special_eog_ids.count(t.second) == 0) {
+ LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+ __func__, t.second, t.first.c_str());
+ }
+ }
+ }
+ }
+
+ // @ngxson : quick hack for gpt-oss, always render these tokens
+ for (const auto & t : token_to_id) {
+ auto & attr = id_to_token[t.second].attr;
+
+ if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
+ LLAMA_LOG_WARN("%s: setting token '%s' (%d) attribute to USER_DEFINED (%u), old attributes: %u\n",
+ __func__, t.first.c_str(), t.second, LLAMA_TOKEN_ATTR_USER_DEFINED, attr);
+
+ attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+ }
+ }
+
+ // sanity checks
+ if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
+ special_eog_ids.insert(special_eos_id);
+ LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+ }
+
+ if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eot_id) == 0) {
+ special_eog_ids.insert(special_eot_id);
+ LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+ }
+
+ if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eom_id) == 0) {
+ special_eog_ids.insert(special_eom_id);
+ LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+ }
+
+ // TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
+ // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
+ // we remove the "<|end|>" token from the EOG list
+ {
+ bool has_return = false;
+ bool has_call = false;
+ bool has_end = false;
+ bool has_flush = false;
+
+ llama_token end_id = LLAMA_TOKEN_NULL;
+
+ LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
+ for (auto tid : special_eog_ids) {
+ auto & text = id_to_token[tid].text;
+
+ LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str());
+
+ if (text == "<|return|>") {
+ has_return = true;
+ } else if (text == "<|call|>" || text == "<|calls|>") {
+ has_call = true;
+ } else if (text == "<|flush|>") {
+ has_flush = true;
+ } else if (text == "<|end|>") {
+ has_end = true;
+ end_id = tid;
+ }
+ }
+
+ if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
+ special_eog_ids.erase(end_id);
+
+ auto & attr = id_to_token[end_id].attr;
+ attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+
+ LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+ }
+ }
+ }
+
+ // build special tokens cache
+ {
+ for (llama_token id = 0; id < (llama_token) n_tokens; ++id) {
+ if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
+ cache_special_tokens.push_back(id);
+ }
+ }
+
+ std::sort(cache_special_tokens.begin(), cache_special_tokens.end(),
+ [&] (const llama_token a, const llama_token b) {
+ return id_to_token[a].text.size() > id_to_token[b].text.size();
+ }
+ );
+
+ LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size());
+ }
+
+ // build token to piece cache
+ {
+ size_t size_cache = 0;
+
+ std::vector<std::string> cache(n_tokens);
+
+ for (uint32_t id = 0; id < n_tokens; ++id) {
+ cache[id] = token_to_piece_for_cache(id, true);
+
+ size_cache += cache[id].size();
+ }
+
+ std::swap(cache_token_to_piece, cache);
+
+ LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
+ }
+
+ // Handle per token attributes
+ //NOTE: Each model customizes per token attributes.
+ //NOTE: Per token attributes are missing from the GGUF file.
+ //TODO: Extract attributes from GGUF file.
+ {
+ auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool {
+ for (const auto & substr : substrs) {
+ if (str.find(substr) != std::string::npos) {
+ return true;
+ }
+ }
+ return false;
+ };
+
+ auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) {
+ uint32_t current = id_to_token.at(id).attr;
+ current = value ? (current | attr) : (current & ~attr);
+ id_to_token[id].attr = (llama_token_attr) current;
+ };
+
+ auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
+ _set_tokenid_attr(token_to_id.at(token), attr, value);
+ };
+
+ std::string model_name;
+ std::string tokenizer_pre;
+ std::string general_arch;
+
+ ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+ ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
+
+ // model name to lowercase
+ std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+ [] (const std::string::value_type x) {
+ return std::tolower(x);
+ }
+ );
+
+ // set attributes by model/tokenizer/architecture name
+ if (false
+ || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
+ || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
+ ) {
+ if (token_to_id.count("<mask>") == 0) {
+ LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
+ } else {
+ _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
+ }
+ } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
+ for (auto id : cache_special_tokens) {
+ _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
+ }
+ for (const auto * token : {"</s>"}) {
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
+ }
+ for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
+ }
+ } else if (_contains_any(model_name, {"modern-bert"})) {
+ if (token_to_id.count("[MASK]") == 0 ) {
+ LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
+ }
+ else {
+ _set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
+ }
+ }
+ }
+}
+
+enum llama_vocab_type llama_vocab::impl::get_type() const {
+ return type;
+}
+
+std::string llama_vocab::impl::type_name() const {
+ switch (type) {
+ case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
+ case LLAMA_VOCAB_TYPE_UGM: return "UGM";
+ case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
+ case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
+ default: return "unknown";
+ }
+}
+
+bool llama_vocab::impl::is_normal(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
+}
+
+bool llama_vocab::impl::is_unknown(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
+}
+
+bool llama_vocab::impl::is_control(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
+}
+
+bool llama_vocab::impl::is_byte(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
+}
+
+bool llama_vocab::impl::is_user_defined(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
+}
+
+bool llama_vocab::impl::is_unused(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
+}
+
+bool llama_vocab::impl::is_eog(llama_token id) const {
+ return id != LLAMA_TOKEN_NULL && special_eog_ids.count(id) > 0;
+}
+
+uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
+ GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
+ GGML_ASSERT(is_byte(id));
+ const auto & token_data = id_to_token.at(id);
+ switch (get_type()) {
+ case LLAMA_VOCAB_TYPE_SPM:
+ case LLAMA_VOCAB_TYPE_UGM: {
+ auto buf = token_data.text.substr(3, 2);
+ return strtol(buf.c_str(), NULL, 16);
+ }
+ case LLAMA_VOCAB_TYPE_BPE: {
+ GGML_ABORT("fatal error");
+ }
+ case LLAMA_VOCAB_TYPE_WPM: {
+ GGML_ABORT("fatal error");
+ }
+ default:
+ GGML_ABORT("fatal error");
+ }
+}
+
+llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token.at(id).attr;
+}
+
+void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
+ LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);
+
+ switch (type) {
+ case LLAMA_VOCAB_TYPE_SPM:
+ tokenizer = std::make_unique<llm_tokenizer_spm>(vocab);
+ break;
+ case LLAMA_VOCAB_TYPE_BPE:
+ tokenizer = std::make_unique<llm_tokenizer_bpe>(vocab);
+ break;
+ case LLAMA_VOCAB_TYPE_WPM:
+ tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
+ break;
+ case LLAMA_VOCAB_TYPE_UGM:
+ tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
+ break;
+ case LLAMA_VOCAB_TYPE_RWKV:
+ tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
+ break;
+ case LLAMA_VOCAB_TYPE_PLAMO2:
+ tokenizer = std::make_unique<llm_tokenizer_plamo2>(vocab);
+ break;
+ default:
+ GGML_ABORT("unsupported vocab type");
+ }
+}
+
+//
+// (de-) tokenize
+//
+
+// #define PRETOKENIZERDEBUG
+
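+// partitioning sketch (editor's note): given the special token "<mask>" and raw
+// text "Hello <mask> world", the fragment buffer below evolves as
+//   [RAW("Hello <mask> world")] -> [RAW("Hello "), TOKEN(<mask>), RAW(" world")]
+// i.e. each occurrence of a special token splits a RAW_TEXT fragment into a
+// left remainder, the special token id, and a right remainder (subject to the
+// LSTRIP/RSTRIP attributes, which additionally strip adjacent whitespace).
+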
+void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const {
+ // for each special token
+ for (const llama_token special_id : cache_special_tokens) {
+ const auto & data = vocab.get_token_data(special_id);
+ const auto & text = data.text;
+
+ if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
+ // Ignore control and unknown tokens when parse_special == false
+ // User-defined tokens are still pre-tokenized before everything else
+ // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
+ // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
+ continue;
+ }
+
+ // for each text fragment
+ std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
+ while (it != buffer.end()) {
+ auto & fragment = (*it);
+
+ // if a fragment is text ( not yet processed )
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ const auto & raw_text = fragment.raw_text;
+
+ auto raw_text_base_offset = fragment.offset;
+ auto raw_text_base_length = fragment.length;
+
+ // loop over the text
+ while (true) {
+ // find the first occurrence of a given special token in this fragment
+ // passing offset argument only limit the "search area" but match coordinates
+ // are still relative to the source full raw_text
+ // string_view begins at pos 0 for the same reason
+ auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);
+
+ // no occurrences found, stop processing this fragment for a given special token
+ if (match == std::string::npos) break;
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+ auto source = std::distance(buffer.begin(), it);
+
+ // if match is further than base offset
+ // then we have some text to the left of it
+ if (match > raw_text_base_offset) {
+ // left
+ const int64_t left_remainder_offset = raw_text_base_offset;
+ int64_t left_remainder_length = match - raw_text_base_offset;
+
+ if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
+ while (left_remainder_length > 0 && isspace((unsigned char) raw_text[left_remainder_offset + left_remainder_length - 1])) {
+ left_remainder_length--;
+ }
+ }
+
+ if (left_remainder_length > 0) {
+ buffer.emplace_after(it, raw_text, left_remainder_offset, left_remainder_length);
+ it++;
+ }
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_remainder_offset, left_remainder_length, raw_text.substr(left_remainder_offset, left_remainder_length).c_str());
+#endif
+ }
+
+ // special token
+ buffer.emplace_after(it, special_id);
+ it++;
+
+ // right
+ if (match + text.length() < raw_text_base_offset + raw_text_base_length) {
+ int64_t right_remainder_offset = match + text.length();
+ int64_t right_remainder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length());
+
+ if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
+ while (right_remainder_length > 0 && isspace((unsigned char) raw_text[right_remainder_offset])) {
+ right_remainder_offset++;
+ right_remainder_length--;
+ }
+ }
+
+ if (right_remainder_length > 0) {
+ buffer.emplace_after(it, raw_text, right_remainder_offset, right_remainder_length);
+ it++;
+ }
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_remainder_offset, right_remainder_length, raw_text.substr(right_remainder_offset, right_remainder_length).c_str());
+#endif
+
+ if (source == 0) {
+ buffer.erase_after(buffer.before_begin());
+ } else {
+ buffer.erase_after(std::next(buffer.begin(), (source - 1)));
+ }
+
+ // repeat for the right side
+ raw_text_base_offset = right_remainder_offset;
+ raw_text_base_length = right_remainder_length;
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text.substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+ } else {
+ if (source == 0) {
+ buffer.erase_after(buffer.before_begin());
+ } else {
+ buffer.erase_after(std::next(buffer.begin(), (source - 1)));
+ }
+ break;
+ }
+ }
+ }
+ it++;
+ }
+ }
+}
+
+// NOTE: avoid ever using this except for building the token_to_piece caches
+std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const {
+ std::string piece;
+ piece.resize(piece.capacity()); // using string internal cache
+ const int n_chars = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
+ if (n_chars < 0) {
+ piece.resize(-n_chars);
+ int check = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
+ GGML_ASSERT(check == -n_chars);
+ } else {
+ piece.resize(n_chars);
+ }
+
+ return piece;
+}
+
+static void llama_escape_whitespace(std::string & text) {
+ replace_all(text, " ", "\xe2\x96\x81");
+}
+
+static void llama_unescape_whitespace(std::string & word) {
+ replace_all(word, "\xe2\x96\x81", " ");
+}
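+
+// note (editor's): "\xe2\x96\x81" is U+2581 LOWER ONE EIGHTH BLOCK, the
+// SentencePiece space marker, so escaping maps "hello world" -> "hello\u2581world"
+// and unescaping inverts it.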
+
+static std::string llama_decode_text(const std::string & text) {
+ std::string decoded_text;
+
+ const auto cpts = unicode_cpts_from_utf8(text);
+ for (const auto cpt : cpts) {
+ const auto utf8 = unicode_cpt_to_utf8(cpt);
+ try {
+ decoded_text += unicode_utf8_to_byte(utf8);
+ } catch (const std::out_of_range & /*e*/) {
+ decoded_text += "[UNK_BYTE_0x";
+ for (const auto c : utf8) {
+ decoded_text += format("%02x", (uint8_t) c);
+ }
+ decoded_text += text + "]";
+ }
+ }
+
+ return decoded_text;
+}
+
+std::vector<llama_token> llama_vocab::impl::tokenize(
+ const std::string & raw_text,
+ bool add_special,
+ bool parse_special) const {
+ GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+
+ std::vector<llama_token> output;
+ std::forward_list<fragment_buffer_variant> fragment_buffer;
+
+ if (!raw_text.empty()) {
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
+ tokenizer_st_partition(fragment_buffer, parse_special);
+ }
+
+ switch (get_type()) {
+ case LLAMA_VOCAB_TYPE_SPM:
+ {
+ // OG tokenizer behavior:
+ //
+ // tokenizer.encode('', add_special_tokens=True) returns [1]
+ // tokenizer.encode('', add_special_tokens=False) returns []
+
+ bool is_prev_special = true; // prefix with space if first token
+
+ if (add_special && add_bos) {
+ GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_bos_id);
+ is_prev_special = true;
+ }
+
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text;
+
+ // prefix with space if previous is special
+ if (add_space_prefix && is_prev_special) {
+ text = ' ';
+ }
+
+ text += fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+ llama_escape_whitespace(text);
+ llm_tokenizer_spm_session session(vocab);
+ session.tokenize(text, output);
+ is_prev_special = false;
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ is_prev_special = true;
+ }
+ }
+
+ if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+
+ if (add_special && add_eos) {
+ GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_eos_id);
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_BPE:
+ {
+ // the session calls methods that do not exist on the base llm_tokenizer,
+ // so downcast to the concrete BPE tokenizer here
+ llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
+ if (add_special) {
+ session.append_bos(output);
+ }
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+ session.tokenize(text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ session.append(fragment.token, output);
+ }
+ }
+
+ if (add_special) {
+ session.append_eos(output);
+ session.check_double_bos_eos(output);
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_WPM:
+ {
+ if (add_special) {
+ GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_bos_id);
+ }
+
+ llm_tokenizer_wpm_session session(vocab);
+
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+ session.tokenize(text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ }
+ }
+
+ if (add_special) {
+ GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_sep_id);
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_UGM:
+ {
+ if (add_special && add_bos) {
+ GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_bos_id);
+ }
+ llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get()));
+
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+ session.tokenize(text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ }
+ }
+
+ if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+
+ if (add_special && add_eos) {
+ GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_eos_id);
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_RWKV:
+ {
+ llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get()));
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+
+ session.tokenize(text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ }
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_PLAMO2:
+ {
+ llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+
+ session.tokenize(text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ }
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_NONE:
+ GGML_ABORT("fatal error");
+ }
+
+ return output;
+}
+
+int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
+ // ref: https://github.com/ggml-org/llama.cpp/pull/7587#discussion_r1620983843
+ static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
+ const llama_token_attr attr = token_get_attr(token);
+ if (!special && (attr & attr_special)) {
+ return 0;
+ }
+
+ // copy piece chars to output text buffer
+ // skip up to 'lstrip' leading spaces before copying
+ auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
+ if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
+ GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
+ }
+
+ for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
+ token++;
+ size--;
+ }
+ if (length < (int32_t)size) {
+ return -(int32_t) size;
+ }
+ memcpy(buf, token, size);
+ return (int32_t) size;
+ };
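+
+ // contract sketch (editor's note): with length == 3 and a 5-byte piece
+ // "Hello", _try_copy returns -5 without writing to buf; callers grow the
+ // buffer to the negated return value and retry (see token_to_piece_for_cache
+ // above for the two-call pattern).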
+
+ // if we have a cache - use it
+ {
+ const auto & cache = cache_token_to_piece;
+
+ if (!cache.empty()) {
+ const auto & result = cache.at(token);
+ return _try_copy(result.data(), result.size());
+ }
+ }
+
+ if (0 <= token && token < (int32_t) id_to_token.size()) {
+ const std::string & token_text = id_to_token[token].text;
+ switch (get_type()) {
+ case LLAMA_VOCAB_TYPE_WPM:
+ case LLAMA_VOCAB_TYPE_SPM:
+ case LLAMA_VOCAB_TYPE_UGM: {
+ // NOTE: we accept all unsupported token types,
+ // suppressing them like CONTROL tokens.
+ if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+ return _try_copy(token_text.data(), token_text.size());
+ }
+ if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+ std::string result = token_text;
+ llama_unescape_whitespace(result);
+ return _try_copy(result.data(), result.size());
+ }
+ if (attr & LLAMA_TOKEN_ATTR_BYTE) {
+ char byte = (char) token_to_byte(token);
+ return _try_copy((char*) &byte, 1);
+ }
+ break;
+ }
+ case LLAMA_VOCAB_TYPE_BPE: {
+ // NOTE: we accept all unsupported token types,
+ // suppressing them like CONTROL tokens.
+ if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+ return _try_copy(token_text.data(), token_text.size());
+ }
+ if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+ std::string result = llama_decode_text(token_text);
+ return _try_copy(result.data(), result.size());
+ }
+ break;
+ }
+ case LLAMA_VOCAB_TYPE_RWKV: {
+ std::vector<uint8_t> result = llama_unescape_rwkv_token(token_text);
+
+ // If we don't have enough space, return an error
+ if (result.size() > (size_t)length) {
+ return -(int)result.size();
+ }
+
+ memcpy(buf, result.data(), result.size());
+ return (int)result.size();
+ }
+ case LLAMA_VOCAB_TYPE_PLAMO2: {
+ // PLaMo-2 uses similar token handling as BPE/SPM
+ if (vocab.is_byte(token)) {
+ // Handle byte tokens like <0xXX>
+ if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') {
+ int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);
+ if (length < 1) {
+ return -1;
+ }
+ buf[0] = static_cast<char>(hex_val);
+ return 1;
+ }
+ }
+
+ // Normal token - just copy the text
+ std::string result = token_text;
+ return _try_copy(result.data(), result.size());
+ }
+ default:
+ GGML_ABORT("fatal error");
+ }
+ }
+
+ return 0;
+}
+
+const std::string & llama_vocab::impl::token_to_piece(llama_token token) const {
+ return cache_token_to_piece.at(token);
+}
+
+int32_t llama_vocab::impl::detokenize(
+ const llama_token * tokens,
+ int32_t n_tokens,
+ char * text,
+ int32_t text_len_max,
+ bool remove_special,
+ bool unparse_special) const {
+ if (type == LLAMA_VOCAB_TYPE_NONE) {
+ return 0;
+ }
+
+ GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+
+ int32_t avail = text_len_max;
+ int32_t total = 0;
+
+ // remove the leading space
+ bool remove_space = add_space_prefix;
+
+ if (remove_special && add_bos) {
+ if (n_tokens > 0 && tokens[0] == special_bos_id) {
+ remove_space = false;
+ n_tokens--;
+ tokens++;
+ }
+ }
+
+ if (remove_special && add_eos) {
+ if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) {
+ n_tokens--;
+ }
+ }
+
+ for (int32_t i = 0; i < n_tokens; ++i) {
+ GGML_ASSERT(avail >= 0);
+ int32_t n_chars = token_to_piece(tokens[i], text, avail, remove_space, unparse_special);
+ remove_space = false;
+ if (n_chars < 0) {
+ avail = 0;
+ total -= n_chars;
+ } else if (n_chars > 0) {
+ avail -= n_chars;
+ text += n_chars;
+ total += n_chars;
+ }
+ }
+
+ if (total > text_len_max) {
+ return -total;
+ }
+
+ if (clean_spaces) {
+ text -= total; // restart text
+
+ // first pass: characters ?!., //TODO: where do these characters come from?
+ const int32_t total1 = total;
+ total = total ? 1 : 0;
+ for (int32_t i = 1; i < total1; ++i) {
+ const char x = text[i];
+ if (text[i - 1] == ' ') {
+ if (x == '?' || x == '!' || x == '.' || x == ',') { // " ?", " !", " .", " ,"
+ total--; // remove space
+ }
+ }
+ text[total++] = x;
+ }
+
+ // second pass: strip single apostrophe between spaces
+ const int32_t total2 = total;
+ total = total ? 1 : 0;
+ for (int32_t i = 1; i < total2; ++i) {
+ const char x = text[i];
+ if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') { // " ' "
+ total--; // remove prev space
+ text[++i] = '\0'; // remove next space
+ }
+ text[total++] = x;
+ }
+
+ // third pass: apostrophe contractions //NOTE: does this always make sense?
+ const int32_t total3 = total;
+ total = total ? 1 : 0;
+ for (int32_t i = 1; i < total3; ++i) {
+ const char x = text[i];
+ if (text[i - 1] == ' ') {
+ if (x == '\'' && i + 1 < total3) {
+ const char x1 = text[i + 1];
+ if (x1 == 't' || x1 == 'd') { // " 't", " 'd"
+ //total--; // remove space
+ } else if (x1 == 's' || x1 == 'm') { // " 's", " 'm"
+ total--; // remove space
+ } else if (i + 2 < total3) {
+ const char x2 = text[i + 2];
+ if ((x1 == 'l' && x2 == 'l')) { // " 'll"
+ //total--; // remove space
+ } else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) { // " 're", " 've"
+ total--; // remove space
+ } else {
+ //total--; // remove space
+ }
+ } else {
+ //total--; // remove space
+ }
+ }
+ }
+ text[total++] = x;
+ }
+ }
+
+ return total <= text_len_max ? total : -total;
+}
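+
+// worked example (editor's note): with clean_spaces enabled the passes above
+// turn the raw concatenation "Hello , world ; it 's fine" into
+// "Hello, world ; it's fine": the space is removed before ?!., and before the
+// 's/'m/'re/'ve contractions, but deliberately kept before 't/'d/'ll.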
+
+void llama_vocab::impl::print_info() const {
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
+ LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
+
+ // special tokens
+ if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
+ if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
+ if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
+ if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
+ if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
+ if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
+ if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
+ if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
+
+ if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
+
+ if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
+ if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
+ if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
+ if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
+ if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
+ if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
+
+ for (const auto & id : special_eog_ids) {
+ LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
+ }
+
+ LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
+}
+
+llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
+}
+
+llama_vocab::~llama_vocab() = default;
+
+void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
+ pimpl->load(ml, kv);
+}
+
+std::string llama_vocab::get_tokenizer_model() const {
+ return pimpl->tokenizer_model;
+}
+
+std::string llama_vocab::get_tokenizer_pre() const {
+ return pimpl->tokenizer_pre;
+}
+
+enum llama_vocab_type llama_vocab::get_type() const {
+ return pimpl->type;
+}
+
+enum llama_vocab_pre_type llama_vocab::get_pre_type() const {
+ return pimpl->pre_type;
+}
+
+uint32_t llama_vocab::n_tokens() const {
+ return (uint32_t) pimpl->id_to_token.size();
+}
+
+uint32_t llama_vocab::n_token_types() const {
+ return (uint32_t) pimpl->n_token_types;
+}
+
+std::string llama_vocab::type_name() const {
+ return pimpl->type_name();
+}
+
+bool llama_vocab::is_normal(llama_token id) const {
+ return pimpl->is_normal(id);
+}
+
+bool llama_vocab::is_unknown(llama_token id) const {
+ return pimpl->is_unknown(id);
+}
+
+bool llama_vocab::is_control(llama_token id) const {
+ return pimpl->is_control(id);
+}
+
+bool llama_vocab::is_byte(llama_token id) const {
+ return pimpl->is_byte(id);
+}
+
+bool llama_vocab::is_user_defined(llama_token id) const {
+ return pimpl->is_user_defined(id);
+}
+
+bool llama_vocab::is_unused(llama_token id) const {
+ return pimpl->is_unused(id);
+}
+
+bool llama_vocab::is_eog(llama_token id) const {
+ return pimpl->is_eog(id);
+}
+
+uint8_t llama_vocab::token_to_byte(llama_token id) const {
+ return pimpl->token_to_byte(id);
+}
+
+llama_token llama_vocab::byte_to_token(uint8_t ch) const {
+ GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
+ static const char * hex = "0123456789ABCDEF";
+ switch (get_type()) {
+ case LLAMA_VOCAB_TYPE_SPM:
+ case LLAMA_VOCAB_TYPE_UGM: {
+ const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
+ auto token = pimpl->token_to_id.find(buf);
+ if (token != pimpl->token_to_id.end()) {
+ return (*token).second;
+ }
+ // Try to fall back to just the byte as a string
+ const char buf2[2] = { (char)ch, 0 };
+ return pimpl->token_to_id.at(buf2);
+ }
+ case LLAMA_VOCAB_TYPE_WPM:
+ case LLAMA_VOCAB_TYPE_BPE: {
+ return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
+ }
+ case LLAMA_VOCAB_TYPE_PLAMO2: {
+ // PLaMo-2 uses byte tokens in format <0xXX>
+ char hex_str[8];
+ snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
+ return pimpl->token_to_id.at(hex_str);
+ }
+ default:
+ GGML_ABORT("fatal error");
+ }
+}
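+
+// example (editor's note): for an SPM/UGM vocab, byte_to_token(0x41) first
+// looks up the byte token "<0x41>" and, if absent, falls back to the
+// single-character token "A"; BPE/WPM vocabs use the unicode_byte_to_utf8
+// mapping instead.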
+
+llama_token llama_vocab::text_to_token(const std::string & text) const {
+ GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+ auto it = pimpl->token_to_id.find(text);
+ if (it != pimpl->token_to_id.end()) {
+ return (*it).second;
+ }
+ return LLAMA_TOKEN_NULL;
+}
+
+const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const {
+ GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+ return pimpl->id_to_token.at(id);
+}
+
+const char * llama_vocab::token_get_text(llama_token id) const {
+ GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+ return pimpl->id_to_token.at(id).text.c_str();
+}
+
+float llama_vocab::token_get_score(llama_token id) const {
+ GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+ return pimpl->id_to_token.at(id).score;
+}
+
+llama_token_attr llama_vocab::token_get_attr(llama_token id) const {
+ return pimpl->token_get_attr(id);
+}
+
+llama_token llama_vocab::token_bos() const {
+ return pimpl->special_bos_id;
+}
+
+llama_token llama_vocab::token_eos() const {
+ return pimpl->special_eos_id;
+}
+
+llama_token llama_vocab::token_eot() const {
+ return pimpl->special_eot_id;
+}
+
+llama_token llama_vocab::token_eom() const {
+ return pimpl->special_eom_id;
+}
+
+llama_token llama_vocab::token_unk() const {
+ return pimpl->special_unk_id;
+}
+
+llama_token llama_vocab::token_sep() const {
+ return pimpl->special_sep_id;
+}
+
+llama_token llama_vocab::token_nl() const {
+ return pimpl->linefeed_id;
+}
+
+llama_token llama_vocab::token_pad() const {
+ return pimpl->special_pad_id;
+}
+
+llama_token llama_vocab::token_prefix() const {
+ return pimpl->special_fim_pre_id;
+}
+
+llama_token llama_vocab::token_middle() const {
+ return pimpl->special_fim_mid_id;
+}
+
+llama_token llama_vocab::token_suffix() const {
+ return pimpl->special_fim_suf_id;
+}
+
+llama_token llama_vocab::token_fim_pre() const {
+ return pimpl->special_fim_pre_id;
+}
+
+llama_token llama_vocab::token_fim_suf() const {
+ return pimpl->special_fim_suf_id;
+}
+
+llama_token llama_vocab::token_fim_mid() const {
+ return pimpl->special_fim_mid_id;
+}
+
+llama_token llama_vocab::token_fim_pad() const {
+ return pimpl->special_fim_pad_id;
+}
+
+llama_token llama_vocab::token_fim_rep() const {
+ return pimpl->special_fim_rep_id;
+}
+
+llama_token llama_vocab::token_fim_sep() const {
+ return pimpl->special_fim_sep_id;
+}
+
+llama_token llama_vocab::token_mask() const {
+ return pimpl->special_mask_id;
+}
+
+bool llama_vocab::get_add_space_prefix() const {
+ return pimpl->add_space_prefix;
+}
+
+bool llama_vocab::get_add_bos() const {
+ return pimpl->add_bos;
+}
+
+bool llama_vocab::get_add_eos() const {
+ return pimpl->add_eos;
+}
+
+bool llama_vocab::get_add_sep() const {
+ return pimpl->add_sep;
+}
+
+bool llama_vocab::get_ignore_merges() const {
+ return pimpl->ignore_merges;
+}
+
+bool llama_vocab::get_clean_spaces() const {
+ return pimpl->clean_spaces;
+}
+
+bool llama_vocab::get_remove_extra_whitespaces() const {
+ return pimpl->remove_extra_whitespaces;
+}
+
+bool llama_vocab::get_escape_whitespaces() const {
+ return pimpl->escape_whitespaces;
+}
+
+bool llama_vocab::get_treat_whitespace_as_suffix() const {
+ return pimpl->treat_whitespace_as_suffix;
+}
+
+int llama_vocab::max_token_len() const {
+ return pimpl->max_token_len;
+}
+
+int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
+ GGML_ASSERT(token_left.find(' ') == std::string::npos);
+ GGML_ASSERT(token_left.find('\n') == std::string::npos);
+ GGML_ASSERT(token_right.find(' ') == std::string::npos);
+ GGML_ASSERT(token_right.find('\n') == std::string::npos);
+
+ auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
+ if (it == pimpl->bpe_ranks.end()) {
+ return -1;
+ }
+
+ return it->second;
+}
+
+std::vector<std::string> llama_vocab::get_bpe_merges() const {
+ std::vector<std::string> result(pimpl->bpe_ranks.size());
+
+ for (const auto & pair : pimpl->bpe_ranks) {
+ result[pair.second] = pair.first.first + " " + pair.first.second;
+ }
+
+ return result;
+}
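+
+// note (editor's): bpe_ranks maps a (left, right) token pair to its merge rank,
+// so result[0] above is the highest-priority merge; entries are serialized as
+// "left right", which is unambiguous because find_bpe_rank asserts that tokens
+// contain no spaces.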
+
+std::vector<char> llama_vocab::get_precompiled_charsmap() const {
+ return pimpl->precompiled_charsmap;
+}
+
+int32_t llama_vocab::tokenize(
+ const char * text,
+ int32_t text_len,
+ llama_token * tokens,
+ int32_t n_tokens_max,
+ bool add_special,
+ bool parse_special) const {
+ auto res = tokenize(std::string(text, text_len), add_special, parse_special);
+ if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
+ LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
+ return std::numeric_limits<int32_t>::min();
+ }
+
+ if (n_tokens_max < (int) res.size()) {
+ // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+ return -((int) res.size());
+ }
+
+ for (size_t i = 0; i < res.size(); i++) {
+ tokens[i] = res[i];
+ }
+
+ return res.size();
+}
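+
+// usage sketch (editor's note, not upstream code): a negative return value is
+// the negated number of tokens required, so a caller can size the buffer
+// exactly ("vocab" stands for any loaded llama_vocab instance):
+//
+//   std::vector<llama_token> toks(8);
+//   int32_t n = vocab.tokenize(text, text_len, toks.data(), (int32_t) toks.size(),
+//                              /*add_special=*/true, /*parse_special=*/false);
+//   if (n < 0) {
+//       toks.resize(-n);
+//       n = vocab.tokenize(text, text_len, toks.data(), (int32_t) toks.size(),
+//                          /*add_special=*/true, /*parse_special=*/false);
+//   }
+//   toks.resize(n);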
+
+std::vector<llama_token> llama_vocab::tokenize(
+ const std::string & raw_text,
+ bool add_special,
+ bool parse_special) const {
+ return pimpl->tokenize(raw_text, add_special, parse_special);
+}
+
+const std::string & llama_vocab::token_to_piece(llama_token token) const {
+ return pimpl->token_to_piece(token);
+}
+
+int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
+ return pimpl->token_to_piece(token, buf, length, lstrip, special);
+}
+
+int32_t llama_vocab::detokenize(
+ const llama_token * tokens,
+ int32_t n_tokens,
+ char * text,
+ int32_t text_len_max,
+ bool remove_special,
+ bool unparse_special) const {
+ return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+}
+
+std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
+ std::string text;
+ text.resize(std::max(text.capacity(), tokens.size()));
+ int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+ if (n_chars < 0) {
+ text.resize(-n_chars);
+ n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+ GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
+ }
+
+ text.resize(n_chars);
+
+ // NOTE: the original tokenizer decodes bytes after collecting the pieces.
+ return text;
+}
+
+void llama_vocab::print_info() const {
+ pimpl->print_info();
+}
+
+//
+// interface implementation
+//
+
+int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) {
+ return vocab->n_tokens();
+}
+
+// deprecated
+int32_t llama_n_vocab(const struct llama_vocab * vocab) {
+ return llama_vocab_n_tokens(vocab);
+}
+
+enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
+ return vocab->get_type();
+}
+
+const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) {
+ return vocab->token_get_text(token);
+}
+
+float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) {
+ return vocab->token_get_score(token);
+}
+
+enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) {
+ return vocab->token_get_attr(token);
+}
+
+bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) {
+ return vocab->is_eog(token);
+}
+
+bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) {
+ return vocab->is_control(token);
+}
+
+llama_token llama_vocab_bos(const struct llama_vocab * vocab) {
+ return vocab->token_bos();
+}
+
+llama_token llama_vocab_eos(const struct llama_vocab * vocab) {
+ return vocab->token_eos();
+}
+
+llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
+ return vocab->token_eot();
+}
+
+// deprecated
+llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
+ return vocab->token_bos();
+}
+
+llama_token llama_vocab_sep(const struct llama_vocab * vocab) {
+ return vocab->token_sep();
+}
+
+llama_token llama_vocab_nl (const struct llama_vocab * vocab) {
+ return vocab->token_nl();
+}
+
+llama_token llama_vocab_pad(const struct llama_vocab * vocab) {
+ return vocab->token_pad();
+}
+
+bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) {
+ return vocab->get_add_bos();
+}
+
+bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
+ return vocab->get_add_eos();
+}
+
+bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
+ return vocab->get_add_sep();
+}
+
+llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
+ return vocab->token_fim_pre();
+}
+
+llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) {
+ return vocab->token_fim_suf();
+}
+
+llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) {
+ return vocab->token_fim_mid();
+}
+
+llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) {
+ return vocab->token_fim_pad();
+}
+
+llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) {
+ return vocab->token_fim_rep();
+}
+
+llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
+ return vocab->token_fim_sep();
+}
+
+llama_token llama_vocab_mask(const struct llama_vocab* vocab) {
+ return vocab->token_mask();
+}
+
+// deprecated
+const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
+ return llama_vocab_get_text(vocab, token);
+}
+
+// deprecated
+float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) {
+ return llama_vocab_get_score(vocab, token);
+}
+
+// deprecated
+enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) {
+ return llama_vocab_get_attr(vocab, token);
+}
+
+// deprecated
+bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) {
+ return llama_vocab_is_eog(vocab, token);
+}
+
+// deprecated
+bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) {
+ return llama_vocab_is_control(vocab, token);
+}
+
+// deprecated
+llama_token llama_token_bos(const struct llama_vocab * vocab) {
+ return llama_vocab_bos(vocab);
+}
+
+// deprecated
+llama_token llama_token_eos(const struct llama_vocab * vocab) {
+ return llama_vocab_eos(vocab);
+}
+
+// deprecated
+llama_token llama_token_eot(const struct llama_vocab * vocab) {
+ return llama_vocab_eot(vocab);
+}
+
+// deprecated
+llama_token llama_token_cls(const struct llama_vocab * vocab) {
+ //return llama_vocab_cls(vocab);
+ return llama_vocab_bos(vocab); // avoid deprecation warning
+}
+
+// deprecated
+llama_token llama_token_sep(const struct llama_vocab * vocab) {
+ return llama_vocab_sep(vocab);
+}
+
+// deprecated
+llama_token llama_token_nl (const struct llama_vocab * vocab) {
+ return llama_vocab_nl(vocab);
+}
+
+// deprecated
+llama_token llama_token_pad(const struct llama_vocab * vocab) {
+ return llama_vocab_pad(vocab);
+}
+
+// deprecated
+bool llama_add_bos_token(const struct llama_vocab * vocab) {
+ return llama_vocab_get_add_bos(vocab);
+}
+
+// deprecated
+bool llama_add_eos_token(const struct llama_vocab * vocab) {
+ return llama_vocab_get_add_eos(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_pre(const struct llama_vocab * vocab) {
+ return llama_vocab_fim_pre(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_suf(const struct llama_vocab * vocab) {
+ return llama_vocab_fim_suf(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_mid(const struct llama_vocab * vocab) {
+ return llama_vocab_fim_mid(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_pad(const struct llama_vocab * vocab) {
+ return llama_vocab_fim_pad(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_rep(const struct llama_vocab * vocab) {
+ return llama_vocab_fim_rep(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_sep(const struct llama_vocab * vocab) {
+ return llama_vocab_fim_sep(vocab);
+}
+
+//
+// tokenization
+//
+
+int32_t llama_tokenize(
+ const struct llama_vocab * vocab,
+ const char * text,
+ int32_t text_len,
+ llama_token * tokens,
+ int32_t n_tokens_max,
+ bool add_special,
+ bool parse_special) {
+ return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
+}
+
+int32_t llama_token_to_piece(
+ const struct llama_vocab * vocab,
+ llama_token token,
+ char * buf,
+ int32_t length,
+ int32_t lstrip,
+ bool special) {
+ return vocab->token_to_piece(token, buf, length, lstrip, special);
+}
+
+int32_t llama_detokenize(
+ const struct llama_vocab * vocab,
+ const llama_token * tokens,
+ int32_t n_tokens,
+ char * text,
+ int32_t text_len_max,
+ bool remove_special,
+ bool unparse_special) {
+ return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+}
diff --git a/llama.cpp/src/llama-vocab.h b/llama.cpp/src/llama-vocab.h
new file mode 100644
index 0000000..718238f
--- /dev/null
+++ b/llama.cpp/src/llama-vocab.h
@@ -0,0 +1,184 @@
+#pragma once
+
+#include "llama.h"
+
+#include <string>
+#include <vector>
+#include <memory>
+
+// pre-tokenization types
+enum llama_vocab_pre_type {
+ LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
+ LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+ LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
+ LLAMA_VOCAB_PRE_TYPE_MPT = 5,
+ LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
+ LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
+ LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
+ LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
+ LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
+ LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
+ LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
+ LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
+ LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+ LLAMA_VOCAB_PRE_TYPE_PORO = 15,
+ LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
+ LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
+ LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
+ LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
+ LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
+ LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
+ LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
+ LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
+ LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
+ LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
+ LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
+ LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
+ LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
+ LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
+ LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
+ LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
+ LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
+ LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
+ LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
+ LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
+ LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
+ LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
+ LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
+ LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
+ LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
+ LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
+ LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
+ LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
+ LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46,
+};
+
+struct LLM_KV;
+struct llama_model_loader;
+
+struct llama_vocab {
+ struct token_data {
+ std::string text;
+ float score;
+ llama_token_attr attr;
+ };
+
+ llama_vocab();
+ ~llama_vocab();
+
+ void load(llama_model_loader & ml, const LLM_KV & kv);
+
+ std::string get_tokenizer_model() const;
+ std::string get_tokenizer_pre() const;
+
+ enum llama_vocab_type get_type() const;
+ enum llama_vocab_pre_type get_pre_type() const;
+
+ uint32_t n_tokens() const;
+ uint32_t n_token_types() const;
+
+ std::string type_name() const;
+
+ bool is_normal (llama_token id) const;
+ bool is_unknown (llama_token id) const;
+ bool is_control (llama_token id) const;
+ bool is_byte (llama_token id) const;
+ bool is_user_defined(llama_token id) const;
+ bool is_unused (llama_token id) const;
+ bool is_eog (llama_token id) const;
+
+ uint8_t token_to_byte(llama_token id) const;
+ llama_token byte_to_token(uint8_t ch) const;
+
+ llama_token text_to_token(const std::string & text) const;
+
+ const token_data & get_token_data(llama_token id) const;
+
+ const char * token_get_text (llama_token id) const;
+ float token_get_score(llama_token id) const;
+ llama_token_attr token_get_attr (llama_token id) const;
+
+ llama_token token_bos() const;
+ llama_token token_eos() const;
+ llama_token token_eot() const;
+ llama_token token_eom() const;
+ llama_token token_unk() const;
+ llama_token token_sep() const;
+ llama_token token_nl () const;
+ llama_token token_pad() const;
+ llama_token token_mask() const;
+
+ llama_token token_prefix() const;
+ llama_token token_middle() const;
+ llama_token token_suffix() const;
+
+ llama_token token_fim_pre() const;
+ llama_token token_fim_suf() const;
+ llama_token token_fim_mid() const;
+ llama_token token_fim_pad() const;
+ llama_token token_fim_rep() const;
+ llama_token token_fim_sep() const;
+
+ bool get_add_space_prefix () const;
+ bool get_add_bos () const;
+ bool get_add_eos () const;
+ bool get_add_sep () const;
+ bool get_ignore_merges () const;
+ bool get_clean_spaces () const;
+ bool get_remove_extra_whitespaces () const;
+ bool get_escape_whitespaces () const;
+ bool get_treat_whitespace_as_suffix() const;
+
+ int max_token_len() const;
+
+ int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+ std::vector<std::string> get_bpe_merges() const;
+
+ std::vector<char> get_precompiled_charsmap() const;
+
+ int32_t tokenize(
+ const char * text,
+ int32_t text_len,
+ llama_token * tokens,
+ int32_t n_tokens_max,
+ bool add_special,
+ bool parse_special) const;
+
+ std::vector<llama_token> tokenize(
+ const std::string & raw_text,
+ bool add_special,
+ bool parse_special = false) const;
+
+ // does not write null-terminator to buf
+ int32_t token_to_piece(
+ llama_token token,
+ char * buf,
+ int32_t length,
+ int32_t lstrip,
+ bool special) const;
+
+ // use cached data
+ const std::string & token_to_piece(llama_token token) const;
+
+ int32_t detokenize(
+ const llama_token * tokens,
+ int32_t n_tokens,
+ char * text,
+ int32_t text_len_max,
+ bool remove_special,
+ bool unparse_special) const;
+
+ std::string detokenize(
+ const std::vector<llama_token> & tokens,
+ bool special) const;
+
+ void print_info() const;
+
+private:
+ struct impl;
+ std::unique_ptr<impl> pimpl;
+};
diff --git a/llama.cpp/src/llama.cpp b/llama.cpp/src/llama.cpp
new file mode 100644
index 0000000..6da90d6
--- /dev/null
+++ b/llama.cpp/src/llama.cpp
@@ -0,0 +1,1174 @@
+#include "llama.h"
+
+#include "llama-impl.h"
+
+#include "llama-chat.h"
+#include "llama-context.h"
+#include "llama-mmap.h"
+#include "llama-vocab.h"
+#include "llama-model-loader.h"
+#include "llama-model-saver.h"
+#include "llama-model.h"
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <stdexcept>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+//
+// interface implementation
+//
+
+const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
+ switch (flash_attn_type) {
+ case LLAMA_FLASH_ATTN_TYPE_AUTO:
+ return "auto";
+ case LLAMA_FLASH_ATTN_TYPE_DISABLED:
+ return "disabled";
+ case LLAMA_FLASH_ATTN_TYPE_ENABLED:
+ return "enabled";
+ }
+ GGML_ABORT("fatal error");
+}
+
+struct llama_device_memory_data {
+ int64_t total;
+ int64_t free;
+ llama_memory_breakdown_data mb;
+};
+
+static std::vector<llama_device_memory_data> llama_get_device_memory_data(
+ const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
+ std::vector<ggml_backend_dev_t> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
+ const ggml_log_level log_level) {
+ struct user_data_t {
+ struct {
+ ggml_log_callback callback;
+ void * user_data;
+ } original_logger;
+ ggml_log_level min_level; // messages below this level are demoted to the debug log
+ };
+ user_data_t ud;
+ llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
+ ud.min_level = log_level;
+
+ llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
+ const user_data_t * ud = (const user_data_t *) user_data;
+ const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
+ ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
+ }, &ud);
+
+ llama_model_params mparams_copy = *mparams;
+ mparams_copy.no_alloc = true;
+ mparams_copy.use_mmap = false;
+ mparams_copy.use_mlock = false;
+
+ llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
+ if (model == nullptr) {
+ llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
+ throw std::runtime_error("failed to load model");
+ }
+
+ llama_context * ctx = llama_init_from_model(model, *cparams);
+ if (ctx == nullptr) {
+ llama_model_free(model);
+ llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
+ throw std::runtime_error("failed to create llama_context from model");
+ }
+
+ std::vector<llama_device_memory_data> ret(model->devices.size());
+
+ std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+ for (const auto & [buft, mb] : memory_breakdown) {
+ if (ggml_backend_buft_is_host(buft)) {
+ continue;
+ }
+
+ ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+ if (!dev) {
+ continue;
+ }
+ for (size_t i = 0; i < ret.size(); i++) {
+ if (model->devices[i] == dev) {
+ ret[i].mb.model += mb.model;
+ ret[i].mb.context += mb.context;
+ ret[i].mb.compute += mb.compute;
+ break;
+ }
+ }
+ }
+ for (size_t i = 0; i < ret.size(); i++) {
+ size_t free;
+ size_t total;
+ ggml_backend_dev_memory(model->devices[i], &free, &total);
+
+ // devices can return 0 bytes for free and total memory if they do not
+ // have any to report. in this case, we will use the host memory as a fallback
+ // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+ if (free == 0 && total == 0) {
+ ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (cpu_dev == nullptr) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
+ ggml_backend_dev_memory(cpu_dev, &free, &total);
+ }
+ ret[i].free = free;
+ ret[i].total = total;
+ }
+
+ devs = model->devices;
+ hp_ngl = model->hparams.n_layer;
+ hp_n_ctx_train = model->hparams.n_ctx_train;
+ hp_n_expert = model->hparams.n_expert;
+
+ llama_memory_breakdown_print(ctx); // goes to debug log
+
+ llama_free(ctx);
+ llama_model_free(model);
+ llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
+ return ret;
+}
+
+// enum to identify part of a layer for distributing its tensors:
+enum layer_fraction_t {
+ LAYER_FRACTION_NONE = 0, // nothing
+ LAYER_FRACTION_ATTN = 1, // attention
+ LAYER_FRACTION_UP = 2, // attention + up
+ LAYER_FRACTION_GATE = 3, // attention + up + gate
+ LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights
+};
+// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
+
+class llama_params_fit_exception : public std::runtime_error {
+ using std::runtime_error::runtime_error;
+};
+
+static void llama_params_fit_impl(
+ const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
+ float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
+ size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+ constexpr int64_t MiB = 1024*1024;
+ typedef std::vector<llama_device_memory_data> dmds_t;
+ const llama_model_params default_mparams = llama_model_default_params();
+
+ std::vector<ggml_backend_dev_t> devs;
+ uint32_t hp_ngl = 0; // hparams.n_gpu_layers
+ uint32_t hp_nct = 0; // hparams.n_ctx_train
+ uint32_t hp_nex = 0; // hparams.n_expert
+
+ // step 1: get data for default parameters and check whether any changes are necessary in the first place
+
+ LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
+ const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+ const size_t nd = devs.size(); // number of devices
+ if (nd == 0) {
+ LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
+ return;
+ }
+
+ std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
+ margins.reserve(nd);
+ for (size_t id = 0; id < nd; id++) {
+ margins.push_back(margins_s[id]);
+ }
+
+ std::vector<std::string> dev_names;
+ {
+ dev_names.reserve(nd);
+ size_t max_length = 0;
+ for (ggml_backend_dev_t dev : devs) {
+ std::string name = ggml_backend_dev_name(dev);
+ name += " (";
+ name += ggml_backend_dev_description(dev);
+ name += ")";
+ dev_names.push_back(name);
+ max_length = std::max(max_length, name.length());
+ }
+ for (std::string & dn : dev_names) {
+ dn.insert(dn.end(), max_length - dn.length(), ' ');
+ }
+ }
+
+ int64_t sum_free = 0;
+ int64_t sum_projected_free = 0;
+ int64_t sum_projected_used = 0;
+ int64_t sum_projected_model = 0;
+ std::vector<int64_t> projected_free_per_device;
+ projected_free_per_device.reserve(nd);
+
+ if (nd > 1) {
+ LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
+ }
+ for (size_t id = 0; id < nd; id++) {
+ const llama_device_memory_data & dmd = dmds_full[id];
+
+ const int64_t projected_used = dmd.mb.total();
+ const int64_t projected_free = dmd.free - projected_used;
+ projected_free_per_device.push_back(projected_free);
+
+ sum_free += dmd.free;
+ sum_projected_used += projected_used;
+ sum_projected_free += projected_free;
+ sum_projected_model += dmd.mb.model;
+
+ if (nd > 1) {
+ LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+ __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
+ }
+ }
+ assert(sum_free >= 0 && sum_projected_used >= 0);
+ LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
+ __func__, sum_projected_used/MiB, sum_free/MiB);
+ if (nd == 1) {
+ if (projected_free_per_device[0] >= margins[0]) {
+ LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
+ __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+ return;
+ }
+ } else {
+ bool changes_needed = false;
+ for (size_t id = 0; id < nd; id++) {
+ if (projected_free_per_device[id] < margins[id]) {
+ changes_needed = true;
+ break;
+ }
+ }
+ if (!changes_needed) {
+ LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
+ return;
+ }
+ }
+
+ // step 2: try reducing memory use by reducing the context size
+
+ {
+ int64_t global_surplus = sum_projected_free;
+ for (size_t id = 0; id < nd; id++) {
+ global_surplus -= margins[id];
+ }
+ if (global_surplus < 0) {
+ if (nd == 1) {
+ LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
+ __func__, margins[0]/MiB, -global_surplus/MiB);
+ } else {
+ LLAMA_LOG_INFO(
+ "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
+ __func__, -global_surplus/MiB);
+ }
+ if (cparams->n_ctx == 0) {
+ if (hp_nct > n_ctx_min) {
+ int64_t sum_used_target = sum_free;
+ for (size_t id = 0; id < nd; id++) {
+ sum_used_target -= margins[id];
+ }
+ if (nd > 1) {
+ // for multiple devices we need to be more conservative in terms of how much context we think can fit:
+ // - for dense models only whole layers can be assigned to devices
+ // - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
+ // - on average we expect a waste of 0.5 layers/tensors per device
+ // - use slightly more than the expected average for nd devices to be safe
+ const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
+ sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
+ }
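+
+ // worked example (editor's note): for nd = 2 devices and a dense model
+ // (hp_nex == 0) this reserves (2 + 1) * model_per_layer / 2, i.e. about
+ // 1.5 layers of headroom against per-device rounding waste.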
+
+ int64_t sum_projected_used_min_ctx = 0;
+ cparams->n_ctx = n_ctx_min;
+ const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+ for (const auto & dmd : dmds_min_ctx) {
+ sum_projected_used_min_ctx += dmd.mb.total();
+ }
+ if (sum_used_target > sum_projected_used_min_ctx) {
+ // linear interpolation between minimum and maximum context size:
+ cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
+ / (sum_projected_used - sum_projected_used_min_ctx);
+ cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
+
+ const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
+ const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
+ LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+ __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
+ if (nd == 1) {
+ LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
+ return;
+ }
+ LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
+ } else {
+ const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
+ LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+ __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
+ }
+ } else {
+ if (n_ctx_min == UINT32_MAX) {
+ LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
+ } else {
+ LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
+ __func__, hp_nct, n_ctx_min);
+ }
+ }
+ } else {
+ LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
+ }
+ }
+ }
+
+ if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
+ throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
+ }
+ if (nd > 1) {
+ if (!tensor_split) {
+ throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
+ }
+ if (mparams->tensor_split) {
+ for (size_t id = 0; id < nd; id++) {
+ if (mparams->tensor_split[id] != 0.0f) {
+ throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
+ }
+ }
+ }
+ if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
+ throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
+ }
+ }
+ if (!tensor_buft_overrides) {
+ throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
+ }
+ if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
+ throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
+ }
+
+ // step 3: iteratively fill the back to front with "dense" layers
+ // - for a dense model simply fill full layers, giving each device a contiguous slice of the model
+ // - for a MoE model, same as dense model but with all MoE tensors in system memory
+
+ // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
+ auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * {
+ constexpr size_t n_strings = 1000;
+ if (il >= n_strings) {
+ throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
+ }
+ switch (lf) {
+ case LAYER_FRACTION_ATTN: {
+ static std::array<std::string, n_strings> patterns;
+ if (patterns[il].empty()) {
+ patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|gate|down).*";
+ }
+ return patterns[il].c_str();
+ }
+ case LAYER_FRACTION_UP: {
+ static std::array<std::string, n_strings> patterns;
+ if (patterns[il].empty()) {
+ patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|down).*";
+ }
+ return patterns[il].c_str();
+ }
+ case LAYER_FRACTION_GATE: {
+ static std::array<std::string, n_strings> patterns;
+ if (patterns[il].empty()) {
+ patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
+ }
+ return patterns[il].c_str();
+ }
+ case LAYER_FRACTION_MOE: {
+ static std::array<std::string, n_strings> patterns;
+ if (patterns[il].empty()) {
+ patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate)_(ch|)exps";
+ }
+ return patterns[il].c_str();
+ }
+ default:
+ GGML_ABORT("fatal error");
+ }
+ };
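+    // e.g. get_overflow_pattern(5, LAYER_FRACTION_UP) returns the pattern blk\.5\.ffn_(gate|down).*,
+    // i.e. for layer 5 the gate and down projections overflow while attention and up stay on the device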
+
+ struct ngl_t {
+ uint32_t n_layer = 0; // number of total layers
+ uint32_t n_part = 0; // number of partial layers, <= n_layer
+
+        // for the first partial layer, varying parts can overflow; all further partial layers use LAYER_FRACTION_MOE:
+ layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
+
+ uint32_t n_full() const {
+ assert(n_layer >= n_part);
+ return n_layer - n_part;
+ }
+ };
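+    // e.g. an ngl_t with n_layer=10 and n_part=3 assigns 10 layers to a device,
+    // 7 of them full (n_full() == 7) and 3 with part of their tensors overflowing elsewhere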
+
+ const size_t ntbo = llama_max_tensor_buft_overrides();
+
+    // utility function to set n_gpu_layers, tensor_split, and tensor_buft_overrides
+ auto set_ngl_tensor_split_tbo = [&](
+ const std::vector<ngl_t> & ngl_per_device,
+ const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
+ llama_model_params & mparams) {
+ mparams.n_gpu_layers = 0;
+ for (size_t id = 0; id < nd; id++) {
+ mparams.n_gpu_layers += ngl_per_device[id].n_layer;
+ if (nd > 1) {
+ tensor_split[id] = ngl_per_device[id].n_layer;
+ }
+ }
+ assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
+ uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
+
+ mparams.tensor_split = tensor_split;
+
+ size_t itbo = 0;
+ for (size_t id = 0; id < nd; id++) {
+ il0 += ngl_per_device[id].n_full();
+ for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
+ if (itbo + 1 >= ntbo) {
+ tensor_buft_overrides[itbo].pattern = nullptr;
+ tensor_buft_overrides[itbo].buft = nullptr;
+ itbo++;
+ mparams.tensor_buft_overrides = tensor_buft_overrides;
+ throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
+ + std::to_string(ntbo) + " is insufficient for model");
+ }
+ tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
+ tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
+ itbo++;
+ }
+ il0 += ngl_per_device[id].n_part;
+ }
+ tensor_buft_overrides[itbo].pattern = nullptr;
+ tensor_buft_overrides[itbo].buft = nullptr;
+ itbo++;
+ mparams.tensor_buft_overrides = tensor_buft_overrides;
+ };
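+    // e.g. with 2 devices and ngl_per_device = {{.n_layer=10}, {.n_layer=20, .n_part=2}} this sets
+    // n_gpu_layers=30 and tensor_split={10, 20}, and writes 2 overrides (plus a null terminator)
+    // that redirect the FFN tensors of the 2 partial layers to their overflow buffer types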
+
+ // utility function that returns the memory use per device for given numbers of layers per device
+ auto get_memory_for_layers = [&](
+ const char * func_name,
+ const std::vector<ngl_t> & ngl_per_device,
+ const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
+ llama_model_params mparams_copy = *mparams;
+ set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
+
+ const dmds_t dmd_nl = llama_get_device_memory_data(
+ path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+
+ LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name);
+ for (size_t id = 0; id < nd; id++) {
+ const ngl_t & n = ngl_per_device[id];
+ LLAMA_LOG_DEBUG(
+ "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
+ func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
+ }
+
+ std::vector<int64_t> ret;
+ ret.reserve(nd);
+ for (const llama_device_memory_data & dmd : dmd_nl) {
+ ret.push_back(dmd.mb.total());
+ }
+ return ret;
+ };
+
+ int64_t global_surplus_cpu_moe = 0;
+ if (hp_nex > 0) {
+ const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors
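+        // matches e.g. "blk.17.ffn_up_exps" and "blk.3.ffn_down_chexps", but not the dense "blk.3.ffn_down"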
+ ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
+ tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
+ tensor_buft_overrides[1] = {nullptr, nullptr};
+ mparams->tensor_buft_overrides = tensor_buft_overrides;
+
+ LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
+ const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
+ path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+
+ for (size_t id = 0; id < nd; id++) {
+ global_surplus_cpu_moe += dmds_cpu_moe[id].free;
+ global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
+ }
+
+ if (global_surplus_cpu_moe > 0) {
+ LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
+ __func__, global_surplus_cpu_moe/MiB);
+ } else {
+ LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
+ __func__, -global_surplus_cpu_moe/MiB);
+ }
+
+ // reset
+ tensor_buft_overrides[0] = {nullptr, nullptr};
+ mparams->tensor_buft_overrides = tensor_buft_overrides;
+ }
+
+ std::vector<int64_t> targets; // maximum acceptable memory use per device
+ targets.reserve(nd);
+ for (size_t id = 0; id < nd; id++) {
+ targets.push_back(dmds_full[id].free - margins[id]);
+ LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
+ }
+
+ std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
+ overflow_bufts.reserve(nd);
+ for (size_t id = 0; id < nd; id++) {
+ overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
+ }
+
+ std::vector<ngl_t> ngl_per_device(nd);
+ std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
+
+ // optimize the number of layers per device using the method of false position:
+ // - ngl_per_device has 0 layers for each device, lower bound
+ // - try a "high" configuration where a device is given all unassigned layers
+ // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
+ // - check memory use of our guess, replace either the low or high bound
+ // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
+ // - the last device has the output layer, which cannot be a partial layer
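+    // worked example (hypothetical numbers): low bound 0 layers -> 1 GiB, high bound 32 layers -> 33 GiB,
+    // target 17 GiB: the first guess is 32 * (17 - 1) / (33 - 1) = 16 layers, after which either the low
+    // or the high bound is replaced and the interpolation repeats until the bounds are one layer apart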
+ if (hp_nex == 0) {
+ LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
+ } else {
+ LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
+ }
+ for (int id = nd - 1; id >= 0; id--) {
+ uint32_t n_unassigned = hp_ngl + 1;
+ for (size_t jd = id + 1; jd < nd; ++jd) {
+ assert(n_unassigned >= ngl_per_device[jd].n_layer);
+ n_unassigned -= ngl_per_device[jd].n_layer;
+ }
+
+ std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
+ ngl_per_device_high[id].n_layer = n_unassigned;
+ if (hp_nex > 0) {
+ ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
+ }
+ if (ngl_per_device_high[id].n_layer > 0) {
+ std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
+ if (mem_high[id] > targets[id]) {
+ assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
+ uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
+                LLAMA_LOG_DEBUG("%s: start filling device %d, delta=%" PRIu32 "\n", __func__, id, delta);
+ while (delta > 1) {
+ uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
+ step_size = std::max(step_size, uint32_t(1));
+ step_size = std::min(step_size, delta - 1);
+
+ std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
+ ngl_per_device_test[id].n_layer += step_size;
+ if (hp_nex) {
+ ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
+ step_size - 1 : step_size; // the first layer is the output layer which must always be full
+ }
+ const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+
+ if (mem_test[id] <= targets[id]) {
+ ngl_per_device = ngl_per_device_test;
+ mem = mem_test;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+ } else {
+ ngl_per_device_high = ngl_per_device_test;
+ mem_high = mem_test;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
+ }
+ delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
+ }
+ } else {
+ assert(ngl_per_device_high[id].n_layer == n_unassigned);
+ ngl_per_device = ngl_per_device_high;
+ mem = mem_high;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+ }
+ }
+
+ const int64_t projected_margin = dmds_full[id].free - mem[id];
+ LLAMA_LOG_INFO(
+ "%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
+ __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
+ }
+ if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
+ set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
+ return;
+ }
+
+    // step 4: for a MoE model where all dense tensors fit,
+    //     convert the dense-only layers at the back into full layers at the front until all devices are full
+    //     - essentially the same procedure as for the dense-only layers, except front-to-back
+    //     - also, try to fit at least part of one more layer to reduce waste on "small" GPUs with e.g. 24 GiB VRAM
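+    // e.g. (hypothetical, 2 devices): if device 1 ends up with 20 dense-only layers, converting 5 of them
+    // moves those 5 layers to device 0 as full layers (experts included), leaving 15 dense-only layers behind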
+
+ size_t id_dense_start = nd;
+ for (int id = nd - 1; id >= 0; id--) {
+ if (ngl_per_device[id].n_layer > 0) {
+ id_dense_start = id;
+ continue;
+ }
+ break;
+ }
+ assert(id_dense_start < nd);
+
+ LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
+ for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
+ std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
+ for (size_t jd = id_dense_start; jd < nd; jd++) {
+ const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
+ ngl_per_device_high[id].n_layer += n_layer_move;
+ ngl_per_device_high[jd].n_layer -= n_layer_move;
+ ngl_per_device_high[jd].n_part = 0;
+ }
+ size_t id_dense_start_high = nd - 1;
+ std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
+
+ if (mem_high[id] > targets[id]) {
+ assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+ uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
+ while (delta > 1) {
+ uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
+ step_size = std::max(step_size, uint32_t(1));
+ step_size = std::min(step_size, delta - 1);
+
+ std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
+ size_t id_dense_start_test = id_dense_start;
+ uint32_t n_converted_test = 0;
+ for (;id_dense_start_test < nd; id_dense_start_test++) {
+ const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
+ ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
+ ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
+ ngl_per_device_test[id].n_layer += n_convert_jd;
+ n_converted_test += n_convert_jd;
+
+ if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
+ break;
+ }
+ }
+ const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+
+ if (mem_test[id] <= targets[id]) {
+ ngl_per_device = ngl_per_device_test;
+ mem = mem_test;
+ id_dense_start = id_dense_start_test;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
+ __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+ } else {
+ ngl_per_device_high = ngl_per_device_test;
+ mem_high = mem_test;
+ id_dense_start_high = id_dense_start_test;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
+ __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
+ }
+ assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+ delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
+ }
+ } else {
+ ngl_per_device = ngl_per_device_high;
+ mem = mem_high;
+ id_dense_start = id_dense_start_high;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
+ __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+ }
+
+ // try to fit at least part of one more layer
+ if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
+ std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
+ size_t id_dense_start_test = id_dense_start;
+ ngl_per_device_test[id_dense_start_test].n_layer--;
+ ngl_per_device_test[id_dense_start_test].n_part--;
+ ngl_per_device_test[id].n_layer++;
+ ngl_per_device_test[id].n_part++;
+ if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
+ id_dense_start_test++;
+ }
+ ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
+ std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
+ if (id < nd - 1) {
+ overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
+ }
+ LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
+ std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
+ if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
+ ngl_per_device = ngl_per_device_test;
+ overflow_bufts = overflow_bufts_test;
+ mem = mem_test;
+ id_dense_start = id_dense_start_test;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
+ __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+
+ ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
+ LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
+ mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
+ if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
+ ngl_per_device = ngl_per_device_test;
+ overflow_bufts = overflow_bufts_test;
+ mem = mem_test;
+ id_dense_start = id_dense_start_test;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
+ __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+ }
+ } else {
+ ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
+ LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
+ mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
+ if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
+ ngl_per_device = ngl_per_device_test;
+ overflow_bufts = overflow_bufts_test;
+ mem = mem_test;
+ id_dense_start = id_dense_start_test;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
+ __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+ }
+ }
+ }
+
+ const int64_t projected_margin = dmds_full[id].free - mem[id];
+ LLAMA_LOG_INFO(
+ "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
+ __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
+ }
+
+    // print info for devices that were not changed during the conversion from dense-only to full layers:
+ for (size_t id = id_dense_start + 1; id < nd; id++) {
+ const int64_t projected_margin = dmds_full[id].free - mem[id];
+ LLAMA_LOG_INFO(
+ "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
+ __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
+ }
+
+ set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
+}
+
+enum llama_params_fit_status llama_params_fit(
+ const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
+ float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
+ size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+ const int64_t t0_us = llama_time_us();
+ llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
+ try {
+ llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
+ LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
+ } catch (const llama_params_fit_exception & e) {
+ LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
+ status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
+ } catch (const std::runtime_error & e) {
+ LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
+ status = LLAMA_PARAMS_FIT_STATUS_ERROR;
+ }
+ const int64_t t1_us = llama_time_us();
+ LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
+ return status;
+}
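+
+// example call (a sketch, not part of the library; the path, margins, and n_ctx_min below are illustrative):
+//   llama_model_params   mparams = llama_model_default_params();
+//   llama_context_params cparams = llama_context_default_params();
+//   std::vector<float> split(llama_max_devices());
+//   std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides());
+//   std::vector<size_t> margins(llama_max_devices(), 1024ull*1024*1024); // hypothetical 1 GiB margin per device
+//   if (llama_params_fit("model.gguf", &mparams, &cparams, split.data(), tbo.data(),
+//           margins.data(), /*n_ctx_min =*/ 4096, GGML_LOG_LEVEL_INFO) == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
+//       // mparams and cparams are now adjusted so that the model should fit into free device memory
+//   }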
+
+struct llama_sampler_chain_params llama_sampler_chain_default_params() {
+ struct llama_sampler_chain_params result = {
+ /*.no_perf =*/ true,
+ };
+
+ return result;
+}
+
+size_t llama_max_devices(void) {
+ return 16;
+}
+
+size_t llama_max_tensor_buft_overrides() {
+ return 4096;
+}
+
+bool llama_supports_mmap(void) {
+ return llama_mmap::SUPPORTED;
+}
+
+bool llama_supports_mlock(void) {
+ return llama_mlock::SUPPORTED;
+}
+
+bool llama_supports_gpu_offload(void) {
+ return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+ ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
+ llama_supports_rpc();
+}
+
+bool llama_supports_rpc(void) {
+ return ggml_backend_reg_by_name("RPC") != nullptr;
+}
+
+void llama_backend_init(void) {
+ ggml_time_init();
+
+ // needed to initialize f16 tables
+ {
+ struct ggml_init_params params = { 0, NULL, false };
+ struct ggml_context * ctx = ggml_init(params);
+ ggml_free(ctx);
+ }
+}
+
+void llama_numa_init(enum ggml_numa_strategy numa) {
+ if (numa != GGML_NUMA_STRATEGY_DISABLED) {
+ auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ GGML_ASSERT(dev && "CPU backend is not loaded");
+ auto * reg = ggml_backend_dev_backend_reg(dev);
+ auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
+ if (numa_init_fn) {
+ numa_init_fn(numa);
+ }
+ }
+}
+
+void llama_backend_free(void) {
+ ggml_quantize_free();
+}
+
+int64_t llama_time_us(void) {
+ return ggml_time_us();
+}
+
+// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
+static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
+ // loading time will be recalculated after the first eval, so
+ // we take page faults deferred by mmap() into consideration
+ model.t_load_us = 0;
+ time_meas tm(model.t_load_us);
+
+ model.t_start_us = tm.t_start_us;
+
+ try {
+ llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+
+ ml.print_info();
+
+ model.hparams.vocab_only = params.vocab_only;
+ model.hparams.no_alloc = params.no_alloc;
+
+ try {
+ model.load_arch(ml);
+ } catch(const std::exception & e) {
+ throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
+ }
+ try {
+ model.load_hparams(ml);
+ } catch(const std::exception & e) {
+ throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
+ }
+ if (model.arch == LLM_ARCH_CLIP) {
+ throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
+ }
+ try {
+ model.load_vocab(ml);
+ } catch(const std::exception & e) {
+ throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
+ }
+
+ model.load_stats(ml);
+ model.print_info();
+
+ if (params.vocab_only) {
+ LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
+ return 0;
+ }
+
+ if (!model.load_tensors(ml)) {
+ return -2;
+ }
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
+ return -1;
+ }
+
+ return 0;
+}
+
+static struct llama_model * llama_model_load_from_file_impl(
+ const std::string & path_model,
+ std::vector<std::string> & splits,
+ struct llama_model_params params) {
+ ggml_time_init();
+
+ if (!params.vocab_only && ggml_backend_reg_count() == 0) {
+ LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
+ return nullptr;
+ }
+
+ unsigned cur_percentage = 0;
+ if (params.progress_callback == NULL) {
+ params.progress_callback_user_data = &cur_percentage;
+ params.progress_callback = [](float progress, void * ctx) {
+ unsigned * cur_percentage_p = (unsigned *) ctx;
+ unsigned percentage = (unsigned) (100 * progress);
+ while (percentage > *cur_percentage_p) {
+ *cur_percentage_p = percentage;
+ LLAMA_LOG_CONT(".");
+ if (percentage >= 100) {
+ LLAMA_LOG_CONT("\n");
+ }
+ }
+ return true;
+ };
+ }
+
+ llama_model * model = new llama_model(params);
+
+ // create list of devices to use with this model
+ if (params.devices) {
+ for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
+ model->devices.push_back(*dev);
+ }
+ } else {
+ // default device selection
+
+ // build list of available devices
+ std::vector<ggml_backend_dev_t> gpus;
+ std::vector<ggml_backend_dev_t> igpus;
+ std::vector<ggml_backend_dev_t> rpc_servers;
+
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+ switch (ggml_backend_dev_type(dev)) {
+ case GGML_BACKEND_DEVICE_TYPE_CPU:
+ case GGML_BACKEND_DEVICE_TYPE_ACCEL:
+ // skip CPU backends since they are handled separately
+ break;
+
+ case GGML_BACKEND_DEVICE_TYPE_GPU: {
+ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+ if (ggml_backend_reg_name(reg) == std::string("RPC")) {
+ rpc_servers.push_back(dev);
+ } else {
+ // check if there is already a GPU with the same device id
+ ggml_backend_dev_props props;
+ ggml_backend_dev_get_props(dev, &props);
+ auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
+ ggml_backend_dev_props d_props;
+ ggml_backend_dev_get_props(d, &d_props);
+ if (props.device_id && d_props.device_id) {
+ return strcmp(props.device_id, d_props.device_id) == 0;
+ }
+ return false;
+ });
+
+ if (it != gpus.end()) {
+ LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
+ __func__,
+ ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+ props.device_id ? props.device_id : "unknown id",
+ ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
+ } else {
+ gpus.push_back(dev);
+ }
+ }
+ break;
+ }
+
+ case GGML_BACKEND_DEVICE_TYPE_IGPU:
+ igpus.push_back(dev);
+ break;
+ }
+ }
+
+ // add RPC servers at the front of the list to minimize network transfers
+ model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+ // add GPUs
+ model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
+
+ // add integrated GPUs only if no other devices were found
+ if (model->devices.empty()) {
+ model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
+ }
+ }
+
+ // if using single GPU mode, remove all except the main GPU
+ if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
+ if (params.main_gpu < 0) {
+ model->devices.clear();
+ } else {
+ if (params.main_gpu >= (int)model->devices.size()) {
+ LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
+ llama_model_free(model);
+ return nullptr;
+ }
+ ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
+ model->devices.clear();
+ model->devices.push_back(main_gpu);
+ }
+ }
+
+ for (auto * dev : model->devices) {
+ ggml_backend_dev_props props;
+ ggml_backend_dev_get_props(dev, &props);
+ LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
+ ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+ props.device_id ? props.device_id : "unknown id",
+ props.memory_free/1024/1024);
+ }
+
+ const int status = llama_model_load(path_model, splits, *model, params);
+ GGML_ASSERT(status <= 0);
+ if (status < 0) {
+ if (status == -1) {
+ LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+ } else if (status == -2) {
+ LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+ }
+
+ llama_model_free(model);
+ return nullptr;
+ }
+
+ return model;
+}
+
+// deprecated
+struct llama_model * llama_load_model_from_file(
+ const char * path_model,
+ struct llama_model_params params) {
+ return llama_model_load_from_file(path_model, params);
+}
+
+struct llama_model * llama_model_load_from_file(
+ const char * path_model,
+ struct llama_model_params params) {
+ std::vector<std::string> splits = {};
+ return llama_model_load_from_file_impl(path_model, splits, params);
+}
+
+struct llama_model * llama_model_load_from_splits(
+ const char ** paths,
+ size_t n_paths,
+ struct llama_model_params params) {
+ std::vector<std::string> splits;
+ if (n_paths == 0) {
+ LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
+ return nullptr;
+ }
+ splits.reserve(n_paths);
+ for (size_t i = 0; i < n_paths; ++i) {
+ splits.push_back(paths[i]);
+ }
+ return llama_model_load_from_file_impl(splits.front(), splits, params);
+}
+
+void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
+ llama_model_saver ms(*model);
+ ms.add_kv_from_model();
+ ms.add_tensors_from_model();
+ ms.save(path_model);
+}
+
+//
+// chat templates
+//
+
+int32_t llama_chat_apply_template(
+ const char * tmpl,
+ const struct llama_chat_message * chat,
+ size_t n_msg,
+ bool add_ass,
+ char * buf,
+ int32_t length) {
+ const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);
+
+ // format the chat to string
+ std::vector<const llama_chat_message *> chat_vec;
+ chat_vec.resize(n_msg);
+ for (size_t i = 0; i < n_msg; i++) {
+ chat_vec[i] = &chat[i];
+ }
+
+ std::string formatted_chat;
+ llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl);
+ if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
+ return -1;
+ }
+ int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass);
+ if (res < 0) {
+ return res;
+ }
+ if (buf && length > 0) {
+ strncpy(buf, formatted_chat.c_str(), length);
+ }
+ return res;
+}
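+
+// example use (a sketch; the messages and buffer size are illustrative):
+//   llama_chat_message msgs[] = {{"system", "You are helpful."}, {"user", "Hi!"}};
+//   std::vector<char> buf(4096);
+//   const int32_t n = llama_chat_apply_template("chatml", msgs, 2, true, buf.data(), buf.size());
+//   // n == -1 for an unknown template; if n >= (int32_t) buf.size() the result was truncated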
+
+//
+// model split
+//
+
+int32_t llama_split_path(
+ char * split_path,
+ size_t maxlen,
+ const char * path_prefix,
+ int32_t split_no,
+ int32_t split_count) {
+
+ static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
+
+ const int written = snprintf(
+ split_path,
+ maxlen,
+ SPLIT_PATH_FORMAT,
+ path_prefix,
+ split_no + 1,
+ split_count
+ );
+
+ if (written < 0 || (size_t) written >= maxlen) {
+ return 0;
+ }
+
+ return (int32_t) written;
+}
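+
+// e.g. llama_split_path(buf, sizeof(buf), "/models/qwen", 1, 4) writes "/models/qwen-00002-of-00004.gguf"
+// (split_no is 0-based; returns the number of characters written, or 0 if the buffer is too small)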
+
+int32_t llama_split_prefix(
+ char * split_prefix,
+ size_t maxlen,
+ const char * split_path,
+ int32_t split_no,
+ int32_t split_count) {
+
+ const std::string str_split_path(split_path);
+
+ char postfix[32];
+ snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count);
+
+ const std::string str_postfix(postfix);
+ if (str_split_path.size() <= str_postfix.size()) {
+ return 0;
+ }
+
+ const size_t size_prefix = str_split_path.size() - str_postfix.size();
+
+ if (str_split_path.compare(size_prefix, std::string::npos, str_postfix) == 0) {
+ const size_t copy_len = std::min(size_prefix + 1, maxlen);
+ snprintf(split_prefix, copy_len, "%s", split_path);
+
+ return (int32_t) size_prefix;
+ }
+
+ return 0;
+}
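+
+// e.g. for split_path "/models/qwen-00002-of-00004.gguf" with split_no 1 and split_count 4 this writes
+// the prefix "/models/qwen" and returns its length; a path that does not match the pattern returns 0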
+
+const char * llama_print_system_info(void) {
+ static std::string s;
+ s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
+
+ for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+ auto * reg = ggml_backend_reg_get(i);
+ auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+ if (get_features_fn) {
+ ggml_backend_feature * features = get_features_fn(reg);
+ s += ggml_backend_reg_name(reg);
+ s += " : ";
+ for (; features->name; features++) {
+ s += features->name;
+ s += " = ";
+ s += features->value;
+ s += " | ";
+ }
+ }
+ }
+
+ return s.c_str();
+}
+
diff --git a/llama.cpp/src/models/afmoe.cpp b/llama.cpp/src/models/afmoe.cpp
new file mode 100644
index 0000000..6a752a4
--- /dev/null
+++ b/llama.cpp/src/models/afmoe.cpp
@@ -0,0 +1,191 @@
+#include "models.h"
+
+llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // MuP scaling: embeddings * sqrt(hidden_size)
+ // mup_enabled = true, hidden_size = 1024, scale = 32.0
+ inpL = ggml_scale(ctx0, inpL, sqrtf(float(n_embd)));
+ cb(inpL, "inp_embd_scaled", -1);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+ auto * inp_attn = build_attn_inp_kv_iswa();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+ for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ ggml_tensor * inpSA = inpL;
+
+ // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
+ const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+ (il + 1) % hparams.n_no_rope_layer_step != 0;
+
+ // dual attention normalization (pre)
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * attn_inp = cur; // save input for gate computation
+
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ // compute gate from input
+ ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, attn_inp);
+ cb(gate, "attn_gate_proj", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+ // Q/K normalization
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+ cb(Kcur, "Kcur_normed", il);
+
+ if (use_rope) {
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Qcur, "Qcur_rope", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Kcur, "Kcur_rope", il);
+ }
+
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ cur = build_attn(inp_attn,
+ NULL, NULL, // wo will be applied after gating
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+
+ // attention gating: attn_out * sigmoid(gate) BEFORE o_proj
+ gate = ggml_sigmoid(ctx0, gate);
+ cb(gate, "attn_gate_sig", il);
+ cur = ggml_mul(ctx0, cur, gate);
+ cb(cur, "attn_gated", il);
+
+ // now apply output projection
+ cur = build_lora_mm(model.layers[il].wo, cur);
+ cb(cur, "attn_o_proj", il);
+ }
+
+ // dual attention normalization (post)
+ cur = build_norm(cur,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // dual ffn normalization (pre)
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // MoE or dense FFN
+ if ((uint32_t)il >= hparams.n_layer_dense_lead) {
+ // MoE layer with sigmoid routing, normalization, and scaling
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU,
+ hparams.expert_weights_norm, // norm_w (route_norm=True)
+ hparams.expert_weights_scale, // scale_w
+ hparams.expert_weights_scale, // w_scale (route_scale=2.826)
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // shared expert
+ if (hparams.n_expert_shared > 0) {
+ ggml_tensor * ffn_shexp = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ } else {
+ cur = moe_out;
+ }
+ } else {
+ // dense layer
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ // dual ffn normalization (post)
+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_post_norm", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+ cb(cur, "result_norm", -1);
+
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/apertus.cpp b/llama.cpp/src/models/apertus.cpp
new file mode 100644
index 0000000..9af19c1
--- /dev/null
+++ b/llama.cpp/src/models/apertus.cpp
@@ -0,0 +1,125 @@
+#include "models.h"
+
+llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ ggml_tensor * inp_pos = build_inp_pos();
+ auto * inp_attn = build_attn_inp_kv();
+
+ const float kq_scale =
+ hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur_pos", il);
+ cb(Kcur, "Kcur_pos", il);
+ cb(Vcur, "Vcur_pos", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network with xIELU activation
+ {
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // Up projection
+ ggml_tensor * up = build_lora_mm(model.layers[il].ffn_up, cur);
+ cb(up, "ffn_up", il);
+
+ float alpha_n_val = hparams.xielu_alpha_n[il];
+ float alpha_p_val = hparams.xielu_alpha_p[il];
+ float beta_val = hparams.xielu_beta[il];
+ float eps_val = hparams.xielu_eps[il];
+
+ // Apply xIELU activation
+ ggml_tensor * activated = ggml_xielu(ctx0, up, alpha_n_val, alpha_p_val, beta_val, eps_val);
+ cb(activated, "ffn_xielu", il);
+
+ // Down projection
+ cur = build_lora_mm(model.layers[il].ffn_down, activated);
+ cb(cur, "ffn_down", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/arcee.cpp b/llama.cpp/src/models/arcee.cpp
new file mode 100644
index 0000000..aa6167d
--- /dev/null
+++ b/llama.cpp/src/models/arcee.cpp
@@ -0,0 +1,135 @@
+#include "models.h"
+
+
+llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ // ARCEE uses relu^2 instead of silu
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/arctic.cpp b/llama.cpp/src/models/arctic.cpp
new file mode 100644
index 0000000..e8f028a
--- /dev/null
+++ b/llama.cpp/src/models/arctic.cpp
@@ -0,0 +1,138 @@
+#include "models.h"
+
+
+llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
+ cb(ffn_out, "ffn_out", il);
+
+ // MoE
+ cur = build_norm(inpSA,
+ model.layers[il].ffn_norm_exps, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm_exps", il);
+
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_out);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/arwkv7.cpp b/llama.cpp/src/models/arwkv7.cpp
new file mode 100644
index 0000000..107a3be
--- /dev/null
+++ b/llama.cpp/src/models/arwkv7.cpp
@@ -0,0 +1,86 @@
+#include "models.h"
+
+
+llm_build_arwkv7::llm_build_arwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
+ GGML_ASSERT(n_embd == hparams.n_embd_r());
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+ ggml_tensor * v_first = nullptr;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ auto * rs_inp = build_rs_inp();
+
+ const auto n_embd = hparams.n_embd;
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
+ const auto n_seqs = ubatch.n_seqs;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const llama_layer * layer = &model.layers[il];
+ inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
+
+ ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
+ cb(att_norm, "attn_norm", il);
+
+ ggml_tensor * x_prev = ggml_concat(
+ ctx0,
+ token_shift,
+ ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
+ 1
+ );
+
+ cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
+
+ token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
+ ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+ }
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+ cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/baichuan.cpp b/llama.cpp/src/models/baichuan.cpp
new file mode 100644
index 0000000..c04b0c9
--- /dev/null
+++ b/llama.cpp/src/models/baichuan.cpp
@@ -0,0 +1,122 @@
+#include "models.h"
+
+
+llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ switch (model.type) {
+ case LLM_TYPE_7B:
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ break;
+ case LLM_TYPE_13B:
+ break;
+ default:
+ GGML_ABORT("fatal error");
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/bailingmoe.cpp b/llama.cpp/src/models/bailingmoe.cpp
new file mode 100644
index 0000000..ed56b9c
--- /dev/null
+++ b/llama.cpp/src/models/bailingmoe.cpp
@@ -0,0 +1,144 @@
+#include "models.h"
+
+
+llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ false, hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // FFN shared expert
+ {
+ ggml_tensor * ffn_shexp = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/bailingmoe2.cpp b/llama.cpp/src/models/bailingmoe2.cpp
new file mode 100644
index 0000000..fbf7b21
--- /dev/null
+++ b/llama.cpp/src/models/bailingmoe2.cpp
@@ -0,0 +1,139 @@
+#include "models.h"
+
+
+
+llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
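+    // the trailing hparams.nextn_predict_layers layers are NextN/MTP
+    // (multi-token prediction) layers and are skipped in the main graph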
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
+ for (int il = 0; il < n_transformer_layers; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
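+            // fused QKV rows are laid out as [Q: n_embd | K: n_embd_gqa | V: n_embd_gqa];
+            // K and V are strided views at float offsets n_embd and n_embd + n_embd_gqa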
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 0 * sizeof(float) * (n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 1 * sizeof(float) * (n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * sa_out = ggml_add(ctx0, cur, inpSA);
+ cb(sa_out, "sa_out", il);
+
+ // MoE branch
+ cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ true, hparams.expert_weights_scale,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+ }
+
+ cur = ggml_add(ctx0, cur, sa_out);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/bert.cpp b/llama.cpp/src/models/bert.cpp
new file mode 100644
index 0000000..bca0e25
--- /dev/null
+++ b/llama.cpp/src/models/bert.cpp
@@ -0,0 +1,181 @@
+#include "models.h"
+
+
+
+llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+ ggml_tensor * inp_pos = nullptr;
+
+ if (model.arch != LLM_ARCH_JINA_BERT_V2) {
+ inp_pos = build_inp_pos();
+ }
+
+ // construct input embeddings (token, type, position)
+ inpL = build_inp_embd(model.tok_embd);
+
+ // token types are hardcoded to zero ("Sentence A")
+ if (model.type_embd) {
+ ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
+ inpL = ggml_add(ctx0, inpL, type_row0);
+ }
+ if (model.arch == LLM_ARCH_BERT) {
+ inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+ }
+ cb(inpL, "inp_embd", -1);
+
+ // embed layer norm
+ inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
+ cb(inpL, "inp_norm", -1);
+
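+    // encoder-style bidirectional attention - no KV cache is needed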
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * cur = inpL;
+
+ {
+ ggml_tensor * Qcur;
+ ggml_tensor * Kcur;
+ ggml_tensor * Vcur;
+
+ // self-attention
+ if (model.layers[il].wqkv) {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ if (model.layers[il].bqkv) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1],
+ 0 * sizeof(float) * (n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 1 * sizeof(float) * (n_embd));
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
+ } else {
+ Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
+ Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
+ Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ }
+
+ if (model.layers[il].attn_q_norm) {
+ Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head * n_head, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, LLM_NORM, il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ }
+
+ if (model.layers[il].attn_k_norm) {
+ Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head * n_head_kv, n_tokens);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, LLM_NORM, il);
+
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ }
+
+ // RoPE
+ if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
+ model.arch == LLM_ARCH_JINA_BERT_V3) {
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ cb(cur, "kqv_out", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // re-add the layer input
+ cur = ggml_add(ctx0, cur, inpL);
+
+ // attention layer norm
+ cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il);
+
+ if (model.layers[il].attn_norm_2 != nullptr) {
+ cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+ cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il);
+ }
+
+ ggml_tensor * ffn_inp = cur;
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
+ // MoE branch
+ cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, nullptr,
+ model.layers[il].ffn_down_exps, nullptr, hparams.n_expert, hparams.n_expert_used,
+ LLM_FFN_GELU, false, false, 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+ cb(cur, "ffn_moe_out", il);
+ } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
+ model.arch == LLM_ARCH_JINA_BERT_V3) {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+ } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
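+            // some jina-bert-v2 checkpoints fuse the gate projection into ffn_up;
+            // detect this via ffn_up's output dim differing from n_ff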
+ const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
+ auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
+ type_op, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+
+        // residual: the attention output bypasses the intermediate (FFN) layer
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ // output layer norm
+ cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cb(cur, "result_embd", -1);
+ res->t_embd = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/bitnet.cpp b/llama.cpp/src/models/bitnet.cpp
new file mode 100644
index 0000000..331a3f1
--- /dev/null
+++ b/llama.cpp/src/models/bitnet.cpp
@@ -0,0 +1,162 @@
+#include "models.h"
+
+
+llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
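+            // bitnet stores low-bit weights with separate per-tensor scale factors
+            // (wq_scale, wk_scale, ...) that are applied after each matmul when present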
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ if (model.layers[il].wq_scale) {
+ Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+ }
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ // B1.K
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ if (model.layers[il].wk_scale) {
+ Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+ }
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ // B1.V
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ if (model.layers[il].wv_scale) {
+ Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+ }
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ NULL, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+
+ cur = build_norm(cur,
+ model.layers[il].attn_sub_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_sub_norm", il);
+
+ cur = build_lora_mm(model.layers[il].wo, cur);
+ if (model.layers[il].wo_scale) {
+ cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+ }
+ if (model.layers[il].bo) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
+ }
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
+ model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
+ NULL, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_sub_out", il);
+
+ cur = build_norm(cur,
+ model.layers[il].ffn_sub_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_sub_norm", il);
+
+ cur = build_lora_mm(model.layers[il].ffn_down, cur);
+ if (model.layers[il].ffn_down_scale) {
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+ }
+ cb(cur, "ffn_down", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ // FIXME: do not use model.tok_embd directly, duplicate as model.output
+ cur = build_lora_mm(model.tok_embd, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/bloom.cpp b/llama.cpp/src/models/bloom.cpp
new file mode 100644
index 0000000..2c552d1
--- /dev/null
+++ b/llama.cpp/src/models/bloom.cpp
@@ -0,0 +1,103 @@
+#include "models.h"
+
+llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
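+    // bloom encodes positions via ALiBi biases inside the attention,
+    // so no positional input (inp_pos) or RoPE is built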
+ auto * inp_attn = build_attn_inp_kv();
+
+ inpL = build_norm(inpL,
+ model.tok_norm,
+ model.tok_norm_b,
+ LLM_NORM, -1);
+ cb(inpL, "inp_norm", -1);
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // Add the input
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = build_norm(inpL,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/chameleon.cpp b/llama.cpp/src/models/chameleon.cpp
new file mode 100644
index 0000000..184511a
--- /dev/null
+++ b/llama.cpp/src/models/chameleon.cpp
@@ -0,0 +1,180 @@
+#include "models.h"
+
+#include <float.h>
+
+llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
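+        // swin_norm selects post-normalization: the norms run after the
+        // attention and FFN blocks instead of before them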
+ // norm
+ if (hparams.swin_norm) {
+ cur = inpL;
+ } else {
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+ }
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ if (model.layers[il].attn_q_norm) {
+ Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+ ggml_element_size(Qcur) * n_embd_head,
+ ggml_element_size(Qcur) * n_embd_head * n_head,
+ 0);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = build_norm(Qcur,
+ model.layers[il].attn_q_norm,
+ model.layers[il].attn_q_norm_b,
+ LLM_NORM, il);
+ cb(Qcur, "Qcur", il);
+ }
+
+ if (model.layers[il].attn_k_norm) {
+ Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+ ggml_element_size(Kcur) * n_embd_head,
+ ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+ 0);
+ cb(Kcur, "Kcur", il);
+
+ Kcur = build_norm(Kcur,
+ model.layers[il].attn_k_norm,
+ model.layers[il].attn_k_norm_b,
+ LLM_NORM, il);
+ cb(Kcur, "Kcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, nullptr,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ if (hparams.swin_norm) {
+ cur = build_norm(cur,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ if (!hparams.swin_norm) {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+ }
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ if (hparams.swin_norm) {
+ cur = build_norm(cur,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output_with_img_logits", -1);
+
+ // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
+ // Needs to be removed once image outputs are supported.
+ int img_token_end_idx = 8196;
+ int img_token_start_idx = 4;
+ int num_img_tokens = img_token_end_idx - img_token_start_idx;
+ // creates 1d tensor of size num_img_tokens and values -FLT_MAX,
+ // which ensures that text token values are always at least larger than image token values
+ ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
+ img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
+ cb(img_logits, "img_logits", -1);
+
+ cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/chatglm.cpp b/llama.cpp/src/models/chatglm.cpp
new file mode 100644
index 0000000..2685d4f
--- /dev/null
+++ b/llama.cpp/src/models/chatglm.cpp
@@ -0,0 +1,134 @@
+#include "models.h"
+
+
+llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
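+            // the checkpoint provides either a fused wqkv tensor or separate
+            // wq/wk/wv projections - handle both layouts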
+ ggml_tensor * Qcur = nullptr;
+ ggml_tensor * Kcur = nullptr;
+ ggml_tensor * Vcur = nullptr;
+
+ if (model.layers[il].wqkv == nullptr) {
+ Qcur = build_lora_mm(model.layers[il].wq, cur);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ }
+ Kcur = build_lora_mm(model.layers[il].wk, cur);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ }
+ Vcur = build_lora_mm(model.layers[il].wv, cur);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ } else {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+ if (model.layers[il].bqkv) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ }
+
+ //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // Add the input
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ }
+
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
+ }
+
+ cur = build_norm(inpL,
+ model.output_norm,
+ NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/codeshell.cpp b/llama.cpp/src/models/codeshell.cpp
new file mode 100644
index 0000000..0b3bdbf
--- /dev/null
+++ b/llama.cpp/src/models/codeshell.cpp
@@ -0,0 +1,111 @@
+#include "models.h"
+
+llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // add the input
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = build_norm(inpL,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/cogvlm.cpp b/llama.cpp/src/models/cogvlm.cpp
new file mode 100644
index 0000000..0ceae3a
--- /dev/null
+++ b/llama.cpp/src/models/cogvlm.cpp
@@ -0,0 +1,97 @@
+#include "models.h"
+
+llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * inpL;
+ ggml_tensor * cur;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+    // check ubatch to see if we have input tokens (text)
+    // or an input embedding vector (image)
+    const bool is_text = ubatch.token != nullptr;
+
+ for (int il = 0; il < n_layer; ++il) {
+ // get either the text or image weight tensors
+ ggml_tensor *wqkv, *wo;
+ ggml_tensor *ffn_gate, *ffn_down, *ffn_up;
+
+ if (is_text) {
+ wqkv = model.layers[il].wqkv;
+ wo = model.layers[il].wo;
+ ffn_gate = model.layers[il].ffn_gate;
+ ffn_down = model.layers[il].ffn_down;
+ ffn_up = model.layers[il].ffn_up;
+ } else {
+ wqkv = model.layers[il].visexp_attn_wqkv;
+ wo = model.layers[il].visexp_attn_wo;
+ ffn_gate = model.layers[il].visexp_ffn_gate;
+ ffn_down = model.layers[il].visexp_ffn_down;
+ ffn_up = model.layers[il].visexp_ffn_up;
+ }
+
+ ggml_tensor * inpSA = inpL;
+ cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+
+ // build self attention
+ {
+ ggml_tensor * qkv = build_lora_mm(wqkv, cur);
+
+ // split qkv into Q, K, V along the first dimension
+ ggml_tensor * Qcur =
+ ggml_view_3d(ctx0, qkv, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), qkv->nb[1], 0);
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+ qkv->nb[1], n_embd * ggml_element_size(qkv));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+ qkv->nb[1], 2 * n_embd * ggml_element_size(qkv));
+
+ Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, rope_type);
+ Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, rope_type);
+
+ cur = build_attn(inp_attn,
+ wo, nullptr,
+ Qcur, Kcur, Vcur,
+ nullptr, nullptr, nullptr,
+ kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ ffn_up, NULL, NULL,
+ ffn_gate, NULL, NULL,
+ ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/cohere2-iswa.cpp b/llama.cpp/src/models/cohere2-iswa.cpp
new file mode 100644
index 0000000..9334b5e
--- /dev/null
+++ b/llama.cpp/src/models/cohere2-iswa.cpp
@@ -0,0 +1,138 @@
+#include "models.h"
+
+llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ const float f_logit_scale = hparams.f_logit_scale;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_iswa();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const bool is_swa = hparams.is_swa(il);
+ // UNUSED:
+ // const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ // const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+ ggml_tensor * ffn_inp = cur;
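+        // attention and FFN both read the same normalized input and are
+        // added to the residual together below (parallel block)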
+
+ // self-attention
+ {
+ // rope freq factors for 128k context
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
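+            // only the sliding-window layers apply RoPE; the global-attention
+            // layers carry no positional encoding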
+ if (is_swa) {
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+ }
+
+ ggml_tensor * attn_out = cur;
+
+ // feed-forward network
+ {
+ cur = build_ffn(ffn_inp,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ // add together residual + FFN + self-attention
+ cur = ggml_add(ctx0, cur, inpL);
+ cur = ggml_add(ctx0, cur, attn_out);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ if (f_logit_scale) {
+ cur = ggml_scale(ctx0, cur, f_logit_scale);
+ }
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/command-r.cpp b/llama.cpp/src/models/command-r.cpp
new file mode 100644
index 0000000..4d3b643
--- /dev/null
+++ b/llama.cpp/src/models/command-r.cpp
@@ -0,0 +1,122 @@
+#include "models.h"
+
+
+
+llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ const float f_logit_scale = hparams.f_logit_scale;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ ggml_tensor * ffn_inp = cur;
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ if (model.layers[il].attn_q_norm) {
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM, il);
+ cb(Qcur, "Qcur", il);
+ }
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ if (model.layers[il].attn_k_norm) {
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM, il);
+ cb(Kcur, "Kcur", il);
+ }
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+ }
+ ggml_tensor * attn_out = cur;
+
+ // feed-forward network
+ {
+ cur = build_ffn(ffn_inp,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ // add together residual + FFN + self-attention
+ cur = ggml_add(ctx0, cur, inpL);
+ cur = ggml_add(ctx0, cur, attn_out);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ if (f_logit_scale) {
+ cur = ggml_scale(ctx0, cur, f_logit_scale);
+ }
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/dbrx.cpp b/llama.cpp/src/models/dbrx.cpp
new file mode 100644
index 0000000..6d2a0eb
--- /dev/null
+++ b/llama.cpp/src/models/dbrx.cpp
@@ -0,0 +1,124 @@
+#include "models.h"
+
+
+llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = nullptr;
+ ggml_tensor * Kcur = nullptr;
+ ggml_tensor * Vcur = nullptr;
+
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
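+            // clamp the fused QKV activations to [-f_clamp_kqv, f_clamp_kqv] for stability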
+ cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+ cb(cur, "wqkv_clamped", il);
+
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].attn_out_norm, NULL,
+ LLM_NORM, il);
+ cb(cur, "attn_out_norm", il);
+
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/deci.cpp b/llama.cpp/src/models/deci.cpp
new file mode 100644
index 0000000..7410a3a
--- /dev/null
+++ b/llama.cpp/src/models/deci.cpp
@@ -0,0 +1,137 @@
+#include "models.h"
+
+
+
+llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ const float kq_scale =
+ hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+ const int64_t n_head_kv = hparams.n_head_kv(il);
+ const int64_t n_head = hparams.n_head(il);
+ const int64_t n_ff = hparams.n_ff(il);
+
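+        // NAS-pruned layers: n_head == 0 marks an attention-free layer and
+        // n_ff == 0 an FFN-free one, so head/FFN sizes are read per layer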
+ if (n_head == 0) {
+ // attention-free layer of Llama-3_1-Nemotron-51B
+ cur = inpL;
+ } else {
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+ }
+ if (n_head > 0 && n_head_kv == 0) {
+ // "linear attention" of Llama-3_1-Nemotron-51B
+ cur = build_lora_mm(model.layers[il].wo, cur);
+ cb(cur, "wo", il);
+ } else if (n_head > 0) {
+ // self-attention
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
+ if (n_ff == 0) {
+ continue;
+ }
+ // modified to support attention-free layer of Llama-3_1-Nemotron-51B
+ ggml_tensor * ffn_inp = cur;
+ if (n_head > 0) {
+ ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+ }
+ // feed-forward network
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/deepseek.cpp b/llama.cpp/src/models/deepseek.cpp
new file mode 100644
index 0000000..17866c0
--- /dev/null
+++ b/llama.cpp/src/models/deepseek.cpp
@@ -0,0 +1,145 @@
+#include "models.h"
+
+
+
+llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
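+    // a non-zero hparams.f_attention_scale overrides the default 1/sqrt(head_dim) scaling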
+ const float kq_scale =
+ hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ false, hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // FFN shared expert
+ {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/deepseek2.cpp b/llama.cpp/src/models/deepseek2.cpp
new file mode 100644
index 0000000..987f449
--- /dev/null
+++ b/llama.cpp/src/models/deepseek2.cpp
@@ -0,0 +1,259 @@
+#include "models.h"
+
+llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const bool is_mla = hparams.is_mla();
+
+ // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+ const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
+ const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
+
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
+
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+ // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
+ // See https://github.com/ggml-org/llama.cpp/discussions/7416 for detailed explanation.
+ // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+
+ // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
+ GGML_ASSERT(ext_factor >= 0.0f);
+ const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+ // use the original attn_factor to pre-scale the kq_scale
+ const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+ const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ // {n_embd, n_tokens}
+ inpL = build_inp_embd(model.tok_embd);
+
+ // (optional) temperature tuning - used by mistral-large
+ ggml_tensor * inp_attn_scale = nullptr;
+ if (hparams.f_attn_temp_scale != 0.0f) {
+ inp_attn_scale = build_inp_attn_scale();
+ }
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn_kv = !is_mla ? build_attn_inp_kv() : nullptr;
+ auto * inp_attn_k = is_mla ? build_attn_inp_k() : nullptr;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ ggml_tensor * q = NULL;
+
+ const bool is_lite = model.layers[il].wq;
+
+ if (!is_lite) {
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+ cb(q, "q", il);
+
+ q = build_norm(q, model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, il);
+ cb(q, "q", il);
+
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+ cb(q, "q", il);
+ } else {
+ q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(q, "q", il);
+ }
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
+ ggml_tensor * q_nope =
+ ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
+ ggml_row_size(q->type, n_embd_head_k) * n_head, 0);
+ cb(q_nope, "q_nope", il);
+
+ // and {n_embd_head_qk_rope, n_head, n_tokens}
+ ggml_tensor * q_pe = ggml_view_3d(
+ ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
+ ggml_row_size(q->type, n_embd_head_k) * n_head, ggml_row_size(q->type, n_embd_head_qk_nope));
+ cb(q_pe, "q_pe", il);
+
+ ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+ cb(kv_cmpr_pe, "kv_cmpr_pe", il);
+
+ // split into {kv_lora_rank, n_tokens}
+ ggml_tensor * kv_cmpr =
+ ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens,
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0);
+ cb(kv_cmpr, "kv_cmpr", il);
+
+ // and {n_embd_head_qk_rope, 1, n_tokens}
+ ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens,
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
+ cb(k_pe, "k_pe", il);
+
+ q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(q_pe, "q_pe", il);
+
+ k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(k_pe, "k_pe", il);
+
+ kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il);
+ cb(kv_cmpr, "kv_cmpr", il);
+
+ if (is_mla) {
+ // {n_embd_head_qk_nope, n_tokens, n_head}
+ q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
+ cb(q_nope, "q_nope_perm", il);
+
+ // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
+ ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
+ cb(q_nope_absorbed, "q_nope_absorbed", il);
+
+ // {kv_lora_rank, n_head, n_tokens}
+ q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
+ cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
+
+ // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
+ // note: rope must go first for in-place context shifting in build_rope_shift()
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0);
+ cb(Qcur, "Qcur", il);
+
+ kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
+ cb(kv_cmpr, "kv_cmpr_reshape", il);
+
+ // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
+ ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0);
+ cb(Kcur, "Kcur", il);
+
+ // {kv_lora_rank, 1, n_tokens}
+ ggml_tensor * Vcur = kv_cmpr;
+ cb(Vcur, "Vcur", il);
+
+ if (inp_attn_scale) {
+ // apply llama 4 temperature scaling
+ Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+ cb(Qcur, "Qcur_attn_temp_scaled", il);
+ }
+
+                // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
+ cur = build_attn(inp_attn_k,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
+ } else {
+ ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
+ cb(kv, "kv", il);
+
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
+ ggml_tensor * k_nope =
+ ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head, 0);
+ cb(k_nope, "k_nope_view", il);
+
+ // and {n_embd_head_v, n_head, n_tokens}
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
+ ggml_row_size(kv->type, n_embd_head_qk_nope));
+ cb(Vcur, "Vcur_view", il);
+
+ Vcur = ggml_cont(ctx0, Vcur);
+ cb(Vcur, "Vcur_cont", il);
+
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_nope, q_pe, 0);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+ cb(Kcur, "Kcur", il);
+
+ if (inp_attn_scale) {
+ // apply llama 4 temperature scaling
+ Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+ cb(Qcur, "Qcur_attn_temp_scaled", il);
+ }
+
+ // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
+ cur = build_attn(inp_attn_kv,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ }
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+                    true, hparams.expert_weights_scale,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
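+            // note: per the arguments above, the routed-expert weights are optionally
+            // normalized (hparams.expert_weights_norm) and scaled by
+            // hparams.expert_weights_scale before the shared expert below is added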
+
+ // FFN shared expert
+ {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/dots1.cpp b/llama.cpp/src/models/dots1.cpp
new file mode 100644
index 0000000..09c36f8
--- /dev/null
+++ b/llama.cpp/src/models/dots1.cpp
@@ -0,0 +1,134 @@
+#include "models.h"
+
+
+
+llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+        // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
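+        // layers [0, n_layer_dense_lead) use a dense FFN; all later layers take the MoE branch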
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ true, hparams.expert_weights_scale,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/dream.cpp b/llama.cpp/src/models/dream.cpp
new file mode 100644
index 0000000..2aafbae
--- /dev/null
+++ b/llama.cpp/src/models/dream.cpp
@@ -0,0 +1,105 @@
+#include "models.h"
+
+
+
+llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+    // copied from qwen2
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_no_cache();
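+
+    // note: dream is a diffusion-style LM, so attention is non-causal and runs
+    // without a KV cache - hence build_attn_inp_no_cache() above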
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/ernie4-5-moe.cpp b/llama.cpp/src/models/ernie4-5-moe.cpp
new file mode 100644
index 0000000..0d96d14
--- /dev/null
+++ b/llama.cpp/src/models/ernie4-5-moe.cpp
@@ -0,0 +1,150 @@
+#include "models.h"
+
+
+
+llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Ernie 4.5 MoE requires n_moe_layer_step > 0");
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+ // norm
+ {
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+ }
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ cb(cur, "attn_out", il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ bool is_moe_layer =
+ static_cast<uint32_t>(il) >= hparams.n_layer_dense_lead && (il + 1) % hparams.n_moe_layer_step == 0;
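+        // e.g. with hypothetical values n_layer_dense_lead = 1 and n_moe_layer_step = 2,
+        // layers 1, 3, 5, ... take the MoE branch while all other layers stay dense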
+
+ if (!is_moe_layer) {
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // Shared expert (if present)
+ if (hparams.n_ff_shexp > 0) {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ } else {
+ cur = moe_out;
+ }
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/ernie4-5.cpp b/llama.cpp/src/models/ernie4-5.cpp
new file mode 100644
index 0000000..99aead5
--- /dev/null
+++ b/llama.cpp/src/models/ernie4-5.cpp
@@ -0,0 +1,110 @@
+#include "models.h"
+
+llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ {
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+ }
+ // self-attention
+ {
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+        if (il == n_layer - 1 && inp_out_ids) {
+ // skip computing output for unused tokens
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/exaone-moe.cpp b/llama.cpp/src/models/exaone-moe.cpp
new file mode 100644
index 0000000..bef5b2a
--- /dev/null
+++ b/llama.cpp/src/models/exaone-moe.cpp
@@ -0,0 +1,146 @@
+#include "models.h"
+
+
+llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_k;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn_iswa = build_attn_inp_kv_iswa();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
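+    // note: any trailing NextN / multi-token-prediction layers are excluded from
+    // the main autoregressive graph built below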
+ for (int il = 0; il < n_transformer_layers; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // use RoPE for SWA layers
+ const bool is_local_layer = hparams.is_swa(il);
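+        // (global, non-SWA layers skip RoPE entirely, i.e. they are NoPE layers)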
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+ cb(Kcur, "Kcur_normed", il);
+
+ if (is_local_layer) {
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
+ freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
+ freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+ }
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn_iswa,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ cb(cur, "attn_out", il);
+ }
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // norm
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+ // dense branch
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL, NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ true, hparams.expert_weights_scale,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // FFN shared expert
+ {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ // final norm
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/exaone.cpp b/llama.cpp/src/models/exaone.cpp
new file mode 100644
index 0000000..62602b2
--- /dev/null
+++ b/llama.cpp/src/models/exaone.cpp
@@ -0,0 +1,114 @@
+#include "models.h"
+
+
+
+llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+            // rope freq factors (may be nullptr for models that do not use them)
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/exaone4.cpp b/llama.cpp/src/models/exaone4.cpp
new file mode 100644
index 0000000..8b7e3dc
--- /dev/null
+++ b/llama.cpp/src/models/exaone4.cpp
@@ -0,0 +1,123 @@
+#include "models.h"
+
+
+template <bool iswa>
+llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_k;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+ inp_attn_type * inp_attn = nullptr;
+
+ if constexpr (iswa) {
+ inp_attn = build_attn_inp_kv_iswa();
+ } else {
+ inp_attn = build_attn_inp_kv();
+ }
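+
+    // note: the iswa template parameter picks the attention-input type at compile
+    // time, so the SWA and non-SWA variants share this single graph builder
+    // (see the explicit instantiations at the bottom of this file)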
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // use RoPE for SWA layers or non-SWA models
+ const bool use_rope = hparams.is_swa(il) || hparams.swa_type == LLAMA_SWA_TYPE_NONE;
+
+ cur = inpL;
+
+ // self-attention
+ {
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+ cb(Kcur, "Kcur_normed", il);
+
+ if (use_rope) {
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
+ freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
+ freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+ }
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ cb(cur, "attn_out", il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_ffn(ffn_inp,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL, NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, -1);
+ cb(cur, "ffn_post_norm", -1);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+// Explicit template instantiations
+template struct llm_build_exaone4<false>;
+template struct llm_build_exaone4<true>;
diff --git a/llama.cpp/src/models/falcon-h1.cpp b/llama.cpp/src/models/falcon-h1.cpp
new file mode 100644
index 0000000..b641a09
--- /dev/null
+++ b/llama.cpp/src/models/falcon-h1.cpp
@@ -0,0 +1,113 @@
+#include "models.h"
+
+
+
+llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context_mamba(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+    // build the inputs for the hybrid memory (recurrent + KV cache)
+ auto * inp = build_inp_mem_hybrid();
+
+ const float kq_scale =
+ hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur-post-rope", il);
+ cb(Kcur, "Kcur-post-rope", il);
+ cb(Vcur, "Vcur-post-rope", il);
+
+ ggml_tensor * attn_out = build_attn(inp->get_attn(),
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(attn_out, "attn_out", il);
+
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ // Mamba2 layer
+ cb(cur, "ssm_in", il);
+
+ ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
+ cb(ssm_out, "ssm_out", il);
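+
+        // note: Falcon-H1 is a parallel hybrid - the attention and Mamba2 branches
+        // both consume the same attn_norm output and their results are summed below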
+
+        // aggregation
+ cur = ggml_add(ctx0, attn_out, ssm_out);
+ inpSA = ggml_add(ctx0, cur, inpSA);
+ cb(cur, "layer_out", il);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = inpSA;
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, inpSA);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/falcon.cpp b/llama.cpp/src/models/falcon.cpp
new file mode 100644
index 0000000..db1ccdb
--- /dev/null
+++ b/llama.cpp/src/models/falcon.cpp
@@ -0,0 +1,120 @@
+#include "models.h"
+
+
+llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * attn_norm;
+
+ attn_norm = build_norm(inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(attn_norm, "attn_norm", il);
+
+ // self-attention
+ {
+ if (model.layers[il].attn_norm_2) {
+ // Falcon-40B
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm_2,
+ model.layers[il].attn_norm_2_b,
+ LLM_NORM, il);
+ cb(cur, "attn_norm_2", il);
+ } else {
+ cur = attn_norm;
+ }
+
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
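+
+            // the fused wqkv output is laid out per row as [ Q (n_embd) | K (n_embd_gqa) | V (n_embd_gqa) ],
+            // which is what the three byte offsets above (0, n_embd and n_embd + n_embd_gqa floats) select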
+
+            // using NeoX-style RoPE (mode = 2)
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = cur;
+
+ // feed forward
+ {
+ cur = build_ffn(attn_norm, // !! use the attn norm, not the result
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = ggml_add(ctx0, cur, inpL);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ // norm
+ cur = build_norm(cur,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/gemma-embedding.cpp b/llama.cpp/src/models/gemma-embedding.cpp
new file mode 100644
index 0000000..944c198
--- /dev/null
+++ b/llama.cpp/src/models/gemma-embedding.cpp
@@ -0,0 +1,116 @@
+#include "models.h"
+
+llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_k;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+    // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
+ inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base(cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+
+ cur =
+ build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, -1);
+ cb(cur, "ffn_post_norm", -1);
+
+ cur = ggml_add(ctx0, cur, sa_out);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
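+
+    // note: this is an embedding-only graph - no lm_head is applied and res->t_logits is never set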
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/gemma.cpp b/llama.cpp/src/models/gemma.cpp
new file mode 100644
index 0000000..4893d9a
--- /dev/null
+++ b/llama.cpp/src/models/gemma.cpp
@@ -0,0 +1,112 @@
+#include "models.h"
+
+
+llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
+ cb(Qcur, "Qcur_scaled", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+ ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = build_norm(sa_out,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, sa_out);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/gemma2-iswa.cpp b/llama.cpp/src/models/gemma2-iswa.cpp
new file mode 100644
index 0000000..7a91981
--- /dev/null
+++ b/llama.cpp/src/models/gemma2-iswa.cpp
@@ -0,0 +1,128 @@
+#include "models.h"
+
+llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_k;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_iswa();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+ cur = build_norm(cur,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = build_norm(sa_out,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, -1);
+ cb(cur, "ffn_post_norm", -1);
+
+ cur = ggml_add(ctx0, cur, sa_out);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ // final logit soft-capping
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+ cur = ggml_tanh(ctx0, cur);
+ cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
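+
+    // i.e. logits = C * tanh(logits / C) with C = f_final_logit_softcapping,
+    // smoothly bounding every logit to the open interval (-C, C)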
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/gemma3.cpp b/llama.cpp/src/models/gemma3.cpp
new file mode 100644
index 0000000..dec3fc4
--- /dev/null
+++ b/llama.cpp/src/models/gemma3.cpp
@@ -0,0 +1,155 @@
+#include "models.h"
+
+template <bool iswa>
+llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_k;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+    // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
+ inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // TODO: is causal == true correct? might need some changes
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+ inp_attn_type * inp_attn = nullptr;
+
+ if constexpr (iswa) {
+ inp_attn = build_attn_inp_kv_iswa();
+ } else {
+ inp_attn = build_attn_inp_kv();
+ }
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ float freq_base_l = 0.0f;
+ float freq_scale_l = 0.0f;
+
+ if constexpr (iswa) {
+ freq_base_l = model.get_rope_freq_base (cparams, il);
+ freq_scale_l = model.get_rope_freq_scale(cparams, il);
+ } else {
+ freq_base_l = freq_base;
+ freq_scale_l = freq_scale;
+ }
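+
+        // note: under iswa the local (SWA) layers typically use a different rope base
+        // than the global layers; both values come from the model hparams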
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+ cur = build_norm(cur,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = build_norm(sa_out,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, -1);
+ cb(cur, "ffn_post_norm", il);
+
+ cur = ggml_add(ctx0, cur, sa_out);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ if (hparams.f_final_logit_softcapping) {
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+ cur = ggml_tanh(ctx0, cur);
+ cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+ }
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+template struct llm_build_gemma3<false>;
+template struct llm_build_gemma3<true>;
diff --git a/llama.cpp/src/models/gemma3n-iswa.cpp b/llama.cpp/src/models/gemma3n-iswa.cpp
new file mode 100644
index 0000000..7db6d3b
--- /dev/null
+++ b/llama.cpp/src/models/gemma3n-iswa.cpp
@@ -0,0 +1,384 @@
+#include "models.h"
+
+llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params),
+ model(model),
+ n_embd_head(model.hparams.n_embd_head_k),
+ n_embd_altup(model.hparams.n_embd_altup),
+ n_altup(model.hparams.n_altup),
+ i_altup_act(model.hparams.i_altup_act) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+    // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
+ inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // TODO: is causal == true correct? might need some changes
+ auto * inp_attn = build_attn_inp_kv_iswa();
+
+ // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
+ ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
+
+ // inpL now has only 1 altup, project it to the rest of the altups
+ // these "added" altups will be concat to the last dim of inpL
+ {
+ ggml_tensor * target_magnitude = calc_magnitude(inpL);
+ ggml_tensor * inp_repeated = ggml_repeat_4d(ctx0, inpL, n_embd, n_tokens, n_altup - 1, 1);
+ ggml_tensor * altup_added =
+ ggml_mul_mat(ctx0, model.altup_proj, inp_repeated); // shape: [n_embd, n_tokens, n_altup - 1]
+ ggml_tensor * new_magnitude = calc_magnitude(altup_added);
+ altup_added = ggml_div(ctx0, ggml_mul(ctx0, altup_added, target_magnitude), new_magnitude);
+ inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup]
+ cb(inpL, "inp_stacked", -1);
+ }
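+ // the projection above is magnitude-matched: each added altup is rescaled to
+ // the per-token L2 norm of the original embedding,
+ //   altup_added = altup_proj(x) * |x| / |altup_proj(x)|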
+ // inpL now has shape: [n_embd, n_tokens, n_altup]
+ // inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer]
+
+ for (int il = 0; il < n_layer; ++il) {
+ // this block closely resembles Gemma3p5DecoderLayer in the python code
+ const float freq_base_l = model.get_rope_freq_base(cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ ggml_tensor * cur = inpL; // [n_embd, n_tokens, n_altup]
+ ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup]
+
+ // predicted value will go through self-attention and laurel
+ ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); // [n_embd, n_tokens]
+ cur = active_prediction;
+ cb(cur, "active_prediction", il);
+
+ // norm
+ cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // laurel
+ ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
+
+ // self-attention
+ if (hparams.has_kv(il)) {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);
+
+ cb(Qcur, "Qcur_normed", il);
+ cb(Kcur, "Kcur_normed", il);
+ cb(Vcur, "Vcur_normed", il);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur_pos", il);
+ cb(Kcur, "Kcur_pos", il);
+
+ cur = build_attn(inp_attn, model.layers[il].wo,
+ NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
+ hparams.f_attention_scale, il);
+ } else {
+ // reuse KV cache of earlier layers
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Qcur, "Qcur_pos", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
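+ // with K/V passed as nullptr, build_attn attends against the KV entries
+ // already stored by the earlier layer whose cache this layer shares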
+ }
+ cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ cur = ggml_add(ctx0, cur, active_prediction); // [n_embd, n_tokens]
+ cb(cur, "attn_gated", il);
+
+ ggml_tensor * attn_laurel = ggml_scale(ctx0, ggml_add(ctx0, cur, laurel_out),
+ 1.0f / sqrtf(2.0f)); // [n_embd, n_tokens]
+ cb(attn_laurel, "attn_laurel", il);
+
+ cur = build_norm(attn_laurel, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ ggml_tensor * up_proj = build_lora_mm(model.layers[il].ffn_up, cur);
+ ggml_tensor * gate_proj = build_lora_mm(model.layers[il].ffn_gate, cur);
+
+ if (il < n_layer_sparsity) {
+ // apply activation sparsity
+ gate_proj = gaussian_topk(gate_proj);
+ }
+ gate_proj = ggml_gelu(ctx0, gate_proj);
+
+ cur = ggml_mul(ctx0, up_proj, gate_proj);
+ cur = build_lora_mm(model.layers[il].ffn_down, cur);
+ cb(cur, "ffn_out", il);
+ }
+ cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, -1);
+ cb(cur, "ffn_post_norm", il);
+
+ ggml_tensor * attn_ffw_laurel_gated = ggml_add(ctx0, cur, attn_laurel); // [n_embd, n_tokens]
+ cb(attn_ffw_laurel_gated, "attn_ffw_laurel_gated", il);
+
+ ggml_tensor * corrected = altup_correct(predictions, attn_ffw_laurel_gated, il); // [n_embd, n_tokens, n_altup]
+
+ ggml_tensor * first_prediction; // [n_embd, n_tokens]
+ {
+ first_prediction = view_2d_slice(corrected, i_altup_act); // [n_embd, n_tokens]
+ first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale);
+ first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction);
+ first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens]
+ cb(first_prediction, "first_prediction_gated", il);
+ ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_altup, n_tokens]
+ first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens]
+ cb(first_prediction, "first_prediction_scaled", il);
+
+ first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction); // [n_embd, n_tokens]
+ first_prediction =
+ build_norm(first_prediction, model.layers[il].per_layer_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(first_prediction, "first_prediction_out", il);
+ }
+ // equivalent to python code: corrected_predictions[1:] += first_prediction
+ {
+ ggml_tensor * slice_first = view_2d_slice(corrected, 0);
+ ggml_tensor * slice_rest = ggml_view_3d(
+ ctx0, corrected, n_embd, n_tokens, n_altup - 1, ggml_row_size(corrected->type, n_embd),
+ ggml_row_size(corrected->type, n_embd * n_tokens), n_embd * n_tokens * ggml_element_size(corrected));
+ ggml_tensor * tmp = ggml_add(ctx0, slice_rest, first_prediction); // [n_embd, n_tokens, n_altup - 1]
+ corrected = ggml_concat(ctx0, slice_first, tmp, 2); // [n_embd, n_tokens, n_altup]
+ }
+ cur = corrected; // [n_embd, n_tokens, n_altup]
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL; // [n_embd, n_tokens, n_altup]
+
+ // cur now holds multiple altups; merge them back into a single altup
+ {
+ ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act)); // [n_embd, n_tokens]
+ // do a view to skip the first slice (active altup)
+ ggml_tensor * alt_slice =
+ ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1, ggml_row_size(cur->type, n_embd),
+ ggml_row_size(cur->type, n_embd * n_tokens), n_embd * n_tokens * ggml_element_size(cur));
+ ggml_tensor * altup_unembd =
+ ggml_mul_mat(ctx0, model.altup_unembd_proj, alt_slice); // shape: [n_embd, n_tokens, n_altup - 1]
+ ggml_tensor * new_magnitude = calc_magnitude(altup_unembd);
+ altup_unembd = ggml_div(ctx0, ggml_mul(ctx0, altup_unembd, target_magnitude), new_magnitude);
+ cb(altup_unembd, "altup_unembd", -1);
+
+ // equivalent to torch.mean(hidden_states, dim=0)
+ cur = view_2d_slice(cur, 0); // [n_embd, n_tokens]
+ for (int i = 0; i < n_altup - 1; ++i) {
+ cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i));
+ }
+ cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens]
+ cb(cur, "unembd_merged", -1);
+ }
+ // cur now has shape: [n_embd, n_tokens]
+
+ // TODO: move this to right after the last KV layer
+ {
+ // skip computing output for unused tokens
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ }
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ {
+ // final logit soft-capping
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+ cur = ggml_tanh(ctx0, cur);
+ cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+ }
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
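+// per-token L2 norm: sqrt of the sum of x^2 over the first dim, shape [1, n_tokens, ...]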
+ggml_tensor * llm_build_gemma3n_iswa::calc_magnitude(ggml_tensor * x) {
+ return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x)));
+}
+
+// get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
+ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
+ GGML_ASSERT(idx < (int) x->ne[2]);
+ return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1], ggml_row_size(x->type, x->ne[0]),
+ idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
+}
+
+// equivalent to get_per_layer_inputs() in python code
+// output shape: [n_embd_altup, n_layer, n_tokens]
+ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
+ auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
+ ggml_tensor * inp_per_layer;
+ if (ubatch.token) {
+ inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
+ ggml_set_input(inp->tokens);
+ res->t_inp_tokens = inp->tokens;
+ inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
+ inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
+ inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
+ cb(inp_per_layer, "inp_per_layer_selected", -1);
+ res->add_input(std::move(inp));
+ } else {
+ // Vision embedding path: use padding token (ID=0) embedding
+ // TODO: verify if this is the correct behavior in transformers implementation
+ const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
+
+ // Extract and dequantize padding token embedding (row 0)
+ ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
+ inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
+
+ // Reshape to [n_embd_altup, n_layer, 1]
+ inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
+ cb(inp_per_layer, "inp_per_layer_vision", -1);
+ }
+ return inp_per_layer;
+}
+
+// equivalent to project_per_layer_inputs() in python code
+// this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
+// output shape: [n_embd_altup, n_tokens, n_layer]
+ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
+ const float per_layer_projection_scale = 1.0f / sqrtf((float) n_embd);
+ const float per_layer_input_scale = 1.0f / sqrtf(2.0f);
+
+ ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
+ per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale);
+ per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens);
+ per_layer_proj = build_norm(per_layer_proj, model.per_layer_proj_norm, NULL, LLM_NORM_RMS,
+ -1); // [n_embd_altup, n_layer, n_tokens]
+ cb(per_layer_proj, "per_layer_proj", -1);
+
+ inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
+ inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
+ cb(inp_per_layer, "inp_per_layer", -1);
+
+ // permute to shape: [n_embd_altup, n_tokens, n_layer]
+ inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3));
+ return inp_per_layer;
+}
+
+// input cur shape: [n_embd, n_tokens]
+// output shape: [n_embd, n_tokens]
+ggml_tensor * llm_build_gemma3n_iswa::laurel(ggml_tensor * cur, int il) {
+ ggml_tensor * tmp = cur;
+ tmp = build_lora_mm(model.layers[il].laurel_l, tmp);
+ tmp = build_lora_mm(model.layers[il].laurel_r, tmp);
+ tmp = build_norm(tmp, model.layers[il].laurel_post_norm, NULL, LLM_NORM_RMS, il);
+ tmp = ggml_add(ctx0, tmp, cur);
+ cb(tmp, "laurel_out", il);
+ return tmp;
+}
+
+// input x shape: [n_embd, n_tokens]
+// output shape: [n_embd, n_tokens]
+ggml_tensor * llm_build_gemma3n_iswa::gaussian_topk(ggml_tensor * x) {
+ ggml_tensor * mean = ggml_mean(ctx0, x);
+ ggml_tensor * std = ggml_sqrt(ctx0, ggml_scale(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x, mean))),
+ 1.0f / (float) (x->ne[0] - 1)));
+ ggml_tensor * cutoff_x = ggml_add(ctx0, mean, ggml_scale(ctx0, std, f_sparsity_std_mul));
+ return ggml_relu(ctx0, ggml_sub(ctx0, x, cutoff_x));
+}
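+// i.e. a soft threshold: values below mean + f_sparsity_std_mul * std (per token)
+// are zeroed and the survivors are shifted down by the cutoff, approximating
+// top-k activation sparsity under a Gaussian assumption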
+
+//
+// altup functions
+//
+
+// equivalent to compute_router_modalities() in python code
+// input x shape: [n_embd, n_tokens]
+// output shape: [n_altup, n_tokens]
+ggml_tensor * llm_build_gemma3n_iswa::altup_compute_router_modalities(ggml_tensor * x, int il) {
+ ggml_tensor * router_inputs = build_norm(x, model.layers[il].altup_router_norm, NULL, LLM_NORM_RMS, il);
+
+ // router_input_scale
+ router_inputs = ggml_scale(ctx0, router_inputs, 1.0f / (float) n_embd);
+
+ ggml_tensor * output = ggml_mul_mat(ctx0, model.layers[il].altup_router, router_inputs);
+ return ggml_tanh(ctx0, output); // [n_altup, n_tokens]
+}
+
+// input cur shape: [n_embd, n_tokens, n_altup]
+// output shape: [n_embd, n_tokens, n_altup]
+ggml_tensor * llm_build_gemma3n_iswa::altup_predict(ggml_tensor * cur, int il) {
+ ggml_tensor * activated = view_2d_slice(cur, i_altup_act); // [n_embd, n_tokens]
+ ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
+ cb(modalities, "modalities", il);
+
+ ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_predict_coef, modalities);
+ cb(all_coefs, "all_coefs", il);
+ // the first dim now has n_altup^2 elements; reshape it to 2D per token (giving a 3D tensor)
+ all_coefs = ggml_reshape_3d(ctx0, all_coefs, n_altup, n_altup, n_tokens);
+
+ // permute to [n_altup, n_embd, n_tokens]
+ ggml_tensor * cur_permuted = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
+ ggml_tensor * predictions = ggml_mul_mat(ctx0, cur_permuted, all_coefs); // [n_embd, n_altup, n_tokens]
+
+ // final shape must be the same as cur: [n_embd, n_tokens, n_altup]
+ predictions = ggml_cont(ctx0, ggml_permute(ctx0, predictions, 0, 2, 1, 3));
+ predictions = ggml_add(ctx0, predictions, cur);
+ cb(predictions, "predictions", il);
+
+ return predictions;
+}
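+// i.e. per token and embedding dim: prediction_j = altup_j + sum_i coef_ij * altup_i,
+// a learned, router-conditioned mixing of the altup streams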
+
+// input predictions shape: [n_embd, n_tokens, n_altup]
+// input activated shape: [n_embd, n_tokens]
+// output shape: [n_embd, n_tokens, n_altup]
+ggml_tensor * llm_build_gemma3n_iswa::altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) {
+ ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
+ cb(modalities, "modalities", il);
+
+ ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act);
+ ggml_tensor * innovation = ggml_sub(ctx0, activated, active_prediction); // [n_embd, n_tokens]
+ cb(innovation, "innovation", il);
+
+ ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
+ all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
+ cb(all_coefs, "all_coefs", il);
+ all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup]
+ all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
+
+ innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
+ ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
+ corrected = ggml_add(ctx0, corrected, predictions); // [n_embd, n_tokens, n_altup]
+ cb(corrected, "corrected", il);
+
+ return corrected;
+}
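+// i.e. corrected_j = predictions_j + coef_j * (activated - active_prediction),
+// where coef_j = router_coef_j + 1; each altup stream is pulled toward the
+// activated (post-attention/ffn) stream by a learned per-token amount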
diff --git a/llama.cpp/src/models/glm4-moe.cpp b/llama.cpp/src/models/glm4-moe.cpp
new file mode 100644
index 0000000..003f70f
--- /dev/null
+++ b/llama.cpp/src/models/glm4-moe.cpp
@@ -0,0 +1,170 @@
+#include "models.h"
+
+llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ int sections[4];
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
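+ // the four rope_sections split the rotary dims for mrope; in llama.cpp these
+ // are typically the [temporal, height, width, extra] section sizes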
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ bool use_mrope = hparams.use_mrope();
+ if (ubatch.embd && !use_mrope) {
+ // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+ GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+ }
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ // Only process up to the last transformer layer (skip the final NextN layer(s));
+ // the NextN tensors are loaded but not used in the forward pass
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
+ for (int il = 0; il < n_transformer_layers; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // Pre-attention norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ }
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ }
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ }
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ // Apply Q/K norm if available (GLM-4.5 355B variant)
+ if (model.layers[il].attn_q_norm) {
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+ }
+ if (model.layers[il].attn_k_norm) {
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+ }
+
+ if (use_mrope) {
+ Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ } else {
+ // Normal RoPE
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+ rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+ rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // Post-attention norm
+ cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "post_attn_norm", il);
+
+ // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
+ if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
+ // Dense FFN layer
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // Process routed experts using existing MoE infrastructure
+ ggml_tensor * routed_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ true, hparams.expert_weights_scale,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(routed_out, "ffn_moe_out", il);
+
+ // Process shared expert on original input
+ ggml_tensor * shared_out = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(shared_out, "ffn_shexp_out", il);
+
+ // Final output: routed_output + shared_output
+ cur = ggml_add(ctx0, routed_out, shared_out);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/glm4.cpp b/llama.cpp/src/models/glm4.cpp
new file mode 100644
index 0000000..204aa39
--- /dev/null
+++ b/llama.cpp/src/models/glm4.cpp
@@ -0,0 +1,150 @@
+#include "models.h"
+
+llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ int sections[4];
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ bool use_mrope = hparams.use_mrope();
+ if (ubatch.embd && !use_mrope) {
+ // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+ GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+ }
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // Pre-attention norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = nullptr;
+ ggml_tensor * Kcur = nullptr;
+ ggml_tensor * Vcur = nullptr;
+
+ if (model.layers[il].wqkv == nullptr) {
+ Qcur = build_lora_mm(model.layers[il].wq, cur);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ }
+ Kcur = build_lora_mm(model.layers[il].wk, cur);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ }
+ Vcur = build_lora_mm(model.layers[il].wv, cur);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ } else {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+ if (model.layers[il].bqkv) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1],
+ 0 * sizeof(float) * (n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 1 * sizeof(float) * (n_embd));
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
+ }
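+ // fused wqkv layout along dim 0 is [ Q : n_embd | K : n_embd_gqa | V : n_embd_gqa ],
+ // hence the view offsets of 0, n_embd and n_embd + n_embd_gqa elements above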
+
+ if (use_mrope) {
+ Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ } else {
+ // Normal RoPE
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+ rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+ rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ // Post-attention norm (new!)
+ cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "post_attn_norm", il);
+
+ // Add the input (residual connection after post-attention norm)
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
+ {
+ // Pre-MLP norm
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // MLP
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ // Post-MLP norm
+ cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "post_mlp_norm", il);
+ }
+ // Add residual connection after post-MLP norm
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
+ }
+ // Final norm
+ cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // Output projection
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/gpt2.cpp b/llama.cpp/src/models/gpt2.cpp
new file mode 100644
index 0000000..60761c8
--- /dev/null
+++ b/llama.cpp/src/models/gpt2.cpp
@@ -0,0 +1,105 @@
+#include "models.h"
+
+llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * pos;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+ cb(pos, "pos_embd", -1);
+
+ inpL = ggml_add(ctx0, inpL, pos);
+ cb(inpL, "inpL", -1);
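+ // GPT-2 uses learned absolute position embeddings (no RoPE): inp_pos only
+ // indexes rows of model.pos_embd, which are added to the token embeddings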
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // add the input
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = build_norm(inpL,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/gptneox.cpp b/llama.cpp/src/models/gptneox.cpp
new file mode 100644
index 0000000..2151b14
--- /dev/null
+++ b/llama.cpp/src/models/gptneox.cpp
@@ -0,0 +1,144 @@
+#include "models.h"
+
+llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // ffn
+ if (hparams.use_par_res) {
+ // attention and ffn are computed in parallel
+ // x = x + attn(ln1(x)) + ffn(ln2(x))
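+ // (this matches GPT-NeoX's use_parallel_residual=true mode)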
+
+ ggml_tensor * attn_out = cur;
+
+ cur = build_norm(inpL,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, inpL);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, attn_out);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ } else {
+ // attention and ffn are computed sequentially
+ // x = x + attn(ln1(x))
+ // x = x + ffn(ln2(x))
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ }
+
+ cur = build_norm(inpL,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/granite-hybrid.cpp b/llama.cpp/src/models/granite-hybrid.cpp
new file mode 100644
index 0000000..f6ca4c1
--- /dev/null
+++ b/llama.cpp/src/models/granite-hybrid.cpp
@@ -0,0 +1,196 @@
+#include "models.h"
+
+llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context_mamba(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ auto * inp = build_inp_mem_hybrid();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ // positional embeddings - built only if rope enabled
+ ggml_tensor * inp_pos = nullptr;
+ if (hparams.rope_finetuned) {
+ inp_pos = build_inp_pos();
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ if (hparams.is_recurrent(il)) {
+ // ssm layer //
+ cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
+ } else {
+ // attention layer //
+ cur = build_attention_layer(cur, inp_pos, inp->get_attn(), model, n_embd_head, il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // ffn
+ cur = build_layer_ffn(cur, inpSA, model, il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ // For Granite architectures - scale logits
+ if (hparams.f_logit_scale) {
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+ }
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+ggml_tensor * llm_build_granite_hybrid::build_attention_layer(ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ llm_graph_input_attn_kv * inp_attn,
+ const llama_model & model,
+ const int64_t n_embd_head,
+ const int il) {
+ // compute Q and K and (optionally) RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+
+ const bool use_rope = hparams.rope_finetuned;
+ if (use_rope) {
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ const float kq_scale =
+ hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ return cur;
+}
+
+ggml_tensor * llm_build_granite_hybrid::build_layer_ffn(ggml_tensor * cur,
+ ggml_tensor * inpSA,
+ const llama_model & model,
+ const int il) {
+ // For Granite architectures - scale residual
+ if (hparams.f_residual_scale) {
+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network (non-MoE)
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ } else {
+ // MoE branch
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // For Granite MoE Shared
+ if (hparams.n_ff_shexp > 0) {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ } else {
+ cur = moe_out;
+ }
+ }
+
+ // For Granite architectures - scale residual
+ if (hparams.f_residual_scale) {
+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ return cur;
+}
diff --git a/llama.cpp/src/models/granite.cpp b/llama.cpp/src/models/granite.cpp
new file mode 100644
index 0000000..18748e9
--- /dev/null
+++ b/llama.cpp/src/models/granite.cpp
@@ -0,0 +1,211 @@
+#include "models.h"
+
+llm_build_granite::llm_build_granite(
+ const llama_model & model,
+ const llm_graph_params & params)
+ : llm_graph_context(params) {
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - built only if rope enabled
+ ggml_tensor * inp_pos = nullptr;
+ if (hparams.rope_finetuned) {
+ inp_pos = build_inp_pos();
+ }
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ cur = build_attention_layer(
+ cur, inp_pos, inp_attn,
+ model, n_embd_head, il);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ // ffn
+ cur = build_layer_ffn(cur, inpSA, model, il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ // For Granite architectures - scale logits
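+ // logits = lm_head(x) / f_logit_scale; the matching residual and attention
+ // scales are applied in build_attention_layer and build_layer_ffn below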
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+ggml_tensor * llm_build_granite::build_attention_layer(
+ ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ llm_graph_input_attn_kv * inp_attn,
+ const llama_model & model,
+ const int64_t n_embd_head,
+ const int il) {
+
+ // compute Q and K and (optionally) RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+
+ const bool use_rope = hparams.rope_finetuned;
+ if (use_rope) {
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ return cur;
+}
+
+ggml_tensor * llm_build_granite::build_layer_ffn(
+ ggml_tensor * cur,
+ ggml_tensor * inpSA,
+ const llama_model & model,
+ const int il) {
+
+ // For Granite architectures - scale residual
+ if (hparams.f_residual_scale) {
+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network (non-MoE)
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ } else {
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // For Granite MoE Shared
+ if (hparams.n_ff_shexp > 0) {
+ ggml_tensor * ffn_shexp = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ } else {
+ cur = moe_out;
+ }
+ }
+
+ // For Granite architectures - scale residual
+ if (hparams.f_residual_scale) {
+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ return cur;
+}
diff --git a/llama.cpp/src/models/graph-context-mamba.cpp b/llama.cpp/src/models/graph-context-mamba.cpp
new file mode 100644
index 0000000..b9a363b
--- /dev/null
+++ b/llama.cpp/src/models/graph-context-mamba.cpp
@@ -0,0 +1,283 @@
+#include "models.h"
+
+llm_graph_context_mamba::llm_graph_context_mamba(const llm_graph_params & params) : llm_graph_context(params) {}
+
+ggml_tensor * llm_graph_context_mamba::build_mamba_layer(llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ const llama_model & model,
+ const llama_ubatch & ubatch,
+ int il) {
+ const auto * mctx_cur = inp->mctx;
+
+ const auto kv_head = mctx_cur->get_head();
+
+ const auto & layer = model.layers[il];
+
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t dt_rank = hparams.ssm_dt_rank;
+ const int64_t n_head = d_inner;
+ const int64_t head_dim = 1;
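+ // mamba-1 treats each inner channel as a single-dim head (head_dim = 1),
+ // so the same ggml_ssm_scan path can be shared with mamba-2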
+ const int64_t n_seqs = ubatch.n_seqs;
+ // Some variants of the Mamba arch (e.g. FalconMamba) apply layer norm on the B, C and Dt layers
+ const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
+
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+
+ ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+ conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
+
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+ // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
+ ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
+ // split the above in two
+ // => {d_inner, n_seq_tokens, n_seqs}
+ ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
+ ggml_tensor * z =
+ ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner * ggml_element_size(xz));
+
+ // conv
+ {
+ // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
+
+ // copy last (d_conv - 1) columns back into the state cache
+ ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2],
+ n_seq_tokens * (conv_x->nb[0]));
+
+ ggml_build_forward_expand(
+ gf, ggml_cpy(ctx0, last_conv,
+ ggml_view_1d(ctx0, conv_states_all, (d_conv - 1) * (d_inner) * (n_seqs),
+ kv_head * (d_conv - 1) * (d_inner) *ggml_element_size(conv_states_all))));
+
+ // 1D convolution
+ // The equivalent is to make a self-overlapping view of conv_x
+ // over d_conv columns at each stride in the 3rd dimension,
+ // then element-wise multiply that with the conv1d weight,
+ // then sum the elements of each row,
+ // (the last two steps are a dot product over rows (also doable with mul_mat))
+ // then permute away the ne[0] dimension,
+ // and then you're left with the resulting x tensor.
+ // For simultaneous sequences, all sequences need to have the same length.
+ x = ggml_ssm_conv(ctx0, conv_x, layer.ssm_conv1d);
+
+ // bias
+ x = ggml_add(ctx0, x, layer.ssm_conv1d_b);
+
+ x = ggml_silu(ctx0, x);
+ }
+
+ // ssm
+ {
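+ // a rough sketch of the selective scan computed by ggml_ssm_scan below,
+ // per channel (dt goes through a softplus inside the op):
+ //   h_t = exp(dt_t * A) * h_{t-1} + dt_t * B_t * x_t
+ //   y_t = C_t . h_t
+ // the D skip term (y += x * D) is applied after the scan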
+ // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
+ ggml_tensor * x_db = build_lora_mm(layer.ssm_x, x);
+ // split
+ ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
+ ggml_tensor * B =
+ ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state * x_db->nb[0], x_db->nb[1],
+ x_db->nb[2], ggml_element_size(x_db) * dt_rank);
+ ggml_tensor * C =
+ ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state * x_db->nb[0], x_db->nb[1],
+ x_db->nb[2], ggml_element_size(x_db) * (dt_rank + d_state));
+
+ // Some Mamba variants (e.g. FalconMamba, Jamba) apply RMS norm in B, C & Dt layers
+ if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) {
+ dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il);
+ B = build_norm(B, layer.ssm_b_norm, NULL, LLM_NORM_RMS, il);
+ C = build_norm(C, layer.ssm_c_norm, NULL, LLM_NORM_RMS, il);
+ }
+
+ // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
+ dt = build_lora_mm(layer.ssm_dt, dt);
+ dt = ggml_add(ctx0, dt, layer.ssm_dt_b);
+
+ cur = x;
+ x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
+
+ ggml_tensor * A = layer.ssm_a;
+
+ // use the states and the indices provided by build_recurrent_state
+ // (this is necessary in order to properly use the states before they are overwritten,
+ // while avoiding unnecessary copies of the states)
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
+
+ // Custom operator to optimize the parallel associative scan
+ // as described in the Annex D of the Mamba paper.
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+ };
+
+ ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+
+ // store last states
+ ggml_build_forward_expand(
+ gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm, d_state * d_inner * n_seqs, x->nb[3] * x->ne[3]),
+ ggml_view_1d(ctx0, ssm_states_all, d_state * d_inner * n_seqs,
+ kv_head * d_state * d_inner * ggml_element_size(ssm_states_all))));
+
+ ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);
+
+ // TODO: skip computing output earlier for unused tokens
+
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d));
+ y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
+
+ // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+ cur = build_lora_mm(layer.ssm_out, y);
+ }
+
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+
+ return cur;
+}
+
+ggml_tensor * llm_graph_context_mamba::build_mamba2_layer(llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ const llama_model & model,
+ const llama_ubatch & ubatch,
+ int il) const {
+ const auto * mctx_cur = inp->mctx;
+
+ const auto kv_head = mctx_cur->get_head();
+
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t n_head = hparams.ssm_dt_rank;
+ const int64_t head_dim = d_inner / n_head;
+ const int64_t n_group = hparams.ssm_n_group;
+ const int64_t n_seqs = ubatch.n_seqs;
+
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+
+ ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+ conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2 * n_group * d_state, n_seqs);
+
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+ // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
+
+ // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
+ ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
+
+ // split the above in three
+ ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * zxBCdt->nb[0],
+ zxBCdt->nb[1], zxBCdt->nb[2], 0);
+ ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2 * n_group * d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1],
+ zxBCdt->nb[2], d_inner * ggml_element_size(zxBCdt));
+ ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2],
+ (2 * d_inner + 2 * n_group * d_state) * ggml_element_size(zxBCdt));
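+ // layout of the in_proj output along dim 0 (cf. the d_in_proj formula above):
+ //   [ z : d_inner | xBC : d_inner + 2*n_group*d_state | dt : n_head ]
+ // the three views above slice out exactly these ranges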
+
+ // conv
+ {
+ // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
+
+ // copy last (d_conv - 1) columns back into the state cache
+ ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2 * n_group * d_state, n_seqs,
+ conv_x->nb[1], conv_x->nb[2], n_seq_tokens * (conv_x->nb[0]));
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv,
+ ggml_view_1d(ctx0, conv_states_all,
+ (d_conv - 1) * (d_inner + 2 * n_group * d_state) * (n_seqs),
+ kv_head * (d_conv - 1) * (d_inner + 2 * n_group * d_state) *
+ ggml_element_size(conv_states_all))));
+
+ // 1D convolution
+ // The equivalent is to make a self-overlapping view of conv_x
+ // over d_conv columns at each stride in the 3rd dimension,
+ // then element-wise multiply that with the conv1d weight,
+ // then sum the elements of each row,
+ // (the last two steps are a dot product over rows (also doable with mul_mat))
+ // then permute away the ne[0] dimension,
+ // and then you're left with the resulting x tensor.
+ // For simultaneous sequences, all sequences need to have the same length.
+ xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
+
+ // bias
+ xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
+
+ xBC = ggml_silu(ctx0, xBC);
+ }
+
+ // ssm
+ {
+ // These correspond to V K Q in SSM/attention duality
+ ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * xBC->nb[0],
+ xBC->nb[1], xBC->nb[2], 0);
+ ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state * xBC->nb[0],
+ xBC->nb[1], xBC->nb[2], d_inner * ggml_element_size(xBC));
+ ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state * xBC->nb[0],
+ xBC->nb[1], xBC->nb[2], (d_inner + n_group * d_state) * ggml_element_size(xBC));
+
+ // {n_head, n_seq_tokens, n_seqs}
+ dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
+
+ ggml_tensor * A = model.layers[il].ssm_a;
+
+ // use the states and the indices provided by build_recurrent_state
+ // (this is necessary in order to properly use the states before they are overwritten,
+ // while avoiding unnecessary copies of the states)
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
+
+ // TODO: use semistructured matrices to implement state-space duality
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+ };
+
+ ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+
+ // store last states
+ ggml_build_forward_expand(
+ gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm, d_state * d_inner * n_seqs, ggml_nelements(x) * x->nb[0]),
+ ggml_view_1d(ctx0, ssm_states_all, d_state * d_inner * n_seqs,
+ kv_head * d_state * d_inner * ggml_element_size(ssm_states_all))));
+
+ ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head * x->nb[1],
+ n_seq_tokens * n_head * x->nb[1], 0);
+
+ // TODO: skip computing output earlier for unused tokens
+
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+ cb(y, "mamba2_y_add_d", il);
+ y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
+
+ // grouped RMS norm
+ if (model.layers[il].ssm_norm) {
+ y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
+ y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
+ }
+
+ y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
+
+ // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+ cur = build_lora_mm(model.layers[il].ssm_out, y);
+ }
+
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+ cb(cur, "mamba_out", il);
+
+ return cur;
+}
diff --git a/llama.cpp/src/models/grok.cpp b/llama.cpp/src/models/grok.cpp
new file mode 100644
index 0000000..3c54dfe
--- /dev/null
+++ b/llama.cpp/src/models/grok.cpp
@@ -0,0 +1,159 @@
+#include "models.h"
+
+llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ cur = build_norm(cur,
+ model.layers[il].attn_out_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_out_norm", il);
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // MoE branch
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_GELU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ if (model.layers[il].ffn_up) {
+ ggml_tensor * ffn_out = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(ffn_out, "ffn_out", il);
+
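+            // combine the dense FFN and MoE branches; note sqrt(2)/2 == 1/sqrt(2)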
+ cur = ggml_scale(ctx0, ggml_add(ctx0, ffn_out, moe_out), std::sqrt(2) / 2);
+ cb(cur, "ffn_out", il);
+ } else {
+ cur = moe_out;
+ }
+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_post_norm", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);
+
+ // final logit soft-capping
+ if (hparams.f_final_logit_softcapping) {
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+ cur = ggml_tanh(ctx0, cur);
+ cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+ }
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/grovemoe.cpp b/llama.cpp/src/models/grovemoe.cpp
new file mode 100644
index 0000000..56b6db9
--- /dev/null
+++ b/llama.cpp/src/models/grovemoe.cpp
@@ -0,0 +1,141 @@
+#include "models.h"
+
+llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, cur); // [n_expert, n_tokens]
+ cb(probs, "ffn_moe_logits", il);
+
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ nullptr,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il,
+ probs);
+ cb(moe_out, "ffn_moe_out", il);
+ cur = moe_out;
+
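+        // second, smaller "adjugate experts" pass (ffn_*_chexps): it reuses the
+        // router probabilities computed above, caps the expert count at
+        // n_chunk_expert, and is blended in below scaled by expert_group_scale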
+ // TODO: Only do the expert selection and weights once
+ moe_out = build_moe_ffn(cur,
+ nullptr,
+ model.layers[il].ffn_up_chexps,
+ model.layers[il].ffn_gate_chexps,
+ model.layers[il].ffn_down_chexps,
+ nullptr,
+ n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il,
+ probs);
+ cb(moe_out, "ffn_adj_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ggml_scale(ctx0, moe_out, hparams.expert_group_scale));
+ cb(cur, "ffn_final_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/hunyuan-dense.cpp b/llama.cpp/src/models/hunyuan-dense.cpp
new file mode 100644
index 0000000..7d5dcc7
--- /dev/null
+++ b/llama.cpp/src/models/hunyuan-dense.cpp
@@ -0,0 +1,132 @@
+#include "models.h"
+
+llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = build_norm(Kcur,
+ model.layers[il].attn_k_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_norm", il);
+
+ Qcur = build_norm(Qcur,
+ model.layers[il].attn_q_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_norm", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+ // feed-forward network (non-MoE)
+ ggml_tensor * cur_mlp = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur_mlp, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur_mlp, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/hunyuan-moe.cpp b/llama.cpp/src/models/hunyuan-moe.cpp
new file mode 100644
index 0000000..77e39de
--- /dev/null
+++ b/llama.cpp/src/models/hunyuan-moe.cpp
@@ -0,0 +1,154 @@
+#include "models.h"
+
+llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = build_norm(Kcur,
+ model.layers[il].attn_k_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_norm", il);
+
+ Qcur = build_norm(Qcur,
+ model.layers[il].attn_q_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_norm", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+        // shared-expert feed-forward network (dense MLP run in parallel with the MoE)
+ ggml_tensor * cur_mlp = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur_mlp, "ffn_mlp", il);
+
+ // MoE branch
+ ggml_tensor * cur_moe = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU,
+ true, // norm_topk_prob
+ false,
+ 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur_moe, "ffn_moe_out", il);
+
+ ggml_tensor * ffn_out = ggml_add(ctx0, cur_moe, cur_mlp);
+ cb(ffn_out, "ffn_out", il);
+
+ cur = ggml_add(ctx0, ffn_out, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/internlm2.cpp b/llama.cpp/src/models/internlm2.cpp
new file mode 100644
index 0000000..387e821
--- /dev/null
+++ b/llama.cpp/src/models/internlm2.cpp
@@ -0,0 +1,120 @@
+#include "models.h"
+
+llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/jais.cpp b/llama.cpp/src/models/jais.cpp
new file mode 100644
index 0000000..3e3376e
--- /dev/null
+++ b/llama.cpp/src/models/jais.cpp
@@ -0,0 +1,86 @@
+#include "models.h"
+
+llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*cur->nb[0]*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+ // add the input
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
+ }
+ cur = build_norm(inpL,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/jamba.cpp b/llama.cpp/src/models/jamba.cpp
new file mode 100644
index 0000000..a018777
--- /dev/null
+++ b/llama.cpp/src/models/jamba.cpp
@@ -0,0 +1,106 @@
+#include "models.h"
+
+llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ // {n_embd, n_tokens}
+ inpL = build_inp_embd(model.tok_embd);
+
+ auto * inp_hybrid = build_inp_mem_hybrid();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const int64_t n_head_kv = hparams.n_head_kv(il);
+
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
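+        // layers with n_head_kv == 0 are Mamba layers; all others use (non-RoPE) attention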
+ if (n_head_kv == 0) {
+ cur = build_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
+ } else {
+ // Attention
+
+ struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // No RoPE :)
+ cur = build_attn(inp_hybrid->get_attn(),
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+ // residual
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, inpL, cur);
+ cb(cur, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+ // FFN
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+ }
+ // residual
+ cur = ggml_add(ctx0, ffn_inp, cur);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ // final rmsnorm
+ cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/kimi-linear.cpp b/llama.cpp/src/models/kimi-linear.cpp
new file mode 100644
index 0000000..0f037d1
--- /dev/null
+++ b/llama.cpp/src/models/kimi-linear.cpp
@@ -0,0 +1,772 @@
+#include "models.h"
+#include "ggml.h"
+
+#define CHUNK_SIZE 64
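+// tokens per chunk for the chunked KDA scan (see build_kda_chunking)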
+
+// Causal Conv1d function for Q,K,V
+// When qkv is 0, it is Q, 1 is K, 2 is V
+static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0,
+        ggml_tensor * conv_states_all, ggml_tensor * conv_state_all, int64_t qkv,
+        ggml_tensor * x, ggml_tensor * proj_w, ggml_tensor * conv_w,
+        int64_t d_conv, int64_t head_dim, int64_t n_head,
+        int64_t n_seq_tokens, int64_t n_seqs, int64_t n_tokens, int64_t kv_head) {
+ const int64_t d_inner = head_dim * n_head;
+ const int64_t conv_state_size = (d_conv - 1) * d_inner;
+ const int64_t n_embd_r_total = 3 * conv_state_size; // Q + K + V
+
+    // conv_state_all is [n_embd_r_total, n_seqs]: per sequence, the Q state comes
+    // first (conv_state_size elements), then K, then V, laid out as
+    //   state[i + seq * n_embd_r_total], i = conv_step + channel * (d_conv-1) + qkv * conv_state_size
+    // Take a [d_conv-1, d_inner, n_seqs] view of this stream's slice:
+    //   nb1 = (d_conv-1) * element_size (stride between channels)
+    //   nb2 = n_embd_r_total * element_size (stride between seqs)
+ ggml_tensor * conv_state_x = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs,
+ (d_conv - 1) * ggml_element_size(conv_state_all), // nb1: stride between channels
+ n_embd_r_total * ggml_element_size(conv_state_all), // nb2: stride between seqs
+ qkv * conv_state_size * ggml_element_size(conv_state_all));
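+    // e.g. assuming d_conv == 4 with head_dim == 128 and n_head == 32: d_inner == 4096,
+    // conv_state_size == 3 * 4096 == 12288, so the K stream (qkv == 1) of sequence s
+    // starts at element 12288 + s * 36864 of conv_state_all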
+
+ // Step 1: Q, K, V projections -> [d_inner, n_tokens]
+ ggml_tensor * x_proj = ggml_mul_mat(ctx0, proj_w, x);
+
+ // Reshape input: {d_inner, n_tokens} -> {d_inner, n_seq_tokens, n_seqs}
+ ggml_tensor * x_3d = ggml_reshape_3d(ctx0, x_proj, d_inner, n_seq_tokens, n_seqs);
+
+    // Concat this stream's conv state and the current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs}
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0);
+
+    // Save the last (d_conv-1) columns back into this stream's conv state
+ ggml_tensor * last_conv_x = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs,
+ conv_x->nb[1], conv_x->nb[2], n_seq_tokens * conv_x->nb[0]);
+ ggml_build_forward_expand(gf,
+ ggml_cpy(ctx0, last_conv_x,
+ ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs,
+ (kv_head * n_embd_r_total + qkv * conv_state_size) * ggml_element_size(conv_states_all))));
+    // Reshape conv weight from GGUF's [d_conv, 1, d_inner, 1] to the [d_conv, d_inner]
+    // layout ggml_ssm_conv expects. GGUF's memory layout w[conv_step + channel * d_conv]
+    // already matches what ggml_ssm_conv indexes (vLLM stores the same data as
+    // [d_inner, d_conv]), so a plain reshape suffices.
+ ggml_tensor * conv_weight = ggml_reshape_2d(ctx0, conv_w, d_conv, d_inner);
+
+ // Apply conv1d
+ // ggml_ssm_conv output: {d_inner, n_seq_tokens, n_seqs}
+ ggml_tensor * Xcur = ggml_ssm_conv(ctx0, conv_x, conv_weight);
+    // Reshape to 2D: {d_inner, n_tokens}; no conv bias here, only the SiLU activation
+ Xcur = ggml_reshape_2d(ctx0, Xcur, d_inner, n_tokens);
+ Xcur = ggml_silu(ctx0, Xcur);
+
+ return ggml_reshape_4d(ctx0, Xcur, head_dim, n_head, n_seq_tokens, n_seqs);
+}
+
+llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context_mamba(params), model(model) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+ cb(inpL, "model.embed_tokens", -1);
+
+ // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM)
+ // So we don't need inp_pos
+
+ auto * inp_kv = !hparams.is_mla() ? build_inp_mem_hybrid() : nullptr;
+ auto * inp_k = hparams.is_mla() ? build_inp_mem_hybrid_k() : nullptr;
+ auto * inp_rs = hparams.is_mla() ? inp_k->get_recr() : inp_kv->get_recr();
+ auto * inp_attn_kv = !hparams.is_mla() ? inp_kv->get_attn() : nullptr;
+ auto * inp_attn_k = hparams.is_mla() ? inp_k->get_attn() : nullptr;
+
+ // Output ids for selecting which tokens to output
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
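+    // shared CHUNK_SIZE x CHUNK_SIZE masks for the chunked KDA path: a strictly
+    // lower-triangular ones mask (diagonal excluded), the identity, and their sum
+    // (lower triangular including the diagonal)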
+ ggml_tensor * chunked_causal_mask =
+ ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
+ GGML_TRI_TYPE_LOWER);
+
+ ggml_tensor * chunked_identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
+ ggml_tensor * chunked_diag_mask = ggml_add(ctx0, chunked_causal_mask, chunked_identity);
+
+ ggml_build_forward_expand(gf, chunked_causal_mask);
+ ggml_build_forward_expand(gf, chunked_identity);
+ ggml_build_forward_expand(gf, chunked_diag_mask);
+
+ // Kimi dimension constants
+ const int64_t n_head = hparams.n_head();
+ const int64_t head_dim = hparams.n_embd_head_kda;
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = n_head * head_dim; // 32 * 128 = 4096
+ const int64_t n_seqs = ubatch.n_seqs;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ // Verify batch consistency for recurrent layers
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+ // MLA params
+ const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
+ const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
+ // qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot
+ // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim]
+ const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim
+ const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128
+ // Attention scale for MLA
+ const float kq_scale_mla = 1.0f / sqrtf((float)n_embd_head_k_mla);
+
+ for (int il = 0; il < n_layer; ++il) {
+ const auto & layer = model.layers[il];
+ ggml_tensor * inpSA = inpL;
+
+ // Attention Norm
+ cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // Check layer type by checking which tensors exist
+        // KDA layers have the ssm_a tensor (storing -exp(A_log), folded in at
+        // conversion); MLA layers have the wkv_a_mqa tensor
+ bool is_kda = (layer.ssm_a != nullptr);
+ bool is_mla = (layer.wkv_a_mqa != nullptr);
+
+ if (is_kda) {
+ // === KDA Layer (Kimi Delta Attention) with Recurrent State ===
+ // Reference: vLLM kda.py
+ const auto * mctx_cur = inp_rs->mctx;
+ const auto kv_head = mctx_cur->get_head();
+
+ // Get conv states from r_l tensor (Q, K, V each have separate state)
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+ cb(conv_states_all, "conv_states_all", il);
+ ggml_tensor * conv_state_all = build_rs(inp_rs, conv_states_all, hparams.n_embd_r(), n_seqs);
+ ggml_tensor * Qcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 0, cur, layer.wq, layer.ssm_q_conv, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head);
+ ggml_tensor * Kcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 1, cur, layer.wk, layer.ssm_k_conv, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head);
+ ggml_tensor * Vcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 2, cur, layer.wv, layer.ssm_v_conv, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head);
+
+ // g1 = -exp(A_log) * softplus(f_b(f_a(x)) + dt_bias)
+ ggml_tensor * f_a = ggml_mul_mat(ctx0, layer.ssm_f_a, cur);
+ ggml_tensor * g1 = ggml_mul_mat(ctx0, layer.ssm_f_b, f_a);
+ cb(g1, "g1 f_b(f_a(cur))", il);
+ g1 = ggml_add(ctx0, g1, layer.ssm_dt_b);
+ g1 = ggml_softplus(ctx0, g1);
+ g1 = ggml_reshape_3d(ctx0, g1, head_dim, n_head, n_tokens);
+
+            // ssm_a is [1, n_head] (or [1, n_head, 1, 1]); reshape it to [1, n_head, 1] so it
+            // broadcasts against g1 [head_dim, n_head, n_tokens]. No need to compute -exp(A_log)
+            // here: that was already folded in by convert_hf_to_gguf.py.
+ ggml_tensor * A = ggml_reshape_3d(ctx0, layer.ssm_a, 1, n_head, 1);
+ g1 = ggml_mul(ctx0, g1, A);
+ cb(g1, "kda_g1", il);
+
+ // Compute beta (mixing coefficient)
+ ggml_tensor * beta = ggml_mul_mat(ctx0, layer.ssm_beta, cur);
+ beta = ggml_reshape_4d(ctx0, beta, n_head, 1, n_seq_tokens, n_seqs);
+ cb(beta, "kda_beta", il);
+
+ // Reshape for KDA recurrence
+ // {n_embd, n_tokens} -> {n_embd, n_seq_tokens, n_seqs}
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+ g1 = ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs);
+
+ // Get SSM state and compute KDA recurrence using ggml_kda_scan
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+ ggml_tensor * state = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs);
+ state = ggml_reshape_4d(ctx0, state, head_dim, head_dim, n_head, n_seqs);
+            // Choose between build_kda_autoregressive (single token) and build_kda_chunking based on n_seq_tokens
+ std::pair<ggml_tensor *, ggml_tensor *> attn_out = n_seq_tokens == 1 ?
+ build_kda_autoregressive(Qcur, Kcur, Vcur, g1, beta, state, il) :
+ build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, chunked_causal_mask, chunked_identity, chunked_diag_mask, il);
+
+ ggml_tensor * output = attn_out.first;
+ ggml_tensor * new_state = attn_out.second;
+ cb(output, "attn_output", il);
+ cb(new_state, "new_state", il);
+
+ // Update the recurrent states
+ ggml_build_forward_expand(gf,
+ ggml_cpy(ctx0, new_state,
+ ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
+ kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
+
+ // Output gating g2 = g_b(g_a(x))
+ ggml_tensor * cur_2d = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+ ggml_tensor * g_a = ggml_mul_mat(ctx0, layer.ssm_g_a, cur_2d);
+ ggml_tensor * g2 = ggml_mul_mat(ctx0, layer.ssm_g_b, g_a);
+ cb(g2, "g2 g_b(g_a(cur_2d))", il);
+ g2 = ggml_reshape_3d(ctx0, g2, head_dim, n_head, n_seq_tokens * n_seqs);
+
+ // Apply o_norm with sigmoid gating
+ // Note: Kimi model uses sigmoid gating, not SiLU (despite FusedRMSNormGated default being swish)
+ // Formula: output = RMSNorm(x) * sigmoid(g)
+ ggml_tensor * attn_out_final = ggml_reshape_3d(ctx0, output, head_dim, n_head, n_seq_tokens * n_seqs);
+ ggml_tensor * normed = build_norm(attn_out_final, layer.ssm_o_norm, nullptr, LLM_NORM_RMS, il);
+ cb(normed, "kda_normed", il);
+ ggml_tensor * gate = ggml_sigmoid(ctx0, g2);
+ ggml_tensor * gated = ggml_mul(ctx0, normed, gate);
+
+ // Output projection
+ gated = ggml_cont_2d(ctx0, gated, d_inner, n_tokens);
+ cur = ggml_mul_mat(ctx0, layer.wo, gated);
+ cb(cur, "kda_out", il);
+
+ } else if (is_mla) {
+ // === MLA Layer (Multi-head Latent Attention) without KV Cache ===
+ // Reference: vLLM mla.py
+ // Step 1: Q projection and reshape
+ // vLLM Kimi: q = q_proj(hidden_states), then view as [n_tokens, n_head, qk_head_dim]
+ // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM)
+ ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.wq, cur);
+
+ // Step 2: KV compression
+ // kv_cmpr_pe = kv_a_proj_with_mqa(hidden_states) -> [kv_lora_rank + qk_rope_head_dim, n_tokens]
+ ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, layer.wkv_a_mqa, cur);
+
+ // Split: kv_cmpr = kv_lora[:kv_lora_rank], k_pe = kv_lora[kv_lora_rank:]
+ ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens,
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0);
+ ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens,
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
+ // Note: Kimi MLA does NOT apply RoPE (rotary_emb=None in vLLM)
+ // k_pe is used directly without RoPE
+ // Normalize kv_c
+ kv_cmpr = build_norm(kv_cmpr, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il);
+
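+            // with wk_b/wv_b available, attention runs "absorbed" in the compressed
+            // kv_lora_rank space, so only {kv_lora_rank + rope} values per token are
+            // cached; otherwise K/V are decompressed per head (regular MHA cache)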
+ if (layer.wk_b && layer.wv_b) { // MLA KV cache enabled
+ // extract q_nope
+ ggml_tensor * q_nope =
+ ggml_view_3d(ctx0, Qcur, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(Qcur->type, n_embd_head_k_mla),
+ ggml_row_size(Qcur->type, n_embd_head_k_mla) * n_head, 0);
+ cb(q_nope, "q_nope", il);
+
+ // and {n_embd_head_qk_rope, n_head, n_tokens}
+ ggml_tensor * q_pe = ggml_view_3d(
+ ctx0, Qcur, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(Qcur->type, n_embd_head_k_mla),
+ ggml_row_size(Qcur->type, n_embd_head_k_mla) * n_head, ggml_row_size(Qcur->type, n_embd_head_qk_nope));
+ cb(q_pe, "q_pe", il);
+
+ // {n_embd_head_qk_nope, n_tokens, n_head}
+ q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
+ cb(q_nope, "q_nope_perm", il);
+
+ // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
+ ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, layer.wk_b, q_nope);
+ cb(q_nope_absorbed, "q_nope_absorbed", il);
+
+ // {kv_lora_rank, n_head, n_tokens}
+ q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
+ cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
+
+ // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
+ // note: rope must go first for in-place context shifting in build_rope_shift()
+ Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0);
+ cb(Qcur, "Qcur", il);
+
+ kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
+ cb(kv_cmpr, "kv_cmpr_reshape", il);
+
+ // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
+ ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0);
+ cb(Kcur, "Kcur", il);
+
+ // {kv_lora_rank, 1, n_tokens}
+ ggml_tensor * Vcur = kv_cmpr;
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn_k, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, layer.wv_b, kq_scale_mla, il);
+ cb(cur, "mla_out", il);
+ } else { // MLA KV cache disabled. Fall back to MHA KV cache.
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k_mla, n_head, n_tokens);
+ cb(Qcur, "mla_Q", il);
+ // KV decompression: kv = kv_b_proj(kv_c_normed)
+ ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_cmpr);
+ const int64_t kv_per_head = n_embd_head_qk_nope + n_embd_head_v_mla;
+
+ // Split kv into k_nope and v
+ ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(kv->type, kv_per_head),
+ ggml_row_size(kv->type, kv_per_head * n_head), 0);
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v_mla, n_head, n_tokens,
+ ggml_row_size(kv->type, kv_per_head),
+ ggml_row_size(kv->type, kv_per_head * n_head),
+ ggml_row_size(kv->type, n_embd_head_qk_nope));
+ Vcur = ggml_cont(ctx0, Vcur);
+ cb(Vcur, "mla_V", il);
+
+ // Concatenate k_nope + k_pe (broadcast k_pe to all heads)
+ // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens]
+ // and k_pe is [qk_rope_head_dim, 1, n_tokens] broadcast to all heads
+ // Need to broadcast k_pe from [qk_rope, 1, n_tokens] to [qk_rope, n_head, n_tokens]
+ ggml_tensor * k_pe_target = ggml_new_tensor_3d(ctx0, k_pe->type, n_embd_head_qk_rope, n_head, n_tokens);
+ ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target);
+ ggml_tensor * Kcur = ggml_concat(ctx0, k_pe_repeated, k_nope, 0);
+ cb(Kcur, "mla_K", il);
+
+ // Direct softmax attention (with MHA KV cache)
+ // Use build_attn with inp_attn for proper mask handling
+ cur = build_attn(inp_attn_kv, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
+ cb(cur, "mla_out", il);
+ }
+ } else {
+ // Unknown layer type - this should not happen
+ GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors");
+ }
+
+ // On last layer, select only the output tokens
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // Residual
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FFN Norm
+ cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
+ // Dense FFN layer
+ cur = build_ffn(cur,
+ layer.ffn_up, NULL, NULL,
+ layer.ffn_gate, NULL, NULL,
+ layer.ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE layer
+ // Kimi uses moe_renormalize=True and routed_scaling_factor (stored as expert_weights_scale) = 2.446
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ layer.ffn_gate_inp,
+ layer.ffn_up_exps,
+ layer.ffn_gate_exps,
+ layer.ffn_down_exps,
+ layer.ffn_exp_probs_b,
+ hparams.n_expert,
+ hparams.n_expert_used,
+ LLM_FFN_SILU, true,
+ true, hparams.expert_weights_scale,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // Shared expert
+ {
+ ggml_tensor * ffn_shexp = build_ffn(cur,
+ layer.ffn_up_shexp, NULL, NULL,
+ layer.ffn_gate_shexp, NULL, NULL,
+ layer.ffn_down_shexp, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+ }
+ // Residual
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ inpL = cur;
+ }
+ cur = inpL;
+
+ // Final Norm
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // Output
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+/*
+ This is a ggml implementation of the naive_chunk_kda function of
+ https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py
+*/
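+/*
+    Per-chunk recurrence implemented below (a sketch distilled from the in-code
+    comments; see the reference above for the exact formulation):
+      W_[t] = Akk @ (beta * k * exp(gk_cumsum))                   (k_cumdecay)
+      U_[t] = Akk @ (beta * v)                                    (vb)
+      o_[t] = (q * exp(gk_cumsum)) @ S + Aqk @ (U_[t] - W_[t] @ S)
+      S    <- exp(gk_last) * S + (exp(gk_last - gk_cumsum) * k)^T @ (U_[t] - W_[t] @ S)
+*/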
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_kimi_linear::build_kda_chunking(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * gk,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ ggml_tensor * causal_mask,
+ ggml_tensor * identity,
+ ggml_tensor * diag_mask,
+ int il) {
+ GGML_ASSERT(ggml_is_contiguous(state));
+
+ const int64_t S_k = q->ne[0];
+ const int64_t H_k = q->ne[1];
+ const int64_t n_tokens = q->ne[2];
+ const int64_t n_seqs = q->ne[3];
+
+ const int64_t S_v = v->ne[0];
+ const int64_t H_v = v->ne[1];
+
+ GGML_ASSERT(v->ne[2] == n_tokens);
+ GGML_ASSERT(k->ne[2] == n_tokens);
+ GGML_ASSERT(gk->ne[0] == S_v && gk->ne[1] == H_v && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs);
+ GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
+ GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs);
+
+ GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+
+ GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
+
+ // TODO: can this ever be false?
+ const bool use_qk_l2norm = true;
+
+ if (use_qk_l2norm) {
+ const float eps_norm = hparams.f_norm_rms_eps;
+
+ q = ggml_l2_norm(ctx0, q, eps_norm);
+ k = ggml_l2_norm(ctx0, k, eps_norm);
+ }
+
+ const float scale = 1.0f / sqrtf(S_v);
+
+ beta = ggml_sigmoid(ctx0, beta);
+
+ cb(q, "q_in", il);
+ cb(k, "k_in", il);
+ cb(v, "v_in", il);
+ cb(beta, "beta_in", il);
+ cb(gk, "gk_in", il);
+
+ q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs);
+ k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs);
+ v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+ gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+
+ beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
+ state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
+
+ cb(q, "q_perm", il);
+ cb(k, "k_perm", il);
+ cb(v, "v_perm", il);
+ cb(beta, "beta_perm", il);
+ cb(gk, "gk_perm", il);
+ cb(state, "state_in", il);
+
+ GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
+ GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
+ GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
+
+ // Do padding
+ const int64_t chunk_size = CHUNK_SIZE;
+
+ const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size;
+ const int64_t n_chunks = (n_tokens + pad) / chunk_size;
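+    // e.g. n_tokens == 100 gives pad == 28 and n_chunks == 2 with chunk_size == 64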
+
+ q = ggml_pad(ctx0, q, 0, pad, 0, 0);
+ k = ggml_pad(ctx0, k, 0, pad, 0, 0);
+ v = ggml_pad(ctx0, v, 0, pad, 0, 0);
+ gk = ggml_pad(ctx0, gk, 0, pad, 0, 0);
+ beta = ggml_pad(ctx0, beta, 0, pad, 0, 0);
+
+ cb(q, "q_pad", il);
+ cb(k, "k_pad", il);
+ cb(v, "v_pad", il);
+ cb(beta, "beta_pad", il);
+ cb(gk, "gk_pad", il);
+
+ ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
+ ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
+
+ cb(v_beta, "v_beta", il);
+ cb(k_beta, "k_beta", il);
+
+ const int64_t HB = H_k * n_seqs;
+
+ q = ggml_cont_4d(ctx0, q, S_k, chunk_size, n_chunks, HB);
+ k = ggml_cont_4d(ctx0, k, S_k, chunk_size, n_chunks, HB);
+ k_beta = ggml_cont_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, HB);
+ v = ggml_cont_4d(ctx0, v, S_v, chunk_size, n_chunks, HB);
+ v_beta = ggml_cont_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, HB);
+
+ gk = ggml_cont_4d(ctx0, gk, S_k, chunk_size, n_chunks, HB);
+ beta = ggml_cont_4d(ctx0, beta, 1, chunk_size, n_chunks, HB);
+
+ // switch for cumsum
+ gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 1, 0, 2, 3), chunk_size, S_k, n_chunks, HB);
+ cb(gk, "gk", il);
+ ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk);
+ cb(gk_cumsum, "gk_cumsum", il);
+
+/*
+ Compute Akk and Aqk loop together
+ Akk loop:
+ for i in range(BT):
+ k_i = k[..., i, :] # k_i [B,H,NT,S]
+ g_i = g[..., i:i+1, :] # g_i [B,H,NT,1,S]
+ A[..., i] = torch.einsum('... c d, ... d -> ... c', k * (g - g_i).exp(), k_i)
+ Aqk loop:
+ for j in range(BT):
+ k_j = k[:, :, i, j]
+ g_j = g[:, :, i, j:j+1, :]
+ A[..., j] = torch.einsum('... c d, ... d -> ... c', q_i * (g_i - g_j).exp(), k_j)
+*/
+ const int64_t CHB = n_chunks * H_k * n_seqs;
+ ggml_tensor * gkcs_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB]
+ ggml_tensor * gkcs_j = ggml_reshape_4d(ctx0, gkcs_i, 1, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB]
+
+ ggml_tensor * gkcs_j_bc = ggml_repeat_4d(ctx0, gkcs_j, chunk_size, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB] -> [chunk_size, chunk_size, S_k, CHB]
+ // decay_mask [chunk_size,chunk_size,S_k,CHB]
+ ggml_tensor * decay_mask = ggml_sub(ctx0, gkcs_j_bc, gkcs_i);
+ cb(decay_mask, "decay_mask", il);
+
+ decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
+ cb(decay_mask, "decay_masked", il);
+ decay_mask = ggml_exp(ctx0, decay_mask);
+ decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
+
+ // decay_mask [S_k,BT_j,BT_i,CHB] *Note* second and third chunk_sizes are switched
+ decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB);
+
+ ggml_tensor * k_i = ggml_reshape_4d(ctx0, k, S_k, chunk_size, 1, CHB);
+ ggml_tensor * k_j = ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB);
+ ggml_tensor * q_i = ggml_reshape_4d(ctx0, q, S_k, chunk_size, 1, CHB);
+
+ ggml_tensor * decay_k_i = ggml_mul(ctx0, decay_mask, k_i);
+ ggml_tensor * decay_q_i = ggml_mul(ctx0, decay_mask, q_i);
+
+    // decay_k_i [S,BT,BT,CHB] @ k_j [S,1,BT,CHB] = Akk [BT,1,BT,CHB]
+ ggml_tensor * Akk = ggml_mul_mat(ctx0, decay_k_i, k_j);
+ ggml_tensor * Aqk = ggml_mul_mat(ctx0, decay_q_i, k_j);
+ Akk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Akk, chunk_size, chunk_size, n_chunks, HB)));
+ Aqk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, n_chunks, HB)));
+ cb(Akk, "Akk", il);
+ cb(Aqk, "Aqk", il);
+
+ Akk = ggml_mul(ctx0, Akk, beta);
+ Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask));
+ cb(Akk, "attn_pre_solve", il);
+
+ Aqk = ggml_mul(ctx0, Aqk, diag_mask);
+ Aqk = ggml_scale(ctx0, Aqk, scale); // scale q
+ cb(Aqk, "Aqk_masked", il);
+
+ // for i in range(1, chunk_size):
+ // row = attn[..., i, :i].clone()
+ // sub = attn[..., :i, :i].clone()
+ // attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
+ // attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
+ //
+ // We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A)
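+    // i.e. solve (I - tril(Akk)) X = Akk, keep the strictly lower part of X, and add I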
+ ggml_tensor * attn_lower = ggml_mul(ctx0, Akk, causal_mask);
+ ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
+
+ ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, Akk, true, true, false);
+ Akk = ggml_mul(ctx0, lin_solve, causal_mask);
+ Akk = ggml_add(ctx0, Akk, identity);
+
+ cb(Akk, "attn_solved", il);
+
+ // switch back for downstream
+ gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB);
+ ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum);
+ cb(gk_cumsum, "gk_cumsum", il);
+
+ // u = (A*beta[..., None, :]) @ v aka U_[t]
+ ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk);
+
+ ggml_tensor * kbeta_gkexp = ggml_mul(ctx0, k_beta, gkexp);
+ cb(kbeta_gkexp, "kbeta_gkexp", il);
+
+ ggml_tensor * k_cumdecay = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gkexp)), Akk);
+ cb(k_cumdecay, "k_cumdecay", il);
+
+ ggml_tensor * core_attn_out = nullptr;
+ ggml_tensor * new_state = ggml_dup(ctx0, state);
+
+ cb(new_state, "new_state", il);
+
+ for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
+        // extract one chunk worth of data
+ auto chunkify = [=](ggml_tensor * t) {
+ return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3],
+ t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
+ };
+ auto chunkify_A = [=](ggml_tensor * t) {
+ return ggml_cont(ctx0, ggml_view_4d(ctx0, t, chunk_size, chunk_size, 1, t->ne[3],
+ t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
+ };
+
+        // k [S,BT,NT,H*B] => k_chunk [S,BT,1,H*B]
+ ggml_tensor * k_chunk = chunkify(k);
+ ggml_tensor * q_chunk = chunkify(q);
+ ggml_tensor * vb_chunk = chunkify(vb);
+
+        // gk_cumsum [S,BT,NT,H*B] => gk_cs_chunk [S,BT,1,H*B]
+ ggml_tensor * gk_cs_chunk = chunkify(gk_cumsum);
+ ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay);
+ ggml_tensor * gkexp_chunk = ggml_exp(ctx0, gk_cs_chunk);
+ ggml_tensor * Aqk_chunk = chunkify_A(Aqk);
+
+ ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
+
+ // new_state [S,S,1,H*B] k_cumdecay_chunk [S,BT,1,H*B]
+ // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state or W_[t] @ S_[t]
+ ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
+
+ // v_new = v_i - v_prime or U_[t] - W_[t]*S_[t]
+ ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, vb_chunk, v_prime), v_prime);
+ ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
+
+ // q_chunk [S,BT,1,H*B] gkexp_chunk [S,BT,1,H*B]
+ // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
+ // or Gamma_[t]*Q_]t] @ S
+ ggml_tensor * q_gk_exp = ggml_mul(ctx0, q_chunk, gkexp_chunk);
+ ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_gk_exp);
+ attn_inter = ggml_scale(ctx0, attn_inter, scale); // scale q
+
+ // v_new_t [S,BT,1,H*B] Aqk [BT,BT,1,H*B]
+ // core_attn_out[:, :, i] = attn_inter + attn @ v_new or A' @ (U_[t] - W_[t]*S_[t])
+ ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk_chunk);
+
+ // o[:, :, i] = (q_i * g_i.exp()) @ S + A @ v_i
+ ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
+
+ core_attn_out = core_attn_out == nullptr ? core_attn_out_chunk : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 1);
+
+ ggml_tensor * gk_cum_last =
+ ggml_cont(ctx0, ggml_view_4d(ctx0, gk_cs_chunk, gk_cs_chunk->ne[0], 1, gk_cs_chunk->ne[2], gk_cs_chunk->ne[3],
+ gk_cs_chunk->nb[1], gk_cs_chunk->nb[2], gk_cs_chunk->nb[3],
+ gk_cs_chunk->nb[1] * (gk_cs_chunk->ne[1] - 1)));
+
+ ggml_tensor * gkexp_last = ggml_exp(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, gk_cum_last)));
+
+ ggml_tensor * gk_diff = ggml_neg(ctx0, ggml_sub(ctx0, gk_cs_chunk, gk_cum_last));
+
+ ggml_tensor * gk_diff_exp = ggml_exp(ctx0, gk_diff);
+
+ ggml_tensor * key_gkdiff = ggml_mul(ctx0, k_chunk, gk_diff_exp);
+
+ // rearrange((g_i[:,:,-1:] - g_i).exp()*k_i, 'b h c k -> b h k c') @ (U_[t] - W_[t] @ S)
+ ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gkdiff)));
+
+ new_state = ggml_add(ctx0,
+ ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gkexp_last, gkexp_last->ne[0], gkexp_last->ne[1], H_v, n_seqs)),
+ ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
+ }
+
+ core_attn_out = ggml_cont_4d(ctx0, core_attn_out, S_v, chunk_size * n_chunks, H_v, n_seqs);
+
+ // truncate padded tokens
+ ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
+ S_v, n_tokens, H_v, n_seqs,
+ ggml_row_size(core_attn_out->type, S_v),
+ ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
+ ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
+ output_tokens = ggml_cont(ctx0, output_tokens);
+ // permute back to (S_v, H_v, n_tokens, n_seqs)
+ output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3);
+ output_tokens = ggml_cont(ctx0, output_tokens);
+
+ cb(new_state, "output_state", il);
+
+ return {output_tokens, new_state};
+}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_kimi_linear::build_kda_autoregressive(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * gk,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ int il) {
+ GGML_ASSERT(ggml_is_contiguous(v));
+ GGML_ASSERT(ggml_is_contiguous(gk));
+
+ const int64_t S_k = q->ne[0];
+ const int64_t H_k = q->ne[1];
+ const int64_t n_tokens = q->ne[2];
+ const int64_t n_seqs = q->ne[3];
+
+ const int64_t S_v = v->ne[0];
+ const int64_t H_v = v->ne[1];
+
+ GGML_ASSERT(n_tokens == 1);
+ GGML_ASSERT(v->ne[2] == n_tokens);
+ GGML_ASSERT(k->ne[2] == n_tokens);
+ GGML_ASSERT(gk->ne[0] == S_k && gk->ne[1] == H_k && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs);
+ GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
+ GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_k && state->ne[2] == H_v && state->ne[3] == n_seqs);
+
+ GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+
+ GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
+
+ const float eps_norm = hparams.f_norm_rms_eps;
+
+ q = ggml_l2_norm(ctx0, q, eps_norm);
+ k = ggml_l2_norm(ctx0, k, eps_norm);
+
+ const float scale = 1.0f / sqrtf(S_v);
+
+ q = ggml_scale(ctx0, q, scale);
+ beta = ggml_sigmoid(ctx0, beta);
+
+ cb(q, "q_in", il);
+ cb(k, "k_in", il);
+ cb(v, "v_in", il);
+ cb(beta, "beta_in", il);
+ cb(gk, "gk_in", il);
+
+ // g [H,1,B,1] g_t [1,H,B,1] => [1,1,H,B]
+ // gk [S,H,1,B] => [S,1,H,B] gk_t [1,S,H,B]
+ // beta [H,1,1,B] beta_t [1,H,1,B] => [1,1,H,B]
+ gk = ggml_reshape_4d(ctx0, gk, S_k, 1, H_k, n_seqs);
+ ggml_tensor * gk_t = ggml_cont(ctx0, ggml_transpose(ctx0, gk));
+ ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
+
+ // Apply exponential to gk_t
+ gk_t = ggml_exp(ctx0, gk_t);
+ // Apply the gated delta rule for the single timestep
+ // last_recurrent_state = last_recurrent_state * gk_t
+ // S = S * g_i[..., None].exp()
+ state = ggml_mul(ctx0, state, gk_t);
+
+ ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state));
+
+ // state [S,S,H,B] k [S,1,H,B] k_state [S_v,1,H,B]
+ k = ggml_reshape_4d(ctx0, k, S_k, 1, H_k, n_seqs);
+ ggml_tensor * k_state = ggml_mul_mat(ctx0, state_t, k);
+
+ // v_i - (k_i[..., None] * S).sum(-2)
+ v = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
+ ggml_tensor * v_diff = ggml_sub(ctx0, v, k_state);
+
+ // b_i[..., None] * k_i
+ ggml_tensor * k_beta = ggml_mul(ctx0, k, beta_t);
+
+ // S = S + torch.einsum('b h k, b h v -> b h k v', b_i[..., None] * k_i, v_i - (k_i[..., None] * S).sum(-2))
+ // v_diff_t [1,S_v,H,B] k_beta_t [1,S_k,H,B] state [S_v,S_k,H,B]
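+ // the two transposes turn ggml_mul_mat into an outer product:
+ // state[v,k] += v_diff[v] * k_beta[k]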
+ state = ggml_add(ctx0, state, ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_diff)), ggml_cont(ctx0, ggml_transpose(ctx0, k_beta))));
+
+ q = ggml_reshape_4d(ctx0, q, S_k, 1, H_k, n_seqs);
+ state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state));
+ ggml_tensor * core_attn_out = ggml_mul_mat(ctx0, state_t, q);
+ // core_attn_out should be [S_v, 1, H_v, n_seqs] after this
+ cb(core_attn_out, "output_tokens", il);
+ cb(state, "new_state", il);
+
+ return {core_attn_out, state};
+}
+
diff --git a/llama.cpp/src/models/lfm2.cpp b/llama.cpp/src/models/lfm2.cpp
new file mode 100644
index 0000000..7f805d7
--- /dev/null
+++ b/llama.cpp/src/models/lfm2.cpp
@@ -0,0 +1,175 @@
+#include "models.h"
+
+#include "../llama-memory-hybrid.h"
+
+llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params),
+ model(model) {
+ ggml_tensor * cur = build_inp_embd(model.tok_embd);
+ cb(cur, "model.embed_tokens", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ ggml_tensor * inp_pos = build_inp_pos();
+ auto * inp_hybrid = build_inp_mem_hybrid();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const bool is_moe_layer = il >= static_cast<int>(hparams.n_layer_dense_lead);
+
+ auto * prev_cur = cur;
+ cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "model.layers.{}.operator_norm", il);
+
+ cur = hparams.is_recurrent(il) ? build_shortconv_block(cur, inp_hybrid->get_recr(), il) :
+ build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids);
+ }
+
+ cur = ggml_add(ctx0, prev_cur, cur);
+
+ auto * ffn_norm_out = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(ffn_norm_out, "model.layers.{}.ffn_norm", il);
+
+ ggml_tensor * ffn_out =
+ is_moe_layer ? build_moe_feed_forward(ffn_norm_out, il) : build_dense_feed_forward(ffn_norm_out, il);
+ cb(ffn_out, "model.layers.{}.ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_out);
+ }
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output", -1);
+
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+ggml_tensor * llm_build_lfm2::build_moe_feed_forward(ggml_tensor * cur, int il) const {
+ return build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0,
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
+}
+
+ggml_tensor * llm_build_lfm2::build_dense_feed_forward(ggml_tensor * cur, int il) const {
+ GGML_ASSERT(!model.layers[il].ffn_up_b);
+ GGML_ASSERT(!model.layers[il].ffn_gate_b);
+ GGML_ASSERT(!model.layers[il].ffn_down_b);
+ return build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+}
+
+ggml_tensor * llm_build_lfm2::build_attn_block(ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ llm_graph_input_attn_kv * inp_attn,
+ int il) const {
+ GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
+ const auto n_embd_head = hparams.n_embd_head_v;
+ const auto n_head_kv = hparams.n_head_kv(il);
+
+ auto * q = build_lora_mm(model.layers[il].wq, cur);
+ cb(q, "model.layers.{}.self_attn.q_proj", il);
+ auto * k = build_lora_mm(model.layers[il].wk, cur);
+ cb(k, "model.layers.{}.self_attn.k_proj", il);
+ auto * v = build_lora_mm(model.layers[il].wv, cur);
+ cb(v, "model.layers.{}.self_attn.v_proj", il);
+
+ q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
+ k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+ v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+ // qk norm
+ q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(q, "model.layers.{}.self_attn.q_layernorm", il);
+ k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(k, "model.layers.{}.self_attn.k_layernorm", il);
+
+ // RoPE
+ q = ggml_rope_ext(ctx0, q, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
+ attn_factor, beta_fast, beta_slow);
+ k = ggml_rope_ext(ctx0, k, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
+ attn_factor, beta_fast, beta_slow);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ q, k, v, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+
+ cb(cur, "model.layers.{}.self_attn.out_proj", il);
+
+ return cur;
+}
+
+ggml_tensor * llm_build_lfm2::build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il) {
+ const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
+ const uint32_t kv_head = mctx_cur->get_head();
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+ const int64_t n_seqs = ubatch.n_seqs;
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+ GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
+ const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;
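+ // the recurrent cache keeps the last d_conv columns of the conv input;
+ // after the concat below every token sees d_conv past positions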
+
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+ auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
+ cb(bcx, "model.layers.{}.conv.in_proj", il);
+
+ constexpr auto n_chunks = 3;
+ GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
+ const auto chunk_size = bcx->ne[0] / n_chunks;
+ auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
+ 0 * chunk_size * ggml_element_size(bcx));
+ auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
+ 1 * chunk_size * ggml_element_size(bcx));
+ auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
+ 2 * chunk_size * ggml_element_size(bcx));
+
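+ // b gates the conv input elementwise; c gates the conv output further
+ // below, i.e. y = c * conv(b * x)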
+ auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
+
+ // read conv state
+ auto * conv_state = mctx_cur->get_r_l(il);
+ auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
+ auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
+
+ bx = ggml_concat(ctx0, conv, bx, 0);
+ GGML_ASSERT(bx->ne[0] > conv->ne[0]);
+
+ // the last d_conv columns are the new conv state
+ auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2],
+ (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx));
+ GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
+
+ // write the new conv state back into the recurrent cache
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv,
+ ggml_view_1d(ctx0, conv_state, ggml_nelements(new_conv),
+ kv_head * d_conv * n_embd * ggml_element_size(new_conv))));
+
+ auto * conv_kernel = model.layers[il].shortconv.conv;
+ auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
+ cb(conv_out, "model.layers.{}.conv.conv", il);
+
+ auto * y = ggml_mul(ctx0, c, conv_out);
+ y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
+ cb(y, "model.layers.{}.conv.out_proj", il);
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+ y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);
+
+ return y;
+}
diff --git a/llama.cpp/src/models/llada-moe.cpp b/llama.cpp/src/models/llada-moe.cpp
new file mode 100644
index 0000000..5f64686
--- /dev/null
+++ b/llama.cpp/src/models/llada-moe.cpp
@@ -0,0 +1,122 @@
+#include "models.h"
+
+llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
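+ // LLaDA-MoE is a diffusion model: attention is non-causal and there is
+ // no KV cache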
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/llada.cpp b/llama.cpp/src/models/llada.cpp
new file mode 100644
index 0000000..8570336
--- /dev/null
+++ b/llama.cpp/src/models/llada.cpp
@@ -0,0 +1,99 @@
+#include "models.h"
+
+llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ // LLaDA is similar to LLaMA but uses non-causal attention for diffusion
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // Non-causal attention for diffusion
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/llama-iswa.cpp b/llama.cpp/src/models/llama-iswa.cpp
new file mode 100644
index 0000000..61dd2c1
--- /dev/null
+++ b/llama.cpp/src/models/llama-iswa.cpp
@@ -0,0 +1,178 @@
+#include "models.h"
+
+llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // temperature tuning
+ ggml_tensor * inp_attn_scale = build_inp_attn_scale();
+
+ auto * inp_attn = build_attn_inp_kv_iswa();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ ggml_tensor * inpSA = inpL;
+
+ // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
+ const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+ (il + 1) % hparams.n_no_rope_layer_step != 0;
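+ // NoPE layers skip RoPE; they get the position-dependent attention
+ // temperature scale applied to Qcur instead (see below)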
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ if (use_rope) {
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ } else if (inp_attn_scale) {
+ Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+ }
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ if (use_rope && hparams.use_kq_norm) {
+ // Llama4TextL2Norm
+ Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+ Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+ cb(Qcur, "Qcur_normed", il);
+ cb(Kcur, "Kcur_normed", il);
+ }
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network (non-MoE)
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+ il);
+
+ // Shared experts
+ ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(shexp_out, "ffn_moe_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, shexp_out);
+ cb(cur, "ffn_moe_out_merged", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/llama.cpp b/llama.cpp/src/models/llama.cpp
new file mode 100644
index 0000000..42b5fcd
--- /dev/null
+++ b/llama.cpp/src/models/llama.cpp
@@ -0,0 +1,168 @@
+#include "models.h"
+
+template <bool embed>
+llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
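+ // embed=true builds the cache-free (embedding) variant of this graph;
+ // embed=false uses the regular KV-cache attention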
+ using inp_attn_type = std::conditional_t<embed, llm_graph_input_attn_no_cache, llm_graph_input_attn_kv>;
+
+ inp_attn_type * inp_attn = nullptr;
+ if constexpr (embed) {
+ inp_attn = build_attn_inp_no_cache();
+ } else {
+ inp_attn = build_attn_inp_kv();
+ }
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ if (hparams.use_kq_norm) {
+ // Llama4TextL2Norm
+ Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+ Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+ cb(Qcur, "Qcur_normed", il);
+ cb(Kcur, "Kcur_normed", il);
+ }
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network (non-MoE)
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ if constexpr (!embed) {
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+ }
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+template struct llm_build_llama<false>;
+template struct llm_build_llama<true>;
diff --git a/llama.cpp/src/models/maincoder.cpp b/llama.cpp/src/models/maincoder.cpp
new file mode 100644
index 0000000..da57308
--- /dev/null
+++ b/llama.cpp/src/models/maincoder.cpp
@@ -0,0 +1,117 @@
+#include "models.h"
+
+llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/mamba.cpp b/llama.cpp/src/models/mamba.cpp
new file mode 100644
index 0000000..4681961
--- /dev/null
+++ b/llama.cpp/src/models/mamba.cpp
@@ -0,0 +1,55 @@
+#include "models.h"
+
+llm_build_mamba::llm_build_mamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ // {n_embd, n_tokens}
+ inpL = build_inp_embd(model.tok_embd);
+
+ auto * rs_inp = build_rs_inp();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
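+ // Mamba and Mamba2 share this graph; only the per-layer SSM block differs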
+ if (model.arch == LLM_ARCH_MAMBA2) {
+ cur = build_mamba2_layer(rs_inp, cur, model, ubatch, il);
+ } else {
+ cur = build_mamba_layer(rs_inp, cur, model, ubatch, il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // residual
+ cur = ggml_add(ctx0, cur, inpL);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ // final rmsnorm
+ cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
diff --git a/llama.cpp/src/models/mimo2-iswa.cpp b/llama.cpp/src/models/mimo2-iswa.cpp
new file mode 100644
index 0000000..edc87cc
--- /dev/null
+++ b/llama.cpp/src/models/mimo2-iswa.cpp
@@ -0,0 +1,123 @@
+#include "models.h"
+
+llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ ggml_tensor * inp_pos = build_inp_pos();
+ auto * inp_attn = build_attn_inp_kv_iswa();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ uint32_t n_head_l = hparams.n_head(il);
+ uint32_t n_head_kv_l = hparams.n_head_kv(il);
+ const float freq_base_l = model.get_rope_freq_base(cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ cur = inpL;
+
+ // self_attention
+ {
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ ggml_tensor * sinks = model.layers[il].attn_sinks;
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, sinks, nullptr, 1.0f/sqrtf(float(n_embd_head_k)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+ // dense branch
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
+ 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
+ cb(cur, "ffn_moe_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/minicpm3.cpp b/llama.cpp/src/models/minicpm3.cpp
new file mode 100644
index 0000000..297cc34
--- /dev/null
+++ b/llama.cpp/src/models/minicpm3.cpp
@@ -0,0 +1,200 @@
+#include "models.h"
+
+llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ // TODO: if the model varies, these parameters need to be read from the model
+ const int64_t n_embd_base = 256;
+ const float scale_embd = 12.0f;
+ const float scale_depth = 1.4f;
+ const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
+
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
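+
+ // MLA-style split: Q/K have a RoPE'd part (n_rot dims; shared across
+ // heads for K) and a non-positional "nope" part that goes through the
+ // low-rank (kv_lora_rank) compression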
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // scale the input embeddings
+ inpL = ggml_scale(ctx0, inpL, scale_embd);
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ ggml_tensor * q = NULL;
+ // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+ cb(q, "q", il);
+
+ q = build_norm(q,
+ model.layers[il].attn_q_a_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(q, "q", il);
+
+ // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+ cb(q, "q", il);
+
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
+ ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(q->type, hparams.n_embd_head_k),
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ 0);
+ cb(q_nope, "q_nope", il);
+
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
+ ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+ ggml_row_size(q->type, hparams.n_embd_head_k),
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, n_embd_head_qk_nope));
+ cb(q_pe, "q_pe", il);
+
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+ ggml_tensor * kv_pe_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+ cb(kv_pe_compressed, "kv_pe_compressed", il);
+
+ // split into {kv_lora_rank, n_tokens}
+ ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compressed, kv_lora_rank, n_tokens,
+ kv_pe_compressed->nb[1],
+ 0);
+ cb(kv_compressed, "kv_compressed", il);
+
+ // and {n_embd_head_qk_rope, n_tokens}
+ ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compressed, n_embd_head_qk_rope, 1, n_tokens,
+ kv_pe_compressed->nb[1],
+ kv_pe_compressed->nb[1],
+ ggml_row_size(kv_pe_compressed->type, kv_lora_rank));
+ cb(k_pe, "k_pe", il);
+
+ kv_compressed = build_norm(kv_compressed,
+ model.layers[il].attn_kv_a_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(kv_compressed, "kv_compressed", il);
+
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+ ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+ cb(kv, "kv", il);
+
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
+ ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ 0);
+ cb(k_nope, "k_nope", il);
+
+ // and {n_head * n_embd_head_v, n_tokens}
+ ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+ cb(v_states, "v_states", il);
+
+ v_states = ggml_cont(ctx0, v_states);
+ cb(v_states, "v_states", il);
+
+ q_pe = ggml_rope_ext(
+ ctx0, q_pe, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(q_pe, "q_pe", il);
+
+ // shared RoPE key
+ k_pe = ggml_rope_ext(
+ ctx0, k_pe, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(k_pe, "k_pe", il);
+
+ ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
+ cb(q_states, "q_states", il);
+
+ ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+ cb(k_states, "k_states", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ // scale_res - scale the hidden states for residual connection
+ const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
+ cur = ggml_scale(ctx0, cur, scale_res);
+ cb(cur, "hidden_scaled", il);
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ // scale the hidden states for residual connection
+ cur = ggml_scale(ctx0, cur, scale_res);
+ cb(cur, "hidden_scaled_ffn", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head scaling
+ const float scale_lmhead = float(n_embd_base)/float(n_embd);
+ cur = ggml_scale(ctx0, cur, scale_lmhead);
+ cb(cur, "lmhead_scaling", -1);
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/minimax-m2.cpp b/llama.cpp/src/models/minimax-m2.cpp
new file mode 100644
index 0000000..f7001ba
--- /dev/null
+++ b/llama.cpp/src/models/minimax-m2.cpp
@@ -0,0 +1,124 @@
+#include "models.h"
+
+llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ // GGML_ASSERT(n_embd_head == hparams.n_rot); does not hold for MiniMax-M2: head_dim = 128, n_rot = 64
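+ // ggml_rope_ext below rotates only the first n_rot dims of each head and
+ // leaves the rest unrotated (partial RoPE)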
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ ggml_tensor * inp_pos = build_inp_pos();
+ auto * inp_attn = build_attn_inp_kv();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = inpL;
+
+ // self_attention
+ {
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(cur, "ffn_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/mistral3.cpp b/llama.cpp/src/models/mistral3.cpp
new file mode 100644
index 0000000..0b67223
--- /dev/null
+++ b/llama.cpp/src/models/mistral3.cpp
@@ -0,0 +1,160 @@
+#include "models.h"
+
+llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // (optional) temperature tuning
+ ggml_tensor * inp_attn_scale = nullptr;
+ if (hparams.f_attn_temp_scale != 0.0f) {
+ inp_attn_scale = build_inp_attn_scale();
+ }
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ if (inp_attn_scale) {
+ // apply llama 4 temperature scaling
+ Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+ cb(Qcur, "Qcur_attn_temp_scaled", il);
+ }
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network (non-MoE)
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/models.h b/llama.cpp/src/models/models.h
new file mode 100644
index 0000000..3c66d32
--- /dev/null
+++ b/llama.cpp/src/models/models.h
@@ -0,0 +1,723 @@
+#pragma once
+
+#include "../llama-model.h"
+#include "../llama-graph.h"
+
+// TODO: remove in follow-up PR - move to .cpp files
+#include "../llama-memory-recurrent.h"
+#include <cmath>
+
+struct llm_graph_context_mamba : public llm_graph_context {
+ llm_graph_context_mamba(const llm_graph_params & params);
+
+ virtual ~llm_graph_context_mamba() = default;
+
+ ggml_tensor * build_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il);
+ ggml_tensor * build_mamba2_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il) const;
+
+};
+
+// Base class for RWKV-related models
+struct llm_build_rwkv6_base : public llm_graph_context {
+ const llama_model & model;
+
+ llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params);
+
+ virtual ~llm_build_rwkv6_base() = default;
+
+ ggml_tensor * build_rwkv6_channel_mix(const llama_layer * layer,
+ ggml_tensor * cur,
+ ggml_tensor * x_prev,
+ llm_arch arch) const;
+
+ ggml_tensor * build_rwkv6_time_mix(llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ ggml_tensor * x_prev,
+ const llama_ubatch & ubatch,
+ int il) const;
+};
+
+// Base class for RWKV7-related models
+struct llm_build_rwkv7_base : public llm_graph_context {
+ const llama_model & model;
+
+ llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params);
+
+ virtual ~llm_build_rwkv7_base() = default;
+
+ // RWKV7-specific graph building methods
+ ggml_tensor * build_rwkv7_channel_mix(const llama_layer * layer,
+ ggml_tensor * cur,
+ ggml_tensor * x_prev,
+ llm_arch arch) const;
+ ggml_tensor * build_rwkv7_time_mix(llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ ggml_tensor * x_prev,
+ ggml_tensor *& first_layer_value,
+ const llama_ubatch & ubatch,
+ int il) const;
+};
+
+struct llm_build_afmoe : public llm_graph_context {
+ llm_build_afmoe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_apertus : public llm_graph_context {
+ llm_build_apertus(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_arcee : public llm_graph_context {
+ llm_build_arcee(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_arctic : public llm_graph_context {
+ llm_build_arctic(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_arwkv7 : public llm_build_rwkv7_base {
+ llm_build_arwkv7(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_baichuan : public llm_graph_context {
+ llm_build_baichuan(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_bailingmoe2 : public llm_graph_context {
+ llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_bailingmoe : public llm_graph_context {
+ llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_bert : public llm_graph_context {
+ llm_build_bert(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_bitnet : public llm_graph_context {
+ llm_build_bitnet(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_bloom : public llm_graph_context {
+ llm_build_bloom(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_chameleon : public llm_graph_context {
+ llm_build_chameleon(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_chatglm : public llm_graph_context {
+ llm_build_chatglm(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_codeshell : public llm_graph_context {
+ llm_build_codeshell(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_cogvlm : public llm_graph_context {
+ llm_build_cogvlm(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_cohere2_iswa : public llm_graph_context {
+ llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_command_r : public llm_graph_context {
+ llm_build_command_r(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_dbrx : public llm_graph_context {
+ llm_build_dbrx(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_deci : public llm_graph_context {
+ llm_build_deci(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_deepseek2 : public llm_graph_context {
+ llm_build_deepseek2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_deepseek : public llm_graph_context {
+ llm_build_deepseek(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_dots1 : public llm_graph_context {
+ llm_build_dots1(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_dream : public llm_graph_context {
+ llm_build_dream(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_ernie4_5 : public llm_graph_context {
+ llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_ernie4_5_moe : public llm_graph_context {
+ llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params);
+};
+
+template <bool iswa>
+struct llm_build_exaone4 : public llm_graph_context {
+ llm_build_exaone4(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_exaone : public llm_graph_context {
+ llm_build_exaone(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_exaone_moe : public llm_graph_context {
+ llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_falcon : public llm_graph_context {
+ llm_build_falcon(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_falcon_h1 : public llm_graph_context_mamba {
+ llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_gemma2_iswa : public llm_graph_context {
+ llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params);
+};
+
+template <bool iswa>
+struct llm_build_gemma3 : public llm_graph_context {
+ llm_build_gemma3(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_gemma3n_iswa : public llm_graph_context {
+ const llama_model & model;
+
+ const int64_t n_embd_head;
+ const int64_t n_embd_altup;
+ const int64_t n_altup;
+ const int i_altup_act;
+ const int n_layer_sparsity = 10; // number of layers using activation sparsity
+ const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
+
+ llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params);
+ ggml_tensor * calc_magnitude(ggml_tensor * x);
+ ggml_tensor * view_2d_slice(ggml_tensor * x, int idx);
+ ggml_tensor * get_per_layer_inputs();
+ ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer);
+ ggml_tensor * gaussian_topk(ggml_tensor * x);
+ ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il);
+ ggml_tensor * altup_predict(ggml_tensor * cur, int il);
+ ggml_tensor * laurel(ggml_tensor * cur, int il);
+ ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il);
+};
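+
+// Note: f_sparsity_std_mul is the z-score of the 95th percentile of a standard
+// normal, so gaussian_topk effectively keeps only activations more than ~1.645
+// standard deviations above the mean. A minimal verification sketch (assumes
+// only <cmath>; not part of the build):
+//
+//   #include <cmath>
+//   // standard normal CDF: Phi(x) = 0.5 * (1 + erf(x / sqrt(2)))
+//   double phi = 0.5 * (1.0 + std::erf(1.6448533535003662 / std::sqrt(2.0)));
+//   // phi ~= 0.95, matching normal_dist.icdf(0.95) from the comment above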
+
+struct llm_build_gemma_embedding : public llm_graph_context {
+ llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_gemma : public llm_graph_context {
+ llm_build_gemma(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_glm4 : public llm_graph_context {
+ llm_build_glm4(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_glm4_moe : public llm_graph_context {
+ llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_gpt2 : public llm_graph_context {
+ llm_build_gpt2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_gptneox : public llm_graph_context {
+ llm_build_gptneox(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_granite : public llm_graph_context {
+ llm_build_granite(const llama_model & model, const llm_graph_params & params);
+
+private:
+ ggml_tensor * build_attention_layer(
+ ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ llm_graph_input_attn_kv * inp_attn,
+ const llama_model & model,
+ const int64_t n_embd_head,
+ const int il);
+
+ ggml_tensor * build_layer_ffn(
+ ggml_tensor * cur,
+ ggml_tensor * inpSA,
+ const llama_model & model,
+ const int il);
+};
+
+struct llm_build_granite_hybrid : public llm_graph_context_mamba {
+ llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params);
+ ggml_tensor * build_layer_ffn(ggml_tensor * cur, ggml_tensor * inpSA, const llama_model & model, const int il);
+ ggml_tensor * build_attention_layer(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn,
+ const llama_model & model, const int64_t n_embd_head, const int il);
+};
+
+struct llm_build_grok : public llm_graph_context {
+ llm_build_grok(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_grovemoe : public llm_graph_context {
+ llm_build_grovemoe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_hunyuan_dense : public llm_graph_context {
+ llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_hunyuan_moe : public llm_graph_context {
+ llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_internlm2 : public llm_graph_context {
+ llm_build_internlm2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_jais : public llm_graph_context {
+ llm_build_jais(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_jamba : public llm_graph_context_mamba {
+ llm_build_jamba(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_kimi_linear : public llm_graph_context_mamba {
+ llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params);
+
+ std::pair<ggml_tensor *, ggml_tensor *> build_kda_autoregressive(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * gk,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ int il);
+
+ std::pair<ggml_tensor *, ggml_tensor *> build_kda_chunking(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * gk,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ ggml_tensor * causal_mask,
+ ggml_tensor * identity,
+ ggml_tensor * diag_mask,
+ int il);
+
+ const llama_model & model;
+};
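+
+// Both build_kda_* helpers return the pair (layer output, updated recurrent
+// state): the chunked variant is the batched prefill construction, while the
+// autoregressive variant is the per-token decode path. The extra mask tensors
+// taken by the chunked path are built once and supplied by the caller.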
+
+struct llm_build_lfm2 : public llm_graph_context {
+ const llama_model & model;
+
+ llm_build_lfm2(const llama_model & model, const llm_graph_params & params);
+ ggml_tensor * build_moe_feed_forward(ggml_tensor * cur, int il) const;
+ ggml_tensor * build_dense_feed_forward(ggml_tensor * cur, int il) const;
+ ggml_tensor * build_attn_block(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn, int il) const;
+ ggml_tensor * build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il);
+};
+
+struct llm_build_llada : public llm_graph_context {
+ llm_build_llada(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_llada_moe : public llm_graph_context {
+ llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
+};
+
+template <bool embed>
+struct llm_build_llama : public llm_graph_context {
+ llm_build_llama(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_llama_iswa : public llm_graph_context {
+ llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_maincoder : public llm_graph_context {
+ llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_mamba : public llm_graph_context_mamba {
+ llm_build_mamba(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_mimo2_iswa : public llm_graph_context {
+ llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_minicpm3 : public llm_graph_context {
+ llm_build_minicpm3(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_minimax_m2 : public llm_graph_context {
+ llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_mistral3 : public llm_graph_context {
+ llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_modern_bert : public llm_graph_context {
+ llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_mpt : public llm_graph_context {
+ llm_build_mpt(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_nemotron : public llm_graph_context {
+ llm_build_nemotron(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_nemotron_h : public llm_graph_context_mamba {
+ llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params);
+ ggml_tensor * build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il);
+ ggml_tensor * build_attention_layer(ggml_tensor * cur, llm_graph_input_attn_kv * inp_attn,
+ const llama_model & model, const int64_t n_embd_head, const int il);
+};
+
+struct llm_build_neo_bert : public llm_graph_context {
+ llm_build_neo_bert(const llama_model & model, const llm_graph_params & params);
+};
+
+template <bool iswa>
+struct llm_build_olmo2 : public llm_graph_context {
+ llm_build_olmo2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_olmoe : public llm_graph_context {
+ llm_build_olmoe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_olmo : public llm_graph_context {
+ llm_build_olmo(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_openai_moe_iswa : public llm_graph_context {
+ llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_openelm : public llm_graph_context {
+ llm_build_openelm(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_orion : public llm_graph_context {
+ llm_build_orion(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_pangu_embedded : public llm_graph_context {
+ llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_phi2 : public llm_graph_context {
+ llm_build_phi2(const llama_model & model, const llm_graph_params & params);
+};
+
+template<bool iswa>
+struct llm_build_phi3 : public llm_graph_context {
+ llm_build_phi3(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_plamo2 : public llm_graph_context_mamba {
+ llm_build_plamo2(const llama_model & model, const llm_graph_params & params);
+private:
+ ggml_tensor * build_plamo2_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il);
+ ggml_tensor * build_plamo2_attn_layer(llm_graph_input_attn_kv * inp, ggml_tensor * inp_pos, ggml_tensor * cur,
+ const llama_model & model, int il);
+};
+
+struct llm_build_plamo : public llm_graph_context {
+ llm_build_plamo(const llama_model & model, const llm_graph_params & params);
+};
+
+template <bool iswa>
+struct llm_build_plamo3 : public llm_graph_context {
+ llm_build_plamo3(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_plm : public llm_graph_context {
+ llm_build_plm(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen2 : public llm_graph_context {
+ llm_build_qwen2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen2moe : public llm_graph_context {
+ llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen2vl : public llm_graph_context {
+ llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen3 : public llm_graph_context {
+ llm_build_qwen3(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen3moe : public llm_graph_context {
+ llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen3vl : public llm_graph_context {
+ llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen3vlmoe : public llm_graph_context {
+ llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen3next : public llm_graph_context_mamba {
+ llm_build_qwen3next(const llama_model & model, const llm_graph_params & params);
+private:
+ ggml_tensor * build_layer_attn(
+ llm_graph_input_attn_kv * inp_attn,
+ ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ int il);
+
+ ggml_tensor * build_layer_attn_linear(
+ llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ ggml_tensor * causal_mask,
+ ggml_tensor * identity,
+ ggml_tensor * diag_mask,
+ int il);
+
+ ggml_tensor * build_layer_ffn(
+ ggml_tensor * cur,
+ int il);
+
+ // returns pair of output and new state
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ ggml_tensor * causal_mask,
+ ggml_tensor * identity,
+ ggml_tensor * diag_mask,
+ int il);
+
+ // returns pair of output and new state
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ int il);
+
+ ggml_tensor * build_norm_gated(
+ ggml_tensor * input,
+ ggml_tensor * weights,
+ ggml_tensor * gate,
+ int layer);
+
+ // returns pair of qkv, z
+ std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
+ ggml_tensor * input,
+ int il);
+
+ const llama_model & model;
+};
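+
+// The two build_delta_net_* helpers are alternative graph constructions of the
+// same gated delta-rule update, and both return (layer output, updated state):
+// the chunked variant processes the ubatch in fixed-size blocks using the
+// shared causal/identity/diagonal masks, while the autoregressive variant is
+// the cheaper per-token recurrence. A hedged dispatch sketch (the condition is
+// illustrative, not taken from this header):
+//
+//   auto [out, new_state] = (n_tokens > 1)
+//       ? build_delta_net_chunking(q, k, v, g, beta, state,
+//                                  causal_mask, identity, diag_mask, il)
+//       : build_delta_net_autoregressive(q, k, v, g, beta, state, il);
+//
+// The same structure applies to the qwen35/qwen35moe builders below.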
+
+struct llm_build_qwen35 : public llm_graph_context_mamba {
+ llm_build_qwen35(const llama_model & model, const llm_graph_params & params);
+private:
+ ggml_tensor * build_layer_attn(
+ llm_graph_input_attn_kv * inp_attn,
+ ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ int * sections,
+ int il);
+
+ ggml_tensor * build_layer_attn_linear(
+ llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ ggml_tensor * causal_mask,
+ ggml_tensor * identity,
+ ggml_tensor * diag_mask,
+ int il);
+
+ ggml_tensor * build_layer_ffn(
+ ggml_tensor * cur,
+ int il);
+
+ // returns pair of output and new state
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ ggml_tensor * causal_mask,
+ ggml_tensor * identity,
+ ggml_tensor * diag_mask,
+ int il);
+
+ // returns pair of output and new state
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ int il);
+
+ ggml_tensor * build_norm_gated(
+ ggml_tensor * input,
+ ggml_tensor * weights,
+ ggml_tensor * gate,
+ int layer);
+
+ // returns pair of qkv, z
+ std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
+ ggml_tensor * input,
+ int il);
+
+ const llama_model & model;
+};
+
+struct llm_build_qwen35moe : public llm_graph_context_mamba {
+ llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params);
+private:
+ ggml_tensor * build_layer_attn(
+ llm_graph_input_attn_kv * inp_attn,
+ ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ int * sections,
+ int il);
+
+ ggml_tensor * build_layer_attn_linear(
+ llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ ggml_tensor * causal_mask,
+ ggml_tensor * identity,
+ ggml_tensor * diag_mask,
+ int il);
+
+ ggml_tensor * build_layer_ffn(
+ ggml_tensor * cur,
+ int il);
+
+ // returns pair of output and new state
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ ggml_tensor * causal_mask,
+ ggml_tensor * identity,
+ ggml_tensor * diag_mask,
+ int il);
+
+ // returns pair of output and new state
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ int il);
+
+ ggml_tensor * build_norm_gated(
+ ggml_tensor * input,
+ ggml_tensor * weights,
+ ggml_tensor * gate,
+ int layer);
+
+ // returns pair of qkv, z
+ std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
+ ggml_tensor * input,
+ int il);
+
+ const llama_model & model;
+};
+
+struct llm_build_qwen : public llm_graph_context {
+ llm_build_qwen(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_refact : public llm_graph_context {
+ llm_build_refact(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_rnd1 : public llm_graph_context {
+ llm_build_rnd1(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_rwkv6 : public llm_build_rwkv6_base {
+ llm_build_rwkv6(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
+ llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_rwkv7 : public llm_build_rwkv7_base {
+ llm_build_rwkv7(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_seed_oss : public llm_graph_context {
+ llm_build_seed_oss(const llama_model & model, const llm_graph_params & params);
+};
+
+template <bool iswa>
+struct llm_build_smallthinker : public llm_graph_context {
+ llm_build_smallthinker(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_smollm3 : public llm_graph_context {
+ llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_stablelm : public llm_graph_context {
+ llm_build_stablelm(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_starcoder2 : public llm_graph_context {
+ llm_build_starcoder2(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_starcoder : public llm_graph_context {
+ llm_build_starcoder(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_step35_iswa : public llm_graph_context {
+ llm_build_step35_iswa(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_t5_dec : public llm_graph_context {
+ llm_build_t5_dec(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_t5_enc : public llm_graph_context {
+ llm_build_t5_enc(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_wavtokenizer_dec : public llm_graph_context {
+ llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_xverse : public llm_graph_context {
+ llm_build_xverse(const llama_model & model, const llm_graph_params & params);
+};
diff --git a/llama.cpp/src/models/modern-bert.cpp b/llama.cpp/src/models/modern-bert.cpp
new file mode 100644
index 0000000..bb12ed8
--- /dev/null
+++ b/llama.cpp/src/models/modern-bert.cpp
@@ -0,0 +1,116 @@
+#include "models.h"
+
+llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // construct the input embeddings (token only; positional info comes from RoPE)
+ inpL = build_inp_embd(model.tok_embd);
+ cb(inpL, "inp_embd", -1);
+
+ // embed layer norm
+ inpL = build_norm(inpL, model.tok_norm, nullptr, LLM_NORM, -1);
+ cb(inpL, "inp_norm", -1);
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
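+ // encoder-style model: attention is bidirectional over the batch, so no KV
+ // cache is used (build_attn_inp_no_cache); RoPE parameters are looked up per
+ // layer because ModernBERT alternates global and local attention layers with
+ // different rotary bases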
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base(cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ cur = inpL;
+
+ // attention layer norm
+ if (model.layers[il].attn_norm) {
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+ }
+
+ // self-attention
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ const size_t type_size = ggml_type_size(cur->type);
+
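+ // the fused wqkv output rows are laid out as
+ // [ Q : n_embd | K : n_embd_gqa | V : n_embd_gqa ], so the three views
+ // below start at element offsets 0, n_embd, and n_embd + n_embd_gqa
+ // (converted to bytes via type_size)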
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*type_size, cur->nb[1], 0*type_size*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd + n_embd_gqa));
+
+ // RoPE
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, nullptr,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ cb(cur, "kqv_out", il);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // re-add the layer input
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FFN layer norm
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
+
+ // residual connection around the FFN
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM, -1);
+ cb(cur, "final_norm_out", -1);
+
+ if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
+ // extract the CLS token (the first row of the output)
+ cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
+ cb(cur, "cls_pooled_embd", -1);
+ }
+
+ cb(cur, "res_embd", -1);
+ res->t_embd = cur;
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/mpt.cpp b/llama.cpp/src/models/mpt.cpp
new file mode 100644
index 0000000..2328e02
--- /dev/null
+++ b/llama.cpp/src/models/mpt.cpp
@@ -0,0 +1,126 @@
+#include "models.h"
+
+llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * pos;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ if (model.pos_embd) {
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+ pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+ cb(pos, "pos_embd", -1);
+
+ inpL = ggml_add(ctx0, inpL, pos);
+ cb(inpL, "inpL", -1);
+ }
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * attn_norm;
+
+ attn_norm = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, il);
+ cb(attn_norm, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = attn_norm;
+
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ if (model.layers[il].bqkv) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+
+ if (hparams.f_clamp_kqv > 0.0f) {
+ cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+ cb(cur, "wqkv_clamped", il);
+ }
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 0 * sizeof(float) * (n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 1 * sizeof(float) * (n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+ cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
+
+ // Q/K Layernorm
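+ // (build_norm expects 2D [n_embd, n_tokens] tensors, hence the
+ // flatten -> normalise -> reshape-back sequence below)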
+ if (model.layers[il].attn_q_norm) {
+ Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head * n_head, n_tokens);
+ Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head * n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, LLM_NORM, il);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, LLM_NORM, il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // Add the input
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed forward
+ {
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ model.layers[il].ffn_act, LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/nemotron-h.cpp b/llama.cpp/src/models/nemotron-h.cpp
new file mode 100644
index 0000000..079c730
--- /dev/null
+++ b/llama.cpp/src/models/nemotron-h.cpp
@@ -0,0 +1,150 @@
+#include "models.h"
+
+llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context_mamba(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+ ggml_build_forward_expand(gf, inpL);
+
+ auto * inp = build_inp_mem_hybrid();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
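+ // Nemotron-H interleaves three block types on a single residual stream:
+ // recurrent Mamba-2 layers, attention layers (flagged by n_ff(il) == 0
+ // in the hparams), and plain FFN blocks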
+ if (hparams.is_recurrent(il)) {
+ // SSM (Mamba-2) layer
+ cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
+ } else if (hparams.n_ff(il) == 0) {
+ // attention layer
+ cur = build_attention_layer(cur, inp->get_attn(), model, n_embd_head, il);
+ } else {
+ cur = build_ffn_layer(cur, model, il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // add residual
+ cur = ggml_add(ctx0, cur, inpSA);
+ cb(cur, "nemotron_h_block_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor * cur,
+ llm_graph_input_attn_kv * inp_attn,
+ const llama_model & model,
+ const int64_t n_embd_head,
+ const int il) {
+ // compute Q and K
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
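+ // hparams.f_attention_scale == 0 means "unset", so fall back to the
+ // standard 1/sqrt(d_head) attention scaling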
+ const float kq_scale =
+ hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ return cur;
+}
+
+ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ ggml_tensor * ffn_inp = cur;
+ ggml_tensor * moe_out =
+ build_moe_ffn(ffn_inp,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ nullptr, // no gate
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
+ true, hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ NULL /* no gate */, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ return cur;
+}
diff --git a/llama.cpp/src/models/nemotron.cpp b/llama.cpp/src/models/nemotron.cpp
new file mode 100644
index 0000000..fcead04
--- /dev/null
+++ b/llama.cpp/src/models/nemotron.cpp
@@ -0,0 +1,122 @@
+#include "models.h"
+
+llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ //GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, model.output_norm_b,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/neo-bert.cpp b/llama.cpp/src/models/neo-bert.cpp
new file mode 100644
index 0000000..7c32bfc
--- /dev/null
+++ b/llama.cpp/src/models/neo-bert.cpp
@@ -0,0 +1,104 @@
+#include "models.h"
+
+llm_build_neo_bert::llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // construct the input embeddings (token only; positional info comes from RoPE)
+ inpL = build_inp_embd(model.tok_embd);
+ cb(inpL, "inp_embd", -1);
+
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = inpL;
+
+ // pre-norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+
+ {
+ ggml_tensor * Qcur;
+ ggml_tensor * Kcur;
+ ggml_tensor * Vcur;
+
+ // self-attention
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+
+ // RoPE
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, nullptr,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ cb(cur, "kqv_out", il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+ // re-add the layer input
+ cur = ggml_add(ctx0, cur, inpL);
+
+ ggml_tensor * ffn_inp = cur;
+ cb(ffn_inp, "ffn_inp", il);
+
+ // pre-norm
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+
+ // residual connection around the FFN
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm_enc, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_embd", -1);
+ res->t_embd = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/olmo.cpp b/llama.cpp/src/models/olmo.cpp
new file mode 100644
index 0000000..bbd623f
--- /dev/null
+++ b/llama.cpp/src/models/olmo.cpp
@@ -0,0 +1,121 @@
+#include "models.h"
+
+llm_build_olmo::llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ NULL, NULL,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (hparams.f_clamp_kqv > 0.0f) {
+ Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (hparams.f_clamp_kqv > 0.0f) {
+ Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (hparams.f_clamp_kqv > 0.0f) {
+ Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, nullptr,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ NULL, NULL,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ NULL, NULL,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/olmo2.cpp b/llama.cpp/src/models/olmo2.cpp
new file mode 100644
index 0000000..713552d
--- /dev/null
+++ b/llama.cpp/src/models/olmo2.cpp
@@ -0,0 +1,150 @@
+#include "models.h"
+
+template <bool iswa>
+llm_build_olmo2<iswa>::llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+ inp_attn_type * inp_attn = nullptr;
+
+ if constexpr (iswa) {
+ inp_attn = build_attn_inp_kv_iswa();
+ } else {
+ inp_attn = build_attn_inp_kv();
+ }
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = inpL;
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ const bool is_swa = hparams.is_swa(il);
+
+ if (is_swa) {
+ // For sliding-window layers, Olmo3 uses regular RoPE with no YaRN scaling.
+ // This is achieved here by setting freq_scale and attn_factor to 1.
+ // We also set ext_factor to 0 to avoid a few unnecessary computations.
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+ 0.0, 1.0, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+ 0.0, 1.0, beta_fast, beta_slow
+ );
+ } else {
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ }
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ cur = build_norm(cur,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_ffn(ffn_inp,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_post_norm", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+// Explicit template instantiations
+template struct llm_build_olmo2<false>;
+template struct llm_build_olmo2<true>;
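+
+// The templated constructor is defined in this translation unit, so both the
+// full-attention (iswa = false) and sliding-window (iswa = true) variants must
+// be instantiated explicitly for the linker to find them.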
diff --git a/llama.cpp/src/models/olmoe.cpp b/llama.cpp/src/models/olmoe.cpp
new file mode 100644
index 0000000..b8b6988
--- /dev/null
+++ b/llama.cpp/src/models/olmoe.cpp
@@ -0,0 +1,124 @@
+#include "models.h"
+
+llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/openai-moe-iswa.cpp b/llama.cpp/src/models/openai-moe-iswa.cpp
new file mode 100644
index 0000000..dbe3ca1
--- /dev/null
+++ b/llama.cpp/src/models/openai-moe-iswa.cpp
@@ -0,0 +1,127 @@
+#include "models.h"
+
+llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_iswa();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
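+ // note: this builder also passes model.layers[il].attn_sinks into
+ // build_attn; attention sinks act as learned per-head logits that absorb
+ // softmax probability mass so no real token is forced to receive attention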
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
+
+ cb(cur, "attn_out", il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ // skip computing output for unused tokens
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = ffn_inp;
+ cur = build_norm(cur,
+ model.layers[il].attn_post_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ // MoE branch
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b,
+ model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b,
+ model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
+ model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SWIGLU_OAI_MOE, false,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
+ il);
+ cb(cur, "ffn_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/openelm.cpp b/llama.cpp/src/models/openelm.cpp
new file mode 100644
index 0000000..fbf682e
--- /dev/null
+++ b/llama.cpp/src/models/openelm.cpp
@@ -0,0 +1,124 @@
+#include "models.h"
+
+llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const int64_t n_head = hparams.n_head(il);
+ const int64_t n_head_kv = hparams.n_head_kv(il);
+ const int64_t n_head_qkv = 2*n_head_kv + n_head;
+
+ cur = inpL;
+ ggml_tensor * residual = cur;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv));
+ cb(Vcur, "Vcur", il);
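+
+ // after the 3D reshape the fused qkv tensor is organised head-wise as
+ // [ n_head Q | n_head_kv K | n_head_kv V ], so K and V are views starting
+ // at head offsets n_head and n_head + n_head_kv; OpenELM varies the head
+ // counts per layer, hence the per-layer n_head/n_head_kv lookups above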
+
+ Qcur = build_norm(Qcur,
+ model.layers[il].attn_q_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur", il);
+
+ Kcur = build_norm(Kcur,
+ model.layers[il].attn_k_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, NULL,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, NULL,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Qcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ inpL = cur;
+ }
+ cur = inpL;
+
+ // norm
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/orion.cpp b/llama.cpp/src/models/orion.cpp
new file mode 100644
index 0000000..bb02273
--- /dev/null
+++ b/llama.cpp/src/models/orion.cpp
@@ -0,0 +1,123 @@
+#include "models.h"
+
+llm_build_orion::llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ // if (model.layers[il].bq) {
+ // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ // cb(Qcur, "Qcur", il);
+ // }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ // if (model.layers[il].bk) {
+ // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ // cb(Kcur, "Kcur", il);
+ // }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ // if (model.layers[il].bv) {
+ // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ // cb(Vcur, "Vcur", il);
+ // }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, model.output_norm_b,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
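
Unlike most builds in this set, orion uses classic LayerNorm (LLM_NORM, with both a weight and a bias tensor) for its attention, FFN and output norms rather than RMSNorm. For reference, the two differ only in mean subtraction and the bias term:

    LayerNorm(x) = (x - mean(x)) / sqrt(var(x) + eps) * gamma + beta
    RMSNorm(x)   =  x / sqrt(mean(x^2) + eps) * gamma
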
diff --git a/llama.cpp/src/models/pangu-embedded.cpp b/llama.cpp/src/models/pangu-embedded.cpp
new file mode 100644
index 0000000..664572a
--- /dev/null
+++ b/llama.cpp/src/models/pangu-embedded.cpp
@@ -0,0 +1,121 @@
+#include "models.h"
+
+
+llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+        // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+            Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ if (model.output_b != nullptr) {
+ cur = ggml_add(ctx0, cur, model.output_b);
+ }
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/phi2.cpp b/llama.cpp/src/models/phi2.cpp
new file mode 100644
index 0000000..22dbf61
--- /dev/null
+++ b/llama.cpp/src/models/phi2.cpp
@@ -0,0 +1,121 @@
+#include "models.h"
+
+
+llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * attn_norm_output;
+ ggml_tensor * ffn_output;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ attn_norm_output = build_norm(inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(attn_norm_output, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = nullptr;
+ ggml_tensor * Kcur = nullptr;
+ ggml_tensor * Vcur = nullptr;
+
+ if (model.layers[il].wqkv) {
+ cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ } else {
+ Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+ Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+ Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ }
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // with phi2, we scale the Q to avoid precision issues
+ // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
+ }
+ // FF
+ {
+ ffn_output = build_ffn(attn_norm_output,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(ffn_output, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_output);
+ cur = ggml_add(ctx0, cur, inpL);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = build_norm(inpL,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output_no_bias", -1);
+
+ cur = ggml_add(ctx0, cur, model.output_b);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
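
The 1.0f/sqrtf(float(n_embd_head)) factor above is folded into Q before the attention call, and build_attn then receives a scale of 1.0f. This is mathematically identical to scaling the logits, since the scalar commutes with the matrix product:

    softmax((s*Q) K^T) V = softmax(s * (Q K^T)) V,   with s = 1/sqrt(n_embd_head)

Pre-scaling Q keeps the intermediate values of Q*K^T small, which is the fp16 precision issue the comment and the linked mlx-examples reference are about.
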
diff --git a/llama.cpp/src/models/phi3.cpp b/llama.cpp/src/models/phi3.cpp
new file mode 100644
index 0000000..c8e5da3
--- /dev/null
+++ b/llama.cpp/src/models/phi3.cpp
@@ -0,0 +1,151 @@
+#include "models.h"
+
+template<bool iswa>
+llm_build_phi3<iswa>::llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+ inp_attn_type * inp_attn = nullptr;
+
+ if constexpr (iswa) {
+ inp_attn = build_attn_inp_kv_iswa();
+ } else {
+ inp_attn = build_attn_inp_kv();
+ }
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ auto * residual = inpL;
+
+ // self-attention
+ {
+ // rope freq factors for 128k context
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            ggml_tensor * attn_norm_output = build_norm(inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM_RMS, il);
+ cb(attn_norm_output, "attn_norm", il);
+
+ ggml_tensor * Qcur = nullptr;
+ ggml_tensor * Kcur = nullptr;
+ ggml_tensor * Vcur = nullptr;
+
+ if (model.layers[il].wqkv) {
+ cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
+ cb(cur, "wqkv", il);
+
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
+            } else {
+ Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+ Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+ Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ }
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
+ cb(Qcur, "Qcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+ }
+ cur = ggml_add(ctx0, cur, residual);
+ residual = cur;
+
+ cur = build_norm(cur,
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+ }
+ cur = ggml_add(ctx0, residual, cur);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = build_norm(inpL,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ if (model.output_b != nullptr) {
+ cb(cur, "result_output_no_bias", -1);
+ cur = ggml_add(ctx0, cur, model.output_b);
+ }
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+// Explicit template instantiations
+template struct llm_build_phi3<false>;
+template struct llm_build_phi3<true>;
diff --git a/llama.cpp/src/models/plamo.cpp b/llama.cpp/src/models/plamo.cpp
new file mode 100644
index 0000000..04ff709
--- /dev/null
+++ b/llama.cpp/src/models/plamo.cpp
@@ -0,0 +1,110 @@
+#include "models.h"
+
+llm_build_plamo::llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ ggml_tensor * sa_inp = cur;
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ sa_inp = ggml_get_rows(ctx0, sa_inp, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+ ggml_tensor * sa_out = cur;
+
+ cur = sa_inp;
+
+ // feed-forward network
+ {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, sa_out);
+ cur = ggml_add(ctx0, cur, inpL);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
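
plamo uses a parallel block rather than the sequential pre-norm layout seen elsewhere in this set: the same normalized activation (sa_inp) feeds both the attention and the FFN, and the two results are summed together with the raw layer input:

    x_out = x + Attn(RMSNorm(x)) + FFN(RMSNorm(x))

which is why the code saves sa_inp before attention and then adds both sa_out and inpL back at the end instead of chaining two residuals.
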
diff --git a/llama.cpp/src/models/plamo2.cpp b/llama.cpp/src/models/plamo2.cpp
new file mode 100644
index 0000000..31115a0
--- /dev/null
+++ b/llama.cpp/src/models/plamo2.cpp
@@ -0,0 +1,313 @@
+#include "models.h"
+
+llm_build_plamo2::llm_build_plamo2(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context_mamba(params) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ // {n_embd, n_tokens}
+ inpL = build_inp_embd(model.tok_embd);
+ cb(inpL, "embedding_output", -1);
+
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_hybrid = build_inp_mem_hybrid();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * residual = inpL;
+
+ // pre_mixer_norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+
+ // check if this layer is Mamba or Attention
+ bool is_mamba_layer = hparams.is_recurrent(il);
+
+ if (is_mamba_layer) {
+ // PLaMo-2 Mamba layer
+ cur = build_plamo2_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
+ } else {
+ // PLaMo-2 Attention layer
+ cur = build_plamo2_attn_layer(inp_hybrid->get_attn(), inp_pos, cur, model, il);
+ }
+
+ // post_mixer_norm
+ cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ // residual connection
+ cur = ggml_add(ctx0, cur, residual);
+ cb(cur, "attn_residual", il);
+ residual = cur;
+
+ // pre-ffn norm
+ cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_pre_norm", il);
+
+ // feed-forward network
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL, LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ // post ffn norm
+ cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_post_norm", il);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+ }
+
+ // residual connection
+ cur = ggml_add(ctx0, cur, residual);
+ cb(cur, "ffn_residual", il);
+
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ // final norm
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+ cb(cur, "result_norm", -1);
+
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output", -1);
+
+ // Explicitly mark as output tensor to ensure proper backend assignment
+ ggml_set_output(cur);
+
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+ggml_tensor * llm_build_plamo2::build_plamo2_attn_layer(llm_graph_input_attn_kv * inp,
+ ggml_tensor * inp_pos,
+ ggml_tensor * cur,
+ const llama_model & model,
+ int il) {
+ // self-attention
+ {
+ // PLaMo-2 uses combined QKV tensor
+ ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(qkv, "wqkv", il);
+
+ // split QKV tensor into Q, K, V
+ const int64_t n_embd_head_q = hparams.n_embd_head_k;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ int32_t n_head = hparams.n_head(il);
+ int32_t n_head_kv = hparams.n_head_kv(il);
+
+ const int64_t q_offset = 0;
+ const int64_t k_offset = n_embd_head_q * n_head;
+ const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float),
+ qkv->nb[1], q_offset * ggml_element_size(qkv));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float),
+ qkv->nb[1], k_offset * ggml_element_size(qkv));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head_v, n_head_kv, n_tokens, n_embd_head_v * sizeof(float),
+ qkv->nb[1], v_offset * ggml_element_size(qkv));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cur = build_attn(inp,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f / sqrtf(float(n_embd_head_v)), il);
+ }
+
+ cb(cur, "attn_out", il);
+
+ return cur;
+}
+
+ggml_tensor * llm_build_plamo2::build_plamo2_mamba_layer(llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ const llama_model & model,
+ const llama_ubatch & ubatch,
+ int il) {
+ const auto * mctx_cur = inp->mctx;
+
+ const auto kv_head = mctx_cur->get_head();
+
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t n_heads = hparams.ssm_dt_rank;
+ const int64_t head_dim = d_inner / n_heads;
+ const int64_t n_group = hparams.ssm_n_group;
+ const int64_t n_seqs = ubatch.n_seqs;
+
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+
+ ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+ conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2 * n_group * d_state, n_seqs);
+
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+ // in_proj: {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
+ ggml_tensor * zx = build_lora_mm(model.layers[il].ssm_in, cur);
+ cb(zx, "mamba_in_proj", il);
+ // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
+ zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
+ zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
+ cb(zx, "mamba_in_proj_out", il);
+
+ // split into z and x
+ // => {head_dim * n_heads, n_seq_tokens, n_seqs}
+ ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3],
+ head_dim * ggml_element_size(zx));
+ x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
+ // x = ggml_permute(ctx0, x, 0, 2, 1, 3);
+ cb(x, "mamba_x_split", il);
+
+ ggml_tensor * z =
+ ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], 0);
+ cb(z, "mamba_z_split", il);
+
+ // conv1d
+ {
+ // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
+ cb(conv_x, "mamba_conv1d_input", il);
+
+ // copy last (d_conv - 1) columns back into the state cache
+ ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2],
+ n_seq_tokens * (conv_x->nb[0]));
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv,
+ ggml_view_1d(ctx0, conv_states_all,
+ (d_conv - 1) * (d_inner + 2 * n_group * d_state) * (n_seqs),
+ kv_head * (d_conv - 1) * (d_inner + 2 * n_group * d_state) *
+ ggml_element_size(conv_states_all))));
+ cb(conv_states_all, "mamba_conv1d_state", il);
+
+ // 1D convolution
+ x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
+ cb(x, "mamba_conv1d", il);
+
+ x = ggml_silu(ctx0, x);
+ cb(x, "mamba_conv1d_silu", il);
+ }
+
+ // SSM
+ {
+ // bcdt_proj: {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
+ ggml_tensor * x_bcdt = build_lora_mm(model.layers[il].ssm_x, x);
+ cb(x_bcdt, "mamba_bcdt_proj", il);
+
+        // split into B, C and dt (stored in that order along dim 0)
+ const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
+ ggml_tensor * B = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], 0);
+ ggml_tensor * C = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2],
+ ggml_element_size(x_bcdt) * d_state);
+ ggml_tensor * dt = ggml_view_3d(ctx0, x_bcdt, dt_dim, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2],
+ ggml_element_size(x_bcdt) * (2 * d_state));
+ cb(B, "mamba_B_raw", il);
+ cb(C, "mamba_C_raw", il);
+ cb(dt, "mamba_dt_raw", il);
+
+ // Apply RMS norm to dt, B, C (PLaMo-2 specific)
+ B = build_norm(B, model.layers[il].ssm_b_norm, NULL, LLM_NORM_RMS, il);
+ C = build_norm(C, model.layers[il].ssm_c_norm, NULL, LLM_NORM_RMS, il);
+ dt = build_norm(dt, model.layers[il].ssm_dt_norm, NULL, LLM_NORM_RMS, il);
+ cb(B, "mamba_B_normed", il);
+ cb(C, "mamba_C_normed", il);
+ cb(dt, "mamba_dt_normed", il);
+
+ // dt_proj: {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
+ dt = build_lora_mm(model.layers[il].ssm_dt, dt);
+ dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
+ cb(dt, "mamba_dt_proj", il);
+
+ ggml_tensor * A = ggml_reshape_2d(ctx0, model.layers[il].ssm_a, 1, n_heads);
+ cb(A, "mamba_A", il);
+
+ x = ggml_view_4d(ctx0, x, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x),
+ head_dim * n_heads * ggml_element_size(x),
+ head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
+ B = ggml_view_4d(ctx0, B, d_state, 1, n_seq_tokens, n_seqs, d_state * B->nb[0], B->nb[1], B->nb[2], 0);
+ C = ggml_view_4d(ctx0, C, d_state, 1, n_seq_tokens, n_seqs, d_state * C->nb[0], C->nb[1], C->nb[2], 0);
+
+ // use the states and the indices provided by build_recurrent_state
+ // (this is necessary in order to properly use the states before they are overwritten,
+ // while avoiding to make unnecessary copies of the states)
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_heads, mctx_cur->get_size());
+
+ // Custom operator to optimize the parallel associative scan
+ // as described in the Annex D of the Mamba paper.
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+ };
+
+ ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+ cb(y_ssm, "mamba_ssm_scan", il);
+
+ // store last states
+ ggml_build_forward_expand(
+ gf, ggml_cpy(
+ ctx0,
+ ggml_view_1d(ctx0, y_ssm, n_heads * head_dim * d_state * n_seqs,
+ n_heads * head_dim * n_seq_tokens * n_seqs * ggml_element_size(y_ssm)),
+ ggml_view_1d(ctx0, ssm_states_all, n_heads * head_dim * d_state * n_seqs,
+ kv_head * n_seqs * n_heads * head_dim * d_state * ggml_element_size(ssm_states_all))));
+ cb(ssm_states_all, "mamba_ssm_states", il);
+
+ ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs,
+ head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x),
+ head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
+ cb(y, "mamba_y_view", il);
+
+ // Add D parameter and apply gating with z
+ // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
+ ggml_tensor * D = ggml_reshape_2d(ctx0, model.layers[il].ssm_d, 1, n_heads);
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, x, D));
+ cb(y, "mamba_y_add_d", il);
+
+ y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
+ cb(y, "mamba_y_swiglu_z", il);
+
+ // out_proj: {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+ y = ggml_view_3d(ctx0, y, head_dim * n_heads, n_seq_tokens, n_seqs, y->nb[2], y->nb[3], 0);
+ cur = build_lora_mm(model.layers[il].ssm_out, y);
+ cb(cur, "mamba_out_proj", il);
+ }
+
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+ cb(cur, "mamba_out", il);
+
+ return cur;
+}
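
The conv1d section above is the standard Mamba rolling buffer: the cached last d_conv - 1 columns are prepended to the incoming tokens, the causal convolution runs over the concatenation, and the tail of the buffer is copied back into the state cache for the next ubatch. A scalar sketch of the same update, assuming a single channel and illustrative sizes:

    #include <array>
    #include <cstdio>
    #include <vector>

    int main() {
        const int d_conv = 4;                                  // kernel width
        const std::array<float, 4> w = {0.1f, 0.2f, 0.3f, 0.4f};

        std::vector<float> state = {0.0f, 0.0f, 0.0f};         // cached tail, d_conv - 1 samples
        std::vector<float> x     = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};  // new tokens

        // concat(state, x), like ggml_concat(conv, x) above
        std::vector<float> buf = state;
        buf.insert(buf.end(), x.begin(), x.end());

        // causal convolution: output i sees inputs [i, i + d_conv)
        for (size_t i = 0; i + d_conv <= buf.size(); ++i) {
            float y = 0.0f;
            for (int j = 0; j < d_conv; ++j) {
                y += w[j] * buf[i + j];
            }
            std::printf("y[%zu] = %f\n", i, y);
        }

        // write the last d_conv - 1 samples back, like the ggml_cpy into conv_states_all
        state.assign(buf.end() - (d_conv - 1), buf.end());
        return 0;
    }
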
diff --git a/llama.cpp/src/models/plamo3.cpp b/llama.cpp/src/models/plamo3.cpp
new file mode 100644
index 0000000..55c8064
--- /dev/null
+++ b/llama.cpp/src/models/plamo3.cpp
@@ -0,0 +1,128 @@
+#include "models.h"
+
+template <bool iswa>
+llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t head_dim_q = hparams.n_embd_head_k;
+ const int64_t head_dim_v = hparams.n_embd_head_v;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+ inp_attn_type * inp_attn = nullptr;
+
+ if constexpr (iswa) {
+ inp_attn = build_attn_inp_kv_iswa();
+ } else {
+ inp_attn = build_attn_inp_kv();
+ }
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * residual = inpL;
+
+ float freq_base_l = 0.0f;
+ float freq_scale_l = 0.0f;
+ if constexpr (iswa) {
+ freq_base_l = model.get_rope_freq_base (cparams, il);
+ freq_scale_l = model.get_rope_freq_scale(cparams, il);
+ } else {
+ freq_base_l = freq_base;
+ freq_scale_l = freq_scale;
+ }
+
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ const int32_t n_head = hparams.n_head(il);
+ const int32_t n_head_kv = hparams.n_head_kv(il);
+
+ const int64_t q_offset = 0;
+ const int64_t k_offset = head_dim_q * n_head;
+ const int64_t v_offset = k_offset + head_dim_q * n_head_kv;
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head, n_tokens,
+ head_dim_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head_kv, n_tokens,
+ head_dim_q * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim_v, n_head_kv, n_tokens,
+ head_dim_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "attn_q_norm", il);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "attn_k_norm", il);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ const float attn_scale = 1.0f / sqrtf(float(head_dim_q));
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il);
+ cb(cur, "attn_out", il);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+ }
+
+ cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ cur = ggml_add(ctx0, cur, residual);
+ cb(cur, "attn_residual", il);
+
+ residual = cur;
+
+ cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_post_norm", il);
+
+ cur = ggml_add(ctx0, cur, residual);
+ cb(cur, "ffn_residual", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+// Explicit template instantiations
+template struct llm_build_plamo3<false>;
+template struct llm_build_plamo3<true>;
diff --git a/llama.cpp/src/models/plm.cpp b/llama.cpp/src/models/plm.cpp
new file mode 100644
index 0000000..612a487
--- /dev/null
+++ b/llama.cpp/src/models/plm.cpp
@@ -0,0 +1,169 @@
+#include "models.h"
+
+llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
+
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ // {n_embd, n_tokens}
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+        // self-attention
+ {
+ ggml_tensor * q = NULL;
+ q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(q, "q", il);
+
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
+ ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(q->type, hparams.n_embd_head_k),
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ 0);
+ cb(q_nope, "q_nope", il);
+
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
+ ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+ ggml_row_size(q->type, hparams.n_embd_head_k),
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, n_embd_head_qk_nope));
+ cb(q_pe, "q_pe", il);
+
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+            ggml_tensor * kv_pe_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+            cb(kv_pe_compressed, "kv_pe_compressed", il);
+
+            // split into {kv_lora_rank, n_tokens}
+            ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compressed, kv_lora_rank, n_tokens,
+                    kv_pe_compressed->nb[1],
+                    0);
+            cb(kv_compressed, "kv_compressed", il);
+
+            // and {n_embd_head_qk_rope, n_tokens}
+            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compressed, n_embd_head_qk_rope, 1, n_tokens,
+                    kv_pe_compressed->nb[1],
+                    kv_pe_compressed->nb[1],
+                    ggml_row_size(kv_pe_compressed->type, kv_lora_rank));
+ cb(k_pe, "k_pe", il);
+
+ kv_compressed = build_norm(kv_compressed,
+ model.layers[il].attn_kv_a_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(kv_compressed, "kv_compressed", il);
+
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+ ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+ cb(kv, "kv", il);
+
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
+ ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ 0);
+ cb(k_nope, "k_nope", il);
+
+ // and {n_head * n_embd_head_v, n_tokens}
+ ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+ cb(v_states, "v_states", il);
+
+ v_states = ggml_cont(ctx0, v_states);
+ cb(v_states, "v_states", il);
+
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
+ ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+ 0);
+ cb(v_states, "v_states", il);
+
+ q_pe = ggml_rope_ext(
+ ctx0, q_pe, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(q_pe, "q_pe", il);
+
+ // shared RoPE key
+ k_pe = ggml_rope_ext(
+ ctx0, k_pe, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(k_pe, "k_pe", il);
+
+ ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
+ cb(q_states, "q_states", il);
+
+ ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+ cb(k_states, "k_states", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
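
The attention above is the DeepSeek-style multi-head latent attention layout: each query head is split into a non-rotary part (n_embd_head_qk_nope dims) and a rotary part (n_embd_head_qk_rope = n_rot dims), while K and V are first compressed to kv_lora_rank dims per token and re-expanded per head by wkv_b. As an illustrative dimension walkthrough (the numbers are hypothetical, not taken from any real PLM checkpoint): with n_embd_head_k = 192, n_rot = 64 and kv_lora_rank = 512, each query head splits into 128 nope + 64 rope dims; wkv_a_mqa produces 512 + 64 = 576 values per token, of which the first 512 are RMS-normed and expanded by wkv_b into per-head k_nope and v, while the trailing 64 become a single shared rotary key that ggml_repeat broadcasts across all heads.
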
diff --git a/llama.cpp/src/models/qwen.cpp b/llama.cpp/src/models/qwen.cpp
new file mode 100644
index 0000000..31fd9b7
--- /dev/null
+++ b/llama.cpp/src/models/qwen.cpp
@@ -0,0 +1,108 @@
+#include "models.h"
+
+
+llm_build_qwen::llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 2*sizeof(float)*(n_embd));
+
+            // NeoX-style RoPE (historically ggml_rope mode = 2)
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/qwen2.cpp b/llama.cpp/src/models/qwen2.cpp
new file mode 100644
index 0000000..3da4dea
--- /dev/null
+++ b/llama.cpp/src/models/qwen2.cpp
@@ -0,0 +1,126 @@
+#include "models.h"
+
+llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ if (model.output_b != nullptr) {
+ cur = ggml_add(ctx0, cur, model.output_b);
+ }
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/qwen2moe.cpp b/llama.cpp/src/models/qwen2moe.cpp
new file mode 100644
index 0000000..49142b7
--- /dev/null
+++ b/llama.cpp/src/models/qwen2moe.cpp
@@ -0,0 +1,151 @@
+#include "models.h"
+
+llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+        // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // FFN shared expert
+ {
+ ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
+ cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
+
+            // sigmoid via the identity silu(x)/x == sigmoid(x)
+ ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
+ cb(cur_gate, "ffn_shexp_gate", il);
+
+ ggml_tensor * cur_ffn = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur_ffn, "ffn_shexp", il);
+
+ ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
+ cb(ffn_shexp_out, "ffn_shexp_out", il);
+
+ moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
+ cb(moe_out, "ffn_out", il);
+
+ cur = moe_out;
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
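
The shared-expert gate above builds a sigmoid out of ops ggml already provides: since silu(x) = x * sigmoid(x), dividing silu(x) by x recovers sigmoid(x) exactly (for x != 0). A few standard-library lines verifying the identity numerically:

    #include <cmath>
    #include <cstdio>

    int main() {
        for (float x : {-3.0f, -0.5f, 0.25f, 2.0f}) {
            float sigmoid = 1.0f / (1.0f + std::exp(-x));
            float silu    = x * sigmoid;               // silu(x) = x * sigmoid(x)
            std::printf("x=% .2f  silu(x)/x=%.6f  sigmoid(x)=%.6f\n",
                        x, silu / x, sigmoid);         // the two columns agree
        }
        return 0;
    }
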
diff --git a/llama.cpp/src/models/qwen2vl.cpp b/llama.cpp/src/models/qwen2vl.cpp
new file mode 100644
index 0000000..9be3867
--- /dev/null
+++ b/llama.cpp/src/models/qwen2vl.cpp
@@ -0,0 +1,117 @@
+#include "models.h"
+
+llm_build_qwen2vl::llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ int sections[4];
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_multi(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
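
ggml_rope_multi differs from ggml_rope_ext only in the extra sections array: instead of rotating every dimension pair with one scalar position, the rotary dim pairs are partitioned into up to four contiguous blocks, each driven by its own positional axis (for Qwen2-VL: temporal, height, width, plus an unused slot). A small sketch of the bookkeeping, with illustrative section sizes (the real values come from hparams.rope_sections in the GGUF metadata):

    #include <cstdio>

    int main() {
        const int sections[4] = {16, 8, 8, 0};        // illustrative split of 32 rotary dim pairs
        const char * axis[4]  = {"t", "h", "w", "e"};

        int pair = 0;
        for (int s = 0; s < 4; ++s) {
            for (int i = 0; i < sections[s]; ++i, ++pair) {
                // each rotary pair takes its angle from the position on axis[s]
                std::printf("dim pair %2d <- position axis %s\n", pair, axis[s]);
            }
        }
        return 0;
    }
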
diff --git a/llama.cpp/src/models/qwen3.cpp b/llama.cpp/src/models/qwen3.cpp
new file mode 100644
index 0000000..a5cfffa
--- /dev/null
+++ b/llama.cpp/src/models/qwen3.cpp
@@ -0,0 +1,117 @@
+#include "models.h"
+
+llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
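
Relative to qwen2, the structural change here is QK-norm: a per-head RMS normalization of Q and K after the head reshape and before RoPE, which bounds the magnitude of the attention logits. The normalization itself, as a minimal per-head sketch (eps is illustrative; llama.cpp takes it from the model's hparams):

    #include <cmath>
    #include <vector>

    // RMS-normalize one head vector in place: x <- x / sqrt(mean(x^2) + eps) * gamma
    void rms_norm_head(std::vector<float> & x, const std::vector<float> & gamma, float eps = 1e-6f) {
        float ss = 0.0f;
        for (float v : x) {
            ss += v * v;
        }
        const float scale = 1.0f / std::sqrt(ss / x.size() + eps);
        for (size_t i = 0; i < x.size(); ++i) {
            x[i] *= scale * gamma[i];
        }
    }

    int main() {
        std::vector<float> q = {0.5f, -1.5f, 2.0f, 0.25f};  // one head of Q, illustrative values
        std::vector<float> g(q.size(), 1.0f);               // learned weight (attn_q_norm)
        rms_norm_head(q, g);                                // applied before ggml_rope_ext
        return 0;
    }
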
diff --git a/llama.cpp/src/models/qwen35.cpp b/llama.cpp/src/models/qwen35.cpp
new file mode 100644
index 0000000..592c170
--- /dev/null
+++ b/llama.cpp/src/models/qwen35.cpp
@@ -0,0 +1,740 @@
+#include "ggml.h"
+#include "models.h"
+
+#define CHUNK_SIZE 64
+
+llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context_mamba(params), model(model) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ int sections[4];
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ cb(inpL, "model.input_embed", -1);
+
+ auto * inp = build_inp_mem_hybrid();
+
+ ggml_tensor * inp_pos = build_inp_pos();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ ggml_tensor * causal_mask =
+ ggml_tri(ctx0, ggml_fill(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
+ GGML_TRI_TYPE_LOWER);
+
+ ggml_tensor * identity = ggml_diag(ctx0, ggml_fill(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
+ ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);
+
+ ggml_build_forward_expand(gf, causal_mask);
+ ggml_build_forward_expand(gf, identity);
+ ggml_build_forward_expand(gf, diag_mask);
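+
+ // illustration with a chunk size of 4 instead of 64 (GGML_TRI_TYPE_LOWER keeps the
+ // strictly lower triangle, which is why the identity is added separately):
+ //   causal_mask:    identity:    diag_mask:
+ //    0 0 0 0         1 0 0 0      1 0 0 0
+ //    1 0 0 0         0 1 0 0      1 1 0 0
+ //    1 1 0 0         0 0 1 0      1 1 1 0
+ //    1 1 1 0         0 0 0 1      1 1 1 1
+ // causal_mask selects strictly-past positions within a chunk, diag_mask also keeps the
+ // diagonal; the same constant tensors are shared by every recurrent layer below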
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // Determine layer type and build appropriate attention mechanism
+ if (hparams.is_recurrent(il)) {
+ // Linear attention layer (gated delta net)
+ cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
+ } else {
+ // Full attention layer
+ cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // Residual connection
+ cur = ggml_add(ctx0, cur, inpSA);
+ cb(cur, "attn_residual", il);
+
+ // Save the tensor before post-attention norm for residual connection
+ ggml_tensor * ffn_residual = cur;
+
+ // Post-attention norm
+ ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il);
+ cb(attn_post_norm, "attn_post_norm", il);
+
+ // Dense FFN layer - without residual connection
+ cur = build_layer_ffn(attn_post_norm, il);
+ cb(cur, "ffn_out", il);
+
+ // Residual connection for FFN - add to the tensor from before post_attention_layernorm
+ cur = ggml_add(ctx0, cur, ffn_residual);
+ cb(cur, "post_ffn", il);
+
+ // Input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ // Final norm
+ cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // LM head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+// utility to get one slice from the third dimension
+// input dim: [x, y, c, b]
+// output dim: [x, y, 1, b]
+static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
+ return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
+ t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
+}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35::build_delta_net_chunking(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ ggml_tensor * causal_mask,
+ ggml_tensor * identity,
+ ggml_tensor * diag_mask,
+ int il) {
+ const int64_t S_k = q->ne[0];
+ const int64_t H_k = q->ne[1];
+ const int64_t n_tokens = q->ne[2];
+ const int64_t n_seqs = q->ne[3];
+
+ const int64_t S_v = v->ne[0];
+ const int64_t H_v = v->ne[1];
+
+ GGML_ASSERT(v->ne[2] == n_tokens);
+ GGML_ASSERT(k->ne[2] == n_tokens);
+ GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
+ GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
+ GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
+
+ GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+
+ GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
+
+ const float eps_norm = hparams.f_norm_rms_eps;
+
+ q = ggml_l2_norm(ctx0, q, eps_norm);
+ k = ggml_l2_norm(ctx0, k, eps_norm);
+
+ const float scale = 1.0f / sqrtf(S_v);
+
+ q = ggml_scale(ctx0, q, scale);
+
+ beta = ggml_sigmoid(ctx0, beta);
+
+ cb(q, "q_in", il);
+ cb(k, "k_in", il);
+ cb(v, "v_in", il);
+ cb(beta, "beta_in", il);
+ cb(g, "g_in", il);
+
+ q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+ k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+ v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+ g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
+
+ beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
+ state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
+
+ cb(q, "q_perm", il);
+ cb(k, "k_perm", il);
+ cb(v, "v_perm", il);
+ cb(beta, "beta_perm", il);
+ cb(g, "g_perm", il);
+ cb(state, "state_in", il);
+
+ GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
+ GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
+ GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
+
+ // pad the token dimension up to a multiple of the chunk size
+ const int64_t chunk_size = CHUNK_SIZE;
+
+ const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size;
+ const int64_t n_chunks = (n_tokens + pad) / chunk_size;
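+
+ // illustrative numbers: n_tokens = 100 gives pad = (64 - 100 % 64) % 64 = 28 and
+ // n_chunks = (100 + 28) / 64 = 2; an exact multiple of 64 gives pad = 0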
+
+ q = ggml_pad(ctx0, q, 0, pad, 0, 0);
+ k = ggml_pad(ctx0, k, 0, pad, 0, 0);
+ v = ggml_pad(ctx0, v, 0, pad, 0, 0);
+ g = ggml_pad(ctx0, g, pad, 0, 0, 0);
+ beta = ggml_pad(ctx0, beta, 0, pad, 0, 0);
+
+ cb(q, "q_pad", il);
+ cb(k, "k_pad", il);
+ cb(v, "v_pad", il);
+ cb(beta, "beta_pad", il);
+ cb(g, "g_pad", il);
+
+ ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
+ ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
+
+ cb(v_beta, "v_beta", il);
+ cb(k_beta, "k_beta", il);
+
+ q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
+ k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
+ k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
+ v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
+ v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
+
+ g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
+ beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
+
+ ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
+ cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
+ ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
+
+ ggml_tensor * gcs_j_broadcast =
+ ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
+
+ ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
+ cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+ decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
+ decay_mask = ggml_exp(ctx0, decay_mask);
+ decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
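+
+ // decay_mask(i, j) now holds exp(g_cum[j] - g_cum[i]) for a later position j
+ // attending to an earlier position i, and 0 above the diagonal; applying diag_mask
+ // both before and after ggml_exp keeps the masked-out entries at 0 instead of exp(0) = 1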
+
+ ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
+
+ ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
+ ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
+ cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
+ ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
+
+ ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
+ attn = ggml_mul(ctx0, lin_solve, causal_mask);
+ attn = ggml_add(ctx0, attn, identity);
+ cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+ v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
+
+ ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
+ ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
+
+ ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
+ cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * k_cumdecay =
+ ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
+ cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q);
+ attn_kq = ggml_mul(ctx0, attn_kq, decay_mask);
+ attn_kq = ggml_mul(ctx0, attn_kq, diag_mask);
+ cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+
+ // vectorized calculation of key_gdiff
+ // improved from the chunked version:
+ // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
+ // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
+ // key_gdiff = key * g_diff.unsqueeze(-1)
+ // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
+ // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
+
+ // get last element in g_cumsum along chunk_size dimension (ne0)
+ // example: [[x, y, z, ..., last], ...] -> [[last], ...]
+ ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3],
+ g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3],
+ (g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum));
+ g_last = ggml_cont(ctx0, g_last);
+ cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
+ cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last));
+ cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
+ ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp,
+ 1, chunk_size, n_chunks, g_diff_exp->ne[3]);
+
+ ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t);
+ cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff));
+ cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs)
+
+ // state to be updated per chunk
+ ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
+ cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs)
+
+ // shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs)
+ ggml_tensor * core_attn_out = nullptr;
+
+ for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
+ // shape: (S_k, chunk_size, 1, H_k * n_seqs)
+ ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul
+
+ // shape: (S_v, chunk_size, 1, H_v * n_seqs)
+ ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat
+
+ // shape: (1, chunk_size, 1, H_v * n_seqs)
+ ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul
+
+ // shape: (S_k, chunk_size, 1, H_v * n_seqs)
+ ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat
+
+ // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
+ // replaced by precomputed attn_kq
+ ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk);
+ cb(attn_chunk, "attn_chunk", il);
+
+ ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
+
+ // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
+ ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
+ cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs)
+
+ // v_new = v_i - v_prime
+ ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime);
+ ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
+ cb(v_new, "v_new_chunk", il);
+
+ // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
+ ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk);
+ ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
+ cb(attn_inter, "attn_inter_chunk", il);
+
+ // core_attn_out[:, :, i] = attn_inter + attn @ v_new
+ ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk);
+ cb(v_attn, "v_attn_chunk", il);
+
+ ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
+ cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs)
+
+ core_attn_out = core_attn_out == nullptr
+ ? core_attn_out_chunk
+ : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);
+
+ // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
+ ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk);
+ //ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why?
+ ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t);
+
+ // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
+ ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
+ new_state = ggml_add(ctx0,
+ ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)),
+ ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
+ }
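+
+ // each iteration adds an inter-chunk term (the decayed query applied to the incoming
+ // state) to an intra-chunk term (attn applied to the corrected values v_new), then
+ // advances the state as in the reference: S <- S * exp(g_last) + key_gdiff^T @ v_new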
+
+ // truncate padded tokens
+ ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
+ S_v, n_tokens, H_v, n_seqs,
+ ggml_row_size(core_attn_out->type, S_v),
+ ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
+ ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
+ output_tokens = ggml_cont(ctx0, output_tokens);
+ cb(output_tokens, "output_tokens", il);
+
+ // permute back to (S_v, H_v, n_tokens, n_seqs)
+ output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3);
+ output_tokens = ggml_cont(ctx0, output_tokens);
+
+ return {output_tokens, new_state};
+}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35::build_delta_net_autoregressive(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ int il) {
+ const int64_t S_k = q->ne[0];
+ const int64_t H_k = q->ne[1];
+ const int64_t n_tokens = q->ne[2];
+ const int64_t n_seqs = q->ne[3];
+
+ const int64_t S_v = v->ne[0];
+ const int64_t H_v = v->ne[1];
+
+ GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
+ GGML_ASSERT(v->ne[2] == n_tokens);
+ GGML_ASSERT(k->ne[2] == n_tokens);
+ GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
+ GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
+ GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
+
+ GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+
+ GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
+
+ const float eps_norm = hparams.f_norm_rms_eps;
+
+ q = ggml_l2_norm(ctx0, q, eps_norm);
+ k = ggml_l2_norm(ctx0, k, eps_norm);
+
+ const float scale = 1.0f / sqrtf(S_v);
+
+ q = ggml_scale(ctx0, q, scale);
+ beta = ggml_sigmoid(ctx0, beta);
+
+ cb(q, "q_in", il);
+ cb(k, "k_in", il);
+ cb(v, "v_in", il);
+ cb(beta, "beta_in", il);
+ cb(g, "g_in", il);
+
+ state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
+
+ ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs);
+ ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
+
+ // Apply exponential to g_t
+ g_t = ggml_exp(ctx0, g_t);
+
+ // Apply the gated delta rule for the single timestep
+ // last_recurrent_state = last_recurrent_state * g_t
+ state = ggml_mul(ctx0, state, g_t);
+
+ // kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
+ ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
+ ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
+ // we need to sum over dim=-2, so we transpose, sum, then transpose again
+ kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));
+
+ // v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
+ ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
+ // delta = (v_t - kv_mem) * beta_t
+ ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
+ ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);
+
+ // last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
+ ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
+ state = ggml_add(ctx0, state, k_t_delta);
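+
+ // in equation form, the steps above implement the gated delta rule per head and
+ // per sequence, with S the (S_v x S_v) recurrent state:
+ //   S <- exp(g) * S
+ //   S <- S + k (beta * (v - S^T k))^T   (rank-1 delta update)
+ // the output computed below is then o = S^T q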
+
+ // Compute the attention output
+ // core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
+ ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
+ ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
+ // again, since it's over dim = -2, transpose, sum, transpose back
+ ggml_tensor * core_attn_out =
+ ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));
+
+ // core_attn_out should be [S_v, 1, H_v, n_seqs] after this
+ cb(core_attn_out, "output_tokens", il);
+ cb(state, "new_state", il);
+
+ return {core_attn_out, state};
+}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35::build_qkvz(
+ ggml_tensor * input,
+ int il) {
+ const int64_t n_seqs = ubatch.n_seqs;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input);
+ qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs);
+ cb(qkv_mixed, "linear_attn_qkv_mixed", il);
+
+ ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input);
+ cb(z, "z", il);
+
+ return { qkv_mixed, z };
+}
+
+ggml_tensor * llm_build_qwen35::build_norm_gated(
+ ggml_tensor * input,
+ ggml_tensor * weights,
+ ggml_tensor * gate,
+ int layer) {
+ ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer);
+ ggml_tensor * gated_silu = ggml_silu(ctx0, gate);
+
+ return ggml_mul(ctx0, normalized, gated_silu);
+}
+
+ggml_tensor * llm_build_qwen35::build_layer_attn(
+ llm_graph_input_attn_kv * inp,
+ ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ int * sections,
+ int il) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
+
+ // Qwen3Next uses a single Q projection that outputs query + gate
+ ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ]
+ cb(Qcur_full, "Qcur_full", il);
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens,
+ ggml_element_size(Qcur_full) * n_embd_head * 2,
+ ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, 0);
+ cb(Qcur, "Qcur_reshaped", il);
+
+ // Apply Q normalization
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ // Apply K normalization
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens,
+ ggml_element_size(Qcur_full) * n_embd_head * 2,
+ ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head,
+ ggml_element_size(Qcur_full) * n_embd_head);
+ gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
+ cb(gate, "gate_reshaped", il);
+
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ // Apply MRoPE
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_multi(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // Attention computation
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ cur = build_attn(inp,
+ nullptr, nullptr,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_pregate", il);
+
+ ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate);
+ cb(gate_sigmoid, "gate_sigmoid", il);
+
+ cur = ggml_mul(ctx0, cur, gate_sigmoid);
+ cb(cur, "attn_gated", il);
+
+ cur = build_lora_mm(model.layers[il].wo, cur);
+ cb(cur, "attn_output", il);
+
+ return cur;
+}
+
+ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
+ llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ ggml_tensor * causal_mask,
+ ggml_tensor * identity,
+ ggml_tensor * diag_mask,
+ int il) {
+ const auto * mctx_cur = inp->mctx;
+
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t n_seqs = ubatch.n_seqs;
+ const int64_t head_k_dim = hparams.ssm_d_state;
+ const int64_t num_k_heads = hparams.ssm_n_group;
+ const int64_t num_v_heads = hparams.ssm_dt_rank;
+ const int64_t head_v_dim = d_inner / num_v_heads;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ const auto kv_head = mctx_cur->get_head();
+
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+ // Input projections
+ auto qkvz = build_qkvz(cur, il);
+ ggml_tensor * qkv_mixed = qkvz.first;
+ ggml_tensor * z = qkvz.second;
+
+ ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur);
+ beta = ggml_reshape_4d(ctx0, beta, num_v_heads, 1, n_seq_tokens, n_seqs);
+ cb(beta, "beta", il);
+ ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur);
+ alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
+ cb(alpha, "alpha", il);
+
+ ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
+ ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
+ cb(alpha_softplus, "a_softplus", il);
+ ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
+ cb(gate, "gate", il);
+
+ // Get convolution states from cache
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+
+ // bool use_precomputed_states = n_seq_tokens == 1 && mctx_cur->has_previous_state();
+
+ // Build the convolution states tensor
+ ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+ cb(conv_states, "conv_states", il);
+
+ // Calculate convolution kernel size
+ ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d;
+ const int64_t conv_kernel_size = conv_kernel->ne[0];
+ const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;
+ conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
+ cb(conv_states, "conv_states_reshaped", il);
+
+ qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
+ cb(qkv_mixed, "qkv_mixed_permuted", il);
+
+ ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
+ cb(conv_input, "conv_input", il);
+
+ // Update convolution state cache
+ // Extract the last (conv_kernel_size - 1) states from conv_input
+ ggml_tensor * last_conv_states =
+ ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1],
+ conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input));
+ cb(last_conv_states, "last_conv_states", il);
+
+ ggml_tensor * state_update_target =
+ ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels * n_seqs,
+ kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all));
+ cb(state_update_target, "state_update_target", il);
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
+ cb(conv_states_all, "conv_states_updated", il);
+
+ // Apply SSM convolution
+ ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
+ cb(conv_output_proper, "conv_output_raw", il);
+
+ ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
+ cb(conv_output_silu, "conv_output_silu", il);
+
+ ggml_tensor * conv_qkv_mix = conv_output_silu;
+
+ // Calculate the total conv dimension
+ int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
+ int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);
+
+ // Extract the convolved Q, K, V from conv_output
+ ggml_tensor * q_conv =
+ ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0);
+ cb(q_conv, "q_conv", il);
+ ggml_tensor * k_conv =
+ ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv,
+ head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
+ cb(k_conv, "k_conv", il);
+ ggml_tensor * v_conv =
+ ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv,
+ 2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
+ cb(v_conv, "v_conv", il);
+
+ // Unsqueeze them
+ q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
+ k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
+ v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
+
+ ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
+ state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs);
+ cb(state, "state_predelta", il);
+
+ // if the K and V head counts differ, repeat Q/K so they match V's head count;
+ // V heads are in tiled order (from conversion), so a simple tiled repeat works
+ if (num_k_heads != num_v_heads) {
+ GGML_ASSERT(num_v_heads % num_k_heads == 0);
+ q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
+ k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
+ }
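+
+ // illustrative numbers: with num_k_heads = 16 and num_v_heads = 32, the 16 Q/K heads
+ // are tiled twice each to line up with the 32 V heads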
+
+ cb(q_conv, "q_conv_predelta", il);
+ cb(k_conv, "k_conv_predelta", il);
+ cb(v_conv, "v_conv_predelta", il);
+
+ // Choose between build_delta_net_chunking and build_delta_net_autoregressive based on the number of tokens per sequence
+ std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state)
+ if (n_seq_tokens == 1) {
+ attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
+ } else {
+ attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
+ }
+ ggml_tensor * output = attn_out.first;
+ ggml_tensor * new_state = attn_out.second;
+ cb(output, "attn_output", il);
+ cb(new_state, "new_state", il);
+
+ // Update the recurrent states
+ ggml_build_forward_expand(gf,
+ ggml_cpy(ctx0, new_state,
+ ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
+ kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
+
+ // Reshape both the attention output and z to 2D tensors for normalization
+ // output: collapsed to [head_v_dim, num_v_heads * n_seq_tokens * n_seqs]
+ ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
+
+ // z: likewise [head_v_dim, num_v_heads * n_seq_tokens * n_seqs]
+ ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
+
+ // Apply gated normalization: self.norm(core_attn_out, z)
+ ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il);
+
+ // Final reshape: [head_v_dim, num_v_heads * n_seq_tokens * n_seqs] -> [head_v_dim * num_v_heads, n_seq_tokens, n_seqs]
+ ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
+ cb(final_output, "final_output", il);
+
+ // Output projection
+ cur = build_lora_mm(model.layers[il].ssm_out, final_output);
+ cb(cur, "linear_attn_out", il);
+
+ // Reshape back to original dimensions
+ cur = ggml_cont_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
+ return cur;
+}
+
+ggml_tensor * llm_build_qwen35::build_layer_ffn(ggml_tensor * cur, const int il) {
+ // the dense Qwen3.5 variant does not use an MoE FFN
+ GGML_ASSERT(model.layers[il].ffn_gate_inp == nullptr);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ return cur;
+}
diff --git a/llama.cpp/src/models/qwen35moe.cpp b/llama.cpp/src/models/qwen35moe.cpp
new file mode 100644
index 0000000..0db8f82
--- /dev/null
+++ b/llama.cpp/src/models/qwen35moe.cpp
@@ -0,0 +1,774 @@
+#include "ggml.h"
+#include "models.h"
+
+#define CHUNK_SIZE 64
+
+llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context_mamba(params), model(model) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ int sections[4];
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ cb(inpL, "model.input_embed", -1);
+
+ auto * inp = build_inp_mem_hybrid();
+
+ ggml_tensor * inp_pos = build_inp_pos();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ ggml_tensor * causal_mask =
+ ggml_tri(ctx0, ggml_fill(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
+ GGML_TRI_TYPE_LOWER);
+
+ ggml_tensor * identity = ggml_diag(ctx0, ggml_fill(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
+ ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);
+
+ ggml_build_forward_expand(gf, causal_mask);
+ ggml_build_forward_expand(gf, identity);
+ ggml_build_forward_expand(gf, diag_mask);
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // Determine layer type and build appropriate attention mechanism
+ if (hparams.is_recurrent(il)) {
+ // Linear attention layer (gated delta net)
+ cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
+ } else {
+ // Full attention layer
+ cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // Residual connection
+ cur = ggml_add(ctx0, cur, inpSA);
+ cb(cur, "attn_residual", il);
+
+ // Save the tensor before post-attention norm for residual connection
+ ggml_tensor * ffn_residual = cur;
+
+ // Post-attention norm
+ ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il);
+ cb(attn_post_norm, "attn_post_norm", il);
+
+ // MOE FFN layer
+ cur = build_layer_ffn(attn_post_norm, il);
+ cb(cur, "ffn_out", il);
+
+ // Residual connection for FFN - add to the tensor from before post_attention_layernorm
+ cur = ggml_add(ctx0, cur, ffn_residual);
+ cb(cur, "post_moe", il);
+
+ // Input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ // Final norm
+ cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // LM head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+// utility to get one slice from the third dimension
+// input dim: [x, y, c, b]
+// output dim: [x, y, 1, b]
+static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
+ return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
+ t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
+}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35moe::build_delta_net_chunking(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ ggml_tensor * causal_mask,
+ ggml_tensor * identity,
+ ggml_tensor * diag_mask,
+ int il) {
+ const int64_t S_k = q->ne[0];
+ const int64_t H_k = q->ne[1];
+ const int64_t n_tokens = q->ne[2];
+ const int64_t n_seqs = q->ne[3];
+
+ const int64_t S_v = v->ne[0];
+ const int64_t H_v = v->ne[1];
+
+ GGML_ASSERT(v->ne[2] == n_tokens);
+ GGML_ASSERT(k->ne[2] == n_tokens);
+ GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
+ GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
+ GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
+
+ GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+
+ GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
+
+ const float eps_norm = hparams.f_norm_rms_eps;
+
+ q = ggml_l2_norm(ctx0, q, eps_norm);
+ k = ggml_l2_norm(ctx0, k, eps_norm);
+
+ const float scale = 1.0f / sqrtf(S_v);
+
+ q = ggml_scale(ctx0, q, scale);
+
+ beta = ggml_sigmoid(ctx0, beta);
+
+ cb(q, "q_in", il);
+ cb(k, "k_in", il);
+ cb(v, "v_in", il);
+ cb(beta, "beta_in", il);
+ cb(g, "g_in", il);
+
+ q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+ k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+ v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+ g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
+
+ beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
+ state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
+
+ cb(q, "q_perm", il);
+ cb(k, "k_perm", il);
+ cb(v, "v_perm", il);
+ cb(beta, "beta_perm", il);
+ cb(g, "g_perm", il);
+ cb(state, "state_in", il);
+
+ GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
+ GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
+ GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
+
+ // pad the token dimension up to a multiple of the chunk size
+ const int64_t chunk_size = CHUNK_SIZE;
+
+ const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size;
+ const int64_t n_chunks = (n_tokens + pad) / chunk_size;
+
+ q = ggml_pad(ctx0, q, 0, pad, 0, 0);
+ k = ggml_pad(ctx0, k, 0, pad, 0, 0);
+ v = ggml_pad(ctx0, v, 0, pad, 0, 0);
+ g = ggml_pad(ctx0, g, pad, 0, 0, 0);
+ beta = ggml_pad(ctx0, beta, 0, pad, 0, 0);
+
+ cb(q, "q_pad", il);
+ cb(k, "k_pad", il);
+ cb(v, "v_pad", il);
+ cb(beta, "beta_pad", il);
+ cb(g, "g_pad", il);
+
+ ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
+ ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
+
+ cb(v_beta, "v_beta", il);
+ cb(k_beta, "k_beta", il);
+
+ q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
+ k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
+ k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
+ v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
+ v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
+
+ g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
+ beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
+
+ ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
+ cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
+ ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
+
+ ggml_tensor * gcs_j_broadcast =
+ ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
+
+ ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
+ cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+ decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
+ decay_mask = ggml_exp(ctx0, decay_mask);
+ decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
+
+ ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
+
+ ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
+ ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
+ cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
+ ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
+
+ ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
+ attn = ggml_mul(ctx0, lin_solve, causal_mask);
+ attn = ggml_add(ctx0, attn, identity);
+ cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+ v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
+
+ ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
+ ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
+
+ ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
+ cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * k_cumdecay =
+ ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
+ cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q);
+ attn_kq = ggml_mul(ctx0, attn_kq, decay_mask);
+ attn_kq = ggml_mul(ctx0, attn_kq, diag_mask);
+ cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+
+ // vectorized calculation of key_gdiff
+ // improved from the chunked version:
+ // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
+ // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
+ // key_gdiff = key * g_diff.unsqueeze(-1)
+ // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
+ // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
+
+ // get last element in g_cumsum along chunk_size dimension (ne0)
+ // example: [[x, y, z, ..., last], ...] -> [[last], ...]
+ ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3],
+ g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3],
+ (g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum));
+ g_last = ggml_cont(ctx0, g_last);
+ cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
+ cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last));
+ cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
+ ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp,
+ 1, chunk_size, n_chunks, g_diff_exp->ne[3]);
+
+ ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t);
+ cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff));
+ cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs)
+
+
+ // state to be updated per chunk
+ ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
+ cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs)
+
+ // shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs)
+ ggml_tensor * core_attn_out = nullptr;
+
+ for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
+ // shape: (S_k, chunk_size, 1, H_k * n_seqs)
+ ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul
+
+ // shape: (S_v, chunk_size, 1, H_v * n_seqs)
+ ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat
+
+ // shape: (1, chunk_size, 1, H_v * n_seqs)
+ ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul
+
+ // shape: (S_k, chunk_size, 1, H_v * n_seqs)
+ ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat
+
+ // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
+ // replaced by precomputed attn_kq
+ ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk);
+ cb(attn_chunk, "attn_chunk", il);
+
+ ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
+
+ // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
+ ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
+ cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs)
+
+ // v_new = v_i - v_prime
+ ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime);
+ ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
+ cb(v_new, "v_new_chunk", il);
+
+ // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
+ ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk);
+ ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
+ cb(attn_inter, "attn_inter_chunk", il);
+
+ // core_attn_out[:, :, i] = attn_inter + attn @ v_new
+ ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk);
+ cb(v_attn, "v_attn_chunk", il);
+
+ ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
+ cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs)
+
+ core_attn_out = core_attn_out == nullptr
+ ? core_attn_out_chunk
+ : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);
+
+ // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
+ ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk);
+ //ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why?
+ ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t);
+
+ // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
+ ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
+ new_state = ggml_add(ctx0,
+ ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)),
+ ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
+ }
+
+ // truncate padded tokens
+ ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
+ S_v, n_tokens, H_v, n_seqs,
+ ggml_row_size(core_attn_out->type, S_v),
+ ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
+ ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
+ output_tokens = ggml_cont(ctx0, output_tokens);
+ cb(output_tokens, "output_tokens", il);
+
+ // permute back to (S_v, H_v, n_tokens, n_seqs)
+ output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3);
+ output_tokens = ggml_cont(ctx0, output_tokens);
+
+ return {output_tokens, new_state};
+}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35moe::build_delta_net_autoregressive(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ int il) {
+ const int64_t S_k = q->ne[0];
+ const int64_t H_k = q->ne[1];
+ const int64_t n_tokens = q->ne[2];
+ const int64_t n_seqs = q->ne[3];
+
+ const int64_t S_v = v->ne[0];
+ const int64_t H_v = v->ne[1];
+
+ GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
+ GGML_ASSERT(v->ne[2] == n_tokens);
+ GGML_ASSERT(k->ne[2] == n_tokens);
+ GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
+ GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
+ GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
+
+ GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+
+ GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
+
+ const float eps_norm = hparams.f_norm_rms_eps;
+
+ q = ggml_l2_norm(ctx0, q, eps_norm);
+ k = ggml_l2_norm(ctx0, k, eps_norm);
+
+ const float scale = 1.0f / sqrtf(S_v);
+
+ q = ggml_scale(ctx0, q, scale);
+ beta = ggml_sigmoid(ctx0, beta);
+
+ cb(q, "q_in", il);
+ cb(k, "k_in", il);
+ cb(v, "v_in", il);
+ cb(beta, "beta_in", il);
+ cb(g, "g_in", il);
+
+ state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
+
+ ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs);
+ ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
+
+ // Apply exponential to g_t
+ g_t = ggml_exp(ctx0, g_t);
+
+ // Apply the gated delta rule for the single timestep
+ // last_recurrent_state = last_recurrent_state * g_t
+ state = ggml_mul(ctx0, state, g_t);
+
+ // kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
+ ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
+ ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
+ // we need to sum over dim=-2, so we transpose, sum, then transpose again
+ kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));
+
+ // v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
+ ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
+ // delta = (v_t - kv_mem) * beta_t
+ ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
+ ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);
+
+ // last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
+ ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
+ state = ggml_add(ctx0, state, k_t_delta);
+
+ // Compute the attention output
+ // core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
+ ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
+ ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
+ // again, since it's over dim = -2, transpose, sum, transpose back
+ ggml_tensor * core_attn_out =
+ ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));
+
+ // core_attn_out should be [S_v, 1, H_v, n_seqs] after this
+ cb(core_attn_out, "output_tokens", il);
+ cb(state, "new_state", il);
+
+ return {core_attn_out, state};
+}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35moe::build_qkvz(
+ ggml_tensor * input,
+ int il) {
+ const int64_t n_seqs = ubatch.n_seqs;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input);
+ qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs);
+ cb(qkv_mixed, "linear_attn_qkv_mixed", il);
+
+ ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input);
+ cb(z, "z", il);
+
+ return { qkv_mixed, z };
+}
+
+ggml_tensor * llm_build_qwen35moe::build_norm_gated(
+ ggml_tensor * input,
+ ggml_tensor * weights,
+ ggml_tensor * gate,
+ int layer) {
+ ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer);
+ ggml_tensor * gated_silu = ggml_silu(ctx0, gate);
+
+ return ggml_mul(ctx0, normalized, gated_silu);
+}
+
+ggml_tensor * llm_build_qwen35moe::build_layer_attn(
+ llm_graph_input_attn_kv * inp,
+ ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ int * sections,
+ int il) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
+
+ // Qwen3Next uses a single Q projection that outputs query + gate
+ ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ]
+ cb(Qcur_full, "Qcur_full", il);
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens,
+ ggml_element_size(Qcur_full) * n_embd_head * 2,
+ ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, 0);
+ cb(Qcur, "Qcur_reshaped", il);
+
+ // Apply Q normalization
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ // Apply K normalization
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens,
+ ggml_element_size(Qcur_full) * n_embd_head * 2,
+ ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head,
+ ggml_element_size(Qcur_full) * n_embd_head);
+ gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
+ cb(gate, "gate_reshaped", il);
+
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ // Apply MRoPE
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_multi(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // Attention computation
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ cur = build_attn(inp,
+ nullptr, nullptr,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_pregate", il);
+
+ ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate);
+ cb(gate_sigmoid, "gate_sigmoid", il);
+
+ cur = ggml_mul(ctx0, cur, gate_sigmoid);
+ cb(cur, "attn_gated", il);
+
+ cur = build_lora_mm(model.layers[il].wo, cur);
+ cb(cur, "attn_output", il);
+
+ return cur;
+}
+
+ggml_tensor * llm_build_qwen35moe::build_layer_attn_linear(
+ llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ ggml_tensor * causal_mask,
+ ggml_tensor * identity,
+ ggml_tensor * diag_mask,
+ int il) {
+ const auto * mctx_cur = inp->mctx;
+
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t n_seqs = ubatch.n_seqs;
+ const int64_t head_k_dim = hparams.ssm_d_state;
+ const int64_t num_k_heads = hparams.ssm_n_group;
+ const int64_t num_v_heads = hparams.ssm_dt_rank;
+ const int64_t head_v_dim = d_inner / num_v_heads;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ const auto kv_head = mctx_cur->get_head();
+
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+ // Input projections
+ auto qkvz = build_qkvz(cur, il);
+ ggml_tensor * qkv_mixed = qkvz.first;
+ ggml_tensor * z = qkvz.second;
+
+ ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur);
+ beta = ggml_reshape_4d(ctx0, beta, num_v_heads, 1, n_seq_tokens, n_seqs);
+ cb(beta, "beta", il);
+ ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur);
+ alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
+ cb(alpha, "alpha", il);
+
+ ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
+ ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
+ cb(alpha_softplus, "a_softplus", il);
+ ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
+ cb(gate, "gate", il);
+
+ // Get convolution states from cache
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+
+ // bool use_precomputed_states = n_seq_tokens == 1 && mctx_cur->has_previous_state();
+
+ // Build the convolution states tensor
+ ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+ cb(conv_states, "conv_states", il);
+
+ // Calculate convolution kernel size
+ ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d;
+ const int64_t conv_kernel_size = conv_kernel->ne[0];
+ const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;
+ conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
+ cb(conv_states, "conv_states_reshaped", il);
+
+ qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
+ cb(qkv_mixed, "qkv_mixed_permuted", il);
+
+ ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
+ cb(conv_input, "conv_input", il);
+
+ // Update convolution state cache
+ // Extract the last (conv_kernel_size - 1) states from conv_input
+ ggml_tensor * last_conv_states =
+ ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1],
+ conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input));
+ cb(last_conv_states, "last_conv_states", il);
+
+ ggml_tensor * state_update_target =
+ ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels * n_seqs,
+ kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all));
+ cb(state_update_target, "state_update_target", il);
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
+ cb(conv_states_all, "conv_states_updated", il);
+
+ // Apply SSM convolution
+ ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
+ cb(conv_output_proper, "conv_output_raw", il);
+
+ ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
+ cb(conv_output_silu, "conv_output_silu", il);
+
+ ggml_tensor * conv_qkv_mix = conv_output_silu;
+
+ // Calculate the total conv dimension
+ int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
+ int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);
+
+ // Extract the convolved Q, K, V from conv_output
+ ggml_tensor * q_conv =
+ ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0);
+ cb(q_conv, "q_conv", il);
+ ggml_tensor * k_conv =
+ ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv,
+ head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
+ cb(k_conv, "k_conv", il);
+ ggml_tensor * v_conv =
+ ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv,
+ 2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
+ cb(v_conv, "v_conv", il);
+
+ // Unsqueeze them
+ q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
+ k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
+ v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
+
+ ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
+ state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs);
+ cb(state, "state_predelta", il);
+
+ // if the key and value head counts differ, repeat Q/K to match V's head count
+ // V heads are in tiled order (from conversion), so a simple tiled repeat works
+ if (num_k_heads != num_v_heads) {
+ GGML_ASSERT(num_v_heads % num_k_heads == 0);
+ q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
+ k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
+ }
+
+ cb(q_conv, "q_conv_predelta", il);
+ cb(k_conv, "k_conv_predelta", il);
+ cb(v_conv, "v_conv_predelta", il);
+
+ // Choose between build_delta_net_autoregressive (single-token decode) and build_delta_net_chunking (prompt processing) based on n_seq_tokens
+ std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state)
+ if (n_seq_tokens == 1) {
+ attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
+ } else {
+ attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
+ }
+ ggml_tensor * output = attn_out.first;
+ ggml_tensor * new_state = attn_out.second;
+ cb(output, "attn_output", il);
+ cb(new_state, "new_state", il);
+
+ // Update the recurrent states
+ ggml_build_forward_expand(gf,
+ ggml_cpy(ctx0, new_state,
+ ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
+ kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
+
+ // Reshape both the attention output and z to 2D tensors for normalization
+ // output: [head_dim, n_heads, n_tokens, n_seqs] -> [head_dim, n_heads * n_tokens * n_seqs]
+ ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
+
+ // z: [head_dim, n_heads, n_tokens, n_seqs] -> [head_dim, n_heads * n_tokens * n_seqs]
+ ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
+
+ // Apply gated normalization: self.norm(core_attn_out, z)
+ ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il);
+
+ // Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * head_dim, n_tokens, n_seqs]
+ ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
+ cb(final_output, "final_output", il);
+
+ // Output projection
+ cur = build_lora_mm(model.layers[il].ssm_out, final_output);
+ cb(cur, "linear_attn_out", il);
+
+ // Reshape back to original dimensions
+ cur = ggml_cont_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
+ return cur;
+}
+
+ggml_tensor * llm_build_qwen35moe::build_layer_ffn(ggml_tensor * cur, const int il) {
+ // Check if this is an MoE layer
+ GGML_ASSERT(model.layers[il].ffn_gate_inp != nullptr);
+
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used, LLM_FFN_SILU,
+ true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // Add shared experts if present - following Qwen3Next reference implementation
+ if (model.layers[il].ffn_up_shexp != nullptr) {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ // Apply shared expert gating as in the reference implementation
+ // The shared expert has its own gate that is sigmoided
+ // Note: ffn_gate_inp_shexp is the shared expert gate (outputs 1 value per token)
+ ggml_tensor * shared_gate = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
+ cb(shared_gate, "shared_expert_gate", il);
+
+ // Apply sigmoid to the gate
+ shared_gate = ggml_sigmoid(ctx0, shared_gate);
+ cb(shared_gate, "shared_expert_gate_sigmoid", il);
+
+ // Apply the gate to the shared expert output
+ ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate);
+ cb(ffn_shexp, "ffn_shexp_gated", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ } else {
+ cur = moe_out;
+ }
+
+ return cur;
+}
diff --git a/llama.cpp/src/models/qwen3moe.cpp b/llama.cpp/src/models/qwen3moe.cpp
new file mode 100644
index 0000000..888534f
--- /dev/null
+++ b/llama.cpp/src/models/qwen3moe.cpp
@@ -0,0 +1,124 @@
+#include "models.h"
+
+llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+ cur = moe_out;
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/qwen3next.cpp b/llama.cpp/src/models/qwen3next.cpp
new file mode 100644
index 0000000..99b1a76
--- /dev/null
+++ b/llama.cpp/src/models/qwen3next.cpp
@@ -0,0 +1,879 @@
+#include "ggml.h"
+#include "models.h"
+
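+// Chunk length for the chunked delta-net path: prompts are padded to a
+// multiple of CHUNK_SIZE and processed chunk by chunk, and the precomputed
+// causal/identity/diag masks below are sized CHUNK_SIZE x CHUNK_SIZE.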
+#define CHUNK_SIZE 64
+
+llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context_mamba(params), model(model) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+ cb(inpL, "model.embed_tokens", -1);
+
+ auto * inp = build_inp_mem_hybrid();
+
+ ggml_tensor * inp_pos = build_inp_pos();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ ggml_tensor * causal_mask =
+ ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
+ GGML_TRI_TYPE_LOWER);
+
+ ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
+ ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);
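+ // mask layout, shared by all recurrent layers:
+ // causal_mask: strictly lower-triangular ones (token i attends to j < i)
+ // identity: CHUNK_SIZE x CHUNK_SIZE identity matrix
+ // diag_mask: causal_mask + identity, i.e. the lower triangle including the diagonal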
+
+ ggml_build_forward_expand(gf, causal_mask);
+ ggml_build_forward_expand(gf, identity);
+ ggml_build_forward_expand(gf, diag_mask);
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // Determine layer type and build appropriate attention mechanism
+ if (hparams.is_recurrent(il)) {
+ // Linear attention layer (gated delta net)
+ cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
+ } else {
+ // Full attention layer
+ cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // Residual connection
+ cur = ggml_add(ctx0, cur, inpSA);
+ cb(cur, "attn_residual", il);
+
+ // Save the tensor before post-attention norm for residual connection
+ ggml_tensor * ffn_residual = cur;
+
+ // Post-attention norm
+ ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il);
+ cb(attn_post_norm, "attn_post_norm", il);
+
+ // FFN layer (MoE or dense) - without residual connection
+ cur = build_layer_ffn(attn_post_norm, il);
+ cb(cur, "ffn_out", il);
+
+ // Residual connection for FFN - add to the tensor from before post_attention_layernorm
+ cur = ggml_add(ctx0, cur, ffn_residual);
+ cb(cur, "post_moe", il);
+
+ // Input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ // Final norm
+ cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // LM head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+// utility to get one slice from the third dimension
+// input dim: [x, y, c, b]
+// output dim: [x, y, 1, b]
+static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
+ return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
+ t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
+}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chunking(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ ggml_tensor * causal_mask,
+ ggml_tensor * identity,
+ ggml_tensor * diag_mask,
+ int il) {
+ const int64_t S_k = q->ne[0];
+ const int64_t H_k = q->ne[1];
+ const int64_t n_tokens = q->ne[2];
+ const int64_t n_seqs = q->ne[3];
+
+ const int64_t S_v = v->ne[0];
+ const int64_t H_v = v->ne[1];
+
+ GGML_ASSERT(v->ne[2] == n_tokens);
+ GGML_ASSERT(k->ne[2] == n_tokens);
+ GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
+ GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
+ GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
+
+ GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+
+ GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
+
+ const float eps_norm = hparams.f_norm_rms_eps;
+
+ q = ggml_l2_norm(ctx0, q, eps_norm);
+ k = ggml_l2_norm(ctx0, k, eps_norm);
+
+ const float scale = 1.0f / sqrtf(S_v);
+
+ q = ggml_scale(ctx0, q, scale);
+
+ beta = ggml_sigmoid(ctx0, beta);
+
+ cb(q, "q_in", il);
+ cb(k, "k_in", il);
+ cb(v, "v_in", il);
+ cb(beta, "beta_in", il);
+ cb(g, "g_in", il);
+
+ q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+ k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+ v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+ g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
+
+ beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
+ state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
+
+ cb(q, "q_perm", il);
+ cb(k, "k_perm", il);
+ cb(v, "v_perm", il);
+ cb(beta, "beta_perm", il);
+ cb(g, "g_perm", il);
+ cb(state, "state_in", il);
+
+ GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
+ GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
+ GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
+
+ // Do padding
+ const int64_t chunk_size = CHUNK_SIZE;
+
+ const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size;
+ const int64_t n_chunks = (n_tokens + pad) / chunk_size;
+
+ q = ggml_pad(ctx0, q, 0, pad, 0, 0);
+ k = ggml_pad(ctx0, k, 0, pad, 0, 0);
+ v = ggml_pad(ctx0, v, 0, pad, 0, 0);
+ g = ggml_pad(ctx0, g, pad, 0, 0, 0);
+ beta = ggml_pad(ctx0, beta, 0, pad, 0, 0);
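+ // q/k/v/beta keep tokens on ne[1], so they pad dim 1; g was permuted with
+ // tokens on ne[0], hence the pad on dim 0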
+
+ cb(q, "q_pad", il);
+ cb(k, "k_pad", il);
+ cb(v, "v_pad", il);
+ cb(beta, "beta_pad", il);
+ cb(g, "g_pad", il);
+
+ ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
+ ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
+
+ cb(v_beta, "v_beta", il);
+ cb(k_beta, "k_beta", il);
+
+ q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
+ k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
+ k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
+ v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
+ v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
+
+ g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
+ beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
+
+ ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
+ cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
+ ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
+
+ ggml_tensor * gcs_j_broadcast =
+ ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
+
+ ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
+ cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+ decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
+ decay_mask = ggml_exp(ctx0, decay_mask);
+ decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
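+ // masking on both sides of the exp keeps this numerically safe: the first
+ // mul zeroes the upper triangle so exp() maps it to 1 (instead of
+ // exp(large)), and the second mul clears those 1s again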
+
+ ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
+
+ ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
+ ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
+ cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
+ ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
+
+ ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
+ attn = ggml_mul(ctx0, lin_solve, causal_mask);
+ attn = ggml_add(ctx0, attn, identity);
+ cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+ v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
+
+ ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
+ ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
+
+ ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
+ cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * k_cumdecay =
+ ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
+ cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q);
+ attn_kq = ggml_mul(ctx0, attn_kq, decay_mask);
+ attn_kq = ggml_mul(ctx0, attn_kq, diag_mask);
+ cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
+
+
+ // vectorized calculation of key_gdiff
+ // improved from the chunked version:
+ // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
+ // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
+ // key_gdiff = key * g_diff.unsqueeze(-1)
+ // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
+ // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
+
+ // get last element in g_cumsum along chunk_size dimension (ne0)
+ // example: [[x, y, z, ..., last], ...] -> [[last], ...]
+ ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3],
+ g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3],
+ (g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum));
+ g_last = ggml_cont(ctx0, g_last);
+ cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
+ cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last));
+ cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
+ ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp,
+ 1, chunk_size, n_chunks, g_diff_exp->ne[3]);
+
+ ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t);
+ cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
+
+ ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff));
+ cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs)
+
+
+ // state to be updated per chunk
+ ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
+ cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs)
+
+ // shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs)
+ ggml_tensor * core_attn_out = nullptr;
+
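+ // per-chunk recurrence, with S the running state carried across chunks:
+ // v_new = v_i - k_cumdecay_i @ S (subtract what S already stores)
+ // out_i = (q_i * exp(gcs_i)) @ S + attn_kq_i @ v_new
+ // S = S * exp(g_last_i) + key_gdiff_i^T @ v_new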
+ for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
+ // shape: (S_k, chunk_size, 1, H_k * n_seqs)
+ ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul
+
+ // shape: (S_v, chunk_size, 1, H_v * n_seqs)
+ ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat
+
+ // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
+ ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul
+
+ // shape: (chunk_size, 1, H_v * n_seqs)
+ ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat
+
+ // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
+ // replaced by precomputed attn_kq
+ ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk);
+ cb(attn_chunk, "attn_chunk", il);
+
+ ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
+
+ // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
+ ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
+ cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs)
+
+ // v_new = v_i - v_prime
+ ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime);
+ ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
+ cb(v_new, "v_new_chunk", il);
+
+ // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
+ ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk);
+ ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
+ cb(attn_inter, "attn_inter_chunk", il);
+
+ // core_attn_out[:, :, i] = attn_inter + attn @ v_new
+ ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk);
+ cb(v_attn, "v_attn_chunk", il);
+
+ ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
+ cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs)
+
+ core_attn_out = core_attn_out == nullptr
+ ? core_attn_out_chunk
+ : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);
+
+ // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
+ ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk);
+ //ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // equivalent, but measured slower on Metal (cause unclear)
+ ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t);
+
+ // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
+ ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
+ new_state = ggml_add(ctx0,
+ ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)),
+ ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
+ }
+
+ // truncate padded tokens
+ ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
+ S_v, n_tokens, H_v, n_seqs,
+ ggml_row_size(core_attn_out->type, S_v),
+ ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
+ ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
+ output_tokens = ggml_cont(ctx0, output_tokens);
+ cb(output_tokens, "output_tokens", il);
+
+ // permute back to (S_v, H_v, n_tokens, n_seqs)
+ output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3);
+ output_tokens = ggml_cont(ctx0, output_tokens);
+
+ return {output_tokens, new_state};
+}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_autoregressive(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * beta,
+ ggml_tensor * state,
+ int il) {
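+ // gated delta rule for one timestep, per head (mirrors the torch reference):
+ // S = S * exp(g_t)
+ // kv = (S * k_t).sum(-2)
+ // S = S + k_t outer (beta_t * (v_t - kv))
+ // o_t = (S * q_t).sum(-2)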
+ const int64_t S_k = q->ne[0];
+ const int64_t H_k = q->ne[1];
+ const int64_t n_tokens = q->ne[2];
+ const int64_t n_seqs = q->ne[3];
+
+ const int64_t S_v = v->ne[0];
+ const int64_t H_v = v->ne[1];
+
+ GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
+ GGML_ASSERT(v->ne[2] == n_tokens);
+ GGML_ASSERT(k->ne[2] == n_tokens);
+ GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
+ GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
+ GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
+
+ GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+
+ GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
+
+ const float eps_norm = hparams.f_norm_rms_eps;
+
+ q = ggml_l2_norm(ctx0, q, eps_norm);
+ k = ggml_l2_norm(ctx0, k, eps_norm);
+
+ const float scale = 1.0f / sqrtf(S_v);
+
+ q = ggml_scale(ctx0, q, scale);
+ beta = ggml_sigmoid(ctx0, beta);
+
+ cb(q, "q_in", il);
+ cb(k, "k_in", il);
+ cb(v, "v_in", il);
+ cb(beta, "beta_in", il);
+ cb(g, "g_in", il);
+
+ state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
+
+ ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs);
+ ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
+
+ // Apply exponential to g_t
+ g_t = ggml_exp(ctx0, g_t);
+
+ // Apply the gated delta rule for the single timestep
+ // last_recurrent_state = last_recurrent_state * g_t
+ state = ggml_mul(ctx0, state, g_t);
+
+ // kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
+ ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
+ ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
+ // we need to sum over dim=-2, so we transpose, sum, then transpose again
+ kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));
+
+ // v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
+ ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
+ // delta = (v_t - kv_mem) * beta_t
+ ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
+ ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);
+
+ // last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
+ ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
+ state = ggml_add(ctx0, state, k_t_delta);
+
+ // Compute the attention output
+ // core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
+ ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
+ ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
+ // again, since it's over dim = -2, transpose, sum, transpose back
+ ggml_tensor * core_attn_out =
+ ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));
+
+ // core_attn_out should be [S_v, 1, H_v, n_seqs] after this
+ cb(core_attn_out, "output_tokens", il);
+ cb(state, "new_state", il);
+
+ return {core_attn_out, state};
+}
+
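+// gated RMS norm: y = RMSNorm(input; weights) * SiLU(gate)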
+ggml_tensor * llm_build_qwen3next::build_norm_gated(
+ ggml_tensor * input,
+ ggml_tensor * weights,
+ ggml_tensor * gate,
+ int layer) {
+ ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer);
+ ggml_tensor * gated_silu = ggml_silu(ctx0, gate);
+
+ return ggml_mul(ctx0, normalized, gated_silu);
+}
+
+ggml_tensor * llm_build_qwen3next::build_layer_attn(
+ llm_graph_input_attn_kv * inp,
+ ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ int il) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
+
+ // Qwen3Next uses a single Q projection that outputs query + gate
+ ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur_full, "Qcur_full", il);
+
+ Qcur_full = ggml_reshape_4d(ctx0, Qcur_full, n_embd_head * 2, n_head, n_tokens, 1);
+
+ // Split Q projection into query and gate
+ // The split should be along dimension 0 (the feature dimension)
+ ggml_tensor * Qcur = ggml_view_4d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, 1,
+ Qcur_full->nb[1], Qcur_full->nb[2], Qcur_full->nb[3], 0);
+ ggml_tensor * gate =
+ ggml_view_4d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, 1,
+ Qcur_full->nb[1], Qcur_full->nb[2], Qcur_full->nb[3], n_embd_head * ggml_element_size(Qcur_full));
+ cb(Qcur, "Qcur", il);
+ cb(gate, "gate", il);
+
+ // Now reshape Qcur to [n_embd_head, n_head, n_tokens] for multi-head attention
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ cb(Qcur, "Qcur_reshaped", il);
+
+ // Apply Q normalization
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ // Apply K normalization
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ // Reshape gate to [n_embd, n_tokens] for the sigmoid gating (flatten the heads)
+ gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
+ cb(gate, "gate_reshaped", il);
+
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ // Apply RoPE
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base,
+ freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // Attention computation
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ cur = build_attn(inp,
+ nullptr, nullptr,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_pregate", il);
+
+ ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate);
+ cb(gate_sigmoid, "gate_sigmoid", il);
+
+ cur = ggml_mul(ctx0, cur, gate_sigmoid);
+ cb(cur, "attn_gated", il);
+
+ cur = build_lora_mm(model.layers[il].wo, cur);
+ cb(cur, "attn_output", il);
+
+ return cur;
+}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_qkvz(
+ ggml_tensor * input,
+ int il) {
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t n_seqs = ubatch.n_seqs;
+ const int64_t head_k_dim = hparams.ssm_d_state;
+ const int64_t num_k_heads = hparams.ssm_n_group;
+ const int64_t num_v_heads = hparams.ssm_dt_rank;
+ const int64_t head_v_dim = d_inner / num_v_heads;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ if (model.layers[il].wqkv) {
+ // optimized path
+ ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input);
+ qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs);
+ cb(qkv_mixed, "linear_attn_qkv_mixed", il);
+
+ ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input);
+ cb(z, "z", il);
+
+ return { qkv_mixed, z };
+
+ } else {
+ // legacy (slower) path
+ ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, input);
+ cb(mixed_qkvz, "linear_attn_mixed_qkvz", il);
+
+ int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
+ ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
+
+ // Split mixed_qkvz into query, key, value, z
+ int64_t split_sizes_qkvz[4] = {
+ head_k_dim, // query size
+ head_k_dim, // key size
+ head_v_dim * num_v_heads / num_k_heads, // value size
+ head_v_dim * num_v_heads / num_k_heads // z size
+ };
+
+ ggml_tensor * query =
+ ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs,
+ mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0);
+ cb(query, "q", il);
+
+ ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs,
+ mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
+ split_sizes_qkvz[0] * ggml_element_size(mixed_qkvz_reshaped));
+ cb(key, "k", il);
+
+ ggml_tensor * value =
+ ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs,
+ mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
+ (split_sizes_qkvz[0] + split_sizes_qkvz[1]) * ggml_element_size(mixed_qkvz_reshaped));
+ cb(value, "v", il);
+
+ ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs,
+ mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
+ (split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * ggml_element_size(mixed_qkvz_reshaped));
+ z = ggml_cont(ctx0, z);
+ cb(z, "z", il);
+
+ // After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
+ // query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
+ ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
+ cb(query_flat, "query_flat", il);
+
+ // key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
+ ggml_tensor * key_flat = ggml_cont_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
+ cb(key_flat, "key_flat", il);
+
+ // value_reshaped: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
+ ggml_tensor * value_flat = ggml_cont_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
+ cb(value_flat, "value_flat", il);
+
+ // Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
+ ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
+ qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
+ cb(qkv_mixed, "qkv_mixed", il);
+
+ return { qkv_mixed, z };
+ }
+}
+
+ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
+ llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ ggml_tensor * causal_mask,
+ ggml_tensor * identity,
+ ggml_tensor * diag_mask,
+ int il) {
+ const auto * mctx_cur = inp->mctx;
+
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t n_seqs = ubatch.n_seqs;
+ const int64_t head_k_dim = hparams.ssm_d_state;
+ const int64_t num_k_heads = hparams.ssm_n_group;
+ const int64_t num_v_heads = hparams.ssm_dt_rank;
+ const int64_t head_v_dim = d_inner / num_v_heads;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+ const auto kv_head = mctx_cur->get_head();
+
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs());
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+ // Input projections
+ auto qkvz = build_qkvz(cur, il);
+ ggml_tensor * qkv_mixed = qkvz.first;
+ ggml_tensor * z = qkvz.second;
+
+ ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur);
+ cb(mixed_ba, "linear_attn_mixed_ba", il);
+
+ // Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
+ int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
+ ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
+
+ // Split mixed_ba into b and a (beta and alpha parameters)
+ int64_t split_sizes_ba[2] = {
+ num_v_heads / num_k_heads, // beta size
+ num_v_heads / num_k_heads // alpha size
+ };
+
+ ggml_tensor * b = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[0], num_k_heads, n_seq_tokens, n_seqs,
+ mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3], 0);
+ cb(b, "b", il);
+
+ ggml_tensor * a = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[1], num_k_heads, n_seq_tokens, n_seqs,
+ mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3],
+ split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped));
+ cb(a, "a", il);
+
+ ggml_tensor * beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs);
+
+ // Reshape a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
+ ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
+
+ ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
+ ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
+ cb(alpha_softplus, "a_softplus", il);
+ ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
+ cb(gate, "gate", il);
+
+ // Get convolution states from cache
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+
+ // bool use_precomputed_states = n_seq_tokens == 1 && mctx_cur->has_previous_state();
+
+ // Build the convolution states tensor
+ ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+ cb(conv_states, "conv_states", il);
+
+ // Calculate convolution kernel size
+ ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d;
+ const int64_t conv_kernel_size = conv_kernel->ne[0];
+ const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;
+ conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
+ cb(conv_states, "conv_states_reshaped", il);
+
+ qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
+ cb(qkv_mixed, "qkv_mixed_permuted", il);
+
+ ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
+ cb(conv_input, "conv_input", il);
+
+ // Update convolution state cache
+ // Extract the last (conv_kernel_size - 1) states from conv_input
+ ggml_tensor * last_conv_states =
+ ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1],
+ conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input));
+ cb(last_conv_states, "last_conv_states", il);
+
+ ggml_tensor * state_update_target =
+ ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels * n_seqs,
+ kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all));
+ cb(state_update_target, "state_update_target", il);
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
+ cb(conv_states_all, "conv_states_updated", il);
+
+ // Apply SSM convolution
+ ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
+ cb(conv_output_proper, "conv_output_raw", il);
+
+ ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
+ cb(conv_output_silu, "conv_output_silu", il);
+
+ ggml_tensor * conv_qkv_mix = conv_output_silu;
+
+ // Calculate the total conv dimension
+ int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
+ int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);
+
+ // Extract the convolved Q, K, V from conv_output
+ ggml_tensor * q_conv =
+ ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0);
+ cb(q_conv, "q_conv", il);
+ ggml_tensor * k_conv =
+ ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv,
+ head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
+ cb(k_conv, "k_conv", il);
+ ggml_tensor * v_conv =
+ ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv,
+ 2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
+ cb(v_conv, "v_conv", il);
+
+ // Unsqueeze them
+ q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
+ k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
+ v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
+
+ ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
+ state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs);
+ cb(state, "state_predelta", il);
+
+ // if the key and value head counts differ, repeat-interleave Q/K so their head count matches V's
+ if (num_k_heads != num_v_heads) {
+ GGML_ASSERT(num_v_heads % num_k_heads == 0);
+ int64_t repeat_factor = num_v_heads / num_k_heads;
+
+ // repeat interleave: reshape to (repeat part, 1, remaining part), do repeat, then reshape back
+ ggml_tensor * q_reshaped = ggml_reshape_3d(ctx0, q_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs);
+ ggml_tensor * k_reshaped = ggml_reshape_3d(ctx0, k_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs);
+
+ // Repeat along the third dimension (the new dimension with size 1)
+ ggml_tensor * q_repeated =
+ ggml_repeat_4d(ctx0, q_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1);
+ ggml_tensor * k_repeated =
+ ggml_repeat_4d(ctx0, k_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1);
+
+ // Reshape back to merge the head and repeat dimensions
+ // From [head_dim, num_k_heads, repeat_factor, n_seq_tokens * n_seqs]
+ // Back to [head_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs]
+ q_conv = ggml_reshape_4d(ctx0, q_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs);
+ k_conv = ggml_reshape_4d(ctx0, k_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs);
+ }
+
+ cb(q_conv, "q_conv_predelta", il);
+ cb(k_conv, "k_conv_predelta", il);
+ cb(v_conv, "v_conv_predelta", il);
+
+ // Choose between build_delta_net_autoregressive (single-token decode) and build_delta_net_chunking (prompt processing) based on n_seq_tokens
+ std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state)
+ if (n_seq_tokens == 1) {
+ attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
+ } else {
+ attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
+ }
+ ggml_tensor * output = attn_out.first;
+ ggml_tensor * new_state = attn_out.second;
+ cb(output, "attn_output", il);
+ cb(new_state, "new_state", il);
+
+ // Update the recurrent states
+ ggml_build_forward_expand(gf,
+ ggml_cpy(ctx0, new_state,
+ ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
+ kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
+
+ // Reshape both the attention output and z to 2D tensors for normalization
+ // output: [head_dim, n_heads, n_tokens, n_seqs] -> [head_dim, n_heads * n_tokens * n_seqs]
+ ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
+
+ // z: [head_dim, n_heads, n_tokens, n_seqs] -> [head_dim, n_heads * n_tokens * n_seqs]
+ ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
+
+ // Apply gated normalization: self.norm(core_attn_out, z)
+ ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il);
+
+ // Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * head_dim, n_tokens, n_seqs]
+ ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
+ cb(final_output, "final_output", il);
+
+ // Output projection
+ cur = build_lora_mm(model.layers[il].ssm_out, final_output);
+ cb(cur, "linear_attn_out", il);
+
+ // Reshape back to original dimensions
+ cur = ggml_cont_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
+ return cur;
+}
+
+ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int il) {
+ // Check if this is an MoE layer
+ if (model.layers[il].ffn_gate_inp != nullptr) {
+ // MoE branch
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used, LLM_FFN_SILU,
+ true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // Add shared experts if present - following Qwen3Next reference implementation
+ if (model.layers[il].ffn_up_shexp != nullptr) {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ // Apply shared expert gating as in the reference implementation
+ // The shared expert has its own gate that is sigmoided
+ // Note: ffn_gate_inp_shexp is the shared expert gate (outputs 1 value per token)
+ ggml_tensor * shared_gate = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
+ cb(shared_gate, "shared_expert_gate", il);
+
+ // Apply sigmoid to the gate
+ shared_gate = ggml_sigmoid(ctx0, shared_gate);
+ cb(shared_gate, "shared_expert_gate_sigmoid", il);
+
+ // Apply the gate to the shared expert output
+ ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate);
+ cb(ffn_shexp, "ffn_shexp_gated", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ } else {
+ cur = moe_out;
+ }
+ } else {
+ // Dense FFN branch (appears unused by current Qwen3Next configurations, which are MoE throughout)
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ return cur;
+}
diff --git a/llama.cpp/src/models/qwen3vl-moe.cpp b/llama.cpp/src/models/qwen3vl-moe.cpp
new file mode 100644
index 0000000..e5e1a21
--- /dev/null
+++ b/llama.cpp/src/models/qwen3vl-moe.cpp
@@ -0,0 +1,140 @@
+#include "models.h"
+
+llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const size_t n_deepstack_layers = hparams.n_deepstack_layers;
+
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ int sections[4];
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_multi(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+ cur = moe_out;
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
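+ // deepstack: the multimodal input embedding carries n_deepstack_layers extra
+ // n_embd-wide slices after the main embedding; fold the (il+1)-th slice into
+ // the residual stream of the first n_deepstack_layers layers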
+ if (il < (int) n_deepstack_layers) {
+ ggml_tensor * ds = ggml_view_2d(ctx0, res->t_inp_embd, n_embd, n_tokens, res->t_inp_embd->nb[1], (il + 1) * n_embd * sizeof(float));
+ cur = ggml_add(ctx0, cur, ds);
+ cb(cur, "deepstack_out", il);
+ }
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
diff --git a/llama.cpp/src/models/qwen3vl.cpp b/llama.cpp/src/models/qwen3vl.cpp
new file mode 100644
index 0000000..0f8315b
--- /dev/null
+++ b/llama.cpp/src/models/qwen3vl.cpp
@@ -0,0 +1,132 @@
+#include "models.h"
+
+llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const size_t n_deepstack_layers = hparams.n_deepstack_layers;
+
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ int sections[4];
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_multi(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
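+ // deepstack: fold the (il+1)-th auxiliary n_embd-wide slice of the
+ // multimodal input embedding into the early layers' residual stream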
+ if (il < (int) n_deepstack_layers) {
+ ggml_tensor * ds = ggml_view_2d(ctx0, res->t_inp_embd, n_embd, n_tokens, res->t_inp_embd->nb[1], (il + 1) * n_embd * sizeof(float));
+ cur = ggml_add(ctx0, cur, ds);
+ cb(cur, "deepstack_out", il);
+ }
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/refact.cpp b/llama.cpp/src/models/refact.cpp
new file mode 100644
index 0000000..ff5eb28
--- /dev/null
+++ b/llama.cpp/src/models/refact.cpp
@@ -0,0 +1,94 @@
+#include "models.h"
+
+llm_build_refact::llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/rnd1.cpp b/llama.cpp/src/models/rnd1.cpp
new file mode 100644
index 0000000..46b3dc3
--- /dev/null
+++ b/llama.cpp/src/models/rnd1.cpp
@@ -0,0 +1,126 @@
+#include "models.h"
+
+// RND1 is a Qwen3Moe AR model converted to a diffusion model.
+llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // Non-causal attention for diffusion
+ auto * inp_attn = build_attn_inp_no_cache();
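+ // (diffusion decoding denoises all positions jointly, so every token attends
+ // bidirectionally and no KV cache is needed)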
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+ cur = moe_out;
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/rwkv6-base.cpp b/llama.cpp/src/models/rwkv6-base.cpp
new file mode 100644
index 0000000..7beed2d
--- /dev/null
+++ b/llama.cpp/src/models/rwkv6-base.cpp
@@ -0,0 +1,165 @@
+#include "models.h"
+
+llm_build_rwkv6_base::llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params),
+ model(model) {}
+
+ggml_tensor * llm_build_rwkv6_base::build_rwkv6_channel_mix(const llama_layer * layer,
+ ggml_tensor * cur,
+ ggml_tensor * x_prev,
+ llm_arch arch) const {
+ ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
+ switch (arch) {
+ case LLM_ARCH_RWKV6:
+ {
+ ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
+ ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur);
+
+ ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr));
+ ggml_tensor * k = ggml_sqr(ctx0, ggml_relu(ctx0, build_lora_mm(layer->channel_mix_key, xk)));
+ cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k));
+ }
+ break;
+ default:
+ GGML_ABORT("fatal error");
+ }
+ return cur;
+}
+
+ggml_tensor * llm_build_rwkv6_base::build_rwkv6_time_mix(llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ ggml_tensor * x_prev,
+ const llama_ubatch & ubatch,
+ int il) const {
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
+
+ const auto n_tokens = ubatch.n_tokens;
+ const auto n_seqs = ubatch.n_seqs;
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
+ const auto n_embd = hparams.n_embd;
+ const auto head_size = hparams.wkv_head_size;
+ const auto n_head = n_embd / head_size;
+ const auto n_head_kv = hparams.n_head_kv(il);
+
+ const auto kv_head = mctx_cur->get_head();
+
+ const auto & layer = model.layers[il];
+
+ bool is_qrwkv = layer.time_mix_first == nullptr;
+
+ ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
+
+ sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens);
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+
+ ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur);
+
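+    // data-dependent token-shift: a low-rank (w1/w2) projection of the shifted
+    // input produces one lerp delta per mixed stream (w, k, v, r, g)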
+ xxx = ggml_reshape_4d(ctx0, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_w1, xxx)),
+ layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens);
+
+ xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2));
+
+ xxx = ggml_mul_mat(
+ ctx0, ggml_reshape_4d(ctx0, layer.time_mix_w2, layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5), xxx);
+
+ ggml_tensor *xw, *xk, *xv, *xr, *xg;
+ if (layer.time_mix_lerp_fused) {
+        // fusing these lerp weights into a single tensor gives a small performance improvement
+ sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens);
+ cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
+ xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer.time_mix_lerp_fused), sx), cur);
+ xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
+ xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
+ xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
+ xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
+ xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
+ } else {
+ // for backward compatibility
+ xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
+ xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
+ xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
+ xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
+ xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
+
+ xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer.time_mix_lerp_w), sx), cur);
+ xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer.time_mix_lerp_k), sx), cur);
+ xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer.time_mix_lerp_v), sx), cur);
+ xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer.time_mix_lerp_r), sx), cur);
+ xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer.time_mix_lerp_g), sx), cur);
+ }
+ ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
+ ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
+ ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
+ if (layer.time_mix_receptance_b) {
+ r = ggml_add(ctx0, r, layer.time_mix_receptance_b);
+ }
+ if (layer.time_mix_key_b) {
+ k = ggml_add(ctx0, k, layer.time_mix_key_b);
+ }
+ if (layer.time_mix_value_b) {
+ v = ggml_add(ctx0, v, layer.time_mix_value_b);
+ }
+ ggml_tensor * g = build_lora_mm(layer.time_mix_gate, xg);
+ if (is_qrwkv) {
+ g = ggml_sigmoid(ctx0, g);
+ } else {
+ g = ggml_silu(ctx0, g);
+ }
+ if (n_head_kv != 0 && n_head_kv != n_head) {
+ GGML_ASSERT(n_head % n_head_kv == 0);
+ k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens);
+ v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens);
+ ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens);
+ k = ggml_repeat(ctx0, k, tmp);
+ v = ggml_repeat(ctx0, v, tmp);
+ }
+ k = ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens);
+ v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens);
+ r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens);
+
+ ggml_tensor * w =
+ ggml_mul_mat(ctx0, layer.time_mix_decay_w2, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_decay_w1, xw)));
+
+ w = ggml_add(ctx0, w, layer.time_mix_decay);
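+    // RWKV6 decay: w = exp(-exp(w)), which keeps each channel's decay in (0, 1)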
+ w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w)));
+ w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens);
+
+ if (is_qrwkv) {
+ // k = k * (1 - w)
+ k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
+ }
+ ggml_tensor * wkv_state = build_rs(inp, mctx_cur->get_s_l(il), hparams.n_embd_s(), n_seqs);
+
+ ggml_tensor * wkv_output;
+ if (is_qrwkv) {
+ wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f));
+ } else {
+ wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer.time_mix_first, w, wkv_state);
+ }
+ cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
+ wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
+
+ ggml_build_forward_expand(
+ gf, ggml_cpy(ctx0, wkv_state,
+ ggml_view_1d(ctx0, mctx_cur->get_s_l(il), hparams.n_embd_s() * n_seqs,
+ hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il)))));
+
+ if (!is_qrwkv) {
+ // group norm with head_count groups
+ cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens);
+ cur = ggml_norm(ctx0, cur, 64e-5f);
+
+ // Convert back to regular vectors.
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
+ } else {
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ }
+ cur = ggml_mul(ctx0, cur, g);
+ cur = build_lora_mm(layer.time_mix_output, cur);
+
+ return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
+}
diff --git a/llama.cpp/src/models/rwkv6.cpp b/llama.cpp/src/models/rwkv6.cpp
new file mode 100644
index 0000000..15453fb
--- /dev/null
+++ b/llama.cpp/src/models/rwkv6.cpp
@@ -0,0 +1,97 @@
+#include "models.h"
+
+llm_build_rwkv6::llm_build_rwkv6(const llama_model & model, const llm_graph_params & params) :
+ llm_build_rwkv6_base(model, params) {
+ GGML_ASSERT(hparams.token_shift_count == 2);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+ inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
+
+ auto * rs_inp = build_rs_inp();
+
+ const auto n_embd = hparams.n_embd;
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
+ const auto n_seqs = ubatch.n_seqs;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const llama_layer * layer = &model.layers[il];
+ inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
+
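+        // token_shift holds two states per sequence: the last token seen by the
+        // time-mix (attention) branch and by the channel-mix (FFN) branch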
+ ggml_tensor * att_shift =
+ ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
+ ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1],
+ token_shift->nb[2], n_embd * ggml_element_size(token_shift));
+
+ ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
+ cb(att_norm, "attn_norm", il);
+
+ ggml_tensor * x_prev = ggml_concat(
+ ctx0, att_shift,
+ ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), 1);
+
+ cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
+ cb(ffn_norm, "ffn_norm", il);
+
+ x_prev = ggml_concat(
+ ctx0, ffn_shift,
+ ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), 1);
+
+ token_shift = ggml_concat(ctx0,
+ ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2],
+ (n_seq_tokens - 1) * n_embd * ggml_element_size(att_norm)),
+ ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2],
+ (n_seq_tokens - 1) * n_embd * ggml_element_size(ffn_norm)),
+ 1);
+ ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
+
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+ ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
+ x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+ ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
+ x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ }
+ cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
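+        // periodically halve activations (RWKV's rescale trick, originally to keep fp16 in range)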
+ if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
+ cur = ggml_scale(ctx0, cur, 0.5F);
+ }
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+ cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/rwkv6qwen2.cpp b/llama.cpp/src/models/rwkv6qwen2.cpp
new file mode 100644
index 0000000..e84e597
--- /dev/null
+++ b/llama.cpp/src/models/rwkv6qwen2.cpp
@@ -0,0 +1,88 @@
+#include "models.h"
+
+llm_build_rwkv6qwen2::llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
+ GGML_ASSERT(n_embd == hparams.n_embd_r());
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ auto * rs_inp = build_rs_inp();
+
+ const auto n_embd = hparams.n_embd;
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
+ const auto n_seqs = ubatch.n_seqs;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const llama_layer * layer = &model.layers[il];
+ inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
+
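+        // a single shift state suffices here: the channel mix is replaced by a
+        // regular Qwen2-style gated MLP below, which needs no token shift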
+ ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
+ cb(att_norm, "attn_norm", il);
+
+ ggml_tensor * x_prev = ggml_concat(
+ ctx0,
+ token_shift,
+ ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
+ 1
+ );
+
+ cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
+
+ token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
+ ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+ }
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+ cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/rwkv7-base.cpp b/llama.cpp/src/models/rwkv7-base.cpp
new file mode 100644
index 0000000..cda4465
--- /dev/null
+++ b/llama.cpp/src/models/rwkv7-base.cpp
@@ -0,0 +1,137 @@
+#include "models.h"
+
+llm_build_rwkv7_base::llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params),
+ model(model) {}
+
+ggml_tensor * llm_build_rwkv7_base::build_rwkv7_channel_mix(const llama_layer * layer,
+ ggml_tensor * cur,
+ ggml_tensor * x_prev,
+ llm_arch arch) const {
+ ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
+ switch (arch) {
+ case LLM_ARCH_RWKV7:
+ {
+ ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
+
+ ggml_tensor * k = ggml_sqr(ctx0, ggml_relu(ctx0, build_lora_mm(layer->channel_mix_key, xk)));
+
+ cur = build_lora_mm(layer->channel_mix_value, k);
+ }
+ break;
+ default:
+ GGML_ABORT("fatal error");
+ }
+ return cur;
+}
+
+ggml_tensor * llm_build_rwkv7_base::build_rwkv7_time_mix(llm_graph_input_rs * inp,
+ ggml_tensor * cur,
+ ggml_tensor * x_prev,
+ ggml_tensor *& first_layer_value,
+ const llama_ubatch & ubatch,
+ int il) const {
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
+
+ const auto n_tokens = ubatch.n_tokens;
+ const auto n_seqs = ubatch.n_seqs;
+ const auto n_embd = hparams.n_embd;
+ const auto head_size = hparams.wkv_head_size;
+ const auto head_count = n_embd / head_size;
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
+
+ const auto kv_head = mctx_cur->get_head();
+
+ const auto & layer = model.layers[il];
+
+ bool has_gating = layer.time_mix_g1 && layer.time_mix_g2;
+
+ ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
+ ggml_tensor * dummy = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs, has_gating ? 6 : 5);
+ sx = ggml_repeat(ctx0, sx, dummy);
+
+ ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_fused), cur);
+
+ ggml_tensor * xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
+ ggml_tensor * xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
+ ggml_tensor * xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
+ ggml_tensor * xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
+ ggml_tensor * xa = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
+ ggml_tensor * xg =
+ has_gating ? ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 5 * sizeof(float)) :
+ nullptr;
+
+ ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
+ ggml_tensor * w = ggml_add(
+ ctx0, ggml_mul_mat(ctx0, layer.time_mix_w2, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_w1, xw))),
+ layer.time_mix_w0);
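+    // 0.606531 = exp(-0.5), so w = exp(-exp(-0.5) * sigmoid(x)) lies in roughly (0.545, 1)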
+ w = ggml_exp(ctx0, ggml_scale(ctx0, ggml_sigmoid(ctx0, w), -0.606531));
+
+ ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
+ ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
+ if (first_layer_value == nullptr) {
+ first_layer_value = v;
+ } else {
+ // Add the first layer value as a residual connection.
+ v = ggml_add(ctx0, v,
+ ggml_mul(ctx0, ggml_sub(ctx0, first_layer_value, v),
+ ggml_sigmoid(ctx0, ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.time_mix_v2,
+ ggml_mul_mat(ctx0, layer.time_mix_v1, xv)),
+ layer.time_mix_v0))));
+ }
+ ggml_tensor * g = nullptr;
+ if (layer.time_mix_g1 && layer.time_mix_g2) {
+ g = ggml_mul_mat(ctx0, layer.time_mix_g2, ggml_sigmoid(ctx0, ggml_mul_mat(ctx0, layer.time_mix_g1, xg)));
+ }
+ ggml_tensor * a = ggml_sigmoid(
+ ctx0, ggml_add(ctx0, ggml_mul_mat(ctx0, layer.time_mix_a2, ggml_mul_mat(ctx0, layer.time_mix_a1, xa)),
+ layer.time_mix_a0));
+
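+    // kk: per-head L2-normalized removal key used by the delta-rule style wkv7 state update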
+ ggml_tensor * kk = ggml_reshape_3d(ctx0, ggml_mul(ctx0, k, layer.time_mix_k_k), head_size, head_count, n_tokens);
+ kk = ggml_l2_norm(ctx0, kk, 1e-12);
+
+ ggml_tensor * ka = ggml_mul(ctx0, k, layer.time_mix_k_a);
+ k = ggml_add(ctx0, k, ggml_sub(ctx0, ggml_mul(ctx0, a, ka), ka));
+
+ r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens);
+ w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens);
+ k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens);
+ v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
+ a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
+
+ ggml_tensor * wkv_state = build_rs(inp, mctx_cur->get_s_l(il), hparams.n_embd_s(), n_seqs);
+
+ ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
+ cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
+ wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
+
+ ggml_build_forward_expand(
+ gf, ggml_cpy(ctx0, wkv_state,
+ ggml_view_1d(ctx0, mctx_cur->get_s_l(il), hparams.n_embd_s() * n_seqs,
+ hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il)))));
+
+ if (layer.time_mix_ln && layer.time_mix_ln_b) {
+ // group norm with head_count groups
+ cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens);
+ cur = ggml_norm(ctx0, cur, 64e-5f);
+
+ // Convert back to regular vectors.
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
+ } else {
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ }
+ ggml_tensor * rk = ggml_sum_rows(
+ ctx0, ggml_mul(ctx0, ggml_mul(ctx0, k, r), ggml_reshape_2d(ctx0, layer.time_mix_r_k, head_size, head_count)));
+ cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, ggml_mul(ctx0, v, rk), n_embd, n_tokens));
+
+ if (has_gating) {
+ cur = ggml_mul(ctx0, cur, g);
+ }
+ cur = build_lora_mm(layer.time_mix_output, cur);
+
+ return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
+}
diff --git a/llama.cpp/src/models/rwkv7.cpp b/llama.cpp/src/models/rwkv7.cpp
new file mode 100644
index 0000000..5caf655
--- /dev/null
+++ b/llama.cpp/src/models/rwkv7.cpp
@@ -0,0 +1,92 @@
+#include "models.h"
+
+llm_build_rwkv7::llm_build_rwkv7(const llama_model & model, const llm_graph_params & params) :
+ llm_build_rwkv7_base(model, params) {
+ GGML_ASSERT(hparams.token_shift_count == 2);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+ ggml_tensor * v_first = nullptr;
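+    // v_first carries the first layer's value activations; deeper layers blend
+    // them back in as a learned residual (see build_rwkv7_time_mix)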
+
+ inpL = build_inp_embd(model.tok_embd);
+ inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
+
+ auto * rs_inp = build_rs_inp();
+
+ const auto n_embd = hparams.n_embd;
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
+ const auto n_seqs = ubatch.n_seqs;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const llama_layer * layer = &model.layers[il];
+ inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
+
+ ggml_tensor * att_shift =
+ ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
+ ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1],
+ token_shift->nb[2], n_embd * ggml_element_size(token_shift));
+
+ ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
+ cb(att_norm, "attn_norm", il);
+
+ ggml_tensor * x_prev = ggml_concat(
+ ctx0, att_shift,
+ ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), 1);
+
+ cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
+ cb(ffn_norm, "ffn_norm", il);
+
+ x_prev = ggml_concat(
+ ctx0, ffn_shift,
+ ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), 1);
+
+ token_shift = ggml_concat(ctx0,
+ ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2],
+ (n_seq_tokens - 1) * n_embd * ggml_element_size(att_norm)),
+ ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2],
+ (n_seq_tokens - 1) * n_embd * ggml_element_size(ffn_norm)),
+ 1);
+ ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
+
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+ ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
+ x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+ ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
+ x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
+ }
+ cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+ cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/seed-oss.cpp b/llama.cpp/src/models/seed-oss.cpp
new file mode 100644
index 0000000..0dc33c5
--- /dev/null
+++ b/llama.cpp/src/models/seed-oss.cpp
@@ -0,0 +1,125 @@
+#include "models.h"
+
+llm_build_seed_oss::llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
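+    // f_attention_scale == 0 means "unset": fall back to the standard 1/sqrt(head_dim)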
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/smallthinker.cpp b/llama.cpp/src/models/smallthinker.cpp
new file mode 100644
index 0000000..4c497ca
--- /dev/null
+++ b/llama.cpp/src/models/smallthinker.cpp
@@ -0,0 +1,128 @@
+#include "models.h"
+
+template <bool iswa>
+llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+ inp_attn_type * inp_attn = nullptr;
+
+ if constexpr (iswa) {
+ inp_attn = build_attn_inp_kv_iswa();
+ } else {
+ inp_attn = build_attn_inp_kv();
+ }
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ ggml_tensor * inpSA = inpL;
+
+        // in current models the no-RoPE layers coincide with the SWA layers, so get_rope_freq_base/scale may be superfluous
+ const bool use_rope = hparams.n_no_rope_layer_step == n_layer ||
+ il % hparams.n_no_rope_layer_step != 0;
+
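+        // the router runs on the raw layer input (before attn_norm); its logits are
+        // handed to build_moe_ffn below in place of a gate tensor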
+ ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
+ cb(probs, "ffn_moe_logits", il);
+
+ // norm
+        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+        // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ if (use_rope) {
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ }
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ probs = ggml_get_rows(ctx0, probs, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * ffn_out =
+ build_moe_ffn(cur,
+ nullptr,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_RELU, true,
+ false, 0.0,
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
+ il, probs);
+
+ cb(ffn_out, "ffn_out", il);
+ cur = ffn_out;
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+// Explicit template instantiations
+template struct llm_build_smallthinker<false>;
+template struct llm_build_smallthinker<true>;
diff --git a/llama.cpp/src/models/smollm3.cpp b/llama.cpp/src/models/smollm3.cpp
new file mode 100644
index 0000000..97c30de
--- /dev/null
+++ b/llama.cpp/src/models/smollm3.cpp
@@ -0,0 +1,129 @@
+#include "models.h"
+
+llm_build_smollm3::llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
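+        // SmolLM3 interleaves NoPE layers: every n_no_rope_layer_step-th layer skips RoPE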
+ const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ if (use_rope) {
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ }
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/stablelm.cpp b/llama.cpp/src/models/stablelm.cpp
new file mode 100644
index 0000000..bed1915
--- /dev/null
+++ b/llama.cpp/src/models/stablelm.cpp
@@ -0,0 +1,148 @@
+#include "models.h"
+
+llm_build_stablelm::llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ ggml_tensor * inpSA = cur;
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ if (model.layers[il].attn_q_norm) {
+ Qcur = build_norm(Qcur,
+ model.layers[il].attn_q_norm,
+ NULL,
+ LLM_NORM, il);
+ cb(Qcur, "Qcur", il);
+ }
+ if (model.layers[il].attn_k_norm) {
+ Kcur = build_norm(Kcur,
+ model.layers[il].attn_k_norm,
+ NULL,
+ LLM_NORM, il);
+ cb(Kcur, "Kcur", il);
+ }
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ if (model.layers[il].ffn_norm) {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+ } else {
+ // parallel residual
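+                // no ffn_norm: feed the attention-normed input to the FFN as well,
+                // so attention and FFN act in parallel on the same normed activations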
+ cur = inpSA;
+ }
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/starcoder.cpp b/llama.cpp/src/models/starcoder.cpp
new file mode 100644
index 0000000..e197af4
--- /dev/null
+++ b/llama.cpp/src/models/starcoder.cpp
@@ -0,0 +1,102 @@
+#include "models.h"
+
+llm_build_starcoder::llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+ cb(pos, "pos_embd", -1);
+
+ inpL = ggml_add(ctx0, inpL, pos);
+ cb(inpL, "inpL", -1);
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
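+            // split the fused QKV projection with strided views into each row:
+            // Q spans the first n_embd values, K and V span n_embd_gqa each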
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+ // add the input
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = build_norm(inpL,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/starcoder2.cpp b/llama.cpp/src/models/starcoder2.cpp
new file mode 100644
index 0000000..e40ef2c
--- /dev/null
+++ b/llama.cpp/src/models/starcoder2.cpp
@@ -0,0 +1,121 @@
+#include "models.h"
+
+llm_build_starcoder2::llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, model.output_norm_b,
+ LLM_NORM, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/step35-iswa.cpp b/llama.cpp/src/models/step35-iswa.cpp
new file mode 100644
index 0000000..f873781
--- /dev/null
+++ b/llama.cpp/src/models/step35-iswa.cpp
@@ -0,0 +1,170 @@
+#include "models.h"
+
+llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+ ggml_tensor * inp_pos = build_inp_pos();
+ auto * inp_attn = build_attn_inp_kv_iswa();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ const uint32_t n_head_l = hparams.n_head(il);
+ const uint32_t n_head_kv_l = hparams.n_head_kv(il);
+
+ const float freq_base_l = model.get_rope_freq_base(cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ cur = inpL;
+
+        // expose the pre-attention RMSNorm input via the debug callback to pinpoint layer-boundary issues
+ cb(cur, "attn_norm_in", il);
+
+ // self-attention
+ {
+ cur = build_norm(cur, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);
+
+ // Q/K per-head RMSNorm (Step35 q_norm / k_norm)
+ if (model.layers[il].attn_q_norm) {
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+ }
+ if (model.layers[il].attn_k_norm) {
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+ }
+
+ // RoPE (partial rotary factors per layer)
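+            // global (non-SWA) layers rotate only half of n_rot and apply the
+            // long-context rope factors; SWA layers rotate the full n_rot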
+ const bool is_swa = hparams.is_swa(il);
+ ggml_tensor * rope_factors = is_swa ? nullptr : model.get_rope_factors(cparams, il);
+ const int64_t n_rot_l = is_swa ? hparams.n_rot : (hparams.n_rot / 2);
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur_pos", il);
+ cb(Kcur, "Kcur_pos", il);
+
+ const float kq_scale = 1.0f / sqrtf(float(n_embd_head_k));
+ ggml_tensor * attn_out = build_attn(inp_attn,
+ nullptr, nullptr,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(attn_out, "attn_out", il);
+ // head-wise attention gate: sigmoid(g_proj(x)) in torch
+ if (model.layers[il].wqkv_gate) {
+ ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, cur); // [n_head_l, n_tokens]
+ cb(gate, "attn_gate", il);
+
+ gate = ggml_sigmoid(ctx0, gate);
+ cb(gate, "attn_gate_sigmoid", il);
+
+ // reshape + broadcast to [n_embd_head_v, n_head_l, n_tokens]
+ ggml_tensor * attn_3d = ggml_reshape_3d(ctx0, attn_out, n_embd_head_v, n_head_l, n_tokens);
+ ggml_tensor * gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
+ cb(gate_3d, "attn_gate_3d", il);
+
+ attn_3d = ggml_mul(ctx0, attn_3d, gate_3d);
+ cb(attn_3d, "attn_gated_3d", il);
+
+ attn_out = ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens);
+ cb(attn_out, "attn_gated", il);
+ }
+
+ // output projection
+ cur = build_lora_mm(model.layers[il].wo, attn_out);
+ cb(cur, "attn_proj", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+ // dense MLP
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr,
+ nullptr,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE routed experts
+ const bool norm_w = hparams.expert_weights_norm;
+ const float w_scale = hparams.expert_weights_scale;
+ const bool scale_w = w_scale != 0.0f;
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU,
+ norm_w, scale_w, w_scale,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // shared expert MLP (always added on MoE layers in Step35)
+ ggml_tensor * sh_out = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, nullptr, nullptr,
+ model.layers[il].ffn_gate_shexp, nullptr, nullptr,
+ model.layers[il].ffn_down_shexp, nullptr, nullptr,
+ nullptr,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(sh_out, "ffn_shared_out", il);
+
+ cur = ggml_add(ctx0, moe_out, sh_out);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/t5-dec.cpp b/llama.cpp/src/models/t5-dec.cpp
new file mode 100644
index 0000000..297e450
--- /dev/null
+++ b/llama.cpp/src/models/t5-dec.cpp
@@ -0,0 +1,169 @@
+#include "models.h"
+
+llm_build_t5_dec::llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ //const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ ggml_tensor * embd_enc = build_inp_cross_embd();
+ ggml_tensor * pos_bucket_dec = build_inp_pos_bucket_dec();
+
+ const int64_t n_outputs_enc = embd_enc->ne[1];
+
+ auto * inp_attn_self = build_attn_inp_kv();
+ auto * inp_attn_cross = build_attn_inp_cross();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ const int64_t dec_n_layer = hparams.dec_n_layer;
+
+ for (int il = 0; il < dec_n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
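+            // T5 relative position bias: layers without their own bias tensor reuse
+            // layer 0's; the bias is added to the raw attention scores (kq_b), which
+            // stay unscaled (scale = 1.0f) as in the reference T5 implementation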
+ ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
+ ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
+
+ cur = build_attn(inp_attn_self,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
+ cb(cur, "kqv_out", il);
+ }
+ cur = ggml_add(ctx0, cur, inpSA);
+ cb(cur, "cross_inp", il);
+
+ ggml_tensor * inpCA = cur;
+
+ // norm
+ cur = build_norm(cur,
+ model.layers[il].attn_norm_cross, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm_cross", il);
+
+ // cross-attention
+ {
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc);
+
+ cur = build_attn(inp_attn_cross,
+ model.layers[il].wo_cross, nullptr,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+ cb(cur, "kqv_out", il);
+
+ //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ //ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+
+ //ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+ //cb(kq, "kq", il);
+
+ //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
+ //cb(kq, "kq_soft_max_ext", il);
+
+ //ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
+ //cb(v, "v", il);
+
+ //ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
+ //cb(kqv, "kqv", il);
+
+ //ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ //cb(kqv_merged, "kqv_merged", il);
+
+ //cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+ //cb(cur, "kqv_merged_cont", il);
+
+ //ggml_build_forward_expand(gf, cur);
+
+ //cur = build_lora_mm(model.layers[il].wo_cross, cur);
+ //cb(cur, "kqv_out", il);
+ }
+ if (il == dec_n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+            // T5 uses ReLU; flan-T5 uses a gated GELU
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
+ model.layers[il].ffn_gate ? LLM_FFN_PAR : LLM_FFN_SEQ,
+ il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+ cb(cur, "result_embd", -1);
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
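+
+// For reference, a minimal sketch of how the kq_b bias built above reaches the
+// attention scores, assuming build_attn() adds it to the raw K*Q logits before
+// the softmax (kq_mask stands in for the decoder mask; the 1.0f scale matches
+// the call above, since T5 applies no 1/sqrt(d) scaling):
+//
+//   ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);            // raw logits
+//   kq = ggml_add(ctx0, kq, kq_b);                          // + relative position bias
+//   kq = ggml_soft_max_ext(ctx0, kq, kq_mask, 1.0f, 0.0f);  // no scaling, no ALiBi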
diff --git a/llama.cpp/src/models/t5-enc.cpp b/llama.cpp/src/models/t5-enc.cpp
new file mode 100644
index 0000000..70e1d80
--- /dev/null
+++ b/llama.cpp/src/models/t5-enc.cpp
@@ -0,0 +1,96 @@
+#include "models.h"
+
+llm_build_t5_enc::llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ ggml_tensor * pos_bucket_enc = build_inp_pos_bucket_enc();
+
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm_enc, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
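+            // as in the decoder, the encoder keeps its relative-position bias only in layer 0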
+ ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
+ ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo_enc, nullptr,
+ Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
+ cb(cur, "kqv_out", il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm_enc, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+            // T5 uses ReLU; flan-T5 uses a gated GELU
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up_enc, NULL, NULL,
+ model.layers[il].ffn_gate_enc, NULL, NULL,
+ model.layers[il].ffn_down_enc, NULL, NULL,
+ NULL,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+ il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+ cb(cur, "result_embd", -1);
+
+ cur = build_norm(cur,
+ model.output_norm_enc, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/models/wavtokenizer-dec.cpp b/llama.cpp/src/models/wavtokenizer-dec.cpp
new file mode 100644
index 0000000..537a0d4
--- /dev/null
+++ b/llama.cpp/src/models/wavtokenizer-dec.cpp
@@ -0,0 +1,149 @@
+#include "models.h"
+
+llm_build_wavtokenizer_dec::llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
+
+ cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
+ cur = ggml_add(ctx0, cur, model.conv1d_b);
+
+ // posnet
+ for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
+ const auto & layer = model.layers[il].posnet;
+
+ inpL = cur;
+
+ switch (il) {
+ case 0:
+ case 1:
+ case 3:
+ case 4:
+ {
+ cur = build_norm(cur,
+ layer.norm1,
+ layer.norm1_b,
+ LLM_NORM_GROUP, 0);
+
+ cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
+
+ cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
+ cur = ggml_add(ctx0, cur, layer.conv1_b);
+
+ cur = build_norm(cur,
+ layer.norm2,
+ layer.norm2_b,
+ LLM_NORM_GROUP, 0);
+
+ cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
+
+ cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
+ cur = ggml_add(ctx0, cur, layer.conv2_b);
+
+ cur = ggml_add(ctx0, cur, inpL);
+ } break;
+ case 2:
+ {
+ cur = build_norm(cur,
+ layer.attn_norm,
+ layer.attn_norm_b,
+ LLM_NORM_GROUP, 0);
+
+ ggml_tensor * q;
+ ggml_tensor * k;
+ ggml_tensor * v;
+
+ q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
+ k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
+ v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
+
+ q = ggml_add(ctx0, q, layer.attn_q_b);
+ k = ggml_add(ctx0, k, layer.attn_k_b);
+ v = ggml_add(ctx0, v, layer.attn_v_b);
+
+ q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
+ k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
+
+ ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+
+ kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
+
+ cur = ggml_mul_mat(ctx0, kq, v);
+
+ cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
+ cur = ggml_add(ctx0, cur, layer.attn_o_b);
+
+ cur = ggml_add(ctx0, cur, inpL);
+ } break;
+ case 5:
+ {
+ cur = build_norm(cur,
+ layer.norm,
+ layer.norm_b,
+ LLM_NORM_GROUP, 0);
+ } break;
+ default: GGML_ABORT("unknown posnet layer");
+        }
+ }
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+ cur = build_norm(cur,
+ model.tok_norm,
+ model.tok_norm_b,
+ LLM_NORM, -1);
+
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+ inpL = cur;
+
+ // convnext
+ for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
+ const auto & layer = model.layers[il].convnext;
+
+ cur = inpL;
+
+ cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
+ cur = ggml_add(ctx0, cur, layer.dw_b);
+
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+ cur = build_norm(cur,
+ layer.norm,
+ layer.norm_b,
+ LLM_NORM, -1);
+
+ cur = build_ffn(cur,
+ layer.pw1, layer.pw1_b, NULL,
+ NULL, NULL, NULL,
+ layer.pw2, layer.pw2_b, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
+
+ cur = ggml_mul(ctx0, cur, layer.gamma);
+
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+ inpL = ggml_add(ctx0, cur, inpL);
+ }
+ cur = inpL;
+
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+ cur = build_norm(cur,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, -1);
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cur = ggml_add(ctx0, cur, model.output_b);
+
+ cb(cur, "result_embd", -1);
+ res->t_embd = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
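+
+// The posnet residual blocks above repeat one unit: group norm, a hand-rolled
+// SiLU (x * sigmoid(x)), then a half-padded 1-D convolution plus bias. A hedged
+// sketch of that unit; silu_conv is an illustrative name, not a helper from
+// this file:
+//
+//   static ggml_tensor * silu_conv(ggml_context * ctx, ggml_tensor * w, ggml_tensor * b, ggml_tensor * x) {
+//       x = ggml_mul(ctx, ggml_sigmoid(ctx, x), x); // SiLU
+//       x = ggml_conv_1d_ph(ctx, w, x, 1, 1);       // stride 1, dilation 1, "ph" = half padding
+//       return ggml_add(ctx, x, b);                 // per-channel bias
+//   }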
diff --git a/llama.cpp/src/models/xverse.cpp b/llama.cpp/src/models/xverse.cpp
new file mode 100644
index 0000000..364797d
--- /dev/null
+++ b/llama.cpp/src/models/xverse.cpp
@@ -0,0 +1,108 @@
+#include "models.h"
+
+llm_build_xverse::llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
diff --git a/llama.cpp/src/unicode-data.cpp b/llama.cpp/src/unicode-data.cpp
new file mode 100644
index 0000000..04dcd7f
--- /dev/null
+++ b/llama.cpp/src/unicode-data.cpp
@@ -0,0 +1,7034 @@
+// generated with scripts/gen-unicode-data.py
+
+#include "unicode-data.h"
+
+#include <cstdint>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+
+const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags; each run ends at next_start - 1
+{0x000000, 0x0080},
+{0x000020, 0x0008},
+{0x000021, 0x0020},
+{0x000024, 0x0040},
+{0x000025, 0x0020},
+{0x00002B, 0x0040},
+{0x00002C, 0x0020},
+{0x000030, 0x0002},
+{0x00003A, 0x0020},
+{0x00003C, 0x0040},
+{0x00003F, 0x0020},
+{0x000041, 0x0004},
+{0x00005B, 0x0020},
+{0x00005E, 0x0040},
+{0x00005F, 0x0020},
+{0x000060, 0x0040},
+{0x000061, 0x0004},
+{0x00007B, 0x0020},
+{0x00007C, 0x0040},
+{0x00007D, 0x0020},
+{0x00007E, 0x0040},
+{0x00007F, 0x0080},
+{0x0000A0, 0x0008},
+{0x0000A1, 0x0020},
+{0x0000A2, 0x0040},
+{0x0000A7, 0x0020},
+{0x0000A8, 0x0040},
+{0x0000AA, 0x0004},
+{0x0000AB, 0x0020},
+{0x0000AC, 0x0040},
+{0x0000AD, 0x0080},
+{0x0000AE, 0x0040},
+{0x0000B2, 0x0002},
+{0x0000B4, 0x0040},
+{0x0000B5, 0x0004},
+{0x0000B6, 0x0020},
+{0x0000B8, 0x0040},
+{0x0000B9, 0x0002},
+{0x0000BA, 0x0004},
+{0x0000BB, 0x0020},
+{0x0000BC, 0x0002},
+{0x0000BF, 0x0020},
+{0x0000C0, 0x0004},
+{0x0000D7, 0x0040},
+{0x0000D8, 0x0004},
+{0x0000F7, 0x0040},
+{0x0000F8, 0x0004},
+{0x0002C2, 0x0040},
+{0x0002C6, 0x0004},
+{0x0002D2, 0x0040},
+{0x0002E0, 0x0004},
+{0x0002E5, 0x0040},
+{0x0002EC, 0x0004},
+{0x0002ED, 0x0040},
+{0x0002EE, 0x0004},
+{0x0002EF, 0x0040},
+{0x000300, 0x0010},
+{0x000370, 0x0004},
+{0x000375, 0x0040},
+{0x000376, 0x0004},
+{0x000378, 0x0001},
+{0x00037A, 0x0004},
+{0x00037E, 0x0020},
+{0x00037F, 0x0004},
+{0x000380, 0x0001},
+{0x000384, 0x0040},
+{0x000386, 0x0004},
+{0x000387, 0x0020},
+{0x000388, 0x0004},
+{0x00038B, 0x0001},
+{0x00038C, 0x0004},
+{0x00038D, 0x0001},
+{0x00038E, 0x0004},
+{0x0003A2, 0x0001},
+{0x0003A3, 0x0004},
+{0x0003F6, 0x0040},
+{0x0003F7, 0x0004},
+{0x000482, 0x0040},
+{0x000483, 0x0010},
+{0x00048A, 0x0004},
+{0x000530, 0x0001},
+{0x000531, 0x0004},
+{0x000557, 0x0001},
+{0x000559, 0x0004},
+{0x00055A, 0x0020},
+{0x000560, 0x0004},
+{0x000589, 0x0020},
+{0x00058B, 0x0001},
+{0x00058D, 0x0040},
+{0x000590, 0x0001},
+{0x000591, 0x0010},
+{0x0005BE, 0x0020},
+{0x0005BF, 0x0010},
+{0x0005C0, 0x0020},
+{0x0005C1, 0x0010},
+{0x0005C3, 0x0020},
+{0x0005C4, 0x0010},
+{0x0005C6, 0x0020},
+{0x0005C7, 0x0010},
+{0x0005C8, 0x0001},
+{0x0005D0, 0x0004},
+{0x0005EB, 0x0001},
+{0x0005EF, 0x0004},
+{0x0005F3, 0x0020},
+{0x0005F5, 0x0001},
+{0x000600, 0x0080},
+{0x000606, 0x0040},
+{0x000609, 0x0020},
+{0x00060B, 0x0040},
+{0x00060C, 0x0020},
+{0x00060E, 0x0040},
+{0x000610, 0x0010},
+{0x00061B, 0x0020},
+{0x00061C, 0x0080},
+{0x00061D, 0x0020},
+{0x000620, 0x0004},
+{0x00064B, 0x0010},
+{0x000660, 0x0002},
+{0x00066A, 0x0020},
+{0x00066E, 0x0004},
+{0x000670, 0x0010},
+{0x000671, 0x0004},
+{0x0006D4, 0x0020},
+{0x0006D5, 0x0004},
+{0x0006D6, 0x0010},
+{0x0006DD, 0x0080},
+{0x0006DE, 0x0040},
+{0x0006DF, 0x0010},
+{0x0006E5, 0x0004},
+{0x0006E7, 0x0010},
+{0x0006E9, 0x0040},
+{0x0006EA, 0x0010},
+{0x0006EE, 0x0004},
+{0x0006F0, 0x0002},
+{0x0006FA, 0x0004},
+{0x0006FD, 0x0040},
+{0x0006FF, 0x0004},
+{0x000700, 0x0020},
+{0x00070E, 0x0001},
+{0x00070F, 0x0080},
+{0x000710, 0x0004},
+{0x000711, 0x0010},
+{0x000712, 0x0004},
+{0x000730, 0x0010},
+{0x00074B, 0x0001},
+{0x00074D, 0x0004},
+{0x0007A6, 0x0010},
+{0x0007B1, 0x0004},
+{0x0007B2, 0x0001},
+{0x0007C0, 0x0002},
+{0x0007CA, 0x0004},
+{0x0007EB, 0x0010},
+{0x0007F4, 0x0004},
+{0x0007F6, 0x0040},
+{0x0007F7, 0x0020},
+{0x0007FA, 0x0004},
+{0x0007FB, 0x0001},
+{0x0007FD, 0x0010},
+{0x0007FE, 0x0040},
+{0x000800, 0x0004},
+{0x000816, 0x0010},
+{0x00081A, 0x0004},
+{0x00081B, 0x0010},
+{0x000824, 0x0004},
+{0x000825, 0x0010},
+{0x000828, 0x0004},
+{0x000829, 0x0010},
+{0x00082E, 0x0001},
+{0x000830, 0x0020},
+{0x00083F, 0x0001},
+{0x000840, 0x0004},
+{0x000859, 0x0010},
+{0x00085C, 0x0001},
+{0x00085E, 0x0020},
+{0x00085F, 0x0001},
+{0x000860, 0x0004},
+{0x00086B, 0x0001},
+{0x000870, 0x0004},
+{0x000888, 0x0040},
+{0x000889, 0x0004},
+{0x00088F, 0x0001},
+{0x000890, 0x0080},
+{0x000892, 0x0001},
+{0x000898, 0x0010},
+{0x0008A0, 0x0004},
+{0x0008CA, 0x0010},
+{0x0008E2, 0x0080},
+{0x0008E3, 0x0010},
+{0x000904, 0x0004},
+{0x00093A, 0x0010},
+{0x00093D, 0x0004},
+{0x00093E, 0x0010},
+{0x000950, 0x0004},
+{0x000951, 0x0010},
+{0x000958, 0x0004},
+{0x000962, 0x0010},
+{0x000964, 0x0020},
+{0x000966, 0x0002},
+{0x000970, 0x0020},
+{0x000971, 0x0004},
+{0x000981, 0x0010},
+{0x000984, 0x0001},
+{0x000985, 0x0004},
+{0x00098D, 0x0001},
+{0x00098F, 0x0004},
+{0x000991, 0x0001},
+{0x000993, 0x0004},
+{0x0009A9, 0x0001},
+{0x0009AA, 0x0004},
+{0x0009B1, 0x0001},
+{0x0009B2, 0x0004},
+{0x0009B3, 0x0001},
+{0x0009B6, 0x0004},
+{0x0009BA, 0x0001},
+{0x0009BC, 0x0010},
+{0x0009BD, 0x0004},
+{0x0009BE, 0x0010},
+{0x0009C5, 0x0001},
+{0x0009C7, 0x0010},
+{0x0009C9, 0x0001},
+{0x0009CB, 0x0010},
+{0x0009CE, 0x0004},
+{0x0009CF, 0x0001},
+{0x0009D7, 0x0010},
+{0x0009D8, 0x0001},
+{0x0009DC, 0x0004},
+{0x0009DE, 0x0001},
+{0x0009DF, 0x0004},
+{0x0009E2, 0x0010},
+{0x0009E4, 0x0001},
+{0x0009E6, 0x0002},
+{0x0009F0, 0x0004},
+{0x0009F2, 0x0040},
+{0x0009F4, 0x0002},
+{0x0009FA, 0x0040},
+{0x0009FC, 0x0004},
+{0x0009FD, 0x0020},
+{0x0009FE, 0x0010},
+{0x0009FF, 0x0001},
+{0x000A01, 0x0010},
+{0x000A04, 0x0001},
+{0x000A05, 0x0004},
+{0x000A0B, 0x0001},
+{0x000A0F, 0x0004},
+{0x000A11, 0x0001},
+{0x000A13, 0x0004},
+{0x000A29, 0x0001},
+{0x000A2A, 0x0004},
+{0x000A31, 0x0001},
+{0x000A32, 0x0004},
+{0x000A34, 0x0001},
+{0x000A35, 0x0004},
+{0x000A37, 0x0001},
+{0x000A38, 0x0004},
+{0x000A3A, 0x0001},
+{0x000A3C, 0x0010},
+{0x000A3D, 0x0001},
+{0x000A3E, 0x0010},
+{0x000A43, 0x0001},
+{0x000A47, 0x0010},
+{0x000A49, 0x0001},
+{0x000A4B, 0x0010},
+{0x000A4E, 0x0001},
+{0x000A51, 0x0010},
+{0x000A52, 0x0001},
+{0x000A59, 0x0004},
+{0x000A5D, 0x0001},
+{0x000A5E, 0x0004},
+{0x000A5F, 0x0001},
+{0x000A66, 0x0002},
+{0x000A70, 0x0010},
+{0x000A72, 0x0004},
+{0x000A75, 0x0010},
+{0x000A76, 0x0020},
+{0x000A77, 0x0001},
+{0x000A81, 0x0010},
+{0x000A84, 0x0001},
+{0x000A85, 0x0004},
+{0x000A8E, 0x0001},
+{0x000A8F, 0x0004},
+{0x000A92, 0x0001},
+{0x000A93, 0x0004},
+{0x000AA9, 0x0001},
+{0x000AAA, 0x0004},
+{0x000AB1, 0x0001},
+{0x000AB2, 0x0004},
+{0x000AB4, 0x0001},
+{0x000AB5, 0x0004},
+{0x000ABA, 0x0001},
+{0x000ABC, 0x0010},
+{0x000ABD, 0x0004},
+{0x000ABE, 0x0010},
+{0x000AC6, 0x0001},
+{0x000AC7, 0x0010},
+{0x000ACA, 0x0001},
+{0x000ACB, 0x0010},
+{0x000ACE, 0x0001},
+{0x000AD0, 0x0004},
+{0x000AD1, 0x0001},
+{0x000AE0, 0x0004},
+{0x000AE2, 0x0010},
+{0x000AE4, 0x0001},
+{0x000AE6, 0x0002},
+{0x000AF0, 0x0020},
+{0x000AF1, 0x0040},
+{0x000AF2, 0x0001},
+{0x000AF9, 0x0004},
+{0x000AFA, 0x0010},
+{0x000B00, 0x0001},
+{0x000B01, 0x0010},
+{0x000B04, 0x0001},
+{0x000B05, 0x0004},
+{0x000B0D, 0x0001},
+{0x000B0F, 0x0004},
+{0x000B11, 0x0001},
+{0x000B13, 0x0004},
+{0x000B29, 0x0001},
+{0x000B2A, 0x0004},
+{0x000B31, 0x0001},
+{0x000B32, 0x0004},
+{0x000B34, 0x0001},
+{0x000B35, 0x0004},
+{0x000B3A, 0x0001},
+{0x000B3C, 0x0010},
+{0x000B3D, 0x0004},
+{0x000B3E, 0x0010},
+{0x000B45, 0x0001},
+{0x000B47, 0x0010},
+{0x000B49, 0x0001},
+{0x000B4B, 0x0010},
+{0x000B4E, 0x0001},
+{0x000B55, 0x0010},
+{0x000B58, 0x0001},
+{0x000B5C, 0x0004},
+{0x000B5E, 0x0001},
+{0x000B5F, 0x0004},
+{0x000B62, 0x0010},
+{0x000B64, 0x0001},
+{0x000B66, 0x0002},
+{0x000B70, 0x0040},
+{0x000B71, 0x0004},
+{0x000B72, 0x0002},
+{0x000B78, 0x0001},
+{0x000B82, 0x0010},
+{0x000B83, 0x0004},
+{0x000B84, 0x0001},
+{0x000B85, 0x0004},
+{0x000B8B, 0x0001},
+{0x000B8E, 0x0004},
+{0x000B91, 0x0001},
+{0x000B92, 0x0004},
+{0x000B96, 0x0001},
+{0x000B99, 0x0004},
+{0x000B9B, 0x0001},
+{0x000B9C, 0x0004},
+{0x000B9D, 0x0001},
+{0x000B9E, 0x0004},
+{0x000BA0, 0x0001},
+{0x000BA3, 0x0004},
+{0x000BA5, 0x0001},
+{0x000BA8, 0x0004},
+{0x000BAB, 0x0001},
+{0x000BAE, 0x0004},
+{0x000BBA, 0x0001},
+{0x000BBE, 0x0010},
+{0x000BC3, 0x0001},
+{0x000BC6, 0x0010},
+{0x000BC9, 0x0001},
+{0x000BCA, 0x0010},
+{0x000BCE, 0x0001},
+{0x000BD0, 0x0004},
+{0x000BD1, 0x0001},
+{0x000BD7, 0x0010},
+{0x000BD8, 0x0001},
+{0x000BE6, 0x0002},
+{0x000BF3, 0x0040},
+{0x000BFB, 0x0001},
+{0x000C00, 0x0010},
+{0x000C05, 0x0004},
+{0x000C0D, 0x0001},
+{0x000C0E, 0x0004},
+{0x000C11, 0x0001},
+{0x000C12, 0x0004},
+{0x000C29, 0x0001},
+{0x000C2A, 0x0004},
+{0x000C3A, 0x0001},
+{0x000C3C, 0x0010},
+{0x000C3D, 0x0004},
+{0x000C3E, 0x0010},
+{0x000C45, 0x0001},
+{0x000C46, 0x0010},
+{0x000C49, 0x0001},
+{0x000C4A, 0x0010},
+{0x000C4E, 0x0001},
+{0x000C55, 0x0010},
+{0x000C57, 0x0001},
+{0x000C58, 0x0004},
+{0x000C5B, 0x0001},
+{0x000C5D, 0x0004},
+{0x000C5E, 0x0001},
+{0x000C60, 0x0004},
+{0x000C62, 0x0010},
+{0x000C64, 0x0001},
+{0x000C66, 0x0002},
+{0x000C70, 0x0001},
+{0x000C77, 0x0020},
+{0x000C78, 0x0002},
+{0x000C7F, 0x0040},
+{0x000C80, 0x0004},
+{0x000C81, 0x0010},
+{0x000C84, 0x0020},
+{0x000C85, 0x0004},
+{0x000C8D, 0x0001},
+{0x000C8E, 0x0004},
+{0x000C91, 0x0001},
+{0x000C92, 0x0004},
+{0x000CA9, 0x0001},
+{0x000CAA, 0x0004},
+{0x000CB4, 0x0001},
+{0x000CB5, 0x0004},
+{0x000CBA, 0x0001},
+{0x000CBC, 0x0010},
+{0x000CBD, 0x0004},
+{0x000CBE, 0x0010},
+{0x000CC5, 0x0001},
+{0x000CC6, 0x0010},
+{0x000CC9, 0x0001},
+{0x000CCA, 0x0010},
+{0x000CCE, 0x0001},
+{0x000CD5, 0x0010},
+{0x000CD7, 0x0001},
+{0x000CDD, 0x0004},
+{0x000CDF, 0x0001},
+{0x000CE0, 0x0004},
+{0x000CE2, 0x0010},
+{0x000CE4, 0x0001},
+{0x000CE6, 0x0002},
+{0x000CF0, 0x0001},
+{0x000CF1, 0x0004},
+{0x000CF3, 0x0010},
+{0x000CF4, 0x0001},
+{0x000D00, 0x0010},
+{0x000D04, 0x0004},
+{0x000D0D, 0x0001},
+{0x000D0E, 0x0004},
+{0x000D11, 0x0001},
+{0x000D12, 0x0004},
+{0x000D3B, 0x0010},
+{0x000D3D, 0x0004},
+{0x000D3E, 0x0010},
+{0x000D45, 0x0001},
+{0x000D46, 0x0010},
+{0x000D49, 0x0001},
+{0x000D4A, 0x0010},
+{0x000D4E, 0x0004},
+{0x000D4F, 0x0040},
+{0x000D50, 0x0001},
+{0x000D54, 0x0004},
+{0x000D57, 0x0010},
+{0x000D58, 0x0002},
+{0x000D5F, 0x0004},
+{0x000D62, 0x0010},
+{0x000D64, 0x0001},
+{0x000D66, 0x0002},
+{0x000D79, 0x0040},
+{0x000D7A, 0x0004},
+{0x000D80, 0x0001},
+{0x000D81, 0x0010},
+{0x000D84, 0x0001},
+{0x000D85, 0x0004},
+{0x000D97, 0x0001},
+{0x000D9A, 0x0004},
+{0x000DB2, 0x0001},
+{0x000DB3, 0x0004},
+{0x000DBC, 0x0001},
+{0x000DBD, 0x0004},
+{0x000DBE, 0x0001},
+{0x000DC0, 0x0004},
+{0x000DC7, 0x0001},
+{0x000DCA, 0x0010},
+{0x000DCB, 0x0001},
+{0x000DCF, 0x0010},
+{0x000DD5, 0x0001},
+{0x000DD6, 0x0010},
+{0x000DD7, 0x0001},
+{0x000DD8, 0x0010},
+{0x000DE0, 0x0001},
+{0x000DE6, 0x0002},
+{0x000DF0, 0x0001},
+{0x000DF2, 0x0010},
+{0x000DF4, 0x0020},
+{0x000DF5, 0x0001},
+{0x000E01, 0x0004},
+{0x000E31, 0x0010},
+{0x000E32, 0x0004},
+{0x000E34, 0x0010},
+{0x000E3B, 0x0001},
+{0x000E3F, 0x0040},
+{0x000E40, 0x0004},
+{0x000E47, 0x0010},
+{0x000E4F, 0x0020},
+{0x000E50, 0x0002},
+{0x000E5A, 0x0020},
+{0x000E5C, 0x0001},
+{0x000E81, 0x0004},
+{0x000E83, 0x0001},
+{0x000E84, 0x0004},
+{0x000E85, 0x0001},
+{0x000E86, 0x0004},
+{0x000E8B, 0x0001},
+{0x000E8C, 0x0004},
+{0x000EA4, 0x0001},
+{0x000EA5, 0x0004},
+{0x000EA6, 0x0001},
+{0x000EA7, 0x0004},
+{0x000EB1, 0x0010},
+{0x000EB2, 0x0004},
+{0x000EB4, 0x0010},
+{0x000EBD, 0x0004},
+{0x000EBE, 0x0001},
+{0x000EC0, 0x0004},
+{0x000EC5, 0x0001},
+{0x000EC6, 0x0004},
+{0x000EC7, 0x0001},
+{0x000EC8, 0x0010},
+{0x000ECF, 0x0001},
+{0x000ED0, 0x0002},
+{0x000EDA, 0x0001},
+{0x000EDC, 0x0004},
+{0x000EE0, 0x0001},
+{0x000F00, 0x0004},
+{0x000F01, 0x0040},
+{0x000F04, 0x0020},
+{0x000F13, 0x0040},
+{0x000F14, 0x0020},
+{0x000F15, 0x0040},
+{0x000F18, 0x0010},
+{0x000F1A, 0x0040},
+{0x000F20, 0x0002},
+{0x000F34, 0x0040},
+{0x000F35, 0x0010},
+{0x000F36, 0x0040},
+{0x000F37, 0x0010},
+{0x000F38, 0x0040},
+{0x000F39, 0x0010},
+{0x000F3A, 0x0020},
+{0x000F3E, 0x0010},
+{0x000F40, 0x0004},
+{0x000F48, 0x0001},
+{0x000F49, 0x0004},
+{0x000F6D, 0x0001},
+{0x000F71, 0x0010},
+{0x000F85, 0x0020},
+{0x000F86, 0x0010},
+{0x000F88, 0x0004},
+{0x000F8D, 0x0010},
+{0x000F98, 0x0001},
+{0x000F99, 0x0010},
+{0x000FBD, 0x0001},
+{0x000FBE, 0x0040},
+{0x000FC6, 0x0010},
+{0x000FC7, 0x0040},
+{0x000FCD, 0x0001},
+{0x000FCE, 0x0040},
+{0x000FD0, 0x0020},
+{0x000FD5, 0x0040},
+{0x000FD9, 0x0020},
+{0x000FDB, 0x0001},
+{0x001000, 0x0004},
+{0x00102B, 0x0010},
+{0x00103F, 0x0004},
+{0x001040, 0x0002},
+{0x00104A, 0x0020},
+{0x001050, 0x0004},
+{0x001056, 0x0010},
+{0x00105A, 0x0004},
+{0x00105E, 0x0010},
+{0x001061, 0x0004},
+{0x001062, 0x0010},
+{0x001065, 0x0004},
+{0x001067, 0x0010},
+{0x00106E, 0x0004},
+{0x001071, 0x0010},
+{0x001075, 0x0004},
+{0x001082, 0x0010},
+{0x00108E, 0x0004},
+{0x00108F, 0x0010},
+{0x001090, 0x0002},
+{0x00109A, 0x0010},
+{0x00109E, 0x0040},
+{0x0010A0, 0x0004},
+{0x0010C6, 0x0001},
+{0x0010C7, 0x0004},
+{0x0010C8, 0x0001},
+{0x0010CD, 0x0004},
+{0x0010CE, 0x0001},
+{0x0010D0, 0x0004},
+{0x0010FB, 0x0020},
+{0x0010FC, 0x0004},
+{0x001249, 0x0001},
+{0x00124A, 0x0004},
+{0x00124E, 0x0001},
+{0x001250, 0x0004},
+{0x001257, 0x0001},
+{0x001258, 0x0004},
+{0x001259, 0x0001},
+{0x00125A, 0x0004},
+{0x00125E, 0x0001},
+{0x001260, 0x0004},
+{0x001289, 0x0001},
+{0x00128A, 0x0004},
+{0x00128E, 0x0001},
+{0x001290, 0x0004},
+{0x0012B1, 0x0001},
+{0x0012B2, 0x0004},
+{0x0012B6, 0x0001},
+{0x0012B8, 0x0004},
+{0x0012BF, 0x0001},
+{0x0012C0, 0x0004},
+{0x0012C1, 0x0001},
+{0x0012C2, 0x0004},
+{0x0012C6, 0x0001},
+{0x0012C8, 0x0004},
+{0x0012D7, 0x0001},
+{0x0012D8, 0x0004},
+{0x001311, 0x0001},
+{0x001312, 0x0004},
+{0x001316, 0x0001},
+{0x001318, 0x0004},
+{0x00135B, 0x0001},
+{0x00135D, 0x0010},
+{0x001360, 0x0020},
+{0x001369, 0x0002},
+{0x00137D, 0x0001},
+{0x001380, 0x0004},
+{0x001390, 0x0040},
+{0x00139A, 0x0001},
+{0x0013A0, 0x0004},
+{0x0013F6, 0x0001},
+{0x0013F8, 0x0004},
+{0x0013FE, 0x0001},
+{0x001400, 0x0020},
+{0x001401, 0x0004},
+{0x00166D, 0x0040},
+{0x00166E, 0x0020},
+{0x00166F, 0x0004},
+{0x001680, 0x0008},
+{0x001681, 0x0004},
+{0x00169B, 0x0020},
+{0x00169D, 0x0001},
+{0x0016A0, 0x0004},
+{0x0016EB, 0x0020},
+{0x0016EE, 0x0002},
+{0x0016F1, 0x0004},
+{0x0016F9, 0x0001},
+{0x001700, 0x0004},
+{0x001712, 0x0010},
+{0x001716, 0x0001},
+{0x00171F, 0x0004},
+{0x001732, 0x0010},
+{0x001735, 0x0020},
+{0x001737, 0x0001},
+{0x001740, 0x0004},
+{0x001752, 0x0010},
+{0x001754, 0x0001},
+{0x001760, 0x0004},
+{0x00176D, 0x0001},
+{0x00176E, 0x0004},
+{0x001771, 0x0001},
+{0x001772, 0x0010},
+{0x001774, 0x0001},
+{0x001780, 0x0004},
+{0x0017B4, 0x0010},
+{0x0017D4, 0x0020},
+{0x0017D7, 0x0004},
+{0x0017D8, 0x0020},
+{0x0017DB, 0x0040},
+{0x0017DC, 0x0004},
+{0x0017DD, 0x0010},
+{0x0017DE, 0x0001},
+{0x0017E0, 0x0002},
+{0x0017EA, 0x0001},
+{0x0017F0, 0x0002},
+{0x0017FA, 0x0001},
+{0x001800, 0x0020},
+{0x00180B, 0x0010},
+{0x00180E, 0x0080},
+{0x00180F, 0x0010},
+{0x001810, 0x0002},
+{0x00181A, 0x0001},
+{0x001820, 0x0004},
+{0x001879, 0x0001},
+{0x001880, 0x0004},
+{0x001885, 0x0010},
+{0x001887, 0x0004},
+{0x0018A9, 0x0010},
+{0x0018AA, 0x0004},
+{0x0018AB, 0x0001},
+{0x0018B0, 0x0004},
+{0x0018F6, 0x0001},
+{0x001900, 0x0004},
+{0x00191F, 0x0001},
+{0x001920, 0x0010},
+{0x00192C, 0x0001},
+{0x001930, 0x0010},
+{0x00193C, 0x0001},
+{0x001940, 0x0040},
+{0x001941, 0x0001},
+{0x001944, 0x0020},
+{0x001946, 0x0002},
+{0x001950, 0x0004},
+{0x00196E, 0x0001},
+{0x001970, 0x0004},
+{0x001975, 0x0001},
+{0x001980, 0x0004},
+{0x0019AC, 0x0001},
+{0x0019B0, 0x0004},
+{0x0019CA, 0x0001},
+{0x0019D0, 0x0002},
+{0x0019DB, 0x0001},
+{0x0019DE, 0x0040},
+{0x001A00, 0x0004},
+{0x001A17, 0x0010},
+{0x001A1C, 0x0001},
+{0x001A1E, 0x0020},
+{0x001A20, 0x0004},
+{0x001A55, 0x0010},
+{0x001A5F, 0x0001},
+{0x001A60, 0x0010},
+{0x001A7D, 0x0001},
+{0x001A7F, 0x0010},
+{0x001A80, 0x0002},
+{0x001A8A, 0x0001},
+{0x001A90, 0x0002},
+{0x001A9A, 0x0001},
+{0x001AA0, 0x0020},
+{0x001AA7, 0x0004},
+{0x001AA8, 0x0020},
+{0x001AAE, 0x0001},
+{0x001AB0, 0x0010},
+{0x001ACF, 0x0001},
+{0x001B00, 0x0010},
+{0x001B05, 0x0004},
+{0x001B34, 0x0010},
+{0x001B45, 0x0004},
+{0x001B4D, 0x0001},
+{0x001B50, 0x0002},
+{0x001B5A, 0x0020},
+{0x001B61, 0x0040},
+{0x001B6B, 0x0010},
+{0x001B74, 0x0040},
+{0x001B7D, 0x0020},
+{0x001B7F, 0x0001},
+{0x001B80, 0x0010},
+{0x001B83, 0x0004},
+{0x001BA1, 0x0010},
+{0x001BAE, 0x0004},
+{0x001BB0, 0x0002},
+{0x001BBA, 0x0004},
+{0x001BE6, 0x0010},
+{0x001BF4, 0x0001},
+{0x001BFC, 0x0020},
+{0x001C00, 0x0004},
+{0x001C24, 0x0010},
+{0x001C38, 0x0001},
+{0x001C3B, 0x0020},
+{0x001C40, 0x0002},
+{0x001C4A, 0x0001},
+{0x001C4D, 0x0004},
+{0x001C50, 0x0002},
+{0x001C5A, 0x0004},
+{0x001C7E, 0x0020},
+{0x001C80, 0x0004},
+{0x001C89, 0x0001},
+{0x001C90, 0x0004},
+{0x001CBB, 0x0001},
+{0x001CBD, 0x0004},
+{0x001CC0, 0x0020},
+{0x001CC8, 0x0001},
+{0x001CD0, 0x0010},
+{0x001CD3, 0x0020},
+{0x001CD4, 0x0010},
+{0x001CE9, 0x0004},
+{0x001CED, 0x0010},
+{0x001CEE, 0x0004},
+{0x001CF4, 0x0010},
+{0x001CF5, 0x0004},
+{0x001CF7, 0x0010},
+{0x001CFA, 0x0004},
+{0x001CFB, 0x0001},
+{0x001D00, 0x0004},
+{0x001DC0, 0x0010},
+{0x001E00, 0x0004},
+{0x001F16, 0x0001},
+{0x001F18, 0x0004},
+{0x001F1E, 0x0001},
+{0x001F20, 0x0004},
+{0x001F46, 0x0001},
+{0x001F48, 0x0004},
+{0x001F4E, 0x0001},
+{0x001F50, 0x0004},
+{0x001F58, 0x0001},
+{0x001F59, 0x0004},
+{0x001F5A, 0x0001},
+{0x001F5B, 0x0004},
+{0x001F5C, 0x0001},
+{0x001F5D, 0x0004},
+{0x001F5E, 0x0001},
+{0x001F5F, 0x0004},
+{0x001F7E, 0x0001},
+{0x001F80, 0x0004},
+{0x001FB5, 0x0001},
+{0x001FB6, 0x0004},
+{0x001FBD, 0x0040},
+{0x001FBE, 0x0004},
+{0x001FBF, 0x0040},
+{0x001FC2, 0x0004},
+{0x001FC5, 0x0001},
+{0x001FC6, 0x0004},
+{0x001FCD, 0x0040},
+{0x001FD0, 0x0004},
+{0x001FD4, 0x0001},
+{0x001FD6, 0x0004},
+{0x001FDC, 0x0001},
+{0x001FDD, 0x0040},
+{0x001FE0, 0x0004},
+{0x001FED, 0x0040},
+{0x001FF0, 0x0001},
+{0x001FF2, 0x0004},
+{0x001FF5, 0x0001},
+{0x001FF6, 0x0004},
+{0x001FFD, 0x0040},
+{0x001FFF, 0x0001},
+{0x002000, 0x0008},
+{0x00200B, 0x0080},
+{0x002010, 0x0020},
+{0x002028, 0x0008},
+{0x00202A, 0x0080},
+{0x00202F, 0x0008},
+{0x002030, 0x0020},
+{0x002044, 0x0040},
+{0x002045, 0x0020},
+{0x002052, 0x0040},
+{0x002053, 0x0020},
+{0x00205F, 0x0008},
+{0x002060, 0x0080},
+{0x002065, 0x0001},
+{0x002066, 0x0080},
+{0x002070, 0x0002},
+{0x002071, 0x0004},
+{0x002072, 0x0001},
+{0x002074, 0x0002},
+{0x00207A, 0x0040},
+{0x00207D, 0x0020},
+{0x00207F, 0x0004},
+{0x002080, 0x0002},
+{0x00208A, 0x0040},
+{0x00208D, 0x0020},
+{0x00208F, 0x0001},
+{0x002090, 0x0004},
+{0x00209D, 0x0001},
+{0x0020A0, 0x0040},
+{0x0020C1, 0x0001},
+{0x0020D0, 0x0010},
+{0x0020F1, 0x0001},
+{0x002100, 0x0040},
+{0x002102, 0x0004},
+{0x002103, 0x0040},
+{0x002107, 0x0004},
+{0x002108, 0x0040},
+{0x00210A, 0x0004},
+{0x002114, 0x0040},
+{0x002115, 0x0004},
+{0x002116, 0x0040},
+{0x002119, 0x0004},
+{0x00211E, 0x0040},
+{0x002124, 0x0004},
+{0x002125, 0x0040},
+{0x002126, 0x0004},
+{0x002127, 0x0040},
+{0x002128, 0x0004},
+{0x002129, 0x0040},
+{0x00212A, 0x0004},
+{0x00212E, 0x0040},
+{0x00212F, 0x0004},
+{0x00213A, 0x0040},
+{0x00213C, 0x0004},
+{0x002140, 0x0040},
+{0x002145, 0x0004},
+{0x00214A, 0x0040},
+{0x00214E, 0x0004},
+{0x00214F, 0x0040},
+{0x002150, 0x0002},
+{0x002183, 0x0004},
+{0x002185, 0x0002},
+{0x00218A, 0x0040},
+{0x00218C, 0x0001},
+{0x002190, 0x0040},
+{0x002308, 0x0020},
+{0x00230C, 0x0040},
+{0x002329, 0x0020},
+{0x00232B, 0x0040},
+{0x002427, 0x0001},
+{0x002440, 0x0040},
+{0x00244B, 0x0001},
+{0x002460, 0x0002},
+{0x00249C, 0x0040},
+{0x0024EA, 0x0002},
+{0x002500, 0x0040},
+{0x002768, 0x0020},
+{0x002776, 0x0002},
+{0x002794, 0x0040},
+{0x0027C5, 0x0020},
+{0x0027C7, 0x0040},
+{0x0027E6, 0x0020},
+{0x0027F0, 0x0040},
+{0x002983, 0x0020},
+{0x002999, 0x0040},
+{0x0029D8, 0x0020},
+{0x0029DC, 0x0040},
+{0x0029FC, 0x0020},
+{0x0029FE, 0x0040},
+{0x002B74, 0x0001},
+{0x002B76, 0x0040},
+{0x002B96, 0x0001},
+{0x002B97, 0x0040},
+{0x002C00, 0x0004},
+{0x002CE5, 0x0040},
+{0x002CEB, 0x0004},
+{0x002CEF, 0x0010},
+{0x002CF2, 0x0004},
+{0x002CF4, 0x0001},
+{0x002CF9, 0x0020},
+{0x002CFD, 0x0002},
+{0x002CFE, 0x0020},
+{0x002D00, 0x0004},
+{0x002D26, 0x0001},
+{0x002D27, 0x0004},
+{0x002D28, 0x0001},
+{0x002D2D, 0x0004},
+{0x002D2E, 0x0001},
+{0x002D30, 0x0004},
+{0x002D68, 0x0001},
+{0x002D6F, 0x0004},
+{0x002D70, 0x0020},
+{0x002D71, 0x0001},
+{0x002D7F, 0x0010},
+{0x002D80, 0x0004},
+{0x002D97, 0x0001},
+{0x002DA0, 0x0004},
+{0x002DA7, 0x0001},
+{0x002DA8, 0x0004},
+{0x002DAF, 0x0001},
+{0x002DB0, 0x0004},
+{0x002DB7, 0x0001},
+{0x002DB8, 0x0004},
+{0x002DBF, 0x0001},
+{0x002DC0, 0x0004},
+{0x002DC7, 0x0001},
+{0x002DC8, 0x0004},
+{0x002DCF, 0x0001},
+{0x002DD0, 0x0004},
+{0x002DD7, 0x0001},
+{0x002DD8, 0x0004},
+{0x002DDF, 0x0001},
+{0x002DE0, 0x0010},
+{0x002E00, 0x0020},
+{0x002E2F, 0x0004},
+{0x002E30, 0x0020},
+{0x002E50, 0x0040},
+{0x002E52, 0x0020},
+{0x002E5E, 0x0001},
+{0x002E80, 0x0040},
+{0x002E9A, 0x0001},
+{0x002E9B, 0x0040},
+{0x002EF4, 0x0001},
+{0x002F00, 0x0040},
+{0x002FD6, 0x0001},
+{0x002FF0, 0x0040},
+{0x003000, 0x0008},
+{0x003001, 0x0020},
+{0x003004, 0x0040},
+{0x003005, 0x0004},
+{0x003007, 0x0002},
+{0x003008, 0x0020},
+{0x003012, 0x0040},
+{0x003014, 0x0020},
+{0x003020, 0x0040},
+{0x003021, 0x0002},
+{0x00302A, 0x0010},
+{0x003030, 0x0020},
+{0x003031, 0x0004},
+{0x003036, 0x0040},
+{0x003038, 0x0002},
+{0x00303B, 0x0004},
+{0x00303D, 0x0020},
+{0x00303E, 0x0040},
+{0x003040, 0x0001},
+{0x003041, 0x0004},
+{0x003097, 0x0001},
+{0x003099, 0x0010},
+{0x00309B, 0x0040},
+{0x00309D, 0x0004},
+{0x0030A0, 0x0020},
+{0x0030A1, 0x0004},
+{0x0030FB, 0x0020},
+{0x0030FC, 0x0004},
+{0x003100, 0x0001},
+{0x003105, 0x0004},
+{0x003130, 0x0001},
+{0x003131, 0x0004},
+{0x00318F, 0x0001},
+{0x003190, 0x0040},
+{0x003192, 0x0002},
+{0x003196, 0x0040},
+{0x0031A0, 0x0004},
+{0x0031C0, 0x0040},
+{0x0031E4, 0x0001},
+{0x0031EF, 0x0040},
+{0x0031F0, 0x0004},
+{0x003200, 0x0040},
+{0x00321F, 0x0001},
+{0x003220, 0x0002},
+{0x00322A, 0x0040},
+{0x003248, 0x0002},
+{0x003250, 0x0040},
+{0x003251, 0x0002},
+{0x003260, 0x0040},
+{0x003280, 0x0002},
+{0x00328A, 0x0040},
+{0x0032B1, 0x0002},
+{0x0032C0, 0x0040},
+{0x003400, 0x0004},
+{0x004DC0, 0x0040},
+{0x004E00, 0x0004},
+{0x00A48D, 0x0001},
+{0x00A490, 0x0040},
+{0x00A4C7, 0x0001},
+{0x00A4D0, 0x0004},
+{0x00A4FE, 0x0020},
+{0x00A500, 0x0004},
+{0x00A60D, 0x0020},
+{0x00A610, 0x0004},
+{0x00A620, 0x0002},
+{0x00A62A, 0x0004},
+{0x00A62C, 0x0001},
+{0x00A640, 0x0004},
+{0x00A66F, 0x0010},
+{0x00A673, 0x0020},
+{0x00A674, 0x0010},
+{0x00A67E, 0x0020},
+{0x00A67F, 0x0004},
+{0x00A69E, 0x0010},
+{0x00A6A0, 0x0004},
+{0x00A6E6, 0x0002},
+{0x00A6F0, 0x0010},
+{0x00A6F2, 0x0020},
+{0x00A6F8, 0x0001},
+{0x00A700, 0x0040},
+{0x00A717, 0x0004},
+{0x00A720, 0x0040},
+{0x00A722, 0x0004},
+{0x00A789, 0x0040},
+{0x00A78B, 0x0004},
+{0x00A7CB, 0x0001},
+{0x00A7D0, 0x0004},
+{0x00A7D2, 0x0001},
+{0x00A7D3, 0x0004},
+{0x00A7D4, 0x0001},
+{0x00A7D5, 0x0004},
+{0x00A7DA, 0x0001},
+{0x00A7F2, 0x0004},
+{0x00A802, 0x0010},
+{0x00A803, 0x0004},
+{0x00A806, 0x0010},
+{0x00A807, 0x0004},
+{0x00A80B, 0x0010},
+{0x00A80C, 0x0004},
+{0x00A823, 0x0010},
+{0x00A828, 0x0040},
+{0x00A82C, 0x0010},
+{0x00A82D, 0x0001},
+{0x00A830, 0x0002},
+{0x00A836, 0x0040},
+{0x00A83A, 0x0001},
+{0x00A840, 0x0004},
+{0x00A874, 0x0020},
+{0x00A878, 0x0001},
+{0x00A880, 0x0010},
+{0x00A882, 0x0004},
+{0x00A8B4, 0x0010},
+{0x00A8C6, 0x0001},
+{0x00A8CE, 0x0020},
+{0x00A8D0, 0x0002},
+{0x00A8DA, 0x0001},
+{0x00A8E0, 0x0010},
+{0x00A8F2, 0x0004},
+{0x00A8F8, 0x0020},
+{0x00A8FB, 0x0004},
+{0x00A8FC, 0x0020},
+{0x00A8FD, 0x0004},
+{0x00A8FF, 0x0010},
+{0x00A900, 0x0002},
+{0x00A90A, 0x0004},
+{0x00A926, 0x0010},
+{0x00A92E, 0x0020},
+{0x00A930, 0x0004},
+{0x00A947, 0x0010},
+{0x00A954, 0x0001},
+{0x00A95F, 0x0020},
+{0x00A960, 0x0004},
+{0x00A97D, 0x0001},
+{0x00A980, 0x0010},
+{0x00A984, 0x0004},
+{0x00A9B3, 0x0010},
+{0x00A9C1, 0x0020},
+{0x00A9CE, 0x0001},
+{0x00A9CF, 0x0004},
+{0x00A9D0, 0x0002},
+{0x00A9DA, 0x0001},
+{0x00A9DE, 0x0020},
+{0x00A9E0, 0x0004},
+{0x00A9E5, 0x0010},
+{0x00A9E6, 0x0004},
+{0x00A9F0, 0x0002},
+{0x00A9FA, 0x0004},
+{0x00A9FF, 0x0001},
+{0x00AA00, 0x0004},
+{0x00AA29, 0x0010},
+{0x00AA37, 0x0001},
+{0x00AA40, 0x0004},
+{0x00AA43, 0x0010},
+{0x00AA44, 0x0004},
+{0x00AA4C, 0x0010},
+{0x00AA4E, 0x0001},
+{0x00AA50, 0x0002},
+{0x00AA5A, 0x0001},
+{0x00AA5C, 0x0020},
+{0x00AA60, 0x0004},
+{0x00AA77, 0x0040},
+{0x00AA7A, 0x0004},
+{0x00AA7B, 0x0010},
+{0x00AA7E, 0x0004},
+{0x00AAB0, 0x0010},
+{0x00AAB1, 0x0004},
+{0x00AAB2, 0x0010},
+{0x00AAB5, 0x0004},
+{0x00AAB7, 0x0010},
+{0x00AAB9, 0x0004},
+{0x00AABE, 0x0010},
+{0x00AAC0, 0x0004},
+{0x00AAC1, 0x0010},
+{0x00AAC2, 0x0004},
+{0x00AAC3, 0x0001},
+{0x00AADB, 0x0004},
+{0x00AADE, 0x0020},
+{0x00AAE0, 0x0004},
+{0x00AAEB, 0x0010},
+{0x00AAF0, 0x0020},
+{0x00AAF2, 0x0004},
+{0x00AAF5, 0x0010},
+{0x00AAF7, 0x0001},
+{0x00AB01, 0x0004},
+{0x00AB07, 0x0001},
+{0x00AB09, 0x0004},
+{0x00AB0F, 0x0001},
+{0x00AB11, 0x0004},
+{0x00AB17, 0x0001},
+{0x00AB20, 0x0004},
+{0x00AB27, 0x0001},
+{0x00AB28, 0x0004},
+{0x00AB2F, 0x0001},
+{0x00AB30, 0x0004},
+{0x00AB5B, 0x0040},
+{0x00AB5C, 0x0004},
+{0x00AB6A, 0x0040},
+{0x00AB6C, 0x0001},
+{0x00AB70, 0x0004},
+{0x00ABE3, 0x0010},
+{0x00ABEB, 0x0020},
+{0x00ABEC, 0x0010},
+{0x00ABEE, 0x0001},
+{0x00ABF0, 0x0002},
+{0x00ABFA, 0x0001},
+{0x00AC00, 0x0004},
+{0x00D7A4, 0x0001},
+{0x00D7B0, 0x0004},
+{0x00D7C7, 0x0001},
+{0x00D7CB, 0x0004},
+{0x00D7FC, 0x0001},
+{0x00D800, 0x0080},
+{0x00F900, 0x0004},
+{0x00FA6E, 0x0001},
+{0x00FA70, 0x0004},
+{0x00FADA, 0x0001},
+{0x00FB00, 0x0004},
+{0x00FB07, 0x0001},
+{0x00FB13, 0x0004},
+{0x00FB18, 0x0001},
+{0x00FB1D, 0x0004},
+{0x00FB1E, 0x0010},
+{0x00FB1F, 0x0004},
+{0x00FB29, 0x0040},
+{0x00FB2A, 0x0004},
+{0x00FB37, 0x0001},
+{0x00FB38, 0x0004},
+{0x00FB3D, 0x0001},
+{0x00FB3E, 0x0004},
+{0x00FB3F, 0x0001},
+{0x00FB40, 0x0004},
+{0x00FB42, 0x0001},
+{0x00FB43, 0x0004},
+{0x00FB45, 0x0001},
+{0x00FB46, 0x0004},
+{0x00FBB2, 0x0040},
+{0x00FBC3, 0x0001},
+{0x00FBD3, 0x0004},
+{0x00FD3E, 0x0020},
+{0x00FD40, 0x0040},
+{0x00FD50, 0x0004},
+{0x00FD90, 0x0001},
+{0x00FD92, 0x0004},
+{0x00FDC8, 0x0001},
+{0x00FDCF, 0x0040},
+{0x00FDD0, 0x0001},
+{0x00FDF0, 0x0004},
+{0x00FDFC, 0x0040},
+{0x00FE00, 0x0010},
+{0x00FE10, 0x0020},
+{0x00FE1A, 0x0001},
+{0x00FE20, 0x0010},
+{0x00FE30, 0x0020},
+{0x00FE53, 0x0001},
+{0x00FE54, 0x0020},
+{0x00FE62, 0x0040},
+{0x00FE63, 0x0020},
+{0x00FE64, 0x0040},
+{0x00FE67, 0x0001},
+{0x00FE68, 0x0020},
+{0x00FE69, 0x0040},
+{0x00FE6A, 0x0020},
+{0x00FE6C, 0x0001},
+{0x00FE70, 0x0004},
+{0x00FE75, 0x0001},
+{0x00FE76, 0x0004},
+{0x00FEFD, 0x0001},
+{0x00FEFF, 0x0080},
+{0x00FF00, 0x0001},
+{0x00FF01, 0x0020},
+{0x00FF04, 0x0040},
+{0x00FF05, 0x0020},
+{0x00FF0B, 0x0040},
+{0x00FF0C, 0x0020},
+{0x00FF10, 0x0002},
+{0x00FF1A, 0x0020},
+{0x00FF1C, 0x0040},
+{0x00FF1F, 0x0020},
+{0x00FF21, 0x0004},
+{0x00FF3B, 0x0020},
+{0x00FF3E, 0x0040},
+{0x00FF3F, 0x0020},
+{0x00FF40, 0x0040},
+{0x00FF41, 0x0004},
+{0x00FF5B, 0x0020},
+{0x00FF5C, 0x0040},
+{0x00FF5D, 0x0020},
+{0x00FF5E, 0x0040},
+{0x00FF5F, 0x0020},
+{0x00FF66, 0x0004},
+{0x00FFBF, 0x0001},
+{0x00FFC2, 0x0004},
+{0x00FFC8, 0x0001},
+{0x00FFCA, 0x0004},
+{0x00FFD0, 0x0001},
+{0x00FFD2, 0x0004},
+{0x00FFD8, 0x0001},
+{0x00FFDA, 0x0004},
+{0x00FFDD, 0x0001},
+{0x00FFE0, 0x0040},
+{0x00FFE7, 0x0001},
+{0x00FFE8, 0x0040},
+{0x00FFEF, 0x0001},
+{0x00FFF9, 0x0080},
+{0x00FFFC, 0x0040},
+{0x00FFFE, 0x0001},
+{0x010000, 0x0004},
+{0x01000C, 0x0001},
+{0x01000D, 0x0004},
+{0x010027, 0x0001},
+{0x010028, 0x0004},
+{0x01003B, 0x0001},
+{0x01003C, 0x0004},
+{0x01003E, 0x0001},
+{0x01003F, 0x0004},
+{0x01004E, 0x0001},
+{0x010050, 0x0004},
+{0x01005E, 0x0001},
+{0x010080, 0x0004},
+{0x0100FB, 0x0001},
+{0x010100, 0x0020},
+{0x010103, 0x0001},
+{0x010107, 0x0002},
+{0x010134, 0x0001},
+{0x010137, 0x0040},
+{0x010140, 0x0002},
+{0x010179, 0x0040},
+{0x01018A, 0x0002},
+{0x01018C, 0x0040},
+{0x01018F, 0x0001},
+{0x010190, 0x0040},
+{0x01019D, 0x0001},
+{0x0101A0, 0x0040},
+{0x0101A1, 0x0001},
+{0x0101D0, 0x0040},
+{0x0101FD, 0x0010},
+{0x0101FE, 0x0001},
+{0x010280, 0x0004},
+{0x01029D, 0x0001},
+{0x0102A0, 0x0004},
+{0x0102D1, 0x0001},
+{0x0102E0, 0x0010},
+{0x0102E1, 0x0002},
+{0x0102FC, 0x0001},
+{0x010300, 0x0004},
+{0x010320, 0x0002},
+{0x010324, 0x0001},
+{0x01032D, 0x0004},
+{0x010341, 0x0002},
+{0x010342, 0x0004},
+{0x01034A, 0x0002},
+{0x01034B, 0x0001},
+{0x010350, 0x0004},
+{0x010376, 0x0010},
+{0x01037B, 0x0001},
+{0x010380, 0x0004},
+{0x01039E, 0x0001},
+{0x01039F, 0x0020},
+{0x0103A0, 0x0004},
+{0x0103C4, 0x0001},
+{0x0103C8, 0x0004},
+{0x0103D0, 0x0020},
+{0x0103D1, 0x0002},
+{0x0103D6, 0x0001},
+{0x010400, 0x0004},
+{0x01049E, 0x0001},
+{0x0104A0, 0x0002},
+{0x0104AA, 0x0001},
+{0x0104B0, 0x0004},
+{0x0104D4, 0x0001},
+{0x0104D8, 0x0004},
+{0x0104FC, 0x0001},
+{0x010500, 0x0004},
+{0x010528, 0x0001},
+{0x010530, 0x0004},
+{0x010564, 0x0001},
+{0x01056F, 0x0020},
+{0x010570, 0x0004},
+{0x01057B, 0x0001},
+{0x01057C, 0x0004},
+{0x01058B, 0x0001},
+{0x01058C, 0x0004},
+{0x010593, 0x0001},
+{0x010594, 0x0004},
+{0x010596, 0x0001},
+{0x010597, 0x0004},
+{0x0105A2, 0x0001},
+{0x0105A3, 0x0004},
+{0x0105B2, 0x0001},
+{0x0105B3, 0x0004},
+{0x0105BA, 0x0001},
+{0x0105BB, 0x0004},
+{0x0105BD, 0x0001},
+{0x010600, 0x0004},
+{0x010737, 0x0001},
+{0x010740, 0x0004},
+{0x010756, 0x0001},
+{0x010760, 0x0004},
+{0x010768, 0x0001},
+{0x010780, 0x0004},
+{0x010786, 0x0001},
+{0x010787, 0x0004},
+{0x0107B1, 0x0001},
+{0x0107B2, 0x0004},
+{0x0107BB, 0x0001},
+{0x010800, 0x0004},
+{0x010806, 0x0001},
+{0x010808, 0x0004},
+{0x010809, 0x0001},
+{0x01080A, 0x0004},
+{0x010836, 0x0001},
+{0x010837, 0x0004},
+{0x010839, 0x0001},
+{0x01083C, 0x0004},
+{0x01083D, 0x0001},
+{0x01083F, 0x0004},
+{0x010856, 0x0001},
+{0x010857, 0x0020},
+{0x010858, 0x0002},
+{0x010860, 0x0004},
+{0x010877, 0x0040},
+{0x010879, 0x0002},
+{0x010880, 0x0004},
+{0x01089F, 0x0001},
+{0x0108A7, 0x0002},
+{0x0108B0, 0x0001},
+{0x0108E0, 0x0004},
+{0x0108F3, 0x0001},
+{0x0108F4, 0x0004},
+{0x0108F6, 0x0001},
+{0x0108FB, 0x0002},
+{0x010900, 0x0004},
+{0x010916, 0x0002},
+{0x01091C, 0x0001},
+{0x01091F, 0x0020},
+{0x010920, 0x0004},
+{0x01093A, 0x0001},
+{0x01093F, 0x0020},
+{0x010940, 0x0001},
+{0x010980, 0x0004},
+{0x0109B8, 0x0001},
+{0x0109BC, 0x0002},
+{0x0109BE, 0x0004},
+{0x0109C0, 0x0002},
+{0x0109D0, 0x0001},
+{0x0109D2, 0x0002},
+{0x010A00, 0x0004},
+{0x010A01, 0x0010},
+{0x010A04, 0x0001},
+{0x010A05, 0x0010},
+{0x010A07, 0x0001},
+{0x010A0C, 0x0010},
+{0x010A10, 0x0004},
+{0x010A14, 0x0001},
+{0x010A15, 0x0004},
+{0x010A18, 0x0001},
+{0x010A19, 0x0004},
+{0x010A36, 0x0001},
+{0x010A38, 0x0010},
+{0x010A3B, 0x0001},
+{0x010A3F, 0x0010},
+{0x010A40, 0x0002},
+{0x010A49, 0x0001},
+{0x010A50, 0x0020},
+{0x010A59, 0x0001},
+{0x010A60, 0x0004},
+{0x010A7D, 0x0002},
+{0x010A7F, 0x0020},
+{0x010A80, 0x0004},
+{0x010A9D, 0x0002},
+{0x010AA0, 0x0001},
+{0x010AC0, 0x0004},
+{0x010AC8, 0x0040},
+{0x010AC9, 0x0004},
+{0x010AE5, 0x0010},
+{0x010AE7, 0x0001},
+{0x010AEB, 0x0002},
+{0x010AF0, 0x0020},
+{0x010AF7, 0x0001},
+{0x010B00, 0x0004},
+{0x010B36, 0x0001},
+{0x010B39, 0x0020},
+{0x010B40, 0x0004},
+{0x010B56, 0x0001},
+{0x010B58, 0x0002},
+{0x010B60, 0x0004},
+{0x010B73, 0x0001},
+{0x010B78, 0x0002},
+{0x010B80, 0x0004},
+{0x010B92, 0x0001},
+{0x010B99, 0x0020},
+{0x010B9D, 0x0001},
+{0x010BA9, 0x0002},
+{0x010BB0, 0x0001},
+{0x010C00, 0x0004},
+{0x010C49, 0x0001},
+{0x010C80, 0x0004},
+{0x010CB3, 0x0001},
+{0x010CC0, 0x0004},
+{0x010CF3, 0x0001},
+{0x010CFA, 0x0002},
+{0x010D00, 0x0004},
+{0x010D24, 0x0010},
+{0x010D28, 0x0001},
+{0x010D30, 0x0002},
+{0x010D3A, 0x0001},
+{0x010E60, 0x0002},
+{0x010E7F, 0x0001},
+{0x010E80, 0x0004},
+{0x010EAA, 0x0001},
+{0x010EAB, 0x0010},
+{0x010EAD, 0x0020},
+{0x010EAE, 0x0001},
+{0x010EB0, 0x0004},
+{0x010EB2, 0x0001},
+{0x010EFD, 0x0010},
+{0x010F00, 0x0004},
+{0x010F1D, 0x0002},
+{0x010F27, 0x0004},
+{0x010F28, 0x0001},
+{0x010F30, 0x0004},
+{0x010F46, 0x0010},
+{0x010F51, 0x0002},
+{0x010F55, 0x0020},
+{0x010F5A, 0x0001},
+{0x010F70, 0x0004},
+{0x010F82, 0x0010},
+{0x010F86, 0x0020},
+{0x010F8A, 0x0001},
+{0x010FB0, 0x0004},
+{0x010FC5, 0x0002},
+{0x010FCC, 0x0001},
+{0x010FE0, 0x0004},
+{0x010FF7, 0x0001},
+{0x011000, 0x0010},
+{0x011003, 0x0004},
+{0x011038, 0x0010},
+{0x011047, 0x0020},
+{0x01104E, 0x0001},
+{0x011052, 0x0002},
+{0x011070, 0x0010},
+{0x011071, 0x0004},
+{0x011073, 0x0010},
+{0x011075, 0x0004},
+{0x011076, 0x0001},
+{0x01107F, 0x0010},
+{0x011083, 0x0004},
+{0x0110B0, 0x0010},
+{0x0110BB, 0x0020},
+{0x0110BD, 0x0080},
+{0x0110BE, 0x0020},
+{0x0110C2, 0x0010},
+{0x0110C3, 0x0001},
+{0x0110CD, 0x0080},
+{0x0110CE, 0x0001},
+{0x0110D0, 0x0004},
+{0x0110E9, 0x0001},
+{0x0110F0, 0x0002},
+{0x0110FA, 0x0001},
+{0x011100, 0x0010},
+{0x011103, 0x0004},
+{0x011127, 0x0010},
+{0x011135, 0x0001},
+{0x011136, 0x0002},
+{0x011140, 0x0020},
+{0x011144, 0x0004},
+{0x011145, 0x0010},
+{0x011147, 0x0004},
+{0x011148, 0x0001},
+{0x011150, 0x0004},
+{0x011173, 0x0010},
+{0x011174, 0x0020},
+{0x011176, 0x0004},
+{0x011177, 0x0001},
+{0x011180, 0x0010},
+{0x011183, 0x0004},
+{0x0111B3, 0x0010},
+{0x0111C1, 0x0004},
+{0x0111C5, 0x0020},
+{0x0111C9, 0x0010},
+{0x0111CD, 0x0020},
+{0x0111CE, 0x0010},
+{0x0111D0, 0x0002},
+{0x0111DA, 0x0004},
+{0x0111DB, 0x0020},
+{0x0111DC, 0x0004},
+{0x0111DD, 0x0020},
+{0x0111E0, 0x0001},
+{0x0111E1, 0x0002},
+{0x0111F5, 0x0001},
+{0x011200, 0x0004},
+{0x011212, 0x0001},
+{0x011213, 0x0004},
+{0x01122C, 0x0010},
+{0x011238, 0x0020},
+{0x01123E, 0x0010},
+{0x01123F, 0x0004},
+{0x011241, 0x0010},
+{0x011242, 0x0001},
+{0x011280, 0x0004},
+{0x011287, 0x0001},
+{0x011288, 0x0004},
+{0x011289, 0x0001},
+{0x01128A, 0x0004},
+{0x01128E, 0x0001},
+{0x01128F, 0x0004},
+{0x01129E, 0x0001},
+{0x01129F, 0x0004},
+{0x0112A9, 0x0020},
+{0x0112AA, 0x0001},
+{0x0112B0, 0x0004},
+{0x0112DF, 0x0010},
+{0x0112EB, 0x0001},
+{0x0112F0, 0x0002},
+{0x0112FA, 0x0001},
+{0x011300, 0x0010},
+{0x011304, 0x0001},
+{0x011305, 0x0004},
+{0x01130D, 0x0001},
+{0x01130F, 0x0004},
+{0x011311, 0x0001},
+{0x011313, 0x0004},
+{0x011329, 0x0001},
+{0x01132A, 0x0004},
+{0x011331, 0x0001},
+{0x011332, 0x0004},
+{0x011334, 0x0001},
+{0x011335, 0x0004},
+{0x01133A, 0x0001},
+{0x01133B, 0x0010},
+{0x01133D, 0x0004},
+{0x01133E, 0x0010},
+{0x011345, 0x0001},
+{0x011347, 0x0010},
+{0x011349, 0x0001},
+{0x01134B, 0x0010},
+{0x01134E, 0x0001},
+{0x011350, 0x0004},
+{0x011351, 0x0001},
+{0x011357, 0x0010},
+{0x011358, 0x0001},
+{0x01135D, 0x0004},
+{0x011362, 0x0010},
+{0x011364, 0x0001},
+{0x011366, 0x0010},
+{0x01136D, 0x0001},
+{0x011370, 0x0010},
+{0x011375, 0x0001},
+{0x011400, 0x0004},
+{0x011435, 0x0010},
+{0x011447, 0x0004},
+{0x01144B, 0x0020},
+{0x011450, 0x0002},
+{0x01145A, 0x0020},
+{0x01145C, 0x0001},
+{0x01145D, 0x0020},
+{0x01145E, 0x0010},
+{0x01145F, 0x0004},
+{0x011462, 0x0001},
+{0x011480, 0x0004},
+{0x0114B0, 0x0010},
+{0x0114C4, 0x0004},
+{0x0114C6, 0x0020},
+{0x0114C7, 0x0004},
+{0x0114C8, 0x0001},
+{0x0114D0, 0x0002},
+{0x0114DA, 0x0001},
+{0x011580, 0x0004},
+{0x0115AF, 0x0010},
+{0x0115B6, 0x0001},
+{0x0115B8, 0x0010},
+{0x0115C1, 0x0020},
+{0x0115D8, 0x0004},
+{0x0115DC, 0x0010},
+{0x0115DE, 0x0001},
+{0x011600, 0x0004},
+{0x011630, 0x0010},
+{0x011641, 0x0020},
+{0x011644, 0x0004},
+{0x011645, 0x0001},
+{0x011650, 0x0002},
+{0x01165A, 0x0001},
+{0x011660, 0x0020},
+{0x01166D, 0x0001},
+{0x011680, 0x0004},
+{0x0116AB, 0x0010},
+{0x0116B8, 0x0004},
+{0x0116B9, 0x0020},
+{0x0116BA, 0x0001},
+{0x0116C0, 0x0002},
+{0x0116CA, 0x0001},
+{0x011700, 0x0004},
+{0x01171B, 0x0001},
+{0x01171D, 0x0010},
+{0x01172C, 0x0001},
+{0x011730, 0x0002},
+{0x01173C, 0x0020},
+{0x01173F, 0x0040},
+{0x011740, 0x0004},
+{0x011747, 0x0001},
+{0x011800, 0x0004},
+{0x01182C, 0x0010},
+{0x01183B, 0x0020},
+{0x01183C, 0x0001},
+{0x0118A0, 0x0004},
+{0x0118E0, 0x0002},
+{0x0118F3, 0x0001},
+{0x0118FF, 0x0004},
+{0x011907, 0x0001},
+{0x011909, 0x0004},
+{0x01190A, 0x0001},
+{0x01190C, 0x0004},
+{0x011914, 0x0001},
+{0x011915, 0x0004},
+{0x011917, 0x0001},
+{0x011918, 0x0004},
+{0x011930, 0x0010},
+{0x011936, 0x0001},
+{0x011937, 0x0010},
+{0x011939, 0x0001},
+{0x01193B, 0x0010},
+{0x01193F, 0x0004},
+{0x011940, 0x0010},
+{0x011941, 0x0004},
+{0x011942, 0x0010},
+{0x011944, 0x0020},
+{0x011947, 0x0001},
+{0x011950, 0x0002},
+{0x01195A, 0x0001},
+{0x0119A0, 0x0004},
+{0x0119A8, 0x0001},
+{0x0119AA, 0x0004},
+{0x0119D1, 0x0010},
+{0x0119D8, 0x0001},
+{0x0119DA, 0x0010},
+{0x0119E1, 0x0004},
+{0x0119E2, 0x0020},
+{0x0119E3, 0x0004},
+{0x0119E4, 0x0010},
+{0x0119E5, 0x0001},
+{0x011A00, 0x0004},
+{0x011A01, 0x0010},
+{0x011A0B, 0x0004},
+{0x011A33, 0x0010},
+{0x011A3A, 0x0004},
+{0x011A3B, 0x0010},
+{0x011A3F, 0x0020},
+{0x011A47, 0x0010},
+{0x011A48, 0x0001},
+{0x011A50, 0x0004},
+{0x011A51, 0x0010},
+{0x011A5C, 0x0004},
+{0x011A8A, 0x0010},
+{0x011A9A, 0x0020},
+{0x011A9D, 0x0004},
+{0x011A9E, 0x0020},
+{0x011AA3, 0x0001},
+{0x011AB0, 0x0004},
+{0x011AF9, 0x0001},
+{0x011B00, 0x0020},
+{0x011B0A, 0x0001},
+{0x011C00, 0x0004},
+{0x011C09, 0x0001},
+{0x011C0A, 0x0004},
+{0x011C2F, 0x0010},
+{0x011C37, 0x0001},
+{0x011C38, 0x0010},
+{0x011C40, 0x0004},
+{0x011C41, 0x0020},
+{0x011C46, 0x0001},
+{0x011C50, 0x0002},
+{0x011C6D, 0x0001},
+{0x011C70, 0x0020},
+{0x011C72, 0x0004},
+{0x011C90, 0x0001},
+{0x011C92, 0x0010},
+{0x011CA8, 0x0001},
+{0x011CA9, 0x0010},
+{0x011CB7, 0x0001},
+{0x011D00, 0x0004},
+{0x011D07, 0x0001},
+{0x011D08, 0x0004},
+{0x011D0A, 0x0001},
+{0x011D0B, 0x0004},
+{0x011D31, 0x0010},
+{0x011D37, 0x0001},
+{0x011D3A, 0x0010},
+{0x011D3B, 0x0001},
+{0x011D3C, 0x0010},
+{0x011D3E, 0x0001},
+{0x011D3F, 0x0010},
+{0x011D46, 0x0004},
+{0x011D47, 0x0010},
+{0x011D48, 0x0001},
+{0x011D50, 0x0002},
+{0x011D5A, 0x0001},
+{0x011D60, 0x0004},
+{0x011D66, 0x0001},
+{0x011D67, 0x0004},
+{0x011D69, 0x0001},
+{0x011D6A, 0x0004},
+{0x011D8A, 0x0010},
+{0x011D8F, 0x0001},
+{0x011D90, 0x0010},
+{0x011D92, 0x0001},
+{0x011D93, 0x0010},
+{0x011D98, 0x0004},
+{0x011D99, 0x0001},
+{0x011DA0, 0x0002},
+{0x011DAA, 0x0001},
+{0x011EE0, 0x0004},
+{0x011EF3, 0x0010},
+{0x011EF7, 0x0020},
+{0x011EF9, 0x0001},
+{0x011F00, 0x0010},
+{0x011F02, 0x0004},
+{0x011F03, 0x0010},
+{0x011F04, 0x0004},
+{0x011F11, 0x0001},
+{0x011F12, 0x0004},
+{0x011F34, 0x0010},
+{0x011F3B, 0x0001},
+{0x011F3E, 0x0010},
+{0x011F43, 0x0020},
+{0x011F50, 0x0002},
+{0x011F5A, 0x0001},
+{0x011FB0, 0x0004},
+{0x011FB1, 0x0001},
+{0x011FC0, 0x0002},
+{0x011FD5, 0x0040},
+{0x011FF2, 0x0001},
+{0x011FFF, 0x0020},
+{0x012000, 0x0004},
+{0x01239A, 0x0001},
+{0x012400, 0x0002},
+{0x01246F, 0x0001},
+{0x012470, 0x0020},
+{0x012475, 0x0001},
+{0x012480, 0x0004},
+{0x012544, 0x0001},
+{0x012F90, 0x0004},
+{0x012FF1, 0x0020},
+{0x012FF3, 0x0001},
+{0x013000, 0x0004},
+{0x013430, 0x0080},
+{0x013440, 0x0010},
+{0x013441, 0x0004},
+{0x013447, 0x0010},
+{0x013456, 0x0001},
+{0x014400, 0x0004},
+{0x014647, 0x0001},
+{0x016800, 0x0004},
+{0x016A39, 0x0001},
+{0x016A40, 0x0004},
+{0x016A5F, 0x0001},
+{0x016A60, 0x0002},
+{0x016A6A, 0x0001},
+{0x016A6E, 0x0020},
+{0x016A70, 0x0004},
+{0x016ABF, 0x0001},
+{0x016AC0, 0x0002},
+{0x016ACA, 0x0001},
+{0x016AD0, 0x0004},
+{0x016AEE, 0x0001},
+{0x016AF0, 0x0010},
+{0x016AF5, 0x0020},
+{0x016AF6, 0x0001},
+{0x016B00, 0x0004},
+{0x016B30, 0x0010},
+{0x016B37, 0x0020},
+{0x016B3C, 0x0040},
+{0x016B40, 0x0004},
+{0x016B44, 0x0020},
+{0x016B45, 0x0040},
+{0x016B46, 0x0001},
+{0x016B50, 0x0002},
+{0x016B5A, 0x0001},
+{0x016B5B, 0x0002},
+{0x016B62, 0x0001},
+{0x016B63, 0x0004},
+{0x016B78, 0x0001},
+{0x016B7D, 0x0004},
+{0x016B90, 0x0001},
+{0x016E40, 0x0004},
+{0x016E80, 0x0002},
+{0x016E97, 0x0020},
+{0x016E9B, 0x0001},
+{0x016F00, 0x0004},
+{0x016F4B, 0x0001},
+{0x016F4F, 0x0010},
+{0x016F50, 0x0004},
+{0x016F51, 0x0010},
+{0x016F88, 0x0001},
+{0x016F8F, 0x0010},
+{0x016F93, 0x0004},
+{0x016FA0, 0x0001},
+{0x016FE0, 0x0004},
+{0x016FE2, 0x0020},
+{0x016FE3, 0x0004},
+{0x016FE4, 0x0010},
+{0x016FE5, 0x0001},
+{0x016FF0, 0x0010},
+{0x016FF2, 0x0001},
+{0x017000, 0x0004},
+{0x0187F8, 0x0001},
+{0x018800, 0x0004},
+{0x018CD6, 0x0001},
+{0x018D00, 0x0004},
+{0x018D09, 0x0001},
+{0x01AFF0, 0x0004},
+{0x01AFF4, 0x0001},
+{0x01AFF5, 0x0004},
+{0x01AFFC, 0x0001},
+{0x01AFFD, 0x0004},
+{0x01AFFF, 0x0001},
+{0x01B000, 0x0004},
+{0x01B123, 0x0001},
+{0x01B132, 0x0004},
+{0x01B133, 0x0001},
+{0x01B150, 0x0004},
+{0x01B153, 0x0001},
+{0x01B155, 0x0004},
+{0x01B156, 0x0001},
+{0x01B164, 0x0004},
+{0x01B168, 0x0001},
+{0x01B170, 0x0004},
+{0x01B2FC, 0x0001},
+{0x01BC00, 0x0004},
+{0x01BC6B, 0x0001},
+{0x01BC70, 0x0004},
+{0x01BC7D, 0x0001},
+{0x01BC80, 0x0004},
+{0x01BC89, 0x0001},
+{0x01BC90, 0x0004},
+{0x01BC9A, 0x0001},
+{0x01BC9C, 0x0040},
+{0x01BC9D, 0x0010},
+{0x01BC9F, 0x0020},
+{0x01BCA0, 0x0080},
+{0x01BCA4, 0x0001},
+{0x01CF00, 0x0010},
+{0x01CF2E, 0x0001},
+{0x01CF30, 0x0010},
+{0x01CF47, 0x0001},
+{0x01CF50, 0x0040},
+{0x01CFC4, 0x0001},
+{0x01D000, 0x0040},
+{0x01D0F6, 0x0001},
+{0x01D100, 0x0040},
+{0x01D127, 0x0001},
+{0x01D129, 0x0040},
+{0x01D165, 0x0010},
+{0x01D16A, 0x0040},
+{0x01D16D, 0x0010},
+{0x01D173, 0x0080},
+{0x01D17B, 0x0010},
+{0x01D183, 0x0040},
+{0x01D185, 0x0010},
+{0x01D18C, 0x0040},
+{0x01D1AA, 0x0010},
+{0x01D1AE, 0x0040},
+{0x01D1EB, 0x0001},
+{0x01D200, 0x0040},
+{0x01D242, 0x0010},
+{0x01D245, 0x0040},
+{0x01D246, 0x0001},
+{0x01D2C0, 0x0002},
+{0x01D2D4, 0x0001},
+{0x01D2E0, 0x0002},
+{0x01D2F4, 0x0001},
+{0x01D300, 0x0040},
+{0x01D357, 0x0001},
+{0x01D360, 0x0002},
+{0x01D379, 0x0001},
+{0x01D400, 0x0004},
+{0x01D455, 0x0001},
+{0x01D456, 0x0004},
+{0x01D49D, 0x0001},
+{0x01D49E, 0x0004},
+{0x01D4A0, 0x0001},
+{0x01D4A2, 0x0004},
+{0x01D4A3, 0x0001},
+{0x01D4A5, 0x0004},
+{0x01D4A7, 0x0001},
+{0x01D4A9, 0x0004},
+{0x01D4AD, 0x0001},
+{0x01D4AE, 0x0004},
+{0x01D4BA, 0x0001},
+{0x01D4BB, 0x0004},
+{0x01D4BC, 0x0001},
+{0x01D4BD, 0x0004},
+{0x01D4C4, 0x0001},
+{0x01D4C5, 0x0004},
+{0x01D506, 0x0001},
+{0x01D507, 0x0004},
+{0x01D50B, 0x0001},
+{0x01D50D, 0x0004},
+{0x01D515, 0x0001},
+{0x01D516, 0x0004},
+{0x01D51D, 0x0001},
+{0x01D51E, 0x0004},
+{0x01D53A, 0x0001},
+{0x01D53B, 0x0004},
+{0x01D53F, 0x0001},
+{0x01D540, 0x0004},
+{0x01D545, 0x0001},
+{0x01D546, 0x0004},
+{0x01D547, 0x0001},
+{0x01D54A, 0x0004},
+{0x01D551, 0x0001},
+{0x01D552, 0x0004},
+{0x01D6A6, 0x0001},
+{0x01D6A8, 0x0004},
+{0x01D6C1, 0x0040},
+{0x01D6C2, 0x0004},
+{0x01D6DB, 0x0040},
+{0x01D6DC, 0x0004},
+{0x01D6FB, 0x0040},
+{0x01D6FC, 0x0004},
+{0x01D715, 0x0040},
+{0x01D716, 0x0004},
+{0x01D735, 0x0040},
+{0x01D736, 0x0004},
+{0x01D74F, 0x0040},
+{0x01D750, 0x0004},
+{0x01D76F, 0x0040},
+{0x01D770, 0x0004},
+{0x01D789, 0x0040},
+{0x01D78A, 0x0004},
+{0x01D7A9, 0x0040},
+{0x01D7AA, 0x0004},
+{0x01D7C3, 0x0040},
+{0x01D7C4, 0x0004},
+{0x01D7CC, 0x0001},
+{0x01D7CE, 0x0002},
+{0x01D800, 0x0040},
+{0x01DA00, 0x0010},
+{0x01DA37, 0x0040},
+{0x01DA3B, 0x0010},
+{0x01DA6D, 0x0040},
+{0x01DA75, 0x0010},
+{0x01DA76, 0x0040},
+{0x01DA84, 0x0010},
+{0x01DA85, 0x0040},
+{0x01DA87, 0x0020},
+{0x01DA8C, 0x0001},
+{0x01DA9B, 0x0010},
+{0x01DAA0, 0x0001},
+{0x01DAA1, 0x0010},
+{0x01DAB0, 0x0001},
+{0x01DF00, 0x0004},
+{0x01DF1F, 0x0001},
+{0x01DF25, 0x0004},
+{0x01DF2B, 0x0001},
+{0x01E000, 0x0010},
+{0x01E007, 0x0001},
+{0x01E008, 0x0010},
+{0x01E019, 0x0001},
+{0x01E01B, 0x0010},
+{0x01E022, 0x0001},
+{0x01E023, 0x0010},
+{0x01E025, 0x0001},
+{0x01E026, 0x0010},
+{0x01E02B, 0x0001},
+{0x01E030, 0x0004},
+{0x01E06E, 0x0001},
+{0x01E08F, 0x0010},
+{0x01E090, 0x0001},
+{0x01E100, 0x0004},
+{0x01E12D, 0x0001},
+{0x01E130, 0x0010},
+{0x01E137, 0x0004},
+{0x01E13E, 0x0001},
+{0x01E140, 0x0002},
+{0x01E14A, 0x0001},
+{0x01E14E, 0x0004},
+{0x01E14F, 0x0040},
+{0x01E150, 0x0001},
+{0x01E290, 0x0004},
+{0x01E2AE, 0x0010},
+{0x01E2AF, 0x0001},
+{0x01E2C0, 0x0004},
+{0x01E2EC, 0x0010},
+{0x01E2F0, 0x0002},
+{0x01E2FA, 0x0001},
+{0x01E2FF, 0x0040},
+{0x01E300, 0x0001},
+{0x01E4D0, 0x0004},
+{0x01E4EC, 0x0010},
+{0x01E4F0, 0x0002},
+{0x01E4FA, 0x0001},
+{0x01E7E0, 0x0004},
+{0x01E7E7, 0x0001},
+{0x01E7E8, 0x0004},
+{0x01E7EC, 0x0001},
+{0x01E7ED, 0x0004},
+{0x01E7EF, 0x0001},
+{0x01E7F0, 0x0004},
+{0x01E7FF, 0x0001},
+{0x01E800, 0x0004},
+{0x01E8C5, 0x0001},
+{0x01E8C7, 0x0002},
+{0x01E8D0, 0x0010},
+{0x01E8D7, 0x0001},
+{0x01E900, 0x0004},
+{0x01E944, 0x0010},
+{0x01E94B, 0x0004},
+{0x01E94C, 0x0001},
+{0x01E950, 0x0002},
+{0x01E95A, 0x0001},
+{0x01E95E, 0x0020},
+{0x01E960, 0x0001},
+{0x01EC71, 0x0002},
+{0x01ECAC, 0x0040},
+{0x01ECAD, 0x0002},
+{0x01ECB0, 0x0040},
+{0x01ECB1, 0x0002},
+{0x01ECB5, 0x0001},
+{0x01ED01, 0x0002},
+{0x01ED2E, 0x0040},
+{0x01ED2F, 0x0002},
+{0x01ED3E, 0x0001},
+{0x01EE00, 0x0004},
+{0x01EE04, 0x0001},
+{0x01EE05, 0x0004},
+{0x01EE20, 0x0001},
+{0x01EE21, 0x0004},
+{0x01EE23, 0x0001},
+{0x01EE24, 0x0004},
+{0x01EE25, 0x0001},
+{0x01EE27, 0x0004},
+{0x01EE28, 0x0001},
+{0x01EE29, 0x0004},
+{0x01EE33, 0x0001},
+{0x01EE34, 0x0004},
+{0x01EE38, 0x0001},
+{0x01EE39, 0x0004},
+{0x01EE3A, 0x0001},
+{0x01EE3B, 0x0004},
+{0x01EE3C, 0x0001},
+{0x01EE42, 0x0004},
+{0x01EE43, 0x0001},
+{0x01EE47, 0x0004},
+{0x01EE48, 0x0001},
+{0x01EE49, 0x0004},
+{0x01EE4A, 0x0001},
+{0x01EE4B, 0x0004},
+{0x01EE4C, 0x0001},
+{0x01EE4D, 0x0004},
+{0x01EE50, 0x0001},
+{0x01EE51, 0x0004},
+{0x01EE53, 0x0001},
+{0x01EE54, 0x0004},
+{0x01EE55, 0x0001},
+{0x01EE57, 0x0004},
+{0x01EE58, 0x0001},
+{0x01EE59, 0x0004},
+{0x01EE5A, 0x0001},
+{0x01EE5B, 0x0004},
+{0x01EE5C, 0x0001},
+{0x01EE5D, 0x0004},
+{0x01EE5E, 0x0001},
+{0x01EE5F, 0x0004},
+{0x01EE60, 0x0001},
+{0x01EE61, 0x0004},
+{0x01EE63, 0x0001},
+{0x01EE64, 0x0004},
+{0x01EE65, 0x0001},
+{0x01EE67, 0x0004},
+{0x01EE6B, 0x0001},
+{0x01EE6C, 0x0004},
+{0x01EE73, 0x0001},
+{0x01EE74, 0x0004},
+{0x01EE78, 0x0001},
+{0x01EE79, 0x0004},
+{0x01EE7D, 0x0001},
+{0x01EE7E, 0x0004},
+{0x01EE7F, 0x0001},
+{0x01EE80, 0x0004},
+{0x01EE8A, 0x0001},
+{0x01EE8B, 0x0004},
+{0x01EE9C, 0x0001},
+{0x01EEA1, 0x0004},
+{0x01EEA4, 0x0001},
+{0x01EEA5, 0x0004},
+{0x01EEAA, 0x0001},
+{0x01EEAB, 0x0004},
+{0x01EEBC, 0x0001},
+{0x01EEF0, 0x0040},
+{0x01EEF2, 0x0001},
+{0x01F000, 0x0040},
+{0x01F02C, 0x0001},
+{0x01F030, 0x0040},
+{0x01F094, 0x0001},
+{0x01F0A0, 0x0040},
+{0x01F0AF, 0x0001},
+{0x01F0B1, 0x0040},
+{0x01F0C0, 0x0001},
+{0x01F0C1, 0x0040},
+{0x01F0D0, 0x0001},
+{0x01F0D1, 0x0040},
+{0x01F0F6, 0x0001},
+{0x01F100, 0x0002},
+{0x01F10D, 0x0040},
+{0x01F1AE, 0x0001},
+{0x01F1E6, 0x0040},
+{0x01F203, 0x0001},
+{0x01F210, 0x0040},
+{0x01F23C, 0x0001},
+{0x01F240, 0x0040},
+{0x01F249, 0x0001},
+{0x01F250, 0x0040},
+{0x01F252, 0x0001},
+{0x01F260, 0x0040},
+{0x01F266, 0x0001},
+{0x01F300, 0x0040},
+{0x01F6D8, 0x0001},
+{0x01F6DC, 0x0040},
+{0x01F6ED, 0x0001},
+{0x01F6F0, 0x0040},
+{0x01F6FD, 0x0001},
+{0x01F700, 0x0040},
+{0x01F777, 0x0001},
+{0x01F77B, 0x0040},
+{0x01F7DA, 0x0001},
+{0x01F7E0, 0x0040},
+{0x01F7EC, 0x0001},
+{0x01F7F0, 0x0040},
+{0x01F7F1, 0x0001},
+{0x01F800, 0x0040},
+{0x01F80C, 0x0001},
+{0x01F810, 0x0040},
+{0x01F848, 0x0001},
+{0x01F850, 0x0040},
+{0x01F85A, 0x0001},
+{0x01F860, 0x0040},
+{0x01F888, 0x0001},
+{0x01F890, 0x0040},
+{0x01F8AE, 0x0001},
+{0x01F8B0, 0x0040},
+{0x01F8B2, 0x0001},
+{0x01F900, 0x0040},
+{0x01FA54, 0x0001},
+{0x01FA60, 0x0040},
+{0x01FA6E, 0x0001},
+{0x01FA70, 0x0040},
+{0x01FA7D, 0x0001},
+{0x01FA80, 0x0040},
+{0x01FA89, 0x0001},
+{0x01FA90, 0x0040},
+{0x01FABE, 0x0001},
+{0x01FABF, 0x0040},
+{0x01FAC6, 0x0001},
+{0x01FACE, 0x0040},
+{0x01FADC, 0x0001},
+{0x01FAE0, 0x0040},
+{0x01FAE9, 0x0001},
+{0x01FAF0, 0x0040},
+{0x01FAF9, 0x0001},
+{0x01FB00, 0x0040},
+{0x01FB93, 0x0001},
+{0x01FB94, 0x0040},
+{0x01FBCB, 0x0001},
+{0x01FBF0, 0x0002},
+{0x01FBFA, 0x0001},
+{0x020000, 0x0004},
+{0x02A6E0, 0x0001},
+{0x02A700, 0x0004},
+{0x02B73A, 0x0001},
+{0x02B740, 0x0004},
+{0x02B81E, 0x0001},
+{0x02B820, 0x0004},
+{0x02CEA2, 0x0001},
+{0x02CEB0, 0x0004},
+{0x02EBE1, 0x0001},
+{0x02EBF0, 0x0004},
+{0x02EE5E, 0x0001},
+{0x02F800, 0x0004},
+{0x02FA1E, 0x0001},
+{0x030000, 0x0004},
+{0x03134B, 0x0001},
+{0x031350, 0x0004},
+{0x0323B0, 0x0001},
+{0x0E0001, 0x0080},
+{0x0E0002, 0x0001},
+{0x0E0020, 0x0080},
+{0x0E0080, 0x0001},
+{0x0E0100, 0x0010},
+{0x0E01F0, 0x0001},
+{0x0F0000, 0x0080},
+{0x0FFFFE, 0x0001},
+{0x100000, 0x0080},
+{0x10FFFE, 0x0001},
+{0x110000, 0x0000}, // sentinel: one past U+10FFFF, the last valid Unicode codepoint
+};
+
+const std::unordered_set<uint32_t> unicode_set_whitespace = {
+0x000009,
+0x00000A,
+0x00000B,
+0x00000C,
+0x00000D,
+0x000020,
+0x000085,
+0x0000A0,
+0x001680,
+0x002000,
+0x002001,
+0x002002,
+0x002003,
+0x002004,
+0x002005,
+0x002006,
+0x002007,
+0x002008,
+0x002009,
+0x00200A,
+0x002028,
+0x002029,
+0x00202F,
+0x00205F,
+0x003000,
+};
+
+// list is always in ascending order, to enable binary search (see the lookup sketch after this table)
+const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
+{0x000041, 0x000061},
+{0x000042, 0x000062},
+{0x000043, 0x000063},
+{0x000044, 0x000064},
+{0x000045, 0x000065},
+{0x000046, 0x000066},
+{0x000047, 0x000067},
+{0x000048, 0x000068},
+{0x000049, 0x000069},
+{0x00004A, 0x00006A},
+{0x00004B, 0x00006B},
+{0x00004C, 0x00006C},
+{0x00004D, 0x00006D},
+{0x00004E, 0x00006E},
+{0x00004F, 0x00006F},
+{0x000050, 0x000070},
+{0x000051, 0x000071},
+{0x000052, 0x000072},
+{0x000053, 0x000073},
+{0x000054, 0x000074},
+{0x000055, 0x000075},
+{0x000056, 0x000076},
+{0x000057, 0x000077},
+{0x000058, 0x000078},
+{0x000059, 0x000079},
+{0x00005A, 0x00007A},
+{0x0000C0, 0x0000E0},
+{0x0000C1, 0x0000E1},
+{0x0000C2, 0x0000E2},
+{0x0000C3, 0x0000E3},
+{0x0000C4, 0x0000E4},
+{0x0000C5, 0x0000E5},
+{0x0000C6, 0x0000E6},
+{0x0000C7, 0x0000E7},
+{0x0000C8, 0x0000E8},
+{0x0000C9, 0x0000E9},
+{0x0000CA, 0x0000EA},
+{0x0000CB, 0x0000EB},
+{0x0000CC, 0x0000EC},
+{0x0000CD, 0x0000ED},
+{0x0000CE, 0x0000EE},
+{0x0000CF, 0x0000EF},
+{0x0000D0, 0x0000F0},
+{0x0000D1, 0x0000F1},
+{0x0000D2, 0x0000F2},
+{0x0000D3, 0x0000F3},
+{0x0000D4, 0x0000F4},
+{0x0000D5, 0x0000F5},
+{0x0000D6, 0x0000F6},
+{0x0000D8, 0x0000F8},
+{0x0000D9, 0x0000F9},
+{0x0000DA, 0x0000FA},
+{0x0000DB, 0x0000FB},
+{0x0000DC, 0x0000FC},
+{0x0000DD, 0x0000FD},
+{0x0000DE, 0x0000FE},
+{0x000100, 0x000101},
+{0x000102, 0x000103},
+{0x000104, 0x000105},
+{0x000106, 0x000107},
+{0x000108, 0x000109},
+{0x00010A, 0x00010B},
+{0x00010C, 0x00010D},
+{0x00010E, 0x00010F},
+{0x000110, 0x000111},
+{0x000112, 0x000113},
+{0x000114, 0x000115},
+{0x000116, 0x000117},
+{0x000118, 0x000119},
+{0x00011A, 0x00011B},
+{0x00011C, 0x00011D},
+{0x00011E, 0x00011F},
+{0x000120, 0x000121},
+{0x000122, 0x000123},
+{0x000124, 0x000125},
+{0x000126, 0x000127},
+{0x000128, 0x000129},
+{0x00012A, 0x00012B},
+{0x00012C, 0x00012D},
+{0x00012E, 0x00012F},
+{0x000130, 0x000069},
+{0x000132, 0x000133},
+{0x000134, 0x000135},
+{0x000136, 0x000137},
+{0x000139, 0x00013A},
+{0x00013B, 0x00013C},
+{0x00013D, 0x00013E},
+{0x00013F, 0x000140},
+{0x000141, 0x000142},
+{0x000143, 0x000144},
+{0x000145, 0x000146},
+{0x000147, 0x000148},
+{0x00014A, 0x00014B},
+{0x00014C, 0x00014D},
+{0x00014E, 0x00014F},
+{0x000150, 0x000151},
+{0x000152, 0x000153},
+{0x000154, 0x000155},
+{0x000156, 0x000157},
+{0x000158, 0x000159},
+{0x00015A, 0x00015B},
+{0x00015C, 0x00015D},
+{0x00015E, 0x00015F},
+{0x000160, 0x000161},
+{0x000162, 0x000163},
+{0x000164, 0x000165},
+{0x000166, 0x000167},
+{0x000168, 0x000169},
+{0x00016A, 0x00016B},
+{0x00016C, 0x00016D},
+{0x00016E, 0x00016F},
+{0x000170, 0x000171},
+{0x000172, 0x000173},
+{0x000174, 0x000175},
+{0x000176, 0x000177},
+{0x000178, 0x0000FF},
+{0x000179, 0x00017A},
+{0x00017B, 0x00017C},
+{0x00017D, 0x00017E},
+{0x000181, 0x000253},
+{0x000182, 0x000183},
+{0x000184, 0x000185},
+{0x000186, 0x000254},
+{0x000187, 0x000188},
+{0x000189, 0x000256},
+{0x00018A, 0x000257},
+{0x00018B, 0x00018C},
+{0x00018E, 0x0001DD},
+{0x00018F, 0x000259},
+{0x000190, 0x00025B},
+{0x000191, 0x000192},
+{0x000193, 0x000260},
+{0x000194, 0x000263},
+{0x000196, 0x000269},
+{0x000197, 0x000268},
+{0x000198, 0x000199},
+{0x00019C, 0x00026F},
+{0x00019D, 0x000272},
+{0x00019F, 0x000275},
+{0x0001A0, 0x0001A1},
+{0x0001A2, 0x0001A3},
+{0x0001A4, 0x0001A5},
+{0x0001A6, 0x000280},
+{0x0001A7, 0x0001A8},
+{0x0001A9, 0x000283},
+{0x0001AC, 0x0001AD},
+{0x0001AE, 0x000288},
+{0x0001AF, 0x0001B0},
+{0x0001B1, 0x00028A},
+{0x0001B2, 0x00028B},
+{0x0001B3, 0x0001B4},
+{0x0001B5, 0x0001B6},
+{0x0001B7, 0x000292},
+{0x0001B8, 0x0001B9},
+{0x0001BC, 0x0001BD},
+{0x0001C4, 0x0001C6},
+{0x0001C5, 0x0001C6},
+{0x0001C7, 0x0001C9},
+{0x0001C8, 0x0001C9},
+{0x0001CA, 0x0001CC},
+{0x0001CB, 0x0001CC},
+{0x0001CD, 0x0001CE},
+{0x0001CF, 0x0001D0},
+{0x0001D1, 0x0001D2},
+{0x0001D3, 0x0001D4},
+{0x0001D5, 0x0001D6},
+{0x0001D7, 0x0001D8},
+{0x0001D9, 0x0001DA},
+{0x0001DB, 0x0001DC},
+{0x0001DE, 0x0001DF},
+{0x0001E0, 0x0001E1},
+{0x0001E2, 0x0001E3},
+{0x0001E4, 0x0001E5},
+{0x0001E6, 0x0001E7},
+{0x0001E8, 0x0001E9},
+{0x0001EA, 0x0001EB},
+{0x0001EC, 0x0001ED},
+{0x0001EE, 0x0001EF},
+{0x0001F1, 0x0001F3},
+{0x0001F2, 0x0001F3},
+{0x0001F4, 0x0001F5},
+{0x0001F6, 0x000195},
+{0x0001F7, 0x0001BF},
+{0x0001F8, 0x0001F9},
+{0x0001FA, 0x0001FB},
+{0x0001FC, 0x0001FD},
+{0x0001FE, 0x0001FF},
+{0x000200, 0x000201},
+{0x000202, 0x000203},
+{0x000204, 0x000205},
+{0x000206, 0x000207},
+{0x000208, 0x000209},
+{0x00020A, 0x00020B},
+{0x00020C, 0x00020D},
+{0x00020E, 0x00020F},
+{0x000210, 0x000211},
+{0x000212, 0x000213},
+{0x000214, 0x000215},
+{0x000216, 0x000217},
+{0x000218, 0x000219},
+{0x00021A, 0x00021B},
+{0x00021C, 0x00021D},
+{0x00021E, 0x00021F},
+{0x000220, 0x00019E},
+{0x000222, 0x000223},
+{0x000224, 0x000225},
+{0x000226, 0x000227},
+{0x000228, 0x000229},
+{0x00022A, 0x00022B},
+{0x00022C, 0x00022D},
+{0x00022E, 0x00022F},
+{0x000230, 0x000231},
+{0x000232, 0x000233},
+{0x00023A, 0x002C65},
+{0x00023B, 0x00023C},
+{0x00023D, 0x00019A},
+{0x00023E, 0x002C66},
+{0x000241, 0x000242},
+{0x000243, 0x000180},
+{0x000244, 0x000289},
+{0x000245, 0x00028C},
+{0x000246, 0x000247},
+{0x000248, 0x000249},
+{0x00024A, 0x00024B},
+{0x00024C, 0x00024D},
+{0x00024E, 0x00024F},
+{0x000370, 0x000371},
+{0x000372, 0x000373},
+{0x000376, 0x000377},
+{0x00037F, 0x0003F3},
+{0x000386, 0x0003AC},
+{0x000388, 0x0003AD},
+{0x000389, 0x0003AE},
+{0x00038A, 0x0003AF},
+{0x00038C, 0x0003CC},
+{0x00038E, 0x0003CD},
+{0x00038F, 0x0003CE},
+{0x000391, 0x0003B1},
+{0x000392, 0x0003B2},
+{0x000393, 0x0003B3},
+{0x000394, 0x0003B4},
+{0x000395, 0x0003B5},
+{0x000396, 0x0003B6},
+{0x000397, 0x0003B7},
+{0x000398, 0x0003B8},
+{0x000399, 0x0003B9},
+{0x00039A, 0x0003BA},
+{0x00039B, 0x0003BB},
+{0x00039C, 0x0003BC},
+{0x00039D, 0x0003BD},
+{0x00039E, 0x0003BE},
+{0x00039F, 0x0003BF},
+{0x0003A0, 0x0003C0},
+{0x0003A1, 0x0003C1},
+{0x0003A3, 0x0003C3},
+{0x0003A4, 0x0003C4},
+{0x0003A5, 0x0003C5},
+{0x0003A6, 0x0003C6},
+{0x0003A7, 0x0003C7},
+{0x0003A8, 0x0003C8},
+{0x0003A9, 0x0003C9},
+{0x0003AA, 0x0003CA},
+{0x0003AB, 0x0003CB},
+{0x0003CF, 0x0003D7},
+{0x0003D8, 0x0003D9},
+{0x0003DA, 0x0003DB},
+{0x0003DC, 0x0003DD},
+{0x0003DE, 0x0003DF},
+{0x0003E0, 0x0003E1},
+{0x0003E2, 0x0003E3},
+{0x0003E4, 0x0003E5},
+{0x0003E6, 0x0003E7},
+{0x0003E8, 0x0003E9},
+{0x0003EA, 0x0003EB},
+{0x0003EC, 0x0003ED},
+{0x0003EE, 0x0003EF},
+{0x0003F4, 0x0003B8},
+{0x0003F7, 0x0003F8},
+{0x0003F9, 0x0003F2},
+{0x0003FA, 0x0003FB},
+{0x0003FD, 0x00037B},
+{0x0003FE, 0x00037C},
+{0x0003FF, 0x00037D},
+{0x000400, 0x000450},
+{0x000401, 0x000451},
+{0x000402, 0x000452},
+{0x000403, 0x000453},
+{0x000404, 0x000454},
+{0x000405, 0x000455},
+{0x000406, 0x000456},
+{0x000407, 0x000457},
+{0x000408, 0x000458},
+{0x000409, 0x000459},
+{0x00040A, 0x00045A},
+{0x00040B, 0x00045B},
+{0x00040C, 0x00045C},
+{0x00040D, 0x00045D},
+{0x00040E, 0x00045E},
+{0x00040F, 0x00045F},
+{0x000410, 0x000430},
+{0x000411, 0x000431},
+{0x000412, 0x000432},
+{0x000413, 0x000433},
+{0x000414, 0x000434},
+{0x000415, 0x000435},
+{0x000416, 0x000436},
+{0x000417, 0x000437},
+{0x000418, 0x000438},
+{0x000419, 0x000439},
+{0x00041A, 0x00043A},
+{0x00041B, 0x00043B},
+{0x00041C, 0x00043C},
+{0x00041D, 0x00043D},
+{0x00041E, 0x00043E},
+{0x00041F, 0x00043F},
+{0x000420, 0x000440},
+{0x000421, 0x000441},
+{0x000422, 0x000442},
+{0x000423, 0x000443},
+{0x000424, 0x000444},
+{0x000425, 0x000445},
+{0x000426, 0x000446},
+{0x000427, 0x000447},
+{0x000428, 0x000448},
+{0x000429, 0x000449},
+{0x00042A, 0x00044A},
+{0x00042B, 0x00044B},
+{0x00042C, 0x00044C},
+{0x00042D, 0x00044D},
+{0x00042E, 0x00044E},
+{0x00042F, 0x00044F},
+{0x000460, 0x000461},
+{0x000462, 0x000463},
+{0x000464, 0x000465},
+{0x000466, 0x000467},
+{0x000468, 0x000469},
+{0x00046A, 0x00046B},
+{0x00046C, 0x00046D},
+{0x00046E, 0x00046F},
+{0x000470, 0x000471},
+{0x000472, 0x000473},
+{0x000474, 0x000475},
+{0x000476, 0x000477},
+{0x000478, 0x000479},
+{0x00047A, 0x00047B},
+{0x00047C, 0x00047D},
+{0x00047E, 0x00047F},
+{0x000480, 0x000481},
+{0x00048A, 0x00048B},
+{0x00048C, 0x00048D},
+{0x00048E, 0x00048F},
+{0x000490, 0x000491},
+{0x000492, 0x000493},
+{0x000494, 0x000495},
+{0x000496, 0x000497},
+{0x000498, 0x000499},
+{0x00049A, 0x00049B},
+{0x00049C, 0x00049D},
+{0x00049E, 0x00049F},
+{0x0004A0, 0x0004A1},
+{0x0004A2, 0x0004A3},
+{0x0004A4, 0x0004A5},
+{0x0004A6, 0x0004A7},
+{0x0004A8, 0x0004A9},
+{0x0004AA, 0x0004AB},
+{0x0004AC, 0x0004AD},
+{0x0004AE, 0x0004AF},
+{0x0004B0, 0x0004B1},
+{0x0004B2, 0x0004B3},
+{0x0004B4, 0x0004B5},
+{0x0004B6, 0x0004B7},
+{0x0004B8, 0x0004B9},
+{0x0004BA, 0x0004BB},
+{0x0004BC, 0x0004BD},
+{0x0004BE, 0x0004BF},
+{0x0004C0, 0x0004CF},
+{0x0004C1, 0x0004C2},
+{0x0004C3, 0x0004C4},
+{0x0004C5, 0x0004C6},
+{0x0004C7, 0x0004C8},
+{0x0004C9, 0x0004CA},
+{0x0004CB, 0x0004CC},
+{0x0004CD, 0x0004CE},
+{0x0004D0, 0x0004D1},
+{0x0004D2, 0x0004D3},
+{0x0004D4, 0x0004D5},
+{0x0004D6, 0x0004D7},
+{0x0004D8, 0x0004D9},
+{0x0004DA, 0x0004DB},
+{0x0004DC, 0x0004DD},
+{0x0004DE, 0x0004DF},
+{0x0004E0, 0x0004E1},
+{0x0004E2, 0x0004E3},
+{0x0004E4, 0x0004E5},
+{0x0004E6, 0x0004E7},
+{0x0004E8, 0x0004E9},
+{0x0004EA, 0x0004EB},
+{0x0004EC, 0x0004ED},
+{0x0004EE, 0x0004EF},
+{0x0004F0, 0x0004F1},
+{0x0004F2, 0x0004F3},
+{0x0004F4, 0x0004F5},
+{0x0004F6, 0x0004F7},
+{0x0004F8, 0x0004F9},
+{0x0004FA, 0x0004FB},
+{0x0004FC, 0x0004FD},
+{0x0004FE, 0x0004FF},
+{0x000500, 0x000501},
+{0x000502, 0x000503},
+{0x000504, 0x000505},
+{0x000506, 0x000507},
+{0x000508, 0x000509},
+{0x00050A, 0x00050B},
+{0x00050C, 0x00050D},
+{0x00050E, 0x00050F},
+{0x000510, 0x000511},
+{0x000512, 0x000513},
+{0x000514, 0x000515},
+{0x000516, 0x000517},
+{0x000518, 0x000519},
+{0x00051A, 0x00051B},
+{0x00051C, 0x00051D},
+{0x00051E, 0x00051F},
+{0x000520, 0x000521},
+{0x000522, 0x000523},
+{0x000524, 0x000525},
+{0x000526, 0x000527},
+{0x000528, 0x000529},
+{0x00052A, 0x00052B},
+{0x00052C, 0x00052D},
+{0x00052E, 0x00052F},
+{0x000531, 0x000561},
+{0x000532, 0x000562},
+{0x000533, 0x000563},
+{0x000534, 0x000564},
+{0x000535, 0x000565},
+{0x000536, 0x000566},
+{0x000537, 0x000567},
+{0x000538, 0x000568},
+{0x000539, 0x000569},
+{0x00053A, 0x00056A},
+{0x00053B, 0x00056B},
+{0x00053C, 0x00056C},
+{0x00053D, 0x00056D},
+{0x00053E, 0x00056E},
+{0x00053F, 0x00056F},
+{0x000540, 0x000570},
+{0x000541, 0x000571},
+{0x000542, 0x000572},
+{0x000543, 0x000573},
+{0x000544, 0x000574},
+{0x000545, 0x000575},
+{0x000546, 0x000576},
+{0x000547, 0x000577},
+{0x000548, 0x000578},
+{0x000549, 0x000579},
+{0x00054A, 0x00057A},
+{0x00054B, 0x00057B},
+{0x00054C, 0x00057C},
+{0x00054D, 0x00057D},
+{0x00054E, 0x00057E},
+{0x00054F, 0x00057F},
+{0x000550, 0x000580},
+{0x000551, 0x000581},
+{0x000552, 0x000582},
+{0x000553, 0x000583},
+{0x000554, 0x000584},
+{0x000555, 0x000585},
+{0x000556, 0x000586},
+{0x0010A0, 0x002D00},
+{0x0010A1, 0x002D01},
+{0x0010A2, 0x002D02},
+{0x0010A3, 0x002D03},
+{0x0010A4, 0x002D04},
+{0x0010A5, 0x002D05},
+{0x0010A6, 0x002D06},
+{0x0010A7, 0x002D07},
+{0x0010A8, 0x002D08},
+{0x0010A9, 0x002D09},
+{0x0010AA, 0x002D0A},
+{0x0010AB, 0x002D0B},
+{0x0010AC, 0x002D0C},
+{0x0010AD, 0x002D0D},
+{0x0010AE, 0x002D0E},
+{0x0010AF, 0x002D0F},
+{0x0010B0, 0x002D10},
+{0x0010B1, 0x002D11},
+{0x0010B2, 0x002D12},
+{0x0010B3, 0x002D13},
+{0x0010B4, 0x002D14},
+{0x0010B5, 0x002D15},
+{0x0010B6, 0x002D16},
+{0x0010B7, 0x002D17},
+{0x0010B8, 0x002D18},
+{0x0010B9, 0x002D19},
+{0x0010BA, 0x002D1A},
+{0x0010BB, 0x002D1B},
+{0x0010BC, 0x002D1C},
+{0x0010BD, 0x002D1D},
+{0x0010BE, 0x002D1E},
+{0x0010BF, 0x002D1F},
+{0x0010C0, 0x002D20},
+{0x0010C1, 0x002D21},
+{0x0010C2, 0x002D22},
+{0x0010C3, 0x002D23},
+{0x0010C4, 0x002D24},
+{0x0010C5, 0x002D25},
+{0x0010C7, 0x002D27},
+{0x0010CD, 0x002D2D},
+{0x0013A0, 0x00AB70},
+{0x0013A1, 0x00AB71},
+{0x0013A2, 0x00AB72},
+{0x0013A3, 0x00AB73},
+{0x0013A4, 0x00AB74},
+{0x0013A5, 0x00AB75},
+{0x0013A6, 0x00AB76},
+{0x0013A7, 0x00AB77},
+{0x0013A8, 0x00AB78},
+{0x0013A9, 0x00AB79},
+{0x0013AA, 0x00AB7A},
+{0x0013AB, 0x00AB7B},
+{0x0013AC, 0x00AB7C},
+{0x0013AD, 0x00AB7D},
+{0x0013AE, 0x00AB7E},
+{0x0013AF, 0x00AB7F},
+{0x0013B0, 0x00AB80},
+{0x0013B1, 0x00AB81},
+{0x0013B2, 0x00AB82},
+{0x0013B3, 0x00AB83},
+{0x0013B4, 0x00AB84},
+{0x0013B5, 0x00AB85},
+{0x0013B6, 0x00AB86},
+{0x0013B7, 0x00AB87},
+{0x0013B8, 0x00AB88},
+{0x0013B9, 0x00AB89},
+{0x0013BA, 0x00AB8A},
+{0x0013BB, 0x00AB8B},
+{0x0013BC, 0x00AB8C},
+{0x0013BD, 0x00AB8D},
+{0x0013BE, 0x00AB8E},
+{0x0013BF, 0x00AB8F},
+{0x0013C0, 0x00AB90},
+{0x0013C1, 0x00AB91},
+{0x0013C2, 0x00AB92},
+{0x0013C3, 0x00AB93},
+{0x0013C4, 0x00AB94},
+{0x0013C5, 0x00AB95},
+{0x0013C6, 0x00AB96},
+{0x0013C7, 0x00AB97},
+{0x0013C8, 0x00AB98},
+{0x0013C9, 0x00AB99},
+{0x0013CA, 0x00AB9A},
+{0x0013CB, 0x00AB9B},
+{0x0013CC, 0x00AB9C},
+{0x0013CD, 0x00AB9D},
+{0x0013CE, 0x00AB9E},
+{0x0013CF, 0x00AB9F},
+{0x0013D0, 0x00ABA0},
+{0x0013D1, 0x00ABA1},
+{0x0013D2, 0x00ABA2},
+{0x0013D3, 0x00ABA3},
+{0x0013D4, 0x00ABA4},
+{0x0013D5, 0x00ABA5},
+{0x0013D6, 0x00ABA6},
+{0x0013D7, 0x00ABA7},
+{0x0013D8, 0x00ABA8},
+{0x0013D9, 0x00ABA9},
+{0x0013DA, 0x00ABAA},
+{0x0013DB, 0x00ABAB},
+{0x0013DC, 0x00ABAC},
+{0x0013DD, 0x00ABAD},
+{0x0013DE, 0x00ABAE},
+{0x0013DF, 0x00ABAF},
+{0x0013E0, 0x00ABB0},
+{0x0013E1, 0x00ABB1},
+{0x0013E2, 0x00ABB2},
+{0x0013E3, 0x00ABB3},
+{0x0013E4, 0x00ABB4},
+{0x0013E5, 0x00ABB5},
+{0x0013E6, 0x00ABB6},
+{0x0013E7, 0x00ABB7},
+{0x0013E8, 0x00ABB8},
+{0x0013E9, 0x00ABB9},
+{0x0013EA, 0x00ABBA},
+{0x0013EB, 0x00ABBB},
+{0x0013EC, 0x00ABBC},
+{0x0013ED, 0x00ABBD},
+{0x0013EE, 0x00ABBE},
+{0x0013EF, 0x00ABBF},
+{0x0013F0, 0x0013F8},
+{0x0013F1, 0x0013F9},
+{0x0013F2, 0x0013FA},
+{0x0013F3, 0x0013FB},
+{0x0013F4, 0x0013FC},
+{0x0013F5, 0x0013FD},
+{0x001C90, 0x0010D0},
+{0x001C91, 0x0010D1},
+{0x001C92, 0x0010D2},
+{0x001C93, 0x0010D3},
+{0x001C94, 0x0010D4},
+{0x001C95, 0x0010D5},
+{0x001C96, 0x0010D6},
+{0x001C97, 0x0010D7},
+{0x001C98, 0x0010D8},
+{0x001C99, 0x0010D9},
+{0x001C9A, 0x0010DA},
+{0x001C9B, 0x0010DB},
+{0x001C9C, 0x0010DC},
+{0x001C9D, 0x0010DD},
+{0x001C9E, 0x0010DE},
+{0x001C9F, 0x0010DF},
+{0x001CA0, 0x0010E0},
+{0x001CA1, 0x0010E1},
+{0x001CA2, 0x0010E2},
+{0x001CA3, 0x0010E3},
+{0x001CA4, 0x0010E4},
+{0x001CA5, 0x0010E5},
+{0x001CA6, 0x0010E6},
+{0x001CA7, 0x0010E7},
+{0x001CA8, 0x0010E8},
+{0x001CA9, 0x0010E9},
+{0x001CAA, 0x0010EA},
+{0x001CAB, 0x0010EB},
+{0x001CAC, 0x0010EC},
+{0x001CAD, 0x0010ED},
+{0x001CAE, 0x0010EE},
+{0x001CAF, 0x0010EF},
+{0x001CB0, 0x0010F0},
+{0x001CB1, 0x0010F1},
+{0x001CB2, 0x0010F2},
+{0x001CB3, 0x0010F3},
+{0x001CB4, 0x0010F4},
+{0x001CB5, 0x0010F5},
+{0x001CB6, 0x0010F6},
+{0x001CB7, 0x0010F7},
+{0x001CB8, 0x0010F8},
+{0x001CB9, 0x0010F9},
+{0x001CBA, 0x0010FA},
+{0x001CBD, 0x0010FD},
+{0x001CBE, 0x0010FE},
+{0x001CBF, 0x0010FF},
+{0x001E00, 0x001E01},
+{0x001E02, 0x001E03},
+{0x001E04, 0x001E05},
+{0x001E06, 0x001E07},
+{0x001E08, 0x001E09},
+{0x001E0A, 0x001E0B},
+{0x001E0C, 0x001E0D},
+{0x001E0E, 0x001E0F},
+{0x001E10, 0x001E11},
+{0x001E12, 0x001E13},
+{0x001E14, 0x001E15},
+{0x001E16, 0x001E17},
+{0x001E18, 0x001E19},
+{0x001E1A, 0x001E1B},
+{0x001E1C, 0x001E1D},
+{0x001E1E, 0x001E1F},
+{0x001E20, 0x001E21},
+{0x001E22, 0x001E23},
+{0x001E24, 0x001E25},
+{0x001E26, 0x001E27},
+{0x001E28, 0x001E29},
+{0x001E2A, 0x001E2B},
+{0x001E2C, 0x001E2D},
+{0x001E2E, 0x001E2F},
+{0x001E30, 0x001E31},
+{0x001E32, 0x001E33},
+{0x001E34, 0x001E35},
+{0x001E36, 0x001E37},
+{0x001E38, 0x001E39},
+{0x001E3A, 0x001E3B},
+{0x001E3C, 0x001E3D},
+{0x001E3E, 0x001E3F},
+{0x001E40, 0x001E41},
+{0x001E42, 0x001E43},
+{0x001E44, 0x001E45},
+{0x001E46, 0x001E47},
+{0x001E48, 0x001E49},
+{0x001E4A, 0x001E4B},
+{0x001E4C, 0x001E4D},
+{0x001E4E, 0x001E4F},
+{0x001E50, 0x001E51},
+{0x001E52, 0x001E53},
+{0x001E54, 0x001E55},
+{0x001E56, 0x001E57},
+{0x001E58, 0x001E59},
+{0x001E5A, 0x001E5B},
+{0x001E5C, 0x001E5D},
+{0x001E5E, 0x001E5F},
+{0x001E60, 0x001E61},
+{0x001E62, 0x001E63},
+{0x001E64, 0x001E65},
+{0x001E66, 0x001E67},
+{0x001E68, 0x001E69},
+{0x001E6A, 0x001E6B},
+{0x001E6C, 0x001E6D},
+{0x001E6E, 0x001E6F},
+{0x001E70, 0x001E71},
+{0x001E72, 0x001E73},
+{0x001E74, 0x001E75},
+{0x001E76, 0x001E77},
+{0x001E78, 0x001E79},
+{0x001E7A, 0x001E7B},
+{0x001E7C, 0x001E7D},
+{0x001E7E, 0x001E7F},
+{0x001E80, 0x001E81},
+{0x001E82, 0x001E83},
+{0x001E84, 0x001E85},
+{0x001E86, 0x001E87},
+{0x001E88, 0x001E89},
+{0x001E8A, 0x001E8B},
+{0x001E8C, 0x001E8D},
+{0x001E8E, 0x001E8F},
+{0x001E90, 0x001E91},
+{0x001E92, 0x001E93},
+{0x001E94, 0x001E95},
+{0x001E9E, 0x0000DF},
+{0x001EA0, 0x001EA1},
+{0x001EA2, 0x001EA3},
+{0x001EA4, 0x001EA5},
+{0x001EA6, 0x001EA7},
+{0x001EA8, 0x001EA9},
+{0x001EAA, 0x001EAB},
+{0x001EAC, 0x001EAD},
+{0x001EAE, 0x001EAF},
+{0x001EB0, 0x001EB1},
+{0x001EB2, 0x001EB3},
+{0x001EB4, 0x001EB5},
+{0x001EB6, 0x001EB7},
+{0x001EB8, 0x001EB9},
+{0x001EBA, 0x001EBB},
+{0x001EBC, 0x001EBD},
+{0x001EBE, 0x001EBF},
+{0x001EC0, 0x001EC1},
+{0x001EC2, 0x001EC3},
+{0x001EC4, 0x001EC5},
+{0x001EC6, 0x001EC7},
+{0x001EC8, 0x001EC9},
+{0x001ECA, 0x001ECB},
+{0x001ECC, 0x001ECD},
+{0x001ECE, 0x001ECF},
+{0x001ED0, 0x001ED1},
+{0x001ED2, 0x001ED3},
+{0x001ED4, 0x001ED5},
+{0x001ED6, 0x001ED7},
+{0x001ED8, 0x001ED9},
+{0x001EDA, 0x001EDB},
+{0x001EDC, 0x001EDD},
+{0x001EDE, 0x001EDF},
+{0x001EE0, 0x001EE1},
+{0x001EE2, 0x001EE3},
+{0x001EE4, 0x001EE5},
+{0x001EE6, 0x001EE7},
+{0x001EE8, 0x001EE9},
+{0x001EEA, 0x001EEB},
+{0x001EEC, 0x001EED},
+{0x001EEE, 0x001EEF},
+{0x001EF0, 0x001EF1},
+{0x001EF2, 0x001EF3},
+{0x001EF4, 0x001EF5},
+{0x001EF6, 0x001EF7},
+{0x001EF8, 0x001EF9},
+{0x001EFA, 0x001EFB},
+{0x001EFC, 0x001EFD},
+{0x001EFE, 0x001EFF},
+{0x001F08, 0x001F00},
+{0x001F09, 0x001F01},
+{0x001F0A, 0x001F02},
+{0x001F0B, 0x001F03},
+{0x001F0C, 0x001F04},
+{0x001F0D, 0x001F05},
+{0x001F0E, 0x001F06},
+{0x001F0F, 0x001F07},
+{0x001F18, 0x001F10},
+{0x001F19, 0x001F11},
+{0x001F1A, 0x001F12},
+{0x001F1B, 0x001F13},
+{0x001F1C, 0x001F14},
+{0x001F1D, 0x001F15},
+{0x001F28, 0x001F20},
+{0x001F29, 0x001F21},
+{0x001F2A, 0x001F22},
+{0x001F2B, 0x001F23},
+{0x001F2C, 0x001F24},
+{0x001F2D, 0x001F25},
+{0x001F2E, 0x001F26},
+{0x001F2F, 0x001F27},
+{0x001F38, 0x001F30},
+{0x001F39, 0x001F31},
+{0x001F3A, 0x001F32},
+{0x001F3B, 0x001F33},
+{0x001F3C, 0x001F34},
+{0x001F3D, 0x001F35},
+{0x001F3E, 0x001F36},
+{0x001F3F, 0x001F37},
+{0x001F48, 0x001F40},
+{0x001F49, 0x001F41},
+{0x001F4A, 0x001F42},
+{0x001F4B, 0x001F43},
+{0x001F4C, 0x001F44},
+{0x001F4D, 0x001F45},
+{0x001F59, 0x001F51},
+{0x001F5B, 0x001F53},
+{0x001F5D, 0x001F55},
+{0x001F5F, 0x001F57},
+{0x001F68, 0x001F60},
+{0x001F69, 0x001F61},
+{0x001F6A, 0x001F62},
+{0x001F6B, 0x001F63},
+{0x001F6C, 0x001F64},
+{0x001F6D, 0x001F65},
+{0x001F6E, 0x001F66},
+{0x001F6F, 0x001F67},
+{0x001F88, 0x001F80},
+{0x001F89, 0x001F81},
+{0x001F8A, 0x001F82},
+{0x001F8B, 0x001F83},
+{0x001F8C, 0x001F84},
+{0x001F8D, 0x001F85},
+{0x001F8E, 0x001F86},
+{0x001F8F, 0x001F87},
+{0x001F98, 0x001F90},
+{0x001F99, 0x001F91},
+{0x001F9A, 0x001F92},
+{0x001F9B, 0x001F93},
+{0x001F9C, 0x001F94},
+{0x001F9D, 0x001F95},
+{0x001F9E, 0x001F96},
+{0x001F9F, 0x001F97},
+{0x001FA8, 0x001FA0},
+{0x001FA9, 0x001FA1},
+{0x001FAA, 0x001FA2},
+{0x001FAB, 0x001FA3},
+{0x001FAC, 0x001FA4},
+{0x001FAD, 0x001FA5},
+{0x001FAE, 0x001FA6},
+{0x001FAF, 0x001FA7},
+{0x001FB8, 0x001FB0},
+{0x001FB9, 0x001FB1},
+{0x001FBA, 0x001F70},
+{0x001FBB, 0x001F71},
+{0x001FBC, 0x001FB3},
+{0x001FC8, 0x001F72},
+{0x001FC9, 0x001F73},
+{0x001FCA, 0x001F74},
+{0x001FCB, 0x001F75},
+{0x001FCC, 0x001FC3},
+{0x001FD8, 0x001FD0},
+{0x001FD9, 0x001FD1},
+{0x001FDA, 0x001F76},
+{0x001FDB, 0x001F77},
+{0x001FE8, 0x001FE0},
+{0x001FE9, 0x001FE1},
+{0x001FEA, 0x001F7A},
+{0x001FEB, 0x001F7B},
+{0x001FEC, 0x001FE5},
+{0x001FF8, 0x001F78},
+{0x001FF9, 0x001F79},
+{0x001FFA, 0x001F7C},
+{0x001FFB, 0x001F7D},
+{0x001FFC, 0x001FF3},
+{0x002126, 0x0003C9},
+{0x00212A, 0x00006B},
+{0x00212B, 0x0000E5},
+{0x002132, 0x00214E},
+{0x002160, 0x002170},
+{0x002161, 0x002171},
+{0x002162, 0x002172},
+{0x002163, 0x002173},
+{0x002164, 0x002174},
+{0x002165, 0x002175},
+{0x002166, 0x002176},
+{0x002167, 0x002177},
+{0x002168, 0x002178},
+{0x002169, 0x002179},
+{0x00216A, 0x00217A},
+{0x00216B, 0x00217B},
+{0x00216C, 0x00217C},
+{0x00216D, 0x00217D},
+{0x00216E, 0x00217E},
+{0x00216F, 0x00217F},
+{0x002183, 0x002184},
+{0x0024B6, 0x0024D0},
+{0x0024B7, 0x0024D1},
+{0x0024B8, 0x0024D2},
+{0x0024B9, 0x0024D3},
+{0x0024BA, 0x0024D4},
+{0x0024BB, 0x0024D5},
+{0x0024BC, 0x0024D6},
+{0x0024BD, 0x0024D7},
+{0x0024BE, 0x0024D8},
+{0x0024BF, 0x0024D9},
+{0x0024C0, 0x0024DA},
+{0x0024C1, 0x0024DB},
+{0x0024C2, 0x0024DC},
+{0x0024C3, 0x0024DD},
+{0x0024C4, 0x0024DE},
+{0x0024C5, 0x0024DF},
+{0x0024C6, 0x0024E0},
+{0x0024C7, 0x0024E1},
+{0x0024C8, 0x0024E2},
+{0x0024C9, 0x0024E3},
+{0x0024CA, 0x0024E4},
+{0x0024CB, 0x0024E5},
+{0x0024CC, 0x0024E6},
+{0x0024CD, 0x0024E7},
+{0x0024CE, 0x0024E8},
+{0x0024CF, 0x0024E9},
+{0x002C00, 0x002C30},
+{0x002C01, 0x002C31},
+{0x002C02, 0x002C32},
+{0x002C03, 0x002C33},
+{0x002C04, 0x002C34},
+{0x002C05, 0x002C35},
+{0x002C06, 0x002C36},
+{0x002C07, 0x002C37},
+{0x002C08, 0x002C38},
+{0x002C09, 0x002C39},
+{0x002C0A, 0x002C3A},
+{0x002C0B, 0x002C3B},
+{0x002C0C, 0x002C3C},
+{0x002C0D, 0x002C3D},
+{0x002C0E, 0x002C3E},
+{0x002C0F, 0x002C3F},
+{0x002C10, 0x002C40},
+{0x002C11, 0x002C41},
+{0x002C12, 0x002C42},
+{0x002C13, 0x002C43},
+{0x002C14, 0x002C44},
+{0x002C15, 0x002C45},
+{0x002C16, 0x002C46},
+{0x002C17, 0x002C47},
+{0x002C18, 0x002C48},
+{0x002C19, 0x002C49},
+{0x002C1A, 0x002C4A},
+{0x002C1B, 0x002C4B},
+{0x002C1C, 0x002C4C},
+{0x002C1D, 0x002C4D},
+{0x002C1E, 0x002C4E},
+{0x002C1F, 0x002C4F},
+{0x002C20, 0x002C50},
+{0x002C21, 0x002C51},
+{0x002C22, 0x002C52},
+{0x002C23, 0x002C53},
+{0x002C24, 0x002C54},
+{0x002C25, 0x002C55},
+{0x002C26, 0x002C56},
+{0x002C27, 0x002C57},
+{0x002C28, 0x002C58},
+{0x002C29, 0x002C59},
+{0x002C2A, 0x002C5A},
+{0x002C2B, 0x002C5B},
+{0x002C2C, 0x002C5C},
+{0x002C2D, 0x002C5D},
+{0x002C2E, 0x002C5E},
+{0x002C2F, 0x002C5F},
+{0x002C60, 0x002C61},
+{0x002C62, 0x00026B},
+{0x002C63, 0x001D7D},
+{0x002C64, 0x00027D},
+{0x002C67, 0x002C68},
+{0x002C69, 0x002C6A},
+{0x002C6B, 0x002C6C},
+{0x002C6D, 0x000251},
+{0x002C6E, 0x000271},
+{0x002C6F, 0x000250},
+{0x002C70, 0x000252},
+{0x002C72, 0x002C73},
+{0x002C75, 0x002C76},
+{0x002C7E, 0x00023F},
+{0x002C7F, 0x000240},
+{0x002C80, 0x002C81},
+{0x002C82, 0x002C83},
+{0x002C84, 0x002C85},
+{0x002C86, 0x002C87},
+{0x002C88, 0x002C89},
+{0x002C8A, 0x002C8B},
+{0x002C8C, 0x002C8D},
+{0x002C8E, 0x002C8F},
+{0x002C90, 0x002C91},
+{0x002C92, 0x002C93},
+{0x002C94, 0x002C95},
+{0x002C96, 0x002C97},
+{0x002C98, 0x002C99},
+{0x002C9A, 0x002C9B},
+{0x002C9C, 0x002C9D},
+{0x002C9E, 0x002C9F},
+{0x002CA0, 0x002CA1},
+{0x002CA2, 0x002CA3},
+{0x002CA4, 0x002CA5},
+{0x002CA6, 0x002CA7},
+{0x002CA8, 0x002CA9},
+{0x002CAA, 0x002CAB},
+{0x002CAC, 0x002CAD},
+{0x002CAE, 0x002CAF},
+{0x002CB0, 0x002CB1},
+{0x002CB2, 0x002CB3},
+{0x002CB4, 0x002CB5},
+{0x002CB6, 0x002CB7},
+{0x002CB8, 0x002CB9},
+{0x002CBA, 0x002CBB},
+{0x002CBC, 0x002CBD},
+{0x002CBE, 0x002CBF},
+{0x002CC0, 0x002CC1},
+{0x002CC2, 0x002CC3},
+{0x002CC4, 0x002CC5},
+{0x002CC6, 0x002CC7},
+{0x002CC8, 0x002CC9},
+{0x002CCA, 0x002CCB},
+{0x002CCC, 0x002CCD},
+{0x002CCE, 0x002CCF},
+{0x002CD0, 0x002CD1},
+{0x002CD2, 0x002CD3},
+{0x002CD4, 0x002CD5},
+{0x002CD6, 0x002CD7},
+{0x002CD8, 0x002CD9},
+{0x002CDA, 0x002CDB},
+{0x002CDC, 0x002CDD},
+{0x002CDE, 0x002CDF},
+{0x002CE0, 0x002CE1},
+{0x002CE2, 0x002CE3},
+{0x002CEB, 0x002CEC},
+{0x002CED, 0x002CEE},
+{0x002CF2, 0x002CF3},
+{0x00A640, 0x00A641},
+{0x00A642, 0x00A643},
+{0x00A644, 0x00A645},
+{0x00A646, 0x00A647},
+{0x00A648, 0x00A649},
+{0x00A64A, 0x00A64B},
+{0x00A64C, 0x00A64D},
+{0x00A64E, 0x00A64F},
+{0x00A650, 0x00A651},
+{0x00A652, 0x00A653},
+{0x00A654, 0x00A655},
+{0x00A656, 0x00A657},
+{0x00A658, 0x00A659},
+{0x00A65A, 0x00A65B},
+{0x00A65C, 0x00A65D},
+{0x00A65E, 0x00A65F},
+{0x00A660, 0x00A661},
+{0x00A662, 0x00A663},
+{0x00A664, 0x00A665},
+{0x00A666, 0x00A667},
+{0x00A668, 0x00A669},
+{0x00A66A, 0x00A66B},
+{0x00A66C, 0x00A66D},
+{0x00A680, 0x00A681},
+{0x00A682, 0x00A683},
+{0x00A684, 0x00A685},
+{0x00A686, 0x00A687},
+{0x00A688, 0x00A689},
+{0x00A68A, 0x00A68B},
+{0x00A68C, 0x00A68D},
+{0x00A68E, 0x00A68F},
+{0x00A690, 0x00A691},
+{0x00A692, 0x00A693},
+{0x00A694, 0x00A695},
+{0x00A696, 0x00A697},
+{0x00A698, 0x00A699},
+{0x00A69A, 0x00A69B},
+{0x00A722, 0x00A723},
+{0x00A724, 0x00A725},
+{0x00A726, 0x00A727},
+{0x00A728, 0x00A729},
+{0x00A72A, 0x00A72B},
+{0x00A72C, 0x00A72D},
+{0x00A72E, 0x00A72F},
+{0x00A732, 0x00A733},
+{0x00A734, 0x00A735},
+{0x00A736, 0x00A737},
+{0x00A738, 0x00A739},
+{0x00A73A, 0x00A73B},
+{0x00A73C, 0x00A73D},
+{0x00A73E, 0x00A73F},
+{0x00A740, 0x00A741},
+{0x00A742, 0x00A743},
+{0x00A744, 0x00A745},
+{0x00A746, 0x00A747},
+{0x00A748, 0x00A749},
+{0x00A74A, 0x00A74B},
+{0x00A74C, 0x00A74D},
+{0x00A74E, 0x00A74F},
+{0x00A750, 0x00A751},
+{0x00A752, 0x00A753},
+{0x00A754, 0x00A755},
+{0x00A756, 0x00A757},
+{0x00A758, 0x00A759},
+{0x00A75A, 0x00A75B},
+{0x00A75C, 0x00A75D},
+{0x00A75E, 0x00A75F},
+{0x00A760, 0x00A761},
+{0x00A762, 0x00A763},
+{0x00A764, 0x00A765},
+{0x00A766, 0x00A767},
+{0x00A768, 0x00A769},
+{0x00A76A, 0x00A76B},
+{0x00A76C, 0x00A76D},
+{0x00A76E, 0x00A76F},
+{0x00A779, 0x00A77A},
+{0x00A77B, 0x00A77C},
+{0x00A77D, 0x001D79},
+{0x00A77E, 0x00A77F},
+{0x00A780, 0x00A781},
+{0x00A782, 0x00A783},
+{0x00A784, 0x00A785},
+{0x00A786, 0x00A787},
+{0x00A78B, 0x00A78C},
+{0x00A78D, 0x000265},
+{0x00A790, 0x00A791},
+{0x00A792, 0x00A793},
+{0x00A796, 0x00A797},
+{0x00A798, 0x00A799},
+{0x00A79A, 0x00A79B},
+{0x00A79C, 0x00A79D},
+{0x00A79E, 0x00A79F},
+{0x00A7A0, 0x00A7A1},
+{0x00A7A2, 0x00A7A3},
+{0x00A7A4, 0x00A7A5},
+{0x00A7A6, 0x00A7A7},
+{0x00A7A8, 0x00A7A9},
+{0x00A7AA, 0x000266},
+{0x00A7AB, 0x00025C},
+{0x00A7AC, 0x000261},
+{0x00A7AD, 0x00026C},
+{0x00A7AE, 0x00026A},
+{0x00A7B0, 0x00029E},
+{0x00A7B1, 0x000287},
+{0x00A7B2, 0x00029D},
+{0x00A7B3, 0x00AB53},
+{0x00A7B4, 0x00A7B5},
+{0x00A7B6, 0x00A7B7},
+{0x00A7B8, 0x00A7B9},
+{0x00A7BA, 0x00A7BB},
+{0x00A7BC, 0x00A7BD},
+{0x00A7BE, 0x00A7BF},
+{0x00A7C0, 0x00A7C1},
+{0x00A7C2, 0x00A7C3},
+{0x00A7C4, 0x00A794},
+{0x00A7C5, 0x000282},
+{0x00A7C6, 0x001D8E},
+{0x00A7C7, 0x00A7C8},
+{0x00A7C9, 0x00A7CA},
+{0x00A7D0, 0x00A7D1},
+{0x00A7D6, 0x00A7D7},
+{0x00A7D8, 0x00A7D9},
+{0x00A7F5, 0x00A7F6},
+{0x00FF21, 0x00FF41},
+{0x00FF22, 0x00FF42},
+{0x00FF23, 0x00FF43},
+{0x00FF24, 0x00FF44},
+{0x00FF25, 0x00FF45},
+{0x00FF26, 0x00FF46},
+{0x00FF27, 0x00FF47},
+{0x00FF28, 0x00FF48},
+{0x00FF29, 0x00FF49},
+{0x00FF2A, 0x00FF4A},
+{0x00FF2B, 0x00FF4B},
+{0x00FF2C, 0x00FF4C},
+{0x00FF2D, 0x00FF4D},
+{0x00FF2E, 0x00FF4E},
+{0x00FF2F, 0x00FF4F},
+{0x00FF30, 0x00FF50},
+{0x00FF31, 0x00FF51},
+{0x00FF32, 0x00FF52},
+{0x00FF33, 0x00FF53},
+{0x00FF34, 0x00FF54},
+{0x00FF35, 0x00FF55},
+{0x00FF36, 0x00FF56},
+{0x00FF37, 0x00FF57},
+{0x00FF38, 0x00FF58},
+{0x00FF39, 0x00FF59},
+{0x00FF3A, 0x00FF5A},
+{0x010400, 0x010428},
+{0x010401, 0x010429},
+{0x010402, 0x01042A},
+{0x010403, 0x01042B},
+{0x010404, 0x01042C},
+{0x010405, 0x01042D},
+{0x010406, 0x01042E},
+{0x010407, 0x01042F},
+{0x010408, 0x010430},
+{0x010409, 0x010431},
+{0x01040A, 0x010432},
+{0x01040B, 0x010433},
+{0x01040C, 0x010434},
+{0x01040D, 0x010435},
+{0x01040E, 0x010436},
+{0x01040F, 0x010437},
+{0x010410, 0x010438},
+{0x010411, 0x010439},
+{0x010412, 0x01043A},
+{0x010413, 0x01043B},
+{0x010414, 0x01043C},
+{0x010415, 0x01043D},
+{0x010416, 0x01043E},
+{0x010417, 0x01043F},
+{0x010418, 0x010440},
+{0x010419, 0x010441},
+{0x01041A, 0x010442},
+{0x01041B, 0x010443},
+{0x01041C, 0x010444},
+{0x01041D, 0x010445},
+{0x01041E, 0x010446},
+{0x01041F, 0x010447},
+{0x010420, 0x010448},
+{0x010421, 0x010449},
+{0x010422, 0x01044A},
+{0x010423, 0x01044B},
+{0x010424, 0x01044C},
+{0x010425, 0x01044D},
+{0x010426, 0x01044E},
+{0x010427, 0x01044F},
+{0x0104B0, 0x0104D8},
+{0x0104B1, 0x0104D9},
+{0x0104B2, 0x0104DA},
+{0x0104B3, 0x0104DB},
+{0x0104B4, 0x0104DC},
+{0x0104B5, 0x0104DD},
+{0x0104B6, 0x0104DE},
+{0x0104B7, 0x0104DF},
+{0x0104B8, 0x0104E0},
+{0x0104B9, 0x0104E1},
+{0x0104BA, 0x0104E2},
+{0x0104BB, 0x0104E3},
+{0x0104BC, 0x0104E4},
+{0x0104BD, 0x0104E5},
+{0x0104BE, 0x0104E6},
+{0x0104BF, 0x0104E7},
+{0x0104C0, 0x0104E8},
+{0x0104C1, 0x0104E9},
+{0x0104C2, 0x0104EA},
+{0x0104C3, 0x0104EB},
+{0x0104C4, 0x0104EC},
+{0x0104C5, 0x0104ED},
+{0x0104C6, 0x0104EE},
+{0x0104C7, 0x0104EF},
+{0x0104C8, 0x0104F0},
+{0x0104C9, 0x0104F1},
+{0x0104CA, 0x0104F2},
+{0x0104CB, 0x0104F3},
+{0x0104CC, 0x0104F4},
+{0x0104CD, 0x0104F5},
+{0x0104CE, 0x0104F6},
+{0x0104CF, 0x0104F7},
+{0x0104D0, 0x0104F8},
+{0x0104D1, 0x0104F9},
+{0x0104D2, 0x0104FA},
+{0x0104D3, 0x0104FB},
+{0x010570, 0x010597},
+{0x010571, 0x010598},
+{0x010572, 0x010599},
+{0x010573, 0x01059A},
+{0x010574, 0x01059B},
+{0x010575, 0x01059C},
+{0x010576, 0x01059D},
+{0x010577, 0x01059E},
+{0x010578, 0x01059F},
+{0x010579, 0x0105A0},
+{0x01057A, 0x0105A1},
+{0x01057C, 0x0105A3},
+{0x01057D, 0x0105A4},
+{0x01057E, 0x0105A5},
+{0x01057F, 0x0105A6},
+{0x010580, 0x0105A7},
+{0x010581, 0x0105A8},
+{0x010582, 0x0105A9},
+{0x010583, 0x0105AA},
+{0x010584, 0x0105AB},
+{0x010585, 0x0105AC},
+{0x010586, 0x0105AD},
+{0x010587, 0x0105AE},
+{0x010588, 0x0105AF},
+{0x010589, 0x0105B0},
+{0x01058A, 0x0105B1},
+{0x01058C, 0x0105B3},
+{0x01058D, 0x0105B4},
+{0x01058E, 0x0105B5},
+{0x01058F, 0x0105B6},
+{0x010590, 0x0105B7},
+{0x010591, 0x0105B8},
+{0x010592, 0x0105B9},
+{0x010594, 0x0105BB},
+{0x010595, 0x0105BC},
+{0x010C80, 0x010CC0},
+{0x010C81, 0x010CC1},
+{0x010C82, 0x010CC2},
+{0x010C83, 0x010CC3},
+{0x010C84, 0x010CC4},
+{0x010C85, 0x010CC5},
+{0x010C86, 0x010CC6},
+{0x010C87, 0x010CC7},
+{0x010C88, 0x010CC8},
+{0x010C89, 0x010CC9},
+{0x010C8A, 0x010CCA},
+{0x010C8B, 0x010CCB},
+{0x010C8C, 0x010CCC},
+{0x010C8D, 0x010CCD},
+{0x010C8E, 0x010CCE},
+{0x010C8F, 0x010CCF},
+{0x010C90, 0x010CD0},
+{0x010C91, 0x010CD1},
+{0x010C92, 0x010CD2},
+{0x010C93, 0x010CD3},
+{0x010C94, 0x010CD4},
+{0x010C95, 0x010CD5},
+{0x010C96, 0x010CD6},
+{0x010C97, 0x010CD7},
+{0x010C98, 0x010CD8},
+{0x010C99, 0x010CD9},
+{0x010C9A, 0x010CDA},
+{0x010C9B, 0x010CDB},
+{0x010C9C, 0x010CDC},
+{0x010C9D, 0x010CDD},
+{0x010C9E, 0x010CDE},
+{0x010C9F, 0x010CDF},
+{0x010CA0, 0x010CE0},
+{0x010CA1, 0x010CE1},
+{0x010CA2, 0x010CE2},
+{0x010CA3, 0x010CE3},
+{0x010CA4, 0x010CE4},
+{0x010CA5, 0x010CE5},
+{0x010CA6, 0x010CE6},
+{0x010CA7, 0x010CE7},
+{0x010CA8, 0x010CE8},
+{0x010CA9, 0x010CE9},
+{0x010CAA, 0x010CEA},
+{0x010CAB, 0x010CEB},
+{0x010CAC, 0x010CEC},
+{0x010CAD, 0x010CED},
+{0x010CAE, 0x010CEE},
+{0x010CAF, 0x010CEF},
+{0x010CB0, 0x010CF0},
+{0x010CB1, 0x010CF1},
+{0x010CB2, 0x010CF2},
+{0x0118A0, 0x0118C0},
+{0x0118A1, 0x0118C1},
+{0x0118A2, 0x0118C2},
+{0x0118A3, 0x0118C3},
+{0x0118A4, 0x0118C4},
+{0x0118A5, 0x0118C5},
+{0x0118A6, 0x0118C6},
+{0x0118A7, 0x0118C7},
+{0x0118A8, 0x0118C8},
+{0x0118A9, 0x0118C9},
+{0x0118AA, 0x0118CA},
+{0x0118AB, 0x0118CB},
+{0x0118AC, 0x0118CC},
+{0x0118AD, 0x0118CD},
+{0x0118AE, 0x0118CE},
+{0x0118AF, 0x0118CF},
+{0x0118B0, 0x0118D0},
+{0x0118B1, 0x0118D1},
+{0x0118B2, 0x0118D2},
+{0x0118B3, 0x0118D3},
+{0x0118B4, 0x0118D4},
+{0x0118B5, 0x0118D5},
+{0x0118B6, 0x0118D6},
+{0x0118B7, 0x0118D7},
+{0x0118B8, 0x0118D8},
+{0x0118B9, 0x0118D9},
+{0x0118BA, 0x0118DA},
+{0x0118BB, 0x0118DB},
+{0x0118BC, 0x0118DC},
+{0x0118BD, 0x0118DD},
+{0x0118BE, 0x0118DE},
+{0x0118BF, 0x0118DF},
+{0x016E40, 0x016E60},
+{0x016E41, 0x016E61},
+{0x016E42, 0x016E62},
+{0x016E43, 0x016E63},
+{0x016E44, 0x016E64},
+{0x016E45, 0x016E65},
+{0x016E46, 0x016E66},
+{0x016E47, 0x016E67},
+{0x016E48, 0x016E68},
+{0x016E49, 0x016E69},
+{0x016E4A, 0x016E6A},
+{0x016E4B, 0x016E6B},
+{0x016E4C, 0x016E6C},
+{0x016E4D, 0x016E6D},
+{0x016E4E, 0x016E6E},
+{0x016E4F, 0x016E6F},
+{0x016E50, 0x016E70},
+{0x016E51, 0x016E71},
+{0x016E52, 0x016E72},
+{0x016E53, 0x016E73},
+{0x016E54, 0x016E74},
+{0x016E55, 0x016E75},
+{0x016E56, 0x016E76},
+{0x016E57, 0x016E77},
+{0x016E58, 0x016E78},
+{0x016E59, 0x016E79},
+{0x016E5A, 0x016E7A},
+{0x016E5B, 0x016E7B},
+{0x016E5C, 0x016E7C},
+{0x016E5D, 0x016E7D},
+{0x016E5E, 0x016E7E},
+{0x016E5F, 0x016E7F},
+{0x01E900, 0x01E922},
+{0x01E901, 0x01E923},
+{0x01E902, 0x01E924},
+{0x01E903, 0x01E925},
+{0x01E904, 0x01E926},
+{0x01E905, 0x01E927},
+{0x01E906, 0x01E928},
+{0x01E907, 0x01E929},
+{0x01E908, 0x01E92A},
+{0x01E909, 0x01E92B},
+{0x01E90A, 0x01E92C},
+{0x01E90B, 0x01E92D},
+{0x01E90C, 0x01E92E},
+{0x01E90D, 0x01E92F},
+{0x01E90E, 0x01E930},
+{0x01E90F, 0x01E931},
+{0x01E910, 0x01E932},
+{0x01E911, 0x01E933},
+{0x01E912, 0x01E934},
+{0x01E913, 0x01E935},
+{0x01E914, 0x01E936},
+{0x01E915, 0x01E937},
+{0x01E916, 0x01E938},
+{0x01E917, 0x01E939},
+{0x01E918, 0x01E93A},
+{0x01E919, 0x01E93B},
+{0x01E91A, 0x01E93C},
+{0x01E91B, 0x01E93D},
+{0x01E91C, 0x01E93E},
+{0x01E91D, 0x01E93F},
+{0x01E91E, 0x01E940},
+{0x01E91F, 0x01E941},
+{0x01E920, 0x01E942},
+{0x01E921, 0x01E943},
+};
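+
+// Hedged sketch, not part of the generated data: one way the ascending order
+// noted above can be exploited with std::lower_bound for an O(log n) lookup.
+// The helper name `unicode_tolower_sketch` is illustrative only, and this
+// assumes <algorithm>, <vector> and <utility> are visible in this file.
+[[maybe_unused]] static uint32_t unicode_tolower_sketch(uint32_t cpt) {
+    // materialize the initializer_list once; it is already sorted by .first
+    static const std::vector<std::pair<uint32_t, uint32_t>> map(
+        unicode_map_lowercase.begin(), unicode_map_lowercase.end());
+    const auto it = std::lower_bound(map.begin(), map.end(), cpt,
+        [](const std::pair<uint32_t, uint32_t> & p, uint32_t c) { return p.first < c; });
+    // exact hit -> mapped lowercase; otherwise the codepoint maps to itself
+    return (it != map.end() && it->first == cpt) ? it->second : cpt;
+}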
+
+// list is always in ascending order, to enable binary search
+const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
+{0x000061, 0x000041},
+{0x000062, 0x000042},
+{0x000063, 0x000043},
+{0x000064, 0x000044},
+{0x000065, 0x000045},
+{0x000066, 0x000046},
+{0x000067, 0x000047},
+{0x000068, 0x000048},
+{0x000069, 0x000049},
+{0x00006A, 0x00004A},
+{0x00006B, 0x00004B},
+{0x00006C, 0x00004C},
+{0x00006D, 0x00004D},
+{0x00006E, 0x00004E},
+{0x00006F, 0x00004F},
+{0x000070, 0x000050},
+{0x000071, 0x000051},
+{0x000072, 0x000052},
+{0x000073, 0x000053},
+{0x000074, 0x000054},
+{0x000075, 0x000055},
+{0x000076, 0x000056},
+{0x000077, 0x000057},
+{0x000078, 0x000058},
+{0x000079, 0x000059},
+{0x00007A, 0x00005A},
+{0x0000B5, 0x00039C},
+{0x0000E0, 0x0000C0},
+{0x0000E1, 0x0000C1},
+{0x0000E2, 0x0000C2},
+{0x0000E3, 0x0000C3},
+{0x0000E4, 0x0000C4},
+{0x0000E5, 0x0000C5},
+{0x0000E6, 0x0000C6},
+{0x0000E7, 0x0000C7},
+{0x0000E8, 0x0000C8},
+{0x0000E9, 0x0000C9},
+{0x0000EA, 0x0000CA},
+{0x0000EB, 0x0000CB},
+{0x0000EC, 0x0000CC},
+{0x0000ED, 0x0000CD},
+{0x0000EE, 0x0000CE},
+{0x0000EF, 0x0000CF},
+{0x0000F0, 0x0000D0},
+{0x0000F1, 0x0000D1},
+{0x0000F2, 0x0000D2},
+{0x0000F3, 0x0000D3},
+{0x0000F4, 0x0000D4},
+{0x0000F5, 0x0000D5},
+{0x0000F6, 0x0000D6},
+{0x0000F8, 0x0000D8},
+{0x0000F9, 0x0000D9},
+{0x0000FA, 0x0000DA},
+{0x0000FB, 0x0000DB},
+{0x0000FC, 0x0000DC},
+{0x0000FD, 0x0000DD},
+{0x0000FE, 0x0000DE},
+{0x0000FF, 0x000178},
+{0x000101, 0x000100},
+{0x000103, 0x000102},
+{0x000105, 0x000104},
+{0x000107, 0x000106},
+{0x000109, 0x000108},
+{0x00010B, 0x00010A},
+{0x00010D, 0x00010C},
+{0x00010F, 0x00010E},
+{0x000111, 0x000110},
+{0x000113, 0x000112},
+{0x000115, 0x000114},
+{0x000117, 0x000116},
+{0x000119, 0x000118},
+{0x00011B, 0x00011A},
+{0x00011D, 0x00011C},
+{0x00011F, 0x00011E},
+{0x000121, 0x000120},
+{0x000123, 0x000122},
+{0x000125, 0x000124},
+{0x000127, 0x000126},
+{0x000129, 0x000128},
+{0x00012B, 0x00012A},
+{0x00012D, 0x00012C},
+{0x00012F, 0x00012E},
+{0x000131, 0x000049},
+{0x000133, 0x000132},
+{0x000135, 0x000134},
+{0x000137, 0x000136},
+{0x00013A, 0x000139},
+{0x00013C, 0x00013B},
+{0x00013E, 0x00013D},
+{0x000140, 0x00013F},
+{0x000142, 0x000141},
+{0x000144, 0x000143},
+{0x000146, 0x000145},
+{0x000148, 0x000147},
+{0x00014B, 0x00014A},
+{0x00014D, 0x00014C},
+{0x00014F, 0x00014E},
+{0x000151, 0x000150},
+{0x000153, 0x000152},
+{0x000155, 0x000154},
+{0x000157, 0x000156},
+{0x000159, 0x000158},
+{0x00015B, 0x00015A},
+{0x00015D, 0x00015C},
+{0x00015F, 0x00015E},
+{0x000161, 0x000160},
+{0x000163, 0x000162},
+{0x000165, 0x000164},
+{0x000167, 0x000166},
+{0x000169, 0x000168},
+{0x00016B, 0x00016A},
+{0x00016D, 0x00016C},
+{0x00016F, 0x00016E},
+{0x000171, 0x000170},
+{0x000173, 0x000172},
+{0x000175, 0x000174},
+{0x000177, 0x000176},
+{0x00017A, 0x000179},
+{0x00017C, 0x00017B},
+{0x00017E, 0x00017D},
+{0x00017F, 0x000053},
+{0x000180, 0x000243},
+{0x000183, 0x000182},
+{0x000185, 0x000184},
+{0x000188, 0x000187},
+{0x00018C, 0x00018B},
+{0x000192, 0x000191},
+{0x000195, 0x0001F6},
+{0x000199, 0x000198},
+{0x00019A, 0x00023D},
+{0x00019E, 0x000220},
+{0x0001A1, 0x0001A0},
+{0x0001A3, 0x0001A2},
+{0x0001A5, 0x0001A4},
+{0x0001A8, 0x0001A7},
+{0x0001AD, 0x0001AC},
+{0x0001B0, 0x0001AF},
+{0x0001B4, 0x0001B3},
+{0x0001B6, 0x0001B5},
+{0x0001B9, 0x0001B8},
+{0x0001BD, 0x0001BC},
+{0x0001BF, 0x0001F7},
+{0x0001C5, 0x0001C4},
+{0x0001C6, 0x0001C4},
+{0x0001C8, 0x0001C7},
+{0x0001C9, 0x0001C7},
+{0x0001CB, 0x0001CA},
+{0x0001CC, 0x0001CA},
+{0x0001CE, 0x0001CD},
+{0x0001D0, 0x0001CF},
+{0x0001D2, 0x0001D1},
+{0x0001D4, 0x0001D3},
+{0x0001D6, 0x0001D5},
+{0x0001D8, 0x0001D7},
+{0x0001DA, 0x0001D9},
+{0x0001DC, 0x0001DB},
+{0x0001DD, 0x00018E},
+{0x0001DF, 0x0001DE},
+{0x0001E1, 0x0001E0},
+{0x0001E3, 0x0001E2},
+{0x0001E5, 0x0001E4},
+{0x0001E7, 0x0001E6},
+{0x0001E9, 0x0001E8},
+{0x0001EB, 0x0001EA},
+{0x0001ED, 0x0001EC},
+{0x0001EF, 0x0001EE},
+{0x0001F2, 0x0001F1},
+{0x0001F3, 0x0001F1},
+{0x0001F5, 0x0001F4},
+{0x0001F9, 0x0001F8},
+{0x0001FB, 0x0001FA},
+{0x0001FD, 0x0001FC},
+{0x0001FF, 0x0001FE},
+{0x000201, 0x000200},
+{0x000203, 0x000202},
+{0x000205, 0x000204},
+{0x000207, 0x000206},
+{0x000209, 0x000208},
+{0x00020B, 0x00020A},
+{0x00020D, 0x00020C},
+{0x00020F, 0x00020E},
+{0x000211, 0x000210},
+{0x000213, 0x000212},
+{0x000215, 0x000214},
+{0x000217, 0x000216},
+{0x000219, 0x000218},
+{0x00021B, 0x00021A},
+{0x00021D, 0x00021C},
+{0x00021F, 0x00021E},
+{0x000223, 0x000222},
+{0x000225, 0x000224},
+{0x000227, 0x000226},
+{0x000229, 0x000228},
+{0x00022B, 0x00022A},
+{0x00022D, 0x00022C},
+{0x00022F, 0x00022E},
+{0x000231, 0x000230},
+{0x000233, 0x000232},
+{0x00023C, 0x00023B},
+{0x00023F, 0x002C7E},
+{0x000240, 0x002C7F},
+{0x000242, 0x000241},
+{0x000247, 0x000246},
+{0x000249, 0x000248},
+{0x00024B, 0x00024A},
+{0x00024D, 0x00024C},
+{0x00024F, 0x00024E},
+{0x000250, 0x002C6F},
+{0x000251, 0x002C6D},
+{0x000252, 0x002C70},
+{0x000253, 0x000181},
+{0x000254, 0x000186},
+{0x000256, 0x000189},
+{0x000257, 0x00018A},
+{0x000259, 0x00018F},
+{0x00025B, 0x000190},
+{0x00025C, 0x00A7AB},
+{0x000260, 0x000193},
+{0x000261, 0x00A7AC},
+{0x000263, 0x000194},
+{0x000265, 0x00A78D},
+{0x000266, 0x00A7AA},
+{0x000268, 0x000197},
+{0x000269, 0x000196},
+{0x00026A, 0x00A7AE},
+{0x00026B, 0x002C62},
+{0x00026C, 0x00A7AD},
+{0x00026F, 0x00019C},
+{0x000271, 0x002C6E},
+{0x000272, 0x00019D},
+{0x000275, 0x00019F},
+{0x00027D, 0x002C64},
+{0x000280, 0x0001A6},
+{0x000282, 0x00A7C5},
+{0x000283, 0x0001A9},
+{0x000287, 0x00A7B1},
+{0x000288, 0x0001AE},
+{0x000289, 0x000244},
+{0x00028A, 0x0001B1},
+{0x00028B, 0x0001B2},
+{0x00028C, 0x000245},
+{0x000292, 0x0001B7},
+{0x00029D, 0x00A7B2},
+{0x00029E, 0x00A7B0},
+{0x000345, 0x000399},
+{0x000371, 0x000370},
+{0x000373, 0x000372},
+{0x000377, 0x000376},
+{0x00037B, 0x0003FD},
+{0x00037C, 0x0003FE},
+{0x00037D, 0x0003FF},
+{0x0003AC, 0x000386},
+{0x0003AD, 0x000388},
+{0x0003AE, 0x000389},
+{0x0003AF, 0x00038A},
+{0x0003B1, 0x000391},
+{0x0003B2, 0x000392},
+{0x0003B3, 0x000393},
+{0x0003B4, 0x000394},
+{0x0003B5, 0x000395},
+{0x0003B6, 0x000396},
+{0x0003B7, 0x000397},
+{0x0003B8, 0x000398},
+{0x0003B9, 0x000399},
+{0x0003BA, 0x00039A},
+{0x0003BB, 0x00039B},
+{0x0003BC, 0x00039C},
+{0x0003BD, 0x00039D},
+{0x0003BE, 0x00039E},
+{0x0003BF, 0x00039F},
+{0x0003C0, 0x0003A0},
+{0x0003C1, 0x0003A1},
+{0x0003C2, 0x0003A3},
+{0x0003C3, 0x0003A3},
+{0x0003C4, 0x0003A4},
+{0x0003C5, 0x0003A5},
+{0x0003C6, 0x0003A6},
+{0x0003C7, 0x0003A7},
+{0x0003C8, 0x0003A8},
+{0x0003C9, 0x0003A9},
+{0x0003CA, 0x0003AA},
+{0x0003CB, 0x0003AB},
+{0x0003CC, 0x00038C},
+{0x0003CD, 0x00038E},
+{0x0003CE, 0x00038F},
+{0x0003D0, 0x000392},
+{0x0003D1, 0x000398},
+{0x0003D5, 0x0003A6},
+{0x0003D6, 0x0003A0},
+{0x0003D7, 0x0003CF},
+{0x0003D9, 0x0003D8},
+{0x0003DB, 0x0003DA},
+{0x0003DD, 0x0003DC},
+{0x0003DF, 0x0003DE},
+{0x0003E1, 0x0003E0},
+{0x0003E3, 0x0003E2},
+{0x0003E5, 0x0003E4},
+{0x0003E7, 0x0003E6},
+{0x0003E9, 0x0003E8},
+{0x0003EB, 0x0003EA},
+{0x0003ED, 0x0003EC},
+{0x0003EF, 0x0003EE},
+{0x0003F0, 0x00039A},
+{0x0003F1, 0x0003A1},
+{0x0003F2, 0x0003F9},
+{0x0003F3, 0x00037F},
+{0x0003F5, 0x000395},
+{0x0003F8, 0x0003F7},
+{0x0003FB, 0x0003FA},
+{0x000430, 0x000410},
+{0x000431, 0x000411},
+{0x000432, 0x000412},
+{0x000433, 0x000413},
+{0x000434, 0x000414},
+{0x000435, 0x000415},
+{0x000436, 0x000416},
+{0x000437, 0x000417},
+{0x000438, 0x000418},
+{0x000439, 0x000419},
+{0x00043A, 0x00041A},
+{0x00043B, 0x00041B},
+{0x00043C, 0x00041C},
+{0x00043D, 0x00041D},
+{0x00043E, 0x00041E},
+{0x00043F, 0x00041F},
+{0x000440, 0x000420},
+{0x000441, 0x000421},
+{0x000442, 0x000422},
+{0x000443, 0x000423},
+{0x000444, 0x000424},
+{0x000445, 0x000425},
+{0x000446, 0x000426},
+{0x000447, 0x000427},
+{0x000448, 0x000428},
+{0x000449, 0x000429},
+{0x00044A, 0x00042A},
+{0x00044B, 0x00042B},
+{0x00044C, 0x00042C},
+{0x00044D, 0x00042D},
+{0x00044E, 0x00042E},
+{0x00044F, 0x00042F},
+{0x000450, 0x000400},
+{0x000451, 0x000401},
+{0x000452, 0x000402},
+{0x000453, 0x000403},
+{0x000454, 0x000404},
+{0x000455, 0x000405},
+{0x000456, 0x000406},
+{0x000457, 0x000407},
+{0x000458, 0x000408},
+{0x000459, 0x000409},
+{0x00045A, 0x00040A},
+{0x00045B, 0x00040B},
+{0x00045C, 0x00040C},
+{0x00045D, 0x00040D},
+{0x00045E, 0x00040E},
+{0x00045F, 0x00040F},
+{0x000461, 0x000460},
+{0x000463, 0x000462},
+{0x000465, 0x000464},
+{0x000467, 0x000466},
+{0x000469, 0x000468},
+{0x00046B, 0x00046A},
+{0x00046D, 0x00046C},
+{0x00046F, 0x00046E},
+{0x000471, 0x000470},
+{0x000473, 0x000472},
+{0x000475, 0x000474},
+{0x000477, 0x000476},
+{0x000479, 0x000478},
+{0x00047B, 0x00047A},
+{0x00047D, 0x00047C},
+{0x00047F, 0x00047E},
+{0x000481, 0x000480},
+{0x00048B, 0x00048A},
+{0x00048D, 0x00048C},
+{0x00048F, 0x00048E},
+{0x000491, 0x000490},
+{0x000493, 0x000492},
+{0x000495, 0x000494},
+{0x000497, 0x000496},
+{0x000499, 0x000498},
+{0x00049B, 0x00049A},
+{0x00049D, 0x00049C},
+{0x00049F, 0x00049E},
+{0x0004A1, 0x0004A0},
+{0x0004A3, 0x0004A2},
+{0x0004A5, 0x0004A4},
+{0x0004A7, 0x0004A6},
+{0x0004A9, 0x0004A8},
+{0x0004AB, 0x0004AA},
+{0x0004AD, 0x0004AC},
+{0x0004AF, 0x0004AE},
+{0x0004B1, 0x0004B0},
+{0x0004B3, 0x0004B2},
+{0x0004B5, 0x0004B4},
+{0x0004B7, 0x0004B6},
+{0x0004B9, 0x0004B8},
+{0x0004BB, 0x0004BA},
+{0x0004BD, 0x0004BC},
+{0x0004BF, 0x0004BE},
+{0x0004C2, 0x0004C1},
+{0x0004C4, 0x0004C3},
+{0x0004C6, 0x0004C5},
+{0x0004C8, 0x0004C7},
+{0x0004CA, 0x0004C9},
+{0x0004CC, 0x0004CB},
+{0x0004CE, 0x0004CD},
+{0x0004CF, 0x0004C0},
+{0x0004D1, 0x0004D0},
+{0x0004D3, 0x0004D2},
+{0x0004D5, 0x0004D4},
+{0x0004D7, 0x0004D6},
+{0x0004D9, 0x0004D8},
+{0x0004DB, 0x0004DA},
+{0x0004DD, 0x0004DC},
+{0x0004DF, 0x0004DE},
+{0x0004E1, 0x0004E0},
+{0x0004E3, 0x0004E2},
+{0x0004E5, 0x0004E4},
+{0x0004E7, 0x0004E6},
+{0x0004E9, 0x0004E8},
+{0x0004EB, 0x0004EA},
+{0x0004ED, 0x0004EC},
+{0x0004EF, 0x0004EE},
+{0x0004F1, 0x0004F0},
+{0x0004F3, 0x0004F2},
+{0x0004F5, 0x0004F4},
+{0x0004F7, 0x0004F6},
+{0x0004F9, 0x0004F8},
+{0x0004FB, 0x0004FA},
+{0x0004FD, 0x0004FC},
+{0x0004FF, 0x0004FE},
+{0x000501, 0x000500},
+{0x000503, 0x000502},
+{0x000505, 0x000504},
+{0x000507, 0x000506},
+{0x000509, 0x000508},
+{0x00050B, 0x00050A},
+{0x00050D, 0x00050C},
+{0x00050F, 0x00050E},
+{0x000511, 0x000510},
+{0x000513, 0x000512},
+{0x000515, 0x000514},
+{0x000517, 0x000516},
+{0x000519, 0x000518},
+{0x00051B, 0x00051A},
+{0x00051D, 0x00051C},
+{0x00051F, 0x00051E},
+{0x000521, 0x000520},
+{0x000523, 0x000522},
+{0x000525, 0x000524},
+{0x000527, 0x000526},
+{0x000529, 0x000528},
+{0x00052B, 0x00052A},
+{0x00052D, 0x00052C},
+{0x00052F, 0x00052E},
+{0x000561, 0x000531},
+{0x000562, 0x000532},
+{0x000563, 0x000533},
+{0x000564, 0x000534},
+{0x000565, 0x000535},
+{0x000566, 0x000536},
+{0x000567, 0x000537},
+{0x000568, 0x000538},
+{0x000569, 0x000539},
+{0x00056A, 0x00053A},
+{0x00056B, 0x00053B},
+{0x00056C, 0x00053C},
+{0x00056D, 0x00053D},
+{0x00056E, 0x00053E},
+{0x00056F, 0x00053F},
+{0x000570, 0x000540},
+{0x000571, 0x000541},
+{0x000572, 0x000542},
+{0x000573, 0x000543},
+{0x000574, 0x000544},
+{0x000575, 0x000545},
+{0x000576, 0x000546},
+{0x000577, 0x000547},
+{0x000578, 0x000548},
+{0x000579, 0x000549},
+{0x00057A, 0x00054A},
+{0x00057B, 0x00054B},
+{0x00057C, 0x00054C},
+{0x00057D, 0x00054D},
+{0x00057E, 0x00054E},
+{0x00057F, 0x00054F},
+{0x000580, 0x000550},
+{0x000581, 0x000551},
+{0x000582, 0x000552},
+{0x000583, 0x000553},
+{0x000584, 0x000554},
+{0x000585, 0x000555},
+{0x000586, 0x000556},
+{0x0010D0, 0x001C90},
+{0x0010D1, 0x001C91},
+{0x0010D2, 0x001C92},
+{0x0010D3, 0x001C93},
+{0x0010D4, 0x001C94},
+{0x0010D5, 0x001C95},
+{0x0010D6, 0x001C96},
+{0x0010D7, 0x001C97},
+{0x0010D8, 0x001C98},
+{0x0010D9, 0x001C99},
+{0x0010DA, 0x001C9A},
+{0x0010DB, 0x001C9B},
+{0x0010DC, 0x001C9C},
+{0x0010DD, 0x001C9D},
+{0x0010DE, 0x001C9E},
+{0x0010DF, 0x001C9F},
+{0x0010E0, 0x001CA0},
+{0x0010E1, 0x001CA1},
+{0x0010E2, 0x001CA2},
+{0x0010E3, 0x001CA3},
+{0x0010E4, 0x001CA4},
+{0x0010E5, 0x001CA5},
+{0x0010E6, 0x001CA6},
+{0x0010E7, 0x001CA7},
+{0x0010E8, 0x001CA8},
+{0x0010E9, 0x001CA9},
+{0x0010EA, 0x001CAA},
+{0x0010EB, 0x001CAB},
+{0x0010EC, 0x001CAC},
+{0x0010ED, 0x001CAD},
+{0x0010EE, 0x001CAE},
+{0x0010EF, 0x001CAF},
+{0x0010F0, 0x001CB0},
+{0x0010F1, 0x001CB1},
+{0x0010F2, 0x001CB2},
+{0x0010F3, 0x001CB3},
+{0x0010F4, 0x001CB4},
+{0x0010F5, 0x001CB5},
+{0x0010F6, 0x001CB6},
+{0x0010F7, 0x001CB7},
+{0x0010F8, 0x001CB8},
+{0x0010F9, 0x001CB9},
+{0x0010FA, 0x001CBA},
+{0x0010FD, 0x001CBD},
+{0x0010FE, 0x001CBE},
+{0x0010FF, 0x001CBF},
+{0x0013F8, 0x0013F0},
+{0x0013F9, 0x0013F1},
+{0x0013FA, 0x0013F2},
+{0x0013FB, 0x0013F3},
+{0x0013FC, 0x0013F4},
+{0x0013FD, 0x0013F5},
+{0x001C80, 0x000412},
+{0x001C81, 0x000414},
+{0x001C82, 0x00041E},
+{0x001C83, 0x000421},
+{0x001C84, 0x000422},
+{0x001C85, 0x000422},
+{0x001C86, 0x00042A},
+{0x001C87, 0x000462},
+{0x001C88, 0x00A64A},
+{0x001D79, 0x00A77D},
+{0x001D7D, 0x002C63},
+{0x001D8E, 0x00A7C6},
+{0x001E01, 0x001E00},
+{0x001E03, 0x001E02},
+{0x001E05, 0x001E04},
+{0x001E07, 0x001E06},
+{0x001E09, 0x001E08},
+{0x001E0B, 0x001E0A},
+{0x001E0D, 0x001E0C},
+{0x001E0F, 0x001E0E},
+{0x001E11, 0x001E10},
+{0x001E13, 0x001E12},
+{0x001E15, 0x001E14},
+{0x001E17, 0x001E16},
+{0x001E19, 0x001E18},
+{0x001E1B, 0x001E1A},
+{0x001E1D, 0x001E1C},
+{0x001E1F, 0x001E1E},
+{0x001E21, 0x001E20},
+{0x001E23, 0x001E22},
+{0x001E25, 0x001E24},
+{0x001E27, 0x001E26},
+{0x001E29, 0x001E28},
+{0x001E2B, 0x001E2A},
+{0x001E2D, 0x001E2C},
+{0x001E2F, 0x001E2E},
+{0x001E31, 0x001E30},
+{0x001E33, 0x001E32},
+{0x001E35, 0x001E34},
+{0x001E37, 0x001E36},
+{0x001E39, 0x001E38},
+{0x001E3B, 0x001E3A},
+{0x001E3D, 0x001E3C},
+{0x001E3F, 0x001E3E},
+{0x001E41, 0x001E40},
+{0x001E43, 0x001E42},
+{0x001E45, 0x001E44},
+{0x001E47, 0x001E46},
+{0x001E49, 0x001E48},
+{0x001E4B, 0x001E4A},
+{0x001E4D, 0x001E4C},
+{0x001E4F, 0x001E4E},
+{0x001E51, 0x001E50},
+{0x001E53, 0x001E52},
+{0x001E55, 0x001E54},
+{0x001E57, 0x001E56},
+{0x001E59, 0x001E58},
+{0x001E5B, 0x001E5A},
+{0x001E5D, 0x001E5C},
+{0x001E5F, 0x001E5E},
+{0x001E61, 0x001E60},
+{0x001E63, 0x001E62},
+{0x001E65, 0x001E64},
+{0x001E67, 0x001E66},
+{0x001E69, 0x001E68},
+{0x001E6B, 0x001E6A},
+{0x001E6D, 0x001E6C},
+{0x001E6F, 0x001E6E},
+{0x001E71, 0x001E70},
+{0x001E73, 0x001E72},
+{0x001E75, 0x001E74},
+{0x001E77, 0x001E76},
+{0x001E79, 0x001E78},
+{0x001E7B, 0x001E7A},
+{0x001E7D, 0x001E7C},
+{0x001E7F, 0x001E7E},
+{0x001E81, 0x001E80},
+{0x001E83, 0x001E82},
+{0x001E85, 0x001E84},
+{0x001E87, 0x001E86},
+{0x001E89, 0x001E88},
+{0x001E8B, 0x001E8A},
+{0x001E8D, 0x001E8C},
+{0x001E8F, 0x001E8E},
+{0x001E91, 0x001E90},
+{0x001E93, 0x001E92},
+{0x001E95, 0x001E94},
+{0x001E9B, 0x001E60},
+{0x001EA1, 0x001EA0},
+{0x001EA3, 0x001EA2},
+{0x001EA5, 0x001EA4},
+{0x001EA7, 0x001EA6},
+{0x001EA9, 0x001EA8},
+{0x001EAB, 0x001EAA},
+{0x001EAD, 0x001EAC},
+{0x001EAF, 0x001EAE},
+{0x001EB1, 0x001EB0},
+{0x001EB3, 0x001EB2},
+{0x001EB5, 0x001EB4},
+{0x001EB7, 0x001EB6},
+{0x001EB9, 0x001EB8},
+{0x001EBB, 0x001EBA},
+{0x001EBD, 0x001EBC},
+{0x001EBF, 0x001EBE},
+{0x001EC1, 0x001EC0},
+{0x001EC3, 0x001EC2},
+{0x001EC5, 0x001EC4},
+{0x001EC7, 0x001EC6},
+{0x001EC9, 0x001EC8},
+{0x001ECB, 0x001ECA},
+{0x001ECD, 0x001ECC},
+{0x001ECF, 0x001ECE},
+{0x001ED1, 0x001ED0},
+{0x001ED3, 0x001ED2},
+{0x001ED5, 0x001ED4},
+{0x001ED7, 0x001ED6},
+{0x001ED9, 0x001ED8},
+{0x001EDB, 0x001EDA},
+{0x001EDD, 0x001EDC},
+{0x001EDF, 0x001EDE},
+{0x001EE1, 0x001EE0},
+{0x001EE3, 0x001EE2},
+{0x001EE5, 0x001EE4},
+{0x001EE7, 0x001EE6},
+{0x001EE9, 0x001EE8},
+{0x001EEB, 0x001EEA},
+{0x001EED, 0x001EEC},
+{0x001EEF, 0x001EEE},
+{0x001EF1, 0x001EF0},
+{0x001EF3, 0x001EF2},
+{0x001EF5, 0x001EF4},
+{0x001EF7, 0x001EF6},
+{0x001EF9, 0x001EF8},
+{0x001EFB, 0x001EFA},
+{0x001EFD, 0x001EFC},
+{0x001EFF, 0x001EFE},
+{0x001F00, 0x001F08},
+{0x001F01, 0x001F09},
+{0x001F02, 0x001F0A},
+{0x001F03, 0x001F0B},
+{0x001F04, 0x001F0C},
+{0x001F05, 0x001F0D},
+{0x001F06, 0x001F0E},
+{0x001F07, 0x001F0F},
+{0x001F10, 0x001F18},
+{0x001F11, 0x001F19},
+{0x001F12, 0x001F1A},
+{0x001F13, 0x001F1B},
+{0x001F14, 0x001F1C},
+{0x001F15, 0x001F1D},
+{0x001F20, 0x001F28},
+{0x001F21, 0x001F29},
+{0x001F22, 0x001F2A},
+{0x001F23, 0x001F2B},
+{0x001F24, 0x001F2C},
+{0x001F25, 0x001F2D},
+{0x001F26, 0x001F2E},
+{0x001F27, 0x001F2F},
+{0x001F30, 0x001F38},
+{0x001F31, 0x001F39},
+{0x001F32, 0x001F3A},
+{0x001F33, 0x001F3B},
+{0x001F34, 0x001F3C},
+{0x001F35, 0x001F3D},
+{0x001F36, 0x001F3E},
+{0x001F37, 0x001F3F},
+{0x001F40, 0x001F48},
+{0x001F41, 0x001F49},
+{0x001F42, 0x001F4A},
+{0x001F43, 0x001F4B},
+{0x001F44, 0x001F4C},
+{0x001F45, 0x001F4D},
+{0x001F51, 0x001F59},
+{0x001F53, 0x001F5B},
+{0x001F55, 0x001F5D},
+{0x001F57, 0x001F5F},
+{0x001F60, 0x001F68},
+{0x001F61, 0x001F69},
+{0x001F62, 0x001F6A},
+{0x001F63, 0x001F6B},
+{0x001F64, 0x001F6C},
+{0x001F65, 0x001F6D},
+{0x001F66, 0x001F6E},
+{0x001F67, 0x001F6F},
+{0x001F70, 0x001FBA},
+{0x001F71, 0x001FBB},
+{0x001F72, 0x001FC8},
+{0x001F73, 0x001FC9},
+{0x001F74, 0x001FCA},
+{0x001F75, 0x001FCB},
+{0x001F76, 0x001FDA},
+{0x001F77, 0x001FDB},
+{0x001F78, 0x001FF8},
+{0x001F79, 0x001FF9},
+{0x001F7A, 0x001FEA},
+{0x001F7B, 0x001FEB},
+{0x001F7C, 0x001FFA},
+{0x001F7D, 0x001FFB},
+{0x001F80, 0x001F88},
+{0x001F81, 0x001F89},
+{0x001F82, 0x001F8A},
+{0x001F83, 0x001F8B},
+{0x001F84, 0x001F8C},
+{0x001F85, 0x001F8D},
+{0x001F86, 0x001F8E},
+{0x001F87, 0x001F8F},
+{0x001F90, 0x001F98},
+{0x001F91, 0x001F99},
+{0x001F92, 0x001F9A},
+{0x001F93, 0x001F9B},
+{0x001F94, 0x001F9C},
+{0x001F95, 0x001F9D},
+{0x001F96, 0x001F9E},
+{0x001F97, 0x001F9F},
+{0x001FA0, 0x001FA8},
+{0x001FA1, 0x001FA9},
+{0x001FA2, 0x001FAA},
+{0x001FA3, 0x001FAB},
+{0x001FA4, 0x001FAC},
+{0x001FA5, 0x001FAD},
+{0x001FA6, 0x001FAE},
+{0x001FA7, 0x001FAF},
+{0x001FB0, 0x001FB8},
+{0x001FB1, 0x001FB9},
+{0x001FB3, 0x001FBC},
+{0x001FBE, 0x000399},
+{0x001FC3, 0x001FCC},
+{0x001FD0, 0x001FD8},
+{0x001FD1, 0x001FD9},
+{0x001FE0, 0x001FE8},
+{0x001FE1, 0x001FE9},
+{0x001FE5, 0x001FEC},
+{0x001FF3, 0x001FFC},
+{0x00214E, 0x002132},
+{0x002170, 0x002160},
+{0x002171, 0x002161},
+{0x002172, 0x002162},
+{0x002173, 0x002163},
+{0x002174, 0x002164},
+{0x002175, 0x002165},
+{0x002176, 0x002166},
+{0x002177, 0x002167},
+{0x002178, 0x002168},
+{0x002179, 0x002169},
+{0x00217A, 0x00216A},
+{0x00217B, 0x00216B},
+{0x00217C, 0x00216C},
+{0x00217D, 0x00216D},
+{0x00217E, 0x00216E},
+{0x00217F, 0x00216F},
+{0x002184, 0x002183},
+{0x0024D0, 0x0024B6},
+{0x0024D1, 0x0024B7},
+{0x0024D2, 0x0024B8},
+{0x0024D3, 0x0024B9},
+{0x0024D4, 0x0024BA},
+{0x0024D5, 0x0024BB},
+{0x0024D6, 0x0024BC},
+{0x0024D7, 0x0024BD},
+{0x0024D8, 0x0024BE},
+{0x0024D9, 0x0024BF},
+{0x0024DA, 0x0024C0},
+{0x0024DB, 0x0024C1},
+{0x0024DC, 0x0024C2},
+{0x0024DD, 0x0024C3},
+{0x0024DE, 0x0024C4},
+{0x0024DF, 0x0024C5},
+{0x0024E0, 0x0024C6},
+{0x0024E1, 0x0024C7},
+{0x0024E2, 0x0024C8},
+{0x0024E3, 0x0024C9},
+{0x0024E4, 0x0024CA},
+{0x0024E5, 0x0024CB},
+{0x0024E6, 0x0024CC},
+{0x0024E7, 0x0024CD},
+{0x0024E8, 0x0024CE},
+{0x0024E9, 0x0024CF},
+{0x002C30, 0x002C00},
+{0x002C31, 0x002C01},
+{0x002C32, 0x002C02},
+{0x002C33, 0x002C03},
+{0x002C34, 0x002C04},
+{0x002C35, 0x002C05},
+{0x002C36, 0x002C06},
+{0x002C37, 0x002C07},
+{0x002C38, 0x002C08},
+{0x002C39, 0x002C09},
+{0x002C3A, 0x002C0A},
+{0x002C3B, 0x002C0B},
+{0x002C3C, 0x002C0C},
+{0x002C3D, 0x002C0D},
+{0x002C3E, 0x002C0E},
+{0x002C3F, 0x002C0F},
+{0x002C40, 0x002C10},
+{0x002C41, 0x002C11},
+{0x002C42, 0x002C12},
+{0x002C43, 0x002C13},
+{0x002C44, 0x002C14},
+{0x002C45, 0x002C15},
+{0x002C46, 0x002C16},
+{0x002C47, 0x002C17},
+{0x002C48, 0x002C18},
+{0x002C49, 0x002C19},
+{0x002C4A, 0x002C1A},
+{0x002C4B, 0x002C1B},
+{0x002C4C, 0x002C1C},
+{0x002C4D, 0x002C1D},
+{0x002C4E, 0x002C1E},
+{0x002C4F, 0x002C1F},
+{0x002C50, 0x002C20},
+{0x002C51, 0x002C21},
+{0x002C52, 0x002C22},
+{0x002C53, 0x002C23},
+{0x002C54, 0x002C24},
+{0x002C55, 0x002C25},
+{0x002C56, 0x002C26},
+{0x002C57, 0x002C27},
+{0x002C58, 0x002C28},
+{0x002C59, 0x002C29},
+{0x002C5A, 0x002C2A},
+{0x002C5B, 0x002C2B},
+{0x002C5C, 0x002C2C},
+{0x002C5D, 0x002C2D},
+{0x002C5E, 0x002C2E},
+{0x002C5F, 0x002C2F},
+{0x002C61, 0x002C60},
+{0x002C65, 0x00023A},
+{0x002C66, 0x00023E},
+{0x002C68, 0x002C67},
+{0x002C6A, 0x002C69},
+{0x002C6C, 0x002C6B},
+{0x002C73, 0x002C72},
+{0x002C76, 0x002C75},
+{0x002C81, 0x002C80},
+{0x002C83, 0x002C82},
+{0x002C85, 0x002C84},
+{0x002C87, 0x002C86},
+{0x002C89, 0x002C88},
+{0x002C8B, 0x002C8A},
+{0x002C8D, 0x002C8C},
+{0x002C8F, 0x002C8E},
+{0x002C91, 0x002C90},
+{0x002C93, 0x002C92},
+{0x002C95, 0x002C94},
+{0x002C97, 0x002C96},
+{0x002C99, 0x002C98},
+{0x002C9B, 0x002C9A},
+{0x002C9D, 0x002C9C},
+{0x002C9F, 0x002C9E},
+{0x002CA1, 0x002CA0},
+{0x002CA3, 0x002CA2},
+{0x002CA5, 0x002CA4},
+{0x002CA7, 0x002CA6},
+{0x002CA9, 0x002CA8},
+{0x002CAB, 0x002CAA},
+{0x002CAD, 0x002CAC},
+{0x002CAF, 0x002CAE},
+{0x002CB1, 0x002CB0},
+{0x002CB3, 0x002CB2},
+{0x002CB5, 0x002CB4},
+{0x002CB7, 0x002CB6},
+{0x002CB9, 0x002CB8},
+{0x002CBB, 0x002CBA},
+{0x002CBD, 0x002CBC},
+{0x002CBF, 0x002CBE},
+{0x002CC1, 0x002CC0},
+{0x002CC3, 0x002CC2},
+{0x002CC5, 0x002CC4},
+{0x002CC7, 0x002CC6},
+{0x002CC9, 0x002CC8},
+{0x002CCB, 0x002CCA},
+{0x002CCD, 0x002CCC},
+{0x002CCF, 0x002CCE},
+{0x002CD1, 0x002CD0},
+{0x002CD3, 0x002CD2},
+{0x002CD5, 0x002CD4},
+{0x002CD7, 0x002CD6},
+{0x002CD9, 0x002CD8},
+{0x002CDB, 0x002CDA},
+{0x002CDD, 0x002CDC},
+{0x002CDF, 0x002CDE},
+{0x002CE1, 0x002CE0},
+{0x002CE3, 0x002CE2},
+{0x002CEC, 0x002CEB},
+{0x002CEE, 0x002CED},
+{0x002CF3, 0x002CF2},
+{0x002D00, 0x0010A0},
+{0x002D01, 0x0010A1},
+{0x002D02, 0x0010A2},
+{0x002D03, 0x0010A3},
+{0x002D04, 0x0010A4},
+{0x002D05, 0x0010A5},
+{0x002D06, 0x0010A6},
+{0x002D07, 0x0010A7},
+{0x002D08, 0x0010A8},
+{0x002D09, 0x0010A9},
+{0x002D0A, 0x0010AA},
+{0x002D0B, 0x0010AB},
+{0x002D0C, 0x0010AC},
+{0x002D0D, 0x0010AD},
+{0x002D0E, 0x0010AE},
+{0x002D0F, 0x0010AF},
+{0x002D10, 0x0010B0},
+{0x002D11, 0x0010B1},
+{0x002D12, 0x0010B2},
+{0x002D13, 0x0010B3},
+{0x002D14, 0x0010B4},
+{0x002D15, 0x0010B5},
+{0x002D16, 0x0010B6},
+{0x002D17, 0x0010B7},
+{0x002D18, 0x0010B8},
+{0x002D19, 0x0010B9},
+{0x002D1A, 0x0010BA},
+{0x002D1B, 0x0010BB},
+{0x002D1C, 0x0010BC},
+{0x002D1D, 0x0010BD},
+{0x002D1E, 0x0010BE},
+{0x002D1F, 0x0010BF},
+{0x002D20, 0x0010C0},
+{0x002D21, 0x0010C1},
+{0x002D22, 0x0010C2},
+{0x002D23, 0x0010C3},
+{0x002D24, 0x0010C4},
+{0x002D25, 0x0010C5},
+{0x002D27, 0x0010C7},
+{0x002D2D, 0x0010CD},
+{0x00A641, 0x00A640},
+{0x00A643, 0x00A642},
+{0x00A645, 0x00A644},
+{0x00A647, 0x00A646},
+{0x00A649, 0x00A648},
+{0x00A64B, 0x00A64A},
+{0x00A64D, 0x00A64C},
+{0x00A64F, 0x00A64E},
+{0x00A651, 0x00A650},
+{0x00A653, 0x00A652},
+{0x00A655, 0x00A654},
+{0x00A657, 0x00A656},
+{0x00A659, 0x00A658},
+{0x00A65B, 0x00A65A},
+{0x00A65D, 0x00A65C},
+{0x00A65F, 0x00A65E},
+{0x00A661, 0x00A660},
+{0x00A663, 0x00A662},
+{0x00A665, 0x00A664},
+{0x00A667, 0x00A666},
+{0x00A669, 0x00A668},
+{0x00A66B, 0x00A66A},
+{0x00A66D, 0x00A66C},
+{0x00A681, 0x00A680},
+{0x00A683, 0x00A682},
+{0x00A685, 0x00A684},
+{0x00A687, 0x00A686},
+{0x00A689, 0x00A688},
+{0x00A68B, 0x00A68A},
+{0x00A68D, 0x00A68C},
+{0x00A68F, 0x00A68E},
+{0x00A691, 0x00A690},
+{0x00A693, 0x00A692},
+{0x00A695, 0x00A694},
+{0x00A697, 0x00A696},
+{0x00A699, 0x00A698},
+{0x00A69B, 0x00A69A},
+{0x00A723, 0x00A722},
+{0x00A725, 0x00A724},
+{0x00A727, 0x00A726},
+{0x00A729, 0x00A728},
+{0x00A72B, 0x00A72A},
+{0x00A72D, 0x00A72C},
+{0x00A72F, 0x00A72E},
+{0x00A733, 0x00A732},
+{0x00A735, 0x00A734},
+{0x00A737, 0x00A736},
+{0x00A739, 0x00A738},
+{0x00A73B, 0x00A73A},
+{0x00A73D, 0x00A73C},
+{0x00A73F, 0x00A73E},
+{0x00A741, 0x00A740},
+{0x00A743, 0x00A742},
+{0x00A745, 0x00A744},
+{0x00A747, 0x00A746},
+{0x00A749, 0x00A748},
+{0x00A74B, 0x00A74A},
+{0x00A74D, 0x00A74C},
+{0x00A74F, 0x00A74E},
+{0x00A751, 0x00A750},
+{0x00A753, 0x00A752},
+{0x00A755, 0x00A754},
+{0x00A757, 0x00A756},
+{0x00A759, 0x00A758},
+{0x00A75B, 0x00A75A},
+{0x00A75D, 0x00A75C},
+{0x00A75F, 0x00A75E},
+{0x00A761, 0x00A760},
+{0x00A763, 0x00A762},
+{0x00A765, 0x00A764},
+{0x00A767, 0x00A766},
+{0x00A769, 0x00A768},
+{0x00A76B, 0x00A76A},
+{0x00A76D, 0x00A76C},
+{0x00A76F, 0x00A76E},
+{0x00A77A, 0x00A779},
+{0x00A77C, 0x00A77B},
+{0x00A77F, 0x00A77E},
+{0x00A781, 0x00A780},
+{0x00A783, 0x00A782},
+{0x00A785, 0x00A784},
+{0x00A787, 0x00A786},
+{0x00A78C, 0x00A78B},
+{0x00A791, 0x00A790},
+{0x00A793, 0x00A792},
+{0x00A794, 0x00A7C4},
+{0x00A797, 0x00A796},
+{0x00A799, 0x00A798},
+{0x00A79B, 0x00A79A},
+{0x00A79D, 0x00A79C},
+{0x00A79F, 0x00A79E},
+{0x00A7A1, 0x00A7A0},
+{0x00A7A3, 0x00A7A2},
+{0x00A7A5, 0x00A7A4},
+{0x00A7A7, 0x00A7A6},
+{0x00A7A9, 0x00A7A8},
+{0x00A7B5, 0x00A7B4},
+{0x00A7B7, 0x00A7B6},
+{0x00A7B9, 0x00A7B8},
+{0x00A7BB, 0x00A7BA},
+{0x00A7BD, 0x00A7BC},
+{0x00A7BF, 0x00A7BE},
+{0x00A7C1, 0x00A7C0},
+{0x00A7C3, 0x00A7C2},
+{0x00A7C8, 0x00A7C7},
+{0x00A7CA, 0x00A7C9},
+{0x00A7D1, 0x00A7D0},
+{0x00A7D7, 0x00A7D6},
+{0x00A7D9, 0x00A7D8},
+{0x00A7F6, 0x00A7F5},
+{0x00AB53, 0x00A7B3},
+{0x00AB70, 0x0013A0},
+{0x00AB71, 0x0013A1},
+{0x00AB72, 0x0013A2},
+{0x00AB73, 0x0013A3},
+{0x00AB74, 0x0013A4},
+{0x00AB75, 0x0013A5},
+{0x00AB76, 0x0013A6},
+{0x00AB77, 0x0013A7},
+{0x00AB78, 0x0013A8},
+{0x00AB79, 0x0013A9},
+{0x00AB7A, 0x0013AA},
+{0x00AB7B, 0x0013AB},
+{0x00AB7C, 0x0013AC},
+{0x00AB7D, 0x0013AD},
+{0x00AB7E, 0x0013AE},
+{0x00AB7F, 0x0013AF},
+{0x00AB80, 0x0013B0},
+{0x00AB81, 0x0013B1},
+{0x00AB82, 0x0013B2},
+{0x00AB83, 0x0013B3},
+{0x00AB84, 0x0013B4},
+{0x00AB85, 0x0013B5},
+{0x00AB86, 0x0013B6},
+{0x00AB87, 0x0013B7},
+{0x00AB88, 0x0013B8},
+{0x00AB89, 0x0013B9},
+{0x00AB8A, 0x0013BA},
+{0x00AB8B, 0x0013BB},
+{0x00AB8C, 0x0013BC},
+{0x00AB8D, 0x0013BD},
+{0x00AB8E, 0x0013BE},
+{0x00AB8F, 0x0013BF},
+{0x00AB90, 0x0013C0},
+{0x00AB91, 0x0013C1},
+{0x00AB92, 0x0013C2},
+{0x00AB93, 0x0013C3},
+{0x00AB94, 0x0013C4},
+{0x00AB95, 0x0013C5},
+{0x00AB96, 0x0013C6},
+{0x00AB97, 0x0013C7},
+{0x00AB98, 0x0013C8},
+{0x00AB99, 0x0013C9},
+{0x00AB9A, 0x0013CA},
+{0x00AB9B, 0x0013CB},
+{0x00AB9C, 0x0013CC},
+{0x00AB9D, 0x0013CD},
+{0x00AB9E, 0x0013CE},
+{0x00AB9F, 0x0013CF},
+{0x00ABA0, 0x0013D0},
+{0x00ABA1, 0x0013D1},
+{0x00ABA2, 0x0013D2},
+{0x00ABA3, 0x0013D3},
+{0x00ABA4, 0x0013D4},
+{0x00ABA5, 0x0013D5},
+{0x00ABA6, 0x0013D6},
+{0x00ABA7, 0x0013D7},
+{0x00ABA8, 0x0013D8},
+{0x00ABA9, 0x0013D9},
+{0x00ABAA, 0x0013DA},
+{0x00ABAB, 0x0013DB},
+{0x00ABAC, 0x0013DC},
+{0x00ABAD, 0x0013DD},
+{0x00ABAE, 0x0013DE},
+{0x00ABAF, 0x0013DF},
+{0x00ABB0, 0x0013E0},
+{0x00ABB1, 0x0013E1},
+{0x00ABB2, 0x0013E2},
+{0x00ABB3, 0x0013E3},
+{0x00ABB4, 0x0013E4},
+{0x00ABB5, 0x0013E5},
+{0x00ABB6, 0x0013E6},
+{0x00ABB7, 0x0013E7},
+{0x00ABB8, 0x0013E8},
+{0x00ABB9, 0x0013E9},
+{0x00ABBA, 0x0013EA},
+{0x00ABBB, 0x0013EB},
+{0x00ABBC, 0x0013EC},
+{0x00ABBD, 0x0013ED},
+{0x00ABBE, 0x0013EE},
+{0x00ABBF, 0x0013EF},
+{0x00FF41, 0x00FF21},
+{0x00FF42, 0x00FF22},
+{0x00FF43, 0x00FF23},
+{0x00FF44, 0x00FF24},
+{0x00FF45, 0x00FF25},
+{0x00FF46, 0x00FF26},
+{0x00FF47, 0x00FF27},
+{0x00FF48, 0x00FF28},
+{0x00FF49, 0x00FF29},
+{0x00FF4A, 0x00FF2A},
+{0x00FF4B, 0x00FF2B},
+{0x00FF4C, 0x00FF2C},
+{0x00FF4D, 0x00FF2D},
+{0x00FF4E, 0x00FF2E},
+{0x00FF4F, 0x00FF2F},
+{0x00FF50, 0x00FF30},
+{0x00FF51, 0x00FF31},
+{0x00FF52, 0x00FF32},
+{0x00FF53, 0x00FF33},
+{0x00FF54, 0x00FF34},
+{0x00FF55, 0x00FF35},
+{0x00FF56, 0x00FF36},
+{0x00FF57, 0x00FF37},
+{0x00FF58, 0x00FF38},
+{0x00FF59, 0x00FF39},
+{0x00FF5A, 0x00FF3A},
+{0x010428, 0x010400},
+{0x010429, 0x010401},
+{0x01042A, 0x010402},
+{0x01042B, 0x010403},
+{0x01042C, 0x010404},
+{0x01042D, 0x010405},
+{0x01042E, 0x010406},
+{0x01042F, 0x010407},
+{0x010430, 0x010408},
+{0x010431, 0x010409},
+{0x010432, 0x01040A},
+{0x010433, 0x01040B},
+{0x010434, 0x01040C},
+{0x010435, 0x01040D},
+{0x010436, 0x01040E},
+{0x010437, 0x01040F},
+{0x010438, 0x010410},
+{0x010439, 0x010411},
+{0x01043A, 0x010412},
+{0x01043B, 0x010413},
+{0x01043C, 0x010414},
+{0x01043D, 0x010415},
+{0x01043E, 0x010416},
+{0x01043F, 0x010417},
+{0x010440, 0x010418},
+{0x010441, 0x010419},
+{0x010442, 0x01041A},
+{0x010443, 0x01041B},
+{0x010444, 0x01041C},
+{0x010445, 0x01041D},
+{0x010446, 0x01041E},
+{0x010447, 0x01041F},
+{0x010448, 0x010420},
+{0x010449, 0x010421},
+{0x01044A, 0x010422},
+{0x01044B, 0x010423},
+{0x01044C, 0x010424},
+{0x01044D, 0x010425},
+{0x01044E, 0x010426},
+{0x01044F, 0x010427},
+{0x0104D8, 0x0104B0},
+{0x0104D9, 0x0104B1},
+{0x0104DA, 0x0104B2},
+{0x0104DB, 0x0104B3},
+{0x0104DC, 0x0104B4},
+{0x0104DD, 0x0104B5},
+{0x0104DE, 0x0104B6},
+{0x0104DF, 0x0104B7},
+{0x0104E0, 0x0104B8},
+{0x0104E1, 0x0104B9},
+{0x0104E2, 0x0104BA},
+{0x0104E3, 0x0104BB},
+{0x0104E4, 0x0104BC},
+{0x0104E5, 0x0104BD},
+{0x0104E6, 0x0104BE},
+{0x0104E7, 0x0104BF},
+{0x0104E8, 0x0104C0},
+{0x0104E9, 0x0104C1},
+{0x0104EA, 0x0104C2},
+{0x0104EB, 0x0104C3},
+{0x0104EC, 0x0104C4},
+{0x0104ED, 0x0104C5},
+{0x0104EE, 0x0104C6},
+{0x0104EF, 0x0104C7},
+{0x0104F0, 0x0104C8},
+{0x0104F1, 0x0104C9},
+{0x0104F2, 0x0104CA},
+{0x0104F3, 0x0104CB},
+{0x0104F4, 0x0104CC},
+{0x0104F5, 0x0104CD},
+{0x0104F6, 0x0104CE},
+{0x0104F7, 0x0104CF},
+{0x0104F8, 0x0104D0},
+{0x0104F9, 0x0104D1},
+{0x0104FA, 0x0104D2},
+{0x0104FB, 0x0104D3},
+{0x010597, 0x010570},
+{0x010598, 0x010571},
+{0x010599, 0x010572},
+{0x01059A, 0x010573},
+{0x01059B, 0x010574},
+{0x01059C, 0x010575},
+{0x01059D, 0x010576},
+{0x01059E, 0x010577},
+{0x01059F, 0x010578},
+{0x0105A0, 0x010579},
+{0x0105A1, 0x01057A},
+{0x0105A3, 0x01057C},
+{0x0105A4, 0x01057D},
+{0x0105A5, 0x01057E},
+{0x0105A6, 0x01057F},
+{0x0105A7, 0x010580},
+{0x0105A8, 0x010581},
+{0x0105A9, 0x010582},
+{0x0105AA, 0x010583},
+{0x0105AB, 0x010584},
+{0x0105AC, 0x010585},
+{0x0105AD, 0x010586},
+{0x0105AE, 0x010587},
+{0x0105AF, 0x010588},
+{0x0105B0, 0x010589},
+{0x0105B1, 0x01058A},
+{0x0105B3, 0x01058C},
+{0x0105B4, 0x01058D},
+{0x0105B5, 0x01058E},
+{0x0105B6, 0x01058F},
+{0x0105B7, 0x010590},
+{0x0105B8, 0x010591},
+{0x0105B9, 0x010592},
+{0x0105BB, 0x010594},
+{0x0105BC, 0x010595},
+{0x010CC0, 0x010C80},
+{0x010CC1, 0x010C81},
+{0x010CC2, 0x010C82},
+{0x010CC3, 0x010C83},
+{0x010CC4, 0x010C84},
+{0x010CC5, 0x010C85},
+{0x010CC6, 0x010C86},
+{0x010CC7, 0x010C87},
+{0x010CC8, 0x010C88},
+{0x010CC9, 0x010C89},
+{0x010CCA, 0x010C8A},
+{0x010CCB, 0x010C8B},
+{0x010CCC, 0x010C8C},
+{0x010CCD, 0x010C8D},
+{0x010CCE, 0x010C8E},
+{0x010CCF, 0x010C8F},
+{0x010CD0, 0x010C90},
+{0x010CD1, 0x010C91},
+{0x010CD2, 0x010C92},
+{0x010CD3, 0x010C93},
+{0x010CD4, 0x010C94},
+{0x010CD5, 0x010C95},
+{0x010CD6, 0x010C96},
+{0x010CD7, 0x010C97},
+{0x010CD8, 0x010C98},
+{0x010CD9, 0x010C99},
+{0x010CDA, 0x010C9A},
+{0x010CDB, 0x010C9B},
+{0x010CDC, 0x010C9C},
+{0x010CDD, 0x010C9D},
+{0x010CDE, 0x010C9E},
+{0x010CDF, 0x010C9F},
+{0x010CE0, 0x010CA0},
+{0x010CE1, 0x010CA1},
+{0x010CE2, 0x010CA2},
+{0x010CE3, 0x010CA3},
+{0x010CE4, 0x010CA4},
+{0x010CE5, 0x010CA5},
+{0x010CE6, 0x010CA6},
+{0x010CE7, 0x010CA7},
+{0x010CE8, 0x010CA8},
+{0x010CE9, 0x010CA9},
+{0x010CEA, 0x010CAA},
+{0x010CEB, 0x010CAB},
+{0x010CEC, 0x010CAC},
+{0x010CED, 0x010CAD},
+{0x010CEE, 0x010CAE},
+{0x010CEF, 0x010CAF},
+{0x010CF0, 0x010CB0},
+{0x010CF1, 0x010CB1},
+{0x010CF2, 0x010CB2},
+{0x0118C0, 0x0118A0},
+{0x0118C1, 0x0118A1},
+{0x0118C2, 0x0118A2},
+{0x0118C3, 0x0118A3},
+{0x0118C4, 0x0118A4},
+{0x0118C5, 0x0118A5},
+{0x0118C6, 0x0118A6},
+{0x0118C7, 0x0118A7},
+{0x0118C8, 0x0118A8},
+{0x0118C9, 0x0118A9},
+{0x0118CA, 0x0118AA},
+{0x0118CB, 0x0118AB},
+{0x0118CC, 0x0118AC},
+{0x0118CD, 0x0118AD},
+{0x0118CE, 0x0118AE},
+{0x0118CF, 0x0118AF},
+{0x0118D0, 0x0118B0},
+{0x0118D1, 0x0118B1},
+{0x0118D2, 0x0118B2},
+{0x0118D3, 0x0118B3},
+{0x0118D4, 0x0118B4},
+{0x0118D5, 0x0118B5},
+{0x0118D6, 0x0118B6},
+{0x0118D7, 0x0118B7},
+{0x0118D8, 0x0118B8},
+{0x0118D9, 0x0118B9},
+{0x0118DA, 0x0118BA},
+{0x0118DB, 0x0118BB},
+{0x0118DC, 0x0118BC},
+{0x0118DD, 0x0118BD},
+{0x0118DE, 0x0118BE},
+{0x0118DF, 0x0118BF},
+{0x016E60, 0x016E40},
+{0x016E61, 0x016E41},
+{0x016E62, 0x016E42},
+{0x016E63, 0x016E43},
+{0x016E64, 0x016E44},
+{0x016E65, 0x016E45},
+{0x016E66, 0x016E46},
+{0x016E67, 0x016E47},
+{0x016E68, 0x016E48},
+{0x016E69, 0x016E49},
+{0x016E6A, 0x016E4A},
+{0x016E6B, 0x016E4B},
+{0x016E6C, 0x016E4C},
+{0x016E6D, 0x016E4D},
+{0x016E6E, 0x016E4E},
+{0x016E6F, 0x016E4F},
+{0x016E70, 0x016E50},
+{0x016E71, 0x016E51},
+{0x016E72, 0x016E52},
+{0x016E73, 0x016E53},
+{0x016E74, 0x016E54},
+{0x016E75, 0x016E55},
+{0x016E76, 0x016E56},
+{0x016E77, 0x016E57},
+{0x016E78, 0x016E58},
+{0x016E79, 0x016E59},
+{0x016E7A, 0x016E5A},
+{0x016E7B, 0x016E5B},
+{0x016E7C, 0x016E5C},
+{0x016E7D, 0x016E5D},
+{0x016E7E, 0x016E5E},
+{0x016E7F, 0x016E5F},
+{0x01E922, 0x01E900},
+{0x01E923, 0x01E901},
+{0x01E924, 0x01E902},
+{0x01E925, 0x01E903},
+{0x01E926, 0x01E904},
+{0x01E927, 0x01E905},
+{0x01E928, 0x01E906},
+{0x01E929, 0x01E907},
+{0x01E92A, 0x01E908},
+{0x01E92B, 0x01E909},
+{0x01E92C, 0x01E90A},
+{0x01E92D, 0x01E90B},
+{0x01E92E, 0x01E90C},
+{0x01E92F, 0x01E90D},
+{0x01E930, 0x01E90E},
+{0x01E931, 0x01E90F},
+{0x01E932, 0x01E910},
+{0x01E933, 0x01E911},
+{0x01E934, 0x01E912},
+{0x01E935, 0x01E913},
+{0x01E936, 0x01E914},
+{0x01E937, 0x01E915},
+{0x01E938, 0x01E916},
+{0x01E939, 0x01E917},
+{0x01E93A, 0x01E918},
+{0x01E93B, 0x01E919},
+{0x01E93C, 0x01E91A},
+{0x01E93D, 0x01E91B},
+{0x01E93E, 0x01E91C},
+{0x01E93F, 0x01E91D},
+{0x01E940, 0x01E91E},
+{0x01E941, 0x01E91F},
+{0x01E942, 0x01E920},
+{0x01E943, 0x01E921},
+};
+
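+// The pair table above maps each lowercase codepoint to its uppercase form;
+// the table below maps codepoint ranges to their NFD base character.
+// A minimal lookup sketch (an assumption, not generator output: it presumes
+// range_nfd exposes first/last/nfd fields and that the ranges stay sorted
+// by their first codepoint, as emitted above):
+//
+//   static const std::vector<range_nfd> nfd(unicode_ranges_nfd);
+//   // first range whose start lies strictly beyond cpt
+//   auto it = std::upper_bound(nfd.begin(), nfd.end(), cpt,
+//       [](uint32_t c, const range_nfd & r) { return c < r.first; });
+//   // step back one range; map only if cpt falls inside it
+//   uint32_t base = (it != nfd.begin() && cpt <= (it - 1)->last)
+//                       ? (it - 1)->nfd : cpt;
+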
+const std::initializer_list<range_nfd> unicode_ranges_nfd = { // start, last, nfd
+{0x000000, 0x000000, 0x000000},
+{0x0000C0, 0x0000C5, 0x000041},
+{0x0000C7, 0x0000C7, 0x000043},
+{0x0000C8, 0x0000CB, 0x000045},
+{0x0000CC, 0x0000CF, 0x000049},
+{0x0000D1, 0x0000D1, 0x00004E},
+{0x0000D2, 0x0000D6, 0x00004F},
+{0x0000D9, 0x0000DC, 0x000055},
+{0x0000DD, 0x0000DD, 0x000059},
+{0x0000E0, 0x0000E5, 0x000061},
+{0x0000E7, 0x0000E7, 0x000063},
+{0x0000E8, 0x0000EB, 0x000065},
+{0x0000EC, 0x0000EF, 0x000069},
+{0x0000F1, 0x0000F1, 0x00006E},
+{0x0000F2, 0x0000F6, 0x00006F},
+{0x0000F9, 0x0000FC, 0x000075},
+{0x0000FD, 0x0000FD, 0x000079},
+{0x0000FF, 0x0000FF, 0x000079},
+{0x000100, 0x000100, 0x000041},
+{0x000101, 0x000101, 0x000061},
+{0x000102, 0x000102, 0x000041},
+{0x000103, 0x000103, 0x000061},
+{0x000104, 0x000104, 0x000041},
+{0x000105, 0x000105, 0x000061},
+{0x000106, 0x000106, 0x000043},
+{0x000107, 0x000107, 0x000063},
+{0x000108, 0x000108, 0x000043},
+{0x000109, 0x000109, 0x000063},
+{0x00010A, 0x00010A, 0x000043},
+{0x00010B, 0x00010B, 0x000063},
+{0x00010C, 0x00010C, 0x000043},
+{0x00010D, 0x00010D, 0x000063},
+{0x00010E, 0x00010E, 0x000044},
+{0x00010F, 0x00010F, 0x000064},
+{0x000112, 0x000112, 0x000045},
+{0x000113, 0x000113, 0x000065},
+{0x000114, 0x000114, 0x000045},
+{0x000115, 0x000115, 0x000065},
+{0x000116, 0x000116, 0x000045},
+{0x000117, 0x000117, 0x000065},
+{0x000118, 0x000118, 0x000045},
+{0x000119, 0x000119, 0x000065},
+{0x00011A, 0x00011A, 0x000045},
+{0x00011B, 0x00011B, 0x000065},
+{0x00011C, 0x00011C, 0x000047},
+{0x00011D, 0x00011D, 0x000067},
+{0x00011E, 0x00011E, 0x000047},
+{0x00011F, 0x00011F, 0x000067},
+{0x000120, 0x000120, 0x000047},
+{0x000121, 0x000121, 0x000067},
+{0x000122, 0x000122, 0x000047},
+{0x000123, 0x000123, 0x000067},
+{0x000124, 0x000124, 0x000048},
+{0x000125, 0x000125, 0x000068},
+{0x000128, 0x000128, 0x000049},
+{0x000129, 0x000129, 0x000069},
+{0x00012A, 0x00012A, 0x000049},
+{0x00012B, 0x00012B, 0x000069},
+{0x00012C, 0x00012C, 0x000049},
+{0x00012D, 0x00012D, 0x000069},
+{0x00012E, 0x00012E, 0x000049},
+{0x00012F, 0x00012F, 0x000069},
+{0x000130, 0x000130, 0x000049},
+{0x000134, 0x000134, 0x00004A},
+{0x000135, 0x000135, 0x00006A},
+{0x000136, 0x000136, 0x00004B},
+{0x000137, 0x000137, 0x00006B},
+{0x000139, 0x000139, 0x00004C},
+{0x00013A, 0x00013A, 0x00006C},
+{0x00013B, 0x00013B, 0x00004C},
+{0x00013C, 0x00013C, 0x00006C},
+{0x00013D, 0x00013D, 0x00004C},
+{0x00013E, 0x00013E, 0x00006C},
+{0x000143, 0x000143, 0x00004E},
+{0x000144, 0x000144, 0x00006E},
+{0x000145, 0x000145, 0x00004E},
+{0x000146, 0x000146, 0x00006E},
+{0x000147, 0x000147, 0x00004E},
+{0x000148, 0x000148, 0x00006E},
+{0x00014C, 0x00014C, 0x00004F},
+{0x00014D, 0x00014D, 0x00006F},
+{0x00014E, 0x00014E, 0x00004F},
+{0x00014F, 0x00014F, 0x00006F},
+{0x000150, 0x000150, 0x00004F},
+{0x000151, 0x000151, 0x00006F},
+{0x000154, 0x000154, 0x000052},
+{0x000155, 0x000155, 0x000072},
+{0x000156, 0x000156, 0x000052},
+{0x000157, 0x000157, 0x000072},
+{0x000158, 0x000158, 0x000052},
+{0x000159, 0x000159, 0x000072},
+{0x00015A, 0x00015A, 0x000053},
+{0x00015B, 0x00015B, 0x000073},
+{0x00015C, 0x00015C, 0x000053},
+{0x00015D, 0x00015D, 0x000073},
+{0x00015E, 0x00015E, 0x000053},
+{0x00015F, 0x00015F, 0x000073},
+{0x000160, 0x000160, 0x000053},
+{0x000161, 0x000161, 0x000073},
+{0x000162, 0x000162, 0x000054},
+{0x000163, 0x000163, 0x000074},
+{0x000164, 0x000164, 0x000054},
+{0x000165, 0x000165, 0x000074},
+{0x000168, 0x000168, 0x000055},
+{0x000169, 0x000169, 0x000075},
+{0x00016A, 0x00016A, 0x000055},
+{0x00016B, 0x00016B, 0x000075},
+{0x00016C, 0x00016C, 0x000055},
+{0x00016D, 0x00016D, 0x000075},
+{0x00016E, 0x00016E, 0x000055},
+{0x00016F, 0x00016F, 0x000075},
+{0x000170, 0x000170, 0x000055},
+{0x000171, 0x000171, 0x000075},
+{0x000172, 0x000172, 0x000055},
+{0x000173, 0x000173, 0x000075},
+{0x000174, 0x000174, 0x000057},
+{0x000175, 0x000175, 0x000077},
+{0x000176, 0x000176, 0x000059},
+{0x000177, 0x000177, 0x000079},
+{0x000178, 0x000178, 0x000059},
+{0x000179, 0x000179, 0x00005A},
+{0x00017A, 0x00017A, 0x00007A},
+{0x00017B, 0x00017B, 0x00005A},
+{0x00017C, 0x00017C, 0x00007A},
+{0x00017D, 0x00017D, 0x00005A},
+{0x00017E, 0x00017E, 0x00007A},
+{0x0001A0, 0x0001A0, 0x00004F},
+{0x0001A1, 0x0001A1, 0x00006F},
+{0x0001AF, 0x0001AF, 0x000055},
+{0x0001B0, 0x0001B0, 0x000075},
+{0x0001CD, 0x0001CD, 0x000041},
+{0x0001CE, 0x0001CE, 0x000061},
+{0x0001CF, 0x0001CF, 0x000049},
+{0x0001D0, 0x0001D0, 0x000069},
+{0x0001D1, 0x0001D1, 0x00004F},
+{0x0001D2, 0x0001D2, 0x00006F},
+{0x0001D3, 0x0001D3, 0x000055},
+{0x0001D4, 0x0001D4, 0x000075},
+{0x0001D5, 0x0001D5, 0x000055},
+{0x0001D6, 0x0001D6, 0x000075},
+{0x0001D7, 0x0001D7, 0x000055},
+{0x0001D8, 0x0001D8, 0x000075},
+{0x0001D9, 0x0001D9, 0x000055},
+{0x0001DA, 0x0001DA, 0x000075},
+{0x0001DB, 0x0001DB, 0x000055},
+{0x0001DC, 0x0001DC, 0x000075},
+{0x0001DE, 0x0001DE, 0x000041},
+{0x0001DF, 0x0001DF, 0x000061},
+{0x0001E0, 0x0001E0, 0x000041},
+{0x0001E1, 0x0001E1, 0x000061},
+{0x0001E2, 0x0001E2, 0x0000C6},
+{0x0001E3, 0x0001E3, 0x0000E6},
+{0x0001E6, 0x0001E6, 0x000047},
+{0x0001E7, 0x0001E7, 0x000067},
+{0x0001E8, 0x0001E8, 0x00004B},
+{0x0001E9, 0x0001E9, 0x00006B},
+{0x0001EA, 0x0001EA, 0x00004F},
+{0x0001EB, 0x0001EB, 0x00006F},
+{0x0001EC, 0x0001EC, 0x00004F},
+{0x0001ED, 0x0001ED, 0x00006F},
+{0x0001EE, 0x0001EE, 0x0001B7},
+{0x0001EF, 0x0001EF, 0x000292},
+{0x0001F0, 0x0001F0, 0x00006A},
+{0x0001F4, 0x0001F4, 0x000047},
+{0x0001F5, 0x0001F5, 0x000067},
+{0x0001F8, 0x0001F8, 0x00004E},
+{0x0001F9, 0x0001F9, 0x00006E},
+{0x0001FA, 0x0001FA, 0x000041},
+{0x0001FB, 0x0001FB, 0x000061},
+{0x0001FC, 0x0001FC, 0x0000C6},
+{0x0001FD, 0x0001FD, 0x0000E6},
+{0x0001FE, 0x0001FE, 0x0000D8},
+{0x0001FF, 0x0001FF, 0x0000F8},
+{0x000200, 0x000200, 0x000041},
+{0x000201, 0x000201, 0x000061},
+{0x000202, 0x000202, 0x000041},
+{0x000203, 0x000203, 0x000061},
+{0x000204, 0x000204, 0x000045},
+{0x000205, 0x000205, 0x000065},
+{0x000206, 0x000206, 0x000045},
+{0x000207, 0x000207, 0x000065},
+{0x000208, 0x000208, 0x000049},
+{0x000209, 0x000209, 0x000069},
+{0x00020A, 0x00020A, 0x000049},
+{0x00020B, 0x00020B, 0x000069},
+{0x00020C, 0x00020C, 0x00004F},
+{0x00020D, 0x00020D, 0x00006F},
+{0x00020E, 0x00020E, 0x00004F},
+{0x00020F, 0x00020F, 0x00006F},
+{0x000210, 0x000210, 0x000052},
+{0x000211, 0x000211, 0x000072},
+{0x000212, 0x000212, 0x000052},
+{0x000213, 0x000213, 0x000072},
+{0x000214, 0x000214, 0x000055},
+{0x000215, 0x000215, 0x000075},
+{0x000216, 0x000216, 0x000055},
+{0x000217, 0x000217, 0x000075},
+{0x000218, 0x000218, 0x000053},
+{0x000219, 0x000219, 0x000073},
+{0x00021A, 0x00021A, 0x000054},
+{0x00021B, 0x00021B, 0x000074},
+{0x00021E, 0x00021E, 0x000048},
+{0x00021F, 0x00021F, 0x000068},
+{0x000226, 0x000226, 0x000041},
+{0x000227, 0x000227, 0x000061},
+{0x000228, 0x000228, 0x000045},
+{0x000229, 0x000229, 0x000065},
+{0x00022A, 0x00022A, 0x00004F},
+{0x00022B, 0x00022B, 0x00006F},
+{0x00022C, 0x00022C, 0x00004F},
+{0x00022D, 0x00022D, 0x00006F},
+{0x00022E, 0x00022E, 0x00004F},
+{0x00022F, 0x00022F, 0x00006F},
+{0x000230, 0x000230, 0x00004F},
+{0x000231, 0x000231, 0x00006F},
+{0x000232, 0x000232, 0x000059},
+{0x000233, 0x000233, 0x000079},
+{0x000340, 0x000340, 0x000300},
+{0x000341, 0x000341, 0x000301},
+{0x000343, 0x000343, 0x000313},
+{0x000344, 0x000344, 0x000308},
+{0x000374, 0x000374, 0x0002B9},
+{0x00037E, 0x00037E, 0x00003B},
+{0x000385, 0x000385, 0x0000A8},
+{0x000386, 0x000386, 0x000391},
+{0x000387, 0x000387, 0x0000B7},
+{0x000388, 0x000388, 0x000395},
+{0x000389, 0x000389, 0x000397},
+{0x00038A, 0x00038A, 0x000399},
+{0x00038C, 0x00038C, 0x00039F},
+{0x00038E, 0x00038E, 0x0003A5},
+{0x00038F, 0x00038F, 0x0003A9},
+{0x000390, 0x000390, 0x0003B9},
+{0x0003AA, 0x0003AA, 0x000399},
+{0x0003AB, 0x0003AB, 0x0003A5},
+{0x0003AC, 0x0003AC, 0x0003B1},
+{0x0003AD, 0x0003AD, 0x0003B5},
+{0x0003AE, 0x0003AE, 0x0003B7},
+{0x0003AF, 0x0003AF, 0x0003B9},
+{0x0003B0, 0x0003B0, 0x0003C5},
+{0x0003CA, 0x0003CA, 0x0003B9},
+{0x0003CB, 0x0003CB, 0x0003C5},
+{0x0003CC, 0x0003CC, 0x0003BF},
+{0x0003CD, 0x0003CD, 0x0003C5},
+{0x0003CE, 0x0003CE, 0x0003C9},
+{0x0003D3, 0x0003D4, 0x0003D2},
+{0x000400, 0x000401, 0x000415},
+{0x000403, 0x000403, 0x000413},
+{0x000407, 0x000407, 0x000406},
+{0x00040C, 0x00040C, 0x00041A},
+{0x00040D, 0x00040D, 0x000418},
+{0x00040E, 0x00040E, 0x000423},
+{0x000419, 0x000419, 0x000418},
+{0x000439, 0x000439, 0x000438},
+{0x000450, 0x000451, 0x000435},
+{0x000453, 0x000453, 0x000433},
+{0x000457, 0x000457, 0x000456},
+{0x00045C, 0x00045C, 0x00043A},
+{0x00045D, 0x00045D, 0x000438},
+{0x00045E, 0x00045E, 0x000443},
+{0x000476, 0x000476, 0x000474},
+{0x000477, 0x000477, 0x000475},
+{0x0004C1, 0x0004C1, 0x000416},
+{0x0004C2, 0x0004C2, 0x000436},
+{0x0004D0, 0x0004D0, 0x000410},
+{0x0004D1, 0x0004D1, 0x000430},
+{0x0004D2, 0x0004D2, 0x000410},
+{0x0004D3, 0x0004D3, 0x000430},
+{0x0004D6, 0x0004D6, 0x000415},
+{0x0004D7, 0x0004D7, 0x000435},
+{0x0004DA, 0x0004DA, 0x0004D8},
+{0x0004DB, 0x0004DB, 0x0004D9},
+{0x0004DC, 0x0004DC, 0x000416},
+{0x0004DD, 0x0004DD, 0x000436},
+{0x0004DE, 0x0004DE, 0x000417},
+{0x0004DF, 0x0004DF, 0x000437},
+{0x0004E2, 0x0004E2, 0x000418},
+{0x0004E3, 0x0004E3, 0x000438},
+{0x0004E4, 0x0004E4, 0x000418},
+{0x0004E5, 0x0004E5, 0x000438},
+{0x0004E6, 0x0004E6, 0x00041E},
+{0x0004E7, 0x0004E7, 0x00043E},
+{0x0004EA, 0x0004EA, 0x0004E8},
+{0x0004EB, 0x0004EB, 0x0004E9},
+{0x0004EC, 0x0004EC, 0x00042D},
+{0x0004ED, 0x0004ED, 0x00044D},
+{0x0004EE, 0x0004EE, 0x000423},
+{0x0004EF, 0x0004EF, 0x000443},
+{0x0004F0, 0x0004F0, 0x000423},
+{0x0004F1, 0x0004F1, 0x000443},
+{0x0004F2, 0x0004F2, 0x000423},
+{0x0004F3, 0x0004F3, 0x000443},
+{0x0004F4, 0x0004F4, 0x000427},
+{0x0004F5, 0x0004F5, 0x000447},
+{0x0004F8, 0x0004F8, 0x00042B},
+{0x0004F9, 0x0004F9, 0x00044B},
+{0x000622, 0x000623, 0x000627},
+{0x000624, 0x000624, 0x000648},
+{0x000625, 0x000625, 0x000627},
+{0x000626, 0x000626, 0x00064A},
+{0x0006C0, 0x0006C0, 0x0006D5},
+{0x0006C2, 0x0006C2, 0x0006C1},
+{0x0006D3, 0x0006D3, 0x0006D2},
+{0x000929, 0x000929, 0x000928},
+{0x000931, 0x000931, 0x000930},
+{0x000934, 0x000934, 0x000933},
+{0x000958, 0x000958, 0x000915},
+{0x000959, 0x000959, 0x000916},
+{0x00095A, 0x00095A, 0x000917},
+{0x00095B, 0x00095B, 0x00091C},
+{0x00095C, 0x00095C, 0x000921},
+{0x00095D, 0x00095D, 0x000922},
+{0x00095E, 0x00095E, 0x00092B},
+{0x00095F, 0x00095F, 0x00092F},
+{0x0009CB, 0x0009CC, 0x0009C7},
+{0x0009DC, 0x0009DC, 0x0009A1},
+{0x0009DD, 0x0009DD, 0x0009A2},
+{0x0009DF, 0x0009DF, 0x0009AF},
+{0x000A33, 0x000A33, 0x000A32},
+{0x000A36, 0x000A36, 0x000A38},
+{0x000A59, 0x000A59, 0x000A16},
+{0x000A5A, 0x000A5A, 0x000A17},
+{0x000A5B, 0x000A5B, 0x000A1C},
+{0x000A5E, 0x000A5E, 0x000A2B},
+{0x000B48, 0x000B48, 0x000B47},
+{0x000B4B, 0x000B4C, 0x000B47},
+{0x000B5C, 0x000B5C, 0x000B21},
+{0x000B5D, 0x000B5D, 0x000B22},
+{0x000B94, 0x000B94, 0x000B92},
+{0x000BCA, 0x000BCA, 0x000BC6},
+{0x000BCB, 0x000BCB, 0x000BC7},
+{0x000BCC, 0x000BCC, 0x000BC6},
+{0x000C48, 0x000C48, 0x000C46},
+{0x000CC0, 0x000CC0, 0x000CBF},
+{0x000CC7, 0x000CC8, 0x000CC6},
+{0x000CCA, 0x000CCB, 0x000CC6},
+{0x000D4A, 0x000D4A, 0x000D46},
+{0x000D4B, 0x000D4B, 0x000D47},
+{0x000D4C, 0x000D4C, 0x000D46},
+{0x000DDA, 0x000DDA, 0x000DD9},
+{0x000DDC, 0x000DDE, 0x000DD9},
+{0x000F43, 0x000F43, 0x000F42},
+{0x000F4D, 0x000F4D, 0x000F4C},
+{0x000F52, 0x000F52, 0x000F51},
+{0x000F57, 0x000F57, 0x000F56},
+{0x000F5C, 0x000F5C, 0x000F5B},
+{0x000F69, 0x000F69, 0x000F40},
+{0x000F73, 0x000F73, 0x000F71},
+{0x000F75, 0x000F75, 0x000F71},
+{0x000F76, 0x000F76, 0x000FB2},
+{0x000F78, 0x000F78, 0x000FB3},
+{0x000F81, 0x000F81, 0x000F71},
+{0x000F93, 0x000F93, 0x000F92},
+{0x000F9D, 0x000F9D, 0x000F9C},
+{0x000FA2, 0x000FA2, 0x000FA1},
+{0x000FA7, 0x000FA7, 0x000FA6},
+{0x000FAC, 0x000FAC, 0x000FAB},
+{0x000FB9, 0x000FB9, 0x000F90},
+{0x001026, 0x001026, 0x001025},
+{0x001B06, 0x001B06, 0x001B05},
+{0x001B08, 0x001B08, 0x001B07},
+{0x001B0A, 0x001B0A, 0x001B09},
+{0x001B0C, 0x001B0C, 0x001B0B},
+{0x001B0E, 0x001B0E, 0x001B0D},
+{0x001B12, 0x001B12, 0x001B11},
+{0x001B3B, 0x001B3B, 0x001B3A},
+{0x001B3D, 0x001B3D, 0x001B3C},
+{0x001B40, 0x001B40, 0x001B3E},
+{0x001B41, 0x001B41, 0x001B3F},
+{0x001B43, 0x001B43, 0x001B42},
+{0x001E00, 0x001E00, 0x000041},
+{0x001E01, 0x001E01, 0x000061},
+{0x001E02, 0x001E02, 0x000042},
+{0x001E03, 0x001E03, 0x000062},
+{0x001E04, 0x001E04, 0x000042},
+{0x001E05, 0x001E05, 0x000062},
+{0x001E06, 0x001E06, 0x000042},
+{0x001E07, 0x001E07, 0x000062},
+{0x001E08, 0x001E08, 0x000043},
+{0x001E09, 0x001E09, 0x000063},
+{0x001E0A, 0x001E0A, 0x000044},
+{0x001E0B, 0x001E0B, 0x000064},
+{0x001E0C, 0x001E0C, 0x000044},
+{0x001E0D, 0x001E0D, 0x000064},
+{0x001E0E, 0x001E0E, 0x000044},
+{0x001E0F, 0x001E0F, 0x000064},
+{0x001E10, 0x001E10, 0x000044},
+{0x001E11, 0x001E11, 0x000064},
+{0x001E12, 0x001E12, 0x000044},
+{0x001E13, 0x001E13, 0x000064},
+{0x001E14, 0x001E14, 0x000045},
+{0x001E15, 0x001E15, 0x000065},
+{0x001E16, 0x001E16, 0x000045},
+{0x001E17, 0x001E17, 0x000065},
+{0x001E18, 0x001E18, 0x000045},
+{0x001E19, 0x001E19, 0x000065},
+{0x001E1A, 0x001E1A, 0x000045},
+{0x001E1B, 0x001E1B, 0x000065},
+{0x001E1C, 0x001E1C, 0x000045},
+{0x001E1D, 0x001E1D, 0x000065},
+{0x001E1E, 0x001E1E, 0x000046},
+{0x001E1F, 0x001E1F, 0x000066},
+{0x001E20, 0x001E20, 0x000047},
+{0x001E21, 0x001E21, 0x000067},
+{0x001E22, 0x001E22, 0x000048},
+{0x001E23, 0x001E23, 0x000068},
+{0x001E24, 0x001E24, 0x000048},
+{0x001E25, 0x001E25, 0x000068},
+{0x001E26, 0x001E26, 0x000048},
+{0x001E27, 0x001E27, 0x000068},
+{0x001E28, 0x001E28, 0x000048},
+{0x001E29, 0x001E29, 0x000068},
+{0x001E2A, 0x001E2A, 0x000048},
+{0x001E2B, 0x001E2B, 0x000068},
+{0x001E2C, 0x001E2C, 0x000049},
+{0x001E2D, 0x001E2D, 0x000069},
+{0x001E2E, 0x001E2E, 0x000049},
+{0x001E2F, 0x001E2F, 0x000069},
+{0x001E30, 0x001E30, 0x00004B},
+{0x001E31, 0x001E31, 0x00006B},
+{0x001E32, 0x001E32, 0x00004B},
+{0x001E33, 0x001E33, 0x00006B},
+{0x001E34, 0x001E34, 0x00004B},
+{0x001E35, 0x001E35, 0x00006B},
+{0x001E36, 0x001E36, 0x00004C},
+{0x001E37, 0x001E37, 0x00006C},
+{0x001E38, 0x001E38, 0x00004C},
+{0x001E39, 0x001E39, 0x00006C},
+{0x001E3A, 0x001E3A, 0x00004C},
+{0x001E3B, 0x001E3B, 0x00006C},
+{0x001E3C, 0x001E3C, 0x00004C},
+{0x001E3D, 0x001E3D, 0x00006C},
+{0x001E3E, 0x001E3E, 0x00004D},
+{0x001E3F, 0x001E3F, 0x00006D},
+{0x001E40, 0x001E40, 0x00004D},
+{0x001E41, 0x001E41, 0x00006D},
+{0x001E42, 0x001E42, 0x00004D},
+{0x001E43, 0x001E43, 0x00006D},
+{0x001E44, 0x001E44, 0x00004E},
+{0x001E45, 0x001E45, 0x00006E},
+{0x001E46, 0x001E46, 0x00004E},
+{0x001E47, 0x001E47, 0x00006E},
+{0x001E48, 0x001E48, 0x00004E},
+{0x001E49, 0x001E49, 0x00006E},
+{0x001E4A, 0x001E4A, 0x00004E},
+{0x001E4B, 0x001E4B, 0x00006E},
+{0x001E4C, 0x001E4C, 0x00004F},
+{0x001E4D, 0x001E4D, 0x00006F},
+{0x001E4E, 0x001E4E, 0x00004F},
+{0x001E4F, 0x001E4F, 0x00006F},
+{0x001E50, 0x001E50, 0x00004F},
+{0x001E51, 0x001E51, 0x00006F},
+{0x001E52, 0x001E52, 0x00004F},
+{0x001E53, 0x001E53, 0x00006F},
+{0x001E54, 0x001E54, 0x000050},
+{0x001E55, 0x001E55, 0x000070},
+{0x001E56, 0x001E56, 0x000050},
+{0x001E57, 0x001E57, 0x000070},
+{0x001E58, 0x001E58, 0x000052},
+{0x001E59, 0x001E59, 0x000072},
+{0x001E5A, 0x001E5A, 0x000052},
+{0x001E5B, 0x001E5B, 0x000072},
+{0x001E5C, 0x001E5C, 0x000052},
+{0x001E5D, 0x001E5D, 0x000072},
+{0x001E5E, 0x001E5E, 0x000052},
+{0x001E5F, 0x001E5F, 0x000072},
+{0x001E60, 0x001E60, 0x000053},
+{0x001E61, 0x001E61, 0x000073},
+{0x001E62, 0x001E62, 0x000053},
+{0x001E63, 0x001E63, 0x000073},
+{0x001E64, 0x001E64, 0x000053},
+{0x001E65, 0x001E65, 0x000073},
+{0x001E66, 0x001E66, 0x000053},
+{0x001E67, 0x001E67, 0x000073},
+{0x001E68, 0x001E68, 0x000053},
+{0x001E69, 0x001E69, 0x000073},
+{0x001E6A, 0x001E6A, 0x000054},
+{0x001E6B, 0x001E6B, 0x000074},
+{0x001E6C, 0x001E6C, 0x000054},
+{0x001E6D, 0x001E6D, 0x000074},
+{0x001E6E, 0x001E6E, 0x000054},
+{0x001E6F, 0x001E6F, 0x000074},
+{0x001E70, 0x001E70, 0x000054},
+{0x001E71, 0x001E71, 0x000074},
+{0x001E72, 0x001E72, 0x000055},
+{0x001E73, 0x001E73, 0x000075},
+{0x001E74, 0x001E74, 0x000055},
+{0x001E75, 0x001E75, 0x000075},
+{0x001E76, 0x001E76, 0x000055},
+{0x001E77, 0x001E77, 0x000075},
+{0x001E78, 0x001E78, 0x000055},
+{0x001E79, 0x001E79, 0x000075},
+{0x001E7A, 0x001E7A, 0x000055},
+{0x001E7B, 0x001E7B, 0x000075},
+{0x001E7C, 0x001E7C, 0x000056},
+{0x001E7D, 0x001E7D, 0x000076},
+{0x001E7E, 0x001E7E, 0x000056},
+{0x001E7F, 0x001E7F, 0x000076},
+{0x001E80, 0x001E80, 0x000057},
+{0x001E81, 0x001E81, 0x000077},
+{0x001E82, 0x001E82, 0x000057},
+{0x001E83, 0x001E83, 0x000077},
+{0x001E84, 0x001E84, 0x000057},
+{0x001E85, 0x001E85, 0x000077},
+{0x001E86, 0x001E86, 0x000057},
+{0x001E87, 0x001E87, 0x000077},
+{0x001E88, 0x001E88, 0x000057},
+{0x001E89, 0x001E89, 0x000077},
+{0x001E8A, 0x001E8A, 0x000058},
+{0x001E8B, 0x001E8B, 0x000078},
+{0x001E8C, 0x001E8C, 0x000058},
+{0x001E8D, 0x001E8D, 0x000078},
+{0x001E8E, 0x001E8E, 0x000059},
+{0x001E8F, 0x001E8F, 0x000079},
+{0x001E90, 0x001E90, 0x00005A},
+{0x001E91, 0x001E91, 0x00007A},
+{0x001E92, 0x001E92, 0x00005A},
+{0x001E93, 0x001E93, 0x00007A},
+{0x001E94, 0x001E94, 0x00005A},
+{0x001E95, 0x001E95, 0x00007A},
+{0x001E96, 0x001E96, 0x000068},
+{0x001E97, 0x001E97, 0x000074},
+{0x001E98, 0x001E98, 0x000077},
+{0x001E99, 0x001E99, 0x000079},
+{0x001E9B, 0x001E9B, 0x00017F},
+{0x001EA0, 0x001EA0, 0x000041},
+{0x001EA1, 0x001EA1, 0x000061},
+{0x001EA2, 0x001EA2, 0x000041},
+{0x001EA3, 0x001EA3, 0x000061},
+{0x001EA4, 0x001EA4, 0x000041},
+{0x001EA5, 0x001EA5, 0x000061},
+{0x001EA6, 0x001EA6, 0x000041},
+{0x001EA7, 0x001EA7, 0x000061},
+{0x001EA8, 0x001EA8, 0x000041},
+{0x001EA9, 0x001EA9, 0x000061},
+{0x001EAA, 0x001EAA, 0x000041},
+{0x001EAB, 0x001EAB, 0x000061},
+{0x001EAC, 0x001EAC, 0x000041},
+{0x001EAD, 0x001EAD, 0x000061},
+{0x001EAE, 0x001EAE, 0x000041},
+{0x001EAF, 0x001EAF, 0x000061},
+{0x001EB0, 0x001EB0, 0x000041},
+{0x001EB1, 0x001EB1, 0x000061},
+{0x001EB2, 0x001EB2, 0x000041},
+{0x001EB3, 0x001EB3, 0x000061},
+{0x001EB4, 0x001EB4, 0x000041},
+{0x001EB5, 0x001EB5, 0x000061},
+{0x001EB6, 0x001EB6, 0x000041},
+{0x001EB7, 0x001EB7, 0x000061},
+{0x001EB8, 0x001EB8, 0x000045},
+{0x001EB9, 0x001EB9, 0x000065},
+{0x001EBA, 0x001EBA, 0x000045},
+{0x001EBB, 0x001EBB, 0x000065},
+{0x001EBC, 0x001EBC, 0x000045},
+{0x001EBD, 0x001EBD, 0x000065},
+{0x001EBE, 0x001EBE, 0x000045},
+{0x001EBF, 0x001EBF, 0x000065},
+{0x001EC0, 0x001EC0, 0x000045},
+{0x001EC1, 0x001EC1, 0x000065},
+{0x001EC2, 0x001EC2, 0x000045},
+{0x001EC3, 0x001EC3, 0x000065},
+{0x001EC4, 0x001EC4, 0x000045},
+{0x001EC5, 0x001EC5, 0x000065},
+{0x001EC6, 0x001EC6, 0x000045},
+{0x001EC7, 0x001EC7, 0x000065},
+{0x001EC8, 0x001EC8, 0x000049},
+{0x001EC9, 0x001EC9, 0x000069},
+{0x001ECA, 0x001ECA, 0x000049},
+{0x001ECB, 0x001ECB, 0x000069},
+{0x001ECC, 0x001ECC, 0x00004F},
+{0x001ECD, 0x001ECD, 0x00006F},
+{0x001ECE, 0x001ECE, 0x00004F},
+{0x001ECF, 0x001ECF, 0x00006F},
+{0x001ED0, 0x001ED0, 0x00004F},
+{0x001ED1, 0x001ED1, 0x00006F},
+{0x001ED2, 0x001ED2, 0x00004F},
+{0x001ED3, 0x001ED3, 0x00006F},
+{0x001ED4, 0x001ED4, 0x00004F},
+{0x001ED5, 0x001ED5, 0x00006F},
+{0x001ED6, 0x001ED6, 0x00004F},
+{0x001ED7, 0x001ED7, 0x00006F},
+{0x001ED8, 0x001ED8, 0x00004F},
+{0x001ED9, 0x001ED9, 0x00006F},
+{0x001EDA, 0x001EDA, 0x00004F},
+{0x001EDB, 0x001EDB, 0x00006F},
+{0x001EDC, 0x001EDC, 0x00004F},
+{0x001EDD, 0x001EDD, 0x00006F},
+{0x001EDE, 0x001EDE, 0x00004F},
+{0x001EDF, 0x001EDF, 0x00006F},
+{0x001EE0, 0x001EE0, 0x00004F},
+{0x001EE1, 0x001EE1, 0x00006F},
+{0x001EE2, 0x001EE2, 0x00004F},
+{0x001EE3, 0x001EE3, 0x00006F},
+{0x001EE4, 0x001EE4, 0x000055},
+{0x001EE5, 0x001EE5, 0x000075},
+{0x001EE6, 0x001EE6, 0x000055},
+{0x001EE7, 0x001EE7, 0x000075},
+{0x001EE8, 0x001EE8, 0x000055},
+{0x001EE9, 0x001EE9, 0x000075},
+{0x001EEA, 0x001EEA, 0x000055},
+{0x001EEB, 0x001EEB, 0x000075},
+{0x001EEC, 0x001EEC, 0x000055},
+{0x001EED, 0x001EED, 0x000075},
+{0x001EEE, 0x001EEE, 0x000055},
+{0x001EEF, 0x001EEF, 0x000075},
+{0x001EF0, 0x001EF0, 0x000055},
+{0x001EF1, 0x001EF1, 0x000075},
+{0x001EF2, 0x001EF2, 0x000059},
+{0x001EF3, 0x001EF3, 0x000079},
+{0x001EF4, 0x001EF4, 0x000059},
+{0x001EF5, 0x001EF5, 0x000079},
+{0x001EF6, 0x001EF6, 0x000059},
+{0x001EF7, 0x001EF7, 0x000079},
+{0x001EF8, 0x001EF8, 0x000059},
+{0x001EF9, 0x001EF9, 0x000079},
+{0x001F00, 0x001F07, 0x0003B1},
+{0x001F08, 0x001F0F, 0x000391},
+{0x001F10, 0x001F15, 0x0003B5},
+{0x001F18, 0x001F1D, 0x000395},
+{0x001F20, 0x001F27, 0x0003B7},
+{0x001F28, 0x001F2F, 0x000397},
+{0x001F30, 0x001F37, 0x0003B9},
+{0x001F38, 0x001F3F, 0x000399},
+{0x001F40, 0x001F45, 0x0003BF},
+{0x001F48, 0x001F4D, 0x00039F},
+{0x001F50, 0x001F57, 0x0003C5},
+{0x001F59, 0x001F59, 0x0003A5},
+{0x001F5B, 0x001F5B, 0x0003A5},
+{0x001F5D, 0x001F5D, 0x0003A5},
+{0x001F5F, 0x001F5F, 0x0003A5},
+{0x001F60, 0x001F67, 0x0003C9},
+{0x001F68, 0x001F6F, 0x0003A9},
+{0x001F70, 0x001F71, 0x0003B1},
+{0x001F72, 0x001F73, 0x0003B5},
+{0x001F74, 0x001F75, 0x0003B7},
+{0x001F76, 0x001F77, 0x0003B9},
+{0x001F78, 0x001F79, 0x0003BF},
+{0x001F7A, 0x001F7B, 0x0003C5},
+{0x001F7C, 0x001F7D, 0x0003C9},
+{0x001F80, 0x001F87, 0x0003B1},
+{0x001F88, 0x001F8F, 0x000391},
+{0x001F90, 0x001F97, 0x0003B7},
+{0x001F98, 0x001F9F, 0x000397},
+{0x001FA0, 0x001FA7, 0x0003C9},
+{0x001FA8, 0x001FAF, 0x0003A9},
+{0x001FB0, 0x001FB4, 0x0003B1},
+{0x001FB6, 0x001FB7, 0x0003B1},
+{0x001FB8, 0x001FBC, 0x000391},
+{0x001FBE, 0x001FBE, 0x0003B9},
+{0x001FC1, 0x001FC1, 0x0000A8},
+{0x001FC2, 0x001FC4, 0x0003B7},
+{0x001FC6, 0x001FC7, 0x0003B7},
+{0x001FC8, 0x001FC9, 0x000395},
+{0x001FCA, 0x001FCC, 0x000397},
+{0x001FCD, 0x001FCF, 0x001FBF},
+{0x001FD0, 0x001FD3, 0x0003B9},
+{0x001FD6, 0x001FD7, 0x0003B9},
+{0x001FD8, 0x001FDB, 0x000399},
+{0x001FDD, 0x001FDF, 0x001FFE},
+{0x001FE0, 0x001FE3, 0x0003C5},
+{0x001FE4, 0x001FE5, 0x0003C1},
+{0x001FE6, 0x001FE7, 0x0003C5},
+{0x001FE8, 0x001FEB, 0x0003A5},
+{0x001FEC, 0x001FEC, 0x0003A1},
+{0x001FED, 0x001FEE, 0x0000A8},
+{0x001FEF, 0x001FEF, 0x000060},
+{0x001FF2, 0x001FF4, 0x0003C9},
+{0x001FF6, 0x001FF7, 0x0003C9},
+{0x001FF8, 0x001FF9, 0x00039F},
+{0x001FFA, 0x001FFC, 0x0003A9},
+{0x001FFD, 0x001FFD, 0x0000B4},
+{0x002000, 0x002000, 0x002002},
+{0x002001, 0x002001, 0x002003},
+{0x002126, 0x002126, 0x0003A9},
+{0x00212A, 0x00212A, 0x00004B},
+{0x00212B, 0x00212B, 0x000041},
+{0x00219A, 0x00219A, 0x002190},
+{0x00219B, 0x00219B, 0x002192},
+{0x0021AE, 0x0021AE, 0x002194},
+{0x0021CD, 0x0021CD, 0x0021D0},
+{0x0021CE, 0x0021CE, 0x0021D4},
+{0x0021CF, 0x0021CF, 0x0021D2},
+{0x002204, 0x002204, 0x002203},
+{0x002209, 0x002209, 0x002208},
+{0x00220C, 0x00220C, 0x00220B},
+{0x002224, 0x002224, 0x002223},
+{0x002226, 0x002226, 0x002225},
+{0x002241, 0x002241, 0x00223C},
+{0x002244, 0x002244, 0x002243},
+{0x002247, 0x002247, 0x002245},
+{0x002249, 0x002249, 0x002248},
+{0x002260, 0x002260, 0x00003D},
+{0x002262, 0x002262, 0x002261},
+{0x00226D, 0x00226D, 0x00224D},
+{0x00226E, 0x00226E, 0x00003C},
+{0x00226F, 0x00226F, 0x00003E},
+{0x002270, 0x002270, 0x002264},
+{0x002271, 0x002271, 0x002265},
+{0x002274, 0x002274, 0x002272},
+{0x002275, 0x002275, 0x002273},
+{0x002278, 0x002278, 0x002276},
+{0x002279, 0x002279, 0x002277},
+{0x002280, 0x002280, 0x00227A},
+{0x002281, 0x002281, 0x00227B},
+{0x002284, 0x002284, 0x002282},
+{0x002285, 0x002285, 0x002283},
+{0x002288, 0x002288, 0x002286},
+{0x002289, 0x002289, 0x002287},
+{0x0022AC, 0x0022AC, 0x0022A2},
+{0x0022AD, 0x0022AD, 0x0022A8},
+{0x0022AE, 0x0022AE, 0x0022A9},
+{0x0022AF, 0x0022AF, 0x0022AB},
+{0x0022E0, 0x0022E0, 0x00227C},
+{0x0022E1, 0x0022E1, 0x00227D},
+{0x0022E2, 0x0022E2, 0x002291},
+{0x0022E3, 0x0022E3, 0x002292},
+{0x0022EA, 0x0022EA, 0x0022B2},
+{0x0022EB, 0x0022EB, 0x0022B3},
+{0x0022EC, 0x0022EC, 0x0022B4},
+{0x0022ED, 0x0022ED, 0x0022B5},
+{0x002329, 0x002329, 0x003008},
+{0x00232A, 0x00232A, 0x003009},
+{0x002ADC, 0x002ADC, 0x002ADD},
+{0x00304C, 0x00304C, 0x00304B},
+{0x00304E, 0x00304E, 0x00304D},
+{0x003050, 0x003050, 0x00304F},
+{0x003052, 0x003052, 0x003051},
+{0x003054, 0x003054, 0x003053},
+{0x003056, 0x003056, 0x003055},
+{0x003058, 0x003058, 0x003057},
+{0x00305A, 0x00305A, 0x003059},
+{0x00305C, 0x00305C, 0x00305B},
+{0x00305E, 0x00305E, 0x00305D},
+{0x003060, 0x003060, 0x00305F},
+{0x003062, 0x003062, 0x003061},
+{0x003065, 0x003065, 0x003064},
+{0x003067, 0x003067, 0x003066},
+{0x003069, 0x003069, 0x003068},
+{0x003070, 0x003071, 0x00306F},
+{0x003073, 0x003074, 0x003072},
+{0x003076, 0x003077, 0x003075},
+{0x003079, 0x00307A, 0x003078},
+{0x00307C, 0x00307D, 0x00307B},
+{0x003094, 0x003094, 0x003046},
+{0x00309E, 0x00309E, 0x00309D},
+{0x0030AC, 0x0030AC, 0x0030AB},
+{0x0030AE, 0x0030AE, 0x0030AD},
+{0x0030B0, 0x0030B0, 0x0030AF},
+{0x0030B2, 0x0030B2, 0x0030B1},
+{0x0030B4, 0x0030B4, 0x0030B3},
+{0x0030B6, 0x0030B6, 0x0030B5},
+{0x0030B8, 0x0030B8, 0x0030B7},
+{0x0030BA, 0x0030BA, 0x0030B9},
+{0x0030BC, 0x0030BC, 0x0030BB},
+{0x0030BE, 0x0030BE, 0x0030BD},
+{0x0030C0, 0x0030C0, 0x0030BF},
+{0x0030C2, 0x0030C2, 0x0030C1},
+{0x0030C5, 0x0030C5, 0x0030C4},
+{0x0030C7, 0x0030C7, 0x0030C6},
+{0x0030C9, 0x0030C9, 0x0030C8},
+{0x0030D0, 0x0030D1, 0x0030CF},
+{0x0030D3, 0x0030D4, 0x0030D2},
+{0x0030D6, 0x0030D7, 0x0030D5},
+{0x0030D9, 0x0030DA, 0x0030D8},
+{0x0030DC, 0x0030DD, 0x0030DB},
+{0x0030F4, 0x0030F4, 0x0030A6},
+{0x0030F7, 0x0030F7, 0x0030EF},
+{0x0030F8, 0x0030F8, 0x0030F0},
+{0x0030F9, 0x0030F9, 0x0030F1},
+{0x0030FA, 0x0030FA, 0x0030F2},
+{0x0030FE, 0x0030FE, 0x0030FD},
+{0x00AC00, 0x00AE4B, 0x001100},
+{0x00AE4C, 0x00B097, 0x001101},
+{0x00B098, 0x00B2E3, 0x001102},
+{0x00B2E4, 0x00B52F, 0x001103},
+{0x00B530, 0x00B77B, 0x001104},
+{0x00B77C, 0x00B9C7, 0x001105},
+{0x00B9C8, 0x00BC13, 0x001106},
+{0x00BC14, 0x00BE5F, 0x001107},
+{0x00BE60, 0x00C0AB, 0x001108},
+{0x00C0AC, 0x00C2F7, 0x001109},
+{0x00C2F8, 0x00C543, 0x00110A},
+{0x00C544, 0x00C78F, 0x00110B},
+{0x00C790, 0x00C9DB, 0x00110C},
+{0x00C9DC, 0x00CC27, 0x00110D},
+{0x00CC28, 0x00CE73, 0x00110E},
+{0x00CE74, 0x00D0BF, 0x00110F},
+{0x00D0C0, 0x00D30B, 0x001110},
+{0x00D30C, 0x00D557, 0x001111},
+{0x00D558, 0x00D7A3, 0x001112},
+{0x00F900, 0x00F900, 0x008C48},
+{0x00F901, 0x00F901, 0x0066F4},
+{0x00F902, 0x00F902, 0x008ECA},
+{0x00F903, 0x00F903, 0x008CC8},
+{0x00F904, 0x00F904, 0x006ED1},
+{0x00F905, 0x00F905, 0x004E32},
+{0x00F906, 0x00F906, 0x0053E5},
+{0x00F907, 0x00F908, 0x009F9C},
+{0x00F909, 0x00F909, 0x005951},
+{0x00F90A, 0x00F90A, 0x0091D1},
+{0x00F90B, 0x00F90B, 0x005587},
+{0x00F90C, 0x00F90C, 0x005948},
+{0x00F90D, 0x00F90D, 0x0061F6},
+{0x00F90E, 0x00F90E, 0x007669},
+{0x00F90F, 0x00F90F, 0x007F85},
+{0x00F910, 0x00F910, 0x00863F},
+{0x00F911, 0x00F911, 0x0087BA},
+{0x00F912, 0x00F912, 0x0088F8},
+{0x00F913, 0x00F913, 0x00908F},
+{0x00F914, 0x00F914, 0x006A02},
+{0x00F915, 0x00F915, 0x006D1B},
+{0x00F916, 0x00F916, 0x0070D9},
+{0x00F917, 0x00F917, 0x0073DE},
+{0x00F918, 0x00F918, 0x00843D},
+{0x00F919, 0x00F919, 0x00916A},
+{0x00F91A, 0x00F91A, 0x0099F1},
+{0x00F91B, 0x00F91B, 0x004E82},
+{0x00F91C, 0x00F91C, 0x005375},
+{0x00F91D, 0x00F91D, 0x006B04},
+{0x00F91E, 0x00F91E, 0x00721B},
+{0x00F91F, 0x00F91F, 0x00862D},
+{0x00F920, 0x00F920, 0x009E1E},
+{0x00F921, 0x00F921, 0x005D50},
+{0x00F922, 0x00F922, 0x006FEB},
+{0x00F923, 0x00F923, 0x0085CD},
+{0x00F924, 0x00F924, 0x008964},
+{0x00F925, 0x00F925, 0x0062C9},
+{0x00F926, 0x00F926, 0x0081D8},
+{0x00F927, 0x00F927, 0x00881F},
+{0x00F928, 0x00F928, 0x005ECA},
+{0x00F929, 0x00F929, 0x006717},
+{0x00F92A, 0x00F92A, 0x006D6A},
+{0x00F92B, 0x00F92B, 0x0072FC},
+{0x00F92C, 0x00F92C, 0x0090CE},
+{0x00F92D, 0x00F92D, 0x004F86},
+{0x00F92E, 0x00F92E, 0x0051B7},
+{0x00F92F, 0x00F92F, 0x0052DE},
+{0x00F930, 0x00F930, 0x0064C4},
+{0x00F931, 0x00F931, 0x006AD3},
+{0x00F932, 0x00F932, 0x007210},
+{0x00F933, 0x00F933, 0x0076E7},
+{0x00F934, 0x00F934, 0x008001},
+{0x00F935, 0x00F935, 0x008606},
+{0x00F936, 0x00F936, 0x00865C},
+{0x00F937, 0x00F937, 0x008DEF},
+{0x00F938, 0x00F938, 0x009732},
+{0x00F939, 0x00F939, 0x009B6F},
+{0x00F93A, 0x00F93A, 0x009DFA},
+{0x00F93B, 0x00F93B, 0x00788C},
+{0x00F93C, 0x00F93C, 0x00797F},
+{0x00F93D, 0x00F93D, 0x007DA0},
+{0x00F93E, 0x00F93E, 0x0083C9},
+{0x00F93F, 0x00F93F, 0x009304},
+{0x00F940, 0x00F940, 0x009E7F},
+{0x00F941, 0x00F941, 0x008AD6},
+{0x00F942, 0x00F942, 0x0058DF},
+{0x00F943, 0x00F943, 0x005F04},
+{0x00F944, 0x00F944, 0x007C60},
+{0x00F945, 0x00F945, 0x00807E},
+{0x00F946, 0x00F946, 0x007262},
+{0x00F947, 0x00F947, 0x0078CA},
+{0x00F948, 0x00F948, 0x008CC2},
+{0x00F949, 0x00F949, 0x0096F7},
+{0x00F94A, 0x00F94A, 0x0058D8},
+{0x00F94B, 0x00F94B, 0x005C62},
+{0x00F94C, 0x00F94C, 0x006A13},
+{0x00F94D, 0x00F94D, 0x006DDA},
+{0x00F94E, 0x00F94E, 0x006F0F},
+{0x00F94F, 0x00F94F, 0x007D2F},
+{0x00F950, 0x00F950, 0x007E37},
+{0x00F951, 0x00F951, 0x00964B},
+{0x00F952, 0x00F952, 0x0052D2},
+{0x00F953, 0x00F953, 0x00808B},
+{0x00F954, 0x00F954, 0x0051DC},
+{0x00F955, 0x00F955, 0x0051CC},
+{0x00F956, 0x00F956, 0x007A1C},
+{0x00F957, 0x00F957, 0x007DBE},
+{0x00F958, 0x00F958, 0x0083F1},
+{0x00F959, 0x00F959, 0x009675},
+{0x00F95A, 0x00F95A, 0x008B80},
+{0x00F95B, 0x00F95B, 0x0062CF},
+{0x00F95C, 0x00F95C, 0x006A02},
+{0x00F95D, 0x00F95D, 0x008AFE},
+{0x00F95E, 0x00F95E, 0x004E39},
+{0x00F95F, 0x00F95F, 0x005BE7},
+{0x00F960, 0x00F960, 0x006012},
+{0x00F961, 0x00F961, 0x007387},
+{0x00F962, 0x00F962, 0x007570},
+{0x00F963, 0x00F963, 0x005317},
+{0x00F964, 0x00F964, 0x0078FB},
+{0x00F965, 0x00F965, 0x004FBF},
+{0x00F966, 0x00F966, 0x005FA9},
+{0x00F967, 0x00F967, 0x004E0D},
+{0x00F968, 0x00F968, 0x006CCC},
+{0x00F969, 0x00F969, 0x006578},
+{0x00F96A, 0x00F96A, 0x007D22},
+{0x00F96B, 0x00F96B, 0x0053C3},
+{0x00F96C, 0x00F96C, 0x00585E},
+{0x00F96D, 0x00F96D, 0x007701},
+{0x00F96E, 0x00F96E, 0x008449},
+{0x00F96F, 0x00F96F, 0x008AAA},
+{0x00F970, 0x00F970, 0x006BBA},
+{0x00F971, 0x00F971, 0x008FB0},
+{0x00F972, 0x00F972, 0x006C88},
+{0x00F973, 0x00F973, 0x0062FE},
+{0x00F974, 0x00F974, 0x0082E5},
+{0x00F975, 0x00F975, 0x0063A0},
+{0x00F976, 0x00F976, 0x007565},
+{0x00F977, 0x00F977, 0x004EAE},
+{0x00F978, 0x00F978, 0x005169},
+{0x00F979, 0x00F979, 0x0051C9},
+{0x00F97A, 0x00F97A, 0x006881},
+{0x00F97B, 0x00F97B, 0x007CE7},
+{0x00F97C, 0x00F97C, 0x00826F},
+{0x00F97D, 0x00F97D, 0x008AD2},
+{0x00F97E, 0x00F97E, 0x0091CF},
+{0x00F97F, 0x00F97F, 0x0052F5},
+{0x00F980, 0x00F980, 0x005442},
+{0x00F981, 0x00F981, 0x005973},
+{0x00F982, 0x00F982, 0x005EEC},
+{0x00F983, 0x00F983, 0x0065C5},
+{0x00F984, 0x00F984, 0x006FFE},
+{0x00F985, 0x00F985, 0x00792A},
+{0x00F986, 0x00F986, 0x0095AD},
+{0x00F987, 0x00F987, 0x009A6A},
+{0x00F988, 0x00F988, 0x009E97},
+{0x00F989, 0x00F989, 0x009ECE},
+{0x00F98A, 0x00F98A, 0x00529B},
+{0x00F98B, 0x00F98B, 0x0066C6},
+{0x00F98C, 0x00F98C, 0x006B77},
+{0x00F98D, 0x00F98D, 0x008F62},
+{0x00F98E, 0x00F98E, 0x005E74},
+{0x00F98F, 0x00F98F, 0x006190},
+{0x00F990, 0x00F990, 0x006200},
+{0x00F991, 0x00F991, 0x00649A},
+{0x00F992, 0x00F992, 0x006F23},
+{0x00F993, 0x00F993, 0x007149},
+{0x00F994, 0x00F994, 0x007489},
+{0x00F995, 0x00F995, 0x0079CA},
+{0x00F996, 0x00F996, 0x007DF4},
+{0x00F997, 0x00F997, 0x00806F},
+{0x00F998, 0x00F998, 0x008F26},
+{0x00F999, 0x00F999, 0x0084EE},
+{0x00F99A, 0x00F99A, 0x009023},
+{0x00F99B, 0x00F99B, 0x00934A},
+{0x00F99C, 0x00F99C, 0x005217},
+{0x00F99D, 0x00F99D, 0x0052A3},
+{0x00F99E, 0x00F99E, 0x0054BD},
+{0x00F99F, 0x00F99F, 0x0070C8},
+{0x00F9A0, 0x00F9A0, 0x0088C2},
+{0x00F9A1, 0x00F9A1, 0x008AAA},
+{0x00F9A2, 0x00F9A2, 0x005EC9},
+{0x00F9A3, 0x00F9A3, 0x005FF5},
+{0x00F9A4, 0x00F9A4, 0x00637B},
+{0x00F9A5, 0x00F9A5, 0x006BAE},
+{0x00F9A6, 0x00F9A6, 0x007C3E},
+{0x00F9A7, 0x00F9A7, 0x007375},
+{0x00F9A8, 0x00F9A8, 0x004EE4},
+{0x00F9A9, 0x00F9A9, 0x0056F9},
+{0x00F9AA, 0x00F9AA, 0x005BE7},
+{0x00F9AB, 0x00F9AB, 0x005DBA},
+{0x00F9AC, 0x00F9AC, 0x00601C},
+{0x00F9AD, 0x00F9AD, 0x0073B2},
+{0x00F9AE, 0x00F9AE, 0x007469},
+{0x00F9AF, 0x00F9AF, 0x007F9A},
+{0x00F9B0, 0x00F9B0, 0x008046},
+{0x00F9B1, 0x00F9B1, 0x009234},
+{0x00F9B2, 0x00F9B2, 0x0096F6},
+{0x00F9B3, 0x00F9B3, 0x009748},
+{0x00F9B4, 0x00F9B4, 0x009818},
+{0x00F9B5, 0x00F9B5, 0x004F8B},
+{0x00F9B6, 0x00F9B6, 0x0079AE},
+{0x00F9B7, 0x00F9B7, 0x0091B4},
+{0x00F9B8, 0x00F9B8, 0x0096B8},
+{0x00F9B9, 0x00F9B9, 0x0060E1},
+{0x00F9BA, 0x00F9BA, 0x004E86},
+{0x00F9BB, 0x00F9BB, 0x0050DA},
+{0x00F9BC, 0x00F9BC, 0x005BEE},
+{0x00F9BD, 0x00F9BD, 0x005C3F},
+{0x00F9BE, 0x00F9BE, 0x006599},
+{0x00F9BF, 0x00F9BF, 0x006A02},
+{0x00F9C0, 0x00F9C0, 0x0071CE},
+{0x00F9C1, 0x00F9C1, 0x007642},
+{0x00F9C2, 0x00F9C2, 0x0084FC},
+{0x00F9C3, 0x00F9C3, 0x00907C},
+{0x00F9C4, 0x00F9C4, 0x009F8D},
+{0x00F9C5, 0x00F9C5, 0x006688},
+{0x00F9C6, 0x00F9C6, 0x00962E},
+{0x00F9C7, 0x00F9C7, 0x005289},
+{0x00F9C8, 0x00F9C8, 0x00677B},
+{0x00F9C9, 0x00F9C9, 0x0067F3},
+{0x00F9CA, 0x00F9CA, 0x006D41},
+{0x00F9CB, 0x00F9CB, 0x006E9C},
+{0x00F9CC, 0x00F9CC, 0x007409},
+{0x00F9CD, 0x00F9CD, 0x007559},
+{0x00F9CE, 0x00F9CE, 0x00786B},
+{0x00F9CF, 0x00F9CF, 0x007D10},
+{0x00F9D0, 0x00F9D0, 0x00985E},
+{0x00F9D1, 0x00F9D1, 0x00516D},
+{0x00F9D2, 0x00F9D2, 0x00622E},
+{0x00F9D3, 0x00F9D3, 0x009678},
+{0x00F9D4, 0x00F9D4, 0x00502B},
+{0x00F9D5, 0x00F9D5, 0x005D19},
+{0x00F9D6, 0x00F9D6, 0x006DEA},
+{0x00F9D7, 0x00F9D7, 0x008F2A},
+{0x00F9D8, 0x00F9D8, 0x005F8B},
+{0x00F9D9, 0x00F9D9, 0x006144},
+{0x00F9DA, 0x00F9DA, 0x006817},
+{0x00F9DB, 0x00F9DB, 0x007387},
+{0x00F9DC, 0x00F9DC, 0x009686},
+{0x00F9DD, 0x00F9DD, 0x005229},
+{0x00F9DE, 0x00F9DE, 0x00540F},
+{0x00F9DF, 0x00F9DF, 0x005C65},
+{0x00F9E0, 0x00F9E0, 0x006613},
+{0x00F9E1, 0x00F9E1, 0x00674E},
+{0x00F9E2, 0x00F9E2, 0x0068A8},
+{0x00F9E3, 0x00F9E3, 0x006CE5},
+{0x00F9E4, 0x00F9E4, 0x007406},
+{0x00F9E5, 0x00F9E5, 0x0075E2},
+{0x00F9E6, 0x00F9E6, 0x007F79},
+{0x00F9E7, 0x00F9E7, 0x0088CF},
+{0x00F9E8, 0x00F9E8, 0x0088E1},
+{0x00F9E9, 0x00F9E9, 0x0091CC},
+{0x00F9EA, 0x00F9EA, 0x0096E2},
+{0x00F9EB, 0x00F9EB, 0x00533F},
+{0x00F9EC, 0x00F9EC, 0x006EBA},
+{0x00F9ED, 0x00F9ED, 0x00541D},
+{0x00F9EE, 0x00F9EE, 0x0071D0},
+{0x00F9EF, 0x00F9EF, 0x007498},
+{0x00F9F0, 0x00F9F0, 0x0085FA},
+{0x00F9F1, 0x00F9F1, 0x0096A3},
+{0x00F9F2, 0x00F9F2, 0x009C57},
+{0x00F9F3, 0x00F9F3, 0x009E9F},
+{0x00F9F4, 0x00F9F4, 0x006797},
+{0x00F9F5, 0x00F9F5, 0x006DCB},
+{0x00F9F6, 0x00F9F6, 0x0081E8},
+{0x00F9F7, 0x00F9F7, 0x007ACB},
+{0x00F9F8, 0x00F9F8, 0x007B20},
+{0x00F9F9, 0x00F9F9, 0x007C92},
+{0x00F9FA, 0x00F9FA, 0x0072C0},
+{0x00F9FB, 0x00F9FB, 0x007099},
+{0x00F9FC, 0x00F9FC, 0x008B58},
+{0x00F9FD, 0x00F9FD, 0x004EC0},
+{0x00F9FE, 0x00F9FE, 0x008336},
+{0x00F9FF, 0x00F9FF, 0x00523A},
+{0x00FA00, 0x00FA00, 0x005207},
+{0x00FA01, 0x00FA01, 0x005EA6},
+{0x00FA02, 0x00FA02, 0x0062D3},
+{0x00FA03, 0x00FA03, 0x007CD6},
+{0x00FA04, 0x00FA04, 0x005B85},
+{0x00FA05, 0x00FA05, 0x006D1E},
+{0x00FA06, 0x00FA06, 0x0066B4},
+{0x00FA07, 0x00FA07, 0x008F3B},
+{0x00FA08, 0x00FA08, 0x00884C},
+{0x00FA09, 0x00FA09, 0x00964D},
+{0x00FA0A, 0x00FA0A, 0x00898B},
+{0x00FA0B, 0x00FA0B, 0x005ED3},
+{0x00FA0C, 0x00FA0C, 0x005140},
+{0x00FA0D, 0x00FA0D, 0x0055C0},
+{0x00FA10, 0x00FA10, 0x00585A},
+{0x00FA12, 0x00FA12, 0x006674},
+{0x00FA15, 0x00FA15, 0x0051DE},
+{0x00FA16, 0x00FA16, 0x00732A},
+{0x00FA17, 0x00FA17, 0x0076CA},
+{0x00FA18, 0x00FA18, 0x00793C},
+{0x00FA19, 0x00FA19, 0x00795E},
+{0x00FA1A, 0x00FA1A, 0x007965},
+{0x00FA1B, 0x00FA1B, 0x00798F},
+{0x00FA1C, 0x00FA1C, 0x009756},
+{0x00FA1D, 0x00FA1D, 0x007CBE},
+{0x00FA1E, 0x00FA1E, 0x007FBD},
+{0x00FA20, 0x00FA20, 0x008612},
+{0x00FA22, 0x00FA22, 0x008AF8},
+{0x00FA25, 0x00FA25, 0x009038},
+{0x00FA26, 0x00FA26, 0x0090FD},
+{0x00FA2A, 0x00FA2A, 0x0098EF},
+{0x00FA2B, 0x00FA2B, 0x0098FC},
+{0x00FA2C, 0x00FA2C, 0x009928},
+{0x00FA2D, 0x00FA2D, 0x009DB4},
+{0x00FA2E, 0x00FA2E, 0x0090DE},
+{0x00FA2F, 0x00FA2F, 0x0096B7},
+{0x00FA30, 0x00FA30, 0x004FAE},
+{0x00FA31, 0x00FA31, 0x0050E7},
+{0x00FA32, 0x00FA32, 0x00514D},
+{0x00FA33, 0x00FA33, 0x0052C9},
+{0x00FA34, 0x00FA34, 0x0052E4},
+{0x00FA35, 0x00FA35, 0x005351},
+{0x00FA36, 0x00FA36, 0x00559D},
+{0x00FA37, 0x00FA37, 0x005606},
+{0x00FA38, 0x00FA38, 0x005668},
+{0x00FA39, 0x00FA39, 0x005840},
+{0x00FA3A, 0x00FA3A, 0x0058A8},
+{0x00FA3B, 0x00FA3B, 0x005C64},
+{0x00FA3C, 0x00FA3C, 0x005C6E},
+{0x00FA3D, 0x00FA3D, 0x006094},
+{0x00FA3E, 0x00FA3E, 0x006168},
+{0x00FA3F, 0x00FA3F, 0x00618E},
+{0x00FA40, 0x00FA40, 0x0061F2},
+{0x00FA41, 0x00FA41, 0x00654F},
+{0x00FA42, 0x00FA42, 0x0065E2},
+{0x00FA43, 0x00FA43, 0x006691},
+{0x00FA44, 0x00FA44, 0x006885},
+{0x00FA45, 0x00FA45, 0x006D77},
+{0x00FA46, 0x00FA46, 0x006E1A},
+{0x00FA47, 0x00FA47, 0x006F22},
+{0x00FA48, 0x00FA48, 0x00716E},
+{0x00FA49, 0x00FA49, 0x00722B},
+{0x00FA4A, 0x00FA4A, 0x007422},
+{0x00FA4B, 0x00FA4B, 0x007891},
+{0x00FA4C, 0x00FA4C, 0x00793E},
+{0x00FA4D, 0x00FA4D, 0x007949},
+{0x00FA4E, 0x00FA4E, 0x007948},
+{0x00FA4F, 0x00FA4F, 0x007950},
+{0x00FA50, 0x00FA50, 0x007956},
+{0x00FA51, 0x00FA51, 0x00795D},
+{0x00FA52, 0x00FA52, 0x00798D},
+{0x00FA53, 0x00FA53, 0x00798E},
+{0x00FA54, 0x00FA54, 0x007A40},
+{0x00FA55, 0x00FA55, 0x007A81},
+{0x00FA56, 0x00FA56, 0x007BC0},
+{0x00FA57, 0x00FA57, 0x007DF4},
+{0x00FA58, 0x00FA58, 0x007E09},
+{0x00FA59, 0x00FA59, 0x007E41},
+{0x00FA5A, 0x00FA5A, 0x007F72},
+{0x00FA5B, 0x00FA5B, 0x008005},
+{0x00FA5C, 0x00FA5C, 0x0081ED},
+{0x00FA5D, 0x00FA5E, 0x008279},
+{0x00FA5F, 0x00FA5F, 0x008457},
+{0x00FA60, 0x00FA60, 0x008910},
+{0x00FA61, 0x00FA61, 0x008996},
+{0x00FA62, 0x00FA62, 0x008B01},
+{0x00FA63, 0x00FA63, 0x008B39},
+{0x00FA64, 0x00FA64, 0x008CD3},
+{0x00FA65, 0x00FA65, 0x008D08},
+{0x00FA66, 0x00FA66, 0x008FB6},
+{0x00FA67, 0x00FA67, 0x009038},
+{0x00FA68, 0x00FA68, 0x0096E3},
+{0x00FA69, 0x00FA69, 0x0097FF},
+{0x00FA6A, 0x00FA6A, 0x00983B},
+{0x00FA6B, 0x00FA6B, 0x006075},
+{0x00FA6C, 0x00FA6C, 0x0242EE},
+{0x00FA6D, 0x00FA6D, 0x008218},
+{0x00FA70, 0x00FA70, 0x004E26},
+{0x00FA71, 0x00FA71, 0x0051B5},
+{0x00FA72, 0x00FA72, 0x005168},
+{0x00FA73, 0x00FA73, 0x004F80},
+{0x00FA74, 0x00FA74, 0x005145},
+{0x00FA75, 0x00FA75, 0x005180},
+{0x00FA76, 0x00FA76, 0x0052C7},
+{0x00FA77, 0x00FA77, 0x0052FA},
+{0x00FA78, 0x00FA78, 0x00559D},
+{0x00FA79, 0x00FA79, 0x005555},
+{0x00FA7A, 0x00FA7A, 0x005599},
+{0x00FA7B, 0x00FA7B, 0x0055E2},
+{0x00FA7C, 0x00FA7C, 0x00585A},
+{0x00FA7D, 0x00FA7D, 0x0058B3},
+{0x00FA7E, 0x00FA7E, 0x005944},
+{0x00FA7F, 0x00FA7F, 0x005954},
+{0x00FA80, 0x00FA80, 0x005A62},
+{0x00FA81, 0x00FA81, 0x005B28},
+{0x00FA82, 0x00FA82, 0x005ED2},
+{0x00FA83, 0x00FA83, 0x005ED9},
+{0x00FA84, 0x00FA84, 0x005F69},
+{0x00FA85, 0x00FA85, 0x005FAD},
+{0x00FA86, 0x00FA86, 0x0060D8},
+{0x00FA87, 0x00FA87, 0x00614E},
+{0x00FA88, 0x00FA88, 0x006108},
+{0x00FA89, 0x00FA89, 0x00618E},
+{0x00FA8A, 0x00FA8A, 0x006160},
+{0x00FA8B, 0x00FA8B, 0x0061F2},
+{0x00FA8C, 0x00FA8C, 0x006234},
+{0x00FA8D, 0x00FA8D, 0x0063C4},
+{0x00FA8E, 0x00FA8E, 0x00641C},
+{0x00FA8F, 0x00FA8F, 0x006452},
+{0x00FA90, 0x00FA90, 0x006556},
+{0x00FA91, 0x00FA91, 0x006674},
+{0x00FA92, 0x00FA92, 0x006717},
+{0x00FA93, 0x00FA93, 0x00671B},
+{0x00FA94, 0x00FA94, 0x006756},
+{0x00FA95, 0x00FA95, 0x006B79},
+{0x00FA96, 0x00FA96, 0x006BBA},
+{0x00FA97, 0x00FA97, 0x006D41},
+{0x00FA98, 0x00FA98, 0x006EDB},
+{0x00FA99, 0x00FA99, 0x006ECB},
+{0x00FA9A, 0x00FA9A, 0x006F22},
+{0x00FA9B, 0x00FA9B, 0x00701E},
+{0x00FA9C, 0x00FA9C, 0x00716E},
+{0x00FA9D, 0x00FA9D, 0x0077A7},
+{0x00FA9E, 0x00FA9E, 0x007235},
+{0x00FA9F, 0x00FA9F, 0x0072AF},
+{0x00FAA0, 0x00FAA0, 0x00732A},
+{0x00FAA1, 0x00FAA1, 0x007471},
+{0x00FAA2, 0x00FAA2, 0x007506},
+{0x00FAA3, 0x00FAA3, 0x00753B},
+{0x00FAA4, 0x00FAA4, 0x00761D},
+{0x00FAA5, 0x00FAA5, 0x00761F},
+{0x00FAA6, 0x00FAA6, 0x0076CA},
+{0x00FAA7, 0x00FAA7, 0x0076DB},
+{0x00FAA8, 0x00FAA8, 0x0076F4},
+{0x00FAA9, 0x00FAA9, 0x00774A},
+{0x00FAAA, 0x00FAAA, 0x007740},
+{0x00FAAB, 0x00FAAB, 0x0078CC},
+{0x00FAAC, 0x00FAAC, 0x007AB1},
+{0x00FAAD, 0x00FAAD, 0x007BC0},
+{0x00FAAE, 0x00FAAE, 0x007C7B},
+{0x00FAAF, 0x00FAAF, 0x007D5B},
+{0x00FAB0, 0x00FAB0, 0x007DF4},
+{0x00FAB1, 0x00FAB1, 0x007F3E},
+{0x00FAB2, 0x00FAB2, 0x008005},
+{0x00FAB3, 0x00FAB3, 0x008352},
+{0x00FAB4, 0x00FAB4, 0x0083EF},
+{0x00FAB5, 0x00FAB5, 0x008779},
+{0x00FAB6, 0x00FAB6, 0x008941},
+{0x00FAB7, 0x00FAB7, 0x008986},
+{0x00FAB8, 0x00FAB8, 0x008996},
+{0x00FAB9, 0x00FAB9, 0x008ABF},
+{0x00FABA, 0x00FABA, 0x008AF8},
+{0x00FABB, 0x00FABB, 0x008ACB},
+{0x00FABC, 0x00FABC, 0x008B01},
+{0x00FABD, 0x00FABD, 0x008AFE},
+{0x00FABE, 0x00FABE, 0x008AED},
+{0x00FABF, 0x00FABF, 0x008B39},
+{0x00FAC0, 0x00FAC0, 0x008B8A},
+{0x00FAC1, 0x00FAC1, 0x008D08},
+{0x00FAC2, 0x00FAC2, 0x008F38},
+{0x00FAC3, 0x00FAC3, 0x009072},
+{0x00FAC4, 0x00FAC4, 0x009199},
+{0x00FAC5, 0x00FAC5, 0x009276},
+{0x00FAC6, 0x00FAC6, 0x00967C},
+{0x00FAC7, 0x00FAC7, 0x0096E3},
+{0x00FAC8, 0x00FAC8, 0x009756},
+{0x00FAC9, 0x00FAC9, 0x0097DB},
+{0x00FACA, 0x00FACA, 0x0097FF},
+{0x00FACB, 0x00FACB, 0x00980B},
+{0x00FACC, 0x00FACC, 0x00983B},
+{0x00FACD, 0x00FACD, 0x009B12},
+{0x00FACE, 0x00FACE, 0x009F9C},
+{0x00FACF, 0x00FACF, 0x02284A},
+{0x00FAD0, 0x00FAD0, 0x022844},
+{0x00FAD1, 0x00FAD1, 0x0233D5},
+{0x00FAD2, 0x00FAD2, 0x003B9D},
+{0x00FAD3, 0x00FAD3, 0x004018},
+{0x00FAD4, 0x00FAD4, 0x004039},
+{0x00FAD5, 0x00FAD5, 0x025249},
+{0x00FAD6, 0x00FAD6, 0x025CD0},
+{0x00FAD7, 0x00FAD7, 0x027ED3},
+{0x00FAD8, 0x00FAD8, 0x009F43},
+{0x00FAD9, 0x00FAD9, 0x009F8E},
+{0x00FB1D, 0x00FB1D, 0x0005D9},
+{0x00FB1F, 0x00FB1F, 0x0005F2},
+{0x00FB2A, 0x00FB2D, 0x0005E9},
+{0x00FB2E, 0x00FB30, 0x0005D0},
+{0x00FB31, 0x00FB31, 0x0005D1},
+{0x00FB32, 0x00FB32, 0x0005D2},
+{0x00FB33, 0x00FB33, 0x0005D3},
+{0x00FB34, 0x00FB34, 0x0005D4},
+{0x00FB35, 0x00FB35, 0x0005D5},
+{0x00FB36, 0x00FB36, 0x0005D6},
+{0x00FB38, 0x00FB38, 0x0005D8},
+{0x00FB39, 0x00FB39, 0x0005D9},
+{0x00FB3A, 0x00FB3A, 0x0005DA},
+{0x00FB3B, 0x00FB3B, 0x0005DB},
+{0x00FB3C, 0x00FB3C, 0x0005DC},
+{0x00FB3E, 0x00FB3E, 0x0005DE},
+{0x00FB40, 0x00FB40, 0x0005E0},
+{0x00FB41, 0x00FB41, 0x0005E1},
+{0x00FB43, 0x00FB43, 0x0005E3},
+{0x00FB44, 0x00FB44, 0x0005E4},
+{0x00FB46, 0x00FB46, 0x0005E6},
+{0x00FB47, 0x00FB47, 0x0005E7},
+{0x00FB48, 0x00FB48, 0x0005E8},
+{0x00FB49, 0x00FB49, 0x0005E9},
+{0x00FB4A, 0x00FB4A, 0x0005EA},
+{0x00FB4B, 0x00FB4B, 0x0005D5},
+{0x00FB4C, 0x00FB4C, 0x0005D1},
+{0x00FB4D, 0x00FB4D, 0x0005DB},
+{0x00FB4E, 0x00FB4E, 0x0005E4},
+{0x01109A, 0x01109A, 0x011099},
+{0x01109C, 0x01109C, 0x01109B},
+{0x0110AB, 0x0110AB, 0x0110A5},
+{0x01112E, 0x01112E, 0x011131},
+{0x01112F, 0x01112F, 0x011132},
+{0x01134B, 0x01134C, 0x011347},
+{0x0114BB, 0x0114BC, 0x0114B9},
+{0x0114BE, 0x0114BE, 0x0114B9},
+{0x0115BA, 0x0115BA, 0x0115B8},
+{0x0115BB, 0x0115BB, 0x0115B9},
+{0x011938, 0x011938, 0x011935},
+{0x01D15E, 0x01D15E, 0x01D157},
+{0x01D15F, 0x01D164, 0x01D158},
+{0x01D1BB, 0x01D1BB, 0x01D1B9},
+{0x01D1BC, 0x01D1BC, 0x01D1BA},
+{0x01D1BD, 0x01D1BD, 0x01D1B9},
+{0x01D1BE, 0x01D1BE, 0x01D1BA},
+{0x01D1BF, 0x01D1BF, 0x01D1B9},
+{0x01D1C0, 0x01D1C0, 0x01D1BA},
+{0x02F800, 0x02F800, 0x004E3D},
+{0x02F801, 0x02F801, 0x004E38},
+{0x02F802, 0x02F802, 0x004E41},
+{0x02F803, 0x02F803, 0x020122},
+{0x02F804, 0x02F804, 0x004F60},
+{0x02F805, 0x02F805, 0x004FAE},
+{0x02F806, 0x02F806, 0x004FBB},
+{0x02F807, 0x02F807, 0x005002},
+{0x02F808, 0x02F808, 0x00507A},
+{0x02F809, 0x02F809, 0x005099},
+{0x02F80A, 0x02F80A, 0x0050E7},
+{0x02F80B, 0x02F80B, 0x0050CF},
+{0x02F80C, 0x02F80C, 0x00349E},
+{0x02F80D, 0x02F80D, 0x02063A},
+{0x02F80E, 0x02F80E, 0x00514D},
+{0x02F80F, 0x02F80F, 0x005154},
+{0x02F810, 0x02F810, 0x005164},
+{0x02F811, 0x02F811, 0x005177},
+{0x02F812, 0x02F812, 0x02051C},
+{0x02F813, 0x02F813, 0x0034B9},
+{0x02F814, 0x02F814, 0x005167},
+{0x02F815, 0x02F815, 0x00518D},
+{0x02F816, 0x02F816, 0x02054B},
+{0x02F817, 0x02F817, 0x005197},
+{0x02F818, 0x02F818, 0x0051A4},
+{0x02F819, 0x02F819, 0x004ECC},
+{0x02F81A, 0x02F81A, 0x0051AC},
+{0x02F81B, 0x02F81B, 0x0051B5},
+{0x02F81C, 0x02F81C, 0x0291DF},
+{0x02F81D, 0x02F81D, 0x0051F5},
+{0x02F81E, 0x02F81E, 0x005203},
+{0x02F81F, 0x02F81F, 0x0034DF},
+{0x02F820, 0x02F820, 0x00523B},
+{0x02F821, 0x02F821, 0x005246},
+{0x02F822, 0x02F822, 0x005272},
+{0x02F823, 0x02F823, 0x005277},
+{0x02F824, 0x02F824, 0x003515},
+{0x02F825, 0x02F825, 0x0052C7},
+{0x02F826, 0x02F826, 0x0052C9},
+{0x02F827, 0x02F827, 0x0052E4},
+{0x02F828, 0x02F828, 0x0052FA},
+{0x02F829, 0x02F829, 0x005305},
+{0x02F82A, 0x02F82A, 0x005306},
+{0x02F82B, 0x02F82B, 0x005317},
+{0x02F82C, 0x02F82C, 0x005349},
+{0x02F82D, 0x02F82D, 0x005351},
+{0x02F82E, 0x02F82E, 0x00535A},
+{0x02F82F, 0x02F82F, 0x005373},
+{0x02F830, 0x02F830, 0x00537D},
+{0x02F831, 0x02F833, 0x00537F},
+{0x02F834, 0x02F834, 0x020A2C},
+{0x02F835, 0x02F835, 0x007070},
+{0x02F836, 0x02F836, 0x0053CA},
+{0x02F837, 0x02F837, 0x0053DF},
+{0x02F838, 0x02F838, 0x020B63},
+{0x02F839, 0x02F839, 0x0053EB},
+{0x02F83A, 0x02F83A, 0x0053F1},
+{0x02F83B, 0x02F83B, 0x005406},
+{0x02F83C, 0x02F83C, 0x00549E},
+{0x02F83D, 0x02F83D, 0x005438},
+{0x02F83E, 0x02F83E, 0x005448},
+{0x02F83F, 0x02F83F, 0x005468},
+{0x02F840, 0x02F840, 0x0054A2},
+{0x02F841, 0x02F841, 0x0054F6},
+{0x02F842, 0x02F842, 0x005510},
+{0x02F843, 0x02F843, 0x005553},
+{0x02F844, 0x02F844, 0x005563},
+{0x02F845, 0x02F846, 0x005584},
+{0x02F847, 0x02F847, 0x005599},
+{0x02F848, 0x02F848, 0x0055AB},
+{0x02F849, 0x02F849, 0x0055B3},
+{0x02F84A, 0x02F84A, 0x0055C2},
+{0x02F84B, 0x02F84B, 0x005716},
+{0x02F84C, 0x02F84C, 0x005606},
+{0x02F84D, 0x02F84D, 0x005717},
+{0x02F84E, 0x02F84E, 0x005651},
+{0x02F84F, 0x02F84F, 0x005674},
+{0x02F850, 0x02F850, 0x005207},
+{0x02F851, 0x02F851, 0x0058EE},
+{0x02F852, 0x02F852, 0x0057CE},
+{0x02F853, 0x02F853, 0x0057F4},
+{0x02F854, 0x02F854, 0x00580D},
+{0x02F855, 0x02F855, 0x00578B},
+{0x02F856, 0x02F856, 0x005832},
+{0x02F857, 0x02F857, 0x005831},
+{0x02F858, 0x02F858, 0x0058AC},
+{0x02F859, 0x02F859, 0x0214E4},
+{0x02F85A, 0x02F85A, 0x0058F2},
+{0x02F85B, 0x02F85B, 0x0058F7},
+{0x02F85C, 0x02F85C, 0x005906},
+{0x02F85D, 0x02F85D, 0x00591A},
+{0x02F85E, 0x02F85E, 0x005922},
+{0x02F85F, 0x02F85F, 0x005962},
+{0x02F860, 0x02F860, 0x0216A8},
+{0x02F861, 0x02F861, 0x0216EA},
+{0x02F862, 0x02F862, 0x0059EC},
+{0x02F863, 0x02F863, 0x005A1B},
+{0x02F864, 0x02F864, 0x005A27},
+{0x02F865, 0x02F865, 0x0059D8},
+{0x02F866, 0x02F866, 0x005A66},
+{0x02F867, 0x02F867, 0x0036EE},
+{0x02F868, 0x02F868, 0x0036FC},
+{0x02F869, 0x02F869, 0x005B08},
+{0x02F86A, 0x02F86B, 0x005B3E},
+{0x02F86C, 0x02F86C, 0x0219C8},
+{0x02F86D, 0x02F86D, 0x005BC3},
+{0x02F86E, 0x02F86E, 0x005BD8},
+{0x02F86F, 0x02F86F, 0x005BE7},
+{0x02F870, 0x02F870, 0x005BF3},
+{0x02F871, 0x02F871, 0x021B18},
+{0x02F872, 0x02F872, 0x005BFF},
+{0x02F873, 0x02F873, 0x005C06},
+{0x02F874, 0x02F874, 0x005F53},
+{0x02F875, 0x02F875, 0x005C22},
+{0x02F876, 0x02F876, 0x003781},
+{0x02F877, 0x02F877, 0x005C60},
+{0x02F878, 0x02F878, 0x005C6E},
+{0x02F879, 0x02F879, 0x005CC0},
+{0x02F87A, 0x02F87A, 0x005C8D},
+{0x02F87B, 0x02F87B, 0x021DE4},
+{0x02F87C, 0x02F87C, 0x005D43},
+{0x02F87D, 0x02F87D, 0x021DE6},
+{0x02F87E, 0x02F87E, 0x005D6E},
+{0x02F87F, 0x02F87F, 0x005D6B},
+{0x02F880, 0x02F880, 0x005D7C},
+{0x02F881, 0x02F881, 0x005DE1},
+{0x02F882, 0x02F882, 0x005DE2},
+{0x02F883, 0x02F883, 0x00382F},
+{0x02F884, 0x02F884, 0x005DFD},
+{0x02F885, 0x02F885, 0x005E28},
+{0x02F886, 0x02F886, 0x005E3D},
+{0x02F887, 0x02F887, 0x005E69},
+{0x02F888, 0x02F888, 0x003862},
+{0x02F889, 0x02F889, 0x022183},
+{0x02F88A, 0x02F88A, 0x00387C},
+{0x02F88B, 0x02F88B, 0x005EB0},
+{0x02F88C, 0x02F88C, 0x005EB3},
+{0x02F88D, 0x02F88D, 0x005EB6},
+{0x02F88E, 0x02F88E, 0x005ECA},
+{0x02F88F, 0x02F88F, 0x02A392},
+{0x02F890, 0x02F890, 0x005EFE},
+{0x02F891, 0x02F892, 0x022331},
+{0x02F893, 0x02F893, 0x008201},
+{0x02F894, 0x02F895, 0x005F22},
+{0x02F896, 0x02F896, 0x0038C7},
+{0x02F897, 0x02F897, 0x0232B8},
+{0x02F898, 0x02F898, 0x0261DA},
+{0x02F899, 0x02F899, 0x005F62},
+{0x02F89A, 0x02F89A, 0x005F6B},
+{0x02F89B, 0x02F89B, 0x0038E3},
+{0x02F89C, 0x02F89C, 0x005F9A},
+{0x02F89D, 0x02F89D, 0x005FCD},
+{0x02F89E, 0x02F89E, 0x005FD7},
+{0x02F89F, 0x02F89F, 0x005FF9},
+{0x02F8A0, 0x02F8A0, 0x006081},
+{0x02F8A1, 0x02F8A1, 0x00393A},
+{0x02F8A2, 0x02F8A2, 0x00391C},
+{0x02F8A3, 0x02F8A3, 0x006094},
+{0x02F8A4, 0x02F8A4, 0x0226D4},
+{0x02F8A5, 0x02F8A5, 0x0060C7},
+{0x02F8A6, 0x02F8A6, 0x006148},
+{0x02F8A7, 0x02F8A7, 0x00614C},
+{0x02F8A8, 0x02F8A8, 0x00614E},
+{0x02F8A9, 0x02F8A9, 0x00614C},
+{0x02F8AA, 0x02F8AA, 0x00617A},
+{0x02F8AB, 0x02F8AB, 0x00618E},
+{0x02F8AC, 0x02F8AC, 0x0061B2},
+{0x02F8AD, 0x02F8AD, 0x0061A4},
+{0x02F8AE, 0x02F8AE, 0x0061AF},
+{0x02F8AF, 0x02F8AF, 0x0061DE},
+{0x02F8B0, 0x02F8B0, 0x0061F2},
+{0x02F8B1, 0x02F8B1, 0x0061F6},
+{0x02F8B2, 0x02F8B2, 0x006210},
+{0x02F8B3, 0x02F8B3, 0x00621B},
+{0x02F8B4, 0x02F8B4, 0x00625D},
+{0x02F8B5, 0x02F8B5, 0x0062B1},
+{0x02F8B6, 0x02F8B6, 0x0062D4},
+{0x02F8B7, 0x02F8B7, 0x006350},
+{0x02F8B8, 0x02F8B8, 0x022B0C},
+{0x02F8B9, 0x02F8B9, 0x00633D},
+{0x02F8BA, 0x02F8BA, 0x0062FC},
+{0x02F8BB, 0x02F8BB, 0x006368},
+{0x02F8BC, 0x02F8BC, 0x006383},
+{0x02F8BD, 0x02F8BD, 0x0063E4},
+{0x02F8BE, 0x02F8BE, 0x022BF1},
+{0x02F8BF, 0x02F8BF, 0x006422},
+{0x02F8C0, 0x02F8C0, 0x0063C5},
+{0x02F8C1, 0x02F8C1, 0x0063A9},
+{0x02F8C2, 0x02F8C2, 0x003A2E},
+{0x02F8C3, 0x02F8C3, 0x006469},
+{0x02F8C4, 0x02F8C4, 0x00647E},
+{0x02F8C5, 0x02F8C5, 0x00649D},
+{0x02F8C6, 0x02F8C6, 0x006477},
+{0x02F8C7, 0x02F8C7, 0x003A6C},
+{0x02F8C8, 0x02F8C8, 0x00654F},
+{0x02F8C9, 0x02F8C9, 0x00656C},
+{0x02F8CA, 0x02F8CA, 0x02300A},
+{0x02F8CB, 0x02F8CB, 0x0065E3},
+{0x02F8CC, 0x02F8CC, 0x0066F8},
+{0x02F8CD, 0x02F8CD, 0x006649},
+{0x02F8CE, 0x02F8CE, 0x003B19},
+{0x02F8CF, 0x02F8CF, 0x006691},
+{0x02F8D0, 0x02F8D0, 0x003B08},
+{0x02F8D1, 0x02F8D1, 0x003AE4},
+{0x02F8D2, 0x02F8D2, 0x005192},
+{0x02F8D3, 0x02F8D3, 0x005195},
+{0x02F8D4, 0x02F8D4, 0x006700},
+{0x02F8D5, 0x02F8D5, 0x00669C},
+{0x02F8D6, 0x02F8D6, 0x0080AD},
+{0x02F8D7, 0x02F8D7, 0x0043D9},
+{0x02F8D8, 0x02F8D8, 0x006717},
+{0x02F8D9, 0x02F8D9, 0x00671B},
+{0x02F8DA, 0x02F8DA, 0x006721},
+{0x02F8DB, 0x02F8DB, 0x00675E},
+{0x02F8DC, 0x02F8DC, 0x006753},
+{0x02F8DD, 0x02F8DD, 0x0233C3},
+{0x02F8DE, 0x02F8DE, 0x003B49},
+{0x02F8DF, 0x02F8DF, 0x0067FA},
+{0x02F8E0, 0x02F8E0, 0x006785},
+{0x02F8E1, 0x02F8E1, 0x006852},
+{0x02F8E2, 0x02F8E2, 0x006885},
+{0x02F8E3, 0x02F8E3, 0x02346D},
+{0x02F8E4, 0x02F8E4, 0x00688E},
+{0x02F8E5, 0x02F8E5, 0x00681F},
+{0x02F8E6, 0x02F8E6, 0x006914},
+{0x02F8E7, 0x02F8E7, 0x003B9D},
+{0x02F8E8, 0x02F8E8, 0x006942},
+{0x02F8E9, 0x02F8E9, 0x0069A3},
+{0x02F8EA, 0x02F8EA, 0x0069EA},
+{0x02F8EB, 0x02F8EB, 0x006AA8},
+{0x02F8EC, 0x02F8EC, 0x0236A3},
+{0x02F8ED, 0x02F8ED, 0x006ADB},
+{0x02F8EE, 0x02F8EE, 0x003C18},
+{0x02F8EF, 0x02F8EF, 0x006B21},
+{0x02F8F0, 0x02F8F0, 0x0238A7},
+{0x02F8F1, 0x02F8F1, 0x006B54},
+{0x02F8F2, 0x02F8F2, 0x003C4E},
+{0x02F8F3, 0x02F8F3, 0x006B72},
+{0x02F8F4, 0x02F8F4, 0x006B9F},
+{0x02F8F5, 0x02F8F5, 0x006BBA},
+{0x02F8F6, 0x02F8F6, 0x006BBB},
+{0x02F8F7, 0x02F8F7, 0x023A8D},
+{0x02F8F8, 0x02F8F8, 0x021D0B},
+{0x02F8F9, 0x02F8F9, 0x023AFA},
+{0x02F8FA, 0x02F8FA, 0x006C4E},
+{0x02F8FB, 0x02F8FB, 0x023CBC},
+{0x02F8FC, 0x02F8FC, 0x006CBF},
+{0x02F8FD, 0x02F8FD, 0x006CCD},
+{0x02F8FE, 0x02F8FE, 0x006C67},
+{0x02F8FF, 0x02F8FF, 0x006D16},
+{0x02F900, 0x02F900, 0x006D3E},
+{0x02F901, 0x02F901, 0x006D77},
+{0x02F902, 0x02F902, 0x006D41},
+{0x02F903, 0x02F903, 0x006D69},
+{0x02F904, 0x02F904, 0x006D78},
+{0x02F905, 0x02F905, 0x006D85},
+{0x02F906, 0x02F906, 0x023D1E},
+{0x02F907, 0x02F907, 0x006D34},
+{0x02F908, 0x02F908, 0x006E2F},
+{0x02F909, 0x02F909, 0x006E6E},
+{0x02F90A, 0x02F90A, 0x003D33},
+{0x02F90B, 0x02F90B, 0x006ECB},
+{0x02F90C, 0x02F90C, 0x006EC7},
+{0x02F90D, 0x02F90D, 0x023ED1},
+{0x02F90E, 0x02F90E, 0x006DF9},
+{0x02F90F, 0x02F90F, 0x006F6E},
+{0x02F910, 0x02F910, 0x023F5E},
+{0x02F911, 0x02F911, 0x023F8E},
+{0x02F912, 0x02F912, 0x006FC6},
+{0x02F913, 0x02F913, 0x007039},
+{0x02F914, 0x02F914, 0x00701E},
+{0x02F915, 0x02F915, 0x00701B},
+{0x02F916, 0x02F916, 0x003D96},
+{0x02F917, 0x02F917, 0x00704A},
+{0x02F918, 0x02F918, 0x00707D},
+{0x02F919, 0x02F919, 0x007077},
+{0x02F91A, 0x02F91A, 0x0070AD},
+{0x02F91B, 0x02F91B, 0x020525},
+{0x02F91C, 0x02F91C, 0x007145},
+{0x02F91D, 0x02F91D, 0x024263},
+{0x02F91E, 0x02F91E, 0x00719C},
+{0x02F91F, 0x02F91F, 0x0243AB},
+{0x02F920, 0x02F920, 0x007228},
+{0x02F921, 0x02F921, 0x007235},
+{0x02F922, 0x02F922, 0x007250},
+{0x02F923, 0x02F923, 0x024608},
+{0x02F924, 0x02F924, 0x007280},
+{0x02F925, 0x02F925, 0x007295},
+{0x02F926, 0x02F926, 0x024735},
+{0x02F927, 0x02F927, 0x024814},
+{0x02F928, 0x02F928, 0x00737A},
+{0x02F929, 0x02F929, 0x00738B},
+{0x02F92A, 0x02F92A, 0x003EAC},
+{0x02F92B, 0x02F92B, 0x0073A5},
+{0x02F92C, 0x02F92D, 0x003EB8},
+{0x02F92E, 0x02F92E, 0x007447},
+{0x02F92F, 0x02F92F, 0x00745C},
+{0x02F930, 0x02F930, 0x007471},
+{0x02F931, 0x02F931, 0x007485},
+{0x02F932, 0x02F932, 0x0074CA},
+{0x02F933, 0x02F933, 0x003F1B},
+{0x02F934, 0x02F934, 0x007524},
+{0x02F935, 0x02F935, 0x024C36},
+{0x02F936, 0x02F936, 0x00753E},
+{0x02F937, 0x02F937, 0x024C92},
+{0x02F938, 0x02F938, 0x007570},
+{0x02F939, 0x02F939, 0x02219F},
+{0x02F93A, 0x02F93A, 0x007610},
+{0x02F93B, 0x02F93B, 0x024FA1},
+{0x02F93C, 0x02F93C, 0x024FB8},
+{0x02F93D, 0x02F93D, 0x025044},
+{0x02F93E, 0x02F93E, 0x003FFC},
+{0x02F93F, 0x02F93F, 0x004008},
+{0x02F940, 0x02F940, 0x0076F4},
+{0x02F941, 0x02F941, 0x0250F3},
+{0x02F942, 0x02F942, 0x0250F2},
+{0x02F943, 0x02F943, 0x025119},
+{0x02F944, 0x02F944, 0x025133},
+{0x02F945, 0x02F945, 0x00771E},
+{0x02F946, 0x02F947, 0x00771F},
+{0x02F948, 0x02F948, 0x00774A},
+{0x02F949, 0x02F949, 0x004039},
+{0x02F94A, 0x02F94A, 0x00778B},
+{0x02F94B, 0x02F94B, 0x004046},
+{0x02F94C, 0x02F94C, 0x004096},
+{0x02F94D, 0x02F94D, 0x02541D},
+{0x02F94E, 0x02F94E, 0x00784E},
+{0x02F94F, 0x02F94F, 0x00788C},
+{0x02F950, 0x02F950, 0x0078CC},
+{0x02F951, 0x02F951, 0x0040E3},
+{0x02F952, 0x02F952, 0x025626},
+{0x02F953, 0x02F953, 0x007956},
+{0x02F954, 0x02F954, 0x02569A},
+{0x02F955, 0x02F955, 0x0256C5},
+{0x02F956, 0x02F956, 0x00798F},
+{0x02F957, 0x02F957, 0x0079EB},
+{0x02F958, 0x02F958, 0x00412F},
+{0x02F959, 0x02F959, 0x007A40},
+{0x02F95A, 0x02F95A, 0x007A4A},
+{0x02F95B, 0x02F95B, 0x007A4F},
+{0x02F95C, 0x02F95C, 0x02597C},
+{0x02F95D, 0x02F95E, 0x025AA7},
+{0x02F95F, 0x02F95F, 0x007AEE},
+{0x02F960, 0x02F960, 0x004202},
+{0x02F961, 0x02F961, 0x025BAB},
+{0x02F962, 0x02F962, 0x007BC6},
+{0x02F963, 0x02F963, 0x007BC9},
+{0x02F964, 0x02F964, 0x004227},
+{0x02F965, 0x02F965, 0x025C80},
+{0x02F966, 0x02F966, 0x007CD2},
+{0x02F967, 0x02F967, 0x0042A0},
+{0x02F968, 0x02F968, 0x007CE8},
+{0x02F969, 0x02F969, 0x007CE3},
+{0x02F96A, 0x02F96A, 0x007D00},
+{0x02F96B, 0x02F96B, 0x025F86},
+{0x02F96C, 0x02F96C, 0x007D63},
+{0x02F96D, 0x02F96D, 0x004301},
+{0x02F96E, 0x02F96E, 0x007DC7},
+{0x02F96F, 0x02F96F, 0x007E02},
+{0x02F970, 0x02F970, 0x007E45},
+{0x02F971, 0x02F971, 0x004334},
+{0x02F972, 0x02F972, 0x026228},
+{0x02F973, 0x02F973, 0x026247},
+{0x02F974, 0x02F974, 0x004359},
+{0x02F975, 0x02F975, 0x0262D9},
+{0x02F976, 0x02F976, 0x007F7A},
+{0x02F977, 0x02F977, 0x02633E},
+{0x02F978, 0x02F978, 0x007F95},
+{0x02F979, 0x02F979, 0x007FFA},
+{0x02F97A, 0x02F97A, 0x008005},
+{0x02F97B, 0x02F97B, 0x0264DA},
+{0x02F97C, 0x02F97C, 0x026523},
+{0x02F97D, 0x02F97D, 0x008060},
+{0x02F97E, 0x02F97E, 0x0265A8},
+{0x02F97F, 0x02F97F, 0x008070},
+{0x02F980, 0x02F980, 0x02335F},
+{0x02F981, 0x02F981, 0x0043D5},
+{0x02F982, 0x02F982, 0x0080B2},
+{0x02F983, 0x02F983, 0x008103},
+{0x02F984, 0x02F984, 0x00440B},
+{0x02F985, 0x02F985, 0x00813E},
+{0x02F986, 0x02F986, 0x005AB5},
+{0x02F987, 0x02F987, 0x0267A7},
+{0x02F988, 0x02F988, 0x0267B5},
+{0x02F989, 0x02F989, 0x023393},
+{0x02F98A, 0x02F98A, 0x02339C},
+{0x02F98B, 0x02F98B, 0x008201},
+{0x02F98C, 0x02F98C, 0x008204},
+{0x02F98D, 0x02F98D, 0x008F9E},
+{0x02F98E, 0x02F98E, 0x00446B},
+{0x02F98F, 0x02F98F, 0x008291},
+{0x02F990, 0x02F990, 0x00828B},
+{0x02F991, 0x02F991, 0x00829D},
+{0x02F992, 0x02F992, 0x0052B3},
+{0x02F993, 0x02F993, 0x0082B1},
+{0x02F994, 0x02F994, 0x0082B3},
+{0x02F995, 0x02F995, 0x0082BD},
+{0x02F996, 0x02F996, 0x0082E6},
+{0x02F997, 0x02F997, 0x026B3C},
+{0x02F998, 0x02F998, 0x0082E5},
+{0x02F999, 0x02F999, 0x00831D},
+{0x02F99A, 0x02F99A, 0x008363},
+{0x02F99B, 0x02F99B, 0x0083AD},
+{0x02F99C, 0x02F99C, 0x008323},
+{0x02F99D, 0x02F99D, 0x0083BD},
+{0x02F99E, 0x02F99E, 0x0083E7},
+{0x02F99F, 0x02F99F, 0x008457},
+{0x02F9A0, 0x02F9A0, 0x008353},
+{0x02F9A1, 0x02F9A1, 0x0083CA},
+{0x02F9A2, 0x02F9A2, 0x0083CC},
+{0x02F9A3, 0x02F9A3, 0x0083DC},
+{0x02F9A4, 0x02F9A4, 0x026C36},
+{0x02F9A5, 0x02F9A5, 0x026D6B},
+{0x02F9A6, 0x02F9A6, 0x026CD5},
+{0x02F9A7, 0x02F9A7, 0x00452B},
+{0x02F9A8, 0x02F9A8, 0x0084F1},
+{0x02F9A9, 0x02F9A9, 0x0084F3},
+{0x02F9AA, 0x02F9AA, 0x008516},
+{0x02F9AB, 0x02F9AB, 0x0273CA},
+{0x02F9AC, 0x02F9AC, 0x008564},
+{0x02F9AD, 0x02F9AD, 0x026F2C},
+{0x02F9AE, 0x02F9AE, 0x00455D},
+{0x02F9AF, 0x02F9AF, 0x004561},
+{0x02F9B0, 0x02F9B0, 0x026FB1},
+{0x02F9B1, 0x02F9B1, 0x0270D2},
+{0x02F9B2, 0x02F9B2, 0x00456B},
+{0x02F9B3, 0x02F9B3, 0x008650},
+{0x02F9B4, 0x02F9B4, 0x00865C},
+{0x02F9B5, 0x02F9B5, 0x008667},
+{0x02F9B6, 0x02F9B6, 0x008669},
+{0x02F9B7, 0x02F9B7, 0x0086A9},
+{0x02F9B8, 0x02F9B8, 0x008688},
+{0x02F9B9, 0x02F9B9, 0x00870E},
+{0x02F9BA, 0x02F9BA, 0x0086E2},
+{0x02F9BB, 0x02F9BB, 0x008779},
+{0x02F9BC, 0x02F9BC, 0x008728},
+{0x02F9BD, 0x02F9BD, 0x00876B},
+{0x02F9BE, 0x02F9BE, 0x008786},
+{0x02F9BF, 0x02F9BF, 0x0045D7},
+{0x02F9C0, 0x02F9C0, 0x0087E1},
+{0x02F9C1, 0x02F9C1, 0x008801},
+{0x02F9C2, 0x02F9C2, 0x0045F9},
+{0x02F9C3, 0x02F9C3, 0x008860},
+{0x02F9C4, 0x02F9C4, 0x008863},
+{0x02F9C5, 0x02F9C5, 0x027667},
+{0x02F9C6, 0x02F9C6, 0x0088D7},
+{0x02F9C7, 0x02F9C7, 0x0088DE},
+{0x02F9C8, 0x02F9C8, 0x004635},
+{0x02F9C9, 0x02F9C9, 0x0088FA},
+{0x02F9CA, 0x02F9CA, 0x0034BB},
+{0x02F9CB, 0x02F9CB, 0x0278AE},
+{0x02F9CC, 0x02F9CC, 0x027966},
+{0x02F9CD, 0x02F9CD, 0x0046BE},
+{0x02F9CE, 0x02F9CE, 0x0046C7},
+{0x02F9CF, 0x02F9CF, 0x008AA0},
+{0x02F9D0, 0x02F9D0, 0x008AED},
+{0x02F9D1, 0x02F9D1, 0x008B8A},
+{0x02F9D2, 0x02F9D2, 0x008C55},
+{0x02F9D3, 0x02F9D3, 0x027CA8},
+{0x02F9D4, 0x02F9D4, 0x008CAB},
+{0x02F9D5, 0x02F9D5, 0x008CC1},
+{0x02F9D6, 0x02F9D6, 0x008D1B},
+{0x02F9D7, 0x02F9D7, 0x008D77},
+{0x02F9D8, 0x02F9D8, 0x027F2F},
+{0x02F9D9, 0x02F9D9, 0x020804},
+{0x02F9DA, 0x02F9DA, 0x008DCB},
+{0x02F9DB, 0x02F9DB, 0x008DBC},
+{0x02F9DC, 0x02F9DC, 0x008DF0},
+{0x02F9DD, 0x02F9DD, 0x0208DE},
+{0x02F9DE, 0x02F9DE, 0x008ED4},
+{0x02F9DF, 0x02F9DF, 0x008F38},
+{0x02F9E0, 0x02F9E0, 0x0285D2},
+{0x02F9E1, 0x02F9E1, 0x0285ED},
+{0x02F9E2, 0x02F9E2, 0x009094},
+{0x02F9E3, 0x02F9E3, 0x0090F1},
+{0x02F9E4, 0x02F9E4, 0x009111},
+{0x02F9E5, 0x02F9E5, 0x02872E},
+{0x02F9E6, 0x02F9E6, 0x00911B},
+{0x02F9E7, 0x02F9E7, 0x009238},
+{0x02F9E8, 0x02F9E8, 0x0092D7},
+{0x02F9E9, 0x02F9E9, 0x0092D8},
+{0x02F9EA, 0x02F9EA, 0x00927C},
+{0x02F9EB, 0x02F9EB, 0x0093F9},
+{0x02F9EC, 0x02F9EC, 0x009415},
+{0x02F9ED, 0x02F9ED, 0x028BFA},
+{0x02F9EE, 0x02F9EE, 0x00958B},
+{0x02F9EF, 0x02F9EF, 0x004995},
+{0x02F9F0, 0x02F9F0, 0x0095B7},
+{0x02F9F1, 0x02F9F1, 0x028D77},
+{0x02F9F2, 0x02F9F2, 0x0049E6},
+{0x02F9F3, 0x02F9F3, 0x0096C3},
+{0x02F9F4, 0x02F9F4, 0x005DB2},
+{0x02F9F5, 0x02F9F5, 0x009723},
+{0x02F9F6, 0x02F9F6, 0x029145},
+{0x02F9F7, 0x02F9F7, 0x02921A},
+{0x02F9F8, 0x02F9F8, 0x004A6E},
+{0x02F9F9, 0x02F9F9, 0x004A76},
+{0x02F9FA, 0x02F9FA, 0x0097E0},
+{0x02F9FB, 0x02F9FB, 0x02940A},
+{0x02F9FC, 0x02F9FC, 0x004AB2},
+{0x02F9FD, 0x02F9FD, 0x029496},
+{0x02F9FE, 0x02F9FF, 0x00980B},
+{0x02FA00, 0x02FA00, 0x009829},
+{0x02FA01, 0x02FA01, 0x0295B6},
+{0x02FA02, 0x02FA02, 0x0098E2},
+{0x02FA03, 0x02FA03, 0x004B33},
+{0x02FA04, 0x02FA04, 0x009929},
+{0x02FA05, 0x02FA05, 0x0099A7},
+{0x02FA06, 0x02FA06, 0x0099C2},
+{0x02FA07, 0x02FA07, 0x0099FE},
+{0x02FA08, 0x02FA08, 0x004BCE},
+{0x02FA09, 0x02FA09, 0x029B30},
+{0x02FA0A, 0x02FA0A, 0x009B12},
+{0x02FA0B, 0x02FA0B, 0x009C40},
+{0x02FA0C, 0x02FA0C, 0x009CFD},
+{0x02FA0D, 0x02FA0D, 0x004CCE},
+{0x02FA0E, 0x02FA0E, 0x004CED},
+{0x02FA0F, 0x02FA0F, 0x009D67},
+{0x02FA10, 0x02FA10, 0x02A0CE},
+{0x02FA11, 0x02FA11, 0x004CF8},
+{0x02FA12, 0x02FA12, 0x02A105},
+{0x02FA13, 0x02FA13, 0x02A20E},
+{0x02FA14, 0x02FA14, 0x02A291},
+{0x02FA15, 0x02FA15, 0x009EBB},
+{0x02FA16, 0x02FA16, 0x004D56},
+{0x02FA17, 0x02FA17, 0x009EF9},
+{0x02FA18, 0x02FA18, 0x009EFE},
+{0x02FA19, 0x02FA19, 0x009F05},
+{0x02FA1A, 0x02FA1A, 0x009F0F},
+{0x02FA1B, 0x02FA1B, 0x009F16},
+{0x02FA1C, 0x02FA1C, 0x009F3B},
+{0x02FA1D, 0x02FA1D, 0x02A600},
+};
diff --git a/llama.cpp/src/unicode-data.h b/llama.cpp/src/unicode-data.h
new file mode 100644
index 0000000..f6973eb
--- /dev/null
+++ b/llama.cpp/src/unicode-data.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+
+struct range_nfd {
+ uint32_t first;
+ uint32_t last;
+ uint32_t nfd;
+};
+
+static const uint32_t MAX_CODEPOINTS = 0x110000;
+
+extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
+extern const std::unordered_set<uint32_t> unicode_set_whitespace;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase;
+extern const std::initializer_list<range_nfd> unicode_ranges_nfd;
diff --git a/llama.cpp/src/unicode.cpp b/llama.cpp/src/unicode.cpp
new file mode 100644
index 0000000..adfc489
--- /dev/null
+++ b/llama.cpp/src/unicode.cpp
@@ -0,0 +1,1124 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
+#include "unicode.h"
+#include "unicode-data.h"
+
+#include <algorithm>
+#include <cassert>
+#include <codecvt>
+#include <cstddef>
+#include <cstdint>
+#include <locale>
+#include <map>
+#include <regex>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+size_t unicode_len_utf8(char src) {
+ const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+ uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+ return lookup[highbits];
+}
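+
+// Illustrative: the lookup is indexed by the high nibble of the first byte,
+// e.g. 0xE2 (first byte of "€", E2 82 AC) has high nibble 0xE and yields 3.
+// Continuation bytes (high nibble 0x8-0xB) also map to 1, so this is only
+// meaningful when called on the first byte of a sequence.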
+
+static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
+ std::string result;
+ for (size_t i = 0; i < cps.size(); ++i) {
+ result.append(unicode_cpt_to_utf8(cps[i]));
+ }
+ return result;
+}
+
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
+ assert(offset < utf8.size());
+ if (!(utf8[offset + 0] & 0x80)) {
+ auto result = utf8[offset + 0];
+ offset += 1;
+ return result;
+ }
+ if (!(utf8[offset + 0] & 0x40)) {
+ throw std::invalid_argument("invalid character");
+ }
+ if (!(utf8[offset + 0] & 0x20)) {
+ if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
+ throw std::invalid_argument("invalid character");
+ }
+ auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
+ offset += 2;
+ return result;
+ }
+ if (!(utf8[offset + 0] & 0x10)) {
+ if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
+ throw std::invalid_argument("invalid character");
+ }
+ auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
+ offset += 3;
+ return result;
+ }
+ if (!(utf8[offset + 0] & 0x08)) {
+ if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
+ throw std::invalid_argument("invalid character");
+ }
+ auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
+ offset += 4;
+ return result;
+ }
+ throw std::invalid_argument("failed to convert utf8 to codepoint");
+}
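+
+// Worked example: for utf8 = "\xC3\xA9" ("é") and offset = 0, the two-byte
+// branch computes ((0xC3 & 0x1f) << 6) | (0xA9 & 0x3f) == 0xE9 (U+00E9) and
+// advances offset to 2. Malformed input throws std::invalid_argument, which
+// unicode_cpts_from_utf8() below turns into U+FFFD.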
+
+//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt) {
+// std::vector<uint16_t> result;
+// if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
+// result.emplace_back(cpt);
+// return result;
+// }
+// if (0x10000 <= cpt && cpt <= 0x10ffff) {
+// result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
+// result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
+// return result;
+// }
+// throw std::invalid_argument("failed to convert codepoint to utf16");
+//}
+
+//static std::vector<uint16_t> unicode_cpts_to_utf16(const std::vector<uint32_t> & cps) {
+// std::vector<uint16_t> result;
+// for (size_t i = 0; i < cps.size(); ++i) {
+// auto temp = unicode_cpt_to_utf16(cps[i]);
+// result.insert(result.end(), temp.begin(), temp.end());
+// }
+// return result;
+//}
+
+//static uint32_t unicode_cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
+// assert(offset < utf16.size());
+// if (((utf16[0] >> 10) << 10) != 0xd800) {
+// auto result = utf16[offset + 0];
+// offset += 1;
+// return result;
+// }
+//
+// if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
+// throw std::invalid_argument("invalid character");
+// }
+//
+// auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
+// offset += 2;
+// return result;
+//}
+
+//static std::vector<uint32_t> unicode_cpts_from_utf16(const std::vector<uint16_t> & utf16) {
+// std::vector<uint32_t> result;
+// size_t offset = 0;
+// while (offset < utf16.size()) {
+// result.push_back(unicode_cpt_from_utf16(utf16, offset));
+// }
+// return result;
+//}
+
+static std::vector<unicode_cpt_flags> unicode_cpt_flags_array() {
+ std::vector<unicode_cpt_flags> cpt_flags(MAX_CODEPOINTS, unicode_cpt_flags::UNDEFINED);
+
+ assert (unicode_ranges_flags.begin()[0].first == 0);
+ assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
+ for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
+ const auto range_ini = unicode_ranges_flags.begin()[i-1]; // codepoint_ini, flags
+ const auto range_end = unicode_ranges_flags.begin()[i]; // codepoint_end, flags
+ for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
+ cpt_flags[cpt] = range_ini.second;
+ }
+ }
+
+ for (auto cpt : unicode_set_whitespace) {
+ cpt_flags[cpt].is_whitespace = true;
+ }
+
+ for (auto p : unicode_map_lowercase) {
+ cpt_flags[p.second].is_lowercase = true;
+ }
+
+ for (auto p : unicode_map_uppercase) {
+ cpt_flags[p.second].is_uppercase = true;
+ }
+
+ for (auto &range : unicode_ranges_nfd) { // first, last, nfd
+ cpt_flags[range.nfd].is_nfd = true;
+ }
+
+ return cpt_flags;
+}
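+
+// Note: unicode_ranges_flags is run-length encoded -- entry i holds the first
+// codepoint of a run and its flags, the run extends to the start of entry
+// i+1, and a {MAX_CODEPOINTS, ...} sentinel terminates the table (hence the
+// two asserts above).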
+
+static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
+ std::unordered_map<uint8_t, std::string> map;
+ for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~'
+ assert(0 <= ch && ch < 256);
+ map[ch] = unicode_cpt_to_utf8(ch);
+ }
+ for (int ch = 0xA1; ch <= 0xAC; ++ch) { // u'¡' to u'¬'
+ assert(0 <= ch && ch < 256);
+ map[ch] = unicode_cpt_to_utf8(ch);
+ }
+ for (int ch = 0xAE; ch <= 0xFF; ++ch) { // u'®' to u'ÿ'
+ assert(0 <= ch && ch < 256);
+ map[ch] = unicode_cpt_to_utf8(ch);
+ }
+ auto n = 0;
+ for (int ch = 0; ch < 256; ++ch) {
+ if (map.find(ch) == map.end()) {
+ map[ch] = unicode_cpt_to_utf8(256 + n);
+ ++n;
+ }
+ }
+ return map;
+}
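+
+// This is the GPT-2 byte-level BPE alphabet: printable bytes map to
+// themselves, while the remaining 68 bytes are remapped to visible
+// codepoints U+0100 + n in ascending byte order. For example the space byte
+// 0x20 is the 33rd unmapped byte, so it becomes U+0120 "Ġ" -- the familiar
+// GPT-2 space marker.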
+
+static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
+ std::unordered_map<std::string, uint8_t> map;
+ for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~'
+ assert(0 <= ch && ch < 256);
+ map[unicode_cpt_to_utf8(ch)] = ch;
+ }
+ for (int ch = 0xA1; ch <= 0xAC; ++ch) { // u'¡' to u'¬'
+ assert(0 <= ch && ch < 256);
+ map[unicode_cpt_to_utf8(ch)] = ch;
+ }
+ for (int ch = 0xAE; ch <= 0xFF; ++ch) { // u'®' to u'ÿ'
+ assert(0 <= ch && ch < 256);
+ map[unicode_cpt_to_utf8(ch)] = ch;
+ }
+ auto n = 0;
+ for (int ch = 0; ch < 256; ++ch) {
+ if (map.find(unicode_cpt_to_utf8(ch)) == map.end()) {
+ map[unicode_cpt_to_utf8(256 + n)] = ch;
+ ++n;
+ }
+ }
+ return map;
+}
+
+static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
+#if defined(__clang__)
+ // disable C++17 deprecation warning for std::codecvt_utf8
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+ std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
+
+#if defined(__clang__)
+# pragma clang diagnostic pop
+#elif defined(__GNUC__)
+# pragma GCC diagnostic pop
+#endif
+
+ return conv.from_bytes(s);
+}
+
+static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
+ std::vector<std::string> bpe_encoded_words;
+ for (const auto & word : bpe_words) {
+ std::string text_utf;
+ auto utf_word = unicode_cpts_from_utf8(word);
+ for (size_t i = 0; i < utf_word.size(); ++i) {
+ text_utf += unicode_cpt_to_utf8(utf_word[i]);
+ }
+
+ std::string encoded_token;
+ for (char & c : text_utf) {
+ encoded_token += unicode_byte_to_utf8(c);
+ }
+ bpe_encoded_words.emplace_back(encoded_token);
+ }
+ return bpe_encoded_words;
+}
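+
+// Each word is round-tripped through codepoints (mapping invalid UTF-8 to
+// U+FFFD along the way) and then re-expressed byte by byte in the GPT-2
+// byte-to-unicode alphabet built by unicode_byte_to_utf8_map() above.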
+
+// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
+static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & text, const std::vector<size_t> & offsets) {
+ std::vector<size_t> bpe_offsets; // store the offset of each word
+ bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
+
+ const auto cpts = unicode_cpts_from_utf8(text);
+
+ size_t start = 0;
+ for (auto offset : offsets) {
+ const size_t offset_ini = start;
+ const size_t offset_end = start + offset;
+ assert(offset_end <= cpts.size());
+ start = offset_end;
+
+ static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
+ auto _get_cpt = [&] (const size_t pos) -> uint32_t {
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
+ };
+
+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
+ };
+
+ size_t _prev_end = offset_ini;
+ auto _add_token = [&] (const size_t end) -> size_t {
+ assert(_prev_end <= end && end <= offset_end);
+ size_t len = end - _prev_end;
+ if (len > 0) {
+ bpe_offsets.push_back(len);
+ }
+ _prev_end = end;
+ //if (len > 0) {
+ // std::string s = "";
+ // for(size_t p = end-len; p < end; p++)
+ // s += unicode_cpt_to_utf8(cpts[p]);
+ // printf(">>> '%s'\n", s.c_str());
+ //}
+ return len;
+ };
+
+ for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
+ const uint32_t cpt = _get_cpt(pos);
+ const auto flags = _get_flags(pos);
+
+ // regex: 's|'t|'re|'ve|'m|'ll|'d
+ if (cpt == '\'' && pos+1 < offset_end) {
+ uint32_t cpt_next = _get_cpt(pos+1);
+ if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
+ pos += _add_token(pos+2);
+ continue;
+ }
+ if (pos+2 < offset_end) {
+ uint32_t cpt_next_next = _get_cpt(pos+2);
+ if ((cpt_next == 'r' && cpt_next_next == 'e') ||
+ (cpt_next == 'v' && cpt_next_next == 'e') ||
+ (cpt_next == 'l' && cpt_next_next == 'l')) {
+ pos += _add_token(pos+3);
+ continue;
+ }
+ }
+ }
+
+ auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
+ // regex: <space>?\p{L}+
+ if (flags2.is_letter) {
+ pos += (cpt == ' ');
+ while (flags2.is_letter) {
+ flags2 = _get_flags(++pos);
+ }
+ _add_token(pos);
+ continue;
+ }
+ // regex: <space>?\p{N}+
+ if (flags2.is_number) {
+ pos += (cpt == ' ');
+ while (flags2.is_number) {
+ flags2 = _get_flags(++pos);
+ }
+ _add_token(pos);
+ continue;
+ }
+ // regex: <space>?[^\s\p{L}\p{N}]+
+ if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
+ pos += (cpt == ' ');
+ while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
+ flags2 = _get_flags(++pos);
+ }
+ _add_token(pos);
+ continue;
+ }
+
+ size_t num_whitespaces = 0;
+ while (_get_flags(pos+num_whitespaces).is_whitespace) {
+ num_whitespaces++;
+ }
+
+ // regex: \s+(?!\S)
+ if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
+ pos += num_whitespaces - 1;
+ _add_token(pos);
+ continue;
+ }
+
+ // regex: \s+
+ if (num_whitespaces > 0) {
+ pos += num_whitespaces;
+ _add_token(pos);
+ continue;
+ }
+
+ // no matches
+ _add_token(++pos);
+ }
+ }
+
+ return bpe_offsets;
+}
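+
+// Hand-traced example: "Hello world!!" (with a single initial offset
+// spanning the whole string) is split into codepoint lengths {5, 6, 2},
+// i.e. "Hello", " world", "!!". Note the returned offsets are lengths in
+// codepoints, not bytes.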
+
+// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
+static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
+ std::vector<size_t> bpe_offsets; // store the offset of each word
+ bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
+
+ const auto cpts = unicode_cpts_from_utf8(text);
+
+ size_t start = 0;
+ for (auto offset : offsets) {
+ const size_t offset_ini = start;
+ const size_t offset_end = start + offset;
+ assert(offset_end <= cpts.size());
+ start = offset_end;
+
+ static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
+ auto _get_cpt = [&] (const size_t pos) -> uint32_t {
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
+ };
+
+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
+ };
+
+ size_t _prev_end = offset_ini;
+ auto _add_token = [&] (const size_t end) -> size_t {
+ assert(_prev_end <= end && end <= offset_end);
+ size_t len = end - _prev_end;
+ if (len > 0) {
+ bpe_offsets.push_back(len);
+ }
+ _prev_end = end;
+ //if (len > 0) {
+ // std::string s = "";
+ // for(size_t p = end-len; p < end; p++)
+ // s += unicode_cpt_to_utf8(cpts[p]);
+ // printf(">>> '%s'\n", s.c_str());
+ //}
+ return len;
+ };
+
+ for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
+ const uint32_t cpt = _get_cpt(pos);
+ const auto flags = _get_flags(pos);
+
+ // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
+ if (cpt == '\'' && pos+1 < offset_end) {
+ uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
+ if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
+ pos += _add_token(pos+2);
+ continue;
+ }
+ if (pos+2 < offset_end) {
+ uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
+ if ((cpt_next == 'r' && cpt_next_next == 'e') ||
+ (cpt_next == 'v' && cpt_next_next == 'e') ||
+ (cpt_next == 'l' && cpt_next_next == 'l')) {
+ pos += _add_token(pos+3);
+ continue;
+ }
+ }
+ }
+
+ // regex: [^\r\n\p{L}\p{N}]?\p{L}+
+ if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
+ if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
+ pos++;
+ while (_get_flags(pos).is_letter) {
+ pos++;
+ }
+ _add_token(pos);
+ continue;
+ }
+ }
+
+ // regex: \p{N}{1,3}
+ if (flags.is_number) {
+ size_t ini = pos;
+ while (_get_flags(pos).is_number) {
+ if (++pos - ini >= 3 ) {
+ _add_token(pos);
+ ini = pos;
+ }
+ }
+ _add_token(pos);
+ continue;
+ }
+
+ // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
+ auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
+ if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
+ pos += (cpt == ' ');
+ while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
+ flags2 = _get_flags(++pos);
+ }
+ uint32_t cpt2 = _get_cpt(pos);
+ while (cpt2 == '\r' || cpt2 == '\n') {
+ cpt2 = _get_cpt(++pos);
+ }
+ _add_token(pos);
+ continue;
+ }
+
+ size_t num_whitespaces = 0;
+ size_t last_end_r_or_n = 0;
+ while (_get_flags(pos+num_whitespaces).is_whitespace) {
+ uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
+ if (cpt2 == '\r' || cpt2 == '\n') {
+ last_end_r_or_n = pos + num_whitespaces + 1;
+ }
+ num_whitespaces++;
+ }
+
+ // regex: \s*[\r\n]+
+ if (last_end_r_or_n > 0) {
+ pos = last_end_r_or_n;
+ _add_token(pos);
+ continue;
+ }
+
+ // regex: \s+(?!\S)
+ if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
+ pos += num_whitespaces - 1;
+ _add_token(pos);
+ continue;
+ }
+
+ // regex: \s+
+ if (num_whitespaces > 0) {
+ pos += num_whitespaces;
+ _add_token(pos);
+ continue;
+ }
+
+ // no matches
+ _add_token(++pos);
+ }
+ }
+
+ return bpe_offsets;
+}
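+
+// The \p{N}{1,3} rule is the main difference from the GPT-2 splitter: a
+// digit run is chopped greedily into groups of three from the left, so
+// "12345" produces lengths {3, 2} ("123", "45") instead of one 5-digit
+// token.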
+
+template <typename CharT>
+static std::vector<size_t> unicode_regex_split_stl(const std::basic_string<CharT> & text, const std::basic_string<CharT> & regex, const std::vector<size_t> & offsets) {
+ using BidirIt = typename std::basic_string<CharT>::const_iterator;
+#ifdef _MSC_VER
+ // Bypass bug in MSVC: https://github.com/ggml-org/llama.cpp/issues/17830
+ constexpr auto regex_flags = std::regex_constants::ECMAScript;
+#else
+ constexpr auto regex_flags = std::regex_constants::optimize | std::regex_constants::nosubs;
+#endif
+ std::basic_regex<CharT> expr(regex, regex_flags);
+ std::vector<size_t> bpe_offsets; // store the offset of each word
+ bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
+ size_t start = 0;
+ for (auto offset : offsets) {
+ std::regex_iterator<BidirIt> it(text.begin() + start, text.begin() + start + offset, expr);
+ std::regex_iterator<BidirIt> end;
+
+ int64_t start_idx = 0;
+ while (it != end) {
+ std::match_results<BidirIt> match = *it;
+ if (match.position() > start_idx) {
+ bpe_offsets.emplace_back(match.position() - start_idx);
+ }
+ bpe_offsets.emplace_back(match.length());
+ start_idx = match.position() + match.length();
+ ++it;
+ }
+
+ if (start_idx < (int64_t) offset) {
+ bpe_offsets.emplace_back(offset - start_idx);
+ }
+ start += offset;
+ }
+
+ return bpe_offsets;
+}
+
+// K2 system regex patterns (from tokenization_kimi.py):
+// [\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
+static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string & text, const std::vector<size_t> & offsets) {
+ std::vector<size_t> bpe_offsets;
+ bpe_offsets.reserve(offsets.size());
+
+ const auto cpts = unicode_cpts_from_utf8(text);
+
+ size_t start = 0;
+ for (auto offset : offsets) {
+ const size_t offset_ini = start;
+ const size_t offset_end = start + offset;
+ assert(offset_end <= cpts.size());
+ start = offset_end;
+
+ static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
+ auto _get_cpt = [&] (const size_t pos) -> uint32_t {
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
+ };
+
+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
+ };
+
+ size_t _prev_end = offset_ini;
+ auto _add_token = [&] (const size_t end) -> size_t {
+ assert(_prev_end <= end && end <= offset_end);
+ size_t len = end - _prev_end;
+ if (len > 0) {
+ bpe_offsets.push_back(len);
+ }
+ _prev_end = end;
+ return len;
+ };
+
+ for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
+ const uint32_t cpt = _get_cpt(pos);
+ const auto flags = _get_flags(pos);
+
+ // Pattern 1: [\p{Han}]+ (Chinese characters)
+ if (unicode_cpt_is_han(cpt)) {
+ while (unicode_cpt_is_han(_get_cpt(pos))) {
+ pos++;
+ }
+ _add_token(pos);
+ continue;
+ }
+
+ // Pattern 2 & 3: Letter words excluding Han characters with optional contractions
+ // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?:'s|'t|'re|'ve|'m|'ll|'d)?
+ // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?:'s|'t|'re|'ve|'m|'ll|'d)?
+ // Check if current char is a letter OR if current char could be a leading char and next char is a letter
+ bool is_letter_pattern = (flags.is_letter && !unicode_cpt_is_han(cpt)) ||
+ (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number) &&
+ _get_flags(pos + 1).is_letter && !unicode_cpt_is_han(_get_cpt(pos + 1)));
+
+ if (is_letter_pattern) {
+ // Handle optional leading non-letter/non-number character
+ bool has_leading_char = false;
+ if (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number)) {
+ has_leading_char = true;
+ pos++;
+ }
+
+ // Match letter sequence (excluding Han characters)
+ bool has_letters = false;
+ while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) {
+ has_letters = true;
+ pos++;
+ }
+
+ // Only proceed if we found letters (after potentially skipping leading char)
+ if (has_letters || (!has_leading_char && _get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos)))) {
+ if (!has_letters) pos++; // consume the first letter if we didn't already
+
+ // Continue consuming letters
+ while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) {
+ pos++;
+ }
+
+ // Check for optional contractions (?:'s|'t|'re|'ve|'m|'ll|'d)
+ if (_get_cpt(pos) == '\'' && pos + 1 < offset_end) {
+ uint32_t cpt_next = unicode_tolower(_get_cpt(pos + 1));
+ if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
+ pos += 2;
+ } else if (pos + 2 < offset_end) {
+ uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos + 2));
+ if ((cpt_next == 'r' && cpt_next_next == 'e') ||
+ (cpt_next == 'v' && cpt_next_next == 'e') ||
+ (cpt_next == 'l' && cpt_next_next == 'l')) {
+ pos += 3;
+ }
+ }
+ }
+
+ _add_token(pos);
+ continue;
+ } else if (has_leading_char) {
+ // We consumed a leading char but found no letters, backtrack
+ pos--;
+ }
+ }
+
+ // Pattern 4: \p{N}{1,3} (numbers 1-3 digits)
+ if (flags.is_number) {
+ size_t ini = pos;
+ while (_get_flags(pos).is_number) {
+ if (++pos - ini >= 3) {
+ _add_token(pos);
+ ini = pos;
+ }
+ }
+ _add_token(pos);
+ continue;
+ }
+
+ // Pattern 5: ?[^\s\p{L}\p{N}]+[\r\n]* (optional space + non-word chars + optional newlines)
+ auto flags2 = (cpt == ' ' ? _get_flags(pos + 1) : flags);
+ if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) {
+ pos += (cpt == ' ');
+ while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) {
+ flags2 = _get_flags(++pos);
+ }
+ // Match optional [\r\n]*
+ uint32_t cpt2 = _get_cpt(pos);
+ while (cpt2 == '\r' || cpt2 == '\n') {
+ cpt2 = _get_cpt(++pos);
+ }
+ _add_token(pos);
+ continue;
+ }
+
+ // Count whitespace characters
+ size_t num_whitespaces = 0;
+ size_t last_end_r_or_n = 0;
+ while (_get_flags(pos + num_whitespaces).is_whitespace) {
+ uint32_t cpt2 = _get_cpt(pos + num_whitespaces);
+ if (cpt2 == '\r' || cpt2 == '\n') {
+ last_end_r_or_n = pos + num_whitespaces + 1;
+ }
+ num_whitespaces++;
+ }
+
+ // Pattern 6: \s*[\r\n]+ (whitespace with newlines)
+ if (last_end_r_or_n > 0) {
+ pos = last_end_r_or_n;
+ _add_token(pos);
+ continue;
+ }
+
+ // Pattern 7: \s+(?!\S) (trailing whitespace)
+ if (num_whitespaces > 1 && _get_cpt(pos + num_whitespaces) != OUT_OF_RANGE) {
+ pos += num_whitespaces - 1;
+ _add_token(pos);
+ continue;
+ }
+
+ // Pattern 8: \s+ (general whitespace)
+ if (num_whitespaces > 0) {
+ pos += num_whitespaces;
+ _add_token(pos);
+ continue;
+ }
+
+ // No matches - consume single character
+ _add_token(++pos);
+ }
+ }
+
+ return bpe_offsets;
+}
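+
+// Illustrative: "你好world" splits into lengths {2, 5} -- the Han run via
+// pattern 1, then "world" via the letter patterns. Contractions are matched
+// case-insensitively through unicode_tolower(), so "CAN'T" tokenizes the
+// same as "can't".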
+
+// AFMOE digit handling: splits digit runs into groups of three, emitting a leading group of 1-2 digits when the total length is not a multiple of 3
+static std::vector<size_t> unicode_regex_split_custom_afmoe(const std::string & text, const std::vector<size_t> & offsets) {
+ std::vector<size_t> bpe_offsets;
+ bpe_offsets.reserve(offsets.size());
+
+ const auto cpts = unicode_cpts_from_utf8(text);
+
+ size_t start = 0;
+ for (auto offset : offsets) {
+ const size_t offset_ini = start;
+ const size_t offset_end = start + offset;
+ assert(offset_end <= cpts.size());
+ start = offset_end;
+
+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
+ };
+
+ size_t _prev_end = offset_ini;
+ auto _add_token = [&] (const size_t end) -> size_t {
+ assert(_prev_end <= end && end <= offset_end);
+ size_t len = end - _prev_end;
+ if (len > 0) {
+ bpe_offsets.push_back(len);
+ }
+ _prev_end = end;
+ return len;
+ };
+
+ for (size_t pos = offset_ini; pos < offset_end; ) {
+ const auto flags = _get_flags(pos);
+
+ // Handle digit sequences with special splitting logic
+ if (flags.is_number) {
+ size_t digit_start = pos;
+ size_t digit_count = 0;
+
+ // Count consecutive digits
+ while (_get_flags(pos).is_number && pos < offset_end) {
+ digit_count++;
+ pos++;
+ }
+
+ // Split based on total length modulo 3
+ size_t remainder = digit_count % 3;
+ size_t current = digit_start;
+
+ // Emit leading 1-2 digits if needed
+ if (remainder > 0) {
+ _add_token(current + remainder);
+ current += remainder;
+ }
+
+ // Emit groups of 3
+ while (current < digit_start + digit_count) {
+ _add_token(current + 3);
+ current += 3;
+ }
+ continue;
+ }
+
+ // For non-digits, just move forward
+ pos++;
+ }
+
+ // Add any remaining content
+ if (_prev_end < offset_end) {
+ _add_token(offset_end);
+ }
+ }
+
+ return bpe_offsets;
+}
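+
+// Worked example: the run "1234567" (7 digits, 7 % 3 == 1) is emitted as
+// lengths {1, 3, 3} -- "1", "234", "567" -- i.e. thousands-style grouping
+// anchored at the right.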
+
+static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
+ std::vector<size_t> bpe_offsets;
+
+ if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
+ bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
+ } else if (
+ regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
+ regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
+
+ bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
+ } else if (regex_expr == "\\p{Han}+") {
+ // K2's first pattern - handle all K2 patterns together
+ bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets);
+ } else if (regex_expr == "\\p{AFMoE_digits}") {
+ // AFMOE digit pattern - use custom implementation for proper splitting
+ bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
+ }
+
+ return bpe_offsets;
+}
+
+//
+// interface
+//
+
+std::string unicode_cpt_to_utf8(uint32_t cpt) {
+ std::string result;
+
+ if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
+ result.push_back(cpt);
+ return result;
+ }
+ if (0x80 <= cpt && cpt <= 0x7ff) {
+ result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
+ result.push_back(0x80 | (cpt & 0x3f));
+ return result;
+ }
+ if (0x800 <= cpt && cpt <= 0xffff) {
+ result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
+ result.push_back(0x80 | ((cpt >> 6) & 0x3f));
+ result.push_back(0x80 | (cpt & 0x3f));
+ return result;
+ }
+ if (0x10000 <= cpt && cpt <= 0x10ffff) {
+ result.push_back(0xf0 | ((cpt >> 18) & 0x07));
+ result.push_back(0x80 | ((cpt >> 12) & 0x3f));
+ result.push_back(0x80 | ((cpt >> 6) & 0x3f));
+ result.push_back(0x80 | (cpt & 0x3f));
+ return result;
+ }
+
+ throw std::invalid_argument("invalid codepoint");
+}
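+
+// e.g. unicode_cpt_to_utf8(0x1F600) returns "\xF0\x9F\x98\x80" (U+1F600,
+// "😀"). Surrogates (U+D800-U+DFFF) fall into the 0x800-0xFFFF branch and
+// are encoded as-is rather than rejected.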
+
+std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
+ auto comp = [] (const uint32_t cpt, const range_nfd & range) {
+ return cpt < range.first;
+ };
+ std::vector<uint32_t> result(cpts.size());
+ for (size_t i = 0; i < cpts.size(); ++i) {
+ const uint32_t cpt = cpts[i];
+ auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1;
+ result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
+ }
+ return result;
+}
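+
+// This binary-searches unicode_ranges_nfd (unicode-data.cpp) and replaces
+// each codepoint with the first codepoint of its canonical decomposition --
+// a simplification of full NFD that drops combining marks rather than
+// appending them. Per the table, e.g. the compatibility ideograph U+2F800
+// maps to U+4E3D; codepoints with no entry pass through unchanged.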
+
+std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
+ std::vector<uint32_t> result;
+ result.reserve(utf8.size());
+ size_t offset = 0;
+ while (offset < utf8.size()) {
+ try {
+ result.push_back(unicode_cpt_from_utf8(utf8, offset));
+ }
+ catch (const std::invalid_argument & /*ex*/) {
+ // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
+ ++offset;
+ result.emplace_back(0xFFFD); // replacement character
+ }
+ }
+ return result;
+}
+
+unicode_cpt_flags unicode_cpt_flags_from_cpt(const uint32_t cpt) {
+ static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
+ static const auto cpt_flags = unicode_cpt_flags_array();
+ return cpt < cpt_flags.size() ? cpt_flags[cpt] : undef;
+}
+
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8) {
+ static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
+ if (utf8.empty()) {
+ return undef; // undefined
+ }
+ size_t offset = 0;
+ return unicode_cpt_flags_from_cpt(unicode_cpt_from_utf8(utf8, offset));
+}
+
+std::string unicode_byte_to_utf8(uint8_t byte) {
+ static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
+ return map.at(byte);
+}
+
+uint8_t unicode_utf8_to_byte(const std::string & utf8) {
+ static std::unordered_map<std::string, uint8_t> map = unicode_utf8_to_byte_map();
+ return map.at(utf8);
+}
+
+uint32_t unicode_tolower(uint32_t cpt) {
+ // binary search
+ auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cpt,
+ [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
+ return pair.first < value;
+ });
+ if (it != unicode_map_lowercase.end() && it->first == cpt) {
+ return it->second;
+ }
+ return cpt; // Return the original code point if no lowercase mapping is found
+}
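+
+// unicode_map_lowercase must remain sorted by .first for the lower_bound
+// above to be valid. e.g. unicode_tolower('A') == 'a', while codepoints
+// without an entry (digits, punctuation, already-lowercase letters) come
+// back unchanged.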
+
+bool unicode_cpt_is_han(uint32_t cpt) {
+ // Han character ranges (Chinese/CJK characters)
+ // CJK Unified Ideographs (most common)
+ if (cpt >= 0x4E00 && cpt <= 0x9FFF) return true;
+
+ // CJK Extension A
+ if (cpt >= 0x3400 && cpt <= 0x4DBF) return true;
+
+ // CJK Extension B
+ if (cpt >= 0x20000 && cpt <= 0x2A6DF) return true;
+
+ // CJK Extension C
+ if (cpt >= 0x2A700 && cpt <= 0x2B73F) return true;
+
+ // CJK Extension D
+ if (cpt >= 0x2B740 && cpt <= 0x2B81F) return true;
+
+ // CJK Extension E
+ if (cpt >= 0x2B820 && cpt <= 0x2CEAF) return true;
+
+ // CJK Extension F
+ if (cpt >= 0x2CEB0 && cpt <= 0x2EBEF) return true;
+
+ // CJK Compatibility Ideographs
+ if (cpt >= 0xF900 && cpt <= 0xFAFF) return true;
+
+ // CJK Compatibility Ideographs Supplement
+ if (cpt >= 0x2F800 && cpt <= 0x2FA1F) return true;
+
+ return false;
+}
+
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
+ // unicode categories
+ static const std::map<std::string, int> k_ucat_enum = {
+ { "\\p{N}", unicode_cpt_flags::NUMBER },
+ { "\\p{L}", unicode_cpt_flags::LETTER },
+ { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
+ { "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
+ { "\\p{S}", unicode_cpt_flags::SYMBOL },
+ { "\\p{Lu}", unicode_cpt_flags::LETTER }, // Uppercase letter
+ { "\\p{Ll}", unicode_cpt_flags::LETTER }, // Lowercase letter
+ { "\\p{Lt}", unicode_cpt_flags::LETTER }, // Titlecase letter
+ { "\\p{Lm}", unicode_cpt_flags::LETTER }, // Modifier letter
+ { "\\p{Lo}", unicode_cpt_flags::LETTER }, // Other letter
+ };
+
+ static const std::map<int, int> k_ucat_cpt = {
+ { unicode_cpt_flags::NUMBER, 0xD1 },
+ { unicode_cpt_flags::LETTER, 0xD2 },
+ { unicode_cpt_flags::PUNCTUATION, 0xD3 },
+ { unicode_cpt_flags::ACCENT_MARK, 0xD4 },
+ { unicode_cpt_flags::SYMBOL, 0xD5 },
+ };
+
+ static const std::map<int, std::string> k_ucat_map = {
+ { unicode_cpt_flags::NUMBER, "\x30-\x39" }, // 0-9
+ { unicode_cpt_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
+ { unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
+ { unicode_cpt_flags::ACCENT_MARK, "" }, // no sub-128 codepoints
+ { unicode_cpt_flags::SYMBOL, "\\\x24\\\x2B\x3C-\x3E\x5E\x60\\\x7C" }, // $+<=>^`|
+ };
+
+ // compute collapsed codepoints only if needed by at least one regex
+ bool need_collapse = false;
+ for (const auto & regex_expr : regex_exprs) {
+ // search for unicode categories
+ for (const auto & ucat : k_ucat_enum) {
+ if (std::string::npos != regex_expr.find(ucat.first)) {
+ need_collapse = true;
+ break;
+ }
+ }
+ }
+
+ const auto cpts = unicode_cpts_from_utf8(text);
+
+ // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
+ // ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935
+ std::string text_collapsed;
+ if (need_collapse) {
+ // collapse all unicode categories
+ text_collapsed.resize(cpts.size());
+
+ for (size_t i = 0; i < cpts.size(); ++i) {
+ // keep single-byte codepoints as is
+ if (cpts[i] < 128) {
+ text_collapsed[i] = cpts[i];
+ continue;
+ }
+
+ const auto flags = unicode_cpt_flags_from_cpt(cpts[i]);
+
+ if (flags.is_whitespace) {
+ //NOTE: C++ std::regex \s does not match 0x85; Rust and Python regexes do.
+ //text_collapsed[i] = (char) 0x85; // <Next Line> as whitespace fallback
+ text_collapsed[i] = (char) 0x0B; // <vertical tab> as whitespace fallback
+ } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) {
+ text_collapsed[i] = k_ucat_cpt.at(flags.category_flag());
+ } else {
+ text_collapsed[i] = (char) 0xD0; // fallback
+ }
+ }
+ }
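+
+// Collapse example: "héllo" becomes "h\xD2llo" -- ASCII codepoints survive
+// unchanged, while the non-ASCII letter 'é' is replaced by the LETTER
+// category byte 0xD2, keeping the collapsed string aligned 1:1 with cpts.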
+
+ std::vector<size_t> bpe_offsets = { cpts.size() };
+
+ for (const auto & regex_expr : regex_exprs) {
+ // first, see if we have an efficient custom regex implementation
+ auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
+
+ if (!tmp.empty()) {
+ bpe_offsets = std::move(tmp);
+ continue;
+ }
+
+ // fallback to general-purpose std::regex / std::wregex
+ try {
+ // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
+ // with the corresponding collapsed representation
+ bool use_collapsed = false;
+ for (const auto & ucat : k_ucat_enum) {
+ if (std::string::npos != regex_expr.find(ucat.first)) {
+ use_collapsed = true;
+ break;
+ }
+ }
+
+ if (use_collapsed) {
+ // sanity-check that the original regex does not contain any non-ASCII characters
+ const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
+ for (size_t i = 0; i < cpts_regex.size(); ++i) {
+ if (cpts_regex[i] >= 128) {
+ throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported");
+ }
+ }
+
+ // generate a collapsed representation of the regex
+ std::string regex_expr_collapsed;
+
+ // track if we are inside [], because nested [] are not allowed
+ bool inside = false;
+ for (size_t i = 0; i < regex_expr.size(); ++i) {
+ if (regex_expr[i] == '[' && (i == 0 || regex_expr[i - 1] != '\\')) {
+ regex_expr_collapsed += '[';
+ inside = true;
+ continue;
+ }
+
+ if (inside && regex_expr[i] == ']' && regex_expr[i - 1] != '\\') {
+ regex_expr_collapsed += ']';
+ inside = false;
+ continue;
+ }
+
+ // Match \p{...} Unicode properties of varying lengths
+ if (regex_expr[i + 0] == '\\' && i + 3 < regex_expr.size() &&
+ regex_expr[i + 1] == 'p' &&
+ regex_expr[i + 2] == '{') {
+ // Find the closing brace
+ size_t closing_brace = regex_expr.find('}', i + 3);
+ if (closing_brace != std::string::npos && closing_brace <= i + 10) { // reasonable limit
+ const std::string pat = regex_expr.substr(i, closing_brace - i + 1);
+ if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
+ if (!inside) {
+ regex_expr_collapsed += '[';
+ }
+ regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
+ regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
+ if (!inside) {
+ regex_expr_collapsed += ']';
+ }
+ i = closing_brace;
+ continue;
+ }
+ }
+ }
+
+ regex_expr_collapsed += regex_expr[i];
+ }
+
+ //printf("text_collapsed: %s\n", text_collapsed.c_str());
+ //printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str());
+ bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
+ } else {
+ // no unicode category used, we can use std::wregex directly
+ const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
+
+ // std::wregex \s does not match non-ASCII whitespace, so 0x0B is used as a fallback
+ std::wstring wtext(cpts.begin(), cpts.end());
+ for (size_t i = 0; i < wtext.size(); ++i) {
+ if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
+ wtext[i] = 0x0B;
+ }
+ }
+
+ //printf("text: %s\n", text.c_str());
+ //printf("regex_expr: %s\n", regex_expr.c_str());
+ bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
+ }
+ } catch (std::regex_error & e) {
+ fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str());
+ fprintf(stderr, "Regex error: %s\n", e.what());
+ throw std::runtime_error("Failed to process regex");
+ }
+ }
+
+ std::vector<std::string> bpe_words;
+ bpe_words.reserve(bpe_offsets.size()); // reserve memory for the approximate size
+
+ size_t start = 0;
+ for (size_t & offset : bpe_offsets) {
+ bpe_words.emplace_back();
+ for (size_t i = start; i < start + offset; ++i) {
+ bpe_words.back() += unicode_cpt_to_utf8(cpts[i]);
+ }
+ start += offset;
+ }
+
+ return unicode_byte_encoding_process(bpe_words);
+}
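+
+// Sketch of typical usage (the GPT-2 pretokenizer regex shown is the one
+// handled by the custom fast path above):
+//
+//   std::vector<std::string> words = unicode_regex_split("Hello world",
+//       {"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)"});
+//   // words == {"Hello", "Ġworld"} after byte-to-unicode encoding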
diff --git a/llama.cpp/src/unicode.h b/llama.cpp/src/unicode.h
new file mode 100644
index 0000000..5bd1362
--- /dev/null
+++ b/llama.cpp/src/unicode.h
@@ -0,0 +1,111 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+// TODO: reimplement this structure in an endian-independent way
+struct unicode_cpt_flags {
+ enum {
+ UNDEFINED = 0x0001,
+ NUMBER = 0x0002, // regex: \p{N}
+ LETTER = 0x0004, // regex: \p{L}
+ SEPARATOR = 0x0008, // regex: \p{Z}
+ ACCENT_MARK = 0x0010, // regex: \p{M}
+ PUNCTUATION = 0x0020, // regex: \p{P}
+ SYMBOL = 0x0040, // regex: \p{S}
+ CONTROL = 0x0080, // regex: \p{C}
+ MASK_CATEGORIES = 0x00FF,
+ WHITESPACE = 0x0100,
+ LOWERCASE = 0x0200,
+ UPPERCASE = 0x0400,
+ NFD = 0x0800,
+ };
+
+ // codepoint type
+ uint16_t is_undefined : 1;
+ uint16_t is_number : 1; // regex: \p{N}
+ uint16_t is_letter : 1; // regex: \p{L}
+ uint16_t is_separator : 1; // regex: \p{Z}
+ uint16_t is_accent_mark : 1; // regex: \p{M}
+ uint16_t is_punctuation : 1; // regex: \p{P}
+ uint16_t is_symbol : 1; // regex: \p{S}
+ uint16_t is_control : 1; // regex: \p{C}
+ // helper flags
+ uint16_t is_whitespace : 1; // regex: \s
+ uint16_t is_lowercase : 1;
+ uint16_t is_uppercase : 1;
+ uint16_t is_nfd : 1;
+
+ // decode from uint16
+ inline unicode_cpt_flags(const uint16_t flags = 0) {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ *reinterpret_cast<uint16_t*>(this) = flags;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ is_undefined = (flags & UNDEFINED) ? 1 : 0;
+ is_number = (flags & NUMBER) ? 1 : 0;
+ is_letter = (flags & LETTER) ? 1 : 0;
+ is_separator = (flags & SEPARATOR) ? 1 : 0;
+ is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
+ is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
+ is_symbol = (flags & SYMBOL) ? 1 : 0;
+ is_control = (flags & CONTROL) ? 1 : 0;
+ is_whitespace = (flags & WHITESPACE) ? 1 : 0;
+ is_lowercase = (flags & LOWERCASE) ? 1 : 0;
+ is_uppercase = (flags & UPPERCASE) ? 1 : 0;
+ is_nfd = (flags & NFD) ? 1 : 0;
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
+ }
+
+ inline uint16_t as_uint() const {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ return *reinterpret_cast<const uint16_t*>(this);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ uint16_t result =
+ is_undefined * UNDEFINED
+ + is_number * NUMBER
+ + is_letter * LETTER
+ + is_separator * SEPARATOR
+ + is_accent_mark * ACCENT_MARK
+ + is_punctuation * PUNCTUATION
+ + is_symbol * SYMBOL
+ + is_control * CONTROL
+ + is_whitespace * WHITESPACE
+ + is_lowercase * LOWERCASE
+ + is_uppercase * UPPERCASE
+ + is_nfd * NFD
+ ;
+
+ return result;
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
+ }
+
+ inline uint16_t category_flag() const {
+ return this->as_uint() & MASK_CATEGORIES;
+ }
+};
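+
+// Example round-trip: unicode_cpt_flags(LETTER | LOWERCASE) sets is_letter
+// and is_lowercase, as_uint() returns 0x0204 again, and category_flag()
+// masks off the helper bits, leaving LETTER (0x0004).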
+
+size_t unicode_len_utf8(char src);
+
+std::string unicode_cpt_to_utf8 (uint32_t cpt);
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
+
+std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
+
+std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
+
+unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
+
+std::string unicode_byte_to_utf8(uint8_t byte);
+uint8_t unicode_utf8_to_byte(const std::string & utf8);
+
+uint32_t unicode_tolower(uint32_t cpt);
+
+bool unicode_cpt_is_han(uint32_t cpt);
+
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);